diff options
Diffstat (limited to 'src/librbd')
293 files changed, 72585 insertions, 0 deletions
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
new file mode 100644
index 00000000..b6dbcb26
--- /dev/null
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "librbd/AsyncObjectThrottle.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+namespace librbd
+{
+
+// Stores the parameters of the throttled run.  No op is started here; the
+// in-flight counter (m_current_ops) and the aggregate result (m_ret) both
+// start at zero.
+template <typename T>
+AsyncObjectThrottle<T>::AsyncObjectThrottle(
+    const AsyncRequest<T>* async_request, T &image_ctx,
+    const ContextFactory& context_factory, Context *ctx,
+    ProgressContext *prog_ctx, uint64_t object_no, uint64_t end_object_no)
+  : m_lock(util::unique_lock_name("librbd::AsyncThrottle::m_lock", this)),
+    m_async_request(async_request), m_image_ctx(image_ctx),
+    m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx),
+    m_object_no(object_no), m_end_object_no(end_object_no), m_current_ops(0),
+    m_ret(0)
+{
+}
+
+// Calls start_next_op() up to max_concurrent times under m_lock, stopping
+// early once an error is recorded and nothing is in flight.  The caller must
+// hold image_ctx.owner_lock (asserted).  If no op is in flight afterwards,
+// m_ctx is queued on op_work_queue with m_ret and this object deletes itself,
+// so it must not be touched after the call in that case.
+template <typename T>
+void AsyncObjectThrottle<T>::start_ops(uint64_t max_concurrent) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  bool complete;
+  {
+    Mutex::Locker l(m_lock);
+    for (uint64_t i = 0; i < max_concurrent; ++i) {
+      start_next_op();
+      if (m_ret < 0 && m_current_ops == 0) {
+        break;
+      }
+    }
+    complete = (m_current_ops == 0);
+  }
+  if (complete) {
+    // avoid re-entrant callback
+    m_image_ctx.op_work_queue->queue(m_ctx, m_ret);
+    delete this;
+  }
+}
+
+// Completion hook for a single op (reached via C_AsyncObjectThrottle::finish).
+// Takes owner_lock for read and then m_lock, decrements the in-flight count,
+// keeps r as the aggregate result only when it is negative, not -ENOENT, and
+// no earlier error was recorded, then tries to start another op.  When the
+// in-flight count drops to zero, m_ctx is completed directly (not queued)
+// and this object deletes itself.
+template <typename T>
+void AsyncObjectThrottle<T>::finish_op(int r) {
+  bool complete;
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    Mutex::Locker locker(m_lock);
+    --m_current_ops;
+    if (r < 0 && r != -ENOENT && m_ret == 0) {
+      m_ret = r;
+    }
+
+    start_next_op();
+    complete = (m_current_ops == 0);
+  }
+  if (complete) {
+    m_ctx->complete(m_ret);
+    delete this;
+  }
+}
+
+// Walks forward through object numbers until one op is left pending
+// (send() returned 0), an error or cancellation has been recorded in m_ret,
+// or m_end_object_no is reached.  Both callers in this file invoke it with
+// m_lock held.
+template <typename T>
+void AsyncObjectThrottle<T>::start_next_op() {
+  bool done = false;
+  while (!done) {
+    if (m_async_request != NULL && m_async_request->is_canceled() &&
+        m_ret == 0) {
+      // allow in-flight ops to complete, but don't start new ops
+      m_ret = -ERESTART;
+      return;
+    } else if (m_ret != 0 || m_object_no >= m_end_object_no) {
+      return;
+    }
+
+    uint64_t ono = m_object_no++;
+    C_AsyncObjectThrottle<T> *ctx = m_context_factory(*this, ono);
+
+    // send() result: <0 failed to start, >0 finished synchronously,
+    // 0 the op is now in flight and will report back through finish_op()
+    int r = ctx->send();
+    if (r < 0) {
+      m_ret = r;
+      delete ctx;
+      return;
+    } else if (r > 0) {
+      // op completed immediately
+      delete ctx;
+    } else {
+      ++m_current_ops;
+      done = true;
+    }
+    // progress is reported for every object that was started or completed
+    // immediately; a negative return from the callback is recorded in m_ret
+    if (m_prog_ctx != NULL) {
+      r = m_prog_ctx->update_progress(ono, m_end_object_no);
+      if (r < 0) {
+        m_ret = r;
+      }
+    }
+  }
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h
new file mode 100644
index 00000000..e1b08962
--- /dev/null
+++ b/src/librbd/AsyncObjectThrottle.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
+#define CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+
+#include <boost/function.hpp>
+
+namespace librbd
+{
+template <typename ImageCtxT> class AsyncRequest;
+class ProgressContext;
+struct ImageCtx;
+
+// Callback interface through which a per-object context reports its result
+// back to the throttle that created it.
+class AsyncObjectThrottleFinisher {
+public:
+  virtual ~AsyncObjectThrottleFinisher() {};
+  virtual void finish_op(int r) = 0;
+};
+
+// Base class for a single throttled per-object operation.  Subclasses
+// implement send(); finish() forwards the result to the finisher.
+template <typename ImageCtxT = ImageCtx>
+class C_AsyncObjectThrottle : public Context {
+public:
+  C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher,
+                        ImageCtxT &image_ctx)
+    : m_image_ctx(image_ctx), m_finisher(finisher) {
+  }
+
+  // Starts the operation.  AsyncObjectThrottle::start_next_op() treats a
+  // negative return as an error, a positive return as "completed
+  // immediately" and zero as "in flight".
+  virtual int send() = 0;
+
+protected:
+  ImageCtxT &m_image_ctx;
+
+  void finish(int r) override {
+    m_finisher.finish_op(r);
+  }
+
+private:
+  AsyncObjectThrottleFinisher &m_finisher;
+};
+
+// Runs one operation per object number in [object_no, end_object_no),
+// keeping a bounded number in flight, and completes a single Context with
+// the aggregate result.  The object deletes itself once finished.
+template <typename ImageCtxT = ImageCtx>
+class AsyncObjectThrottle : public AsyncObjectThrottleFinisher {
+public:
+  // Factory invoked once per object number to build the per-object context.
+  typedef boost::function<
+    C_AsyncObjectThrottle<ImageCtxT>* (AsyncObjectThrottle&,
+                                       uint64_t)> ContextFactory;
+
+  AsyncObjectThrottle(const AsyncRequest<ImageCtxT> *async_request,
+                      ImageCtxT &image_ctx,
+                      const ContextFactory& context_factory, Context *ctx,
+                      ProgressContext *prog_ctx, uint64_t object_no,
+                      uint64_t end_object_no);
+
+  void start_ops(uint64_t max_concurrent);
+  void finish_op(int r) override;
+
+private:
+  Mutex m_lock;
+  const AsyncRequest<ImageCtxT> *m_async_request;  // may be NULL (checked before use)
+  ImageCtxT &m_image_ctx;
+  ContextFactory m_context_factory;
+  Context *m_ctx;               // completed once with m_ret when all ops are done
+  ProgressContext *m_prog_ctx;  // may be NULL (checked before use)
+  uint64_t m_object_no;         // next object number to start
+  uint64_t m_end_object_no;     // exclusive upper bound
+  uint64_t m_current_ops;       // ops currently in flight
+  int m_ret;                    // first recorded error, or 0
+
+  void start_next_op();
+};
+
+} // namespace librbd
+
+extern template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
new file mode 100644
index 00000000..8a76a226
--- /dev/null
+++ b/src/librbd/AsyncRequest.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "common/WorkQueue.h"
+
+namespace librbd
+{
+
+// Requires a non-NULL completion context (asserted) and immediately
+// registers the request in image_ctx.async_requests via start_request().
+template <typename T>
+AsyncRequest<T>::AsyncRequest(T &image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx), m_on_finish(on_finish), m_canceled(false),
+    m_xlist_item(this) {
+  ceph_assert(m_on_finish != NULL);
+  start_request();
+}
+
+template <typename T>
+AsyncRequest<T>::~AsyncRequest() {
+}
+
+// Defers completion: queues a callback context on the image's op work queue
+// with result r instead of completing inline.
+template <typename T>
+void AsyncRequest<T>::async_complete(int r) {
+  m_image_ctx.op_work_queue->queue(create_callback_context(), r);
+}
+
+// The three factories below wrap `this` in callback objects built by the
+// librbd::util helpers.
+template <typename T>
+librados::AioCompletion *AsyncRequest<T>::create_callback_completion() {
+  return util::create_rados_callback(this);
+}
+
+template <typename T>
+Context *AsyncRequest<T>::create_callback_context() {
+  return util::create_context_callback(this);
+}
+
+// Like create_callback_context(), but bound to async_complete() so the
+// result is bounced through the work queue.
+template <typename T>
+Context *AsyncRequest<T>::create_async_callback_context() {
+  return util::create_context_callback<AsyncRequest<T>,
+                                       &AsyncRequest<T>::async_complete>(this);
+}
+
+// Adds this request to the image's list of outstanding async requests.
+template <typename T>
+void AsyncRequest<T>::start_request() {
+  Mutex::Locker async_ops_locker(m_image_ctx.async_ops_lock);
+  m_image_ctx.async_requests.push_back(&m_xlist_item);
+}
+
+// Removes this request from the image's list.  If the list becomes empty,
+// the registered waiters are moved out under the lock and completed with 0
+// after the lock has been released.
+template <typename T>
+void AsyncRequest<T>::finish_request() {
+  decltype(m_image_ctx.async_requests_waiters) waiters;
+  {
+    Mutex::Locker async_ops_locker(m_image_ctx.async_ops_lock);
+    ceph_assert(m_xlist_item.remove_myself());
+
+    if (m_image_ctx.async_requests.empty()) {
+      waiters = std::move(m_image_ctx.async_requests_waiters);
+    }
+  }
+
+  for (auto ctx : waiters) {
+    ctx->complete(0);
+  }
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::AsyncRequest<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h
new file mode 100644
index 00000000..f74368dc
--- /dev/null
+++ b/src/librbd/AsyncRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_ASYNC_REQUEST_H
+#define CEPH_LIBRBD_ASYNC_REQUEST_H
+
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "include/xlist.h"
+#include "include/compat.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+// Base class for asynchronous image requests.  A request registers itself
+// with the image on construction and, by default, deletes itself once
+// complete() decides it is done.
+template <typename ImageCtxT = ImageCtx>
+class AsyncRequest
+{
+public:
+  AsyncRequest(ImageCtxT &image_ctx, Context *on_finish);
+  virtual ~AsyncRequest();
+
+  // Entry point for step completion: when should_complete(r) returns true,
+  // the result is passed through filter_return_code() and the request is
+  // finished and destroyed; otherwise nothing happens here.
+  void complete(int r) {
+    if (should_complete(r)) {
+      r = filter_return_code(r);
+      finish_and_destroy(r);
+    }
+  }
+
+  virtual void send() = 0;
+
+  // NOTE(review): m_canceled is a plain bool with no synchronization visible
+  // in this file -- confirm that cancel() and is_canceled() callers are
+  // serialized by an external lock.
+  inline bool is_canceled() const {
+    return m_canceled;
+  }
+  inline void cancel() {
+    m_canceled = true;
+  }
+
+protected:
+  ImageCtxT &m_image_ctx;
+
+  librados::AioCompletion *create_callback_completion();
+  Context *create_callback_context();
+  Context *create_async_callback_context();
+
+  void async_complete(int r);
+
+  // Returns true when the request has finished and should be torn down.
+  virtual bool should_complete(int r) = 0;
+  // Hook to translate the final result; the default returns r unchanged.
+  virtual int filter_return_code(int r) const {
+    return r;
+  }
+
+  // NOTE: temporary until converted to new state machine format
+  virtual void finish_and_destroy(int r) {
+    finish(r);
+    delete this;
+  }
+
+  // Unregisters from the image and then completes the user's context.
+  virtual void finish(int r) {
+    finish_request();
+    m_on_finish->complete(r);
+  }
+
+private:
+  Context *m_on_finish;
+  bool m_canceled;
+  typename xlist<AsyncRequest<ImageCtxT> *>::item m_xlist_item;
+
+  void start_request();
+  void finish_request();
+};
+
+} // namespace librbd
+
+extern template class librbd::AsyncRequest<librbd::ImageCtx>;
+
+#endif //CEPH_LIBRBD_ASYNC_REQUEST_H
diff --git a/src/librbd/BlockGuard.h b/src/librbd/BlockGuard.h
new file mode 100644
index 00000000..062f1901
--- /dev/null
+++ b/src/librbd/BlockGuard.h
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_BLOCK_GUARD_H
+#define CEPH_LIBRBD_IO_BLOCK_GUARD_H
+
+#include "include/int_types.h"
+#include "common/dout.h"
+#include "common/Mutex.h"
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/set.hpp>
+#include <deque>
+#include <list>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::BlockGuard: " << this << " " \
+                           << __func__ << ": "
+
+namespace librbd {
+
+// A range of blocks.  The overlap comparator further down in this header
+// treats two extents as ordered when one's block_end is <= the other's
+// block_start, i.e. block_end behaves as an exclusive bound there.
+struct BlockExtent {
+  uint64_t block_start = 0;
+  uint64_t block_end = 0;
+
+  BlockExtent() {
+  }
+  BlockExtent(uint64_t block_start, uint64_t block_end)
+    : block_start(block_start), block_end(block_end) {
+  }
+};
+
+// Opaque handle type handed to callers; the guard casts it to and from its
+// private bookkeeping structure.
+struct BlockGuardCell {
+};
+
+/**
+ * Helper class to restrict and order concurrent IO to the same block.
The
+ * definition of a block is dependent upon the user of this class. It might
+ * represent a backing object, 512 byte sectors, etc.
+ */
+template <typename BlockOperation>
+class BlockGuard {
+private:
+  struct DetainedBlockExtent;
+
+public:
+  typedef std::list<BlockOperation> BlockOperations;
+
+  BlockGuard(CephContext *cct)
+    : m_cct(cct), m_lock("librbd::BlockGuard::m_lock") {
+  }
+
+  BlockGuard(const BlockGuard&) = delete;
+  BlockGuard &operator=(const BlockGuard&) = delete;
+
+  /**
+   * Detain future IO for a range of blocks. the guard will assume
+   * ownership of the provided operation if the operation is blocked.
+   * @return 0 upon success and IO can be issued
+   *         >0 if the IO is blocked,
+   *         <0 upon error
+   *
+   * On a 0 return *cell is set to the handle that must later be passed to
+   * release(); when the extent is already detained *cell is set to nullptr.
+   *
+   * NOTE(review): in the already-detained branch the return value is the
+   * size of the queued operation list. If block_operation is nullptr and
+   * nothing was queued before, that size is 0, which a caller cannot tell
+   * apart from "IO can be issued" except by checking *cell -- confirm
+   * callers always pass an operation or check the cell.
+   */
+  int detain(const BlockExtent &block_extent, BlockOperation *block_operation,
+             BlockGuardCell **cell) {
+    Mutex::Locker locker(m_lock);
+    ldout(m_cct, 20) << "block_start=" << block_extent.block_start << ", "
+                     << "block_end=" << block_extent.block_end << ", "
+                     << "free_slots=" << m_free_detained_block_extents.size()
+                     << dendl;
+
+    DetainedBlockExtent *detained_block_extent;
+    // the set's comparator makes overlapping extents compare equivalent, so
+    // this lookup finds any detained extent overlapping the request
+    auto it = m_detained_block_extents.find(block_extent);
+    if (it != m_detained_block_extents.end()) {
+      // request against an already detained block
+      detained_block_extent = &(*it);
+      if (block_operation != nullptr) {
+        detained_block_extent->block_operations.emplace_back(
+          std::move(*block_operation));
+      }
+
+      // alert the caller that the IO was detained
+      *cell = nullptr;
+      return detained_block_extent->block_operations.size();
+    } else {
+      // reuse a previously released slot if one exists, otherwise grow the
+      // backing pool by one element
+      if (!m_free_detained_block_extents.empty()) {
+        detained_block_extent = &m_free_detained_block_extents.front();
+        detained_block_extent->block_operations.clear();
+        m_free_detained_block_extents.pop_front();
+      } else {
+        ldout(m_cct, 20) << "no free detained block cells" << dendl;
+        m_detained_block_extent_pool.emplace_back();
+        detained_block_extent = &m_detained_block_extent_pool.back();
+      }
+
+      detained_block_extent->block_extent = block_extent;
+      m_detained_block_extents.insert(*detained_block_extent);
+      *cell = reinterpret_cast<BlockGuardCell*>(detained_block_extent);
+      return 0;
+    }
+  }
+
+  /**
+   * Release any detained IO operations from the provided cell.
+   *
+   * The queued operations are moved into *block_operations, the extent is
+   * removed from the detained set and its slot is put on the free list.
+   * cell must be non-null (asserted).
+   */
+  void release(BlockGuardCell *cell, BlockOperations *block_operations) {
+    Mutex::Locker locker(m_lock);
+
+    ceph_assert(cell != nullptr);
+    auto &detained_block_extent = reinterpret_cast<DetainedBlockExtent &>(
+      *cell);
+    ldout(m_cct, 20) << "block_start="
+                     << detained_block_extent.block_extent.block_start << ", "
+                     << "block_end="
+                     << detained_block_extent.block_extent.block_end << ", "
+                     << "pending_ops="
+                     << (detained_block_extent.block_operations.empty() ?
+                          0 : detained_block_extent.block_operations.size() - 1)
+                     << dendl;
+
+    *block_operations = std::move(detained_block_extent.block_operations);
+    m_detained_block_extents.erase(detained_block_extent.block_extent);
+    m_free_detained_block_extents.push_back(detained_block_extent);
+  }
+
+private:
+  // Bookkeeping record for one detained extent.  It carries both a list hook
+  // (free list) and a set hook (detained set).
+  struct DetainedBlockExtent : public boost::intrusive::list_base_hook<>,
+                               public boost::intrusive::set_base_hook<> {
+    BlockExtent block_extent;
+    BlockOperations block_operations;
+  };
+
+  // Key extractor: the set is keyed by the record's BlockExtent.
+  struct DetainedBlockExtentKey {
+    typedef BlockExtent type;
+    const BlockExtent &operator()(const DetainedBlockExtent &value) {
+      return value.block_extent;
+    }
+  };
+
+  // Ordering for the detained set: lhs sorts before rhs only when it ends at
+  // or before rhs starts, so overlapping extents are neither less nor
+  // greater than each other.
+  struct DetainedBlockExtentCompare {
+    bool operator()(const BlockExtent &lhs,
+                    const BlockExtent &rhs) const {
+      // check for range overlap (lhs < rhs)
+      if (lhs.block_end <= rhs.block_start) {
+        return true;
+      }
+      return false;
+    }
+  };
+
+  typedef std::deque<DetainedBlockExtent> DetainedBlockExtentsPool;
+  typedef boost::intrusive::list<DetainedBlockExtent> DetainedBlockExtents;
+  typedef boost::intrusive::set<
+    DetainedBlockExtent,
+    boost::intrusive::compare<DetainedBlockExtentCompare>,
+    boost::intrusive::key_of_value<DetainedBlockExtentKey> >
+      BlockExtentToDetainedBlockExtents;
+
+  CephContext *m_cct;
+
+  Mutex m_lock;  // taken by both detain() and release()
+  DetainedBlockExtentsPool m_detained_block_extent_pool;    // backing storage for all records
+  DetainedBlockExtents m_free_detained_block_extents;       // released records available for reuse
+  BlockExtentToDetainedBlockExtents m_detained_block_extents; // currently detained extents
+
+};
+
+} // namespace librbd
+
+#undef dout_subsys
+#undef dout_prefix
+#define dout_prefix *_dout
+
+#endif // CEPH_LIBRBD_IO_BLOCK_GUARD_H
diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt
new file mode 100644
index 00000000..149c19df
--- /dev/null
+++ b/src/librbd/CMakeLists.txt
@@ -0,0 +1,197 @@
+if(Boost_VERSION VERSION_GREATER 1.73)
+  add_definitions(-DBOOST_ASIO_USE_TS_EXECUTOR_AS_DEFAULT)
+endif()
+
+add_library(rbd_types STATIC
+  journal/Types.cc
+  mirroring_watcher/Types.cc
+  trash_watcher/Types.cc
+  watcher/Types.cc
+  WatchNotifyTypes.cc)
+
+# sources compiled into the rbd_internal static library declared below
+set(librbd_internal_srcs
+  AsyncObjectThrottle.cc
+  AsyncRequest.cc
+  ConfigWatcher.cc
+  DeepCopyRequest.cc
+  ExclusiveLock.cc
+  ImageCtx.cc
+  ImageState.cc
+  ImageWatcher.cc
+  internal.cc
+  Journal.cc
+  LibrbdAdminSocketHook.cc
+  LibrbdWriteback.cc
+  ManagedLock.cc
+  MirroringWatcher.cc
+  ObjectMap.cc
+  Operations.cc
+  TrashWatcher.cc
+  Utils.cc
+  Watcher.cc
+  api/Config.cc
+  api/DiffIterate.cc
+  api/Group.cc
+  api/Image.cc
+  api/Migration.cc
+  api/Mirror.cc
+  api/Namespace.cc
+  api/Pool.cc
+  api/PoolMetadata.cc
+  api/Snapshot.cc
+  api/Trash.cc
+  cache/ImageWriteback.cc
+  cache/ObjectCacherObjectDispatch.cc
+  cache/PassthroughImageCache.cc
+  deep_copy/ImageCopyRequest.cc
+  deep_copy/MetadataCopyRequest.cc
+  deep_copy/ObjectCopyRequest.cc
+  deep_copy/SetHeadRequest.cc
+  deep_copy/SnapshotCopyRequest.cc
+  deep_copy/SnapshotCreateRequest.cc
+  deep_copy/Utils.cc
+  exclusive_lock/AutomaticPolicy.cc
+  exclusive_lock/PreAcquireRequest.cc
+  exclusive_lock/PostAcquireRequest.cc
+  exclusive_lock/PreReleaseRequest.cc
+  exclusive_lock/StandardPolicy.cc
+  image/AttachChildRequest.cc
+  image/AttachParentRequest.cc
+  image/CloneRequest.cc
+  image/CloseRequest.cc
+  image/CreateRequest.cc
+  image/DetachChildRequest.cc
+  image/DetachParentRequest.cc
+  image/ListWatchersRequest.cc
+  image/OpenRequest.cc
+  image/PreRemoveRequest.cc
+  image/RefreshParentRequest.cc
+  image/RefreshRequest.cc
+  image/RemoveRequest.cc
+  image/SetFlagsRequest.cc
+  image/SetSnapRequest.cc
+  image/ValidatePoolRequest.cc
+  image_watcher/NotifyLockOwner.cc
+  io/AioCompletion.cc
+  io/AsyncOperation.cc
+  io/CopyupRequest.cc
+  io/ImageDispatchSpec.cc
+  io/ImageRequest.cc
+  io/ImageRequestWQ.cc
+  io/ObjectDispatch.cc
+  io/ObjectDispatchSpec.cc
+  io/ObjectDispatcher.cc
+  io/ObjectRequest.cc
+  io/ReadResult.cc
+  io/Utils.cc
+  journal/CreateRequest.cc
+  journal/DemoteRequest.cc
+  journal/ObjectDispatch.cc
+  journal/OpenRequest.cc
+  journal/PromoteRequest.cc
+  journal/RemoveRequest.cc
+  journal/Replay.cc
+  journal/ResetRequest.cc
+  journal/StandardPolicy.cc
+  journal/Utils.cc
+  managed_lock/AcquireRequest.cc
+  managed_lock/BreakRequest.cc
+  managed_lock/GetLockerRequest.cc
+  managed_lock/ReacquireRequest.cc
+  managed_lock/ReleaseRequest.cc
+  managed_lock/Utils.cc
+  mirror/DemoteRequest.cc
+  mirror/DisableRequest.cc
+  mirror/EnableRequest.cc
+  mirror/GetInfoRequest.cc
+  mirror/GetStatusRequest.cc
+  mirror/PromoteRequest.cc
+  object_map/CreateRequest.cc
+  object_map/InvalidateRequest.cc
+  object_map/LockRequest.cc
+  object_map/RefreshRequest.cc
+  object_map/RemoveRequest.cc
+  object_map/Request.cc
+  object_map/ResizeRequest.cc
+  object_map/SnapshotCreateRequest.cc
+  object_map/SnapshotRemoveRequest.cc
+  object_map/SnapshotRollbackRequest.cc
+  object_map/UnlockRequest.cc
+  object_map/UpdateRequest.cc
+  operation/DisableFeaturesRequest.cc
+  operation/EnableFeaturesRequest.cc
+  operation/FlattenRequest.cc
+  operation/MetadataRemoveRequest.cc
+  operation/MetadataSetRequest.cc
+  operation/MigrateRequest.cc
+  operation/ObjectMapIterate.cc
+  operation/RebuildObjectMapRequest.cc
+  operation/RenameRequest.cc
+  operation/Request.cc
+  operation/ResizeRequest.cc
+  operation/SnapshotCreateRequest.cc
+  operation/SnapshotProtectRequest.cc
+  operation/SnapshotRemoveRequest.cc
+  operation/SnapshotRenameRequest.cc
+  operation/SnapshotRollbackRequest.cc
+  operation/SnapshotUnprotectRequest.cc
+  operation/SnapshotLimitRequest.cc
+  operation/SparsifyRequest.cc
+  operation/TrimRequest.cc
+  trash/MoveRequest.cc
+  trash/RemoveRequest.cc
+  watcher/Notifier.cc
+  watcher/RewatchRequest.cc
+  ${CMAKE_SOURCE_DIR}/src/common/ContextCompletion.cc)
+
+add_library(rbd_api STATIC librbd.cc)
+add_library(rbd_internal STATIC
+  ${librbd_internal_srcs}
+  $<TARGET_OBJECTS:rados_snap_set_diff_obj>)
+if(WITH_LTTNG)
+  # librbd.cc includes tracing/librbd.h
+  add_dependencies(rbd_api librbd-tp)
+  # io/AioCompletion.cc includes tracing/librbd.h
+  add_dependencies(rbd_internal librbd-tp)
+endif()
+if(WITH_LTTNG AND WITH_EVENTTRACE)
+  add_dependencies(rbd_internal eventtrace_tp)
+endif()
+target_link_libraries(rbd_internal PRIVATE
+  osdc)
+
+# the shared/static librbd target itself; installed with OUTPUT_NAME rbd
+# when ENABLE_SHARED is set
+add_library(librbd ${CEPH_SHARED}
+  librbd.cc)
+if(WITH_LTTNG)
+  add_dependencies(librbd librbd-tp)
+endif()
+
+target_link_libraries(librbd PRIVATE
+  rbd_internal
+  rbd_types
+  journal
+  librados
+  cls_rbd_client
+  cls_lock_client
+  cls_journal_client
+  ceph-common
+  pthread
+  ${CMAKE_DL_LIBS}
+  ${EXTRALIBS} ${GSSAPI_LIBRARIES})
+if(HAVE_UDEV)
+  target_link_libraries(librbd PRIVATE
+    udev)
+endif()
+if(ENABLE_SHARED)
+  set_target_properties(librbd PROPERTIES
+    OUTPUT_NAME rbd
+    VERSION 1.12.0
+    SOVERSION 1
+    CXX_VISIBILITY_PRESET hidden
+    VISIBILITY_INLINES_HIDDEN ON)
+  if(NOT APPLE)
+    set_property(TARGET librbd APPEND_STRING PROPERTY
+      LINK_FLAGS " -Wl,--exclude-libs,ALL")
+  endif()
+endif(ENABLE_SHARED)
+install(TARGETS librbd DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/src/librbd/ConfigWatcher.cc b/src/librbd/ConfigWatcher.cc
new file mode 100644
index 00000000..f8f17016
--- /dev/null
+++ b/src/librbd/ConfigWatcher.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ConfigWatcher.h"
+#include "common/config_obs.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/api/Config.h"
+#include <deque>
+#include <string>
+#include <vector>
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ConfigWatcher: " \
+                           << __func__ << ": "
+
+namespace librbd {
+
+// Config observer that tracks every schema key beginning with "rbd_" and
+// forwards change notifications to the owning ConfigWatcher.
+template <typename I>
+struct ConfigWatcher<I>::Observer : public md_config_obs_t {
+  ConfigWatcher<I>* m_config_watcher;
+
+  // owned copies of the tracked key names
+  std::deque<std::string> m_config_key_strs;
+  // nullptr-terminated array of c_str() pointers into m_config_key_strs,
+  // in the form returned by get_tracked_conf_keys()
+  mutable std::vector<const char*> m_config_keys;
+
+  Observer(CephContext* cct, ConfigWatcher<I>* config_watcher)
+    : m_config_watcher(config_watcher) {
+    const std::string rbd_key_prefix("rbd_");
+    auto& schema = cct->_conf.get_schema();
+    for (auto& pair : schema) {
+      // watch all "rbd_" keys for simplicity
+      if (!boost::starts_with(pair.first, rbd_key_prefix)) {
+        continue;
+      }
+
+      m_config_key_strs.emplace_back(pair.first);
+    }
+
+    // the pointer array is built only after all strings have been collected
+    m_config_keys.reserve(m_config_key_strs.size());
+    for (auto& key : m_config_key_strs) {
+      m_config_keys.emplace_back(key.c_str());
+    }
+    m_config_keys.emplace_back(nullptr);
+  }
+
+  const char** get_tracked_conf_keys() const override {
+    // never empty: the constructor always appends the nullptr terminator
+    ceph_assert(!m_config_keys.empty());
+    return &m_config_keys[0];
+  }
+
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    m_config_watcher->handle_global_config_change(changed);
+  }
+};
+
+template <typename I>
+ConfigWatcher<I>::ConfigWatcher(I& image_ctx)
+  : m_image_ctx(image_ctx) {
+}
+
+// shut_down() must have been called (or init() never called) before
+// destruction; asserted.
+template <typename I>
+ConfigWatcher<I>::~ConfigWatcher() {
+  ceph_assert(m_observer == nullptr);
+}
+
+// Creates the observer and registers it with the CephContext's config.
+template <typename I>
+void ConfigWatcher<I>::init() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  m_observer = new Observer(cct, this);
+  cct->_conf.add_observer(m_observer);
+}
+
+// Unregisters and destroys the observer; init() must have been called
+// first (asserted).
+template <typename I>
+void ConfigWatcher<I>::shut_down() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  ceph_assert(m_observer != nullptr);
+  cct->_conf.remove_observer(m_observer);
+
+  delete m_observer;
+  m_observer = nullptr;
+}
+
+// Filters the changed global keys against the image's config overrides and,
+// if any key remains, asks the image state to handle an update
+// notification.  changed_keys is taken by value because it is modified.
+template <typename I>
+void ConfigWatcher<I>::handle_global_config_change(
+    std::set<std::string> changed_keys) {
+
+  {
+    // ignore any global changes that are being overridden
+    RWLock::RLocker md_locker(m_image_ctx.md_lock);
+    for (auto& key : m_image_ctx.config_overrides) {
+      changed_keys.erase(key);
+    }
+  }
+  if (changed_keys.empty()) {
+    return;
+  }
+
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << "changed_keys=" << changed_keys << dendl;
+
+  // refresh the image to pick up any global config overrides
+  m_image_ctx.state->handle_update_notification();
+}
+
+} // namespace librbd
+
+template class librbd::ConfigWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/ConfigWatcher.h b/src/librbd/ConfigWatcher.h
new file mode 100644
index 00000000..1f10c8cb
--- /dev/null
+++ b/src/librbd/ConfigWatcher.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CONFIG_WATCHER_H
+#define CEPH_LIBRBD_CONFIG_WATCHER_H
+
+#include <set>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+// Watches global "rbd_" configuration changes on behalf of one image.
+// Lifecycle: create -> init() -> shut_down() -> destroy.
+template <typename ImageCtxT>
+class ConfigWatcher {
+public:
+  static ConfigWatcher* create(ImageCtxT& image_ctx) {
+    return new ConfigWatcher(image_ctx);
+  }
+
+  ConfigWatcher(ImageCtxT& image_ctx);
+  ~ConfigWatcher();
+
+  ConfigWatcher(const ConfigWatcher&) = delete;
+  ConfigWatcher& operator=(const ConfigWatcher&) = delete;
+
+  void init();
+  void shut_down();
+
+private:
+  struct Observer;
+
+  ImageCtxT& m_image_ctx;
+
+  // non-null only between init() and shut_down()
+  Observer* m_observer = nullptr;
+
+  void handle_global_config_change(std::set<std::string> changed);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ConfigWatcher<librbd::ImageCtx>;
+
+#endif //
CEPH_LIBRBD_CONFIG_WATCHER_H
diff --git a/src/librbd/DeepCopyRequest.cc b/src/librbd/DeepCopyRequest.cc
new file mode 100644
index 00000000..3350fa3b
--- /dev/null
+++ b/src/librbd/DeepCopyRequest.cc
@@ -0,0 +1,362 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "DeepCopyRequest.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/ImageCopyRequest.h"
+#include "librbd/deep_copy/MetadataCopyRequest.h"
+#include "librbd/deep_copy/SnapshotCopyRequest.h"
+#include "librbd/internal.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DeepCopyRequest: " \
+                           << this << " " << __func__ << ": "
+
+namespace librbd {
+
+using namespace librbd::deep_copy;
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+using librbd::util::unique_lock_name;
+
+// Stores the copy parameters.  The request starts with a reference count of
+// one; finish() drops that reference after completing on_finish.
+template <typename I>
+DeepCopyRequest<I>::DeepCopyRequest(I *src_image_ctx, I *dst_image_ctx,
+                                    librados::snap_t src_snap_id_start,
+                                    librados::snap_t src_snap_id_end,
+                                    librados::snap_t dst_snap_id_start,
+                                    bool flatten,
+                                    const ObjectNumber &object_number,
+                                    ContextWQ *work_queue, SnapSeqs *snap_seqs,
+                                    ProgressContext *prog_ctx,
+                                    Context *on_finish)
+  : RefCountedObject(dst_image_ctx->cct, 1), m_src_image_ctx(src_image_ctx),
+    m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start),
+    m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start),
+    m_flatten(flatten), m_object_number(object_number),
+    m_work_queue(work_queue), m_snap_seqs(snap_seqs), m_prog_ctx(prog_ctx),
+    m_on_finish(on_finish), m_cct(dst_image_ctx->cct),
+    m_lock(unique_lock_name("DeepCopyRequest::m_lock", this)) {
+}
+
+// Both sub-requests must already have been released by their handlers.
+template <typename I>
+DeepCopyRequest<I>::~DeepCopyRequest() {
+  ceph_assert(m_snapshot_copy_request == nullptr);
+  ceph_assert(m_image_copy_request == nullptr);
+}
+
+// Entry point.  Fails with -ENODEV if either image lacks a valid data pool
+// context, validates the requested snapshot range, then starts the snapshot
+// copy step.
+template <typename I>
+void DeepCopyRequest<I>::send() {
+  if (!m_src_image_ctx->data_ctx.is_valid()) {
+    lderr(m_cct) << "missing data pool for source image" << dendl;
+    finish(-ENODEV);
+    return;
+  }
+
+  if (!m_dst_image_ctx->data_ctx.is_valid()) {
+    lderr(m_cct) << "missing data pool for destination image" << dendl;
+    finish(-ENODEV);
+    return;
+  }
+
+  int r = validate_copy_points();
+  if (r < 0) {
+    finish(r);
+    return;
+  }
+
+  send_copy_snapshots();
+}
+
+// Marks the request canceled and forwards the cancel to whichever
+// sub-request is currently active.  Steps that have not started yet observe
+// m_canceled and finish with -ECANCELED.
+template <typename I>
+void DeepCopyRequest<I>::cancel() {
+  Mutex::Locker locker(m_lock);
+
+  ldout(m_cct, 20) << dendl;
+
+  m_canceled = true;
+
+  if (m_snapshot_copy_request != nullptr) {
+    m_snapshot_copy_request->cancel();
+  }
+
+  if (m_image_copy_request != nullptr) {
+    m_image_copy_request->cancel();
+  }
+}
+
+// Step 1: copy snapshot metadata.  An extra reference is taken on the
+// sub-request under m_lock so cancel() can safely reach it; the reference is
+// dropped in handle_copy_snapshots().
+template <typename I>
+void DeepCopyRequest<I>::send_copy_snapshots() {
+  m_lock.Lock();
+  if (m_canceled) {
+    m_lock.Unlock();
+    finish(-ECANCELED);
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  Context *ctx = create_context_callback<
+    DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_snapshots>(this);
+  m_snapshot_copy_request = SnapshotCopyRequest<I>::create(
+    m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_src_snap_id_end,
+    m_dst_snap_id_start, m_flatten, m_work_queue, m_snap_seqs, ctx);
+  m_snapshot_copy_request->get();
+  m_lock.Unlock();
+
+  m_snapshot_copy_request->send();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_snapshots(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  {
+    Mutex::Locker locker(m_lock);
+    m_snapshot_copy_request->put();
+    m_snapshot_copy_request = nullptr;
+    // a cancel that raced with a successful step still cancels the request
+    if (r == 0 && m_canceled) {
+      r = -ECANCELED;
+    }
+  }
+
+  if (r == -ECANCELED) {
+    ldout(m_cct, 10) << "snapshot copy canceled" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(m_cct) << "failed to copy snapshot metadata: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  // when copying up to HEAD, map the source HEAD onto the destination HEAD
+  if (m_src_snap_id_end == CEPH_NOSNAP) {
+    (*m_snap_seqs)[CEPH_NOSNAP] = CEPH_NOSNAP;
+  }
+
+  send_copy_image();
+}
+
+// Step 2: copy image data.  Same reference/cancel protocol as
+// send_copy_snapshots().
+template <typename I>
+void DeepCopyRequest<I>::send_copy_image() {
+  m_lock.Lock();
+  if (m_canceled) {
+    m_lock.Unlock();
+    finish(-ECANCELED);
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  Context *ctx = create_context_callback<
+    DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_image>(this);
+  m_image_copy_request = ImageCopyRequest<I>::create(
+    m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_src_snap_id_end,
+    m_dst_snap_id_start, m_flatten, m_object_number, *m_snap_seqs, m_prog_ctx,
+    ctx);
+  m_image_copy_request->get();
+  m_lock.Unlock();
+
+  m_image_copy_request->send();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_image(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  {
+    Mutex::Locker locker(m_lock);
+    m_image_copy_request->put();
+    m_image_copy_request = nullptr;
+    if (r == 0 && m_canceled) {
+      r = -ECANCELED;
+    }
+  }
+
+  if (r == -ECANCELED) {
+    ldout(m_cct, 10) << "image copy canceled" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(m_cct) << "failed to copy image: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  send_copy_object_map();
+}
+
+// Step 3: roll the destination HEAD object map back to the object map of
+// the last copied snapshot.  Skipped entirely when the object-map feature is
+// disabled (-> COPY_METADATA) or when copying up to HEAD
+// (-> REFRESH_OBJECT_MAP).
+template <typename I>
+void DeepCopyRequest<I>::send_copy_object_map() {
+  m_dst_image_ctx->owner_lock.get_read();
+  m_dst_image_ctx->snap_lock.get_read();
+
+  if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP,
+                                      m_dst_image_ctx->snap_lock)) {
+    m_dst_image_ctx->snap_lock.put_read();
+    m_dst_image_ctx->owner_lock.put_read();
+    send_copy_metadata();
+    return;
+  }
+  if (m_src_snap_id_end == CEPH_NOSNAP) {
+    m_dst_image_ctx->snap_lock.put_read();
+    m_dst_image_ctx->owner_lock.put_read();
+    send_refresh_object_map();
+    return;
+  }
+
+  ceph_assert(m_dst_image_ctx->object_map != nullptr);
+
+  ldout(m_cct, 20) << dendl;
+
+  Context *finish_op_ctx = nullptr;
+  // r is only written through start_op()'s out-parameter, and start_op() is
+  // only called when an exclusive lock object exists.  Give it a defined
+  // error value so the "lost exclusive lock" path below never passes an
+  // indeterminate result to finish().
+  int r = -EROFS;
+  if (m_dst_image_ctx->exclusive_lock != nullptr) {
+    finish_op_ctx = m_dst_image_ctx->exclusive_lock->start_op(&r);
+  }
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    m_dst_image_ctx->snap_lock.put_read();
+    m_dst_image_ctx->owner_lock.put_read();
+    finish(r);
+    return;
+  }
+
+  // rollback the object map (copy snapshot object map to HEAD)
+  RWLock::WLocker object_map_locker(m_dst_image_ctx->object_map_lock);
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_copy_object_map(r);
+      finish_op_ctx->complete(0);
+    });
+  ceph_assert(m_snap_seqs->count(m_src_snap_id_end) > 0);
+  librados::snap_t copy_snap_id = (*m_snap_seqs)[m_src_snap_id_end];
+  m_dst_image_ctx->object_map->rollback(copy_snap_id, ctx);
+  m_dst_image_ctx->snap_lock.put_read();
+  m_dst_image_ctx->owner_lock.put_read();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_object_map(int r) {
+  // log the result like every other handler in this state machine
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to roll back object map: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  send_refresh_object_map();
+}
+
+// Step 4: open a fresh HEAD object map for the destination image so it can
+// replace the in-memory one in handle_refresh_object_map().
+template <typename I>
+void DeepCopyRequest<I>::send_refresh_object_map() {
+  // see send_copy_object_map(): r must be defined even when start_op() is
+  // never called because there is no exclusive lock object
+  int r = -EROFS;
+  Context *finish_op_ctx = nullptr;
+  {
+    RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+    if (m_dst_image_ctx->exclusive_lock != nullptr) {
+      finish_op_ctx = m_dst_image_ctx->exclusive_lock->start_op(&r);
+    }
+  }
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    finish(r);
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_refresh_object_map(r);
+      finish_op_ctx->complete(0);
+    });
+  m_object_map = m_dst_image_ctx->create_object_map(CEPH_NOSNAP);
+  m_object_map->open(ctx);
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_refresh_object_map(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to open object map: " << cpp_strerror(r)
+                 << dendl;
+    delete m_object_map;
+    m_object_map = nullptr;  // do not leave a dangling member pointer
+
+    finish(r);
+    return;
+  }
+
+  {
+    // install the freshly opened map; the old one ends up in m_object_map
+    RWLock::WLocker snap_locker(m_dst_image_ctx->snap_lock);
+    RWLock::WLocker object_map_locker(m_dst_image_ctx->object_map_lock);
+    std::swap(m_dst_image_ctx->object_map, m_object_map);
+  }
+  // destroy the previous object map outside the locks
+  delete m_object_map;
+  m_object_map = nullptr;
+
+  send_copy_metadata();
+}
+
+// Step 5: copy image metadata from source to destination.
+template <typename I>
+void DeepCopyRequest<I>::send_copy_metadata() {
+  ldout(m_cct, 20) << dendl;
+
+  Context *ctx = create_context_callback<
+    DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_metadata>(this);
+  auto request = MetadataCopyRequest<I>::create(m_src_image_ctx,
+                                                m_dst_image_ctx, ctx);
+  request->send();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_metadata(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  finish(0);
+}
+
+// Checks, under the source snap_lock, that the start snapshot (unless 0) and
+// the end snapshot (unless CEPH_NOSNAP) exist in the source image.
+// Returns 0 on success or -EINVAL.
+template <typename I>
+int DeepCopyRequest<I>::validate_copy_points() {
+  RWLock::RLocker snap_locker(m_src_image_ctx->snap_lock);
+
+  if (m_src_snap_id_start != 0 &&
+      m_src_image_ctx->snap_info.find(m_src_snap_id_start) ==
+      m_src_image_ctx->snap_info.end()) {
+    lderr(m_cct) << "invalid start snap_id " << m_src_snap_id_start << dendl;
+    return -EINVAL;
+  }
+
+  if (m_src_snap_id_end != CEPH_NOSNAP &&
+      m_src_image_ctx->snap_info.find(m_src_snap_id_end) ==
+      m_src_image_ctx->snap_info.end()) {
+    lderr(m_cct) << "invalid end snap_id " << m_src_snap_id_end << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Completes the caller's context and drops the reference taken at
+// construction; the request must not be used after this returns.
+template <typename I>
+void DeepCopyRequest<I>::finish(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  put();
+}
+
+} // namespace librbd
+
+template class librbd::DeepCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/DeepCopyRequest.h b/src/librbd/DeepCopyRequest.h
new file mode 100644
index 00000000..24687c96
--- /dev/null
+++ b/src/librbd/DeepCopyRequest.h
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim:
/**
 * Asynchronous state machine that deep-copies one image into another.
 *
 * The request steps through the states shown in the diagram below and
 * finally completes the caller-supplied on_finish context. The class is
 * reference counted (it derives from RefCountedObject).
 */
template <typename ImageCtxT = ImageCtx>
class DeepCopyRequest : public RefCountedObject {
public:
  // Heap-allocates a request, forwarding every argument unchanged to the
  // constructor.
  static DeepCopyRequest* create(ImageCtxT *src_image_ctx,
                                 ImageCtxT *dst_image_ctx,
                                 librados::snap_t src_snap_id_start,
                                 librados::snap_t src_snap_id_end,
                                 librados::snap_t dst_snap_id_start,
                                 bool flatten,
                                 const deep_copy::ObjectNumber &object_number,
                                 ContextWQ *work_queue,
                                 SnapSeqs *snap_seqs,
                                 ProgressContext *prog_ctx,
                                 Context *on_finish) {
    return new DeepCopyRequest(src_image_ctx, dst_image_ctx, src_snap_id_start,
                               src_snap_id_end, dst_snap_id_start, flatten,
                               object_number, work_queue, snap_seqs, prog_ctx,
                               on_finish);
  }

  DeepCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
                  librados::snap_t src_snap_id_start,
                  librados::snap_t src_snap_id_end,
                  librados::snap_t dst_snap_id_start,
                  bool flatten, const deep_copy::ObjectNumber &object_number,
                  ContextWQ *work_queue, SnapSeqs *snap_seqs,
                  ProgressContext *prog_ctx, Context *on_finish);
  ~DeepCopyRequest();

  // Start the state machine.
  void send();
  // Request cancellation; per the diagram below a canceled image copy
  // skips straight to <finish>.
  void cancel();

private:
  /**
   * @verbatim
   *
   * <start>
   *    |
   *    v
   * COPY_SNAPSHOTS
   *    |
   *    v
   * COPY_IMAGE . . . . . . . . . . . . . .
   *    |                                 .
   *    v                                 .
   * COPY_OBJECT_MAP (skip if object      .
   *    |             map disabled)       .
   *    v                                 .
   * REFRESH_OBJECT_MAP (skip if object   . (image copy canceled)
   *    |                map disabled)    .
   *    v                                 .
   * COPY_METADATA                        .
   *    |                                 .
   *    v                                 .
   * <finish> < . . . . . . . . . . . . . .
   *
   * @endverbatim
   */

  typedef std::vector<librados::snap_t> SnapIds;
  typedef std::map<librados::snap_t, SnapIds> SnapMap;

  // Constructor arguments, stored for use by the individual states.
  ImageCtxT *m_src_image_ctx;
  ImageCtxT *m_dst_image_ctx;
  librados::snap_t m_src_snap_id_start;  // 0 is accepted without validation
  librados::snap_t m_src_snap_id_end;    // CEPH_NOSNAP is accepted without validation
  librados::snap_t m_dst_snap_id_start;
  bool m_flatten;
  deep_copy::ObjectNumber m_object_number;
  ContextWQ *m_work_queue;
  SnapSeqs *m_snap_seqs;
  ProgressContext *m_prog_ctx;
  Context *m_on_finish;                  // completed by finish()

  CephContext *m_cct;                    // used for logging
  Mutex m_lock;
  bool m_canceled = false;

  // Sub-requests for the COPY_SNAPSHOTS / COPY_IMAGE states.
  deep_copy::SnapshotCopyRequest<ImageCtxT> *m_snapshot_copy_request = nullptr;
  deep_copy::ImageCopyRequest<ImageCtxT> *m_image_copy_request = nullptr;
  // Scratch object map; after a successful refresh it is swapped with the
  // destination image's object_map and the previous one is deleted.
  decltype(ImageCtxT::object_map) m_object_map = nullptr;

  // One send_*/handle_* pair per state in the diagram above.
  void send_copy_snapshots();
  void handle_copy_snapshots(int r);

  void send_copy_image();
  void handle_copy_image(int r);

  void send_copy_object_map();
  void handle_copy_object_map(int r);

  void send_refresh_object_map();
  void handle_refresh_object_map(int r);

  void send_copy_metadata();
  void handle_copy_metadata(int r);

  // Returns -EINVAL if the start/end source snapshot ids are not present
  // in the source image's snapshot table, 0 otherwise.
  int validate_copy_points();

  // Completes m_on_finish with r and drops one reference via put().
  void finish(int r);
};
"common/dout.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ExclusiveLock: " << this << " " \ + << __func__ + +namespace librbd { + +using namespace exclusive_lock; + +template <typename I> +using ML = ManagedLock<I>; + +template <typename I> +ExclusiveLock<I>::ExclusiveLock(I &image_ctx) + : ML<I>(image_ctx.md_ctx, image_ctx.op_work_queue, image_ctx.header_oid, + image_ctx.image_watcher, managed_lock::EXCLUSIVE, + image_ctx.config.template get_val<bool>("rbd_blacklist_on_break_lock"), + image_ctx.config.template get_val<uint64_t>("rbd_blacklist_expire_seconds")), + m_image_ctx(image_ctx) { + Mutex::Locker locker(ML<I>::m_lock); + ML<I>::set_state_uninitialized(); +} + +template <typename I> +bool ExclusiveLock<I>::accept_request(OperationRequestType request_type, + int *ret_val) const { + Mutex::Locker locker(ML<I>::m_lock); + + bool accept_request = + (!ML<I>::is_state_shutdown() && ML<I>::is_state_locked() && + (m_request_blocked_count == 0 || + m_image_ctx.get_exclusive_lock_policy()->accept_blocked_request( + request_type))); + if (ret_val != nullptr) { + *ret_val = accept_request ? 
0 : m_request_blocked_ret_val; + } + + ldout(m_image_ctx.cct, 20) << "=" << accept_request << " (request_type=" + << request_type << ")" << dendl; + return accept_request; +} + +template <typename I> +bool ExclusiveLock<I>::accept_ops() const { + Mutex::Locker locker(ML<I>::m_lock); + bool accept = accept_ops(ML<I>::m_lock); + ldout(m_image_ctx.cct, 20) << "=" << accept << dendl; + return accept; +} + +template <typename I> +bool ExclusiveLock<I>::accept_ops(const Mutex &lock) const { + return (!ML<I>::is_state_shutdown() && + (ML<I>::is_state_locked() || ML<I>::is_state_post_acquiring())); +} + +template <typename I> +void ExclusiveLock<I>::block_requests(int r) { + Mutex::Locker locker(ML<I>::m_lock); + + m_request_blocked_count++; + if (m_request_blocked_ret_val == 0) { + m_request_blocked_ret_val = r; + } + + ldout(m_image_ctx.cct, 20) << "r=" << r << dendl; +} + +template <typename I> +void ExclusiveLock<I>::unblock_requests() { + Mutex::Locker locker(ML<I>::m_lock); + + ceph_assert(m_request_blocked_count > 0); + m_request_blocked_count--; + if (m_request_blocked_count == 0) { + m_request_blocked_ret_val = 0; + } + + ldout(m_image_ctx.cct, 20) << dendl; +} + +template <typename I> +int ExclusiveLock<I>::get_unlocked_op_error() const { + if (m_image_ctx.image_watcher->is_blacklisted()) { + return -EBLACKLISTED; + } + return -EROFS; +} + +template <typename I> +void ExclusiveLock<I>::init(uint64_t features, Context *on_init) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ldout(m_image_ctx.cct, 10) << dendl; + + { + Mutex::Locker locker(ML<I>::m_lock); + ML<I>::set_state_initializing(); + } + + m_image_ctx.io_work_queue->block_writes(new C_InitComplete(this, features, + on_init)); +} + +template <typename I> +void ExclusiveLock<I>::shut_down(Context *on_shut_down) { + ldout(m_image_ctx.cct, 10) << dendl; + + ML<I>::shut_down(on_shut_down); + + // if stalled in request state machine -- abort + handle_peer_notification(0); +} + +template <typename I> +void 
ExclusiveLock<I>::handle_peer_notification(int r) { + Mutex::Locker locker(ML<I>::m_lock); + if (!ML<I>::is_state_waiting_for_lock()) { + return; + } + + ldout(m_image_ctx.cct, 10) << dendl; + ceph_assert(ML<I>::is_action_acquire_lock()); + + m_acquire_lock_peer_ret_val = r; + ML<I>::execute_next_action(); +} + +template <typename I> +Context *ExclusiveLock<I>::start_op(int* ret_val) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + Mutex::Locker locker(ML<I>::m_lock); + + if (!accept_ops(ML<I>::m_lock)) { + *ret_val = get_unlocked_op_error(); + return nullptr; + } + + m_async_op_tracker.start_op(); + return new FunctionContext([this](int r) { + m_async_op_tracker.finish_op(); + }); +} + +template <typename I> +void ExclusiveLock<I>::handle_init_complete(uint64_t features) { + ldout(m_image_ctx.cct, 10) << ": features=" << features << dendl; + + { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.clone_copy_on_read || + (features & RBD_FEATURE_JOURNALING) != 0) { + m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_BOTH, true); + } else { + m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_WRITE, true); + } + } + + Mutex::Locker locker(ML<I>::m_lock); + ML<I>::set_state_unlocked(); +} + +template <typename I> +void ExclusiveLock<I>::shutdown_handler(int r, Context *on_finish) { + ldout(m_image_ctx.cct, 10) << dendl; + + { + RWLock::WLocker owner_locker(m_image_ctx.owner_lock); + m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_BOTH, false); + m_image_ctx.exclusive_lock = nullptr; + } + + m_image_ctx.io_work_queue->unblock_writes(); + m_image_ctx.image_watcher->flush(on_finish); +} + +template <typename I> +void ExclusiveLock<I>::pre_acquire_lock_handler(Context *on_finish) { + ldout(m_image_ctx.cct, 10) << dendl; + + int acquire_lock_peer_ret_val = 0; + { + Mutex::Locker locker(ML<I>::m_lock); + std::swap(acquire_lock_peer_ret_val, m_acquire_lock_peer_ret_val); + } + + if (acquire_lock_peer_ret_val == -EROFS) { + 
// Invoked with the result of the lock acquisition attempt.
//
// @param r          result of the acquire attempt
// @param on_finish  completed immediately on the error paths; on success it
//                   is parked in m_pre_post_callback until
//                   handle_post_acquired_lock() runs
template <typename I>
void ExclusiveLock<I>::post_acquire_lock_handler(int r, Context *on_finish) {
  ldout(m_image_ctx.cct, 10) << ": r=" << r << dendl;

  if (r == -EROFS) {
    // peer refused to release the exclusive lock
    on_finish->complete(r);
    return;
  } else if (r < 0) {
    // m_lock is taken manually (not via a scoped Locker) because it has to
    // be released before calling out to the image watcher below
    ML<I>::m_lock.Lock();
    ceph_assert(ML<I>::is_state_acquiring());

    // PostAcquire state machine will not run, so we need complete prepare
    m_image_ctx.state->handle_prepare_lock_complete();

    // if lock is in-use by another client, request the lock
    if (ML<I>::is_action_acquire_lock() && (r == -EBUSY || r == -EAGAIN)) {
      ML<I>::set_state_waiting_for_lock();
      ML<I>::m_lock.Unlock();

      // request the lock from a peer
      m_image_ctx.image_watcher->notify_request_lock();

      // inform manage lock that we have interrupted the state machine
      r = -ECANCELED;
    } else {
      ML<I>::m_lock.Unlock();

      // clear error if peer owns lock
      if (r == -EAGAIN) {
        r = 0;
      }
    }

    on_finish->complete(r);
    return;
  }

  // success path: remember the completion and run the PostAcquireRequest
  // state machine from the op work queue
  Mutex::Locker locker(ML<I>::m_lock);
  m_pre_post_callback = on_finish;
  using EL = ExclusiveLock<I>;
  // the two callbacks map to handle_post_acquiring_lock() and
  // handle_post_acquired_lock() respectively
  PostAcquireRequest<I> *req = PostAcquireRequest<I>::create(m_image_ctx,
    util::create_context_callback<EL, &EL::handle_post_acquiring_lock>(this),
    util::create_context_callback<EL, &EL::handle_post_acquired_lock>(this));

  m_image_ctx.op_work_queue->queue(new FunctionContext([req](int r) {
    req->send();
  }));
}
ML<I>::set_state_post_acquiring(); +} + +template <typename I> +void ExclusiveLock<I>::handle_post_acquired_lock(int r) { + ldout(m_image_ctx.cct, 10) << ": r=" << r << dendl; + + Context *on_finish = nullptr; + { + Mutex::Locker locker(ML<I>::m_lock); + ceph_assert(ML<I>::is_state_acquiring() || ML<I>::is_state_post_acquiring()); + + assert (m_pre_post_callback != nullptr); + std::swap(m_pre_post_callback, on_finish); + } + + if (r >= 0) { + m_image_ctx.perfcounter->tset(l_librbd_lock_acquired_time, + ceph_clock_now()); + m_image_ctx.image_watcher->notify_acquired_lock(); + m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_BOTH, false); + m_image_ctx.io_work_queue->unblock_writes(); + } + + on_finish->complete(r); +} + +template <typename I> +void ExclusiveLock<I>::pre_release_lock_handler(bool shutting_down, + Context *on_finish) { + ldout(m_image_ctx.cct, 10) << dendl; + Mutex::Locker locker(ML<I>::m_lock); + + PreReleaseRequest<I> *req = PreReleaseRequest<I>::create( + m_image_ctx, shutting_down, m_async_op_tracker, on_finish); + m_image_ctx.op_work_queue->queue(new FunctionContext([req](int r) { + req->send(); + })); +} + +template <typename I> +void ExclusiveLock<I>::post_release_lock_handler(bool shutting_down, int r, + Context *on_finish) { + ldout(m_image_ctx.cct, 10) << ": r=" << r << " shutting_down=" + << shutting_down << dendl; + if (!shutting_down) { + { + Mutex::Locker locker(ML<I>::m_lock); + ceph_assert(ML<I>::is_state_pre_releasing() || ML<I>::is_state_releasing()); + } + + if (r >= 0) { + m_image_ctx.image_watcher->notify_released_lock(); + } + } else { + { + RWLock::WLocker owner_locker(m_image_ctx.owner_lock); + m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_BOTH, false); + m_image_ctx.exclusive_lock = nullptr; + } + + if (r >= 0) { + m_image_ctx.io_work_queue->unblock_writes(); + } + + m_image_ctx.image_watcher->notify_released_lock(); + } + + on_finish->complete(r); +} + +template <typename I> +void 
ExclusiveLock<I>::post_reacquire_lock_handler(int r, Context *on_finish) { + ldout(m_image_ctx.cct, 10) << dendl; + if (r >= 0) { + m_image_ctx.image_watcher->notify_acquired_lock(); + } + + on_finish->complete(r); +} + +template <typename I> +struct ExclusiveLock<I>::C_InitComplete : public Context { + ExclusiveLock *exclusive_lock; + uint64_t features; + Context *on_init; + + C_InitComplete(ExclusiveLock *exclusive_lock, uint64_t features, + Context *on_init) + : exclusive_lock(exclusive_lock), features(features), on_init(on_init) { + } + void finish(int r) override { + if (r == 0) { + exclusive_lock->handle_init_complete(features); + } + on_init->complete(r); + } +}; + +} // namespace librbd + +template class librbd::ExclusiveLock<librbd::ImageCtx>; diff --git a/src/librbd/ExclusiveLock.h b/src/librbd/ExclusiveLock.h new file mode 100644 index 00000000..824cb00d --- /dev/null +++ b/src/librbd/ExclusiveLock.h @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_H + +#include "common/AsyncOpTracker.h" +#include "librbd/ManagedLock.h" +#include "librbd/exclusive_lock/Policy.h" + +namespace librbd { + +template <typename ImageCtxT = ImageCtx> +class ExclusiveLock : public ManagedLock<ImageCtxT> { +public: + static ExclusiveLock *create(ImageCtxT &image_ctx) { + return new ExclusiveLock<ImageCtxT>(image_ctx); + } + + ExclusiveLock(ImageCtxT &image_ctx); + + bool accept_request(exclusive_lock::OperationRequestType request_type, + int *ret_val) const; + bool accept_ops() const; + + void block_requests(int r); + void unblock_requests(); + + void init(uint64_t features, Context *on_init); + void shut_down(Context *on_shutdown); + + void handle_peer_notification(int r); + + int get_unlocked_op_error() const; + Context *start_op(int* ret_val); + +protected: + void shutdown_handler(int r, Context *on_finish) override; + void 
pre_acquire_lock_handler(Context *on_finish) override; + void post_acquire_lock_handler(int r, Context *on_finish) override; + void pre_release_lock_handler(bool shutting_down, + Context *on_finish) override; + void post_release_lock_handler(bool shutting_down, int r, + Context *on_finish) override; + void post_reacquire_lock_handler(int r, Context *on_finish) override; + +private: + + /** + * @verbatim + * + * <start> * * > WAITING_FOR_REGISTER --------\ + * | * (watch not registered) | + * | * | + * | * * > WAITING_FOR_PEER ------------\ + * | * (request_lock busy) | + * | * | + * | * * * * * * * * * * * * * * | + * | * | + * v (init) (try_lock/request_lock) * | + * UNINITIALIZED -------> UNLOCKED ------------------------> ACQUIRING <--/ + * ^ | + * | v + * RELEASING POST_ACQUIRING + * | | + * | | + * | (release_lock) v + * PRE_RELEASING <------------------------ LOCKED + * + * <LOCKED state> + * | + * v + * REACQUIRING -------------------------------------> <finish> + * . ^ + * . | + * . . . 
> <RELEASE action> ---> <ACQUIRE action> ---/ + * + * <UNLOCKED/LOCKED states> + * | + * | + * v + * PRE_SHUTTING_DOWN ---> SHUTTING_DOWN ---> SHUTDOWN ---> <finish> + * + * @endverbatim + */ + + struct C_InitComplete; + + ImageCtxT& m_image_ctx; + Context *m_pre_post_callback = nullptr; + + AsyncOpTracker m_async_op_tracker; + + uint32_t m_request_blocked_count = 0; + int m_request_blocked_ret_val = 0; + + int m_acquire_lock_peer_ret_val = 0; + + bool accept_ops(const Mutex &lock) const; + + void handle_init_complete(uint64_t features); + void handle_post_acquiring_lock(int r); + void handle_post_acquired_lock(int r); +}; + +} // namespace librbd + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_H diff --git a/src/librbd/Features.cc b/src/librbd/Features.cc new file mode 100644 index 00000000..8623adb4 --- /dev/null +++ b/src/librbd/Features.cc @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/lexical_cast.hpp> +#include <boost/algorithm/string.hpp> + +#include "librbd/Features.h" +#include "include/rbd/features.h" + +#include <map> +#include <vector> + +static const std::map<std::string, uint64_t> RBD_FEATURE_MAP = { + {RBD_FEATURE_NAME_LAYERING, RBD_FEATURE_LAYERING}, + {RBD_FEATURE_NAME_STRIPINGV2, RBD_FEATURE_STRIPINGV2}, + {RBD_FEATURE_NAME_EXCLUSIVE_LOCK, RBD_FEATURE_EXCLUSIVE_LOCK}, + {RBD_FEATURE_NAME_OBJECT_MAP, RBD_FEATURE_OBJECT_MAP}, + {RBD_FEATURE_NAME_FAST_DIFF, RBD_FEATURE_FAST_DIFF}, + {RBD_FEATURE_NAME_DEEP_FLATTEN, RBD_FEATURE_DEEP_FLATTEN}, + {RBD_FEATURE_NAME_JOURNALING, RBD_FEATURE_JOURNALING}, + {RBD_FEATURE_NAME_DATA_POOL, RBD_FEATURE_DATA_POOL}, + {RBD_FEATURE_NAME_OPERATIONS, RBD_FEATURE_OPERATIONS}, + {RBD_FEATURE_NAME_MIGRATING, RBD_FEATURE_MIGRATING}, +}; +static_assert((RBD_FEATURE_MIGRATING << 1) > RBD_FEATURES_ALL, + "new RBD feature added"); + + +namespace librbd { + +std::string rbd_features_to_string(uint64_t features, + std::ostream *err) +{ + 
std::string r; + for (auto& i : RBD_FEATURE_MAP) { + if (features & i.second) { + if (r.empty()) { + r += ","; + } + r += i.first; + features &= ~i.second; + } + } + if (err && features) { + *err << "ignoring unknown feature mask 0x" + << std::hex << features << std::dec; + } + return r; +} + +uint64_t rbd_features_from_string(const std::string& orig_value, + std::ostream *err) +{ + uint64_t features = 0; + std::string value = orig_value; + boost::trim(value); + + // empty string means default features + if (!value.size()) { + return RBD_FEATURES_DEFAULT; + } + + try { + // numeric? + features = boost::lexical_cast<uint64_t>(value); + + // drop unrecognized bits + uint64_t unsupported_features = (features & ~RBD_FEATURES_ALL); + if (unsupported_features != 0ull) { + features &= RBD_FEATURES_ALL; + if (err) { + *err << "ignoring unknown feature mask 0x" + << std::hex << unsupported_features << std::dec; + } + } + uint64_t internal_features = (features & RBD_FEATURES_INTERNAL); + if (internal_features != 0ULL) { + features &= ~RBD_FEATURES_INTERNAL; + if (err) { + *err << "ignoring internal feature mask 0x" + << std::hex << internal_features; + } + } + } catch (boost::bad_lexical_cast&) { + // feature name list? 
+ bool errors = false; + std::vector<std::string> feature_names; + boost::split(feature_names, value, boost::is_any_of(",")); + for (auto feature_name: feature_names) { + boost::trim(feature_name); + auto feature_it = RBD_FEATURE_MAP.find(feature_name); + if (feature_it != RBD_FEATURE_MAP.end()) { + features += feature_it->second; + } else if (err) { + if (errors) { + *err << ", "; + } else { + errors = true; + } + *err << "ignoring unknown feature " << feature_name; + } + } + } + return features; +} + +} // namespace librbd diff --git a/src/librbd/Features.h b/src/librbd/Features.h new file mode 100644 index 00000000..6a88827c --- /dev/null +++ b/src/librbd/Features.h @@ -0,0 +1,16 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <ostream> + +namespace librbd { + + std::string rbd_features_to_string(uint64_t features, + std::ostream *err); + uint64_t rbd_features_from_string(const std::string& value, + std::ostream *err); + +} // librbd diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc new file mode 100644 index 00000000..ed4bc218 --- /dev/null +++ b/src/librbd/ImageCtx.cc @@ -0,0 +1,927 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <errno.h> +#include <boost/assign/list_of.hpp> +#include <stddef.h> + +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/WorkQueue.h" +#include "common/Timer.h" + +#include "librbd/AsyncRequest.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/internal.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Journal.h" +#include "librbd/LibrbdAdminSocketHook.h" +#include "librbd/ObjectMap.h" +#include "librbd/Operations.h" +#include "librbd/operation/ResizeRequest.h" +#include "librbd/Types.h" +#include 
"librbd/Utils.h" +#include "librbd/LibrbdWriteback.h" +#include "librbd/exclusive_lock/AutomaticPolicy.h" +#include "librbd/exclusive_lock/StandardPolicy.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/journal/StandardPolicy.h" + +#include "osdc/Striper.h" +#include <boost/bind.hpp> +#include <boost/algorithm/string/predicate.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ImageCtx: " + +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; + +using ceph::bufferlist; +using librados::snap_t; +using librados::IoCtx; + +namespace librbd { + +namespace { + +class ThreadPoolSingleton : public ThreadPool { +public: + ContextWQ *op_work_queue; + + explicit ThreadPoolSingleton(CephContext *cct) + : ThreadPool(cct, "librbd::thread_pool", "tp_librbd", 1, + "rbd_op_threads"), + op_work_queue(new ContextWQ("librbd::op_work_queue", + cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"), + this)) { + start(); + } + ~ThreadPoolSingleton() override { + op_work_queue->drain(); + delete op_work_queue; + + stop(); + } +}; + +class SafeTimerSingleton : public SafeTimer { +public: + Mutex lock; + + explicit SafeTimerSingleton(CephContext *cct) + : SafeTimer(cct, lock, true), + lock("librbd::Journal::SafeTimerSingleton::lock") { + init(); + } + ~SafeTimerSingleton() { + Mutex::Locker locker(lock); + shutdown(); + } +}; + +} // anonymous namespace + + const string ImageCtx::METADATA_CONF_PREFIX = "conf_"; + + ImageCtx::ImageCtx(const string &image_name, const string &image_id, + const char *snap, IoCtx& p, bool ro) + : cct((CephContext*)p.cct()), + config(cct->_conf), + perfcounter(NULL), + snap_id(CEPH_NOSNAP), + snap_exists(true), + read_only(ro), + exclusive_locked(false), + name(image_name), + image_watcher(NULL), + journal(NULL), + 
owner_lock(util::unique_lock_name("librbd::ImageCtx::owner_lock", this)), + md_lock(util::unique_lock_name("librbd::ImageCtx::md_lock", this)), + snap_lock(util::unique_lock_name("librbd::ImageCtx::snap_lock", this)), + timestamp_lock(util::unique_lock_name("librbd::ImageCtx::timestamp_lock", this)), + parent_lock(util::unique_lock_name("librbd::ImageCtx::parent_lock", this)), + object_map_lock(util::unique_lock_name("librbd::ImageCtx::object_map_lock", this)), + async_ops_lock(util::unique_lock_name("librbd::ImageCtx::async_ops_lock", this)), + copyup_list_lock(util::unique_lock_name("librbd::ImageCtx::copyup_list_lock", this)), + completed_reqs_lock(util::unique_lock_name("librbd::ImageCtx::completed_reqs_lock", this)), + extra_read_flags(0), + old_format(false), + order(0), size(0), features(0), + format_string(NULL), + id(image_id), parent(NULL), + stripe_unit(0), stripe_count(0), flags(0), + readahead(), + total_bytes_read(0), + state(new ImageState<>(this)), + operations(new Operations<>(*this)), + exclusive_lock(nullptr), object_map(nullptr), + io_work_queue(nullptr), op_work_queue(nullptr), + asok_hook(nullptr), + trace_endpoint("librbd") + { + md_ctx.dup(p); + data_ctx.dup(p); + if (snap) + snap_name = snap; + + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(&header, 0, sizeof(header)); + + ThreadPool *thread_pool; + get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + io_work_queue = new io::ImageRequestWQ<>( + this, "librbd::io_work_queue", + cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"), + thread_pool); + io_object_dispatcher = new io::ObjectDispatcher<>(this); + + if (cct->_conf.get_val<bool>("rbd_auto_exclusive_lock_until_manual_request")) { + exclusive_lock_policy = new exclusive_lock::AutomaticPolicy(this); + } else { + exclusive_lock_policy = new exclusive_lock::StandardPolicy(this); + } + journal_policy = new journal::StandardPolicy<ImageCtx>(this); + } + + ImageCtx::ImageCtx(const string &image_name, const string &image_id, + uint64_t snap_id, IoCtx& p, bool ro) + : ImageCtx(image_name, image_id, "", p, ro) { + open_snap_id = snap_id; + } + + ImageCtx::~ImageCtx() { + ceph_assert(config_watcher == nullptr); + ceph_assert(image_watcher == NULL); + ceph_assert(exclusive_lock == NULL); + ceph_assert(object_map == NULL); + ceph_assert(journal == NULL); + ceph_assert(asok_hook == NULL); + + if (perfcounter) { + perf_stop(); + } + delete[] format_string; + + md_ctx.aio_flush(); + if (data_ctx.is_valid()) { + data_ctx.aio_flush(); + } + io_work_queue->drain(); + + delete io_object_dispatcher; + + delete journal_policy; + delete exclusive_lock_policy; + delete io_work_queue; + delete operations; + delete state; + } + + void ImageCtx::init() { + ceph_assert(!header_oid.empty()); + ceph_assert(old_format || !id.empty()); + + asok_hook = new LibrbdAdminSocketHook(this); + + string pname = string("librbd-") + id + string("-") + + md_ctx.get_pool_name() + string("-") + name; + if (!snap_name.empty()) { + pname += "-"; + pname += snap_name; + } + + trace_endpoint.copy_name(pname); + perf_start(pname); + + ceph_assert(image_watcher == NULL); + image_watcher = new ImageWatcher<>(*this); + } + + void ImageCtx::shutdown() { + delete image_watcher; + image_watcher = nullptr; + + delete asok_hook; + 
asok_hook = nullptr; + } + + void ImageCtx::init_layout(int64_t pool_id) + { + if (stripe_unit == 0 || stripe_count == 0) { + stripe_unit = 1ull << order; + stripe_count = 1; + } + + vector<uint64_t> alignments; + alignments.push_back(stripe_count << order); // object set (in file striping terminology) + alignments.push_back(stripe_unit * stripe_count); // stripe + alignments.push_back(stripe_unit); // stripe unit + readahead.set_alignments(alignments); + + layout = file_layout_t(); + layout.stripe_unit = stripe_unit; + layout.stripe_count = stripe_count; + layout.object_size = 1ull << order; + layout.pool_id = pool_id; // FIXME: pool id overflow? + + delete[] format_string; + size_t len = object_prefix.length() + 16; + format_string = new char[len]; + if (old_format) { + snprintf(format_string, len, "%s.%%012llx", object_prefix.c_str()); + } else { + snprintf(format_string, len, "%s.%%016llx", object_prefix.c_str()); + } + + ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit + << " stripe_count " << stripe_count + << " object_size " << layout.object_size + << " prefix " << object_prefix + << " format " << format_string + << dendl; + } + + void ImageCtx::perf_start(string name) { + auto perf_prio = PerfCountersBuilder::PRIO_DEBUGONLY; + if (child == nullptr) { + // ensure top-level IO stats are exported for librbd daemons + perf_prio = PerfCountersBuilder::PRIO_USEFUL; + } + + PerfCountersBuilder plb(cct, name, l_librbd_first, l_librbd_last); + + plb.add_u64_counter(l_librbd_rd, "rd", "Reads", "r", perf_prio); + plb.add_u64_counter(l_librbd_rd_bytes, "rd_bytes", "Data size in reads", + "rb", perf_prio, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_rd_latency, "rd_latency", "Latency of reads", + "rl", perf_prio); + plb.add_u64_counter(l_librbd_wr, "wr", "Writes", "w", perf_prio); + plb.add_u64_counter(l_librbd_wr_bytes, "wr_bytes", "Written data", + "wb", perf_prio, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_wr_latency, "wr_latency", "Write 
latency", + "wl", perf_prio); + plb.add_u64_counter(l_librbd_discard, "discard", "Discards"); + plb.add_u64_counter(l_librbd_discard_bytes, "discard_bytes", "Discarded data", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_discard_latency, "discard_latency", "Discard latency"); + plb.add_u64_counter(l_librbd_flush, "flush", "Flushes"); + plb.add_time_avg(l_librbd_flush_latency, "flush_latency", "Latency of flushes"); + plb.add_u64_counter(l_librbd_ws, "ws", "WriteSames"); + plb.add_u64_counter(l_librbd_ws_bytes, "ws_bytes", "WriteSame data", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_ws_latency, "ws_latency", "WriteSame latency"); + plb.add_u64_counter(l_librbd_cmp, "cmp", "CompareAndWrites"); + plb.add_u64_counter(l_librbd_cmp_bytes, "cmp_bytes", "Data size in cmps", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_cmp_latency, "cmp_latency", "Latency of cmps"); + plb.add_u64_counter(l_librbd_snap_create, "snap_create", "Snap creations"); + plb.add_u64_counter(l_librbd_snap_remove, "snap_remove", "Snap removals"); + plb.add_u64_counter(l_librbd_snap_rollback, "snap_rollback", "Snap rollbacks"); + plb.add_u64_counter(l_librbd_snap_rename, "snap_rename", "Snap rename"); + plb.add_u64_counter(l_librbd_notify, "notify", "Updated header notifications"); + plb.add_u64_counter(l_librbd_resize, "resize", "Resizes"); + plb.add_u64_counter(l_librbd_readahead, "readahead", "Read ahead"); + plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes", "Data size in read ahead", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_librbd_invalidate_cache, "invalidate_cache", "Cache invalidates"); + + plb.add_time(l_librbd_opened_time, "opened_time", "Opened time", + "ots", perf_prio); + plb.add_time(l_librbd_lock_acquired_time, "lock_acquired_time", + "Lock acquired time", "lats", perf_prio); + + perfcounter = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perfcounter); + + perfcounter->tset(l_librbd_opened_time, 
ceph_clock_now()); + } + + void ImageCtx::perf_stop() { + ceph_assert(perfcounter); + cct->get_perfcounters_collection()->remove(perfcounter); + delete perfcounter; + } + + void ImageCtx::set_read_flag(unsigned flag) { + extra_read_flags |= flag; + } + + int ImageCtx::get_read_flags(snap_t snap_id) { + int flags = librados::OPERATION_NOFLAG | extra_read_flags; + if (snap_id == LIBRADOS_SNAP_HEAD) + return flags; + + if (config.get_val<bool>("rbd_balance_snap_reads")) + flags |= librados::OPERATION_BALANCE_READS; + else if (config.get_val<bool>("rbd_localize_snap_reads")) + flags |= librados::OPERATION_LOCALIZE_READS; + return flags; + } + + int ImageCtx::snap_set(uint64_t in_snap_id) { + ceph_assert(snap_lock.is_wlocked()); + auto it = snap_info.find(in_snap_id); + if (in_snap_id != CEPH_NOSNAP && it != snap_info.end()) { + snap_id = in_snap_id; + snap_namespace = it->second.snap_namespace; + snap_name = it->second.name; + snap_exists = true; + if (data_ctx.is_valid()) { + data_ctx.snap_set_read(snap_id); + } + return 0; + } + return -ENOENT; + } + + void ImageCtx::snap_unset() + { + ceph_assert(snap_lock.is_wlocked()); + snap_id = CEPH_NOSNAP; + snap_namespace = {}; + snap_name = ""; + snap_exists = true; + if (data_ctx.is_valid()) { + data_ctx.snap_set_read(snap_id); + } + } + + snap_t ImageCtx::get_snap_id(const cls::rbd::SnapshotNamespace& in_snap_namespace, + const string& in_snap_name) const + { + ceph_assert(snap_lock.is_locked()); + auto it = snap_ids.find({in_snap_namespace, in_snap_name}); + if (it != snap_ids.end()) { + return it->second; + } + return CEPH_NOSNAP; + } + + const SnapInfo* ImageCtx::get_snap_info(snap_t in_snap_id) const + { + ceph_assert(snap_lock.is_locked()); + map<snap_t, SnapInfo>::const_iterator it = + snap_info.find(in_snap_id); + if (it != snap_info.end()) + return &it->second; + return nullptr; + } + + int ImageCtx::get_snap_name(snap_t in_snap_id, + string *out_snap_name) const + { + ceph_assert(snap_lock.is_locked()); + const 
SnapInfo *info = get_snap_info(in_snap_id); + if (info) { + *out_snap_name = info->name; + return 0; + } + return -ENOENT; + } + + int ImageCtx::get_snap_namespace(snap_t in_snap_id, + cls::rbd::SnapshotNamespace *out_snap_namespace) const + { + ceph_assert(snap_lock.is_locked()); + const SnapInfo *info = get_snap_info(in_snap_id); + if (info) { + *out_snap_namespace = info->snap_namespace; + return 0; + } + return -ENOENT; + } + + int ImageCtx::get_parent_spec(snap_t in_snap_id, + cls::rbd::ParentImageSpec *out_pspec) const + { + const SnapInfo *info = get_snap_info(in_snap_id); + if (info) { + *out_pspec = info->parent.spec; + return 0; + } + return -ENOENT; + } + + uint64_t ImageCtx::get_current_size() const + { + ceph_assert(snap_lock.is_locked()); + return size; + } + + uint64_t ImageCtx::get_object_size() const + { + return 1ull << order; + } + + string ImageCtx::get_object_name(uint64_t num) const { + char buf[object_prefix.length() + 32]; + snprintf(buf, sizeof(buf), format_string, num); + return string(buf); + } + + uint64_t ImageCtx::get_stripe_unit() const + { + return stripe_unit; + } + + uint64_t ImageCtx::get_stripe_count() const + { + return stripe_count; + } + + uint64_t ImageCtx::get_stripe_period() const + { + return stripe_count * (1ull << order); + } + + utime_t ImageCtx::get_create_timestamp() const + { + return create_timestamp; + } + + utime_t ImageCtx::get_access_timestamp() const + { + return access_timestamp; + } + + utime_t ImageCtx::get_modify_timestamp() const + { + return modify_timestamp; + } + + void ImageCtx::set_access_timestamp(utime_t at) + { + ceph_assert(timestamp_lock.is_wlocked()); + access_timestamp = at; + } + + void ImageCtx::set_modify_timestamp(utime_t mt) + { + ceph_assert(timestamp_lock.is_locked()); + modify_timestamp = mt; + } + + int ImageCtx::is_snap_protected(snap_t in_snap_id, + bool *is_protected) const + { + ceph_assert(snap_lock.is_locked()); + const SnapInfo *info = get_snap_info(in_snap_id); + if (info) { + 
*is_protected = + (info->protection_status == RBD_PROTECTION_STATUS_PROTECTED); + return 0; + } + return -ENOENT; + } + + int ImageCtx::is_snap_unprotected(snap_t in_snap_id, + bool *is_unprotected) const + { + ceph_assert(snap_lock.is_locked()); + const SnapInfo *info = get_snap_info(in_snap_id); + if (info) { + *is_unprotected = + (info->protection_status == RBD_PROTECTION_STATUS_UNPROTECTED); + return 0; + } + return -ENOENT; + } + + void ImageCtx::add_snap(cls::rbd::SnapshotNamespace in_snap_namespace, + string in_snap_name, + snap_t id, uint64_t in_size, + const ParentImageInfo &parent, + uint8_t protection_status, uint64_t flags, + utime_t timestamp) + { + ceph_assert(snap_lock.is_wlocked()); + snaps.push_back(id); + SnapInfo info(in_snap_name, in_snap_namespace, + in_size, parent, protection_status, flags, timestamp); + snap_info.insert({id, info}); + snap_ids.insert({{in_snap_namespace, in_snap_name}, id}); + } + + void ImageCtx::rm_snap(cls::rbd::SnapshotNamespace in_snap_namespace, + string in_snap_name, + snap_t id) + { + ceph_assert(snap_lock.is_wlocked()); + snaps.erase(std::remove(snaps.begin(), snaps.end(), id), snaps.end()); + snap_info.erase(id); + snap_ids.erase({in_snap_namespace, in_snap_name}); + } + + uint64_t ImageCtx::get_image_size(snap_t in_snap_id) const + { + ceph_assert(snap_lock.is_locked()); + if (in_snap_id == CEPH_NOSNAP) { + if (!resize_reqs.empty() && + resize_reqs.front()->shrinking()) { + return resize_reqs.front()->get_image_size(); + } + return size; + } + + const SnapInfo *info = get_snap_info(in_snap_id); + if (info) { + return info->size; + } + return 0; + } + + uint64_t ImageCtx::get_object_count(snap_t in_snap_id) const { + ceph_assert(snap_lock.is_locked()); + uint64_t image_size = get_image_size(in_snap_id); + return Striper::get_num_objects(layout, image_size); + } + + bool ImageCtx::test_features(uint64_t features) const + { + RWLock::RLocker l(snap_lock); + return test_features(features, snap_lock); + } + + bool 
ImageCtx::test_features(uint64_t in_features, + const RWLock &in_snap_lock) const + { + ceph_assert(snap_lock.is_locked()); + return ((features & in_features) == in_features); + } + + bool ImageCtx::test_op_features(uint64_t in_op_features) const + { + RWLock::RLocker snap_locker(snap_lock); + return test_op_features(in_op_features, snap_lock); + } + + bool ImageCtx::test_op_features(uint64_t in_op_features, + const RWLock &in_snap_lock) const + { + ceph_assert(snap_lock.is_locked()); + return ((op_features & in_op_features) == in_op_features); + } + + int ImageCtx::get_flags(librados::snap_t _snap_id, uint64_t *_flags) const + { + ceph_assert(snap_lock.is_locked()); + if (_snap_id == CEPH_NOSNAP) { + *_flags = flags; + return 0; + } + const SnapInfo *info = get_snap_info(_snap_id); + if (info) { + *_flags = info->flags; + return 0; + } + return -ENOENT; + } + + int ImageCtx::test_flags(librados::snap_t in_snap_id, + uint64_t flags, bool *flags_set) const + { + RWLock::RLocker l(snap_lock); + return test_flags(in_snap_id, flags, snap_lock, flags_set); + } + + int ImageCtx::test_flags(librados::snap_t in_snap_id, + uint64_t flags, const RWLock &in_snap_lock, + bool *flags_set) const + { + ceph_assert(snap_lock.is_locked()); + uint64_t snap_flags; + int r = get_flags(in_snap_id, &snap_flags); + if (r < 0) { + return r; + } + *flags_set = ((snap_flags & flags) == flags); + return 0; + } + + int ImageCtx::update_flags(snap_t in_snap_id, uint64_t flag, bool enabled) + { + ceph_assert(snap_lock.is_wlocked()); + uint64_t *_flags; + if (in_snap_id == CEPH_NOSNAP) { + _flags = &flags; + } else { + map<snap_t, SnapInfo>::iterator it = snap_info.find(in_snap_id); + if (it == snap_info.end()) { + return -ENOENT; + } + _flags = &it->second.flags; + } + + if (enabled) { + (*_flags) |= flag; + } else { + (*_flags) &= ~flag; + } + return 0; + } + + const ParentImageInfo* ImageCtx::get_parent_info(snap_t in_snap_id) const + { + ceph_assert(snap_lock.is_locked()); + 
ceph_assert(parent_lock.is_locked()); + if (in_snap_id == CEPH_NOSNAP) + return &parent_md; + const SnapInfo *info = get_snap_info(in_snap_id); + if (info) + return &info->parent; + return NULL; + } + + int64_t ImageCtx::get_parent_pool_id(snap_t in_snap_id) const + { + const auto info = get_parent_info(in_snap_id); + if (info) + return info->spec.pool_id; + return -1; + } + + string ImageCtx::get_parent_image_id(snap_t in_snap_id) const + { + const auto info = get_parent_info(in_snap_id); + if (info) + return info->spec.image_id; + return ""; + } + + uint64_t ImageCtx::get_parent_snap_id(snap_t in_snap_id) const + { + const auto info = get_parent_info(in_snap_id); + if (info) + return info->spec.snap_id; + return CEPH_NOSNAP; + } + + int ImageCtx::get_parent_overlap(snap_t in_snap_id, uint64_t *overlap) const + { + ceph_assert(snap_lock.is_locked()); + const auto info = get_parent_info(in_snap_id); + if (info) { + *overlap = info->overlap; + return 0; + } + return -ENOENT; + } + + void ImageCtx::register_watch(Context *on_finish) { + ceph_assert(image_watcher != NULL); + image_watcher->register_watch(on_finish); + } + + uint64_t ImageCtx::prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx, + uint64_t overlap) + { + // drop extents completely beyond the overlap + while (!objectx.empty() && objectx.back().first >= overlap) + objectx.pop_back(); + + // trim final overlapping extent + if (!objectx.empty() && objectx.back().first + objectx.back().second > overlap) + objectx.back().second = overlap - objectx.back().first; + + uint64_t len = 0; + for (vector<pair<uint64_t,uint64_t> >::iterator p = objectx.begin(); + p != objectx.end(); + ++p) + len += p->second; + ldout(cct, 10) << "prune_parent_extents image overlap " << overlap + << ", object overlap " << len + << " from image extents " << objectx << dendl; + return len; + } + + void ImageCtx::cancel_async_requests() { + C_SaferCond ctx; + cancel_async_requests(&ctx); + ctx.wait(); + } + + void 
ImageCtx::cancel_async_requests(Context *on_finish) { + { + Mutex::Locker async_ops_locker(async_ops_lock); + if (!async_requests.empty()) { + ldout(cct, 10) << "canceling async requests: count=" + << async_requests.size() << dendl; + for (auto req : async_requests) { + ldout(cct, 10) << "canceling async request: " << req << dendl; + req->cancel(); + } + async_requests_waiters.push_back(on_finish); + return; + } + } + + on_finish->complete(0); + } + + void ImageCtx::clear_pending_completions() { + Mutex::Locker l(completed_reqs_lock); + ldout(cct, 10) << "clear pending AioCompletion: count=" + << completed_reqs.size() << dendl; + completed_reqs.clear(); + } + + void ImageCtx::apply_metadata(const std::map<std::string, bufferlist> &meta, + bool thread_safe) { + ldout(cct, 20) << __func__ << dendl; + + RWLock::WLocker md_locker(md_lock); + + // reset settings back to global defaults + for (auto& key : config_overrides) { + std::string value; + int r = cct->_conf.get_val(key, &value); + ceph_assert(r == 0); + + config.set_val(key, value); + } + config_overrides.clear(); + + // extract config overrides + for (auto meta_pair : meta) { + if (!boost::starts_with(meta_pair.first, METADATA_CONF_PREFIX)) { + continue; + } + + std::string key = meta_pair.first.substr(METADATA_CONF_PREFIX.size()); + if (!boost::starts_with(key, "rbd_")) { + // ignore non-RBD configuration keys + // TODO use option schema to determine applicable subsystem + ldout(cct, 0) << __func__ << ": ignoring config " << key << dendl; + continue; + } + + if (config.find_option(key) != nullptr) { + std::string val(meta_pair.second.c_str(), meta_pair.second.length()); + int r = config.set_val(key, val); + if (r >= 0) { + ldout(cct, 20) << __func__ << ": " << key << "=" << val << dendl; + config_overrides.insert(key); + } else { + lderr(cct) << __func__ << ": failed to set config " << key << " " + << "with value " << val << ": " << cpp_strerror(r) + << dendl; + } + } + } + + md_locker.unlock(); + +#define 
ASSIGN_OPTION(param, type) \ + param = config.get_val<type>("rbd_"#param) + + bool skip_partial_discard = true; + ASSIGN_OPTION(non_blocking_aio, bool); + ASSIGN_OPTION(cache, bool); + ASSIGN_OPTION(cache_writethrough_until_flush, bool); + ASSIGN_OPTION(cache_max_dirty, Option::size_t); + ASSIGN_OPTION(sparse_read_threshold_bytes, Option::size_t); + ASSIGN_OPTION(readahead_max_bytes, Option::size_t); + ASSIGN_OPTION(readahead_disable_after_bytes, Option::size_t); + ASSIGN_OPTION(clone_copy_on_read, bool); + ASSIGN_OPTION(enable_alloc_hint, bool); + ASSIGN_OPTION(mirroring_replay_delay, uint64_t); + ASSIGN_OPTION(mtime_update_interval, uint64_t); + ASSIGN_OPTION(atime_update_interval, uint64_t); + ASSIGN_OPTION(skip_partial_discard, bool); + ASSIGN_OPTION(discard_granularity_bytes, uint64_t); + ASSIGN_OPTION(blkin_trace_all, bool); + +#undef ASSIGN_OPTION + + if (sparse_read_threshold_bytes == 0) { + sparse_read_threshold_bytes = get_object_size(); + } + if (!skip_partial_discard) { + discard_granularity_bytes = 0; + } + + alloc_hint_flags = 0; + auto compression_hint = config.get_val<std::string>("rbd_compression_hint"); + if (compression_hint == "compressible") { + alloc_hint_flags |= librados::ALLOC_HINT_FLAG_COMPRESSIBLE; + } else if (compression_hint == "incompressible") { + alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE; + } + + io_work_queue->apply_qos_schedule_tick_min( + config.get_val<uint64_t>("rbd_qos_schedule_tick_min")); + + io_work_queue->apply_qos_limit( + RBD_QOS_IOPS_THROTTLE, + config.get_val<uint64_t>("rbd_qos_iops_limit"), + config.get_val<uint64_t>("rbd_qos_iops_burst")); + io_work_queue->apply_qos_limit( + RBD_QOS_BPS_THROTTLE, + config.get_val<uint64_t>("rbd_qos_bps_limit"), + config.get_val<uint64_t>("rbd_qos_bps_burst")); + io_work_queue->apply_qos_limit( + RBD_QOS_READ_IOPS_THROTTLE, + config.get_val<uint64_t>("rbd_qos_read_iops_limit"), + config.get_val<uint64_t>("rbd_qos_read_iops_burst")); + io_work_queue->apply_qos_limit( 
+ RBD_QOS_WRITE_IOPS_THROTTLE, + config.get_val<uint64_t>("rbd_qos_write_iops_limit"), + config.get_val<uint64_t>("rbd_qos_write_iops_burst")); + io_work_queue->apply_qos_limit( + RBD_QOS_READ_BPS_THROTTLE, + config.get_val<uint64_t>("rbd_qos_read_bps_limit"), + config.get_val<uint64_t>("rbd_qos_read_bps_burst")); + io_work_queue->apply_qos_limit( + RBD_QOS_WRITE_BPS_THROTTLE, + config.get_val<uint64_t>("rbd_qos_write_bps_limit"), + config.get_val<uint64_t>("rbd_qos_write_bps_burst")); + } + + ExclusiveLock<ImageCtx> *ImageCtx::create_exclusive_lock() { + return new ExclusiveLock<ImageCtx>(*this); + } + + ObjectMap<ImageCtx> *ImageCtx::create_object_map(uint64_t snap_id) { + return new ObjectMap<ImageCtx>(*this, snap_id); + } + + Journal<ImageCtx> *ImageCtx::create_journal() { + return new Journal<ImageCtx>(*this); + } + + void ImageCtx::set_image_name(const std::string &image_name) { + // update the name so rename can be invoked repeatedly + RWLock::RLocker owner_locker(owner_lock); + RWLock::WLocker snap_locker(snap_lock); + name = image_name; + if (old_format) { + header_oid = util::old_header_name(image_name); + } + } + + void ImageCtx::notify_update() { + state->handle_update_notification(); + ImageWatcher<>::notify_header_update(md_ctx, header_oid); + } + + void ImageCtx::notify_update(Context *on_finish) { + state->handle_update_notification(); + image_watcher->notify_header_update(on_finish); + } + + exclusive_lock::Policy *ImageCtx::get_exclusive_lock_policy() const { + ceph_assert(owner_lock.is_locked()); + ceph_assert(exclusive_lock_policy != nullptr); + return exclusive_lock_policy; + } + + void ImageCtx::set_exclusive_lock_policy(exclusive_lock::Policy *policy) { + ceph_assert(owner_lock.is_wlocked()); + ceph_assert(policy != nullptr); + delete exclusive_lock_policy; + exclusive_lock_policy = policy; + } + + journal::Policy *ImageCtx::get_journal_policy() const { + ceph_assert(snap_lock.is_locked()); + ceph_assert(journal_policy != nullptr); + return 
journal_policy; + } + + void ImageCtx::set_journal_policy(journal::Policy *policy) { + ceph_assert(snap_lock.is_wlocked()); + ceph_assert(policy != nullptr); + delete journal_policy; + journal_policy = policy; + } + + bool ImageCtx::is_writeback_cache_enabled() const { + return (cache && cache_max_dirty > 0); + } + + void ImageCtx::get_thread_pool_instance(CephContext *cct, + ThreadPool **thread_pool, + ContextWQ **op_work_queue) { + auto thread_pool_singleton = + &cct->lookup_or_create_singleton_object<ThreadPoolSingleton>( + "librbd::thread_pool", false, cct); + *thread_pool = thread_pool_singleton; + *op_work_queue = thread_pool_singleton->op_work_queue; + } + + void ImageCtx::get_timer_instance(CephContext *cct, SafeTimer **timer, + Mutex **timer_lock) { + auto safe_timer_singleton = + &cct->lookup_or_create_singleton_object<SafeTimerSingleton>( + "librbd::journal::safe_timer", false, cct); + *timer = safe_timer_singleton; + *timer_lock = &safe_timer_singleton->lock; + } +} diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h new file mode 100644 index 00000000..df22a0e8 --- /dev/null +++ b/src/librbd/ImageCtx.h @@ -0,0 +1,331 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_IMAGECTX_H +#define CEPH_LIBRBD_IMAGECTX_H + +#include "include/int_types.h" + +#include <list> +#include <map> +#include <set> +#include <string> +#include <vector> + +#include "common/config_proxy.h" +#include "common/event_socket.h" +#include "common/Mutex.h" +#include "common/Readahead.h" +#include "common/RWLock.h" +#include "common/snap_types.h" +#include "common/zipkin_trace.h" + +#include "include/buffer_fwd.h" +#include "include/rbd/librbd.hpp" +#include "include/rbd_types.h" +#include "include/types.h" +#include "include/xlist.h" + +#include "cls/rbd/cls_rbd_types.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/AsyncRequest.h" +#include "librbd/Types.h" + +class CephContext; +class 
ContextWQ; +class Finisher; +class PerfCounters; +class ThreadPool; +class SafeTimer; + +namespace librbd { + + template <typename> class ConfigWatcher; + template <typename> class ExclusiveLock; + template <typename> class ImageState; + template <typename> class ImageWatcher; + template <typename> class Journal; + class LibrbdAdminSocketHook; + template <typename> class ObjectMap; + template <typename> class Operations; + + namespace cache { struct ImageCache; } + namespace exclusive_lock { struct Policy; } + namespace io { + class AioCompletion; + class AsyncOperation; + template <typename> class CopyupRequest; + template <typename> class ImageRequestWQ; + template <typename> class ObjectDispatcher; + } + namespace journal { struct Policy; } + + namespace operation { + template <typename> class ResizeRequest; + } + + struct ImageCtx { + static const string METADATA_CONF_PREFIX; + + CephContext *cct; + ConfigProxy config; + std::set<std::string> config_overrides; + + PerfCounters *perfcounter; + struct rbd_obj_header_ondisk header; + ::SnapContext snapc; + std::vector<librados::snap_t> snaps; // this mirrors snapc.snaps, but is in + // a format librados can understand + std::map<librados::snap_t, SnapInfo> snap_info; + std::map<std::pair<cls::rbd::SnapshotNamespace, std::string>, librados::snap_t> snap_ids; + uint64_t open_snap_id = CEPH_NOSNAP; + uint64_t snap_id; + bool snap_exists; // false if our snap_id was deleted + // whether the image was opened read-only. 
cannot be changed after opening + bool read_only; + + std::map<rados::cls::lock::locker_id_t, + rados::cls::lock::locker_info_t> lockers; + bool exclusive_locked; + std::string lock_tag; + + std::string name; + cls::rbd::SnapshotNamespace snap_namespace; + std::string snap_name; + IoCtx data_ctx, md_ctx; + + ConfigWatcher<ImageCtx> *config_watcher = nullptr; + ImageWatcher<ImageCtx> *image_watcher; + Journal<ImageCtx> *journal; + + /** + * Lock ordering: + * + * owner_lock, md_lock, snap_lock, parent_lock, + * object_map_lock, async_op_lock, timestamp_lock + */ + RWLock owner_lock; // protects exclusive lock leadership updates + RWLock md_lock; // protects access to the mutable image metadata that + // isn't guarded by other locks below, and blocks writes + // when held exclusively, so snapshots can be consistent. + // Fields guarded include: + // total_bytes_read + // exclusive_locked + // lock_tag + // lockers + RWLock snap_lock; // protects snapshot-related member variables, + // features (and associated helper classes), and flags + RWLock timestamp_lock; + RWLock parent_lock; // protects parent_md and parent + RWLock object_map_lock; // protects object map updates and object_map itself + Mutex async_ops_lock; // protects async_ops and async_requests + Mutex copyup_list_lock; // protects copyup_waiting_list + Mutex completed_reqs_lock; // protects completed_reqs + + unsigned extra_read_flags; + + bool old_format; + uint8_t order; + uint64_t size; + uint64_t features; + std::string object_prefix; + char *format_string; + std::string header_oid; + std::string id; // only used for new-format images + ParentImageInfo parent_md; + ImageCtx *parent; + ImageCtx *child = nullptr; + MigrationInfo migration_info; + cls::rbd::GroupSpec group_spec; + uint64_t stripe_unit, stripe_count; + uint64_t flags; + uint64_t op_features = 0; + bool operations_disabled = false; + utime_t create_timestamp; + utime_t access_timestamp; + utime_t modify_timestamp; + + file_layout_t layout; 
+ + cache::ImageCache *image_cache = nullptr; + + Readahead readahead; + uint64_t total_bytes_read; + + std::map<uint64_t, io::CopyupRequest<ImageCtx>*> copyup_list; + + xlist<io::AsyncOperation*> async_ops; + xlist<AsyncRequest<>*> async_requests; + std::list<Context*> async_requests_waiters; + + ImageState<ImageCtx> *state; + Operations<ImageCtx> *operations; + + ExclusiveLock<ImageCtx> *exclusive_lock; + ObjectMap<ImageCtx> *object_map; + + xlist<operation::ResizeRequest<ImageCtx>*> resize_reqs; + + io::ImageRequestWQ<ImageCtx> *io_work_queue; + io::ObjectDispatcher<ImageCtx> *io_object_dispatcher = nullptr; + + xlist<io::AioCompletion*> completed_reqs; + EventSocket event_socket; + + ContextWQ *op_work_queue; + + bool ignore_migrating = false; + + /// Cached latency-sensitive configuration settings + bool non_blocking_aio; + bool cache; + bool cache_writethrough_until_flush; + uint64_t cache_max_dirty; + uint64_t sparse_read_threshold_bytes; + uint64_t readahead_max_bytes; + uint64_t readahead_disable_after_bytes; + bool clone_copy_on_read; + bool enable_alloc_hint; + uint32_t alloc_hint_flags = 0U; + uint32_t discard_granularity_bytes = 0; + bool blkin_trace_all; + uint64_t mirroring_replay_delay; + uint64_t mtime_update_interval; + uint64_t atime_update_interval; + + LibrbdAdminSocketHook *asok_hook; + + exclusive_lock::Policy *exclusive_lock_policy = nullptr; + journal::Policy *journal_policy = nullptr; + + ZTracer::Endpoint trace_endpoint; + + // unit test mock helpers + static ImageCtx* create(const std::string &image_name, + const std::string &image_id, + const char *snap, IoCtx& p, bool read_only) { + return new ImageCtx(image_name, image_id, snap, p, read_only); + } + static ImageCtx* create(const std::string &image_name, + const std::string &image_id, + librados::snap_t snap_id, IoCtx& p, + bool read_only) { + return new ImageCtx(image_name, image_id, snap_id, p, read_only); + } + void destroy() { + delete this; + } + + /** + * Either image_name or 
image_id must be set. + * If id is not known, pass the empty std::string, + * and init() will look it up. + */ + ImageCtx(const std::string &image_name, const std::string &image_id, + const char *snap, IoCtx& p, bool read_only); + ImageCtx(const std::string &image_name, const std::string &image_id, + librados::snap_t snap_id, IoCtx& p, bool read_only); + ~ImageCtx(); + void init(); + void shutdown(); + void init_layout(int64_t pool_id); + void perf_start(std::string name); + void perf_stop(); + void set_read_flag(unsigned flag); + int get_read_flags(librados::snap_t snap_id); + int snap_set(uint64_t snap_id); + void snap_unset(); + librados::snap_t get_snap_id(const cls::rbd::SnapshotNamespace& in_snap_namespace, + const std::string& in_snap_name) const; + const SnapInfo* get_snap_info(librados::snap_t in_snap_id) const; + int get_snap_name(librados::snap_t in_snap_id, + std::string *out_snap_name) const; + int get_snap_namespace(librados::snap_t in_snap_id, + cls::rbd::SnapshotNamespace *out_snap_namespace) const; + int get_parent_spec(librados::snap_t in_snap_id, + cls::rbd::ParentImageSpec *pspec) const; + int is_snap_protected(librados::snap_t in_snap_id, + bool *is_protected) const; + int is_snap_unprotected(librados::snap_t in_snap_id, + bool *is_unprotected) const; + + uint64_t get_current_size() const; + uint64_t get_object_size() const; + string get_object_name(uint64_t num) const; + uint64_t get_stripe_unit() const; + uint64_t get_stripe_count() const; + uint64_t get_stripe_period() const; + utime_t get_create_timestamp() const; + utime_t get_access_timestamp() const; + utime_t get_modify_timestamp() const; + + void set_access_timestamp(utime_t at); + void set_modify_timestamp(utime_t at); + + void add_snap(cls::rbd::SnapshotNamespace in_snap_namespace, + std::string in_snap_name, + librados::snap_t id, + uint64_t in_size, const ParentImageInfo &parent, + uint8_t protection_status, uint64_t flags, utime_t timestamp); + void 
rm_snap(cls::rbd::SnapshotNamespace in_snap_namespace, + std::string in_snap_name, + librados::snap_t id); + uint64_t get_image_size(librados::snap_t in_snap_id) const; + uint64_t get_object_count(librados::snap_t in_snap_id) const; + bool test_features(uint64_t test_features) const; + bool test_features(uint64_t test_features, + const RWLock &in_snap_lock) const; + bool test_op_features(uint64_t op_features) const; + bool test_op_features(uint64_t op_features, + const RWLock &in_snap_lock) const; + int get_flags(librados::snap_t in_snap_id, uint64_t *flags) const; + int test_flags(librados::snap_t in_snap_id, + uint64_t test_flags, bool *flags_set) const; + int test_flags(librados::snap_t in_snap_id, + uint64_t test_flags, const RWLock &in_snap_lock, + bool *flags_set) const; + int update_flags(librados::snap_t in_snap_id, uint64_t flag, bool enabled); + + const ParentImageInfo* get_parent_info(librados::snap_t in_snap_id) const; + int64_t get_parent_pool_id(librados::snap_t in_snap_id) const; + std::string get_parent_image_id(librados::snap_t in_snap_id) const; + uint64_t get_parent_snap_id(librados::snap_t in_snap_id) const; + int get_parent_overlap(librados::snap_t in_snap_id, + uint64_t *overlap) const; + void register_watch(Context *on_finish); + uint64_t prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx, + uint64_t overlap); + + void cancel_async_requests(); + void cancel_async_requests(Context *on_finish); + + void apply_metadata(const std::map<std::string, bufferlist> &meta, + bool thread_safe); + + ExclusiveLock<ImageCtx> *create_exclusive_lock(); + ObjectMap<ImageCtx> *create_object_map(uint64_t snap_id); + Journal<ImageCtx> *create_journal(); + + void clear_pending_completions(); + + void set_image_name(const std::string &name); + + void notify_update(); + void notify_update(Context *on_finish); + + exclusive_lock::Policy *get_exclusive_lock_policy() const; + void set_exclusive_lock_policy(exclusive_lock::Policy *policy); + + 
journal::Policy *get_journal_policy() const; + void set_journal_policy(journal::Policy *policy); + + bool is_writeback_cache_enabled() const; + + static void get_thread_pool_instance(CephContext *cct, + ThreadPool **thread_pool, + ContextWQ **op_work_queue); + static void get_timer_instance(CephContext *cct, SafeTimer **timer, + Mutex **timer_lock); + }; +} + +#endif diff --git a/src/librbd/ImageState.cc b/src/librbd/ImageState.cc new file mode 100644 index 00000000..1f0535e2 --- /dev/null +++ b/src/librbd/ImageState.cc @@ -0,0 +1,741 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/ImageState.h" +#include "include/rbd/librbd.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/image/CloseRequest.h" +#include "librbd/image/OpenRequest.h" +#include "librbd/image/RefreshRequest.h" +#include "librbd/image/SetSnapRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ImageState: " << this << " " + +namespace librbd { + +using util::create_async_context_callback; +using util::create_context_callback; + +class ImageUpdateWatchers { +public: + + explicit ImageUpdateWatchers(CephContext *cct) : m_cct(cct), + m_lock(util::unique_lock_name("librbd::ImageUpdateWatchers::m_lock", this)) { + } + + ~ImageUpdateWatchers() { + ceph_assert(m_watchers.empty()); + ceph_assert(m_in_flight.empty()); + ceph_assert(m_pending_unregister.empty()); + ceph_assert(m_on_shut_down_finish == nullptr); + + destroy_work_queue(); + } + + void flush(Context *on_finish) { + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl; + { + Mutex::Locker locker(m_lock); + if (!m_in_flight.empty()) { + Context *ctx = new FunctionContext( + [this, on_finish](int r) { + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ + << ": 
completing flush" << dendl; + on_finish->complete(r); + }); + m_work_queue->queue(ctx, 0); + return; + } + } + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ + << ": completing flush" << dendl; + on_finish->complete(0); + } + + void shut_down(Context *on_finish) { + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_on_shut_down_finish == nullptr); + m_watchers.clear(); + if (!m_in_flight.empty()) { + m_on_shut_down_finish = on_finish; + return; + } + } + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ + << ": completing shut down" << dendl; + on_finish->complete(0); + } + + void register_watcher(UpdateWatchCtx *watcher, uint64_t *handle) { + ldout(m_cct, 20) << __func__ << ": watcher=" << watcher << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_shut_down_finish == nullptr); + + create_work_queue(); + + *handle = m_next_handle++; + m_watchers.insert(std::make_pair(*handle, watcher)); + } + + void unregister_watcher(uint64_t handle, Context *on_finish) { + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle=" + << handle << dendl; + int r = 0; + { + Mutex::Locker locker(m_lock); + auto it = m_watchers.find(handle); + if (it == m_watchers.end()) { + r = -ENOENT; + } else { + if (m_in_flight.find(handle) != m_in_flight.end()) { + ceph_assert(m_pending_unregister.find(handle) == m_pending_unregister.end()); + m_pending_unregister[handle] = on_finish; + on_finish = nullptr; + } + m_watchers.erase(it); + } + } + + if (on_finish) { + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ + << ": completing unregister" << dendl; + on_finish->complete(r); + } + } + + void notify() { + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl; + + Mutex::Locker locker(m_lock); + for (auto it : m_watchers) { + send_notify(it.first, it.second); + } + } + + void send_notify(uint64_t handle, UpdateWatchCtx *watcher) { + ceph_assert(m_lock.is_locked()); + + 
ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle=" + << handle << ", watcher=" << watcher << dendl; + + m_in_flight.insert(handle); + + Context *ctx = new FunctionContext( + [this, handle, watcher](int r) { + handle_notify(handle, watcher); + }); + + m_work_queue->queue(ctx, 0); + } + + void handle_notify(uint64_t handle, UpdateWatchCtx *watcher) { + + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle=" + << handle << ", watcher=" << watcher << dendl; + + watcher->handle_notify(); + + Context *on_unregister_finish = nullptr; + Context *on_shut_down_finish = nullptr; + + { + Mutex::Locker locker(m_lock); + + auto in_flight_it = m_in_flight.find(handle); + ceph_assert(in_flight_it != m_in_flight.end()); + m_in_flight.erase(in_flight_it); + + // If there is no more in flight notifications for this watcher + // and it is pending unregister, complete it now. + if (m_in_flight.find(handle) == m_in_flight.end()) { + auto it = m_pending_unregister.find(handle); + if (it != m_pending_unregister.end()) { + on_unregister_finish = it->second; + m_pending_unregister.erase(it); + } + } + + if (m_in_flight.empty()) { + ceph_assert(m_pending_unregister.empty()); + if (m_on_shut_down_finish != nullptr) { + std::swap(m_on_shut_down_finish, on_shut_down_finish); + } + } + } + + if (on_unregister_finish != nullptr) { + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ + << ": completing unregister" << dendl; + on_unregister_finish->complete(0); + } + + if (on_shut_down_finish != nullptr) { + ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ + << ": completing shut down" << dendl; + on_shut_down_finish->complete(0); + } + } + +private: + class ThreadPoolSingleton : public ThreadPool { + public: + explicit ThreadPoolSingleton(CephContext *cct) + : ThreadPool(cct, "librbd::ImageUpdateWatchers::thread_pool", "tp_librbd", + 1) { + start(); + } + ~ThreadPoolSingleton() override { + stop(); + } + }; + + CephContext *m_cct; + Mutex m_lock; + 
ContextWQ *m_work_queue = nullptr; + std::map<uint64_t, UpdateWatchCtx*> m_watchers; + uint64_t m_next_handle = 0; + std::multiset<uint64_t> m_in_flight; + std::map<uint64_t, Context*> m_pending_unregister; + Context *m_on_shut_down_finish = nullptr; + + void create_work_queue() { + if (m_work_queue != nullptr) { + return; + } + auto& thread_pool = m_cct->lookup_or_create_singleton_object< + ThreadPoolSingleton>("librbd::ImageUpdateWatchers::thread_pool", + false, m_cct); + m_work_queue = new ContextWQ("librbd::ImageUpdateWatchers::op_work_queue", + m_cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"), + &thread_pool); + } + + void destroy_work_queue() { + if (m_work_queue == nullptr) { + return; + } + m_work_queue->drain(); + delete m_work_queue; + } +}; + +template <typename I> +ImageState<I>::ImageState(I *image_ctx) + : m_image_ctx(image_ctx), m_state(STATE_UNINITIALIZED), + m_lock(util::unique_lock_name("librbd::ImageState::m_lock", this)), + m_last_refresh(0), m_refresh_seq(0), + m_update_watchers(new ImageUpdateWatchers(image_ctx->cct)) { +} + +template <typename I> +ImageState<I>::~ImageState() { + ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED); + delete m_update_watchers; +} + +template <typename I> +int ImageState<I>::open(uint64_t flags) { + C_SaferCond ctx; + open(flags, &ctx); + + int r = ctx.wait(); + if (r < 0) { + delete m_image_ctx; + } + return r; +} + +template <typename I> +void ImageState<I>::open(uint64_t flags, Context *on_finish) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + m_lock.Lock(); + ceph_assert(m_state == STATE_UNINITIALIZED); + m_open_flags = flags; + + Action action(ACTION_TYPE_OPEN); + action.refresh_seq = m_refresh_seq; + + execute_action_unlock(action, on_finish); +} + +template <typename I> +int ImageState<I>::close() { + C_SaferCond ctx; + close(&ctx); + + int r = ctx.wait(); + delete m_image_ctx; + return r; +} + +template <typename I> +void 
ImageState<I>::close(Context *on_finish) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + m_lock.Lock(); + ceph_assert(!is_closed()); + + Action action(ACTION_TYPE_CLOSE); + action.refresh_seq = m_refresh_seq; + execute_action_unlock(action, on_finish); +} + +template <typename I> +void ImageState<I>::handle_update_notification() { + Mutex::Locker locker(m_lock); + ++m_refresh_seq; + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": refresh_seq = " << m_refresh_seq << ", " + << "last_refresh = " << m_last_refresh << dendl; + + if (m_state == STATE_OPEN) { + m_update_watchers->notify(); + } +} + +template <typename I> +bool ImageState<I>::is_refresh_required() const { + Mutex::Locker locker(m_lock); + return (m_last_refresh != m_refresh_seq || find_pending_refresh() != nullptr); +} + +template <typename I> +int ImageState<I>::refresh() { + C_SaferCond refresh_ctx; + refresh(&refresh_ctx); + return refresh_ctx.wait(); +} + +template <typename I> +void ImageState<I>::refresh(Context *on_finish) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + m_lock.Lock(); + if (is_closed()) { + m_lock.Unlock(); + on_finish->complete(-ESHUTDOWN); + return; + } + + Action action(ACTION_TYPE_REFRESH); + action.refresh_seq = m_refresh_seq; + execute_action_unlock(action, on_finish); +} + +template <typename I> +int ImageState<I>::refresh_if_required() { + C_SaferCond ctx; + { + m_lock.Lock(); + Action action(ACTION_TYPE_REFRESH); + action.refresh_seq = m_refresh_seq; + + auto refresh_action = find_pending_refresh(); + if (refresh_action != nullptr) { + // if a refresh is in-flight, delay until it is finished + action = *refresh_action; + } else if (m_last_refresh == m_refresh_seq) { + m_lock.Unlock(); + return 0; + } else if (is_closed()) { + m_lock.Unlock(); + return -ESHUTDOWN; + } + + execute_action_unlock(action, &ctx); + } + + return ctx.wait(); +} + +template <typename I> +const typename 
ImageState<I>::Action * +ImageState<I>::find_pending_refresh() const { + // Returns the most recently queued REFRESH action (the queue is searched + // from the back), or nullptr when no refresh is queued. m_lock must be held. + ceph_assert(m_lock.is_locked()); + + auto it = std::find_if(m_actions_contexts.rbegin(), + m_actions_contexts.rend(), + [](const ActionContexts& action_contexts) { + // match on the action type only: comparing the Action against a bare + // ActionType implicitly builds an Action with refresh_seq == 0 and + // Action::operator== would then also require refresh_seq == 0 + return (action_contexts.first.action_type == ACTION_TYPE_REFRESH); + }); + if (it != m_actions_contexts.rend()) { + return &it->first; + } + return nullptr; +} + +template <typename I> +void ImageState<I>::snap_set(uint64_t snap_id, Context *on_finish) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": snap_id=" << snap_id << dendl; + + Action action(ACTION_TYPE_SET_SNAP); + action.snap_id = snap_id; + + m_lock.Lock(); + execute_action_unlock(action, on_finish); +} + +template <typename I> +void ImageState<I>::prepare_lock(Context *on_ready) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << dendl; + + m_lock.Lock(); + if (is_closed()) { + m_lock.Unlock(); + on_ready->complete(-ESHUTDOWN); + return; + } + + Action action(ACTION_TYPE_LOCK); + action.on_ready = on_ready; + execute_action_unlock(action, nullptr); +} + +template <typename I> +void ImageState<I>::handle_prepare_lock_complete() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << dendl; + + m_lock.Lock(); + if (m_state != STATE_PREPARING_LOCK) { + m_lock.Unlock(); + return; + } + + complete_action_unlock(STATE_OPEN, 0); +} + +template <typename I> +int ImageState<I>::register_update_watcher(UpdateWatchCtx *watcher, + uint64_t *handle) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + m_update_watchers->register_watcher(watcher, handle); + + ldout(cct, 20) << __func__ << ": handle=" << *handle << dendl; + return 0; +} + +template <typename I> +int ImageState<I>::unregister_update_watcher(uint64_t handle) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": handle=" << handle << dendl; + + C_SaferCond ctx; + m_update_watchers->unregister_watcher(handle, &ctx); + 
return ctx.wait(); +} + +template <typename I> +void ImageState<I>::flush_update_watchers(Context *on_finish) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + m_update_watchers->flush(on_finish); +} + +template <typename I> +void ImageState<I>::shut_down_update_watchers(Context *on_finish) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + m_update_watchers->shut_down(on_finish); +} + +template <typename I> +bool ImageState<I>::is_transition_state() const { + switch (m_state) { + case STATE_UNINITIALIZED: + case STATE_OPEN: + case STATE_CLOSED: + return false; + case STATE_OPENING: + case STATE_CLOSING: + case STATE_REFRESHING: + case STATE_SETTING_SNAP: + case STATE_PREPARING_LOCK: + break; + } + return true; +} + +template <typename I> +bool ImageState<I>::is_closed() const { + ceph_assert(m_lock.is_locked()); + + return ((m_state == STATE_CLOSED) || + (!m_actions_contexts.empty() && + m_actions_contexts.back().first.action_type == ACTION_TYPE_CLOSE)); +} + +template <typename I> +void ImageState<I>::append_context(const Action &action, Context *context) { + ceph_assert(m_lock.is_locked()); + + ActionContexts *action_contexts = nullptr; + for (auto &action_ctxs : m_actions_contexts) { + if (action == action_ctxs.first) { + action_contexts = &action_ctxs; + break; + } + } + + if (action_contexts == nullptr) { + m_actions_contexts.push_back({action, {}}); + action_contexts = &m_actions_contexts.back(); + } + + if (context != nullptr) { + action_contexts->second.push_back(context); + } +} + +template <typename I> +void ImageState<I>::execute_next_action_unlock() { + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_actions_contexts.empty()); + switch (m_actions_contexts.front().first.action_type) { + case ACTION_TYPE_OPEN: + send_open_unlock(); + return; + case ACTION_TYPE_CLOSE: + send_close_unlock(); + return; + case ACTION_TYPE_REFRESH: + send_refresh_unlock(); + return; + case 
ACTION_TYPE_SET_SNAP: + send_set_snap_unlock(); + return; + case ACTION_TYPE_LOCK: + send_prepare_lock_unlock(); + return; + } + ceph_abort(); +} + +template <typename I> +void ImageState<I>::execute_action_unlock(const Action &action, + Context *on_finish) { + ceph_assert(m_lock.is_locked()); + + append_context(action, on_finish); + if (!is_transition_state()) { + execute_next_action_unlock(); + } else { + m_lock.Unlock(); + } +} + +template <typename I> +void ImageState<I>::complete_action_unlock(State next_state, int r) { + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_actions_contexts.empty()); + + ActionContexts action_contexts(std::move(m_actions_contexts.front())); + m_actions_contexts.pop_front(); + + m_state = next_state; + m_lock.Unlock(); + + for (auto ctx : action_contexts.second) { + ctx->complete(r); + } + + if (next_state != STATE_UNINITIALIZED && next_state != STATE_CLOSED) { + m_lock.Lock(); + if (!is_transition_state() && !m_actions_contexts.empty()) { + execute_next_action_unlock(); + } else { + m_lock.Unlock(); + } + } +} + +template <typename I> +void ImageState<I>::send_open_unlock() { + ceph_assert(m_lock.is_locked()); + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_state = STATE_OPENING; + + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + ImageState<I>, &ImageState<I>::handle_open>(this)); + image::OpenRequest<I> *req = image::OpenRequest<I>::create( + m_image_ctx, m_open_flags, ctx); + + m_lock.Unlock(); + req->send(); +} + +template <typename I> +void ImageState<I>::handle_open(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + } + + m_lock.Lock(); + complete_action_unlock(r < 0 ? 
STATE_UNINITIALIZED : STATE_OPEN, r); +} + +template <typename I> +void ImageState<I>::send_close_unlock() { + ceph_assert(m_lock.is_locked()); + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_state = STATE_CLOSING; + + Context *ctx = create_context_callback< + ImageState<I>, &ImageState<I>::handle_close>(this); + image::CloseRequest<I> *req = image::CloseRequest<I>::create( + m_image_ctx, ctx); + + m_lock.Unlock(); + req->send(); +} + +template <typename I> +void ImageState<I>::handle_close(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "error occurred while closing image: " << cpp_strerror(r) + << dendl; + } + + m_lock.Lock(); + complete_action_unlock(STATE_CLOSED, r); +} + +template <typename I> +void ImageState<I>::send_refresh_unlock() { + ceph_assert(m_lock.is_locked()); + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_state = STATE_REFRESHING; + ceph_assert(!m_actions_contexts.empty()); + auto &action_context = m_actions_contexts.front().first; + ceph_assert(action_context.action_type == ACTION_TYPE_REFRESH); + + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + ImageState<I>, &ImageState<I>::handle_refresh>(this)); + image::RefreshRequest<I> *req = image::RefreshRequest<I>::create( + *m_image_ctx, false, false, ctx); + + m_lock.Unlock(); + req->send(); +} + +template <typename I> +void ImageState<I>::handle_refresh(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + m_lock.Lock(); + ceph_assert(!m_actions_contexts.empty()); + + ActionContexts &action_contexts(m_actions_contexts.front()); + ceph_assert(action_contexts.first.action_type == ACTION_TYPE_REFRESH); + ceph_assert(m_last_refresh <= action_contexts.first.refresh_seq); + + if (r == -ERESTART) { 
+ ldout(cct, 5) << "incomplete refresh: not updating sequence" << dendl; + r = 0; + } else { + m_last_refresh = action_contexts.first.refresh_seq; + } + + complete_action_unlock(STATE_OPEN, r); +} + +template <typename I> +void ImageState<I>::send_set_snap_unlock() { + ceph_assert(m_lock.is_locked()); + + m_state = STATE_SETTING_SNAP; + + ceph_assert(!m_actions_contexts.empty()); + ActionContexts &action_contexts(m_actions_contexts.front()); + ceph_assert(action_contexts.first.action_type == ACTION_TYPE_SET_SNAP); + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "snap_id=" << action_contexts.first.snap_id << dendl; + + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + ImageState<I>, &ImageState<I>::handle_set_snap>(this)); + image::SetSnapRequest<I> *req = image::SetSnapRequest<I>::create( + *m_image_ctx, action_contexts.first.snap_id, ctx); + + m_lock.Unlock(); + req->send(); +} + +template <typename I> +void ImageState<I>::handle_set_snap(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << " r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to set snapshot: " << cpp_strerror(r) << dendl; + } + + m_lock.Lock(); + complete_action_unlock(STATE_OPEN, r); +} + +// Moves the state machine to STATE_PREPARING_LOCK for the LOCK action at the +// head of the queue and wakes its on_ready callback. m_lock must be held on +// entry and is released on every path before returning. +template <typename I> +void ImageState<I>::send_prepare_lock_unlock() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + ceph_assert(m_lock.is_locked()); + m_state = STATE_PREPARING_LOCK; + + ceph_assert(!m_actions_contexts.empty()); + ActionContexts &action_contexts(m_actions_contexts.front()); + ceph_assert(action_contexts.first.action_type == ACTION_TYPE_LOCK); + + Context *on_ready = action_contexts.first.on_ready; + if (on_ready == nullptr) { + // nobody to wake: finish the action while m_lock is still held, because + // complete_action_unlock() asserts the lock and releases it itself + complete_action_unlock(STATE_OPEN, 0); + return; + } + + m_lock.Unlock(); + + // wake up the lock handler now that it's safe to proceed + on_ready->complete(0); +} 
+ +} // namespace librbd + +template class librbd::ImageState<librbd::ImageCtx>; diff --git a/src/librbd/ImageState.h b/src/librbd/ImageState.h new file mode 100644 index 00000000..7f28d1ee --- /dev/null +++ b/src/librbd/ImageState.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_STATE_H +#define CEPH_LIBRBD_IMAGE_STATE_H + +#include "include/int_types.h" +#include "common/Mutex.h" +#include <list> +#include <string> +#include <utility> +#include "cls/rbd/cls_rbd_types.h" + +class Context; +class RWLock; + +namespace librbd { + +class ImageCtx; +class ImageUpdateWatchers; +class UpdateWatchCtx; + +template <typename ImageCtxT = ImageCtx> +class ImageState { +public: + ImageState(ImageCtxT *image_ctx); + ~ImageState(); + + int open(uint64_t flags); + void open(uint64_t flags, Context *on_finish); + + int close(); + void close(Context *on_finish); + + void handle_update_notification(); + + bool is_refresh_required() const; + + int refresh(); + int refresh_if_required(); + void refresh(Context *on_finish); + + void snap_set(uint64_t snap_id, Context *on_finish); + + void prepare_lock(Context *on_ready); + void handle_prepare_lock_complete(); + + int register_update_watcher(UpdateWatchCtx *watcher, uint64_t *handle); + int unregister_update_watcher(uint64_t handle); + void flush_update_watchers(Context *on_finish); + void shut_down_update_watchers(Context *on_finish); + +private: + enum State { + STATE_UNINITIALIZED, + STATE_OPEN, + STATE_CLOSED, + STATE_OPENING, + STATE_CLOSING, + STATE_REFRESHING, + STATE_SETTING_SNAP, + STATE_PREPARING_LOCK + }; + + enum ActionType { + ACTION_TYPE_OPEN, + ACTION_TYPE_CLOSE, + ACTION_TYPE_REFRESH, + ACTION_TYPE_SET_SNAP, + ACTION_TYPE_LOCK + }; + + struct Action { + ActionType action_type; + uint64_t refresh_seq = 0; + uint64_t snap_id = CEPH_NOSNAP; + Context *on_ready = nullptr; + + Action(ActionType action_type) : 
action_type(action_type) { + } + inline bool operator==(const Action &action) const { + if (action_type != action.action_type) { + return false; + } + switch (action_type) { + case ACTION_TYPE_REFRESH: + return (refresh_seq == action.refresh_seq); + case ACTION_TYPE_SET_SNAP: + return (snap_id == action.snap_id); + case ACTION_TYPE_LOCK: + return false; + default: + return true; + } + } + }; + + typedef std::list<Context *> Contexts; + typedef std::pair<Action, Contexts> ActionContexts; + typedef std::list<ActionContexts> ActionsContexts; + + ImageCtxT *m_image_ctx; + State m_state; + + mutable Mutex m_lock; + ActionsContexts m_actions_contexts; + + uint64_t m_last_refresh; + uint64_t m_refresh_seq; + + ImageUpdateWatchers *m_update_watchers; + + uint64_t m_open_flags; + + bool is_transition_state() const; + bool is_closed() const; + + const Action *find_pending_refresh() const; + + void append_context(const Action &action, Context *context); + void execute_next_action_unlock(); + void execute_action_unlock(const Action &action, Context *context); + void complete_action_unlock(State next_state, int r); + + void send_open_unlock(); + void handle_open(int r); + + void send_close_unlock(); + void handle_close(int r); + + void send_refresh_unlock(); + void handle_refresh(int r); + + void send_set_snap_unlock(); + void handle_set_snap(int r); + + void send_prepare_lock_unlock(); + +}; + +} // namespace librbd + +extern template class librbd::ImageState<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_STATE_H diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc new file mode 100644 index 00000000..bda491bc --- /dev/null +++ b/src/librbd/ImageWatcher.cc @@ -0,0 +1,1126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/ImageWatcher.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include 
"librbd/Operations.h" +#include "librbd/TaskFinisher.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/image_watcher/NotifyLockOwner.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/watcher/Utils.h" +#include "include/encoding.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include <boost/bind.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ImageWatcher: " + +namespace librbd { + +using namespace image_watcher; +using namespace watch_notify; +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; +using librbd::watcher::util::HandlePayloadVisitor; + +using ceph::encode; +using ceph::decode; + +static const double RETRY_DELAY_SECONDS = 1.0; + +template <typename I> +struct ImageWatcher<I>::C_ProcessPayload : public Context { + ImageWatcher *image_watcher; + uint64_t notify_id; + uint64_t handle; + watch_notify::Payload payload; + + C_ProcessPayload(ImageWatcher *image_watcher_, uint64_t notify_id_, + uint64_t handle_, const watch_notify::Payload &payload) + : image_watcher(image_watcher_), notify_id(notify_id_), handle(handle_), + payload(payload) { + } + + void finish(int r) override { + image_watcher->m_async_op_tracker.start_op(); + if (image_watcher->notifications_blocked()) { + // requests are blocked -- just ack the notification + bufferlist bl; + image_watcher->acknowledge_notify(notify_id, handle, bl); + } else { + image_watcher->process_payload(notify_id, handle, payload); + } + image_watcher->m_async_op_tracker.finish_op(); + } +}; + +template <typename I> +ImageWatcher<I>::ImageWatcher(I &image_ctx) + : Watcher(image_ctx.md_ctx, image_ctx.op_work_queue, image_ctx.header_oid), + m_image_ctx(image_ctx), + m_task_finisher(new TaskFinisher<Task>(*m_image_ctx.cct)), + 
m_async_request_lock(util::unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this)), + m_owner_client_id_lock(util::unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this)) +{ +} + +template <typename I> +ImageWatcher<I>::~ImageWatcher() +{ + delete m_task_finisher; +} + +template <typename I> +void ImageWatcher<I>::unregister_watch(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " unregistering image watcher" << dendl; + + cancel_async_requests(); + + on_finish = new FunctionContext([this, on_finish](int r) { + m_async_op_tracker.wait_for_ops(on_finish); + }); + FunctionContext *ctx = new FunctionContext([this, on_finish](int r) { + m_task_finisher->cancel_all(on_finish); + }); + Watcher::unregister_watch(ctx); +} + +template <typename I> +void ImageWatcher<I>::block_notifies(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + on_finish = new FunctionContext([this, on_finish](int r) { + cancel_async_requests(); + on_finish->complete(r); + }); + Watcher::block_notifies(on_finish); +} + +template <typename I> +void ImageWatcher<I>::schedule_async_progress(const AsyncRequestId &request, + uint64_t offset, uint64_t total) { + FunctionContext *ctx = new FunctionContext( + boost::bind(&ImageWatcher<I>::notify_async_progress, this, request, offset, + total)); + m_task_finisher->queue(Task(TASK_CODE_ASYNC_PROGRESS, request), ctx); +} + +template <typename I> +int ImageWatcher<I>::notify_async_progress(const AsyncRequestId &request, + uint64_t offset, uint64_t total) { + ldout(m_image_ctx.cct, 20) << this << " remote async request progress: " + << request << " @ " << offset + << "/" << total << dendl; + + send_notify(AsyncProgressPayload(request, offset, total)); + return 0; +} + +template <typename I> +void ImageWatcher<I>::schedule_async_complete(const AsyncRequestId &request, + int r) { + m_async_op_tracker.start_op(); + FunctionContext 
*ctx = new FunctionContext( + boost::bind(&ImageWatcher<I>::notify_async_complete, this, request, r)); + m_task_finisher->queue(ctx); +} + +template <typename I> +void ImageWatcher<I>::notify_async_complete(const AsyncRequestId &request, + int r) { + ldout(m_image_ctx.cct, 20) << this << " remote async request finished: " + << request << " = " << r << dendl; + + send_notify(AsyncCompletePayload(request, r), + new FunctionContext(boost::bind(&ImageWatcher<I>::handle_async_complete, + this, request, r, _1))); +} + +template <typename I> +void ImageWatcher<I>::handle_async_complete(const AsyncRequestId &request, + int r, int ret_val) { + ldout(m_image_ctx.cct, 20) << this << " " << __func__ << ": " + << "request=" << request << ", r=" << ret_val + << dendl; + if (ret_val < 0) { + lderr(m_image_ctx.cct) << this << " failed to notify async complete: " + << cpp_strerror(ret_val) << dendl; + if (ret_val == -ETIMEDOUT && !is_unregistered()) { + schedule_async_complete(request, r); + m_async_op_tracker.finish_op(); + return; + } + } + + RWLock::WLocker async_request_locker(m_async_request_lock); + m_async_pending.erase(request); + m_async_op_tracker.finish_op(); +} + +template <typename I> +void ImageWatcher<I>::notify_flatten(uint64_t request_id, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, FlattenPayload(async_request_id), + prog_ctx, on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_resize(uint64_t request_id, uint64_t size, + bool allow_shrink, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), 
request_id); + + notify_async_request(async_request_id, + ResizePayload(size, allow_shrink, async_request_id), + prog_ctx, on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + notify_lock_owner(SnapCreatePayload(snap_namespace, snap_name), on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_snap_rename(const snapid_t &src_snap_id, + const std::string &dst_snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + notify_lock_owner(SnapRenamePayload(src_snap_id, dst_snap_name), on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_snap_remove(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + notify_lock_owner(SnapRemovePayload(snap_namespace, snap_name), on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_snap_protect(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + notify_lock_owner(SnapProtectPayload(snap_namespace, snap_name), on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_snap_unprotect(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && 
+ !m_image_ctx.exclusive_lock->is_lock_owner()); + + notify_lock_owner(SnapUnprotectPayload(snap_namespace, snap_name), on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_rebuild_object_map(uint64_t request_id, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, + RebuildObjectMapPayload(async_request_id), + prog_ctx, on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_rename(const std::string &image_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + notify_lock_owner(RenamePayload(image_name), on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_update_features(uint64_t features, bool enabled, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + notify_lock_owner(UpdateFeaturesPayload(features, enabled), on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_migrate(uint64_t request_id, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, MigratePayload(async_request_id), + prog_ctx, on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_sparsify(uint64_t request_id, size_t sparse_size, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock && + 
!m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, + SparsifyPayload(async_request_id, sparse_size), prog_ctx, + on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_header_update(Context *on_finish) { + ldout(m_image_ctx.cct, 10) << this << ": " << __func__ << dendl; + + // supports legacy (empty buffer) clients + send_notify(HeaderUpdatePayload(), on_finish); +} + +template <typename I> +void ImageWatcher<I>::notify_header_update(librados::IoCtx &io_ctx, + const std::string &oid) { + // supports legacy (empty buffer) clients + bufferlist bl; + encode(NotifyMessage(HeaderUpdatePayload()), bl); + io_ctx.notify2(oid, bl, watcher::Notifier::NOTIFY_TIMEOUT, nullptr); +} + +template <typename I> +void ImageWatcher<I>::schedule_cancel_async_requests() { + FunctionContext *ctx = new FunctionContext( + boost::bind(&ImageWatcher<I>::cancel_async_requests, this)); + m_task_finisher->queue(TASK_CODE_CANCEL_ASYNC_REQUESTS, ctx); +} + +template <typename I> +void ImageWatcher<I>::cancel_async_requests() { + RWLock::WLocker l(m_async_request_lock); + for (std::map<AsyncRequestId, AsyncRequest>::iterator iter = + m_async_requests.begin(); + iter != m_async_requests.end(); ++iter) { + iter->second.first->complete(-ERESTART); + } + m_async_requests.clear(); +} + +template <typename I> +void ImageWatcher<I>::set_owner_client_id(const ClientId& client_id) { + ceph_assert(m_owner_client_id_lock.is_locked()); + m_owner_client_id = client_id; + ldout(m_image_ctx.cct, 10) << this << " current lock owner: " + << m_owner_client_id << dendl; +} + +template <typename I> +ClientId ImageWatcher<I>::get_client_id() { + RWLock::RLocker l(this->m_watch_lock); + return ClientId(m_image_ctx.md_ctx.get_instance_id(), this->m_watch_handle); +} + +template <typename I> +void ImageWatcher<I>::notify_acquired_lock() { + ldout(m_image_ctx.cct, 10) << this << " notify acquired lock" 
<< dendl; + + ClientId client_id = get_client_id(); + { + Mutex::Locker owner_client_id_locker(m_owner_client_id_lock); + set_owner_client_id(client_id); + } + + send_notify(AcquiredLockPayload(client_id)); +} + +template <typename I> +void ImageWatcher<I>::notify_released_lock() { + ldout(m_image_ctx.cct, 10) << this << " notify released lock" << dendl; + + { + Mutex::Locker owner_client_id_locker(m_owner_client_id_lock); + set_owner_client_id(ClientId()); + } + + send_notify(ReleasedLockPayload(get_client_id())); +} + +template <typename I> +void ImageWatcher<I>::schedule_request_lock(bool use_timer, int timer_delay) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + if (m_image_ctx.exclusive_lock == nullptr) { + // exclusive lock dynamically disabled via image refresh + return; + } + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + RWLock::RLocker watch_locker(this->m_watch_lock); + if (this->is_registered(this->m_watch_lock)) { + ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl; + + FunctionContext *ctx = new FunctionContext( + boost::bind(&ImageWatcher<I>::notify_request_lock, this)); + if (use_timer) { + if (timer_delay < 0) { + timer_delay = RETRY_DELAY_SECONDS; + } + m_task_finisher->add_event_after(TASK_CODE_REQUEST_LOCK, + timer_delay, ctx); + } else { + m_task_finisher->queue(TASK_CODE_REQUEST_LOCK, ctx); + } + } +} + +template <typename I> +void ImageWatcher<I>::notify_request_lock() { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + + // ExclusiveLock state machine can be dynamically disabled or + // race with task cancel + if (m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()) { + return; + } + + ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl; + + notify_lock_owner(RequestLockPayload(get_client_id(), false), + create_context_callback< + ImageWatcher, 
&ImageWatcher<I>::handle_request_lock>(this)); +} + +template <typename I> +void ImageWatcher<I>::handle_request_lock(int r) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + + // ExclusiveLock state machine cannot transition -- but can be + // dynamically disabled + if (m_image_ctx.exclusive_lock == nullptr) { + return; + } + + if (r == -ETIMEDOUT) { + ldout(m_image_ctx.cct, 5) << this << " timed out requesting lock: retrying" + << dendl; + + // treat this is a dead client -- so retest acquiring the lock + m_image_ctx.exclusive_lock->handle_peer_notification(0); + } else if (r == -EROFS) { + ldout(m_image_ctx.cct, 5) << this << " peer will not release lock" << dendl; + m_image_ctx.exclusive_lock->handle_peer_notification(r); + } else if (r < 0) { + lderr(m_image_ctx.cct) << this << " error requesting lock: " + << cpp_strerror(r) << dendl; + schedule_request_lock(true); + } else { + // lock owner acked -- but resend if we don't see them release the lock + int retry_timeout = m_image_ctx.cct->_conf.template get_val<int64_t>( + "client_notify_timeout"); + ldout(m_image_ctx.cct, 15) << this << " will retry in " << retry_timeout + << " seconds" << dendl; + schedule_request_lock(true, retry_timeout); + } +} + +template <typename I> +void ImageWatcher<I>::notify_lock_owner(const Payload& payload, + Context *on_finish) { + ceph_assert(on_finish != nullptr); + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + bufferlist bl; + encode(NotifyMessage(payload), bl); + + NotifyLockOwner *notify_lock_owner = NotifyLockOwner::create( + m_image_ctx, this->m_notifier, std::move(bl), on_finish); + notify_lock_owner->send(); +} + +template <typename I> +Context *ImageWatcher<I>::remove_async_request(const AsyncRequestId &id) { + RWLock::WLocker async_request_locker(m_async_request_lock); + auto it = m_async_requests.find(id); + if (it != m_async_requests.end()) { + Context *on_complete = it->second.first; + 
m_async_requests.erase(it); + return on_complete; + } + return nullptr; +} + +template <typename I> +void ImageWatcher<I>::schedule_async_request_timed_out(const AsyncRequestId &id) { + ldout(m_image_ctx.cct, 20) << "scheduling async request time out: " << id + << dendl; + + Context *ctx = new FunctionContext(boost::bind( + &ImageWatcher<I>::async_request_timed_out, this, id)); + + Task task(TASK_CODE_ASYNC_REQUEST, id); + m_task_finisher->cancel(task); + + m_task_finisher->add_event_after( + task, m_image_ctx.config.template get_val<uint64_t>("rbd_request_timed_out_seconds"), + ctx); +} + +template <typename I> +void ImageWatcher<I>::async_request_timed_out(const AsyncRequestId &id) { + Context *on_complete = remove_async_request(id); + if (on_complete != nullptr) { + ldout(m_image_ctx.cct, 5) << "async request timed out: " << id << dendl; + m_image_ctx.op_work_queue->queue(on_complete, -ETIMEDOUT); + } +} + +template <typename I> +void ImageWatcher<I>::notify_async_request(const AsyncRequestId &async_request_id, + const Payload& payload, + ProgressContext& prog_ctx, + Context *on_finish) { + ceph_assert(on_finish != nullptr); + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + ldout(m_image_ctx.cct, 10) << this << " async request: " << async_request_id + << dendl; + + Context *on_notify = new FunctionContext([this, async_request_id](int r) { + if (r < 0) { + // notification failed -- don't expect updates + Context *on_complete = remove_async_request(async_request_id); + if (on_complete != nullptr) { + on_complete->complete(r); + } + } + }); + + Context *on_complete = new FunctionContext( + [this, async_request_id, on_finish](int r) { + m_task_finisher->cancel(Task(TASK_CODE_ASYNC_REQUEST, async_request_id)); + on_finish->complete(r); + }); + + { + RWLock::WLocker async_request_locker(m_async_request_lock); + m_async_requests[async_request_id] = AsyncRequest(on_complete, &prog_ctx); + } + + schedule_async_request_timed_out(async_request_id); + 
notify_lock_owner(payload, on_notify); +} + +template <typename I> +int ImageWatcher<I>::prepare_async_request(const AsyncRequestId& async_request_id, + bool* new_request, Context** ctx, + ProgressContext** prog_ctx) { + if (async_request_id.client_id == get_client_id()) { + return -ERESTART; + } else { + RWLock::WLocker l(m_async_request_lock); + if (m_async_pending.count(async_request_id) == 0) { + m_async_pending.insert(async_request_id); + *new_request = true; + *prog_ctx = new RemoteProgressContext(*this, async_request_id); + *ctx = new RemoteContext(*this, async_request_id, *prog_ctx); + } else { + *new_request = false; + } + } + return 0; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const HeaderUpdatePayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " image header updated" << dendl; + + m_image_ctx.state->handle_update_notification(); + m_image_ctx.perfcounter->inc(l_librbd_notify); + if (ack_ctx != nullptr) { + m_image_ctx.state->flush_update_watchers(new C_ResponseMessage(ack_ctx)); + return false; + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const AcquiredLockPayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " image exclusively locked announcement" + << dendl; + + bool cancel_async_requests = true; + if (payload.client_id.is_valid()) { + Mutex::Locker owner_client_id_locker(m_owner_client_id_lock); + if (payload.client_id == m_owner_client_id) { + cancel_async_requests = false; + } + set_owner_client_id(payload.client_id); + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + // potentially wake up the exclusive lock state machine now that + // a lock owner has advertised itself + m_image_ctx.exclusive_lock->handle_peer_notification(0); + } + if (cancel_async_requests && + (m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->is_lock_owner())) { + 
schedule_cancel_async_requests(); + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const ReleasedLockPayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " exclusive lock released" << dendl; + + bool cancel_async_requests = true; + if (payload.client_id.is_valid()) { + Mutex::Locker l(m_owner_client_id_lock); + if (payload.client_id != m_owner_client_id) { + ldout(m_image_ctx.cct, 10) << this << " unexpected owner: " + << payload.client_id << " != " + << m_owner_client_id << dendl; + cancel_async_requests = false; + } else { + set_owner_client_id(ClientId()); + } + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (cancel_async_requests && + (m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->is_lock_owner())) { + schedule_cancel_async_requests(); + } + + // alert the exclusive lock state machine that the lock is available + if (m_image_ctx.exclusive_lock != nullptr && + !m_image_ctx.exclusive_lock->is_lock_owner()) { + m_task_finisher->cancel(TASK_CODE_REQUEST_LOCK); + m_image_ctx.exclusive_lock->handle_peer_notification(0); + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const RequestLockPayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " exclusive lock requested" << dendl; + if (payload.client_id == get_client_id()) { + return true; + } + + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr && + m_image_ctx.exclusive_lock->is_lock_owner()) { + int r = 0; + bool accept_request = m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r); + + if (accept_request) { + ceph_assert(r == 0); + Mutex::Locker owner_client_id_locker(m_owner_client_id_lock); + if (!m_owner_client_id.is_valid()) { + return true; + } + + ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock" + << dendl; + r = 
m_image_ctx.get_exclusive_lock_policy()->lock_requested( + payload.force); + } + encode(ResponseMessage(r), ack_ctx->out); + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const AsyncProgressPayload &payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker l(m_async_request_lock); + std::map<AsyncRequestId, AsyncRequest>::iterator req_it = + m_async_requests.find(payload.async_request_id); + if (req_it != m_async_requests.end()) { + ldout(m_image_ctx.cct, 20) << this << " request progress: " + << payload.async_request_id << " @ " + << payload.offset << "/" << payload.total + << dendl; + schedule_async_request_timed_out(payload.async_request_id); + req_it->second.second->update_progress(payload.offset, payload.total); + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const AsyncCompletePayload &payload, + C_NotifyAck *ack_ctx) { + Context *on_complete = remove_async_request(payload.async_request_id); + if (on_complete != nullptr) { + ldout(m_image_ctx.cct, 10) << this << " request finished: " + << payload.async_request_id << "=" + << payload.result << dendl; + on_complete->complete(payload.result); + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const FlattenPayload &payload, + C_NotifyAck *ack_ctx) { + + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + bool new_request; + Context *ctx; + ProgressContext *prog_ctx; + r = prepare_async_request(payload.async_request_id, &new_request, + &ctx, &prog_ctx); + if (r == 0 && new_request) { + ldout(m_image_ctx.cct, 10) << this << " remote flatten request: " + << payload.async_request_id << dendl; + m_image_ctx.operations->execute_flatten(*prog_ctx, ctx); + } + + encode(ResponseMessage(r), ack_ctx->out); + } else if (r < 0) { + encode(ResponseMessage(r), 
ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const ResizePayload &payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + bool new_request; + Context *ctx; + ProgressContext *prog_ctx; + r = prepare_async_request(payload.async_request_id, &new_request, + &ctx, &prog_ctx); + if (r == 0 && new_request) { + ldout(m_image_ctx.cct, 10) << this << " remote resize request: " + << payload.async_request_id << " " + << payload.size << " " + << payload.allow_shrink << dendl; + m_image_ctx.operations->execute_resize(payload.size, payload.allow_shrink, *prog_ctx, ctx, 0); + } + + encode(ResponseMessage(r), ack_ctx->out); + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const SnapCreatePayload &payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_create request: " + << payload.snap_name << dendl; + + m_image_ctx.operations->execute_snap_create(payload.snap_namespace, + payload.snap_name, + new C_ResponseMessage(ack_ctx), + 0, false); + return false; + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const SnapRenamePayload &payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + ldout(m_image_ctx.cct, 10) 
<< this << " remote snap_rename request: " + << payload.snap_id << " to " + << payload.snap_name << dendl; + + m_image_ctx.operations->execute_snap_rename(payload.snap_id, + payload.snap_name, + new C_ResponseMessage(ack_ctx)); + return false; + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const SnapRemovePayload &payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL; + if (cls::rbd::get_snap_namespace_type(payload.snap_namespace) == + cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) { + request_type = exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE; + } + int r; + if (m_image_ctx.exclusive_lock->accept_request(request_type, &r)) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_remove request: " + << payload.snap_name << dendl; + + m_image_ctx.operations->execute_snap_remove(payload.snap_namespace, + payload.snap_name, + new C_ResponseMessage(ack_ctx)); + return false; + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const SnapProtectPayload& payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_protect request: " + << payload.snap_name << dendl; + + m_image_ctx.operations->execute_snap_protect(payload.snap_namespace, + payload.snap_name, + new C_ResponseMessage(ack_ctx)); + return false; + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const 
SnapUnprotectPayload& payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_unprotect request: " + << payload.snap_name << dendl; + + m_image_ctx.operations->execute_snap_unprotect(payload.snap_namespace, + payload.snap_name, + new C_ResponseMessage(ack_ctx)); + return false; + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const RebuildObjectMapPayload& payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + bool new_request; + Context *ctx; + ProgressContext *prog_ctx; + r = prepare_async_request(payload.async_request_id, &new_request, + &ctx, &prog_ctx); + if (r == 0 && new_request) { + ldout(m_image_ctx.cct, 10) << this + << " remote rebuild object map request: " + << payload.async_request_id << dendl; + m_image_ctx.operations->execute_rebuild_object_map(*prog_ctx, ctx); + } + + encode(ResponseMessage(r), ack_ctx->out); + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const RenamePayload& payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + ldout(m_image_ctx.cct, 10) << this << " remote rename request: " + << payload.image_name << dendl; + + m_image_ctx.operations->execute_rename(payload.image_name, + new 
C_ResponseMessage(ack_ctx)); + return false; + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const UpdateFeaturesPayload& payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + ldout(m_image_ctx.cct, 10) << this << " remote update_features request: " + << payload.features << " " + << (payload.enabled ? "enabled" : "disabled") + << dendl; + + m_image_ctx.operations->execute_update_features( + payload.features, payload.enabled, new C_ResponseMessage(ack_ctx), 0); + return false; + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const MigratePayload &payload, + C_NotifyAck *ack_ctx) { + + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + bool new_request; + Context *ctx; + ProgressContext *prog_ctx; + r = prepare_async_request(payload.async_request_id, &new_request, + &ctx, &prog_ctx); + if (r == 0 && new_request) { + ldout(m_image_ctx.cct, 10) << this << " remote migrate request: " + << payload.async_request_id << dendl; + m_image_ctx.operations->execute_migrate(*prog_ctx, ctx); + } + + encode(ResponseMessage(r), ack_ctx->out); + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const SparsifyPayload &payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + 
exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) { + bool new_request; + Context *ctx; + ProgressContext *prog_ctx; + r = prepare_async_request(payload.async_request_id, &new_request, + &ctx, &prog_ctx); + if (r == 0 && new_request) { + ldout(m_image_ctx.cct, 10) << this << " remote sparsify request: " + << payload.async_request_id << dendl; + m_image_ctx.operations->execute_sparsify(payload.sparse_size, *prog_ctx, + ctx); + } + + encode(ResponseMessage(r), ack_ctx->out); + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template <typename I> +bool ImageWatcher<I>::handle_payload(const UnknownPayload &payload, + C_NotifyAck *ack_ctx) { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r) || r < 0) { + encode(ResponseMessage(-EOPNOTSUPP), ack_ctx->out); + } + } + return true; +} + +template <typename I> +void ImageWatcher<I>::process_payload(uint64_t notify_id, uint64_t handle, + const Payload &payload) { + apply_visitor(HandlePayloadVisitor<ImageWatcher<I>>(this, notify_id, handle), + payload); +} + +template <typename I> +void ImageWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) { + NotifyMessage notify_message; + if (bl.length() == 0) { + // legacy notification for header updates + notify_message = NotifyMessage(HeaderUpdatePayload()); + } else { + try { + auto iter = bl.cbegin(); + decode(notify_message, iter); + } catch (const buffer::error &err) { + lderr(m_image_ctx.cct) << this << " error decoding image notification: " + << err.what() << dendl; + return; + } + } + + // if an image refresh is required, refresh before processing the request + if (notify_message.check_for_refresh() && + m_image_ctx.state->is_refresh_required()) { + m_image_ctx.state->refresh(new C_ProcessPayload(this, notify_id, handle, + 
notify_message.payload)); + } else { + process_payload(notify_id, handle, notify_message.payload); + } +} + +template <typename I> +void ImageWatcher<I>::handle_error(uint64_t handle, int err) { + lderr(m_image_ctx.cct) << this << " image watch failed: " << handle << ", " + << cpp_strerror(err) << dendl; + + { + Mutex::Locker l(m_owner_client_id_lock); + set_owner_client_id(ClientId()); + } + + Watcher::handle_error(handle, err); +} + +template <typename I> +void ImageWatcher<I>::handle_rewatch_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + // update the lock cookie with the new watch handle + m_image_ctx.exclusive_lock->reacquire_lock(nullptr); + } + } + + // image might have been updated while we didn't have active watch + handle_payload(HeaderUpdatePayload(), nullptr); +} + +template <typename I> +void ImageWatcher<I>::send_notify(const Payload &payload, Context *ctx) { + bufferlist bl; + + encode(NotifyMessage(payload), bl); + Watcher::send_notify(bl, nullptr, ctx); +} + +template <typename I> +void ImageWatcher<I>::RemoteContext::finish(int r) { + m_image_watcher.schedule_async_complete(m_async_request_id, r); +} + +template <typename I> +void ImageWatcher<I>::C_ResponseMessage::finish(int r) { + CephContext *cct = notify_ack->cct; + ldout(cct, 10) << this << " C_ResponseMessage: r=" << r << dendl; + + encode(ResponseMessage(r), notify_ack->out); + notify_ack->complete(0); +} + +} // namespace librbd + +template class librbd::ImageWatcher<librbd::ImageCtx>; diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h new file mode 100644 index 00000000..7d04f0b4 --- /dev/null +++ b/src/librbd/ImageWatcher.h @@ -0,0 +1,268 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_WATCHER_H 
+#define CEPH_LIBRBD_IMAGE_WATCHER_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/Mutex.h"
+#include "common/RWLock.h"
+#include "common/AsyncOpTracker.h"
+#include "include/Context.h"
+#include "include/rbd/librbd.hpp"
+#include "librbd/Watcher.h"
+#include "librbd/WatchNotifyTypes.h"
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+class entity_name_t;
+
+namespace librbd {
+
+namespace watcher {
+namespace util {
+template <typename> struct HandlePayloadVisitor;
+}
+}
+
+class ImageCtx;
+template <typename> class TaskFinisher;
+
+// Image-level Watcher.  Outgoing payloads are wrapped in a NotifyMessage and
+// encoded before being sent; incoming notifications are decoded in
+// handle_notify() and dispatched, by payload type, to the private
+// handle_payload() overloads through HandlePayloadVisitor (hence the friend
+// declaration below).
+template <typename ImageCtxT = ImageCtx>
+class ImageWatcher : public Watcher {
+  friend struct watcher::util::HandlePayloadVisitor<ImageWatcher<ImageCtxT>>;
+
+public:
+  ImageWatcher(ImageCtxT& image_ctx);
+  ~ImageWatcher() override;
+
+  void unregister_watch(Context *on_finish) override;
+  void block_notifies(Context *on_finish) override;
+
+  // Operation notifications.
+  // NOTE(review): the definitions are outside this excerpt; presumably each
+  // one forwards the named operation to the current lock owner and completes
+  // on_finish with the result -- confirm against ImageWatcher.cc.
+  void notify_flatten(uint64_t request_id, ProgressContext &prog_ctx,
+                      Context *on_finish);
+  void notify_resize(uint64_t request_id, uint64_t size, bool allow_shrink,
+                     ProgressContext &prog_ctx, Context *on_finish);
+  void notify_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+                          const std::string &snap_name,
+                          Context *on_finish);
+  void notify_snap_rename(const snapid_t &src_snap_id,
+                          const std::string &dst_snap_name,
+                          Context *on_finish);
+  void notify_snap_remove(const cls::rbd::SnapshotNamespace &snap_namespace,
+                          const std::string &snap_name,
+                          Context *on_finish);
+  void notify_snap_protect(const cls::rbd::SnapshotNamespace &snap_namespace,
+                           const std::string &snap_name,
+                           Context *on_finish);
+  void notify_snap_unprotect(const cls::rbd::SnapshotNamespace &snap_namespace,
+                             const std::string &snap_name,
+                             Context *on_finish);
+  void notify_rebuild_object_map(uint64_t request_id,
+                                 ProgressContext &prog_ctx, Context *on_finish);
+  void notify_rename(const std::string &image_name, Context *on_finish);
+
+  void notify_update_features(uint64_t features, bool enabled,
+                              Context *on_finish);
+
+  void notify_migrate(uint64_t request_id, ProgressContext &prog_ctx,
+                      Context *on_finish);
+
+  void notify_sparsify(uint64_t request_id, size_t sparse_size,
+                       ProgressContext &prog_ctx, Context *on_finish);
+
+  // Exclusive-lock announcements / requests (definitions not in this excerpt).
+  void notify_acquired_lock();
+  void notify_released_lock();
+  void notify_request_lock();
+
+  void notify_header_update(Context *on_finish);
+  static void notify_header_update(librados::IoCtx &io_ctx,
+                                   const std::string &oid);
+
+private:
+  // Kinds of work scheduled on m_task_finisher.
+  enum TaskCode {
+    TASK_CODE_REQUEST_LOCK,
+    TASK_CODE_CANCEL_ASYNC_REQUESTS,
+    TASK_CODE_REREGISTER_WATCH,
+    TASK_CODE_ASYNC_REQUEST,
+    TASK_CODE_ASYNC_PROGRESS
+  };
+
+  // (completion context, progress context) of an async request issued by
+  // this client; see notify_async_request() / remove_async_request().
+  typedef std::pair<Context *, ProgressContext *> AsyncRequest;
+
+  // Key under which a task is registered with / cancelled from
+  // m_task_finisher.
+  class Task {
+  public:
+    // Intentionally not explicit: the implementation passes a bare TaskCode
+    // where a Task is expected, e.g. cancel(TASK_CODE_REQUEST_LOCK).
+    Task(TaskCode task_code) : m_task_code(task_code) {}
+    Task(TaskCode task_code, const watch_notify::AsyncRequestId &id)
+      : m_task_code(task_code), m_async_request_id(id) {}
+
+    // Orders by task code first; the request id only breaks ties for the two
+    // per-request codes, so every other code has a single key.
+    inline bool operator<(const Task& rhs) const {
+      if (m_task_code != rhs.m_task_code) {
+        return m_task_code < rhs.m_task_code;
+      } else if ((m_task_code == TASK_CODE_ASYNC_REQUEST ||
+                  m_task_code == TASK_CODE_ASYNC_PROGRESS) &&
+                 m_async_request_id != rhs.m_async_request_id) {
+        return m_async_request_id < rhs.m_async_request_id;
+      }
+      return false;
+    }
+  private:
+    TaskCode m_task_code;
+    watch_notify::AsyncRequestId m_async_request_id;
+  };
+
+  // ProgressContext created for a remote request in prepare_async_request();
+  // every update is forwarded to schedule_async_progress().
+  class RemoteProgressContext : public ProgressContext {
+  public:
+    RemoteProgressContext(ImageWatcher &image_watcher,
+                          const watch_notify::AsyncRequestId &id)
+      : m_image_watcher(image_watcher), m_async_request_id(id)
+    {
+    }
+
+    // Always returns 0.
+    int update_progress(uint64_t offset, uint64_t total) override {
+      m_image_watcher.schedule_async_progress(m_async_request_id, offset,
+                                              total);
+      return 0;
+    }
+
+  private:
+    ImageWatcher &m_image_watcher;
+    watch_notify::AsyncRequestId m_async_request_id;
+  };
+
+  // Completion context created for a remote request in
+  // prepare_async_request(); finish() calls schedule_async_complete().
+  // Owns prog_ctx and deletes it in the destructor.
+  class RemoteContext : public Context {
+  public:
+    RemoteContext(ImageWatcher &image_watcher,
+                  const watch_notify::AsyncRequestId &id,
+                  ProgressContext *prog_ctx)
+      : m_image_watcher(image_watcher), m_async_request_id(id),
+        m_prog_ctx(prog_ctx)
+    {
+    }
+
+    ~RemoteContext() override {
+      delete m_prog_ctx;
+    }
+
+    void finish(int r) override;
+
+  private:
+    ImageWatcher &m_image_watcher;
+    watch_notify::AsyncRequestId m_async_request_id;
+    ProgressContext *m_prog_ctx;
+  };
+
+  struct C_ProcessPayload;
+
+  // finish(r) encodes ResponseMessage(r) into notify_ack->out and then
+  // completes notify_ack with 0.
+  struct C_ResponseMessage : public Context {
+    C_NotifyAck *notify_ack;
+
+    C_ResponseMessage(C_NotifyAck *notify_ack) : notify_ack(notify_ack) {
+    }
+    void finish(int r) override;
+  };
+
+  ImageCtxT &m_image_ctx;
+
+  TaskFinisher<Task> *m_task_finisher;
+
+  // Guards m_async_requests and m_async_pending.
+  RWLock m_async_request_lock;
+  // Requests issued by this client that have not completed yet.
+  std::map<watch_notify::AsyncRequestId, AsyncRequest> m_async_requests;
+  // Ids of remote requests already accepted by prepare_async_request();
+  // consulted there to tell a new request from a repeated one.
+  std::set<watch_notify::AsyncRequestId> m_async_pending;
+
+  // Guards m_owner_client_id.
+  Mutex m_owner_client_id_lock;
+  watch_notify::ClientId m_owner_client_id;
+
+  AsyncOpTracker m_async_op_tracker;
+
+  void handle_register_watch(int r);
+
+  void schedule_cancel_async_requests();
+  void cancel_async_requests();
+
+  // Callers in this file hold m_owner_client_id_lock around this call.
+  void set_owner_client_id(const watch_notify::ClientId &client_id);
+  watch_notify::ClientId get_client_id();
+
+  // Result handler for a lock request: -ETIMEDOUT and -EROFS are passed to
+  // the exclusive lock's handle_peer_notification() (as 0 and r), any other
+  // error reschedules the request, and success reschedules it after the
+  // "client_notify_timeout" interval.
+  void handle_request_lock(int r);
+  // NOTE(review): timer_delay appears to be in seconds (the caller logs
+  // "will retry in N seconds") -- confirm in the definition.
+  void schedule_request_lock(bool use_timer, int timer_delay = -1);
+
+  // Encodes payload and sends it via NotifyLockOwner.  on_finish must be
+  // non-null and m_image_ctx.owner_lock must be held (both asserted).
+  void notify_lock_owner(const watch_notify::Payload& payload,
+                         Context *on_finish);
+
+  // Erases id from m_async_requests and returns its completion context, or
+  // nullptr when the id is unknown.
+  Context *remove_async_request(const watch_notify::AsyncRequestId &id);
+  // (Re)arms the per-request timeout ("rbd_request_timed_out_seconds").
+  void schedule_async_request_timed_out(const watch_notify::AsyncRequestId &id);
+  // Completes a still-registered request with -ETIMEDOUT.
+  void async_request_timed_out(const watch_notify::AsyncRequestId &id);
+  // Registers the request, arms its timeout and notifies the lock owner.
+  void notify_async_request(const watch_notify::AsyncRequestId &id,
+                            const watch_notify::Payload &payload,
+                            ProgressContext& prog_ctx,
+                            Context *on_finish);
+
+  void schedule_async_progress(const watch_notify::AsyncRequestId &id,
+                               uint64_t offset, uint64_t total);
+  int notify_async_progress(const watch_notify::AsyncRequestId &id,
+                            uint64_t offset, uint64_t total);
+  void schedule_async_complete(const watch_notify::AsyncRequestId &id, int r);
+  void notify_async_complete(const watch_notify::AsyncRequestId &id, int r);
+  void handle_async_complete(const watch_notify::AsyncRequestId &request, int r,
+                             int ret_val);
+
+  // Returns -ERESTART when the request was issued by this client, otherwise
+  // 0.  *new_request reports whether the id was seen for the first time;
+  // only then are *ctx and *prog_ctx set (to newly allocated objects).
+  int prepare_async_request(const watch_notify::AsyncRequestId& id,
+                            bool* new_request, Context** ctx,
+                            ProgressContext** prog_ctx);
+
+  // One overload per payload type.  An overload returns false after handing
+  // the ack context to a C_ResponseMessage, which completes it later; every
+  // other path returns true.
+  bool handle_payload(const watch_notify::HeaderUpdatePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::AcquiredLockPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::ReleasedLockPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::RequestLockPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::AsyncProgressPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::AsyncCompletePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::FlattenPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::ResizePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapCreatePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapRenamePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapRemovePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapProtectPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapUnprotectPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::RebuildObjectMapPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::RenamePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::UpdateFeaturesPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::MigratePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SparsifyPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::UnknownPayload& payload,
+                      C_NotifyAck *ctx);
+  // Applies HandlePayloadVisitor to the payload variant.
+  void process_payload(uint64_t notify_id, uint64_t handle,
+                       const watch_notify::Payload &payload);
+
+  // An empty bufferlist is treated as a legacy header-update notification;
+  // a message that fails to decode is logged and dropped.
+  void handle_notify(uint64_t notify_id, uint64_t handle,
+                     uint64_t notifier_id, bufferlist &bl) override;
+  // Clears the recorded lock owner, then defers to Watcher::handle_error().
+  void handle_error(uint64_t cookie, int err) override;
+  void handle_rewatch_complete(int r) override;
+
+  void send_notify(const watch_notify::Payload& payload,
+                   Context *ctx = nullptr);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ImageWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_WATCHER_H
diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc
new file mode 100644
index 00000000..1be60cbf
--- /dev/null
+++ b/src/librbd/Journal.cc
@@ -0,0 +1,1780 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/Journal.h"
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/Journaler.h"
+#include "journal/Policy.h"
+#include "journal/ReplayEntry.h"
+#include "journal/Settings.h"
+#include "journal/Utils.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/DemoteRequest.h"
+#include "librbd/journal/ObjectDispatch.h"
+#include "librbd/journal/OpenRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/journal/ResetRequest.h"
+#include "librbd/journal/Replay.h"
+#include "librbd/journal/PromoteRequest.h"
+
+#include <boost/scope_exit.hpp>
+#include <utility>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Journal: "
+
+namespace librbd { + +using util::create_async_context_callback; +using util::create_context_callback; +using journal::util::C_DecodeTag; +using journal::util::C_DecodeTags; + +namespace { + +// TODO: once journaler is 100% async, remove separate threads and +// reuse ImageCtx's thread pool +class ThreadPoolSingleton : public ThreadPool { +public: + explicit ThreadPoolSingleton(CephContext *cct) + : ThreadPool(cct, "librbd::Journal", "tp_librbd_journ", 1) { + start(); + } + ~ThreadPoolSingleton() override { + stop(); + } +}; + +template <typename I> +struct C_IsTagOwner : public Context { + librados::IoCtx &io_ctx; + std::string image_id; + bool *is_tag_owner; + ContextWQ *op_work_queue; + Context *on_finish; + + CephContext *cct = nullptr; + Journaler *journaler; + cls::journal::Client client; + journal::ImageClientMeta client_meta; + uint64_t tag_tid = 0; + journal::TagData tag_data; + + C_IsTagOwner(librados::IoCtx &io_ctx, const std::string &image_id, + bool *is_tag_owner, ContextWQ *op_work_queue, Context *on_finish) + : io_ctx(io_ctx), image_id(image_id), is_tag_owner(is_tag_owner), + op_work_queue(op_work_queue), on_finish(on_finish), + cct(reinterpret_cast<CephContext*>(io_ctx.cct())), + journaler(new Journaler(io_ctx, image_id, Journal<>::IMAGE_CLIENT_ID, + {})) { + } + + void finish(int r) override { + ldout(cct, 20) << this << " C_IsTagOwner::" << __func__ << ": r=" << r + << dendl; + if (r < 0) { + lderr(cct) << this << " C_IsTagOwner::" << __func__ << ": " + << "failed to get tag owner: " << cpp_strerror(r) << dendl; + } else { + *is_tag_owner = (tag_data.mirror_uuid == Journal<>::LOCAL_MIRROR_UUID); + } + + Journaler *journaler = this->journaler; + Context *on_finish = this->on_finish; + FunctionContext *ctx = new FunctionContext( + [journaler, on_finish](int r) { + on_finish->complete(r); + delete journaler; + }); + op_work_queue->queue(ctx, r); + } +}; + +struct C_GetTagOwner : public Context { + std::string *mirror_uuid; + Context *on_finish; + + 
Journaler journaler; + cls::journal::Client client; + journal::ImageClientMeta client_meta; + uint64_t tag_tid = 0; + journal::TagData tag_data; + + C_GetTagOwner(librados::IoCtx &io_ctx, const std::string &image_id, + std::string *mirror_uuid, Context *on_finish) + : mirror_uuid(mirror_uuid), on_finish(on_finish), + journaler(io_ctx, image_id, Journal<>::IMAGE_CLIENT_ID, {}) { + } + + virtual void finish(int r) { + if (r >= 0) { + *mirror_uuid = tag_data.mirror_uuid; + } + on_finish->complete(r); + } +}; + +template <typename J> +struct GetTagsRequest { + CephContext *cct; + J *journaler; + cls::journal::Client *client; + journal::ImageClientMeta *client_meta; + uint64_t *tag_tid; + journal::TagData *tag_data; + Context *on_finish; + + Mutex lock; + + GetTagsRequest(CephContext *cct, J *journaler, cls::journal::Client *client, + journal::ImageClientMeta *client_meta, uint64_t *tag_tid, + journal::TagData *tag_data, Context *on_finish) + : cct(cct), journaler(journaler), client(client), client_meta(client_meta), + tag_tid(tag_tid), tag_data(tag_data), on_finish(on_finish), lock("lock") { + } + + /** + * @verbatim + * + * <start> + * | + * v + * GET_CLIENT * * * * * * * * * * * * + * | * + * v * + * GET_TAGS * * * * * * * * * * * * * (error) + * | * + * v * + * <finish> * * * * * * * * * * * * * + * + * @endverbatim + */ + + void send() { + send_get_client(); + } + + void send_get_client() { + ldout(cct, 20) << __func__ << dendl; + + FunctionContext *ctx = new FunctionContext( + [this](int r) { + handle_get_client(r); + }); + journaler->get_client(Journal<ImageCtx>::IMAGE_CLIENT_ID, client, ctx); + } + + void handle_get_client(int r) { + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r < 0) { + complete(r); + return; + } + + librbd::journal::ClientData client_data; + auto bl_it = client->data.cbegin(); + try { + decode(client_data, bl_it); + } catch (const buffer::error &err) { + lderr(cct) << this << " OpenJournalerRequest::" << __func__ << ": " + << 
"failed to decode client data" << dendl; + complete(-EBADMSG); + return; + } + + journal::ImageClientMeta *image_client_meta = + boost::get<journal::ImageClientMeta>(&client_data.client_meta); + if (image_client_meta == nullptr) { + lderr(cct) << this << " OpenJournalerRequest::" << __func__ << ": " + << "failed to get client meta" << dendl; + complete(-EINVAL); + return; + } + *client_meta = *image_client_meta; + + send_get_tags(); + } + + void send_get_tags() { + ldout(cct, 20) << __func__ << dendl; + + FunctionContext *ctx = new FunctionContext( + [this](int r) { + handle_get_tags(r); + }); + C_DecodeTags *tags_ctx = new C_DecodeTags(cct, &lock, tag_tid, tag_data, + ctx); + journaler->get_tags(client_meta->tag_class, &tags_ctx->tags, tags_ctx); + } + + void handle_get_tags(int r) { + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + complete(r); + } + + void complete(int r) { + on_finish->complete(r); + delete this; + } +}; + +template <typename J> +void get_tags(CephContext *cct, J *journaler, + cls::journal::Client *client, + journal::ImageClientMeta *client_meta, + uint64_t *tag_tid, journal::TagData *tag_data, + Context *on_finish) { + ldout(cct, 20) << __func__ << dendl; + + GetTagsRequest<J> *req = + new GetTagsRequest<J>(cct, journaler, client, client_meta, tag_tid, + tag_data, on_finish); + req->send(); +} + +template <typename J> +int allocate_journaler_tag(CephContext *cct, J *journaler, + uint64_t tag_class, + const journal::TagPredecessor &predecessor, + const std::string &mirror_uuid, + cls::journal::Tag *new_tag) { + journal::TagData tag_data; + tag_data.mirror_uuid = mirror_uuid; + tag_data.predecessor = predecessor; + + bufferlist tag_bl; + encode(tag_data, tag_bl); + + C_SaferCond allocate_tag_ctx; + journaler->allocate_tag(tag_class, tag_bl, new_tag, &allocate_tag_ctx); + + int r = allocate_tag_ctx.wait(); + if (r < 0) { + lderr(cct) << __func__ << ": " + << "failed to allocate tag: " << cpp_strerror(r) << dendl; + return r; + } + return 
0; +} + +} // anonymous namespace + +// client id for local image +template <typename I> +const std::string Journal<I>::IMAGE_CLIENT_ID(""); + +// mirror uuid to use for local images +template <typename I> +const std::string Journal<I>::LOCAL_MIRROR_UUID(""); + +// mirror uuid to use for orphaned (demoted) images +template <typename I> +const std::string Journal<I>::ORPHAN_MIRROR_UUID("<orphan>"); + +template <typename I> +std::ostream &operator<<(std::ostream &os, + const typename Journal<I>::State &state) { + switch (state) { + case Journal<I>::STATE_UNINITIALIZED: + os << "Uninitialized"; + break; + case Journal<I>::STATE_INITIALIZING: + os << "Initializing"; + break; + case Journal<I>::STATE_REPLAYING: + os << "Replaying"; + break; + case Journal<I>::STATE_FLUSHING_RESTART: + os << "FlushingRestart"; + break; + case Journal<I>::STATE_RESTARTING_REPLAY: + os << "RestartingReplay"; + break; + case Journal<I>::STATE_FLUSHING_REPLAY: + os << "FlushingReplay"; + break; + case Journal<I>::STATE_READY: + os << "Ready"; + break; + case Journal<I>::STATE_STOPPING: + os << "Stopping"; + break; + case Journal<I>::STATE_CLOSING: + os << "Closing"; + break; + case Journal<I>::STATE_CLOSED: + os << "Closed"; + break; + default: + os << "Unknown (" << static_cast<uint32_t>(state) << ")"; + break; + } + return os; +} + +template <typename I> +Journal<I>::Journal(I &image_ctx) + : m_image_ctx(image_ctx), m_journaler(NULL), + m_lock("Journal<I>::m_lock"), m_state(STATE_UNINITIALIZED), + m_error_result(0), m_replay_handler(this), m_close_pending(false), + m_event_lock("Journal<I>::m_event_lock"), m_event_tid(0), + m_blocking_writes(false), m_journal_replay(NULL), + m_metadata_listener(this) { + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << ": ictx=" << &m_image_ctx << dendl; + + auto thread_pool_singleton = + &cct->lookup_or_create_singleton_object<ThreadPoolSingleton>( + "librbd::journal::thread_pool", false, cct); + m_work_queue = new 
ContextWQ("librbd::journal::work_queue", + cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"), + thread_pool_singleton); + ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock); +} + +template <typename I> +Journal<I>::~Journal() { + if (m_work_queue != nullptr) { + m_work_queue->drain(); + delete m_work_queue; + } + + ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED); + ceph_assert(m_journaler == NULL); + ceph_assert(m_journal_replay == NULL); + ceph_assert(m_wait_for_state_contexts.empty()); +} + +template <typename I> +bool Journal<I>::is_journal_supported(I &image_ctx) { + ceph_assert(image_ctx.snap_lock.is_locked()); + return ((image_ctx.features & RBD_FEATURE_JOURNALING) && + !image_ctx.read_only && image_ctx.snap_id == CEPH_NOSNAP); +} + +template <typename I> +int Journal<I>::create(librados::IoCtx &io_ctx, const std::string &image_id, + uint8_t order, uint8_t splay_width, + const std::string &object_pool) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 5) << __func__ << ": image=" << image_id << dendl; + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + C_SaferCond cond; + journal::TagData tag_data(LOCAL_MIRROR_UUID); + journal::CreateRequest<I> *req = journal::CreateRequest<I>::create( + io_ctx, image_id, order, splay_width, object_pool, cls::journal::Tag::TAG_CLASS_NEW, + tag_data, IMAGE_CLIENT_ID, op_work_queue, &cond); + req->send(); + + return cond.wait(); +} + +template <typename I> +int Journal<I>::remove(librados::IoCtx &io_ctx, const std::string &image_id) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 5) << __func__ << ": image=" << image_id << dendl; + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + C_SaferCond cond; + journal::RemoveRequest<I> *req = 
journal::RemoveRequest<I>::create( + io_ctx, image_id, IMAGE_CLIENT_ID, op_work_queue, &cond); + req->send(); + + return cond.wait(); +} + +template <typename I> +int Journal<I>::reset(librados::IoCtx &io_ctx, const std::string &image_id) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 5) << __func__ << ": image=" << image_id << dendl; + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + C_SaferCond cond; + auto req = journal::ResetRequest<I>::create(io_ctx, image_id, IMAGE_CLIENT_ID, + Journal<>::LOCAL_MIRROR_UUID, + op_work_queue, &cond); + req->send(); + + return cond.wait(); +} + +template <typename I> +void Journal<I>::is_tag_owner(I *image_ctx, bool *owner, + Context *on_finish) { + Journal<I>::is_tag_owner(image_ctx->md_ctx, image_ctx->id, owner, + image_ctx->op_work_queue, on_finish); +} + +template <typename I> +void Journal<I>::is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id, + bool *is_tag_owner, ContextWQ *op_work_queue, + Context *on_finish) { + CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + ldout(cct, 20) << __func__ << dendl; + + C_IsTagOwner<I> *is_tag_owner_ctx = new C_IsTagOwner<I>( + io_ctx, image_id, is_tag_owner, op_work_queue, on_finish); + get_tags(cct, is_tag_owner_ctx->journaler, &is_tag_owner_ctx->client, + &is_tag_owner_ctx->client_meta, &is_tag_owner_ctx->tag_tid, + &is_tag_owner_ctx->tag_data, is_tag_owner_ctx); +} + +template <typename I> +void Journal<I>::get_tag_owner(IoCtx& io_ctx, std::string& image_id, + std::string *mirror_uuid, + ContextWQ *op_work_queue, Context *on_finish) { + CephContext *cct = static_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << __func__ << dendl; + + auto ctx = new C_GetTagOwner(io_ctx, image_id, mirror_uuid, on_finish); + get_tags(cct, &ctx->journaler, &ctx->client, &ctx->client_meta, &ctx->tag_tid, + &ctx->tag_data, 
create_async_context_callback(op_work_queue, ctx)); +} + +template <typename I> +int Journal<I>::request_resync(I *image_ctx) { + CephContext *cct = image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + Journaler journaler(image_ctx->md_ctx, image_ctx->id, IMAGE_CLIENT_ID, {}); + + Mutex lock("lock"); + journal::ImageClientMeta client_meta; + uint64_t tag_tid; + journal::TagData tag_data; + + C_SaferCond open_ctx; + auto open_req = journal::OpenRequest<I>::create(image_ctx, &journaler, &lock, + &client_meta, &tag_tid, + &tag_data, &open_ctx); + open_req->send(); + + BOOST_SCOPE_EXIT_ALL(&journaler) { + journaler.shut_down(); + }; + + int r = open_ctx.wait(); + if (r < 0) { + return r; + } + + client_meta.resync_requested = true; + + journal::ClientData client_data(client_meta); + bufferlist client_data_bl; + encode(client_data, client_data_bl); + + C_SaferCond update_client_ctx; + journaler.update_client(client_data_bl, &update_client_ctx); + + r = update_client_ctx.wait(); + if (r < 0) { + lderr(cct) << __func__ << ": " + << "failed to update client: " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template <typename I> +void Journal<I>::promote(I *image_ctx, Context *on_finish) { + CephContext *cct = image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + auto promote_req = journal::PromoteRequest<I>::create(image_ctx, false, + on_finish); + promote_req->send(); +} + +template <typename I> +void Journal<I>::demote(I *image_ctx, Context *on_finish) { + CephContext *cct = image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + auto req = journal::DemoteRequest<I>::create(*image_ctx, on_finish); + req->send(); +} + +template <typename I> +bool Journal<I>::is_journal_ready() const { + Mutex::Locker locker(m_lock); + return (m_state == STATE_READY); +} + +template <typename I> +bool Journal<I>::is_journal_replaying() const { + Mutex::Locker locker(m_lock); + return is_journal_replaying(m_lock); +} + +template <typename I> +bool 
Journal<I>::is_journal_replaying(const Mutex &) const { + ceph_assert(m_lock.is_locked()); + return (m_state == STATE_REPLAYING || + m_state == STATE_FLUSHING_REPLAY || + m_state == STATE_FLUSHING_RESTART || + m_state == STATE_RESTARTING_REPLAY); +} + +template <typename I> +bool Journal<I>::is_journal_appending() const { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + Mutex::Locker locker(m_lock); + return (m_state == STATE_READY && + !m_image_ctx.get_journal_policy()->append_disabled()); +} + +template <typename I> +void Journal<I>::wait_for_journal_ready(Context *on_ready) { + on_ready = create_async_context_callback(m_image_ctx, on_ready); + + Mutex::Locker locker(m_lock); + if (m_state == STATE_READY) { + on_ready->complete(m_error_result); + } else { + wait_for_steady_state(on_ready); + } +} + +template <typename I> +void Journal<I>::open(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + on_finish = create_async_context_callback(m_image_ctx, on_finish); + + // inject our handler into the object dispatcher chain + m_image_ctx.io_object_dispatcher->register_object_dispatch( + journal::ObjectDispatch<I>::create(&m_image_ctx, this)); + + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_UNINITIALIZED); + wait_for_steady_state(on_finish); + create_journaler(); +} + +template <typename I> +void Journal<I>::close(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + on_finish = new FunctionContext([this, on_finish](int r) { + // remove our handler from object dispatcher chain - preserve error + auto ctx = new FunctionContext([on_finish, r](int _) { + on_finish->complete(r); + }); + m_image_ctx.io_object_dispatcher->shut_down_object_dispatch( + io::OBJECT_DISPATCH_LAYER_JOURNAL, ctx); + }); + on_finish = create_async_context_callback(m_image_ctx, on_finish); + + Mutex::Locker locker(m_lock); + while (m_listener_notify) { + 
m_listener_cond.Wait(m_lock); + } + + Listeners listeners(m_listeners); + m_listener_notify = true; + m_lock.Unlock(); + for (auto listener : listeners) { + listener->handle_close(); + } + + m_lock.Lock(); + m_listener_notify = false; + m_listener_cond.Signal(); + + ceph_assert(m_state != STATE_UNINITIALIZED); + if (m_state == STATE_CLOSED) { + on_finish->complete(m_error_result); + return; + } + + if (m_state == STATE_READY) { + stop_recording(); + } + + m_close_pending = true; + wait_for_steady_state(on_finish); +} + +template <typename I> +bool Journal<I>::is_tag_owner() const { + Mutex::Locker locker(m_lock); + return is_tag_owner(m_lock); +} + +template <typename I> +bool Journal<I>::is_tag_owner(const Mutex &) const { + ceph_assert(m_lock.is_locked()); + return (m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID); +} + +template <typename I> +uint64_t Journal<I>::get_tag_tid() const { + Mutex::Locker locker(m_lock); + return m_tag_tid; +} + +template <typename I> +journal::TagData Journal<I>::get_tag_data() const { + Mutex::Locker locker(m_lock); + return m_tag_data; +} + +template <typename I> +void Journal<I>::allocate_local_tag(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + journal::TagPredecessor predecessor; + predecessor.mirror_uuid = LOCAL_MIRROR_UUID; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_journaler != nullptr && is_tag_owner(m_lock)); + + cls::journal::Client client; + int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to retrieve client: " << cpp_strerror(r) << dendl; + m_image_ctx.op_work_queue->queue(on_finish, r); + return; + } + + // since we are primary, populate the predecessor with our known commit + // position + ceph_assert(m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID); + if (!client.commit_position.object_positions.empty()) { + auto position = 
client.commit_position.object_positions.front(); + predecessor.commit_valid = true; + predecessor.tag_tid = position.tag_tid; + predecessor.entry_tid = position.entry_tid; + } + } + + allocate_tag(LOCAL_MIRROR_UUID, predecessor, on_finish); +} + +template <typename I> +void Journal<I>::allocate_tag(const std::string &mirror_uuid, + const journal::TagPredecessor &predecessor, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": mirror_uuid=" << mirror_uuid + << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_journaler != nullptr); + + journal::TagData tag_data; + tag_data.mirror_uuid = mirror_uuid; + tag_data.predecessor = predecessor; + + bufferlist tag_bl; + encode(tag_data, tag_bl); + + C_DecodeTag *decode_tag_ctx = new C_DecodeTag(cct, &m_lock, &m_tag_tid, + &m_tag_data, on_finish); + m_journaler->allocate_tag(m_tag_class, tag_bl, &decode_tag_ctx->tag, + decode_tag_ctx); +} + +template <typename I> +void Journal<I>::flush_commit_position(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_journaler != nullptr); + m_journaler->flush_commit_position(on_finish); +} + +template <typename I> +void Journal<I>::user_flushed() { + if (m_state == STATE_READY && !m_user_flushed.exchange(true) && + m_image_ctx.config.template get_val<bool>("rbd_journal_object_writethrough_until_flush")) { + Mutex::Locker locker(m_lock); + if (m_state == STATE_READY) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + ceph_assert(m_journaler != nullptr); + m_journaler->set_append_batch_options( + m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_flush_interval"), + m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_object_flush_bytes"), + m_image_ctx.config.template get_val<double>("rbd_journal_object_flush_age")); + } else { + 
m_user_flushed = false; + } + } +} + +template <typename I> +uint64_t Journal<I>::append_write_event(uint64_t offset, size_t length, + const bufferlist &bl, + bool flush_entry) { + ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size()); + uint64_t max_write_data_size = + m_max_append_size - journal::AioWriteEvent::get_fixed_size(); + + // ensure that the write event fits within the journal entry + Bufferlists bufferlists; + uint64_t bytes_remaining = length; + uint64_t event_offset = 0; + do { + uint64_t event_length = std::min(bytes_remaining, max_write_data_size); + + bufferlist event_bl; + event_bl.substr_of(bl, event_offset, event_length); + journal::EventEntry event_entry(journal::AioWriteEvent(offset + event_offset, + event_length, + event_bl), + ceph_clock_now()); + + bufferlists.emplace_back(); + encode(event_entry, bufferlists.back()); + + event_offset += event_length; + bytes_remaining -= event_length; + } while (bytes_remaining > 0); + + return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists, offset, + length, flush_entry, 0); +} + +template <typename I> +uint64_t Journal<I>::append_io_event(journal::EventEntry &&event_entry, + uint64_t offset, size_t length, + bool flush_entry, int filter_ret_val) { + bufferlist bl; + event_entry.timestamp = ceph_clock_now(); + encode(event_entry, bl); + return append_io_events(event_entry.get_event_type(), {bl}, offset, length, + flush_entry, filter_ret_val); +} + +template <typename I> +uint64_t Journal<I>::append_io_events(journal::EventType event_type, + const Bufferlists &bufferlists, + uint64_t offset, size_t length, + bool flush_entry, int filter_ret_val) { + ceph_assert(!bufferlists.empty()); + + uint64_t tid; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_READY); + + tid = ++m_event_tid; + ceph_assert(tid != 0); + } + + Futures futures; + for (auto &bl : bufferlists) { + ceph_assert(bl.length() <= m_max_append_size); + 
futures.push_back(m_journaler->append(m_tag_tid, bl)); + } + + { + Mutex::Locker event_locker(m_event_lock); + m_events[tid] = Event(futures, offset, length, filter_ret_val); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": " + << "event=" << event_type << ", " + << "offset=" << offset << ", " + << "length=" << length << ", " + << "flush=" << flush_entry << ", tid=" << tid << dendl; + + Context *on_safe = create_async_context_callback( + m_image_ctx, new C_IOEventSafe(this, tid)); + if (flush_entry) { + futures.back().flush(on_safe); + } else { + futures.back().wait(on_safe); + } + + return tid; +} + +template <typename I> +void Journal<I>::commit_io_event(uint64_t tid, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", " + "r=" << r << dendl; + + Mutex::Locker event_locker(m_event_lock); + typename Events::iterator it = m_events.find(tid); + if (it == m_events.end()) { + return; + } + complete_event(it, r); +} + +template <typename I> +void Journal<I>::commit_io_event_extent(uint64_t tid, uint64_t offset, + uint64_t length, int r) { + ceph_assert(length > 0); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", " + << "offset=" << offset << ", " + << "length=" << length << ", " + << "r=" << r << dendl; + + Mutex::Locker event_locker(m_event_lock); + typename Events::iterator it = m_events.find(tid); + if (it == m_events.end()) { + return; + } + + Event &event = it->second; + if (event.ret_val == 0 && r < 0) { + event.ret_val = r; + } + + ExtentInterval extent; + extent.insert(offset, length); + + ExtentInterval intersect; + intersect.intersection_of(extent, event.pending_extents); + + event.pending_extents.subtract(intersect); + if (!event.pending_extents.empty()) { + ldout(cct, 20) << this << " " << __func__ << ": " + << "pending extents: " << event.pending_extents << dendl; + return; + } + 
complete_event(it, event.ret_val); +} + +template <typename I> +void Journal<I>::append_op_event(uint64_t op_tid, + journal::EventEntry &&event_entry, + Context *on_safe) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + bufferlist bl; + event_entry.timestamp = ceph_clock_now(); + encode(event_entry, bl); + + Future future; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_READY); + + future = m_journaler->append(m_tag_tid, bl); + + // delay committing op event to ensure consistent replay + ceph_assert(m_op_futures.count(op_tid) == 0); + m_op_futures[op_tid] = future; + } + + on_safe = create_async_context_callback(m_image_ctx, on_safe); + on_safe = new FunctionContext([this, on_safe](int r) { + // ensure all committed IO before this op is committed + m_journaler->flush_commit_position(on_safe); + }); + future.flush(on_safe); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "op_tid=" << op_tid << ", " + << "event=" << event_entry.get_event_type() << dendl; +} + +template <typename I> +void Journal<I>::commit_op_event(uint64_t op_tid, int r, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": op_tid=" << op_tid << ", " + << "r=" << r << dendl; + + journal::EventEntry event_entry((journal::OpFinishEvent(op_tid, r)), + ceph_clock_now()); + + bufferlist bl; + encode(event_entry, bl); + + Future op_start_future; + Future op_finish_future; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_READY); + + // ready to commit op event + auto it = m_op_futures.find(op_tid); + ceph_assert(it != m_op_futures.end()); + op_start_future = it->second; + m_op_futures.erase(it); + + op_finish_future = m_journaler->append(m_tag_tid, bl); + } + + op_finish_future.flush(create_async_context_callback( + m_image_ctx, new C_OpEventSafe(this, op_tid, op_start_future, + op_finish_future, on_safe))); +} + +template <typename I> +void 
Journal<I>::replay_op_ready(uint64_t op_tid, Context *on_resume) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": op_tid=" << op_tid << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_journal_replay != nullptr); + m_journal_replay->replay_op_ready(op_tid, on_resume); + } +} + +template <typename I> +void Journal<I>::flush_event(uint64_t tid, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", " + << "on_safe=" << on_safe << dendl; + + Future future; + { + Mutex::Locker event_locker(m_event_lock); + future = wait_event(m_lock, tid, on_safe); + } + + if (future.is_valid()) { + future.flush(nullptr); + } +} + +template <typename I> +void Journal<I>::wait_event(uint64_t tid, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", " + << "on_safe=" << on_safe << dendl; + + Mutex::Locker event_locker(m_event_lock); + wait_event(m_lock, tid, on_safe); +} + +template <typename I> +typename Journal<I>::Future Journal<I>::wait_event(Mutex &lock, uint64_t tid, + Context *on_safe) { + ceph_assert(m_event_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + + typename Events::iterator it = m_events.find(tid); + ceph_assert(it != m_events.end()); + + Event &event = it->second; + if (event.safe) { + // journal entry already safe + ldout(cct, 20) << this << " " << __func__ << ": " + << "journal entry already safe" << dendl; + m_image_ctx.op_work_queue->queue(on_safe, event.ret_val); + return Future(); + } + + event.on_safe_contexts.push_back(create_async_context_callback(m_image_ctx, + on_safe)); + return event.futures.back(); +} + +template <typename I> +void Journal<I>::start_external_replay(journal::Replay<I> **journal_replay, + Context *on_start) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Mutex::Locker locker(m_lock); + 
ceph_assert(m_state == STATE_READY); + ceph_assert(m_journal_replay == nullptr); + + on_start = util::create_async_context_callback(m_image_ctx, on_start); + on_start = new FunctionContext( + [this, journal_replay, on_start](int r) { + handle_start_external_replay(r, journal_replay, on_start); + }); + + // safely flush all in-flight events before starting external replay + m_journaler->stop_append(util::create_async_context_callback(m_image_ctx, + on_start)); +} + +template <typename I> +void Journal<I>::handle_start_external_replay(int r, + journal::Replay<I> **journal_replay, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_READY); + ceph_assert(m_journal_replay == nullptr); + + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to stop recording: " << cpp_strerror(r) << dendl; + *journal_replay = nullptr; + + // get back to a sane-state + start_append(); + on_finish->complete(r); + return; + } + + transition_state(STATE_REPLAYING, 0); + m_journal_replay = journal::Replay<I>::create(m_image_ctx); + *journal_replay = m_journal_replay; + on_finish->complete(0); +} + +template <typename I> +void Journal<I>::stop_external_replay() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_journal_replay != nullptr); + ceph_assert(m_state == STATE_REPLAYING); + + delete m_journal_replay; + m_journal_replay = nullptr; + + if (m_close_pending) { + destroy_journaler(0); + return; + } + + start_append(); +} + +template <typename I> +void Journal<I>::create_journaler() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(m_lock.is_locked()); + ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_RESTARTING_REPLAY); + ceph_assert(m_journaler == NULL); + + 
transition_state(STATE_INITIALIZING, 0); + ::journal::Settings settings; + settings.commit_interval = + m_image_ctx.config.template get_val<double>("rbd_journal_commit_age"); + settings.max_payload_bytes = + m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_max_payload_bytes"); + settings.max_concurrent_object_sets = + m_image_ctx.config.template get_val<uint64_t>("rbd_journal_max_concurrent_object_sets"); + // TODO: a configurable filter to exclude certain peers from being + // disconnected. + settings.whitelisted_laggy_clients = {IMAGE_CLIENT_ID}; + + m_journaler = new Journaler(m_work_queue, m_timer, m_timer_lock, + m_image_ctx.md_ctx, m_image_ctx.id, + IMAGE_CLIENT_ID, settings); + m_journaler->add_listener(&m_metadata_listener); + + Context *ctx = create_async_context_callback( + m_image_ctx, create_context_callback< + Journal<I>, &Journal<I>::handle_open>(this)); + auto open_req = journal::OpenRequest<I>::create(&m_image_ctx, m_journaler, + &m_lock, &m_client_meta, + &m_tag_tid, &m_tag_data, ctx); + open_req->send(); +} + +template <typename I> +void Journal<I>::destroy_journaler(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + ceph_assert(m_lock.is_locked()); + + delete m_journal_replay; + m_journal_replay = NULL; + + m_journaler->remove_listener(&m_metadata_listener); + + transition_state(STATE_CLOSING, r); + + Context *ctx = create_async_context_callback( + m_image_ctx, create_context_callback< + Journal<I>, &Journal<I>::handle_journal_destroyed>(this)); + ctx = new FunctionContext( + [this, ctx](int r) { + Mutex::Locker locker(m_lock); + m_journaler->shut_down(ctx); + }); + m_async_journal_op_tracker.wait(m_image_ctx, ctx); +} + +template <typename I> +void Journal<I>::recreate_journaler(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + ceph_assert(m_lock.is_locked()); + ceph_assert(m_state == 
STATE_FLUSHING_RESTART || + m_state == STATE_FLUSHING_REPLAY); + + delete m_journal_replay; + m_journal_replay = NULL; + + m_journaler->remove_listener(&m_metadata_listener); + + transition_state(STATE_RESTARTING_REPLAY, r); + m_journaler->shut_down(create_async_context_callback( + m_image_ctx, create_context_callback< + Journal<I>, &Journal<I>::handle_journal_destroyed>(this))); +} + +template <typename I> +void Journal<I>::complete_event(typename Events::iterator it, int r) { + ceph_assert(m_event_lock.is_locked()); + ceph_assert(m_state == STATE_READY); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << it->first << " " + << "r=" << r << dendl; + + Event &event = it->second; + if (r < 0 && r == event.filter_ret_val) { + // ignore allowed error codes + r = 0; + } + if (r < 0) { + // event recorded to journal but failed to update disk, we cannot + // commit this IO event. this event must be replayed. + ceph_assert(event.safe); + lderr(cct) << this << " " << __func__ << ": " + << "failed to commit IO to disk, replay required: " + << cpp_strerror(r) << dendl; + } + + event.committed_io = true; + if (event.safe) { + if (r >= 0) { + for (auto &future : event.futures) { + m_journaler->committed(future); + } + } + m_events.erase(it); + } +} + +template <typename I> +void Journal<I>::start_append() { + ceph_assert(m_lock.is_locked()); + + m_journaler->start_append( + m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_max_in_flight_appends")); + if (!m_image_ctx.config.template get_val<bool>("rbd_journal_object_writethrough_until_flush")) { + m_journaler->set_append_batch_options( + m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_flush_interval"), + m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_object_flush_bytes"), + m_image_ctx.config.template get_val<double>("rbd_journal_object_flush_age")); + } + + transition_state(STATE_READY, 0); +} + +template <typename I> +void 
Journal<I>::handle_open(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_INITIALIZING); + + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to initialize journal: " << cpp_strerror(r) + << dendl; + destroy_journaler(r); + return; + } + + m_tag_class = m_client_meta.tag_class; + m_max_append_size = m_journaler->get_max_append_size(); + ldout(cct, 20) << this << " " << __func__ << ": " + << "tag_class=" << m_tag_class << ", " + << "max_append_size=" << m_max_append_size << dendl; + + transition_state(STATE_REPLAYING, 0); + m_journal_replay = journal::Replay<I>::create(m_image_ctx); + m_journaler->start_replay(&m_replay_handler); +} + +template <typename I> +void Journal<I>::handle_replay_ready() { + CephContext *cct = m_image_ctx.cct; + ReplayEntry replay_entry; + { + Mutex::Locker locker(m_lock); + if (m_state != STATE_REPLAYING) { + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + if (!m_journaler->try_pop_front(&replay_entry)) { + return; + } + + // only one entry should be in-flight at a time + ceph_assert(!m_processing_entry); + m_processing_entry = true; + } + + bufferlist data = replay_entry.get_data(); + auto it = data.cbegin(); + + journal::EventEntry event_entry; + int r = m_journal_replay->decode(&it, &event_entry); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to decode journal event entry" << dendl; + handle_replay_process_safe(replay_entry, r); + return; + } + + Context *on_ready = create_context_callback< + Journal<I>, &Journal<I>::handle_replay_process_ready>(this); + Context *on_commit = new C_ReplayProcessSafe(this, std::move(replay_entry)); + m_journal_replay->process(event_entry, on_ready, on_commit); +} + +template <typename I> +void Journal<I>::handle_replay_complete(int r) { + CephContext *cct = m_image_ctx.cct; + + bool cancel_ops = 
false; + { + Mutex::Locker locker(m_lock); + if (m_state != STATE_REPLAYING) { + return; + } + + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + if (r < 0) { + cancel_ops = true; + transition_state(STATE_FLUSHING_RESTART, r); + } else { + // state might change back to FLUSHING_RESTART on flush error + transition_state(STATE_FLUSHING_REPLAY, 0); + } + } + + Context *ctx = new FunctionContext([this, cct](int r) { + ldout(cct, 20) << this << " handle_replay_complete: " + << "handle shut down replay" << dendl; + + State state; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_FLUSHING_RESTART || + m_state == STATE_FLUSHING_REPLAY); + state = m_state; + } + + if (state == STATE_FLUSHING_RESTART) { + handle_flushing_restart(0); + } else { + handle_flushing_replay(); + } + }); + ctx = new FunctionContext([this, ctx](int r) { + // ensure the commit position is flushed to disk + m_journaler->flush_commit_position(ctx); + }); + ctx = new FunctionContext([this, cct, cancel_ops, ctx](int r) { + ldout(cct, 20) << this << " handle_replay_complete: " + << "shut down replay" << dendl; + m_journal_replay->shut_down(cancel_ops, ctx); + }); + m_journaler->stop_replay(ctx); +} + +template <typename I> +void Journal<I>::handle_replay_process_ready(int r) { + // journal::Replay is ready for more events -- attempt to pop another + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(r == 0); + { + Mutex::Locker locker(m_lock); + ceph_assert(m_processing_entry); + m_processing_entry = false; + } + handle_replay_ready(); +} + +template <typename I> +void Journal<I>::handle_replay_process_safe(ReplayEntry replay_entry, int r) { + CephContext *cct = m_image_ctx.cct; + + m_lock.Lock(); + ceph_assert(m_state == STATE_REPLAYING || + m_state == STATE_FLUSHING_RESTART || + m_state == STATE_FLUSHING_REPLAY); + + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + if (r < 0) { + if (r != 
-ECANCELED) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to commit journal event to disk: " + << cpp_strerror(r) << dendl; + } + + if (m_state == STATE_REPLAYING) { + // abort the replay if we have an error + transition_state(STATE_FLUSHING_RESTART, r); + m_lock.Unlock(); + + // stop replay, shut down, and restart + Context* ctx = create_context_callback< + Journal<I>, &Journal<I>::handle_flushing_restart>(this); + ctx = new FunctionContext([this, ctx](int r) { + // ensure the commit position is flushed to disk + m_journaler->flush_commit_position(ctx); + }); + ctx = new FunctionContext([this, cct, ctx](int r) { + ldout(cct, 20) << this << " handle_replay_process_safe: " + << "shut down replay" << dendl; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_FLUSHING_RESTART); + } + + m_journal_replay->shut_down(true, ctx); + }); + m_journaler->stop_replay(ctx); + return; + } else if (m_state == STATE_FLUSHING_REPLAY) { + // end-of-replay flush in-progress -- we need to restart replay + transition_state(STATE_FLUSHING_RESTART, r); + m_lock.Unlock(); + return; + } + } else { + // only commit the entry if written successfully + m_journaler->committed(replay_entry); + } + m_lock.Unlock(); +} + +template <typename I> +void Journal<I>::handle_flushing_restart(int r) { + Mutex::Locker locker(m_lock); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(r == 0); + ceph_assert(m_state == STATE_FLUSHING_RESTART); + if (m_close_pending) { + destroy_journaler(r); + return; + } + + recreate_journaler(r); +} + +template <typename I> +void Journal<I>::handle_flushing_replay() { + Mutex::Locker locker(m_lock); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(m_state == STATE_FLUSHING_REPLAY || + m_state == STATE_FLUSHING_RESTART); + if (m_close_pending) { + destroy_journaler(0); + return; + } else if (m_state == 
STATE_FLUSHING_RESTART) { + // failed to replay one-or-more events -- restart + recreate_journaler(0); + return; + } + + delete m_journal_replay; + m_journal_replay = NULL; + + m_error_result = 0; + start_append(); +} + +template <typename I> +void Journal<I>::handle_recording_stopped(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_STOPPING); + + destroy_journaler(r); +} + +template <typename I> +void Journal<I>::handle_journal_destroyed(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << this << " " << __func__ + << "error detected while closing journal: " << cpp_strerror(r) + << dendl; + } + + Mutex::Locker locker(m_lock); + delete m_journaler; + m_journaler = nullptr; + + ceph_assert(m_state == STATE_CLOSING || m_state == STATE_RESTARTING_REPLAY); + if (m_state == STATE_RESTARTING_REPLAY) { + create_journaler(); + return; + } + + transition_state(STATE_CLOSED, r); +} + +template <typename I> +void Journal<I>::handle_io_event_safe(int r, uint64_t tid) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", " + << "tid=" << tid << dendl; + + // journal will be flushed before closing + ceph_assert(m_state == STATE_READY || m_state == STATE_STOPPING); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to commit IO event: " << cpp_strerror(r) << dendl; + } + + Contexts on_safe_contexts; + { + Mutex::Locker event_locker(m_event_lock); + typename Events::iterator it = m_events.find(tid); + ceph_assert(it != m_events.end()); + + Event &event = it->second; + on_safe_contexts.swap(event.on_safe_contexts); + + if (r < 0 || event.committed_io) { + // failed journal write so IO won't be sent -- or IO extent was + // overwritten by future IO operations so this was a no-op IO 
event + event.ret_val = r; + for (auto &future : event.futures) { + m_journaler->committed(future); + } + } + + if (event.committed_io) { + m_events.erase(it); + } else { + event.safe = true; + } + } + + ldout(cct, 20) << this << " " << __func__ << ": " + << "completing tid=" << tid << dendl; + + // alert the cache about the journal event status + for (Contexts::iterator it = on_safe_contexts.begin(); + it != on_safe_contexts.end(); ++it) { + (*it)->complete(r); + } +} + +template <typename I> +void Journal<I>::handle_op_event_safe(int r, uint64_t tid, + const Future &op_start_future, + const Future &op_finish_future, + Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", " + << "tid=" << tid << dendl; + + // journal will be flushed before closing + ceph_assert(m_state == STATE_READY || m_state == STATE_STOPPING); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to commit op event: " << cpp_strerror(r) << dendl; + } + + m_journaler->committed(op_start_future); + m_journaler->committed(op_finish_future); + + // reduce the replay window after committing an op event + m_journaler->flush_commit_position(on_safe); +} + +template <typename I> +void Journal<I>::stop_recording() { + ceph_assert(m_lock.is_locked()); + ceph_assert(m_journaler != NULL); + + ceph_assert(m_state == STATE_READY); + transition_state(STATE_STOPPING, 0); + + m_journaler->stop_append(util::create_async_context_callback( + m_image_ctx, create_context_callback< + Journal<I>, &Journal<I>::handle_recording_stopped>(this))); +} + +template <typename I> +void Journal<I>::transition_state(State state, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": new state=" << state << dendl; + ceph_assert(m_lock.is_locked()); + m_state = state; + + if (m_error_result == 0 && r < 0) { + m_error_result = r; + } + + if (is_steady_state()) { + Contexts 
wait_for_state_contexts(std::move(m_wait_for_state_contexts)); + for (auto ctx : wait_for_state_contexts) { + ctx->complete(m_error_result); + } + } +} + +template <typename I> +bool Journal<I>::is_steady_state() const { + ceph_assert(m_lock.is_locked()); + switch (m_state) { + case STATE_READY: + case STATE_CLOSED: + return true; + case STATE_UNINITIALIZED: + case STATE_INITIALIZING: + case STATE_REPLAYING: + case STATE_FLUSHING_RESTART: + case STATE_RESTARTING_REPLAY: + case STATE_FLUSHING_REPLAY: + case STATE_STOPPING: + case STATE_CLOSING: + break; + } + return false; +} + +template <typename I> +void Journal<I>::wait_for_steady_state(Context *on_state) { + ceph_assert(m_lock.is_locked()); + ceph_assert(!is_steady_state()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": on_state=" << on_state + << dendl; + m_wait_for_state_contexts.push_back(on_state); +} + +template <typename I> +int Journal<I>::is_resync_requested(bool *do_resync) { + Mutex::Locker l(m_lock); + return check_resync_requested(do_resync); +} + +template <typename I> +int Journal<I>::check_resync_requested(bool *do_resync) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(m_lock.is_locked()); + ceph_assert(do_resync != nullptr); + + cls::journal::Client client; + int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to retrieve client: " << cpp_strerror(r) << dendl; + return r; + } + + librbd::journal::ClientData client_data; + auto bl_it = client.data.cbegin(); + try { + decode(client_data, bl_it); + } catch (const buffer::error &err) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to decode client data: " << err << dendl; + return -EINVAL; + } + + journal::ImageClientMeta *image_client_meta = + boost::get<journal::ImageClientMeta>(&client_data.client_meta); + if (image_client_meta == nullptr) 
{ + lderr(cct) << this << " " << __func__ << ": " + << "failed to access image client meta struct" << dendl; + return -EINVAL; + } + + *do_resync = image_client_meta->resync_requested; + + return 0; +} + +struct C_RefreshTags : public Context { + util::AsyncOpTracker &async_op_tracker; + Context *on_finish = nullptr; + + Mutex lock; + uint64_t tag_tid = 0; + journal::TagData tag_data; + + explicit C_RefreshTags(util::AsyncOpTracker &async_op_tracker) + : async_op_tracker(async_op_tracker), + lock("librbd::Journal::C_RefreshTags::lock") { + async_op_tracker.start_op(); + } + ~C_RefreshTags() override { + async_op_tracker.finish_op(); + } + + void finish(int r) override { + on_finish->complete(r); + } +}; + +template <typename I> +void Journal<I>::handle_metadata_updated() { + CephContext *cct = m_image_ctx.cct; + Mutex::Locker locker(m_lock); + + if (m_state != STATE_READY && !is_journal_replaying(m_lock)) { + return; + } else if (is_tag_owner(m_lock)) { + ldout(cct, 20) << this << " " << __func__ << ": primary image" << dendl; + return; + } else if (m_listeners.empty()) { + ldout(cct, 20) << this << " " << __func__ << ": no listeners" << dendl; + return; + } + + uint64_t refresh_sequence = ++m_refresh_sequence; + ldout(cct, 20) << this << " " << __func__ << ": " + << "refresh_sequence=" << refresh_sequence << dendl; + + // pull the most recent tags from the journal, decode, and + // update the internal tag state + C_RefreshTags *refresh_ctx = new C_RefreshTags(m_async_journal_op_tracker); + refresh_ctx->on_finish = new FunctionContext( + [this, refresh_sequence, refresh_ctx](int r) { + handle_refresh_metadata(refresh_sequence, refresh_ctx->tag_tid, + refresh_ctx->tag_data, r); + }); + C_DecodeTags *decode_tags_ctx = new C_DecodeTags( + cct, &refresh_ctx->lock, &refresh_ctx->tag_tid, + &refresh_ctx->tag_data, refresh_ctx); + m_journaler->get_tags(m_tag_tid == 0 ? 
/**
 * Completion handler for a tag refresh started by handle_metadata_updated().
 *
 * Applies the refreshed tag if it is newer than the cached one, then
 * notifies a snapshot of the registered listeners about a promotion to tag
 * owner or, failing that, a requested resync. Listener callbacks run with
 * m_lock released.
 *
 * @param refresh_sequence sequence number assigned when the refresh started
 * @param tag_tid          tag tid produced by the refresh
 * @param tag_data         tag data produced by the refresh
 * @param r                result of the refresh; negative values abort
 */
template <typename I>
void Journal<I>::handle_refresh_metadata(uint64_t refresh_sequence,
                                         uint64_t tag_tid,
                                         journal::TagData tag_data, int r) {
  CephContext *cct = m_image_ctx.cct;
  Mutex::Locker locker(m_lock);

  if (r < 0) {
    lderr(cct) << this << " " << __func__ << ": failed to refresh metadata: "
               << cpp_strerror(r) << dendl;
    return;
  } else if (m_state != STATE_READY && !is_journal_replaying(m_lock)) {
    // journal is neither ready nor replaying -- drop the result
    return;
  } else if (refresh_sequence != m_refresh_sequence) {
    // another, more up-to-date refresh is in-flight
    return;
  }

  ldout(cct, 20) << this << " " << __func__ << ": "
                 << "refresh_sequence=" << refresh_sequence << ", "
                 << "tag_tid=" << tag_tid << ", "
                 << "tag_data=" << tag_data << dendl;
  // wait for any notification round that is already in progress
  // NOTE(review): m_state and m_refresh_sequence are not re-checked after
  // this wait -- confirm that is intended
  while (m_listener_notify) {
    m_listener_cond.Wait(m_lock);
  }

  // only adopt the refreshed tag when it is newer than the cached one
  bool was_tag_owner = is_tag_owner(m_lock);
  if (m_tag_tid < tag_tid) {
    m_tag_tid = tag_tid;
    m_tag_data = tag_data;
  }
  bool promoted_to_primary = (!was_tag_owner && is_tag_owner(m_lock));

  bool resync_requested = false;
  r = check_resync_requested(&resync_requested);
  if (r < 0) {
    lderr(cct) << this << " " << __func__ << ": "
               << "failed to check if a resync was requested" << dendl;
    return;
  }

  // snapshot the listeners and flag the notification round before dropping
  // the lock; remove_listener() waits on the same flag
  Listeners listeners(m_listeners);
  m_listener_notify = true;
  m_lock.Unlock();

  // a promotion takes priority over a resync request
  if (promoted_to_primary) {
    for (auto listener : listeners) {
      listener->handle_promoted();
    }
  } else if (resync_requested) {
    for (auto listener : listeners) {
      listener->handle_resync();
    }
  }

  // re-take the lock so the scoped locker releases a held mutex on exit
  m_lock.Lock();
  m_listener_notify = false;
  m_listener_cond.Signal();
}
m_listener_cond.Wait(m_lock); + } + m_listeners.erase(listener); +} + +} // namespace librbd + +#ifndef TEST_F +template class librbd::Journal<librbd::ImageCtx>; +#endif diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h new file mode 100644 index 00000000..e63cc4a7 --- /dev/null +++ b/src/librbd/Journal.h @@ -0,0 +1,386 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_H +#define CEPH_LIBRBD_JOURNAL_H + +#include "include/int_types.h" +#include "include/Context.h" +#include "include/interval_set.h" +#include "include/rados/librados_fwd.hpp" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/WorkQueue.h" +#include "journal/Future.h" +#include "journal/JournalMetadataListener.h" +#include "journal/ReplayEntry.h" +#include "journal/ReplayHandler.h" +#include "librbd/Utils.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" + +#include <algorithm> +#include <list> +#include <string> +#include <atomic> +#include <unordered_map> + +class SafeTimer; +namespace journal { +class Journaler; +} + +namespace librbd { + +class ImageCtx; + +namespace journal { template <typename> class Replay; } + +template <typename ImageCtxT = ImageCtx> +class Journal { +public: + /** + * @verbatim + * + * <start> + * | + * v + * UNINITIALIZED ---> INITIALIZING ---> REPLAYING ------> FLUSHING ---> READY + * | * . ^ * . * | + * | * . | * . * | + * | * . | (error) * . . . . . . . * | + * | * . | * . * | + * | * . | v . * | + * | * . | FLUSHING_RESTART . * | + * | * . | | . * | + * | * . | | . * | + * | * . | v . * v + * | * . | RESTARTING < * * * * * STOPPING + * | * . | | . | + * | * . | | . | + * | * * * * * * . \-------------/ . | + * | * (error) . . | + * | * . . . . . . . . . . . . . . . . | + * | * . . 
  // Journal state machine states. Only STATE_READY and STATE_CLOSED are
  // treated as steady by is_steady_state(); all others are transitional.
  enum State {
    STATE_UNINITIALIZED,
    STATE_INITIALIZING,
    // replay in progress; completes into FLUSHING_REPLAY, or into
    // FLUSHING_RESTART on error
    STATE_REPLAYING,
    // replay failed: replay is stopped and the commit position flushed
    // before the journaler is destroyed or recreated
    STATE_FLUSHING_RESTART,
    // handle_journal_destroyed() creates a new journaler in this state
    STATE_RESTARTING_REPLAY,
    // end-of-replay flush; a late error moves it back to FLUSHING_RESTART
    STATE_FLUSHING_REPLAY,
    STATE_READY,
    // entered by stop_recording() while the journaler stops appending
    STATE_STOPPING,
    // handle_journal_destroyed() moves this state on to CLOSED
    STATE_CLOSING,
    STATE_CLOSED
  };
void flush_commit_position(Context *on_finish); + + void user_flushed(); + + uint64_t append_write_event(uint64_t offset, size_t length, + const bufferlist &bl, + bool flush_entry); + uint64_t append_io_event(journal::EventEntry &&event_entry, + uint64_t offset, size_t length, + bool flush_entry, int filter_ret_val); + void commit_io_event(uint64_t tid, int r); + void commit_io_event_extent(uint64_t tid, uint64_t offset, uint64_t length, + int r); + + void append_op_event(uint64_t op_tid, journal::EventEntry &&event_entry, + Context *on_safe); + void commit_op_event(uint64_t tid, int r, Context *on_safe); + void replay_op_ready(uint64_t op_tid, Context *on_resume); + + void flush_event(uint64_t tid, Context *on_safe); + void wait_event(uint64_t tid, Context *on_safe); + + uint64_t allocate_op_tid() { + uint64_t op_tid = ++m_op_tid; + ceph_assert(op_tid != 0); + return op_tid; + } + + void start_external_replay(journal::Replay<ImageCtxT> **journal_replay, + Context *on_start); + void stop_external_replay(); + + void add_listener(journal::Listener *listener); + void remove_listener(journal::Listener *listener); + + int is_resync_requested(bool *do_resync); + + inline ContextWQ *get_work_queue() { + return m_work_queue; + } + +private: + ImageCtxT &m_image_ctx; + + // mock unit testing support + typedef journal::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Journaler Journaler; + typedef typename TypeTraits::Future Future; + typedef typename TypeTraits::ReplayEntry ReplayEntry; + + typedef std::list<bufferlist> Bufferlists; + typedef std::list<Context *> Contexts; + typedef std::list<Future> Futures; + typedef interval_set<uint64_t> ExtentInterval; + + struct Event { + Futures futures; + Contexts on_safe_contexts; + ExtentInterval pending_extents; + int filter_ret_val = 0; + bool committed_io = false; + bool safe = false; + int ret_val = 0; + + Event() { + } + Event(const Futures &_futures, uint64_t offset, size_t length, + int filter_ret_val) + : 
futures(_futures), filter_ret_val(filter_ret_val) { + if (length > 0) { + pending_extents.insert(offset, length); + } + } + }; + + typedef std::unordered_map<uint64_t, Event> Events; + typedef std::unordered_map<uint64_t, Future> TidToFutures; + + struct C_IOEventSafe : public Context { + Journal *journal; + uint64_t tid; + + C_IOEventSafe(Journal *_journal, uint64_t _tid) + : journal(_journal), tid(_tid) { + } + + void finish(int r) override { + journal->handle_io_event_safe(r, tid); + } + }; + + struct C_OpEventSafe : public Context { + Journal *journal; + uint64_t tid; + Future op_start_future; + Future op_finish_future; + Context *on_safe; + + C_OpEventSafe(Journal *journal, uint64_t tid, const Future &op_start_future, + const Future &op_finish_future, Context *on_safe) + : journal(journal), tid(tid), op_start_future(op_start_future), + op_finish_future(op_finish_future), on_safe(on_safe) { + } + + void finish(int r) override { + journal->handle_op_event_safe(r, tid, op_start_future, op_finish_future, + on_safe); + } + }; + + struct C_ReplayProcessSafe : public Context { + Journal *journal; + ReplayEntry replay_entry; + + C_ReplayProcessSafe(Journal *journal, ReplayEntry &&replay_entry) : + journal(journal), replay_entry(std::move(replay_entry)) { + } + void finish(int r) override { + journal->handle_replay_process_safe(replay_entry, r); + } + }; + + struct ReplayHandler : public ::journal::ReplayHandler { + Journal *journal; + ReplayHandler(Journal *_journal) : journal(_journal) { + } + + void get() override { + // TODO + } + void put() override { + // TODO + } + + void handle_entries_available() override { + journal->handle_replay_ready(); + } + void handle_complete(int r) override { + journal->handle_replay_complete(r); + } + }; + + ContextWQ *m_work_queue = nullptr; + SafeTimer *m_timer = nullptr; + Mutex *m_timer_lock = nullptr; + + Journaler *m_journaler; + mutable Mutex m_lock; + State m_state; + uint64_t m_max_append_size = 0; + uint64_t m_tag_class = 
0; + uint64_t m_tag_tid = 0; + journal::ImageClientMeta m_client_meta; + journal::TagData m_tag_data; + + int m_error_result; + Contexts m_wait_for_state_contexts; + + ReplayHandler m_replay_handler; + bool m_close_pending; + + Mutex m_event_lock; + uint64_t m_event_tid; + Events m_events; + + std::atomic<bool> m_user_flushed = false; + + std::atomic<uint64_t> m_op_tid = { 0 }; + TidToFutures m_op_futures; + + bool m_processing_entry = false; + bool m_blocking_writes; + + journal::Replay<ImageCtxT> *m_journal_replay; + + util::AsyncOpTracker m_async_journal_op_tracker; + + struct MetadataListener : public ::journal::JournalMetadataListener { + Journal<ImageCtxT> *journal; + + MetadataListener(Journal<ImageCtxT> *journal) : journal(journal) { } + + void handle_update(::journal::JournalMetadata *) override { + FunctionContext *ctx = new FunctionContext([this](int r) { + journal->handle_metadata_updated(); + }); + journal->m_work_queue->queue(ctx, 0); + } + } m_metadata_listener; + + typedef std::set<journal::Listener *> Listeners; + Listeners m_listeners; + Cond m_listener_cond; + bool m_listener_notify = false; + + uint64_t m_refresh_sequence = 0; + + bool is_journal_replaying(const Mutex &) const; + bool is_tag_owner(const Mutex &) const; + + uint64_t append_io_events(journal::EventType event_type, + const Bufferlists &bufferlists, + uint64_t offset, size_t length, bool flush_entry, + int filter_ret_val); + Future wait_event(Mutex &lock, uint64_t tid, Context *on_safe); + + void create_journaler(); + void destroy_journaler(int r); + void recreate_journaler(int r); + + void complete_event(typename Events::iterator it, int r); + + void start_append(); + + void handle_open(int r); + + void handle_replay_ready(); + void handle_replay_complete(int r); + void handle_replay_process_ready(int r); + void handle_replay_process_safe(ReplayEntry replay_entry, int r); + + void handle_start_external_replay(int r, + journal::Replay<ImageCtxT> **journal_replay, + Context 
*on_finish); + + void handle_flushing_restart(int r); + void handle_flushing_replay(); + + void handle_recording_stopped(int r); + + void handle_journal_destroyed(int r); + + void handle_io_event_safe(int r, uint64_t tid); + void handle_op_event_safe(int r, uint64_t tid, const Future &op_start_future, + const Future &op_finish_future, Context *on_safe); + + void stop_recording(); + + void transition_state(State state, int r); + + bool is_steady_state() const; + void wait_for_steady_state(Context *on_state); + + int check_resync_requested(bool *do_resync); + + void handle_metadata_updated(); + void handle_refresh_metadata(uint64_t refresh_sequence, uint64_t tag_tid, + journal::TagData tag_data, int r); + +}; + +} // namespace librbd + +extern template class librbd::Journal<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_H diff --git a/src/librbd/LibrbdAdminSocketHook.cc b/src/librbd/LibrbdAdminSocketHook.cc new file mode 100644 index 00000000..15b8fb97 --- /dev/null +++ b/src/librbd/LibrbdAdminSocketHook.cc @@ -0,0 +1,103 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" + +#include "librbd/ImageCtx.h" +#include "librbd/LibrbdAdminSocketHook.h" +#include "librbd/internal.h" +#include "librbd/io/ImageRequestWQ.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbdadminsocket: " + +namespace librbd { + +class LibrbdAdminSocketCommand { +public: + virtual ~LibrbdAdminSocketCommand() {} + virtual bool call(stringstream *ss) = 0; +}; + +class FlushCacheCommand : public LibrbdAdminSocketCommand { +public: + explicit FlushCacheCommand(ImageCtx *ictx) : ictx(ictx) {} + + bool call(stringstream *ss) override { + int r = ictx->io_work_queue->flush(); + if (r < 0) { + *ss << "flush: " << cpp_strerror(r); + return false; + } + return true; + } + +private: + ImageCtx *ictx; +}; + +struct InvalidateCacheCommand : public LibrbdAdminSocketCommand { 
+public: + explicit InvalidateCacheCommand(ImageCtx *ictx) : ictx(ictx) {} + + bool call(stringstream *ss) override { + int r = invalidate_cache(ictx); + if (r < 0) { + *ss << "invalidate_cache: " << cpp_strerror(r); + return false; + } + return true; + } + +private: + ImageCtx *ictx; +}; + +LibrbdAdminSocketHook::LibrbdAdminSocketHook(ImageCtx *ictx) : + admin_socket(ictx->cct->get_admin_socket()) { + + std::string command; + std::string imagename; + int r; + + imagename = ictx->md_ctx.get_pool_name() + "/" + ictx->name; + command = "rbd cache flush " + imagename; + + r = admin_socket->register_command(command, command, this, + "flush rbd image " + imagename + + " cache"); + if (r == 0) { + commands[command] = new FlushCacheCommand(ictx); + } + + command = "rbd cache invalidate " + imagename; + r = admin_socket->register_command(command, command, this, + "invalidate rbd image " + imagename + + " cache"); + if (r == 0) { + commands[command] = new InvalidateCacheCommand(ictx); + } +} + +LibrbdAdminSocketHook::~LibrbdAdminSocketHook() { + for (Commands::const_iterator i = commands.begin(); i != commands.end(); + ++i) { + (void)admin_socket->unregister_command(i->first); + delete i->second; + } +} + +bool LibrbdAdminSocketHook::call(std::string_view command, + const cmdmap_t& cmdmap, + std::string_view format, + bufferlist& out) { + Commands::const_iterator i = commands.find(command); + ceph_assert(i != commands.end()); + stringstream ss; + bool r = i->second->call(&ss); + out.append(ss); + return r; +} + +} // namespace librbd diff --git a/src/librbd/LibrbdAdminSocketHook.h b/src/librbd/LibrbdAdminSocketHook.h new file mode 100644 index 00000000..8b1b5b94 --- /dev/null +++ b/src/librbd/LibrbdAdminSocketHook.h @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H +#define CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H + +#include <map> + +#include "common/admin_socket.h" + 
+namespace librbd { + + struct ImageCtx; + class LibrbdAdminSocketCommand; + + class LibrbdAdminSocketHook : public AdminSocketHook { + public: + LibrbdAdminSocketHook(ImageCtx *ictx); + ~LibrbdAdminSocketHook() override; + + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override; + + private: + typedef std::map<std::string,LibrbdAdminSocketCommand*, + std::less<>> Commands; + + AdminSocket *admin_socket; + Commands commands; + }; +} + +#endif diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc new file mode 100644 index 00000000..acd7e874 --- /dev/null +++ b/src/librbd/LibrbdWriteback.cc @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <errno.h> + +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/Mutex.h" +#include "common/WorkQueue.h" +#include "osdc/Striper.h" +#include "include/Context.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" + +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/LibrbdWriteback.h" +#include "librbd/ObjectMap.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/io/ReadResult.h" + +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbdwriteback: " + +namespace librbd { + + /** + * context to wrap another context in a Mutex + * + * @param cct cct + * @param c context to finish + * @param l mutex to lock + */ + class C_ReadRequest : public Context { + public: + C_ReadRequest(CephContext *cct, Context *c, Mutex *cache_lock) + : m_cct(cct), m_ctx(c), m_cache_lock(cache_lock) { + } + void finish(int r) override { + ldout(m_cct, 20) << "aio_cb 
completing " << dendl; + { + Mutex::Locker cache_locker(*m_cache_lock); + m_ctx->complete(r); + } + ldout(m_cct, 20) << "aio_cb finished" << dendl; + } + private: + CephContext *m_cct; + Context *m_ctx; + Mutex *m_cache_lock; + }; + + class C_OrderedWrite : public Context { + public: + C_OrderedWrite(CephContext *cct, LibrbdWriteback::write_result_d *result, + const ZTracer::Trace &trace, LibrbdWriteback *wb) + : m_cct(cct), m_result(result), m_trace(trace), m_wb_handler(wb) {} + ~C_OrderedWrite() override {} + void finish(int r) override { + ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl; + { + Mutex::Locker l(m_wb_handler->m_lock); + ceph_assert(!m_result->done); + m_result->done = true; + m_result->ret = r; + m_wb_handler->complete_writes(m_result->oid); + } + ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl; + m_trace.event("finish"); + } + private: + CephContext *m_cct; + LibrbdWriteback::write_result_d *m_result; + ZTracer::Trace m_trace; + LibrbdWriteback *m_wb_handler; + }; + + struct C_CommitIOEventExtent : public Context { + ImageCtx *image_ctx; + uint64_t journal_tid; + uint64_t offset; + uint64_t length; + + C_CommitIOEventExtent(ImageCtx *image_ctx, uint64_t journal_tid, + uint64_t offset, uint64_t length) + : image_ctx(image_ctx), journal_tid(journal_tid), offset(offset), + length(length) { + } + + void finish(int r) override { + // all IO operations are flushed prior to closing the journal + ceph_assert(image_ctx->journal != nullptr); + + image_ctx->journal->commit_io_event_extent(journal_tid, offset, length, + r); + } + }; + + LibrbdWriteback::LibrbdWriteback(ImageCtx *ictx, Mutex& lock) + : m_tid(0), m_lock(lock), m_ictx(ictx) { + } + + void LibrbdWriteback::read(const object_t& oid, uint64_t object_no, + const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snapid, + bufferlist *pbl, uint64_t trunc_size, + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context 
*onfinish) + { + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", &m_ictx->trace_endpoint, &parent_trace); + trace.copy_name("cache read " + oid.name); + trace.event("start"); + } + + // on completion, take the mutex and then call onfinish. + onfinish = new C_ReadRequest(m_ictx->cct, onfinish, &m_lock); + + // re-use standard object read state machine + auto aio_comp = io::AioCompletion::create_and_start(onfinish, m_ictx, + io::AIO_TYPE_READ); + aio_comp->read_result = io::ReadResult{pbl}; + aio_comp->set_request_count(1); + + auto req_comp = new io::ReadResult::C_ObjectReadRequest( + aio_comp, off, len, {{0, len}}); + + auto req = io::ObjectDispatchSpec::create_read( + m_ictx, io::OBJECT_DISPATCH_LAYER_CACHE, oid.name, object_no, off, len, + snapid, op_flags, trace, &req_comp->bl, &req_comp->extent_map, req_comp); + req->send(); + } + + bool LibrbdWriteback::may_copy_on_write(const object_t& oid, uint64_t read_off, uint64_t read_len, snapid_t snapid) + { + m_ictx->snap_lock.get_read(); + librados::snap_t snap_id = m_ictx->snap_id; + m_ictx->parent_lock.get_read(); + uint64_t overlap = 0; + m_ictx->get_parent_overlap(snap_id, &overlap); + m_ictx->parent_lock.put_read(); + m_ictx->snap_lock.put_read(); + + uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix); + + // reverse map this object extent onto the parent + vector<pair<uint64_t,uint64_t> > objectx; + Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, + object_no, 0, m_ictx->layout.object_size, + objectx); + uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap); + bool may = object_overlap > 0; + ldout(m_ictx->cct, 10) << "may_copy_on_write " << oid << " " << read_off + << "~" << read_len << " = " << may << dendl; + return may; + } + + ceph_tid_t LibrbdWriteback::write(const object_t& oid, + const object_locator_t& oloc, + uint64_t off, uint64_t len, + const SnapContext& snapc, + const bufferlist &bl, + ceph::real_time mtime, uint64_t trunc_size, + 
__u32 trunc_seq, ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context *oncommit) + { + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", &m_ictx->trace_endpoint, &parent_trace); + trace.copy_name("writeback " + oid.name); + trace.event("start"); + } + + uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix); + + write_result_d *result = new write_result_d(oid.name, oncommit); + m_writes[oid.name].push(result); + ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl; + + bufferlist bl_copy(bl); + + Context *ctx = new C_OrderedWrite(m_ictx->cct, result, trace, this); + ctx = util::create_async_context_callback(*m_ictx, ctx); + + auto req = io::ObjectDispatchSpec::create_write( + m_ictx, io::OBJECT_DISPATCH_LAYER_CACHE, oid.name, object_no, off, + std::move(bl_copy), snapc, 0, journal_tid, trace, ctx); + req->object_dispatch_flags = ( + io::OBJECT_DISPATCH_FLAG_FLUSH | + io::OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR); + req->send(); + + return ++m_tid; + } + + + void LibrbdWriteback::overwrite_extent(const object_t& oid, uint64_t off, + uint64_t len, + ceph_tid_t original_journal_tid, + ceph_tid_t new_journal_tid) { + typedef std::vector<std::pair<uint64_t,uint64_t> > Extents; + + ldout(m_ictx->cct, 20) << __func__ << ": " << oid << " " + << off << "~" << len << " " + << "journal_tid=" << original_journal_tid << ", " + << "new_journal_tid=" << new_journal_tid << dendl; + + uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix); + + // all IO operations are flushed prior to closing the journal + ceph_assert(original_journal_tid != 0 && m_ictx->journal != NULL); + + Extents file_extents; + Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, object_no, off, + len, file_extents); + for (Extents::iterator it = file_extents.begin(); + it != file_extents.end(); ++it) { + if (new_journal_tid != 0) { + // ensure new journal event is safely committed to disk before + // committing old 
event + m_ictx->journal->flush_event( + new_journal_tid, new C_CommitIOEventExtent(m_ictx, + original_journal_tid, + it->first, it->second)); + } else { + m_ictx->journal->commit_io_event_extent(original_journal_tid, it->first, + it->second, 0); + } + } + } + + void LibrbdWriteback::complete_writes(const std::string& oid) + { + ceph_assert(m_lock.is_locked()); + std::queue<write_result_d*>& results = m_writes[oid]; + ldout(m_ictx->cct, 20) << "complete_writes() oid " << oid << dendl; + std::list<write_result_d*> finished; + + while (!results.empty()) { + write_result_d *result = results.front(); + if (!result->done) + break; + finished.push_back(result); + results.pop(); + } + + if (results.empty()) + m_writes.erase(oid); + + for (std::list<write_result_d*>::iterator it = finished.begin(); + it != finished.end(); ++it) { + write_result_d *result = *it; + ldout(m_ictx->cct, 20) << "complete_writes() completing " << result + << dendl; + result->oncommit->complete(result->ret); + delete result; + } + } +} diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h new file mode 100644 index 00000000..6ffba511 --- /dev/null +++ b/src/librbd/LibrbdWriteback.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H +#define CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H + +#include <queue> + +#include "common/snap_types.h" +#include "osd/osd_types.h" +#include "osdc/WritebackHandler.h" + +class Mutex; +class Context; + +namespace librbd { + + struct ImageCtx; + + class LibrbdWriteback : public WritebackHandler { + public: + LibrbdWriteback(ImageCtx *ictx, Mutex& lock); + + // Note that oloc, trunc_size, and trunc_seq are ignored + void read(const object_t& oid, uint64_t object_no, + const object_locator_t& oloc, uint64_t off, uint64_t len, + snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, + __u32 trunc_seq, int op_flags, + const ZTracer::Trace 
&parent_trace, Context *onfinish) override; + + // Determine whether a read to this extent could be affected by a + // write-triggered copy-on-write + bool may_copy_on_write(const object_t& oid, uint64_t read_off, + uint64_t read_len, snapid_t snapid) override; + + // Note that oloc, trunc_size, and trunc_seq are ignored + ceph_tid_t write(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, + const SnapContext& snapc, const bufferlist &bl, + ceph::real_time mtime, uint64_t trunc_size, + __u32 trunc_seq, ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context *oncommit) override; + using WritebackHandler::write; + + void overwrite_extent(const object_t& oid, uint64_t off, + uint64_t len, ceph_tid_t original_journal_tid, + ceph_tid_t new_journal_tid) override; + + struct write_result_d { + bool done; + int ret; + std::string oid; + Context *oncommit; + write_result_d(const std::string& oid, Context *oncommit) : + done(false), ret(0), oid(oid), oncommit(oncommit) {} + private: + write_result_d(const write_result_d& rhs); + const write_result_d& operator=(const write_result_d& rhs); + }; + + private: + void complete_writes(const std::string& oid); + + ceph_tid_t m_tid; + Mutex& m_lock; + librbd::ImageCtx *m_ictx; + ceph::unordered_map<std::string, std::queue<write_result_d*> > m_writes; + friend class C_OrderedWrite; + }; +} + +#endif diff --git a/src/librbd/ManagedLock.cc b/src/librbd/ManagedLock.cc new file mode 100644 index 00000000..15d2016f --- /dev/null +++ b/src/librbd/ManagedLock.cc @@ -0,0 +1,852 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/ManagedLock.h" +#include "librbd/managed_lock/AcquireRequest.h" +#include "librbd/managed_lock/BreakRequest.h" +#include "librbd/managed_lock/GetLockerRequest.h" +#include "librbd/managed_lock/ReleaseRequest.h" +#include "librbd/managed_lock/ReacquireRequest.h" +#include "librbd/managed_lock/Types.h" 
+#include "librbd/managed_lock/Utils.h" +#include "librbd/Watcher.h" +#include "librbd/ImageCtx.h" +#include "cls/lock/cls_lock_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "common/WorkQueue.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ManagedLock: " << this << " " \ + << __func__ << ": " + +namespace librbd { + +using std::string; +using namespace managed_lock; + +namespace { + +template <typename R> +struct C_SendLockRequest : public Context { + R* request; + explicit C_SendLockRequest(R* request) : request(request) { + } + void finish(int r) override { + request->send(); + } +}; + +struct C_Tracked : public Context { + AsyncOpTracker &tracker; + Context *ctx; + C_Tracked(AsyncOpTracker &tracker, Context *ctx) + : tracker(tracker), ctx(ctx) { + tracker.start_op(); + } + ~C_Tracked() override { + tracker.finish_op(); + } + void finish(int r) override { + ctx->complete(r); + } +}; + +} // anonymous namespace + +using librbd::util::create_context_callback; +using librbd::util::unique_lock_name; +using managed_lock::util::decode_lock_cookie; +using managed_lock::util::encode_lock_cookie; + +template <typename I> +ManagedLock<I>::ManagedLock(librados::IoCtx &ioctx, ContextWQ *work_queue, + const string& oid, Watcher *watcher, Mode mode, + bool blacklist_on_break_lock, + uint32_t blacklist_expire_seconds) + : m_lock(unique_lock_name("librbd::ManagedLock<I>::m_lock", this)), + m_ioctx(ioctx), m_cct(reinterpret_cast<CephContext *>(ioctx.cct())), + m_work_queue(work_queue), + m_oid(oid), + m_watcher(watcher), + m_mode(mode), + m_blacklist_on_break_lock(blacklist_on_break_lock), + m_blacklist_expire_seconds(blacklist_expire_seconds), + m_state(STATE_UNLOCKED) { +} + +template <typename I> +ManagedLock<I>::~ManagedLock() { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_SHUTDOWN || m_state == STATE_UNLOCKED || + m_state == 
STATE_UNINITIALIZED); + if (m_state == STATE_UNINITIALIZED) { + // never initialized -- ensure any in-flight ops are complete + // since we wouldn't expect shut_down to be invoked + C_SaferCond ctx; + m_async_op_tracker.wait_for_ops(&ctx); + ctx.wait(); + } + ceph_assert(m_async_op_tracker.empty()); +} + +template <typename I> +bool ManagedLock<I>::is_lock_owner() const { + Mutex::Locker locker(m_lock); + + return is_lock_owner(m_lock); +} + +template <typename I> +bool ManagedLock<I>::is_lock_owner(Mutex &lock) const { + + ceph_assert(m_lock.is_locked()); + + bool lock_owner; + + switch (m_state) { + case STATE_LOCKED: + case STATE_REACQUIRING: + case STATE_PRE_SHUTTING_DOWN: + case STATE_POST_ACQUIRING: + case STATE_PRE_RELEASING: + lock_owner = true; + break; + default: + lock_owner = false; + break; + } + + ldout(m_cct, 20) << "=" << lock_owner << dendl; + return lock_owner; +} + +template <typename I> +void ManagedLock<I>::shut_down(Context *on_shut_down) { + ldout(m_cct, 10) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(!is_state_shutdown()); + + if (m_state == STATE_WAITING_FOR_REGISTER) { + // abort stalled acquire lock state + ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl; + Action active_action = get_active_action(); + ceph_assert(active_action == ACTION_TRY_LOCK || + active_action == ACTION_ACQUIRE_LOCK); + complete_active_action(STATE_UNLOCKED, -ESHUTDOWN); + } + + execute_action(ACTION_SHUT_DOWN, on_shut_down); +} + +template <typename I> +void ManagedLock<I>::acquire_lock(Context *on_acquired) { + int r = 0; + { + Mutex::Locker locker(m_lock); + if (is_state_shutdown()) { + r = -ESHUTDOWN; + } else if (m_state != STATE_LOCKED || !m_actions_contexts.empty()) { + ldout(m_cct, 10) << dendl; + execute_action(ACTION_ACQUIRE_LOCK, on_acquired); + return; + } + } + + if (on_acquired != nullptr) { + on_acquired->complete(r); + } +} + +template <typename I> +void ManagedLock<I>::try_acquire_lock(Context *on_acquired) { + int r = 0; + { 
+ Mutex::Locker locker(m_lock); + if (is_state_shutdown()) { + r = -ESHUTDOWN; + } else if (m_state != STATE_LOCKED || !m_actions_contexts.empty()) { + ldout(m_cct, 10) << dendl; + execute_action(ACTION_TRY_LOCK, on_acquired); + return; + } + } + + if (on_acquired != nullptr) { + on_acquired->complete(r); + } +} + +template <typename I> +void ManagedLock<I>::release_lock(Context *on_released) { + int r = 0; + { + Mutex::Locker locker(m_lock); + if (is_state_shutdown()) { + r = -ESHUTDOWN; + } else if (m_state != STATE_UNLOCKED || !m_actions_contexts.empty()) { + ldout(m_cct, 10) << dendl; + execute_action(ACTION_RELEASE_LOCK, on_released); + return; + } + } + + if (on_released != nullptr) { + on_released->complete(r); + } +} + +template <typename I> +void ManagedLock<I>::reacquire_lock(Context *on_reacquired) { + { + Mutex::Locker locker(m_lock); + + if (m_state == STATE_WAITING_FOR_REGISTER) { + // restart the acquire lock process now that watch is valid + ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl; + Action active_action = get_active_action(); + ceph_assert(active_action == ACTION_TRY_LOCK || + active_action == ACTION_ACQUIRE_LOCK); + execute_next_action(); + } else if (!is_state_shutdown() && + (m_state == STATE_LOCKED || + m_state == STATE_ACQUIRING || + m_state == STATE_POST_ACQUIRING || + m_state == STATE_WAITING_FOR_LOCK)) { + // interlock the lock operation with other state ops + ldout(m_cct, 10) << dendl; + execute_action(ACTION_REACQUIRE_LOCK, on_reacquired); + return; + } + } + + // ignore request if shutdown or not in a locked-related state + if (on_reacquired != nullptr) { + on_reacquired->complete(0); + } +} + +template <typename I> +void ManagedLock<I>::get_locker(managed_lock::Locker *locker, + Context *on_finish) { + ldout(m_cct, 10) << dendl; + + int r; + { + Mutex::Locker l(m_lock); + if (is_state_shutdown()) { + r = -ESHUTDOWN; + } else { + on_finish = new C_Tracked(m_async_op_tracker, on_finish); + auto req = 
managed_lock::GetLockerRequest<I>::create( + m_ioctx, m_oid, m_mode == EXCLUSIVE, locker, on_finish); + req->send(); + return; + } + } + + on_finish->complete(r); +} + +template <typename I> +void ManagedLock<I>::break_lock(const managed_lock::Locker &locker, + bool force_break_lock, Context *on_finish) { + ldout(m_cct, 10) << dendl; + + int r; + { + Mutex::Locker l(m_lock); + if (is_state_shutdown()) { + r = -ESHUTDOWN; + } else if (is_lock_owner(m_lock)) { + r = -EBUSY; + } else { + on_finish = new C_Tracked(m_async_op_tracker, on_finish); + auto req = managed_lock::BreakRequest<I>::create( + m_ioctx, m_work_queue, m_oid, locker, m_mode == EXCLUSIVE, + m_blacklist_on_break_lock, m_blacklist_expire_seconds, force_break_lock, + on_finish); + req->send(); + return; + } + } + + on_finish->complete(r); +} + +template <typename I> +int ManagedLock<I>::assert_header_locked() { + ldout(m_cct, 10) << dendl; + + librados::ObjectReadOperation op; + { + Mutex::Locker locker(m_lock); + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, + (m_mode == EXCLUSIVE ? 
LOCK_EXCLUSIVE : + LOCK_SHARED), + m_cookie, + managed_lock::util::get_watcher_lock_tag()); + } + + int r = m_ioctx.operate(m_oid, &op, nullptr); + if (r < 0) { + if (r == -EBLACKLISTED) { + ldout(m_cct, 5) << "client is not lock owner -- client blacklisted" + << dendl; + } else if (r == -ENOENT) { + ldout(m_cct, 5) << "client is not lock owner -- no lock detected" + << dendl; + } else if (r == -EBUSY) { + ldout(m_cct, 5) << "client is not lock owner -- owned by different client" + << dendl; + } else { + lderr(m_cct) << "failed to verify lock ownership: " << cpp_strerror(r) + << dendl; + } + + return r; + } + + return 0; +} + +template <typename I> +void ManagedLock<I>::shutdown_handler(int r, Context *on_finish) { + on_finish->complete(r); +} + +template <typename I> +void ManagedLock<I>::pre_acquire_lock_handler(Context *on_finish) { + on_finish->complete(0); +} + +template <typename I> +void ManagedLock<I>::post_acquire_lock_handler(int r, Context *on_finish) { + on_finish->complete(r); +} + +template <typename I> +void ManagedLock<I>::pre_release_lock_handler(bool shutting_down, + Context *on_finish) { + on_finish->complete(0); +} + +template <typename I> +void ManagedLock<I>::post_release_lock_handler(bool shutting_down, int r, + Context *on_finish) { + on_finish->complete(r); +} + +template <typename I> +void ManagedLock<I>::post_reacquire_lock_handler(int r, Context *on_finish) { + on_finish->complete(r); +} + +template <typename I> +bool ManagedLock<I>::is_transition_state() const { + switch (m_state) { + case STATE_ACQUIRING: + case STATE_WAITING_FOR_REGISTER: + case STATE_REACQUIRING: + case STATE_RELEASING: + case STATE_PRE_SHUTTING_DOWN: + case STATE_SHUTTING_DOWN: + case STATE_INITIALIZING: + case STATE_WAITING_FOR_LOCK: + case STATE_POST_ACQUIRING: + case STATE_PRE_RELEASING: + return true; + case STATE_UNLOCKED: + case STATE_LOCKED: + case STATE_SHUTDOWN: + case STATE_UNINITIALIZED: + break; + } + return false; +} + +template <typename I> +void 
ManagedLock<I>::append_context(Action action, Context *ctx) { + ceph_assert(m_lock.is_locked()); + + for (auto &action_ctxs : m_actions_contexts) { + if (action == action_ctxs.first) { + if (ctx != nullptr) { + action_ctxs.second.push_back(ctx); + } + return; + } + } + + Contexts contexts; + if (ctx != nullptr) { + contexts.push_back(ctx); + } + m_actions_contexts.push_back({action, std::move(contexts)}); +} + +template <typename I> +void ManagedLock<I>::execute_action(Action action, Context *ctx) { + ceph_assert(m_lock.is_locked()); + + append_context(action, ctx); + if (!is_transition_state()) { + execute_next_action(); + } +} + +template <typename I> +void ManagedLock<I>::execute_next_action() { + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_actions_contexts.empty()); + switch (get_active_action()) { + case ACTION_ACQUIRE_LOCK: + case ACTION_TRY_LOCK: + send_acquire_lock(); + break; + case ACTION_REACQUIRE_LOCK: + send_reacquire_lock(); + break; + case ACTION_RELEASE_LOCK: + send_release_lock(); + break; + case ACTION_SHUT_DOWN: + send_shutdown(); + break; + default: + ceph_abort(); + break; + } +} + +template <typename I> +typename ManagedLock<I>::Action ManagedLock<I>::get_active_action() const { + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_actions_contexts.empty()); + return m_actions_contexts.front().first; +} + +template <typename I> +void ManagedLock<I>::complete_active_action(State next_state, int r) { + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_actions_contexts.empty()); + + ActionContexts action_contexts(std::move(m_actions_contexts.front())); + m_actions_contexts.pop_front(); + m_state = next_state; + + m_lock.Unlock(); + for (auto ctx : action_contexts.second) { + ctx->complete(r); + } + m_lock.Lock(); + + if (!is_transition_state() && !m_actions_contexts.empty()) { + execute_next_action(); + } +} + +template <typename I> +bool ManagedLock<I>::is_state_shutdown() const { + ceph_assert(m_lock.is_locked()); + + switch (m_state) { + 
case STATE_PRE_SHUTTING_DOWN: + case STATE_SHUTTING_DOWN: + case STATE_SHUTDOWN: + return true; + default: + break; + } + + return (!m_actions_contexts.empty() && + m_actions_contexts.back().first == ACTION_SHUT_DOWN); +} + +template <typename I> +void ManagedLock<I>::send_acquire_lock() { + ceph_assert(m_lock.is_locked()); + if (m_state == STATE_LOCKED) { + complete_active_action(STATE_LOCKED, 0); + return; + } + + ldout(m_cct, 10) << dendl; + + uint64_t watch_handle = m_watcher->get_watch_handle(); + if (watch_handle == 0) { + lderr(m_cct) << "watcher not registered - delaying request" << dendl; + m_state = STATE_WAITING_FOR_REGISTER; + + // shut down might race w/ release/re-acquire of the lock + if (is_state_shutdown()) { + complete_active_action(STATE_UNLOCKED, -ESHUTDOWN); + } + return; + } + + m_state = STATE_ACQUIRING; + m_cookie = encode_lock_cookie(watch_handle); + + m_work_queue->queue(new FunctionContext([this](int r) { + pre_acquire_lock_handler(create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_pre_acquire_lock>(this)); + })); +} + +template <typename I> +void ManagedLock<I>::handle_pre_acquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + handle_acquire_lock(r); + return; + } + + using managed_lock::AcquireRequest; + AcquireRequest<I>* req = AcquireRequest<I>::create( + m_ioctx, m_watcher, m_work_queue, m_oid, m_cookie, m_mode == EXCLUSIVE, + m_blacklist_on_break_lock, m_blacklist_expire_seconds, + create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_acquire_lock>(this)); + m_work_queue->queue(new C_SendLockRequest<AcquireRequest<I>>(req), 0); +} + +template <typename I> +void ManagedLock<I>::handle_acquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -EBUSY || r == -EAGAIN) { + ldout(m_cct, 5) << "unable to acquire exclusive lock" << dendl; + } else if (r < 0) { + lderr(m_cct) << "failed to acquire exclusive lock:" << cpp_strerror(r) + << dendl; + } else { + 
ldout(m_cct, 5) << "successfully acquired exclusive lock" << dendl; + } + + m_post_next_state = (r < 0 ? STATE_UNLOCKED : STATE_LOCKED); + + m_work_queue->queue(new FunctionContext([this, r](int ret) { + post_acquire_lock_handler(r, create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_post_acquire_lock>(this)); + })); +} + +template <typename I> +void ManagedLock<I>::handle_post_acquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + + if (r < 0 && m_post_next_state == STATE_LOCKED) { + // release_lock without calling pre and post handlers + revert_to_unlock_state(r); + } else if (r != -ECANCELED) { + // fail the lock request + complete_active_action(m_post_next_state, r); + } +} + +template <typename I> +void ManagedLock<I>::revert_to_unlock_state(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + using managed_lock::ReleaseRequest; + ReleaseRequest<I>* req = ReleaseRequest<I>::create(m_ioctx, m_watcher, + m_work_queue, m_oid, m_cookie, + new FunctionContext([this, r](int ret) { + Mutex::Locker locker(m_lock); + ceph_assert(ret == 0); + complete_active_action(STATE_UNLOCKED, r); + })); + m_work_queue->queue(new C_SendLockRequest<ReleaseRequest<I>>(req)); +} + +template <typename I> +void ManagedLock<I>::send_reacquire_lock() { + ceph_assert(m_lock.is_locked()); + + if (m_state != STATE_LOCKED) { + complete_active_action(m_state, 0); + return; + } + + ldout(m_cct, 10) << dendl; + m_state = STATE_REACQUIRING; + + uint64_t watch_handle = m_watcher->get_watch_handle(); + if (watch_handle == 0) { + // watch (re)failed while recovering + lderr(m_cct) << "aborting reacquire due to invalid watch handle" + << dendl; + + // treat double-watch failure as a lost lock and invoke the + // release/acquire handlers + release_acquire_lock(); + complete_active_action(STATE_LOCKED, 0); + return; + } + + m_new_cookie = encode_lock_cookie(watch_handle); + if (m_cookie == m_new_cookie) { + ldout(m_cct, 10) << "skipping 
reacquire since cookie still valid" + << dendl; + auto ctx = create_context_callback< + ManagedLock, &ManagedLock<I>::handle_no_op_reacquire_lock>(this); + post_reacquire_lock_handler(0, ctx); + return; + } + + auto ctx = create_context_callback< + ManagedLock, &ManagedLock<I>::handle_reacquire_lock>(this); + ctx = new FunctionContext([this, ctx](int r) { + post_reacquire_lock_handler(r, ctx); + }); + + using managed_lock::ReacquireRequest; + ReacquireRequest<I>* req = ReacquireRequest<I>::create(m_ioctx, m_oid, + m_cookie, m_new_cookie, m_mode == EXCLUSIVE, ctx); + m_work_queue->queue(new C_SendLockRequest<ReacquireRequest<I>>(req)); +} + +template <typename I> +void ManagedLock<I>::handle_reacquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_REACQUIRING); + + if (r < 0) { + if (r == -EOPNOTSUPP) { + ldout(m_cct, 10) << "updating lock is not supported" << dendl; + } else { + lderr(m_cct) << "failed to update lock cookie: " << cpp_strerror(r) + << dendl; + } + + release_acquire_lock(); + } else { + m_cookie = m_new_cookie; + } + + complete_active_action(STATE_LOCKED, 0); +} + +template <typename I> +void ManagedLock<I>::handle_no_op_reacquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + ceph_assert(m_state == STATE_REACQUIRING); + ceph_assert(r >= 0); + complete_active_action(STATE_LOCKED, 0); +} + +template <typename I> +void ManagedLock<I>::release_acquire_lock() { + assert(m_lock.is_locked()); + + if (!is_state_shutdown()) { + // queue a release and re-acquire of the lock since cookie cannot + // be updated on older OSDs + execute_action(ACTION_RELEASE_LOCK, nullptr); + + ceph_assert(!m_actions_contexts.empty()); + ActionContexts &action_contexts(m_actions_contexts.front()); + + // reacquire completes when the request lock completes + Contexts contexts; + std::swap(contexts, action_contexts.second); + if (contexts.empty()) { + execute_action(ACTION_ACQUIRE_LOCK, nullptr); + 
} else { + for (auto ctx : contexts) { + execute_action(ACTION_ACQUIRE_LOCK, ctx); + } + } + } +} + +template <typename I> +void ManagedLock<I>::send_release_lock() { + ceph_assert(m_lock.is_locked()); + if (m_state == STATE_UNLOCKED) { + complete_active_action(STATE_UNLOCKED, 0); + return; + } + + ldout(m_cct, 10) << dendl; + m_state = STATE_PRE_RELEASING; + + m_work_queue->queue(new FunctionContext([this](int r) { + pre_release_lock_handler(false, create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_pre_release_lock>(this)); + })); +} + +template <typename I> +void ManagedLock<I>::handle_pre_release_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_PRE_RELEASING); + m_state = STATE_RELEASING; + } + + if (r < 0) { + handle_release_lock(r); + return; + } + + using managed_lock::ReleaseRequest; + ReleaseRequest<I>* req = ReleaseRequest<I>::create(m_ioctx, m_watcher, + m_work_queue, m_oid, m_cookie, + create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_release_lock>(this)); + m_work_queue->queue(new C_SendLockRequest<ReleaseRequest<I>>(req), 0); +} + +template <typename I> +void ManagedLock<I>::handle_release_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_RELEASING); + + if (r >= 0 || r == -EBLACKLISTED || r == -ENOENT) { + m_cookie = ""; + m_post_next_state = STATE_UNLOCKED; + } else { + m_post_next_state = STATE_LOCKED; + } + + m_work_queue->queue(new FunctionContext([this, r](int ret) { + post_release_lock_handler(false, r, create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_post_release_lock>(this)); + })); +} + +template <typename I> +void ManagedLock<I>::handle_post_release_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + complete_active_action(m_post_next_state, r); +} + +template <typename I> +void 
ManagedLock<I>::send_shutdown() { + ldout(m_cct, 10) << dendl; + ceph_assert(m_lock.is_locked()); + if (m_state == STATE_UNLOCKED) { + m_state = STATE_SHUTTING_DOWN; + m_work_queue->queue(new FunctionContext([this](int r) { + shutdown_handler(r, create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_shutdown>(this)); + })); + return; + } + + ceph_assert(m_state == STATE_LOCKED); + m_state = STATE_PRE_SHUTTING_DOWN; + + m_lock.Unlock(); + m_work_queue->queue(new C_ShutDownRelease(this), 0); + m_lock.Lock(); +} + +template <typename I> +void ManagedLock<I>::handle_shutdown(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + wait_for_tracked_ops(r); +} + +template <typename I> +void ManagedLock<I>::send_shutdown_release() { + ldout(m_cct, 10) << dendl; + + Mutex::Locker locker(m_lock); + + m_work_queue->queue(new FunctionContext([this](int r) { + pre_release_lock_handler(true, create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_shutdown_pre_release>(this)); + })); +} + +template <typename I> +void ManagedLock<I>::handle_shutdown_pre_release(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + std::string cookie; + { + Mutex::Locker locker(m_lock); + cookie = m_cookie; + + ceph_assert(m_state == STATE_PRE_SHUTTING_DOWN); + m_state = STATE_SHUTTING_DOWN; + } + + using managed_lock::ReleaseRequest; + ReleaseRequest<I>* req = ReleaseRequest<I>::create(m_ioctx, m_watcher, + m_work_queue, m_oid, cookie, + new FunctionContext([this, r](int l) { + int rst = r < 0 ? 
r : l; + post_release_lock_handler(true, rst, create_context_callback< + ManagedLock<I>, &ManagedLock<I>::handle_shutdown_post_release>(this)); + })); + req->send(); + +} + +template <typename I> +void ManagedLock<I>::handle_shutdown_post_release(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + wait_for_tracked_ops(r); +} + +template <typename I> +void ManagedLock<I>::wait_for_tracked_ops(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + Context *ctx = new FunctionContext([this, r](int ret) { + complete_shutdown(r); + }); + + m_async_op_tracker.wait_for_ops(ctx); +} + +template <typename I> +void ManagedLock<I>::complete_shutdown(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to shut down lock: " << cpp_strerror(r) + << dendl; + } + + ActionContexts action_contexts; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_lock.is_locked()); + ceph_assert(m_actions_contexts.size() == 1); + + action_contexts = std::move(m_actions_contexts.front()); + m_actions_contexts.pop_front(); + m_state = STATE_SHUTDOWN; + } + + // expect to be destroyed after firing callback + for (auto ctx : action_contexts.second) { + ctx->complete(r); + } +} + +} // namespace librbd + +template class librbd::ManagedLock<librbd::ImageCtx>; diff --git a/src/librbd/ManagedLock.h b/src/librbd/ManagedLock.h new file mode 100644 index 00000000..b27e549d --- /dev/null +++ b/src/librbd/ManagedLock.h @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_H +#define CEPH_LIBRBD_MANAGED_LOCK_H + +#include "include/int_types.h" +#include "include/Context.h" +#include "include/rados/librados.hpp" +#include "common/AsyncOpTracker.h" +#include "common/Mutex.h" +#include "cls/lock/cls_lock_types.h" +#include "librbd/watcher/Types.h" +#include "librbd/managed_lock/Types.h" +#include <list> +#include <string> +#include <utility> + +class ContextWQ; + 
+namespace librbd { + +struct ImageCtx; + +namespace managed_lock { struct Locker; } + +template <typename ImageCtxT = librbd::ImageCtx> +class ManagedLock { +private: + typedef watcher::Traits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Watcher Watcher; + +public: + static ManagedLock *create(librados::IoCtx& ioctx, ContextWQ *work_queue, + const std::string& oid, Watcher *watcher, + managed_lock::Mode mode, + bool blacklist_on_break_lock, + uint32_t blacklist_expire_seconds) { + return new ManagedLock(ioctx, work_queue, oid, watcher, mode, + blacklist_on_break_lock, blacklist_expire_seconds); + } + void destroy() { + delete this; + } + + ManagedLock(librados::IoCtx& ioctx, ContextWQ *work_queue, + const std::string& oid, Watcher *watcher, + managed_lock::Mode mode, bool blacklist_on_break_lock, + uint32_t blacklist_expire_seconds); + virtual ~ManagedLock(); + + bool is_lock_owner() const; + + void shut_down(Context *on_shutdown); + void acquire_lock(Context *on_acquired); + void try_acquire_lock(Context *on_acquired); + void release_lock(Context *on_released); + void reacquire_lock(Context *on_reacquired); + void get_locker(managed_lock::Locker *locker, Context *on_finish); + void break_lock(const managed_lock::Locker &locker, bool force_break_lock, + Context *on_finish); + + int assert_header_locked(); + + bool is_shutdown() const { + Mutex::Locker l(m_lock); + return is_state_shutdown(); + } + +protected: + mutable Mutex m_lock; + + inline void set_state_uninitialized() { + ceph_assert(m_lock.is_locked()); + ceph_assert(m_state == STATE_UNLOCKED); + m_state = STATE_UNINITIALIZED; + } + inline void set_state_initializing() { + ceph_assert(m_lock.is_locked()); + ceph_assert(m_state == STATE_UNINITIALIZED); + m_state = STATE_INITIALIZING; + } + inline void set_state_unlocked() { + ceph_assert(m_lock.is_locked()); + ceph_assert(m_state == STATE_INITIALIZING || m_state == STATE_RELEASING); + m_state = STATE_UNLOCKED; + } + inline void 
set_state_waiting_for_lock() { + ceph_assert(m_lock.is_locked()); + ceph_assert(m_state == STATE_ACQUIRING); + m_state = STATE_WAITING_FOR_LOCK; + } + inline void set_state_post_acquiring() { + ceph_assert(m_lock.is_locked()); + ceph_assert(m_state == STATE_ACQUIRING); + m_state = STATE_POST_ACQUIRING; + } + + bool is_state_shutdown() const; + inline bool is_state_acquiring() const { + ceph_assert(m_lock.is_locked()); + return m_state == STATE_ACQUIRING; + } + inline bool is_state_post_acquiring() const { + ceph_assert(m_lock.is_locked()); + return m_state == STATE_POST_ACQUIRING; + } + inline bool is_state_releasing() const { + ceph_assert(m_lock.is_locked()); + return m_state == STATE_RELEASING; + } + inline bool is_state_pre_releasing() const { + ceph_assert(m_lock.is_locked()); + return m_state == STATE_PRE_RELEASING; + } + inline bool is_state_locked() const { + ceph_assert(m_lock.is_locked()); + return m_state == STATE_LOCKED; + } + inline bool is_state_waiting_for_lock() const { + ceph_assert(m_lock.is_locked()); + return m_state == STATE_WAITING_FOR_LOCK; + } + + inline bool is_action_acquire_lock() const { + ceph_assert(m_lock.is_locked()); + return get_active_action() == ACTION_ACQUIRE_LOCK; + } + + virtual void shutdown_handler(int r, Context *on_finish); + virtual void pre_acquire_lock_handler(Context *on_finish); + virtual void post_acquire_lock_handler(int r, Context *on_finish); + virtual void pre_release_lock_handler(bool shutting_down, + Context *on_finish); + virtual void post_release_lock_handler(bool shutting_down, int r, + Context *on_finish); + virtual void post_reacquire_lock_handler(int r, Context *on_finish); + + void execute_next_action(); + +private: + /** + * @verbatim + * + * <start> + * | + * | + * v (acquire_lock) + * UNLOCKED -----------------------------------------> ACQUIRING + * ^ | + * | | + * RELEASING | + * | | + * | | + * | (release_lock) v + * PRE_RELEASING <----------------------------------------- LOCKED + * + * <LOCKED 
state> + * | + * v + * REACQUIRING -------------------------------------> <finish> + * . ^ + * . | + * . . . > <RELEASE action> ---> <ACQUIRE action> ---/ + * + * <UNLOCKED/LOCKED states> + * | + * | + * v + * PRE_SHUTTING_DOWN ---> SHUTTING_DOWN ---> SHUTDOWN ---> <finish> + * + * @endverbatim + */ + enum State { + STATE_UNINITIALIZED, + STATE_INITIALIZING, + STATE_UNLOCKED, + STATE_LOCKED, + STATE_ACQUIRING, + STATE_POST_ACQUIRING, + STATE_WAITING_FOR_REGISTER, + STATE_WAITING_FOR_LOCK, + STATE_REACQUIRING, + STATE_PRE_RELEASING, + STATE_RELEASING, + STATE_PRE_SHUTTING_DOWN, + STATE_SHUTTING_DOWN, + STATE_SHUTDOWN, + }; + + enum Action { + ACTION_TRY_LOCK, + ACTION_ACQUIRE_LOCK, + ACTION_REACQUIRE_LOCK, + ACTION_RELEASE_LOCK, + ACTION_SHUT_DOWN + }; + + typedef std::list<Context *> Contexts; + typedef std::pair<Action, Contexts> ActionContexts; + typedef std::list<ActionContexts> ActionsContexts; + + struct C_ShutDownRelease : public Context { + ManagedLock *lock; + C_ShutDownRelease(ManagedLock *lock) + : lock(lock) { + } + void finish(int r) override { + lock->send_shutdown_release(); + } + }; + + librados::IoCtx& m_ioctx; + CephContext *m_cct; + ContextWQ *m_work_queue; + std::string m_oid; + Watcher *m_watcher; + managed_lock::Mode m_mode; + bool m_blacklist_on_break_lock; + uint32_t m_blacklist_expire_seconds; + + std::string m_cookie; + std::string m_new_cookie; + + State m_state; + State m_post_next_state; + + ActionsContexts m_actions_contexts; + AsyncOpTracker m_async_op_tracker; + + bool is_lock_owner(Mutex &lock) const; + bool is_transition_state() const; + + void append_context(Action action, Context *ctx); + void execute_action(Action action, Context *ctx); + + Action get_active_action() const; + void complete_active_action(State next_state, int r); + + void send_acquire_lock(); + void handle_pre_acquire_lock(int r); + void handle_acquire_lock(int r); + void handle_no_op_reacquire_lock(int r); + + void handle_post_acquire_lock(int r); + void 
revert_to_unlock_state(int r); + + void send_reacquire_lock(); + void handle_reacquire_lock(int r); + void release_acquire_lock(); + + void send_release_lock(); + void handle_pre_release_lock(int r); + void handle_release_lock(int r); + void handle_post_release_lock(int r); + + void send_shutdown(); + void handle_shutdown(int r); + void send_shutdown_release(); + void handle_shutdown_pre_release(int r); + void handle_shutdown_post_release(int r); + void wait_for_tracked_ops(int r); + void complete_shutdown(int r); +}; + +} // namespace librbd + +extern template class librbd::ManagedLock<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MANAGED_LOCK_H diff --git a/src/librbd/MirroringWatcher.cc b/src/librbd/MirroringWatcher.cc new file mode 100644 index 00000000..f22dc149 --- /dev/null +++ b/src/librbd/MirroringWatcher.cc @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/MirroringWatcher.h" +#include "include/rbd_types.h" +#include "include/rados/librados.hpp" +#include "common/errno.h" +#include "common/Cond.h" +#include "librbd/Utils.h" +#include "librbd/watcher/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::MirroringWatcher: " + +namespace librbd { + +using namespace mirroring_watcher; +using namespace watcher; + +using librbd::util::create_rados_callback; + +namespace { + +static const uint64_t NOTIFY_TIMEOUT_MS = 5000; + +} // anonymous namespace + +template <typename I> +MirroringWatcher<I>::MirroringWatcher(librados::IoCtx &io_ctx, + ContextWQ *work_queue) + : Watcher(io_ctx, work_queue, RBD_MIRRORING) { +} + +template <typename I> +int MirroringWatcher<I>::notify_mode_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorMode mirror_mode) { + C_SaferCond ctx; + notify_mode_updated(io_ctx, mirror_mode, &ctx); + return ctx.wait(); +} + +template <typename I> +void MirroringWatcher<I>::notify_mode_updated(librados::IoCtx 
&io_ctx, + cls::rbd::MirrorMode mirror_mode, + Context *on_finish) { + CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + bufferlist bl; + encode(NotifyMessage{ModeUpdatedPayload{mirror_mode}}, bl); + + librados::AioCompletion *comp = create_rados_callback(on_finish); + int r = io_ctx.aio_notify(RBD_MIRRORING, comp, bl, NOTIFY_TIMEOUT_MS, + nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +int MirroringWatcher<I>::notify_image_updated( + librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, const std::string &global_image_id) { + C_SaferCond ctx; + notify_image_updated(io_ctx, mirror_image_state, image_id, global_image_id, + &ctx); + return ctx.wait(); +} + +template <typename I> +void MirroringWatcher<I>::notify_image_updated( + librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, const std::string &global_image_id, + Context *on_finish) { + + CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + bufferlist bl; + encode(NotifyMessage{ImageUpdatedPayload{ + mirror_image_state, image_id, global_image_id}}, bl); + + librados::AioCompletion *comp = create_rados_callback(on_finish); + int r = io_ctx.aio_notify(RBD_MIRRORING, comp, bl, NOTIFY_TIMEOUT_MS, + nullptr); + ceph_assert(r == 0); + comp->release(); + +} + +template <typename I> +void MirroringWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) { + CephContext *cct = this->m_cct; + ldout(cct, 15) << ": notify_id=" << notify_id << ", " + << "handle=" << handle << dendl; + + + NotifyMessage notify_message; + try { + auto iter = bl.cbegin(); + decode(notify_message, iter); + } catch (const buffer::error &err) { + lderr(cct) << ": error decoding image notification: " << err.what() + << dendl; + Context *ctx = new C_NotifyAck(this, notify_id, handle); + 
ctx->complete(0); + return; + } + + apply_visitor(watcher::util::HandlePayloadVisitor<MirroringWatcher<I>>( + this, notify_id, handle), notify_message.payload); +} + +template <typename I> +bool MirroringWatcher<I>::handle_payload(const ModeUpdatedPayload &payload, + Context *on_notify_ack) { + CephContext *cct = this->m_cct; + ldout(cct, 20) << ": mode updated: " << payload.mirror_mode << dendl; + handle_mode_updated(payload.mirror_mode); + return true; +} + +template <typename I> +bool MirroringWatcher<I>::handle_payload(const ImageUpdatedPayload &payload, + Context *on_notify_ack) { + CephContext *cct = this->m_cct; + ldout(cct, 20) << ": image state updated" << dendl; + handle_image_updated(payload.mirror_image_state, payload.image_id, + payload.global_image_id); + return true; +} + +template <typename I> +bool MirroringWatcher<I>::handle_payload(const UnknownPayload &payload, + Context *on_notify_ack) { + return true; +} + +} // namespace librbd + +template class librbd::MirroringWatcher<librbd::ImageCtx>; diff --git a/src/librbd/MirroringWatcher.h b/src/librbd/MirroringWatcher.h new file mode 100644 index 00000000..4c72080b --- /dev/null +++ b/src/librbd/MirroringWatcher.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_H +#define CEPH_LIBRBD_MIRRORING_WATCHER_H + +#include "include/int_types.h" +#include "include/rados/librados_fwd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Watcher.h" +#include "librbd/mirroring_watcher/Types.h" + +namespace librbd { + +namespace watcher { +namespace util { +template <typename> struct HandlePayloadVisitor; +} +} + +template <typename ImageCtxT = librbd::ImageCtx> +class MirroringWatcher : public Watcher { + friend struct watcher::util::HandlePayloadVisitor<MirroringWatcher<ImageCtxT>>; + +public: + MirroringWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue); + + 
static int notify_mode_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorMode mirror_mode); + static void notify_mode_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorMode mirror_mode, + Context *on_finish); + + static int notify_image_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, + const std::string &global_image_id); + static void notify_image_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, + const std::string &global_image_id, + Context *on_finish); + + virtual void handle_mode_updated(cls::rbd::MirrorMode mirror_mode) = 0; + virtual void handle_image_updated(cls::rbd::MirrorImageState state, + const std::string &image_id, + const std::string &global_image_id) = 0; + +private: + bool handle_payload(const mirroring_watcher::ModeUpdatedPayload &payload, + Context *on_notify_ack); + bool handle_payload(const mirroring_watcher::ImageUpdatedPayload &payload, + Context *on_notify_ack); + bool handle_payload(const mirroring_watcher::UnknownPayload &payload, + Context *on_notify_ack); + + void handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) override; +}; + +} // namespace librbd + +extern template class librbd::MirroringWatcher<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MIRRORING_WATCHER_H diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc new file mode 100644 index 00000000..50cbfda8 --- /dev/null +++ b/src/librbd/ObjectMap.cc @@ -0,0 +1,364 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/ObjectMap.h" +#include "librbd/BlockGuard.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/object_map/RefreshRequest.h" +#include "librbd/object_map/ResizeRequest.h" +#include "librbd/object_map/SnapshotCreateRequest.h" +#include "librbd/object_map/SnapshotRemoveRequest.h" +#include 
"librbd/object_map/SnapshotRollbackRequest.h" +#include "librbd/object_map/UnlockRequest.h" +#include "librbd/object_map/UpdateRequest.h" +#include "librbd/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" + +#include "include/rados/librados.hpp" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "include/stringify.h" +#include "osdc/Striper.h" +#include <sstream> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ObjectMap: " << this << " " << __func__ \ + << ": " + +namespace librbd { + +template <typename I> +ObjectMap<I>::ObjectMap(I &image_ctx, uint64_t snap_id) + : m_image_ctx(image_ctx), m_snap_id(snap_id), + m_update_guard(new UpdateGuard(m_image_ctx.cct)) { +} + +template <typename I> +ObjectMap<I>::~ObjectMap() { + delete m_update_guard; +} + +template <typename I> +int ObjectMap<I>::aio_remove(librados::IoCtx &io_ctx, const std::string &image_id, + librados::AioCompletion *c) { + return io_ctx.aio_remove(object_map_name(image_id, CEPH_NOSNAP), c); +} + +template <typename I> +std::string ObjectMap<I>::object_map_name(const std::string &image_id, + uint64_t snap_id) { + std::string oid(RBD_OBJECT_MAP_PREFIX + image_id); + if (snap_id != CEPH_NOSNAP) { + std::stringstream snap_suffix; + snap_suffix << "." 
<< std::setfill('0') << std::setw(16) << std::hex + << snap_id; + oid += snap_suffix.str(); + } + return oid; +} + +template <typename I> +bool ObjectMap<I>::is_compatible(const file_layout_t& layout, uint64_t size) { + uint64_t object_count = Striper::get_num_objects(layout, size); + return (object_count <= cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT); +} + +template <typename I> +ceph::BitVector<2u>::Reference ObjectMap<I>::operator[](uint64_t object_no) +{ + ceph_assert(m_image_ctx.object_map_lock.is_wlocked()); + ceph_assert(object_no < m_object_map.size()); + return m_object_map[object_no]; +} + +template <typename I> +uint8_t ObjectMap<I>::operator[](uint64_t object_no) const +{ + ceph_assert(m_image_ctx.object_map_lock.is_locked()); + ceph_assert(object_no < m_object_map.size()); + return m_object_map[object_no]; +} + +template <typename I> +bool ObjectMap<I>::object_may_exist(uint64_t object_no) const +{ + ceph_assert(m_image_ctx.snap_lock.is_locked()); + + // Fall back to default logic if object map is disabled or invalid + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.snap_lock)) { + return true; + } + + bool flags_set; + int r = m_image_ctx.test_flags(m_image_ctx.snap_id, + RBD_FLAG_OBJECT_MAP_INVALID, + m_image_ctx.snap_lock, &flags_set); + if (r < 0 || flags_set) { + return true; + } + + RWLock::RLocker l(m_image_ctx.object_map_lock); + uint8_t state = (*this)[object_no]; + bool exists = (state != OBJECT_NONEXISTENT); + ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r=" << exists + << dendl; + return exists; +} + +template <typename I> +bool ObjectMap<I>::object_may_not_exist(uint64_t object_no) const +{ + ceph_assert(m_image_ctx.snap_lock.is_locked()); + + // Fall back to default logic if object map is disabled or invalid + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.snap_lock)) { + return true; + } + + bool flags_set; + int r = m_image_ctx.test_flags(m_image_ctx.snap_id, + 
RBD_FLAG_OBJECT_MAP_INVALID, + m_image_ctx.snap_lock, &flags_set); + if (r < 0 || flags_set) { + return true; + } + + RWLock::RLocker l(m_image_ctx.object_map_lock); + uint8_t state = (*this)[object_no]; + bool nonexistent = (state != OBJECT_EXISTS && state != OBJECT_EXISTS_CLEAN); + ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r=" + << nonexistent << dendl; + return nonexistent; +} + +template <typename I> +bool ObjectMap<I>::update_required(const ceph::BitVector<2>::Iterator& it, + uint8_t new_state) { + ceph_assert(m_image_ctx.object_map_lock.is_wlocked()); + uint8_t state = *it; + if ((state == new_state) || + (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) || + (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) { + return false; + } + return true; +} + +template <typename I> +void ObjectMap<I>::open(Context *on_finish) { + auto req = object_map::RefreshRequest<I>::create( + m_image_ctx, &m_object_map, m_snap_id, on_finish); + req->send(); +} + +template <typename I> +void ObjectMap<I>::close(Context *on_finish) { + if (m_snap_id != CEPH_NOSNAP) { + m_image_ctx.op_work_queue->queue(on_finish, 0); + return; + } + + auto req = object_map::UnlockRequest<I>::create(m_image_ctx, on_finish); + req->send(); +} + +template <typename I> +bool ObjectMap<I>::set_object_map(ceph::BitVector<2> &target_object_map) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.snap_lock)); + RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock); + m_object_map = target_object_map; + return true; +} + +template <typename I> +void ObjectMap<I>::rollback(uint64_t snap_id, Context *on_finish) { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert(m_image_ctx.object_map_lock.is_wlocked()); + + object_map::SnapshotRollbackRequest *req = + new object_map::SnapshotRollbackRequest(m_image_ctx, snap_id, 
on_finish); + req->send(); +} + +template <typename I> +void ObjectMap<I>::snapshot_add(uint64_t snap_id, Context *on_finish) { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0); + ceph_assert(snap_id != CEPH_NOSNAP); + + object_map::SnapshotCreateRequest *req = + new object_map::SnapshotCreateRequest(m_image_ctx, &m_object_map, snap_id, + on_finish); + req->send(); +} + +template <typename I> +void ObjectMap<I>::snapshot_remove(uint64_t snap_id, Context *on_finish) { + ceph_assert(m_image_ctx.snap_lock.is_wlocked()); + ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0); + ceph_assert(snap_id != CEPH_NOSNAP); + + object_map::SnapshotRemoveRequest *req = + new object_map::SnapshotRemoveRequest(m_image_ctx, &m_object_map, snap_id, + on_finish); + req->send(); +} + +template <typename I> +void ObjectMap<I>::aio_save(Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.snap_lock)); + RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock); + + librados::ObjectWriteOperation op; + if (m_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + } + cls_client::object_map_save(&op, m_object_map); + + std::string oid(object_map_name(m_image_ctx.id, m_snap_id)); + librados::AioCompletion *comp = util::create_rados_callback(on_finish); + + int r = m_image_ctx.md_ctx.aio_operate(oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void ObjectMap<I>::aio_resize(uint64_t new_size, uint8_t default_object_state, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.snap_lock)); + ceph_assert(m_image_ctx.image_watcher 
!= NULL); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + object_map::ResizeRequest *req = new object_map::ResizeRequest( + m_image_ctx, &m_object_map, m_snap_id, new_size, default_object_state, + on_finish); + req->send(); +} + +template <typename I> +void ObjectMap<I>::detained_aio_update(UpdateOperation &&op) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert(m_image_ctx.object_map_lock.is_wlocked()); + + BlockGuardCell *cell; + int r = m_update_guard->detain({op.start_object_no, op.end_object_no}, + &op, &cell); + if (r < 0) { + lderr(cct) << "failed to detain object map update: " << cpp_strerror(r) + << dendl; + m_image_ctx.op_work_queue->queue(op.on_finish, r); + return; + } else if (r > 0) { + ldout(cct, 20) << "detaining object map update due to in-flight update: " + << "start=" << op.start_object_no << ", " + << "end=" << op.end_object_no << ", " + << (op.current_state ? 
+ stringify(static_cast<uint32_t>(*op.current_state)) : + "") + << "->" << static_cast<uint32_t>(op.new_state) << dendl; + return; + } + + ldout(cct, 20) << "in-flight update cell: " << cell << dendl; + Context *on_finish = op.on_finish; + Context *ctx = new FunctionContext([this, cell, on_finish](int r) { + handle_detained_aio_update(cell, r, on_finish); + }); + aio_update(CEPH_NOSNAP, op.start_object_no, op.end_object_no, op.new_state, + op.current_state, op.parent_trace, op.ignore_enoent, ctx); +} + +template <typename I> +void ObjectMap<I>::handle_detained_aio_update(BlockGuardCell *cell, int r, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "cell=" << cell << ", r=" << r << dendl; + + typename UpdateGuard::BlockOperations block_ops; + m_update_guard->release(cell, &block_ops); + + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock); + for (auto &op : block_ops) { + detained_aio_update(std::move(op)); + } + } + + on_finish->complete(r); +} + +template <typename I> +void ObjectMap<I>::aio_update(uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional<uint8_t> ¤t_state, + const ZTracer::Trace &parent_trace, + bool ignore_enoent, Context *on_finish) { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0); + ceph_assert(m_image_ctx.image_watcher != nullptr); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + ceph_assert(snap_id != CEPH_NOSNAP || + m_image_ctx.object_map_lock.is_wlocked()); + ceph_assert(start_object_no < end_object_no); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "start=" << start_object_no << ", " + << "end=" << end_object_no << ", " + << (current_state ? 
+ stringify(static_cast<uint32_t>(*current_state)) : "") + << "->" << static_cast<uint32_t>(new_state) << dendl; + if (snap_id == CEPH_NOSNAP) { + end_object_no = std::min(end_object_no, m_object_map.size()); + if (start_object_no >= end_object_no) { + ldout(cct, 20) << "skipping update of invalid object map" << dendl; + m_image_ctx.op_work_queue->queue(on_finish, 0); + return; + } + + auto it = m_object_map.begin() + start_object_no; + auto end_it = m_object_map.begin() + end_object_no; + for (; it != end_it; ++it) { + if (update_required(it, new_state)) { + break; + } + } + if (it == end_it) { + ldout(cct, 20) << "object map update not required" << dendl; + m_image_ctx.op_work_queue->queue(on_finish, 0); + return; + } + } + + auto req = object_map::UpdateRequest<I>::create( + m_image_ctx, &m_object_map, snap_id, start_object_no, end_object_no, + new_state, current_state, parent_trace, ignore_enoent, on_finish); + req->send(); +} + +} // namespace librbd + +template class librbd::ObjectMap<librbd::ImageCtx>; + diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h new file mode 100644 index 00000000..c930a5b5 --- /dev/null +++ b/src/librbd/ObjectMap.h @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_H +#define CEPH_LIBRBD_OBJECT_MAP_H + +#include "include/int_types.h" +#include "include/fs_types.h" +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/object_map_types.h" +#include "common/bit_vector.hpp" +#include "librbd/Utils.h" +#include <boost/optional.hpp> + +class Context; +class RWLock; +namespace ZTracer { struct Trace; } + +namespace librbd { + +template <typename Op> class BlockGuard; +struct BlockGuardCell; +class ImageCtx; + +template <typename ImageCtxT = ImageCtx> +class ObjectMap { +public: + static ObjectMap *create(ImageCtxT &image_ctx, uint64_t snap_id) { + return new ObjectMap(image_ctx, snap_id); + } + + ObjectMap(ImageCtxT 
&image_ctx, uint64_t snap_id); + ~ObjectMap(); + + static int aio_remove(librados::IoCtx &io_ctx, const std::string &image_id, librados::AioCompletion *c); + static std::string object_map_name(const std::string &image_id, + uint64_t snap_id); + + static bool is_compatible(const file_layout_t& layout, uint64_t size); + + ceph::BitVector<2u>::Reference operator[](uint64_t object_no); + uint8_t operator[](uint64_t object_no) const; + inline uint64_t size() const { + return m_object_map.size(); + } + + void open(Context *on_finish); + void close(Context *on_finish); + bool set_object_map(ceph::BitVector<2> &target_object_map); + bool object_may_exist(uint64_t object_no) const; + bool object_may_not_exist(uint64_t object_no) const; + + void aio_save(Context *on_finish); + void aio_resize(uint64_t new_size, uint8_t default_object_state, + Context *on_finish); + + template <typename T, void(T::*MF)(int) = &T::complete> + bool aio_update(uint64_t snap_id, uint64_t start_object_no, uint8_t new_state, + const boost::optional<uint8_t> ¤t_state, + const ZTracer::Trace &parent_trace, bool ignore_enoent, + T *callback_object) { + return aio_update<T, MF>(snap_id, start_object_no, start_object_no + 1, + new_state, current_state, parent_trace, + ignore_enoent, callback_object); + } + + template <typename T, void(T::*MF)(int) = &T::complete> + bool aio_update(uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional<uint8_t> ¤t_state, + const ZTracer::Trace &parent_trace, bool ignore_enoent, + T *callback_object) { + ceph_assert(start_object_no < end_object_no); + if (snap_id == CEPH_NOSNAP) { + end_object_no = std::min(end_object_no, m_object_map.size()); + if (start_object_no >= end_object_no) { + return false; + } + + auto it = m_object_map.begin() + start_object_no; + auto end_it = m_object_map.begin() + end_object_no; + for (; it != end_it; ++it) { + if (update_required(it, new_state)) { + break; + } + } + + if (it == 
end_it) { + return false; + } + + UpdateOperation update_operation(start_object_no, end_object_no, + new_state, current_state, parent_trace, + ignore_enoent, + util::create_context_callback<T, MF>( + callback_object)); + detained_aio_update(std::move(update_operation)); + } else { + aio_update(snap_id, start_object_no, end_object_no, new_state, + current_state, parent_trace, ignore_enoent, + util::create_context_callback<T, MF>(callback_object)); + } + return true; + } + + void rollback(uint64_t snap_id, Context *on_finish); + void snapshot_add(uint64_t snap_id, Context *on_finish); + void snapshot_remove(uint64_t snap_id, Context *on_finish); + +private: + struct UpdateOperation { + uint64_t start_object_no; + uint64_t end_object_no; + uint8_t new_state; + boost::optional<uint8_t> current_state; + ZTracer::Trace parent_trace; + bool ignore_enoent; + Context *on_finish; + + UpdateOperation(uint64_t start_object_no, uint64_t end_object_no, + uint8_t new_state, + const boost::optional<uint8_t> ¤t_state, + const ZTracer::Trace &parent_trace, + bool ignore_enoent, Context *on_finish) + : start_object_no(start_object_no), end_object_no(end_object_no), + new_state(new_state), current_state(current_state), + parent_trace(parent_trace), ignore_enoent(ignore_enoent), + on_finish(on_finish) { + } + }; + + typedef BlockGuard<UpdateOperation> UpdateGuard; + + ImageCtxT &m_image_ctx; + ceph::BitVector<2> m_object_map; + uint64_t m_snap_id; + + UpdateGuard *m_update_guard = nullptr; + + void detained_aio_update(UpdateOperation &&update_operation); + void handle_detained_aio_update(BlockGuardCell *cell, int r, + Context *on_finish); + + void aio_update(uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional<uint8_t> ¤t_state, + const ZTracer::Trace &parent_trace, bool ignore_enoent, + Context *on_finish); + bool update_required(const ceph::BitVector<2>::Iterator &it, + uint8_t new_state); + +}; + +} // namespace librbd + 
+extern template class librbd::ObjectMap<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OBJECT_MAP_H diff --git a/src/librbd/Operations.cc b/src/librbd/Operations.cc new file mode 100644 index 00000000..ffb8f5c8 --- /dev/null +++ b/src/librbd/Operations.cc @@ -0,0 +1,1793 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/Operations.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/WorkQueue.h" +#include "osdc/Striper.h" + +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ObjectMap.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/journal/DisabledPolicy.h" +#include "librbd/journal/StandardPolicy.h" +#include "librbd/operation/DisableFeaturesRequest.h" +#include "librbd/operation/EnableFeaturesRequest.h" +#include "librbd/operation/FlattenRequest.h" +#include "librbd/operation/MetadataRemoveRequest.h" +#include "librbd/operation/MetadataSetRequest.h" +#include "librbd/operation/MigrateRequest.h" +#include "librbd/operation/ObjectMapIterate.h" +#include "librbd/operation/RebuildObjectMapRequest.h" +#include "librbd/operation/RenameRequest.h" +#include "librbd/operation/ResizeRequest.h" +#include "librbd/operation/SnapshotCreateRequest.h" +#include "librbd/operation/SnapshotProtectRequest.h" +#include "librbd/operation/SnapshotRemoveRequest.h" +#include "librbd/operation/SnapshotRenameRequest.h" +#include "librbd/operation/SnapshotRollbackRequest.h" +#include "librbd/operation/SnapshotUnprotectRequest.h" +#include "librbd/operation/SnapshotLimitRequest.h" +#include "librbd/operation/SparsifyRequest.h" +#include <set> +#include <boost/bind.hpp> +#include <boost/scope_exit.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define 
dout_prefix *_dout << "librbd::Operations: " + +namespace librbd { + +namespace { + +template <typename I> +struct C_NotifyUpdate : public Context { + I &image_ctx; + Context *on_finish; + bool notified = false; + + C_NotifyUpdate(I &image_ctx, Context *on_finish) + : image_ctx(image_ctx), on_finish(on_finish) { + } + + void complete(int r) override { + CephContext *cct = image_ctx.cct; + if (notified) { + if (r == -ETIMEDOUT) { + // don't fail the op if a peer fails to get the update notification + lderr(cct) << "update notification timed-out" << dendl; + r = 0; + } else if (r == -ENOENT) { + // don't fail if header is missing (e.g. v1 image rename) + ldout(cct, 5) << "update notification on missing header" << dendl; + r = 0; + } else if (r < 0) { + lderr(cct) << "update notification failed: " << cpp_strerror(r) + << dendl; + } + Context::complete(r); + return; + } + + if (r < 0) { + // op failed -- no need to send update notification + Context::complete(r); + return; + } + + notified = true; + image_ctx.notify_update(this); + } + void finish(int r) override { + on_finish->complete(r); + } +}; + +template <typename I> +struct C_InvokeAsyncRequest : public Context { + /** + * @verbatim + * + * <start> + * | + * . . . . . . | . . . . . . . . . . . . . . . . . . + * . . | . . + * . v v v . + * . REFRESH_IMAGE (skip if not needed) . + * . | . + * . v . + * . ACQUIRE_LOCK (skip if exclusive lock . + * . | disabled or has lock) . + * . | . + * . /--------/ \--------\ . . . . . . . . . . . . . + * . | | . + * . v v . 
+ * LOCAL_REQUEST REMOTE_REQUEST + * | | + * | | + * \--------\ /--------/ + * | + * v + * <finish> + * + * @endverbatim + */ + + I &image_ctx; + std::string name; + exclusive_lock::OperationRequestType request_type; + bool permit_snapshot; + boost::function<void(Context*)> local; + boost::function<void(Context*)> remote; + std::set<int> filter_error_codes; + Context *on_finish; + bool request_lock = false; + + C_InvokeAsyncRequest(I &image_ctx, const std::string& name, + exclusive_lock::OperationRequestType request_type, + bool permit_snapshot, + const boost::function<void(Context*)>& local, + const boost::function<void(Context*)>& remote, + const std::set<int> &filter_error_codes, + Context *on_finish) + : image_ctx(image_ctx), name(name), request_type(request_type), + permit_snapshot(permit_snapshot), local(local), remote(remote), + filter_error_codes(filter_error_codes), on_finish(on_finish) { + } + + void send() { + send_refresh_image(); + } + + void send_refresh_image() { + if (!image_ctx.state->is_refresh_required()) { + send_acquire_exclusive_lock(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = util::create_context_callback< + C_InvokeAsyncRequest<I>, + &C_InvokeAsyncRequest<I>::handle_refresh_image>(this); + image_ctx.state->refresh(ctx); + } + + void handle_refresh_image(int r) { + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl; + complete(r); + return; + } + + send_acquire_exclusive_lock(); + } + + void send_acquire_exclusive_lock() { + // context can complete before owner_lock is unlocked + RWLock &owner_lock(image_ctx.owner_lock); + owner_lock.get_read(); + image_ctx.snap_lock.get_read(); + if (image_ctx.read_only || + (!permit_snapshot && image_ctx.snap_id != CEPH_NOSNAP)) { + image_ctx.snap_lock.put_read(); + owner_lock.put_read(); + complete(-EROFS); + 
return; + } + image_ctx.snap_lock.put_read(); + + if (image_ctx.exclusive_lock == nullptr) { + send_local_request(); + owner_lock.put_read(); + return; + } else if (image_ctx.image_watcher == nullptr) { + owner_lock.put_read(); + complete(-EROFS); + return; + } + + if (image_ctx.exclusive_lock->is_lock_owner() && + image_ctx.exclusive_lock->accept_request(request_type, nullptr)) { + send_local_request(); + owner_lock.put_read(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = util::create_async_context_callback( + image_ctx, util::create_context_callback< + C_InvokeAsyncRequest<I>, + &C_InvokeAsyncRequest<I>::handle_acquire_exclusive_lock>(this)); + + if (request_lock) { + // current lock owner doesn't support op -- try to perform + // the action locally + request_lock = false; + image_ctx.exclusive_lock->acquire_lock(ctx); + } else { + image_ctx.exclusive_lock->try_acquire_lock(ctx); + } + owner_lock.put_read(); + } + + void handle_acquire_exclusive_lock(int r) { + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r < 0) { + complete(r == -EBLACKLISTED ? 
-EBLACKLISTED : -EROFS); + return; + } + + // context can complete before owner_lock is unlocked + RWLock &owner_lock(image_ctx.owner_lock); + owner_lock.get_read(); + if (image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()) { + send_local_request(); + owner_lock.put_read(); + return; + } + + send_remote_request(); + owner_lock.put_read(); + } + + void send_remote_request() { + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = util::create_async_context_callback( + image_ctx, util::create_context_callback< + C_InvokeAsyncRequest<I>, + &C_InvokeAsyncRequest<I>::handle_remote_request>(this)); + remote(ctx); + } + + void handle_remote_request(int r) { + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + ldout(cct, 5) << name << " not supported by current lock owner" << dendl; + request_lock = true; + send_refresh_image(); + return; + } else if (r != -ETIMEDOUT && r != -ERESTART) { + image_ctx.state->handle_update_notification(); + + complete(r); + return; + } + + ldout(cct, 5) << name << " timed out notifying lock owner" << dendl; + send_refresh_image(); + } + + void send_local_request() { + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = util::create_async_context_callback( + image_ctx, util::create_context_callback< + C_InvokeAsyncRequest<I>, + &C_InvokeAsyncRequest<I>::handle_local_request>(this)); + local(ctx); + } + + void handle_local_request(int r) { + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r == -ERESTART) { + send_refresh_image(); + return; + } + complete(r); + } + + void finish(int r) override { + if (filter_error_codes.count(r) != 0) { + r = 0; + } + on_finish->complete(r); + } +}; + +template <typename I> +bool 
needs_invalidate(I& image_ctx, uint64_t object_no, + uint8_t current_state, uint8_t new_state) { + if ( (current_state == OBJECT_EXISTS || + current_state == OBJECT_EXISTS_CLEAN) && + (new_state == OBJECT_NONEXISTENT || + new_state == OBJECT_PENDING)) { + return false; + } + return true; +} + +} // anonymous namespace + +template <typename I> +Operations<I>::Operations(I &image_ctx) + : m_image_ctx(image_ctx), m_async_request_seq(0) { +} + +template <typename I> +int Operations<I>::flatten(ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "flatten" << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } + + { + RWLock::RLocker parent_locker(m_image_ctx.parent_lock); + if (m_image_ctx.parent_md.spec.pool_id == -1) { + lderr(cct) << "image has no parent" << dendl; + return -EINVAL; + } + } + + uint64_t request_id = ++m_async_request_seq; + r = invoke_async_request("flatten", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations<I>::execute_flatten, this, + boost::ref(prog_ctx), _1), + boost::bind(&ImageWatcher<I>::notify_flatten, + m_image_ctx.image_watcher, request_id, + boost::ref(prog_ctx), _1)); + + if (r < 0 && r != -EINVAL) { + return r; + } + ldout(cct, 20) << "flatten finished" << dendl; + return 0; +} + +template <typename I> +void Operations<I>::execute_flatten(ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "flatten" << dendl; + + if (m_image_ctx.read_only || m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + m_image_ctx.parent_lock.get_read(); + + // can't flatten a non-clone + if (m_image_ctx.parent_md.spec.pool_id 
== -1) { + lderr(cct) << "image has no parent" << dendl; + m_image_ctx.parent_lock.put_read(); + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EINVAL); + return; + } + if (m_image_ctx.snap_id != CEPH_NOSNAP) { + lderr(cct) << "snapshots cannot be flattened" << dendl; + m_image_ctx.parent_lock.put_read(); + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EROFS); + return; + } + + ::SnapContext snapc = m_image_ctx.snapc; + + uint64_t overlap; + int r = m_image_ctx.get_parent_overlap(CEPH_NOSNAP, &overlap); + ceph_assert(r == 0); + ceph_assert(overlap <= m_image_ctx.size); + + uint64_t overlap_objects = Striper::get_num_objects(m_image_ctx.layout, + overlap); + + m_image_ctx.parent_lock.put_read(); + m_image_ctx.snap_lock.put_read(); + + operation::FlattenRequest<I> *req = new operation::FlattenRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), overlap_objects, + snapc, prog_ctx); + req->send(); +} + +template <typename I> +int Operations<I>::rebuild_object_map(ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "rebuild_object_map" << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + uint64_t request_id = ++m_async_request_seq; + r = invoke_async_request("rebuild object map", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true, + boost::bind(&Operations<I>::execute_rebuild_object_map, + this, boost::ref(prog_ctx), _1), + boost::bind(&ImageWatcher<I>::notify_rebuild_object_map, + m_image_ctx.image_watcher, request_id, + boost::ref(prog_ctx), _1)); + + ldout(cct, 10) << "rebuild object map finished" << dendl; + if (r < 0) { + return r; + } + return 0; +} + +template <typename I> +void Operations<I>::execute_rebuild_object_map(ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext 
*cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + if (m_image_ctx.read_only || m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + lderr(cct) << "image must support object-map feature" << dendl; + on_finish->complete(-EINVAL); + return; + } + + operation::RebuildObjectMapRequest<I> *req = + new operation::RebuildObjectMapRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), prog_ctx); + req->send(); +} + +template <typename I> +int Operations<I>::check_object_map(ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + r = invoke_async_request("check object map", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true, + boost::bind(&Operations<I>::check_object_map, this, + boost::ref(prog_ctx), _1), + [this](Context *c) { + m_image_ctx.op_work_queue->queue(c, -EOPNOTSUPP); + }); + + return r; +} + +template <typename I> +void Operations<I>::object_map_iterate(ProgressContext &prog_ctx, + operation::ObjectIterateWork<I> handle_mismatch, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + on_finish->complete(-EINVAL); + return; + } + + operation::ObjectMapIterateRequest<I> *req = + new operation::ObjectMapIterateRequest<I>(m_image_ctx, on_finish, + prog_ctx, handle_mismatch); + req->send(); +} + +template <typename I> +void Operations<I>::check_object_map(ProgressContext &prog_ctx, + Context *on_finish) { + object_map_iterate(prog_ctx, needs_invalidate, on_finish); +} + +template <typename I> +int Operations<I>::rename(const char *dstname) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 
5) << this << " " << __func__ << ": dest_name=" << dstname + << dendl; + + int r = librbd::detect_format(m_image_ctx.md_ctx, dstname, NULL, NULL); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error checking for existing image called " + << dstname << ":" << cpp_strerror(r) << dendl; + return r; + } + if (r == 0) { + lderr(cct) << "rbd image " << dstname << " already exists" << dendl; + return -EEXIST; + } + + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + r = invoke_async_request("rename", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations<I>::execute_rename, this, + dstname, _1), + boost::bind(&ImageWatcher<I>::notify_rename, + m_image_ctx.image_watcher, dstname, + _1)); + if (r < 0 && r != -EEXIST) { + return r; + } + } else { + C_SaferCond cond_ctx; + { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + execute_rename(dstname, &cond_ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + return r; + } + } + + m_image_ctx.set_image_name(dstname); + return 0; +} + +template <typename I> +void Operations<I>::execute_rename(const std::string &dest_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + if (m_image_ctx.name == dest_name) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EEXIST); + return; + } + m_image_ctx.snap_lock.put_read(); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": dest_name=" << dest_name + << dendl; + + if (m_image_ctx.old_format) { + // unregister watch before and register back after rename + on_finish = new C_NotifyUpdate<I>(m_image_ctx, on_finish); + on_finish = new FunctionContext([this, on_finish](int r) { + if 
(m_image_ctx.old_format) { + m_image_ctx.image_watcher->set_oid(m_image_ctx.header_oid); + } + m_image_ctx.image_watcher->register_watch(on_finish); + }); + on_finish = new FunctionContext([this, dest_name, on_finish](int r) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + operation::RenameRequest<I> *req = new operation::RenameRequest<I>( + m_image_ctx, on_finish, dest_name); + req->send(); + }); + m_image_ctx.image_watcher->unregister_watch(on_finish); + return; + } + operation::RenameRequest<I> *req = new operation::RenameRequest<I>( + m_image_ctx, on_finish, dest_name); + req->send(); +} + +template <typename I> +int Operations<I>::resize(uint64_t size, bool allow_shrink, ProgressContext& prog_ctx) { + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.snap_lock.get_read(); + ldout(cct, 5) << this << " " << __func__ << ": " + << "size=" << m_image_ctx.size << ", " + << "new_size=" << size << dendl; + m_image_ctx.snap_lock.put_read(); + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP) && + !ObjectMap<>::is_compatible(m_image_ctx.layout, size)) { + lderr(cct) << "New size not compatible with object map" << dendl; + return -EINVAL; + } + + uint64_t request_id = ++m_async_request_seq; + r = invoke_async_request("resize", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations<I>::execute_resize, this, + size, allow_shrink, boost::ref(prog_ctx), _1, 0), + boost::bind(&ImageWatcher<I>::notify_resize, + m_image_ctx.image_watcher, request_id, + size, allow_shrink, boost::ref(prog_ctx), _1)); + + m_image_ctx.perfcounter->inc(l_librbd_resize); + ldout(cct, 2) << "resize finished" << dendl; + return r; +} + +template <typename I> +void Operations<I>::execute_resize(uint64_t size, bool allow_shrink, ProgressContext &prog_ctx, + Context *on_finish, + uint64_t journal_op_tid) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + 
ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + m_image_ctx.snap_lock.get_read(); + ldout(cct, 5) << this << " " << __func__ << ": " + << "size=" << m_image_ctx.size << ", " + << "new_size=" << size << dendl; + + if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only || + m_image_ctx.operations_disabled) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EROFS); + return; + } else if (m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.snap_lock) && + !ObjectMap<>::is_compatible(m_image_ctx.layout, size)) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EINVAL); + return; + } + m_image_ctx.snap_lock.put_read(); + + operation::ResizeRequest<I> *req = new operation::ResizeRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), size, allow_shrink, + prog_ctx, journal_op_tid, false); + req->send(); +} + +template <typename I> +int Operations<I>::snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string& snap_name) { + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + snap_create(snap_namespace, snap_name, &ctx); + r = ctx.wait(); + + if (r < 0) { + return r; + } + + m_image_ctx.perfcounter->inc(l_librbd_snap_create); + return r; +} + +template <typename I> +void Operations<I>::snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string& snap_name, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.read_only) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + if (m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EEXIST); + return; + } + 
m_image_ctx.snap_lock.put_read(); + + C_InvokeAsyncRequest<I> *req = new C_InvokeAsyncRequest<I>( + m_image_ctx, "snap_create", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations<I>::execute_snap_create, this, snap_namespace, snap_name, + _1, 0, false), + boost::bind(&ImageWatcher<I>::notify_snap_create, m_image_ctx.image_watcher, + snap_namespace, snap_name, _1), + {-EEXIST}, on_finish); + req->send(); +} + +template <typename I> +void Operations<I>::execute_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish, + uint64_t journal_op_tid, + bool skip_object_map) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + if (m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EEXIST); + return; + } + m_image_ctx.snap_lock.put_read(); + + operation::SnapshotCreateRequest<I> *req = + new operation::SnapshotCreateRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), + snap_namespace, snap_name, journal_op_tid, skip_object_map); + req->send(); +} + +template <typename I> +int Operations<I>::snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name, + ProgressContext& prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) + return r; + + C_SaferCond cond_ctx; + { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + { + // need to drop snap_lock before 
invalidating cache + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + if (!m_image_ctx.snap_exists) { + return -ENOENT; + } + + if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) { + return -EROFS; + } + + uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) { + lderr(cct) << "No such snapshot found." << dendl; + return -ENOENT; + } + } + + r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false); + if (r < 0) { + return r; + } + + execute_snap_rollback(snap_namespace, snap_name, prog_ctx, &cond_ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + return r; + } + + m_image_ctx.perfcounter->inc(l_librbd_snap_rollback); + return r; +} + +template <typename I> +void Operations<I>::execute_snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + ProgressContext& prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) { + lderr(cct) << "No such snapshot found." 
<< dendl; + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-ENOENT); + return; + } + + uint64_t new_size = m_image_ctx.get_image_size(snap_id); + m_image_ctx.snap_lock.put_read(); + + // async mode used for journal replay + operation::SnapshotRollbackRequest<I> *request = + new operation::SnapshotRollbackRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), snap_namespace, snap_name, + snap_id, new_size, prog_ctx); + request->send(); +} + +template <typename I> +int Operations<I>::snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name) { + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + snap_remove(snap_namespace, snap_name, &ctx); + r = ctx.wait(); + + if (r < 0) { + return r; + } + + m_image_ctx.perfcounter->inc(l_librbd_snap_remove); + return 0; +} + +template <typename I> +void Operations<I>::snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.read_only) { + on_finish->complete(-EROFS); + return; + } + + // quickly filter out duplicate ops + m_image_ctx.snap_lock.get_read(); + if (m_image_ctx.get_snap_id(snap_namespace, snap_name) == CEPH_NOSNAP) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-ENOENT); + return; + } + + bool proxy_op = ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 || + (m_image_ctx.features & RBD_FEATURE_JOURNALING) != 0); + m_image_ctx.snap_lock.put_read(); + + if (proxy_op) { + auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL; + if (cls::rbd::get_snap_namespace_type(snap_namespace) == + cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) { + request_type = exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE; + } + 
C_InvokeAsyncRequest<I> *req = new C_InvokeAsyncRequest<I>( + m_image_ctx, "snap_remove", request_type, true, + boost::bind(&Operations<I>::execute_snap_remove, this, snap_namespace, snap_name, _1), + boost::bind(&ImageWatcher<I>::notify_snap_remove, m_image_ctx.image_watcher, + snap_namespace, snap_name, _1), + {-ENOENT}, on_finish); + req->send(); + } else { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + execute_snap_remove(snap_namespace, snap_name, on_finish); + } +} + +template <typename I> +void Operations<I>::execute_snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + { + if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) { + lderr(m_image_ctx.cct) << "No such snapshot found." 
<< dendl; + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-ENOENT); + return; + } + + bool is_protected; + int r = m_image_ctx.is_snap_protected(snap_id, &is_protected); + if (r < 0) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(r); + return; + } else if (is_protected) { + lderr(m_image_ctx.cct) << "snapshot is protected" << dendl; + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EBUSY); + return; + } + m_image_ctx.snap_lock.put_read(); + + operation::SnapshotRemoveRequest<I> *req = + new operation::SnapshotRemoveRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), + snap_namespace, snap_name, snap_id); + req->send(); +} + +template <typename I> +int Operations<I>::snap_rename(const char *srcname, const char *dstname) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": " + << "snap_name=" << srcname << ", " + << "new_snap_name=" << dstname << dendl; + + snapid_t snap_id; + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) + return r; + + { + RWLock::RLocker l(m_image_ctx.snap_lock); + snap_id = m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(), srcname); + if (snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + if (m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(), dstname) != CEPH_NOSNAP) { + return -EEXIST; + } + } + + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + r = invoke_async_request("snap_rename", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations<I>::execute_snap_rename, + this, snap_id, dstname, _1), + boost::bind(&ImageWatcher<I>::notify_snap_rename, + m_image_ctx.image_watcher, snap_id, + dstname, _1)); + if (r < 0 && r != -EEXIST) { + return r; + } + } else { + C_SaferCond cond_ctx; + { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + execute_snap_rename(snap_id, dstname, &cond_ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + 
return r; + } + } + + m_image_ctx.perfcounter->inc(l_librbd_snap_rename); + return 0; +} + +template <typename I> +void Operations<I>::execute_snap_rename(const uint64_t src_snap_id, + const std::string &dest_snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + if ((m_image_ctx.features & RBD_FEATURE_JOURNALING) != 0) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + if (m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(), + dest_snap_name) != CEPH_NOSNAP) { + // Renaming is supported for snapshots from user namespace only. + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EEXIST); + return; + } + m_image_ctx.snap_lock.put_read(); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": " + << "snap_id=" << src_snap_id << ", " + << "new_snap_name=" << dest_snap_name << dendl; + + operation::SnapshotRenameRequest<I> *req = + new operation::SnapshotRenameRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), src_snap_id, + dest_snap_name); + req->send(); +} + +template <typename I> +int Operations<I>::snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.read_only) { + return -EROFS; + } + + if (!m_image_ctx.test_features(RBD_FEATURE_LAYERING)) { + lderr(cct) << "image must support layering" << dendl; + return -ENOSYS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + bool is_protected; + r = m_image_ctx.is_snap_protected(m_image_ctx.get_snap_id(snap_namespace, snap_name), + &is_protected); + if (r < 
0) { + return r; + } + + if (is_protected) { + return -EBUSY; + } + } + + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + r = invoke_async_request("snap_protect", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations<I>::execute_snap_protect, + this, snap_namespace, snap_name, _1), + boost::bind(&ImageWatcher<I>::notify_snap_protect, + m_image_ctx.image_watcher, + snap_namespace, snap_name, _1)); + if (r < 0 && r != -EBUSY) { + return r; + } + } else { + C_SaferCond cond_ctx; + { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + execute_snap_protect(snap_namespace, snap_name, &cond_ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + return r; + } + } + return 0; +} + +template <typename I> +void Operations<I>::execute_snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + bool is_protected; + int r = m_image_ctx.is_snap_protected(m_image_ctx.get_snap_id(snap_namespace, snap_name), + &is_protected); + if (r < 0) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(r); + return; + } else if (is_protected) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EBUSY); + return; + } + m_image_ctx.snap_lock.put_read(); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + operation::SnapshotProtectRequest<I> *request = + new operation::SnapshotProtectRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), snap_namespace, snap_name); + request->send(); +} + +template <typename I> +int 
Operations<I>::snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + bool is_unprotected; + r = m_image_ctx.is_snap_unprotected(m_image_ctx.get_snap_id(snap_namespace, snap_name), + &is_unprotected); + if (r < 0) { + return r; + } + + if (is_unprotected) { + return -EINVAL; + } + } + + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + r = invoke_async_request("snap_unprotect", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations<I>::execute_snap_unprotect, + this, snap_namespace, snap_name, _1), + boost::bind(&ImageWatcher<I>::notify_snap_unprotect, + m_image_ctx.image_watcher, + snap_namespace, snap_name, _1)); + if (r < 0 && r != -EINVAL) { + return r; + } + } else { + C_SaferCond cond_ctx; + { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + execute_snap_unprotect(snap_namespace, snap_name, &cond_ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + return r; + } + } + return 0; +} + +template <typename I> +void Operations<I>::execute_snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + bool is_unprotected; + int r = m_image_ctx.is_snap_unprotected(m_image_ctx.get_snap_id(snap_namespace, snap_name), + &is_unprotected); + if (r < 0) { + 
m_image_ctx.snap_lock.put_read(); + on_finish->complete(r); + return; + } else if (is_unprotected) { + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EINVAL); + return; + } + m_image_ctx.snap_lock.put_read(); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + operation::SnapshotUnprotectRequest<I> *request = + new operation::SnapshotUnprotectRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), snap_namespace, snap_name); + request->send(); +} + +template <typename I> +int Operations<I>::snap_set_limit(uint64_t limit) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": limit=" << limit << dendl; + + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond limit_ctx; + { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true); + if (r < 0) { + return r; + } + + execute_snap_set_limit(limit, &limit_ctx); + } + + r = limit_ctx.wait(); + return r; +} + +template <typename I> +void Operations<I>::execute_snap_set_limit(const uint64_t limit, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": limit=" << limit + << dendl; + + operation::SnapshotLimitRequest<I> *request = + new operation::SnapshotLimitRequest<I>(m_image_ctx, on_finish, limit); + request->send(); +} + +template <typename I> +int Operations<I>::update_features(uint64_t features, bool enabled) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": features=" << features + << ", enabled=" << enabled << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } else 
if (m_image_ctx.old_format) { + lderr(cct) << "old-format images do not support features" << dendl; + return -EINVAL; + } + + uint64_t disable_mask = (RBD_FEATURES_MUTABLE | + RBD_FEATURES_DISABLE_ONLY); + if ((enabled && (features & RBD_FEATURES_MUTABLE) != features) || + (!enabled && (features & disable_mask) != features)) { + lderr(cct) << "cannot update immutable features" << dendl; + return -EINVAL; + } + + bool set_object_map = (features & RBD_FEATURE_OBJECT_MAP) == RBD_FEATURE_OBJECT_MAP; + bool set_fast_diff = (features & RBD_FEATURE_FAST_DIFF) == RBD_FEATURE_FAST_DIFF; + bool exist_fast_diff = (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0; + bool exist_object_map = (m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0; + + if ((enabled && ((set_object_map && !exist_fast_diff) || (set_fast_diff && !exist_object_map))) + || (!enabled && (set_object_map && exist_fast_diff))) { + features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF); + } + + if (features == 0) { + lderr(cct) << "update requires at least one feature" << dendl; + return -EINVAL; + } + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + if (enabled && (features & m_image_ctx.features) != 0) { + lderr(cct) << "one or more requested features are already enabled" + << dendl; + return -EINVAL; + } + if (!enabled && (features & ~m_image_ctx.features) != 0) { + lderr(cct) << "one or more requested features are already disabled" + << dendl; + return -EINVAL; + } + } + + // if disabling journaling, avoid attempting to open the journal + // when acquiring the exclusive lock in case the journal is corrupt + bool disabling_journal = false; + if (!enabled && ((features & RBD_FEATURE_JOURNALING) != 0)) { + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + m_image_ctx.set_journal_policy(new journal::DisabledPolicy()); + disabling_journal = true; + } + BOOST_SCOPE_EXIT_ALL( (this)(disabling_journal) ) { + if (disabling_journal) { + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + 
m_image_ctx.set_journal_policy( + new journal::StandardPolicy<I>(&m_image_ctx)); + } + }; + + // The journal options are not passed to the lock owner in the + // update features request. Therefore, if journaling is being + // enabled, the lock should be locally acquired instead of + // attempting to send the request to the peer. + if (enabled && (features & RBD_FEATURE_JOURNALING) != 0) { + C_SaferCond cond_ctx; + { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true); + if (r < 0) { + return r; + } + + execute_update_features(features, enabled, &cond_ctx, 0); + } + + r = cond_ctx.wait(); + } else { + r = invoke_async_request("update_features", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations<I>::execute_update_features, + this, features, enabled, _1, 0), + boost::bind(&ImageWatcher<I>::notify_update_features, + m_image_ctx.image_watcher, features, + enabled, _1)); + } + ldout(cct, 2) << "update_features finished" << dendl; + return r; +} + +template <typename I> +void Operations<I>::execute_update_features(uint64_t features, bool enabled, + Context *on_finish, + uint64_t journal_op_tid) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": features=" << features + << ", enabled=" << enabled << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + if (enabled) { + operation::EnableFeaturesRequest<I> *req = + new operation::EnableFeaturesRequest<I>( + m_image_ctx, on_finish, journal_op_tid, features); + req->send(); + } else { + operation::DisableFeaturesRequest<I> *req = + new operation::DisableFeaturesRequest<I>( + m_image_ctx, on_finish, journal_op_tid, features, false); + req->send(); + } +} + +template <typename 
I> +int Operations<I>::metadata_set(const std::string &key, + const std::string &value) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": key=" << key << ", value=" + << value << dendl; + + std::string config_key; + bool config_override = util::is_metadata_config_override(key, &config_key); + if (config_override) { + // validate config setting + if (!librbd::api::Config<I>::is_option_name(&m_image_ctx, config_key)) { + lderr(cct) << "validation for " << key + << " failed: not allowed image level override" << dendl; + return -EINVAL; + } + int r = ConfigProxy{false}.set_val(config_key.c_str(), value); + if (r < 0) { + return r; + } + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } + + C_SaferCond metadata_ctx; + { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true); + if (r < 0) { + return r; + } + + execute_metadata_set(key, value, &metadata_ctx); + } + + r = metadata_ctx.wait(); + if (config_override && r >= 0) { + // apply new config key immediately + r = m_image_ctx.state->refresh_if_required(); + } + + return r; +} + +template <typename I> +void Operations<I>::execute_metadata_set(const std::string &key, + const std::string &value, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": key=" << key << ", value=" + << value << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + operation::MetadataSetRequest<I> *request = + new operation::MetadataSetRequest<I>(m_image_ctx, + new C_NotifyUpdate<I>(m_image_ctx, on_finish), + key, value); + request->send(); +} + +template <typename I> +int Operations<I>::metadata_remove(const std::string &key) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) 
<< this << " " << __func__ << ": key=" << key << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } + + std::string value; + r = cls_client::metadata_get(&m_image_ctx.md_ctx, m_image_ctx.header_oid, key, &value); + if(r < 0) + return r; + + C_SaferCond metadata_ctx; + { + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true); + if (r < 0) { + return r; + } + + execute_metadata_remove(key, &metadata_ctx); + } + + r = metadata_ctx.wait(); + + std::string config_key; + if (util::is_metadata_config_override(key, &config_key) && r >= 0) { + // apply new config key immediately + r = m_image_ctx.state->refresh_if_required(); + } + + return r; +} + +template <typename I> +void Operations<I>::execute_metadata_remove(const std::string &key, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": key=" << key << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + operation::MetadataRemoveRequest<I> *request = + new operation::MetadataRemoveRequest<I>( + m_image_ctx, + new C_NotifyUpdate<I>(m_image_ctx, on_finish), key); + request->send(); +} + +template <typename I> +int Operations<I>::migrate(ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "migrate" << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } + + { + RWLock::RLocker parent_locker(m_image_ctx.parent_lock); + if (m_image_ctx.migration_info.empty()) { + lderr(cct) << "image has no migrating parent" << dendl; + return -EINVAL; + } + } + + uint64_t request_id = ++m_async_request_seq; + r = invoke_async_request("migrate", + 
exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations<I>::execute_migrate, this, + boost::ref(prog_ctx), _1), + boost::bind(&ImageWatcher<I>::notify_migrate, + m_image_ctx.image_watcher, request_id, + boost::ref(prog_ctx), _1)); + + if (r < 0 && r != -EINVAL) { + return r; + } + ldout(cct, 20) << "migrate finished" << dendl; + return 0; +} + +template <typename I> +void Operations<I>::execute_migrate(ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "migrate" << dendl; + + if (m_image_ctx.read_only || m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.snap_lock.get_read(); + m_image_ctx.parent_lock.get_read(); + + if (m_image_ctx.migration_info.empty()) { + lderr(cct) << "image has no migrating parent" << dendl; + m_image_ctx.parent_lock.put_read(); + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EINVAL); + return; + } + if (m_image_ctx.snap_id != CEPH_NOSNAP) { + lderr(cct) << "snapshots cannot be migrated" << dendl; + m_image_ctx.parent_lock.put_read(); + m_image_ctx.snap_lock.put_read(); + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.parent_lock.put_read(); + m_image_ctx.snap_lock.put_read(); + + operation::MigrateRequest<I> *req = new operation::MigrateRequest<I>( + m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), prog_ctx); + req->send(); +} + +template <typename I> +int Operations<I>::sparsify(size_t sparse_size, ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "sparsify" << dendl; + + if (sparse_size < 4096 || sparse_size > m_image_ctx.get_object_size() || + (sparse_size & (sparse_size - 1)) != 0) { + lderr(cct) << "sparse size should be power of two not less than 4096" + << " and not larger image 
object size" << dendl; + return -EINVAL; + } + + uint64_t request_id = ++m_async_request_seq; + int r = invoke_async_request("sparsify", + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations<I>::execute_sparsify, + this, sparse_size, + boost::ref(prog_ctx), _1), + boost::bind(&ImageWatcher<I>::notify_sparsify, + m_image_ctx.image_watcher, + request_id, sparse_size, + boost::ref(prog_ctx), _1)); + if (r < 0 && r != -EINVAL) { + return r; + } + ldout(cct, 20) << "resparsify finished" << dendl; + return 0; +} + +template <typename I> +void Operations<I>::execute_sparsify(size_t sparse_size, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "sparsify" << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + auto req = new operation::SparsifyRequest<I>( + m_image_ctx, sparse_size, new C_NotifyUpdate<I>(m_image_ctx, on_finish), + prog_ctx); + req->send(); +} + +template <typename I> +int Operations<I>::prepare_image_update( + exclusive_lock::OperationRequestType request_type, bool request_lock) { + ceph_assert(m_image_ctx.owner_lock.is_locked() && + !m_image_ctx.owner_lock.is_wlocked()); + if (m_image_ctx.image_watcher == nullptr) { + return -EROFS; + } + + // need to upgrade to a write lock + C_SaferCond ctx; + m_image_ctx.owner_lock.put_read(); + bool attempting_lock = false; + { + RWLock::WLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock != nullptr && + (!m_image_ctx.exclusive_lock->is_lock_owner() || + !m_image_ctx.exclusive_lock->accept_request(request_type, nullptr))) { + + attempting_lock = true; + m_image_ctx.exclusive_lock->block_requests(0); + + if (request_lock) { + m_image_ctx.exclusive_lock->acquire_lock(&ctx); + } else { + 
m_image_ctx.exclusive_lock->try_acquire_lock(&ctx); + } + } + } + + int r = 0; + if (attempting_lock) { + r = ctx.wait(); + } + + m_image_ctx.owner_lock.get_read(); + if (attempting_lock && m_image_ctx.exclusive_lock != nullptr) { + m_image_ctx.exclusive_lock->unblock_requests(); + } + + if (r == -EAGAIN || r == -EBUSY) { + r = 0; + } + if (r < 0) { + return r; + } else if (m_image_ctx.exclusive_lock != nullptr && + !m_image_ctx.exclusive_lock->is_lock_owner()) { + return m_image_ctx.exclusive_lock->get_unlocked_op_error(); + } + + return 0; +} + +template <typename I> +int Operations<I>::invoke_async_request( + const std::string& name, exclusive_lock::OperationRequestType request_type, + bool permit_snapshot, const boost::function<void(Context*)>& local_request, + const boost::function<void(Context*)>& remote_request) { + C_SaferCond ctx; + C_InvokeAsyncRequest<I> *req = new C_InvokeAsyncRequest<I>(m_image_ctx, name, + request_type, + permit_snapshot, + local_request, + remote_request, + {}, &ctx); + req->send(); + return ctx.wait(); +} + +} // namespace librbd + +template class librbd::Operations<librbd::ImageCtx>; diff --git a/src/librbd/Operations.h b/src/librbd/Operations.h new file mode 100644 index 00000000..253c9f92 --- /dev/null +++ b/src/librbd/Operations.h @@ -0,0 +1,129 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATIONS_H +#define CEPH_LIBRBD_OPERATIONS_H + +#include "cls/rbd/cls_rbd_types.h" +#include "include/int_types.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/operation/ObjectMapIterate.h" +#include <atomic> +#include <string> +#include <boost/function.hpp> + +class Context; + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +template <typename ImageCtxT = ImageCtx> +class Operations { +public: + Operations(ImageCtxT &image_ctx); + + int flatten(ProgressContext &prog_ctx); + void execute_flatten(ProgressContext &prog_ctx, Context 
*on_finish); + + int rebuild_object_map(ProgressContext &prog_ctx); + void execute_rebuild_object_map(ProgressContext &prog_ctx, + Context *on_finish); + + int check_object_map(ProgressContext &prog_ctx); + void check_object_map(ProgressContext &prog_ctx, Context *on_finish); + + void object_map_iterate(ProgressContext &prog_ctx, + operation::ObjectIterateWork<ImageCtxT> handle_mismatch, + Context* on_finish); + + int rename(const char *dstname); + void execute_rename(const std::string &dest_name, Context *on_finish); + + int resize(uint64_t size, bool allow_shrink, ProgressContext& prog_ctx); + void execute_resize(uint64_t size, bool allow_shrink, ProgressContext &prog_ctx, + Context *on_finish, uint64_t journal_op_tid); + + int snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string& snap_name); + void snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string& snap_name, Context *on_finish); + void execute_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish, + uint64_t journal_op_tid, bool skip_object_map); + + int snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name, + ProgressContext& prog_ctx); + void execute_snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + ProgressContext& prog_ctx, Context *on_finish); + + int snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name); + void snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name, + Context *on_finish); + void execute_snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish); + + int snap_rename(const char *srcname, const char *dstname); + void execute_snap_rename(const uint64_t src_snap_id, + const std::string &dest_snap_name, + Context *on_finish); + + int 
snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name); + void execute_snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish); + + int snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name); + void execute_snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish); + + int snap_set_limit(uint64_t limit); + void execute_snap_set_limit(uint64_t limit, Context *on_finish); + + int update_features(uint64_t features, bool enabled); + void execute_update_features(uint64_t features, bool enabled, + Context *on_finish, uint64_t journal_op_tid); + + int metadata_set(const std::string &key, const std::string &value); + void execute_metadata_set(const std::string &key, const std::string &value, + Context *on_finish); + + int metadata_remove(const std::string &key); + void execute_metadata_remove(const std::string &key, Context *on_finish); + + int migrate(ProgressContext &prog_ctx); + void execute_migrate(ProgressContext &prog_ctx, Context *on_finish); + + int sparsify(size_t sparse_size, ProgressContext &prog_ctx); + void execute_sparsify(size_t sparse_size, ProgressContext &prog_ctx, + Context *on_finish); + + int prepare_image_update(exclusive_lock::OperationRequestType request_type, + bool request_lock); + +private: + ImageCtxT &m_image_ctx; + std::atomic<int> m_async_request_seq; + + int invoke_async_request(const std::string& name, + exclusive_lock::OperationRequestType request_type, + bool permit_snapshot, + const boost::function<void(Context*)>& local, + const boost::function<void(Context*)>& remote); +}; + +} // namespace librbd + +extern template class librbd::Operations<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATIONS_H diff --git a/src/librbd/TaskFinisher.h b/src/librbd/TaskFinisher.h new file mode 100644 index 00000000..410b8ee8 --- /dev/null +++ 
b/src/librbd/TaskFinisher.h @@ -0,0 +1,159 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef LIBRBD_TASK_FINISHER_H +#define LIBRBD_TASK_FINISHER_H + +#include "include/Context.h" +#include "common/ceph_context.h" +#include "common/Finisher.h" +#include "common/Mutex.h" +#include "common/Timer.h" +#include <map> +#include <utility> + +class CephContext; + +namespace librbd { + +struct TaskFinisherSingleton { + Mutex m_lock; + SafeTimer *m_safe_timer; + Finisher *m_finisher; + + explicit TaskFinisherSingleton(CephContext *cct) + : m_lock("librbd::TaskFinisher::m_lock") { + m_safe_timer = new SafeTimer(cct, m_lock, false); + m_safe_timer->init(); + m_finisher = new Finisher(cct, "librbd::TaskFinisher::m_finisher", "taskfin_librbd"); + m_finisher->start(); + } + virtual ~TaskFinisherSingleton() { + { + Mutex::Locker l(m_lock); + m_safe_timer->shutdown(); + delete m_safe_timer; + } + m_finisher->wait_for_empty(); + m_finisher->stop(); + delete m_finisher; + } +}; + + +template <typename Task> +class TaskFinisher { +public: + TaskFinisher(CephContext &cct) : m_cct(cct) { + auto& singleton = + cct.lookup_or_create_singleton_object<TaskFinisherSingleton>( + "librbd::TaskFinisher::m_safe_timer", false, &cct); + m_lock = &singleton.m_lock; + m_safe_timer = singleton.m_safe_timer; + m_finisher = singleton.m_finisher; + } + + void cancel(const Task& task) { + Mutex::Locker l(*m_lock); + typename TaskContexts::iterator it = m_task_contexts.find(task); + if (it != m_task_contexts.end()) { + delete it->second.first; + m_safe_timer->cancel_event(it->second.second); + m_task_contexts.erase(it); + } + } + + void cancel_all(Context *comp) { + { + Mutex::Locker l(*m_lock); + for (typename TaskContexts::iterator it = m_task_contexts.begin(); + it != m_task_contexts.end(); ++it) { + delete it->second.first; + m_safe_timer->cancel_event(it->second.second); + } + m_task_contexts.clear(); + } + m_finisher->queue(comp); + } + + 
bool add_event_after(const Task& task, double seconds, Context *ctx) { + Mutex::Locker l(*m_lock); + if (m_task_contexts.count(task) != 0) { + // task already scheduled on finisher or timer + delete ctx; + return false; + } + C_Task *timer_ctx = new C_Task(this, task); + m_task_contexts[task] = std::make_pair(ctx, timer_ctx); + + m_safe_timer->add_event_after(seconds, timer_ctx); + return true; + } + + void queue(Context *ctx) { + m_finisher->queue(ctx); + } + + bool queue(const Task& task, Context *ctx) { + Mutex::Locker l(*m_lock); + typename TaskContexts::iterator it = m_task_contexts.find(task); + if (it != m_task_contexts.end()) { + if (it->second.second != NULL) { + ceph_assert(m_safe_timer->cancel_event(it->second.second)); + delete it->second.first; + } else { + // task already scheduled on the finisher + delete ctx; + return false; + } + } + m_task_contexts[task] = std::make_pair(ctx, reinterpret_cast<Context *>(0)); + + m_finisher->queue(new C_Task(this, task)); + return true; + } + +private: + class C_Task : public Context { + public: + C_Task(TaskFinisher *task_finisher, const Task& task) + : m_task_finisher(task_finisher), m_task(task) + { + } + protected: + void finish(int r) override { + m_task_finisher->complete(m_task); + } + private: + TaskFinisher *m_task_finisher; + Task m_task; + }; + + CephContext &m_cct; + + Mutex *m_lock; + Finisher *m_finisher; + SafeTimer *m_safe_timer; + + typedef std::map<Task, std::pair<Context *, Context *> > TaskContexts; + TaskContexts m_task_contexts; + + void complete(const Task& task) { + Context *ctx = NULL; + { + Mutex::Locker l(*m_lock); + typename TaskContexts::iterator it = m_task_contexts.find(task); + if (it != m_task_contexts.end()) { + ctx = it->second.first; + m_task_contexts.erase(it); + } + } + + if (ctx != NULL) { + ctx->complete(0); + } + } +}; + +} // namespace librbd + +#endif // LIBRBD_TASK_FINISHER diff --git a/src/librbd/TrashWatcher.cc b/src/librbd/TrashWatcher.cc new file mode 100644 index 
00000000..74e12e48 --- /dev/null +++ b/src/librbd/TrashWatcher.cc @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/TrashWatcher.h" +#include "include/rbd_types.h" +#include "include/rados/librados.hpp" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "librbd/watcher/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::TrashWatcher: " << __func__ << ": " + +namespace librbd { + +using namespace trash_watcher; +using namespace watcher; + +using librbd::util::create_rados_callback; + +namespace { + +static const uint64_t NOTIFY_TIMEOUT_MS = 5000; + +} // anonymous namespace + +template <typename I> +TrashWatcher<I>::TrashWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue) + : Watcher(io_ctx, work_queue, RBD_TRASH) { +} + +template <typename I> +void TrashWatcher<I>::notify_image_added( + librados::IoCtx &io_ctx, const std::string& image_id, + const cls::rbd::TrashImageSpec& trash_image_spec, Context *on_finish) { + CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + bufferlist bl; + encode(NotifyMessage{ImageAddedPayload{image_id, trash_image_spec}}, bl); + + librados::AioCompletion *comp = create_rados_callback(on_finish); + int r = io_ctx.aio_notify(RBD_TRASH, comp, bl, NOTIFY_TIMEOUT_MS, nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void TrashWatcher<I>::notify_image_removed(librados::IoCtx &io_ctx, + const std::string& image_id, + Context *on_finish) { + CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + bufferlist bl; + encode(NotifyMessage{ImageRemovedPayload{image_id}}, bl); + + librados::AioCompletion *comp = create_rados_callback(on_finish); + int r = io_ctx.aio_notify(RBD_TRASH, comp, bl, NOTIFY_TIMEOUT_MS, nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template 
<typename I> +void TrashWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) { + CephContext *cct = this->m_cct; + ldout(cct, 15) << "notify_id=" << notify_id << ", " + << "handle=" << handle << dendl; + + + NotifyMessage notify_message; + try { + auto iter = bl.cbegin(); + decode(notify_message, iter); + } catch (const buffer::error &err) { + lderr(cct) << "error decoding image notification: " << err.what() + << dendl; + Context *ctx = new C_NotifyAck(this, notify_id, handle); + ctx->complete(0); + return; + } + + apply_visitor(watcher::util::HandlePayloadVisitor<TrashWatcher<I>>( + this, notify_id, handle), notify_message.payload); +} + +template <typename I> +bool TrashWatcher<I>::handle_payload(const ImageAddedPayload &payload, + Context *on_notify_ack) { + CephContext *cct = this->m_cct; + ldout(cct, 20) << dendl; + handle_image_added(payload.image_id, payload.trash_image_spec); + return true; +} + +template <typename I> +bool TrashWatcher<I>::handle_payload(const ImageRemovedPayload &payload, + Context *on_notify_ack) { + CephContext *cct = this->m_cct; + ldout(cct, 20) << dendl; + handle_image_removed(payload.image_id); + return true; +} + +template <typename I> +bool TrashWatcher<I>::handle_payload(const UnknownPayload &payload, + Context *on_notify_ack) { + return true; +} + +} // namespace librbd + +template class librbd::TrashWatcher<librbd::ImageCtx>; diff --git a/src/librbd/TrashWatcher.h b/src/librbd/TrashWatcher.h new file mode 100644 index 00000000..8f5fd139 --- /dev/null +++ b/src/librbd/TrashWatcher.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_TRASH_WATCHER_H +#define CEPH_LIBRBD_TRASH_WATCHER_H + +#include "include/int_types.h" +#include "include/rados/librados_fwd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Watcher.h" +#include 
"librbd/trash_watcher/Types.h" + +namespace librbd { + +namespace watcher { +namespace util { +template <typename> struct HandlePayloadVisitor; +} // namespace util +} // namespace watcher + +template <typename ImageCtxT = librbd::ImageCtx> +class TrashWatcher : public Watcher { + friend struct watcher::util::HandlePayloadVisitor<TrashWatcher<ImageCtxT>>; +public: + TrashWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue); + + static void notify_image_added(librados::IoCtx &io_ctx, + const std::string& image_id, + const cls::rbd::TrashImageSpec& spec, + Context *on_finish); + static void notify_image_removed(librados::IoCtx &io_ctx, + const std::string& image_id, + Context *on_finish); + +protected: + virtual void handle_image_added(const std::string &image_id, + const cls::rbd::TrashImageSpec& spec) = 0; + virtual void handle_image_removed(const std::string &image_id) = 0; + +private: + void handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) override; + + bool handle_payload(const trash_watcher::ImageAddedPayload &payload, + Context *on_notify_ack); + bool handle_payload(const trash_watcher::ImageRemovedPayload &payload, + Context *on_notify_ack); + bool handle_payload(const trash_watcher::UnknownPayload &payload, + Context *on_notify_ack); +}; + +} // namespace librbd + +extern template class librbd::TrashWatcher<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_TRASH_WATCHER_H diff --git a/src/librbd/Types.h b/src/librbd/Types.h new file mode 100644 index 00000000..3f110447 --- /dev/null +++ b/src/librbd/Types.h @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_TYPES_H +#define LIBRBD_TYPES_H + +#include "include/types.h" +#include "cls/rbd/cls_rbd_types.h" +#include "deep_copy/Types.h" +#include <map> +#include <string> + +namespace librbd { + +// Performance counters +enum { + l_librbd_first = 26000, + + l_librbd_rd, // read ops + 
/// Ordered map from one 64-bit snapshot sequence number to another.
using SnapSeqs = std::map<uint64_t, uint64_t>;
*/ + uint64_t overlap = 0; +}; + +struct SnapInfo { + std::string name; + cls::rbd::SnapshotNamespace snap_namespace; + uint64_t size; + ParentImageInfo parent; + uint8_t protection_status; + uint64_t flags; + utime_t timestamp; + SnapInfo(std::string _name, + const cls::rbd::SnapshotNamespace &_snap_namespace, + uint64_t _size, const ParentImageInfo &_parent, + uint8_t _protection_status, uint64_t _flags, utime_t _timestamp) + : name(_name), snap_namespace(_snap_namespace), size(_size), + parent(_parent), protection_status(_protection_status), flags(_flags), + timestamp(_timestamp) { + } +}; + +enum { + OPEN_FLAG_SKIP_OPEN_PARENT = 1 << 0, + OPEN_FLAG_OLD_FORMAT = 1 << 1, + OPEN_FLAG_IGNORE_MIGRATING = 1 << 2 +}; + +struct MigrationInfo { + int64_t pool_id = -1; + std::string pool_namespace; + std::string image_name; + std::string image_id; + deep_copy::SnapMap snap_map; + uint64_t overlap = 0; + bool flatten = false; + + MigrationInfo() { + } + MigrationInfo(int64_t pool_id, const std::string& pool_namespace, + const std::string& image_name, const std::string& image_id, + const deep_copy::SnapMap &snap_map, uint64_t overlap, + bool flatten) + : pool_id(pool_id), pool_namespace(pool_namespace), image_name(image_name), + image_id(image_id), snap_map(snap_map), overlap(overlap), + flatten(flatten) { + } + + bool empty() const { + return pool_id == -1; + } +}; + +} // namespace librbd + +#endif // LIBRBD_TYPES_H diff --git a/src/librbd/Utils.cc b/src/librbd/Utils.cc new file mode 100644 index 00000000..c9d7b041 --- /dev/null +++ b/src/librbd/Utils.cc @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/algorithm/string.hpp> +#include <boost/lexical_cast.hpp> + +#include "librbd/Utils.h" +#include "include/rbd_types.h" +#include "include/stringify.h" +#include "include/rbd/features.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/Features.h" +#include 
<random> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::util::" << __func__ << ": " + +namespace librbd { +namespace util { + +const std::string group_header_name(const std::string &group_id) +{ + return RBD_GROUP_HEADER_PREFIX + group_id; +} + +const std::string id_obj_name(const std::string &name) +{ + return RBD_ID_PREFIX + name; +} + +const std::string header_name(const std::string &image_id) +{ + return RBD_HEADER_PREFIX + image_id; +} + +const std::string old_header_name(const std::string &image_name) +{ + return image_name + RBD_SUFFIX; +} + +std::string unique_lock_name(const std::string &name, void *address) { + return name + " (" + stringify(address) + ")"; +} + +librados::AioCompletion *create_rados_callback(Context *on_finish) { + return create_rados_callback<Context, &Context::complete>(on_finish); +} + +std::string generate_image_id(librados::IoCtx &ioctx) { + librados::Rados rados(ioctx); + + uint64_t bid = rados.get_instance_id(); + std::mt19937 generator{std::random_device{}()}; + std::uniform_int_distribution<uint32_t> distribution{0, 0xFFFFFFFF}; + uint32_t extra = distribution(generator); + + ostringstream bid_ss; + bid_ss << std::hex << bid << std::hex << extra; + std::string id = bid_ss.str(); + + // ensure the image id won't overflow the fixed block name size + if (id.length() > RBD_MAX_IMAGE_ID_LENGTH) { + id = id.substr(id.length() - RBD_MAX_IMAGE_ID_LENGTH); + } + + return id; +} + +uint64_t get_rbd_default_features(CephContext* cct) +{ + auto value = cct->_conf.get_val<std::string>("rbd_default_features"); + return librbd::rbd_features_from_string(value, nullptr); +} + + +bool calc_sparse_extent(const bufferptr &bp, + size_t sparse_size, + uint64_t length, + size_t *write_offset, + size_t *write_length, + size_t *offset) { + size_t extent_size; + if (*offset + sparse_size > length) { + extent_size = length - *offset; + } else { + extent_size = sparse_size; + } + + bufferptr extent(bp, 
*offset, extent_size); + *offset += extent_size; + + bool extent_is_zero = extent.is_zero(); + if (!extent_is_zero) { + *write_length += extent_size; + } + if (extent_is_zero && *write_length == 0) { + *write_offset += extent_size; + } + + if ((extent_is_zero || *offset == length) && *write_length != 0) { + return true; + } + return false; +} + +bool is_metadata_config_override(const std::string& metadata_key, + std::string* config_key) { + size_t prefix_len = librbd::ImageCtx::METADATA_CONF_PREFIX.size(); + if (metadata_key.size() > prefix_len && + metadata_key.compare(0, prefix_len, + librbd::ImageCtx::METADATA_CONF_PREFIX) == 0) { + *config_key = metadata_key.substr(prefix_len, + metadata_key.size() - prefix_len); + return true; + } + return false; +} + +int create_ioctx(librados::IoCtx& src_io_ctx, const std::string& pool_desc, + int64_t pool_id, + const std::optional<std::string>& pool_namespace, + librados::IoCtx* dst_io_ctx) { + auto cct = (CephContext *)src_io_ctx.cct(); + + librados::Rados rados(src_io_ctx); + int r = rados.ioctx_create2(pool_id, *dst_io_ctx); + if (r == -ENOENT) { + ldout(cct, 1) << pool_desc << " pool " << pool_id << " no longer exists" + << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "error accessing " << pool_desc << " pool " << pool_id + << dendl; + return r; + } + + dst_io_ctx->set_namespace( + pool_namespace ? 
*pool_namespace : src_io_ctx.get_namespace()); + return 0; +} + +} // namespace util +} // namespace librbd diff --git a/src/librbd/Utils.h b/src/librbd/Utils.h new file mode 100644 index 00000000..c0b68b9f --- /dev/null +++ b/src/librbd/Utils.h @@ -0,0 +1,221 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_UTILS_H +#define CEPH_LIBRBD_UTILS_H + +#include "include/rados/librados.hpp" +#include "include/rbd_types.h" +#include "include/Context.h" +#include "common/zipkin_trace.h" + +#include <atomic> +#include <type_traits> + +namespace librbd { + +class ImageCtx; + +namespace util { +namespace detail { + +template <typename T> +void rados_callback(rados_completion_t c, void *arg) { + reinterpret_cast<T*>(arg)->complete(rados_aio_get_return_value(c)); +} + +template <typename T, void(T::*MF)(int)> +void rados_callback(rados_completion_t c, void *arg) { + T *obj = reinterpret_cast<T*>(arg); + int r = rados_aio_get_return_value(c); + (obj->*MF)(r); +} + +template <typename T, Context*(T::*MF)(int*), bool destroy> +void rados_state_callback(rados_completion_t c, void *arg) { + T *obj = reinterpret_cast<T*>(arg); + int r = rados_aio_get_return_value(c); + Context *on_finish = (obj->*MF)(&r); + if (on_finish != nullptr) { + on_finish->complete(r); + if (destroy) { + delete obj; + } + } +} + +template <typename T, void (T::*MF)(int)> +class C_CallbackAdapter : public Context { + T *obj; +public: + C_CallbackAdapter(T *obj) : obj(obj) { + } + +protected: + void finish(int r) override { + (obj->*MF)(r); + } +}; + +template <typename T, Context*(T::*MF)(int*), bool destroy> +class C_StateCallbackAdapter : public Context { + T *obj; +public: + C_StateCallbackAdapter(T *obj) : obj(obj){ + } + +protected: + void complete(int r) override { + Context *on_finish = (obj->*MF)(&r); + if (on_finish != nullptr) { + on_finish->complete(r); + if (destroy) { + delete obj; + } + } + Context::complete(r); + } + 
void finish(int r) override {
+  }
+};
+// Context that, when finished, queues on_finish with the same result onto op_work_queue instead of completing it inline.
+template <typename WQ>
+struct C_AsyncCallback : public Context {
+  WQ *op_work_queue;
+  Context *on_finish;
+
+  C_AsyncCallback(WQ *op_work_queue, Context *on_finish)
+    : op_work_queue(op_work_queue), on_finish(on_finish) {
+  }
+  void finish(int r) override {
+    op_work_queue->queue(on_finish, r);
+  }
+};
+
+} // namespace detail
+
+std::string generate_image_id(librados::IoCtx &ioctx);
+// Template overload that ignores T and forwards to the non-template generate_image_id().
+template <typename T>
+inline std::string generate_image_id(librados::IoCtx &ioctx) {
+  return generate_image_id(ioctx);
+}
+
+const std::string group_header_name(const std::string &group_id);
+const std::string id_obj_name(const std::string &name);
+const std::string header_name(const std::string &image_id);
+const std::string old_header_name(const std::string &image_name);
+std::string unique_lock_name(const std::string &name, void *address);
+
+librados::AioCompletion *create_rados_callback(Context *on_finish);
+// Builds an AioCompletion that calls obj->complete(r) with the AIO return value.
+template <typename T>
+librados::AioCompletion *create_rados_callback(T *obj) {
+  return librados::Rados::aio_create_completion(
+    obj, &detail::rados_callback<T>, nullptr);
+}
+// Builds an AioCompletion that calls (obj->*MF)(r) with the AIO return value.
+template <typename T, void(T::*MF)(int)>
+librados::AioCompletion *create_rados_callback(T *obj) {
+  return librados::Rados::aio_create_completion(
+    obj, &detail::rados_callback<T, MF>, nullptr);
+}
+// Builds an AioCompletion routed through detail::rados_state_callback; destroy defaults to true.
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+librados::AioCompletion *create_rados_callback(T *obj) {
+  return librados::Rados::aio_create_completion(
+    obj, &detail::rados_state_callback<T, MF, destroy>, nullptr);
+}
+// Heap-allocates a Context that forwards its result to (obj->*MF)(r); MF defaults to T::complete.
+template <typename T, void(T::*MF)(int) = &T::complete>
+Context *create_context_callback(T *obj) {
+  return new detail::C_CallbackAdapter<T, MF>(obj);
+}
+// Heap-allocates a state-machine Context (see detail::C_StateCallbackAdapter); destroy defaults to true.
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+Context *create_context_callback(T *obj) {
+  return new detail::C_StateCallbackAdapter<T, MF, destroy>(obj);
+}
+
+template <typename I>
+Context *create_async_context_callback(I &image_ctx,
Context *on_finish) {
+  // use async callback to acquire a clean lock context
+  return new detail::C_AsyncCallback<
+    typename std::decay<decltype(*image_ctx.op_work_queue)>::type>(
+      image_ctx.op_work_queue, on_finish);
+}
+// Overload taking the work queue directly rather than reading it from an image context.
+template <typename WQ>
+Context *create_async_context_callback(WQ *work_queue, Context *on_finish) {
+  // use async callback to acquire a clean lock context
+  return new detail::C_AsyncCallback<WQ>(work_queue, on_finish);
+}
+
+// TODO: temporary until AioCompletion supports templated ImageCtx
+inline ImageCtx *get_image_ctx(ImageCtx *image_ctx) {
+  return image_ctx;
+}
+
+/// helper for tracking in-flight async ops when coordinating
+/// a shut down of the invoking class instance
+class AsyncOpTracker {
+public:
+  void start_op() {
+    m_refs++;
+  }
+  // Drops one reference; the caller that takes the count to zero fires (and clears) a pending waiter.
+  void finish_op() {
+    if (--m_refs == 0 && m_on_finish != nullptr) {
+      Context *on_finish = nullptr;
+      std::swap(on_finish, m_on_finish);
+      on_finish->complete(0);
+    }
+  }
+  // Registers a single waiter (asserted); completes it at once when no ops are in flight. NOTE(review): m_on_finish is a plain pointer with no lock visible here -- presumably callers serialize wait() against finish_op(); confirm.
+  template <typename I>
+  void wait(I &image_ctx, Context *on_finish) {
+    ceph_assert(m_on_finish == nullptr);
+
+    on_finish = create_async_context_callback(image_ctx, on_finish);
+    if (m_refs == 0) {
+      on_finish->complete(0);
+      return;
+    }
+    m_on_finish = on_finish;
+  }
+
+private:
+  std::atomic<uint64_t> m_refs = { 0 };
+  Context *m_on_finish = nullptr;
+};
+
+uint64_t get_rbd_default_features(CephContext* cct);
+
+bool calc_sparse_extent(const bufferptr &bp,
+                        size_t sparse_size,
+                        uint64_t length,
+                        size_t *write_offset,
+                        size_t *write_length,
+                        size_t *offset);
+// Returns a child trace of parent_trace bound to image_ctx.trace_endpoint, or a default-constructed Trace when the parent is not valid.
+template <typename I>
+inline ZTracer::Trace create_trace(const I &image_ctx, const char *trace_name,
+                                   const ZTracer::Trace &parent_trace) {
+  if (parent_trace.valid()) {
+    return ZTracer::Trace(trace_name, &image_ctx.trace_endpoint, &parent_trace);
+  }
+  return ZTracer::Trace();
+}
+
+bool is_metadata_config_override(const std::string& metadata_key,
+                                 std::string* config_key);
+
+int create_ioctx(librados::IoCtx& src_io_ctx, const std::string& pool_desc,
+
int64_t pool_id,
+                 const std::optional<std::string>& pool_namespace,
+                 librados::IoCtx* dst_io_ctx);
+
+} // namespace util
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_UTILS_H
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
new file mode 100644
index 00000000..2ed85f78
--- /dev/null
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -0,0 +1,525 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "librbd/WatchNotifyTypes.h"
+#include "librbd/watcher/Utils.h"
+
+namespace librbd {
+namespace watch_notify {
+
+namespace {
+// Variant visitor returning the active payload type's static CHECK_FOR_REFRESH flag.
+class CheckForRefreshVisitor : public boost::static_visitor<bool> {
+public:
+  template <typename Payload>
+  inline bool operator()(const Payload &payload) const {
+    return Payload::CHECK_FOR_REFRESH;
+  }
+};
+// Variant visitor returning the active payload type's static NOTIFY_OP.
+class GetNotifyOpVisitor : public boost::static_visitor<NotifyOp> {
+public:
+  template <typename Payload>
+  NotifyOp operator()(const Payload &payload) const {
+    return Payload::NOTIFY_OP;
+  }
+};
+// Variant visitor that writes "notify_op" (stringified) and then the payload's own fields to the formatter.
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+  explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+  template <typename Payload>
+  inline void operator()(const Payload &payload) const {
+    NotifyOp notify_op = Payload::NOTIFY_OP;
+    m_formatter->dump_string("notify_op", stringify(notify_op));
+    payload.dump(m_formatter);
+  }
+
+private:
+  ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+// Wire order: client_id, then request_id (decode below mirrors it).
+void AsyncRequestId::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(client_id, bl);
+  encode(request_id, bl);
+}
+
+void AsyncRequestId::decode(bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(client_id, iter);
+  decode(request_id, iter);
+}
+
+void AsyncRequestId::dump(Formatter *f) const {
+  f->open_object_section("client_id");
+
client_id.dump(f);
+  f->close_section();
+  f->dump_unsigned("request_id", request_id);
+}
+
+void AcquiredLockPayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(client_id, bl);
+}
+// client_id is only read when the message version is >= 2; older messages leave it default-constructed.
+void AcquiredLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  if (version >= 2) {
+    decode(client_id, iter);
+  }
+}
+
+void AcquiredLockPayload::dump(Formatter *f) const {
+  f->open_object_section("client_id");
+  client_id.dump(f);
+  f->close_section();
+}
+
+void ReleasedLockPayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(client_id, bl);
+}
+// Same version gating as AcquiredLockPayload::decode.
+void ReleasedLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  if (version >= 2) {
+    decode(client_id, iter);
+  }
+}
+
+void ReleasedLockPayload::dump(Formatter *f) const {
+  f->open_object_section("client_id");
+  client_id.dump(f);
+  f->close_section();
+}
+
+void RequestLockPayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(client_id, bl);
+  encode(force, bl);
+}
+// client_id is read from version 2 onward, force from version 3 onward.
+void RequestLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  if (version >= 2) {
+    decode(client_id, iter);
+  }
+  if (version >= 3) {
+    decode(force, iter);
+  }
+}
+
+void RequestLockPayload::dump(Formatter *f) const {
+  f->open_object_section("client_id");
+  client_id.dump(f);
+  f->close_section();
+  f->dump_bool("force", force);
+}
+// HeaderUpdatePayload carries no fields, so its encode/decode/dump are intentionally empty.
+void HeaderUpdatePayload::encode(bufferlist &bl) const {
+}
+
+void HeaderUpdatePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void HeaderUpdatePayload::dump(Formatter *f) const {
+}
+
+void AsyncRequestPayloadBase::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(async_request_id, bl);
+}
+
+void AsyncRequestPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(async_request_id, iter);
+}
+
+void AsyncRequestPayloadBase::dump(Formatter *f) const {
+
f->open_object_section("async_request_id");
+  async_request_id.dump(f);
+  f->close_section();
+}
+// Wire order: base async_request_id, then offset, then total.
+void AsyncProgressPayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  AsyncRequestPayloadBase::encode(bl);
+  encode(offset, bl);
+  encode(total, bl);
+}
+
+void AsyncProgressPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  AsyncRequestPayloadBase::decode(version, iter);
+  decode(offset, iter);
+  decode(total, iter);
+}
+
+void AsyncProgressPayload::dump(Formatter *f) const {
+  AsyncRequestPayloadBase::dump(f);
+  f->dump_unsigned("offset", offset);
+  f->dump_unsigned("total", total);
+}
+// Wire order: base async_request_id, then result.
+void AsyncCompletePayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  AsyncRequestPayloadBase::encode(bl);
+  encode(result, bl);
+}
+
+void AsyncCompletePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  AsyncRequestPayloadBase::decode(version, iter);
+  decode(result, iter);
+}
+
+void AsyncCompletePayload::dump(Formatter *f) const {
+  AsyncRequestPayloadBase::dump(f);
+  f->dump_int("result", result);
+}
+// Wire order differs from the other async payloads: size comes first, then the base id, then allow_shrink.
+void ResizePayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(size, bl);
+  AsyncRequestPayloadBase::encode(bl);
+  encode(allow_shrink, bl);
+}
+// allow_shrink is only read when the message version is >= 4.
+void ResizePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(size, iter);
+  AsyncRequestPayloadBase::decode(version, iter);
+
+  if (version >= 4) {
+    decode(allow_shrink, iter);
+  }
+}
+
+void ResizePayload::dump(Formatter *f) const {
+  f->dump_unsigned("size", size);
+  f->dump_bool("allow_shrink", allow_shrink);
+  AsyncRequestPayloadBase::dump(f);
+}
+// Wire order: snap_name, then snap_namespace.
+void SnapPayloadBase::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(snap_name, bl);
+  encode(snap_namespace, bl);
+}
+// snap_namespace is read here only for versions >= 6.
+void SnapPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(snap_name, iter);
+  if (version >= 6) {
+    decode(snap_namespace, iter);
+
}
+}
+
+void SnapPayloadBase::dump(Formatter *f) const {
+  f->dump_string("snap_name", snap_name);
+  snap_namespace.dump(f);
+}
+
+void SnapCreatePayload::encode(bufferlist &bl) const {
+  SnapPayloadBase::encode(bl);
+}
+// Version 5 is special-cased: snap_namespace is decoded here because the base decode only reads it from version 6 onward.
+void SnapCreatePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  SnapPayloadBase::decode(version, iter);
+  if (version == 5) {
+    decode(snap_namespace, iter);
+  }
+}
+
+void SnapCreatePayload::dump(Formatter *f) const {
+  SnapPayloadBase::dump(f);
+}
+// Wire order: snap_id first, then the SnapPayloadBase fields.
+void SnapRenamePayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(snap_id, bl);
+  SnapPayloadBase::encode(bl);
+}
+
+void SnapRenamePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(snap_id, iter);
+  SnapPayloadBase::decode(version, iter);
+}
+// snap_id is dumped under the key "src_snap_id".
+void SnapRenamePayload::dump(Formatter *f) const {
+  f->dump_unsigned("src_snap_id", snap_id);
+  SnapPayloadBase::dump(f);
+}
+
+void RenamePayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(image_name, bl);
+}
+
+void RenamePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(image_name, iter);
+}
+
+void RenamePayload::dump(Formatter *f) const {
+  f->dump_string("image_name", image_name);
+}
+// Wire order: features, then enabled.
+void UpdateFeaturesPayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(features, bl);
+  encode(enabled, bl);
+}
+
+void UpdateFeaturesPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(features, iter);
+  decode(enabled, iter);
+}
+
+void UpdateFeaturesPayload::dump(Formatter *f) const {
+  f->dump_unsigned("features", features);
+  f->dump_bool("enabled", enabled);
+}
+// Wire order: base async_request_id, then sparse_size.
+void SparsifyPayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  AsyncRequestPayloadBase::encode(bl);
+  encode(sparse_size, bl);
+}
+
+void SparsifyPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+
AsyncRequestPayloadBase::decode(version, iter);
+  decode(sparse_size, iter);
+}
+
+void SparsifyPayload::dump(Formatter *f) const {
+  AsyncRequestPayloadBase::dump(f);
+  f->dump_unsigned("sparse_size", sparse_size);
+}
+// An UnknownPayload must never be sent: encoding one aborts the process.
+void UnknownPayload::encode(bufferlist &bl) const {
+  ceph_abort();
+}
+// Decoding and dumping an unknown payload are deliberate no-ops.
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+bool NotifyMessage::check_for_refresh() const {
+  return boost::apply_visitor(CheckForRefreshVisitor(), payload);
+}
+// Encodes with struct version 6, compat version 1; the payload visitor writes the payload contents.
+void NotifyMessage::encode(bufferlist& bl) const {
+  ENCODE_START(6, 1, bl);
+  boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload);
+  ENCODE_FINISH(bl);
+}
+// Reads the op code, default-constructs the matching payload alternative, then lets the decode visitor fill it in.
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+  DECODE_START(1, iter);
+
+  uint32_t notify_op;
+  decode(notify_op, iter);
+
+  // select the correct payload variant based upon the encoded op
+  switch (notify_op) {
+  case NOTIFY_OP_ACQUIRED_LOCK:
+    payload = AcquiredLockPayload();
+    break;
+  case NOTIFY_OP_RELEASED_LOCK:
+    payload = ReleasedLockPayload();
+    break;
+  case NOTIFY_OP_REQUEST_LOCK:
+    payload = RequestLockPayload();
+    break;
+  case NOTIFY_OP_HEADER_UPDATE:
+    payload = HeaderUpdatePayload();
+    break;
+  case NOTIFY_OP_ASYNC_PROGRESS:
+    payload = AsyncProgressPayload();
+    break;
+  case NOTIFY_OP_ASYNC_COMPLETE:
+    payload = AsyncCompletePayload();
+    break;
+  case NOTIFY_OP_FLATTEN:
+    payload = FlattenPayload();
+    break;
+  case NOTIFY_OP_RESIZE:
+    payload = ResizePayload();
+    break;
+  case NOTIFY_OP_SNAP_CREATE:
+    payload = SnapCreatePayload();
+    break;
+  case NOTIFY_OP_SNAP_REMOVE:
+    payload = SnapRemovePayload();
+    break;
+  case NOTIFY_OP_SNAP_RENAME:
+    payload = SnapRenamePayload();
+    break;
+  case NOTIFY_OP_SNAP_PROTECT:
+    payload = SnapProtectPayload();
+    break;
+  case NOTIFY_OP_SNAP_UNPROTECT:
+    payload = SnapUnprotectPayload();
+    break;
+  case NOTIFY_OP_REBUILD_OBJECT_MAP:
+    payload = RebuildObjectMapPayload();
+
break;
+  case NOTIFY_OP_RENAME:
+    payload = RenamePayload();
+    break;
+  case NOTIFY_OP_UPDATE_FEATURES:
+    payload = UpdateFeaturesPayload();
+    break;
+  case NOTIFY_OP_MIGRATE:
+    payload = MigratePayload();
+    break;
+  case NOTIFY_OP_SPARSIFY:
+    payload = SparsifyPayload();
+    break;
+  default:
+    payload = UnknownPayload();  // unrecognized op codes are accepted and decoded as a no-op payload
+    break;
+  }
+
+  apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload);
+  DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+  apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+NotifyOp NotifyMessage::get_notify_op() const {
+  return apply_visitor(GetNotifyOpVisitor(), payload);
+}
+// Appends one heap-allocated sample message per payload type to o. NOTE(review): no SnapRenamePayload sample is generated -- confirm whether that omission is intended.
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+  o.push_back(new NotifyMessage(AcquiredLockPayload(ClientId(1, 2))));
+  o.push_back(new NotifyMessage(ReleasedLockPayload(ClientId(1, 2))));
+  o.push_back(new NotifyMessage(RequestLockPayload(ClientId(1, 2), true)));
+  o.push_back(new NotifyMessage(HeaderUpdatePayload()));
+  o.push_back(new NotifyMessage(AsyncProgressPayload(AsyncRequestId(ClientId(0, 1), 2), 3, 4)));
+  o.push_back(new NotifyMessage(AsyncCompletePayload(AsyncRequestId(ClientId(0, 1), 2), 3)));
+  o.push_back(new NotifyMessage(FlattenPayload(AsyncRequestId(ClientId(0, 1), 2))));
+  o.push_back(new NotifyMessage(ResizePayload(123, true, AsyncRequestId(ClientId(0, 1), 2))));
+  o.push_back(new NotifyMessage(SnapCreatePayload(cls::rbd::UserSnapshotNamespace(),
+                                                  "foo")));
+  o.push_back(new NotifyMessage(SnapRemovePayload(cls::rbd::UserSnapshotNamespace(), "foo")));
+  o.push_back(new NotifyMessage(SnapProtectPayload(cls::rbd::UserSnapshotNamespace(), "foo")));
+  o.push_back(new NotifyMessage(SnapUnprotectPayload(cls::rbd::UserSnapshotNamespace(), "foo")));
+  o.push_back(new NotifyMessage(RebuildObjectMapPayload(AsyncRequestId(ClientId(0, 1), 2))));
+  o.push_back(new NotifyMessage(RenamePayload("foo")));
+  o.push_back(new NotifyMessage(UpdateFeaturesPayload(1, true)));
+  o.push_back(new
NotifyMessage(MigratePayload(AsyncRequestId(ClientId(0, 1), 2))));
+  o.push_back(new NotifyMessage(SparsifyPayload(AsyncRequestId(ClientId(0, 1), 2), 1)));
+}
+// ResponseMessage is encoded at struct version 1, compat 1, and carries only the integer result.
+void ResponseMessage::encode(bufferlist& bl) const {
+  ENCODE_START(1, 1, bl);
+  encode(result, bl);
+  ENCODE_FINISH(bl);
+}
+
+void ResponseMessage::decode(bufferlist::const_iterator& iter) {
+  DECODE_START(1, iter);
+  decode(result, iter);
+  DECODE_FINISH(iter);
+}
+
+void ResponseMessage::dump(Formatter *f) const {
+  f->dump_int("result", result);
+}
+
+void ResponseMessage::generate_test_instances(std::list<ResponseMessage *> &o) {
+  o.push_back(new ResponseMessage(1));
+}
+// Streams a human-readable name for op; values outside the enum print as "Unknown (<number>)".
+std::ostream &operator<<(std::ostream &out,
+                         const librbd::watch_notify::NotifyOp &op) {
+  using namespace librbd::watch_notify;
+
+  switch (op) {
+  case NOTIFY_OP_ACQUIRED_LOCK:
+    out << "AcquiredLock";
+    break;
+  case NOTIFY_OP_RELEASED_LOCK:
+    out << "ReleasedLock";
+    break;
+  case NOTIFY_OP_REQUEST_LOCK:
+    out << "RequestLock";
+    break;
+  case NOTIFY_OP_HEADER_UPDATE:
+    out << "HeaderUpdate";
+    break;
+  case NOTIFY_OP_ASYNC_PROGRESS:
+    out << "AsyncProgress";
+    break;
+  case NOTIFY_OP_ASYNC_COMPLETE:
+    out << "AsyncComplete";
+    break;
+  case NOTIFY_OP_FLATTEN:
+    out << "Flatten";
+    break;
+  case NOTIFY_OP_RESIZE:
+    out << "Resize";
+    break;
+  case NOTIFY_OP_SNAP_CREATE:
+    out << "SnapCreate";
+    break;
+  case NOTIFY_OP_SNAP_REMOVE:
+    out << "SnapRemove";
+    break;
+  case NOTIFY_OP_SNAP_RENAME:
+    out << "SnapRename";
+    break;
+  case NOTIFY_OP_SNAP_PROTECT:
+    out << "SnapProtect";
+    break;
+  case NOTIFY_OP_SNAP_UNPROTECT:
+    out << "SnapUnprotect";
+    break;
+  case NOTIFY_OP_REBUILD_OBJECT_MAP:
+    out << "RebuildObjectMap";
+    break;
+  case NOTIFY_OP_RENAME:
+    out << "Rename";
+    break;
+  case NOTIFY_OP_UPDATE_FEATURES:
+    out << "UpdateFeatures";
+    break;
+  case NOTIFY_OP_MIGRATE:
+    out << "Migrate";
+    break;
+  case NOTIFY_OP_SPARSIFY:
+    out << "Sparsify";
+    break;
+  default:
+    out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+
break;
+  }
+  return out;
+}
+// Streams request as "[gid,handle,request_id]".
+std::ostream &operator<<(std::ostream &out,
+                         const librbd::watch_notify::AsyncRequestId &request) {
+  out << "[" << request.client_id.gid << "," << request.client_id.handle << ","
+      << request.request_id << "]";
+  return out;
+}
+} // namespace watch_notify
+} // namespace librbd
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
new file mode 100644
index 00000000..02d2b60b
--- /dev/null
+++ b/src/librbd/WatchNotifyTypes.h
@@ -0,0 +1,400 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef LIBRBD_WATCH_NOTIFY_TYPES_H
+#define LIBRBD_WATCH_NOTIFY_TYPES_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "librbd/watcher/Types.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace ceph {
+class Formatter;
+}
+
+namespace librbd {
+namespace watch_notify {
+
+using librbd::watcher::ClientId;
+
+WRITE_CLASS_ENCODER(ClientId);
+// Identifies an async request by the issuing client plus a request number; ordered by client_id first, then request_id.
+struct AsyncRequestId {
+  ClientId client_id;
+  uint64_t request_id;
+
+  AsyncRequestId() : request_id() {}
+  AsyncRequestId(const ClientId &client_id_, uint64_t request_id_)
+    : client_id(client_id_), request_id(request_id_) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  inline bool operator<(const AsyncRequestId &rhs) const {
+    if (client_id != rhs.client_id) {
+      return client_id < rhs.client_id;
+    } else {
+      return request_id < rhs.request_id;
+    }
+  }
+  inline bool operator!=(const AsyncRequestId &rhs) const {
+    return (client_id != rhs.client_id || request_id != rhs.request_id);
+  }
+};
+// Op codes carried in notify messages (NotifyMessage::decode reads them as uint32_t), so existing numeric values must stay fixed.
+enum NotifyOp {
+  NOTIFY_OP_ACQUIRED_LOCK = 0,
+  NOTIFY_OP_RELEASED_LOCK = 1,
+  NOTIFY_OP_REQUEST_LOCK = 2,
+  NOTIFY_OP_HEADER_UPDATE = 3,
+  NOTIFY_OP_ASYNC_PROGRESS = 4,
+  NOTIFY_OP_ASYNC_COMPLETE = 5,
+  NOTIFY_OP_FLATTEN = 6,
+  NOTIFY_OP_RESIZE = 7,
+  NOTIFY_OP_SNAP_CREATE = 8,
+  NOTIFY_OP_SNAP_REMOVE = 9,
+  NOTIFY_OP_REBUILD_OBJECT_MAP = 10,
+  NOTIFY_OP_SNAP_RENAME = 11,
+  NOTIFY_OP_SNAP_PROTECT = 12,
+  NOTIFY_OP_SNAP_UNPROTECT = 13,
+  NOTIFY_OP_RENAME = 14,
+  NOTIFY_OP_UPDATE_FEATURES = 15,
+  NOTIFY_OP_MIGRATE = 16,
+  NOTIFY_OP_SPARSIFY = 17,
+};
+// Each payload type exposes its op code (NOTIFY_OP) and a CHECK_FOR_REFRESH flag as static constants; the visitors in WatchNotifyTypes.cc read them.
+struct AcquiredLockPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_ACQUIRED_LOCK;
+  static const bool CHECK_FOR_REFRESH = false;
+
+  ClientId client_id;
+
+  AcquiredLockPayload() {}
+  AcquiredLockPayload(const ClientId &client_id_) : client_id(client_id_) {}
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+
+struct ReleasedLockPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_RELEASED_LOCK;
+  static const bool CHECK_FOR_REFRESH = false;
+
+  ClientId client_id;
+
+  ReleasedLockPayload() {}
+  ReleasedLockPayload(const ClientId &client_id_) : client_id(client_id_) {}
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Lock request from client_id; force defaults to false.
+struct RequestLockPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_REQUEST_LOCK;
+  static const bool CHECK_FOR_REFRESH = false;
+
+  ClientId client_id;
+  bool force = false;
+
+  RequestLockPayload() {}
+  RequestLockPayload(const ClientId &client_id_, bool force_)
+    : client_id(client_id_), force(force_) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Field-less payload.
+struct HeaderUpdatePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_HEADER_UPDATE;
+  static const bool CHECK_FOR_REFRESH = false;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Common base for payloads tied to an AsyncRequestId.
+struct AsyncRequestPayloadBase {
+public:
+  AsyncRequestId async_request_id;
+
+  void encode(bufferlist &bl)
const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+
+protected:
+  AsyncRequestPayloadBase() {}
+  AsyncRequestPayloadBase(const AsyncRequestId &id) : async_request_id(id) {}
+};
+// Progress report for an async request: offset out of total.
+struct AsyncProgressPayload : public AsyncRequestPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_ASYNC_PROGRESS;
+  static const bool CHECK_FOR_REFRESH = false;
+
+  AsyncProgressPayload() : offset(0), total(0) {}
+  AsyncProgressPayload(const AsyncRequestId &id, uint64_t offset_, uint64_t total_)
+    : AsyncRequestPayloadBase(id), offset(offset_), total(total_) {}
+
+  uint64_t offset;
+  uint64_t total;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Completion report for an async request carrying its integer result.
+struct AsyncCompletePayload : public AsyncRequestPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_ASYNC_COMPLETE;
+  static const bool CHECK_FOR_REFRESH = false;
+
+  AsyncCompletePayload() : result(0) {}
+  AsyncCompletePayload(const AsyncRequestId &id, int r)
+    : AsyncRequestPayloadBase(id), result(r) {}
+
+  int result;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Adds no fields and declares no codecs of its own, so it uses the AsyncRequestPayloadBase ones.
+struct FlattenPayload : public AsyncRequestPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_FLATTEN;
+  static const bool CHECK_FOR_REFRESH = true;
+
+  FlattenPayload() {}
+  FlattenPayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+};
+// Resize request; a default-constructed payload has allow_shrink == true.
+struct ResizePayload : public AsyncRequestPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_RESIZE;
+  static const bool CHECK_FOR_REFRESH = true;
+
+  ResizePayload() : size(0), allow_shrink(true) {}
+  ResizePayload(uint64_t size_, bool allow_shrink_, const AsyncRequestId &id)
+    : AsyncRequestPayloadBase(id), size(size_), allow_shrink(allow_shrink_) {}
+
+  uint64_t size;
+  bool allow_shrink;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version,
bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Common base for snapshot payloads: a snapshot namespace plus a snapshot name; all derived payloads request a refresh check.
+struct SnapPayloadBase {
+public:
+  static const bool CHECK_FOR_REFRESH = true;
+
+  cls::rbd::SnapshotNamespace snap_namespace;
+  std::string snap_name;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+
+protected:
+  SnapPayloadBase() {}
+  SnapPayloadBase(const cls::rbd::SnapshotNamespace& _snap_namespace,
+                  const std::string &name)
+    : snap_namespace(_snap_namespace), snap_name(name) {}
+};
+// Declares its own codecs (see the version-5 special case in SnapCreatePayload::decode).
+struct SnapCreatePayload : public SnapPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_CREATE;
+
+  SnapCreatePayload() {}
+  SnapCreatePayload(const cls::rbd::SnapshotNamespace &_snap_namespace,
+                    const std::string &name)
+    : SnapPayloadBase(_snap_namespace, name) {}
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Rename of snapshot snap_id to dst_name (stored in snap_name); the two-argument constructor always uses UserSnapshotNamespace.
+struct SnapRenamePayload : public SnapPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_RENAME;
+
+  SnapRenamePayload() {}
+  SnapRenamePayload(const uint64_t &src_snap_id,
+                    const std::string &dst_name)
+    : SnapPayloadBase(cls::rbd::UserSnapshotNamespace(), dst_name), snap_id(src_snap_id) {}
+
+  uint64_t snap_id = 0;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Adds no fields and declares no codecs of its own, so it uses the SnapPayloadBase ones.
+struct SnapRemovePayload : public SnapPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_REMOVE;
+
+  SnapRemovePayload() {}
+  SnapRemovePayload(const cls::rbd::SnapshotNamespace& snap_namespace,
+                    const std::string &name)
+    : SnapPayloadBase(snap_namespace, name) {}
+};
+
+struct SnapProtectPayload : public SnapPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_PROTECT;
+
+  SnapProtectPayload() {}
+  SnapProtectPayload(const cls::rbd::SnapshotNamespace& snap_namespace,
+                     const std::string &name)
+    :
SnapPayloadBase(snap_namespace, name) {}
+};
+// Adds no fields and declares no codecs of its own, so it uses the SnapPayloadBase ones.
+struct SnapUnprotectPayload : public SnapPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_UNPROTECT;
+
+  SnapUnprotectPayload() {}
+  SnapUnprotectPayload(const cls::rbd::SnapshotNamespace& snap_namespace,
+                       const std::string &name)
+    : SnapPayloadBase(snap_namespace, name) {}
+};
+// Adds no fields and declares no codecs of its own, so it uses the AsyncRequestPayloadBase ones.
+struct RebuildObjectMapPayload : public AsyncRequestPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_REBUILD_OBJECT_MAP;
+  static const bool CHECK_FOR_REFRESH = true;
+
+  RebuildObjectMapPayload() {}
+  RebuildObjectMapPayload(const AsyncRequestId &id)
+    : AsyncRequestPayloadBase(id) {}
+};
+// Image rename notification carrying image_name; the name is taken by const reference (as the other payload constructors do) so it is copied only once, into the member.
+struct RenamePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_RENAME;
+  static const bool CHECK_FOR_REFRESH = true;
+
+  RenamePayload() {}
+  RenamePayload(const std::string &_image_name) : image_name(_image_name) {}
+
+  std::string image_name;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Feature-update notification: a features value plus an enabled flag.
+struct UpdateFeaturesPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_UPDATE_FEATURES;
+  static const bool CHECK_FOR_REFRESH = true;
+
+  UpdateFeaturesPayload() : features(0), enabled(false) {}
+  UpdateFeaturesPayload(uint64_t features_, bool enabled_)
+    : features(features_), enabled(enabled_) {}
+
+  uint64_t features;
+  bool enabled;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Adds no fields and declares no codecs of its own, so it uses the AsyncRequestPayloadBase ones.
+struct MigratePayload : public AsyncRequestPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_MIGRATE;
+  static const bool CHECK_FOR_REFRESH = true;
+
+  MigratePayload() {}
+  MigratePayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+};
+
+struct SparsifyPayload : public AsyncRequestPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SPARSIFY;
+  static const bool CHECK_FOR_REFRESH = true;
+
+  SparsifyPayload() {}
+
SparsifyPayload(const AsyncRequestId &id, size_t sparse_size)
+    : AsyncRequestPayloadBase(id), sparse_size(sparse_size) {}
+
+  size_t sparse_size = 0;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Stand-in for op codes this build does not recognize; its NOTIFY_OP is -1 cast to NotifyOp.
+struct UnknownPayload {
+  static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+  static const bool CHECK_FOR_REFRESH = false;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+// Variant over every payload type that NotifyMessage::decode can select.
+typedef boost::variant<AcquiredLockPayload,
+                       ReleasedLockPayload,
+                       RequestLockPayload,
+                       HeaderUpdatePayload,
+                       AsyncProgressPayload,
+                       AsyncCompletePayload,
+                       FlattenPayload,
+                       ResizePayload,
+                       SnapCreatePayload,
+                       SnapRemovePayload,
+                       SnapRenamePayload,
+                       SnapProtectPayload,
+                       SnapUnprotectPayload,
+                       RebuildObjectMapPayload,
+                       RenamePayload,
+                       UpdateFeaturesPayload,
+                       MigratePayload,
+                       SparsifyPayload,
+                       UnknownPayload> Payload;
+// Envelope around one Payload; a default-constructed message holds an UnknownPayload.
+struct NotifyMessage {
+  NotifyMessage() : payload(UnknownPayload()) {}
+  NotifyMessage(const Payload &payload_) : payload(payload_) {}
+
+  Payload payload;
+
+  bool check_for_refresh() const;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+  NotifyOp get_notify_op() const;
+
+  static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+// Reply to a notification carrying a single integer result (default 0).
+struct ResponseMessage {
+  ResponseMessage() : result(0) {}
+  ResponseMessage(int result_) : result(result_) {}
+
+  int result;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<ResponseMessage *> &o);
+};
+
+std::ostream &operator<<(std::ostream &out,
+                         const NotifyOp &op);
+std::ostream &operator<<(std::ostream &out,
+                         const AsyncRequestId &request);
+
+WRITE_CLASS_ENCODER(AsyncRequestId);
+WRITE_CLASS_ENCODER(NotifyMessage);
+WRITE_CLASS_ENCODER(ResponseMessage);
+
+} // namespace watch_notify
+} // namespace librbd
+
+
+#endif // LIBRBD_WATCH_NOTIFY_TYPES_H
diff --git a/src/librbd/Watcher.cc b/src/librbd/Watcher.cc
new file mode 100644
index 00000000..9f866f25
--- /dev/null
+++ b/src/librbd/Watcher.cc
@@ -0,0 +1,367 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/Watcher.h"
+#include "librbd/watcher/RewatchRequest.h"
+#include "librbd/Utils.h"
+#include "librbd/TaskFinisher.h"
+#include "include/encoding.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include <boost/bind.hpp>
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+
+using namespace watcher;
+
+using util::create_context_callback;
+using util::create_rados_callback;
+using std::string;
+
+namespace {
+// Two-phase completion: the first complete() records the first error and starts aio_watch_flush; the second deletes this context and then completes on_finish with the recorded result.
+struct C_UnwatchAndFlush : public Context {
+  librados::Rados rados;
+  Context *on_finish;
+  bool flushing = false;
+  int ret_val = 0;
+
+  C_UnwatchAndFlush(librados::IoCtx &io_ctx, Context *on_finish)
+    : rados(io_ctx), on_finish(on_finish) {
+  }
+
+  void complete(int r) override {
+    if (ret_val == 0 && r < 0) {
+      ret_val = r;
+    }
+
+    if (!flushing) {
+      flushing = true;
+
+      librados::AioCompletion *aio_comp = create_rados_callback(this);
+      r = rados.aio_watch_flush(aio_comp);
+      ceph_assert(r == 0);
+      aio_comp->release();
+      return;
+    }
+
+    // ensure our reference to the RadosClient is released prior
+    // to completing the callback to avoid racing an explicit
+    // librados shutdown
+    Context *ctx = on_finish;
+    r = ret_val;
+    delete this;
+
+    ctx->complete(r);
+  }
+
+  void finish(int r) override {
+  }
+};
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Watcher::C_NotifyAck " << this << " " \
+                           << __func__ << ": "
+
+Watcher::C_NotifyAck::C_NotifyAck(Watcher *watcher, uint64_t notify_id,
+
uint64_t handle)
+  : watcher(watcher), cct(watcher->m_cct), notify_id(notify_id),
+    handle(handle) {
+  ldout(cct, 10) << "id=" << notify_id << ", " << "handle=" << handle << dendl;
+}
+// Asserts r == 0, then acknowledges the notification on the watcher with the `out` payload.
+void Watcher::C_NotifyAck::finish(int r) {
+  ldout(cct, 10) << "r=" << r << dendl;
+  ceph_assert(r == 0);
+  watcher->acknowledge_notify(notify_id, handle, out);
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Watcher: " << this << " " << __func__ \
+                           << ": "
+// Starts in WATCH_STATE_IDLE with a zero watch handle.
+Watcher::Watcher(librados::IoCtx& ioctx, ContextWQ *work_queue,
+                 const string& oid)
+  : m_ioctx(ioctx), m_work_queue(work_queue), m_oid(oid),
+    m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
+    m_watch_lock(util::unique_lock_name("librbd::Watcher::m_watch_lock", this)),
+    m_watch_handle(0), m_notifier(work_queue, ioctx, oid),
+    m_watch_state(WATCH_STATE_IDLE), m_watch_ctx(*this) {
+}
+// Asserts that the watch has been unregistered before destruction.
+Watcher::~Watcher() {
+  RWLock::RLocker l(m_watch_lock);
+  ceph_assert(is_unregistered(m_watch_lock));
+}
+// Asserts the watch is unregistered, moves to WATCH_STATE_REGISTERING and issues aio_watch; completion goes through a C_RegisterWatch wrapping on_finish.
+void Watcher::register_watch(Context *on_finish) {
+  ldout(m_cct, 10) << dendl;
+
+  RWLock::WLocker watch_locker(m_watch_lock);
+  ceph_assert(is_unregistered(m_watch_lock));
+  m_watch_state = WATCH_STATE_REGISTERING;
+  m_watch_blacklisted = false;
+
+  librados::AioCompletion *aio_comp = create_rados_callback(
+    new C_RegisterWatch(this, on_finish));
+  int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_watch_handle, &m_watch_ctx);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+// Under the write lock: returns to IDLE, clears the handle on failure, and either takes a parked unregister request or (on success with m_watch_error set) switches to REWATCHING.
+void Watcher::handle_register_watch(int r, Context *on_finish) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  bool watch_error = false;
+  Context *unregister_watch_ctx = nullptr;
+  {
+    RWLock::WLocker watch_locker(m_watch_lock);
+    ceph_assert(m_watch_state == WATCH_STATE_REGISTERING);
+
+    m_watch_state = WATCH_STATE_IDLE;
+    if (r < 0) {
+      lderr(m_cct) << "failed to register watch: " << cpp_strerror(r)
+                   << dendl;
+      m_watch_handle = 0;
+    }
+
+    if (m_unregister_watch_ctx != nullptr) {
+      std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == 0 && m_watch_error) { + lderr(m_cct) << "re-registering watch after error" << dendl; + m_watch_state = WATCH_STATE_REWATCHING; + watch_error = true; + } else { + m_watch_blacklisted = (r == -EBLACKLISTED); + } + } + + on_finish->complete(r); + + if (unregister_watch_ctx != nullptr) { + unregister_watch_ctx->complete(0); + } else if (watch_error) { + rewatch(); + } +} + +void Watcher::unregister_watch(Context *on_finish) { + ldout(m_cct, 10) << dendl; + + { + RWLock::WLocker watch_locker(m_watch_lock); + if (m_watch_state != WATCH_STATE_IDLE) { + ldout(m_cct, 10) << "delaying unregister until register completed" + << dendl; + + ceph_assert(m_unregister_watch_ctx == nullptr); + m_unregister_watch_ctx = new FunctionContext([this, on_finish](int r) { + unregister_watch(on_finish); + }); + return; + } else if (is_registered(m_watch_lock)) { + librados::AioCompletion *aio_comp = create_rados_callback( + new C_UnwatchAndFlush(m_ioctx, on_finish)); + int r = m_ioctx.aio_unwatch(m_watch_handle, aio_comp); + ceph_assert(r == 0); + aio_comp->release(); + + m_watch_handle = 0; + m_watch_blacklisted = false; + return; + } + } + + on_finish->complete(0); +} + +bool Watcher::notifications_blocked() const { + RWLock::RLocker locker(m_watch_lock); + + bool blocked = (m_blocked_count > 0); + ldout(m_cct, 5) << "blocked=" << blocked << dendl; + return blocked; +} + +void Watcher::block_notifies(Context *on_finish) { + { + RWLock::WLocker locker(m_watch_lock); + ++m_blocked_count; + ldout(m_cct, 5) << "blocked_count=" << m_blocked_count << dendl; + } + m_async_op_tracker.wait_for_ops(on_finish); +} + +void Watcher::unblock_notifies() { + RWLock::WLocker locker(m_watch_lock); + ceph_assert(m_blocked_count > 0); + --m_blocked_count; + ldout(m_cct, 5) << "blocked_count=" << m_blocked_count << dendl; +} + +void Watcher::flush(Context *on_finish) { + m_notifier.flush(on_finish); +} + +std::string Watcher::get_oid() const { + RWLock::RLocker locker(m_watch_lock); + return 
m_oid; +} + +void Watcher::set_oid(const string& oid) { + RWLock::WLocker watch_locker(m_watch_lock); + ceph_assert(is_unregistered(m_watch_lock)); + + m_oid = oid; +} + +void Watcher::handle_error(uint64_t handle, int err) { + lderr(m_cct) << "handle=" << handle << ": " << cpp_strerror(err) << dendl; + + RWLock::WLocker watch_locker(m_watch_lock); + m_watch_error = true; + + if (is_registered(m_watch_lock)) { + m_watch_state = WATCH_STATE_REWATCHING; + if (err == -EBLACKLISTED) { + m_watch_blacklisted = true; + } + + FunctionContext *ctx = new FunctionContext( + boost::bind(&Watcher::rewatch, this)); + m_work_queue->queue(ctx); + } +} + +void Watcher::acknowledge_notify(uint64_t notify_id, uint64_t handle, + bufferlist &out) { + m_ioctx.notify_ack(m_oid, notify_id, handle, out); +} + +void Watcher::rewatch() { + ldout(m_cct, 10) << dendl; + + Context *unregister_watch_ctx = nullptr; + { + RWLock::WLocker watch_locker(m_watch_lock); + ceph_assert(m_watch_state == WATCH_STATE_REWATCHING); + + if (m_unregister_watch_ctx != nullptr) { + m_watch_state = WATCH_STATE_IDLE; + std::swap(unregister_watch_ctx, m_unregister_watch_ctx); + } else { + m_watch_error = false; + auto ctx = create_context_callback< + Watcher, &Watcher::handle_rewatch>(this); + auto req = RewatchRequest::create(m_ioctx, m_oid, m_watch_lock, + &m_watch_ctx, &m_watch_handle, ctx); + req->send(); + return; + } + } + + unregister_watch_ctx->complete(0); +} + +void Watcher::handle_rewatch(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + bool watch_error = false; + Context *unregister_watch_ctx = nullptr; + { + RWLock::WLocker watch_locker(m_watch_lock); + ceph_assert(m_watch_state == WATCH_STATE_REWATCHING); + + m_watch_blacklisted = false; + if (m_unregister_watch_ctx != nullptr) { + ldout(m_cct, 10) << "image is closing, skip rewatch" << dendl; + m_watch_state = WATCH_STATE_IDLE; + std::swap(unregister_watch_ctx, m_unregister_watch_ctx); + } else if (r == -EBLACKLISTED) { + lderr(m_cct) << "client 
blacklisted" << dendl; + m_watch_blacklisted = true; + } else if (r == -ENOENT) { + ldout(m_cct, 5) << "object does not exist" << dendl; + } else if (r < 0) { + lderr(m_cct) << "failed to rewatch: " << cpp_strerror(r) << dendl; + watch_error = true; + } else if (m_watch_error) { + lderr(m_cct) << "re-registering watch after error" << dendl; + watch_error = true; + } + } + + if (unregister_watch_ctx != nullptr) { + unregister_watch_ctx->complete(0); + return; + } else if (watch_error) { + rewatch(); + return; + } + + auto ctx = create_context_callback< + Watcher, &Watcher::handle_rewatch_callback>(this); + m_work_queue->queue(ctx, r); +} + +void Watcher::handle_rewatch_callback(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + handle_rewatch_complete(r); + + bool watch_error = false; + Context *unregister_watch_ctx = nullptr; + { + RWLock::WLocker watch_locker(m_watch_lock); + ceph_assert(m_watch_state == WATCH_STATE_REWATCHING); + + if (m_unregister_watch_ctx != nullptr) { + m_watch_state = WATCH_STATE_IDLE; + std::swap(unregister_watch_ctx, m_unregister_watch_ctx); + } else if (r == -EBLACKLISTED || r == -ENOENT) { + m_watch_state = WATCH_STATE_IDLE; + } else if (r < 0 || m_watch_error) { + watch_error = true; + } else { + m_watch_state = WATCH_STATE_IDLE; + } + } + + if (unregister_watch_ctx != nullptr) { + unregister_watch_ctx->complete(0); + } else if (watch_error) { + rewatch(); + } +} + +void Watcher::send_notify(bufferlist& payload, + watcher::NotifyResponse *response, + Context *on_finish) { + m_notifier.notify(payload, response, on_finish); +} + +void Watcher::WatchCtx::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist& bl) { + // if notifications are blocked, finish the notification w/o + // bubbling the notification up to the derived class + watcher.m_async_op_tracker.start_op(); + if (watcher.notifications_blocked()) { + bufferlist bl; + watcher.acknowledge_notify(notify_id, handle, bl); + } else { + 
watcher.handle_notify(notify_id, handle, notifier_id, bl); + } + watcher.m_async_op_tracker.finish_op(); +} + +void Watcher::WatchCtx::handle_error(uint64_t handle, int err) { + watcher.handle_error(handle, err); +} + +} // namespace librbd diff --git a/src/librbd/Watcher.h b/src/librbd/Watcher.h new file mode 100644 index 00000000..69e87ad3 --- /dev/null +++ b/src/librbd/Watcher.h @@ -0,0 +1,184 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_WATCHER_H +#define CEPH_LIBRBD_WATCHER_H + +#include "common/AsyncOpTracker.h" +#include "common/Mutex.h" +#include "common/RWLock.h" +#include "include/rados/librados.hpp" +#include "librbd/watcher/Notifier.h" +#include "librbd/watcher/Types.h" +#include <string> +#include <utility> + +class ContextWQ; + +namespace librbd { + +namespace watcher { struct NotifyResponse; } + +class Watcher { +public: + struct C_NotifyAck : public Context { + Watcher *watcher; + CephContext *cct; + uint64_t notify_id; + uint64_t handle; + bufferlist out; + + C_NotifyAck(Watcher *watcher, uint64_t notify_id, uint64_t handle); + void finish(int r) override; + }; + + Watcher(librados::IoCtx& ioctx, ContextWQ *work_queue, + const std::string& oid); + virtual ~Watcher(); + + void register_watch(Context *on_finish); + virtual void unregister_watch(Context *on_finish); + void flush(Context *on_finish); + + bool notifications_blocked() const; + virtual void block_notifies(Context *on_finish); + void unblock_notifies(); + + std::string get_oid() const; + void set_oid(const string& oid); + + uint64_t get_watch_handle() const { + RWLock::RLocker watch_locker(m_watch_lock); + return m_watch_handle; + } + + bool is_registered() const { + RWLock::RLocker locker(m_watch_lock); + return is_registered(m_watch_lock); + } + bool is_unregistered() const { + RWLock::RLocker locker(m_watch_lock); + return is_unregistered(m_watch_lock); + } + bool is_blacklisted() const { + RWLock::RLocker 
/**
 * librados watch callback object bound to a single Watcher.
 *
 * An instance is passed to aio_watch() as the watch context; its two
 * callbacks relay notifications and errors to the owning Watcher.
 */
struct WatchCtx : public librados::WatchCtx2 {
  // the Watcher that receives the relayed callbacks
  Watcher &watcher;

  WatchCtx(Watcher &parent) : watcher(parent) {}

  // relays a notification to the watcher (or acks it directly while
  // notifications are blocked)
  void handle_notify(uint64_t notify_id,
                     uint64_t handle,
                     uint64_t notifier_id,
                     bufferlist& bl) override;
  // relays a watch error to Watcher::handle_error()
  void handle_error(uint64_t handle, int err) override;
};
watcher(watcher), on_finish(on_finish) { + } + void finish(int r) override { + watcher->handle_register_watch(r, on_finish); + } + }; + + WatchCtx m_watch_ctx; + Context *m_unregister_watch_ctx = nullptr; + + bool m_watch_error = false; + + uint32_t m_blocked_count = 0; + + void handle_register_watch(int r, Context *on_finish); + + void rewatch(); + void handle_rewatch(int r); + void handle_rewatch_callback(int r); + +}; + +} // namespace librbd + +#endif // CEPH_LIBRBD_WATCHER_H diff --git a/src/librbd/api/Config.cc b/src/librbd/api/Config.cc new file mode 100644 index 00000000..08be9d2c --- /dev/null +++ b/src/librbd/api/Config.cc @@ -0,0 +1,240 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Config.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/api/PoolMetadata.h" +#include <boost/algorithm/string/predicate.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Config: " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +const uint32_t MAX_KEYS = 64; + +typedef std::map<std::string, std::pair<std::string, config_source_t>> Parent; + +static std::set<std::string> EXCLUDE_OPTIONS { + "rbd_auto_exclusive_lock_until_manual_request", + "rbd_default_format", + "rbd_default_map_options", + "rbd_default_pool", + "rbd_discard_on_zeroed_write_same", + "rbd_op_thread_timeout", + "rbd_op_threads", + "rbd_tracing", + "rbd_validate_names", + "rbd_validate_pool", + "rbd_mirror_pool_replayers_refresh_interval", + "rbd_config_pool_override_update_timestamp" + }; +static std::set<std::string> EXCLUDE_IMAGE_OPTIONS { + "rbd_default_clone_format", + "rbd_default_data_pool", + "rbd_default_features", + "rbd_default_format", + "rbd_default_order", + "rbd_default_stripe_count", + "rbd_default_stripe_unit", + 
"rbd_journal_order", + "rbd_journal_pool", + "rbd_journal_splay_width" + }; + +struct Options : Parent { + librados::IoCtx m_io_ctx; + + Options(librados::IoCtx& io_ctx, bool image_apply_only_options) { + m_io_ctx.dup(io_ctx); + m_io_ctx.set_namespace(""); + + CephContext *cct = reinterpret_cast<CephContext *>(m_io_ctx.cct()); + + const std::string rbd_key_prefix("rbd_"); + const std::string rbd_mirror_key_prefix("rbd_mirror_"); + auto& schema = cct->_conf.get_schema(); + for (auto& pair : schema) { + if (!boost::starts_with(pair.first, rbd_key_prefix)) { + continue; + } else if (EXCLUDE_OPTIONS.count(pair.first) != 0) { + continue; + } else if (image_apply_only_options && + EXCLUDE_IMAGE_OPTIONS.count(pair.first) != 0) { + continue; + } else if (image_apply_only_options && + boost::starts_with(pair.first, rbd_mirror_key_prefix)) { + continue; + } + + insert({pair.first, {}}); + } + } + + int init() { + CephContext *cct = (CephContext *)m_io_ctx.cct(); + + for (auto &it : *this) { + int r = cct->_conf.get_val(it.first.c_str(), &it.second.first); + ceph_assert(r == 0); + it.second.second = RBD_CONFIG_SOURCE_CONFIG; + } + + std::string last_key = ImageCtx::METADATA_CONF_PREFIX; + bool more_results = true; + + while (more_results) { + std::map<std::string, bufferlist> pairs; + + int r = librbd::api::PoolMetadata<>::list(m_io_ctx, last_key, MAX_KEYS, + &pairs); + if (r < 0) { + return r; + } + + if (pairs.empty()) { + break; + } + + more_results = (pairs.size() == MAX_KEYS); + last_key = pairs.rbegin()->first; + + for (auto kv : pairs) { + std::string key; + if (!util::is_metadata_config_override(kv.first, &key)) { + more_results = false; + break; + } + auto it = find(key); + if (it != end()) { + it->second = {{kv.second.c_str(), kv.second.length()}, + RBD_CONFIG_SOURCE_POOL}; + } + } + } + return 0; + } +}; + +} // anonymous namespace + +template <typename I> +bool Config<I>::is_option_name(librados::IoCtx& io_ctx, + const std::string &name) { + Options opts(io_ctx, 
false); + + return (opts.find(name) != opts.end()); +} + +template <typename I> +int Config<I>::list(librados::IoCtx& io_ctx, + std::vector<config_option_t> *options) { + Options opts(io_ctx, false); + + int r = opts.init(); + if (r < 0) { + return r; + } + + for (auto &it : opts) { + options->push_back({it.first, it.second.first, it.second.second}); + } + + return 0; +} + +template <typename I> +bool Config<I>::is_option_name(I *image_ctx, const std::string &name) { + Options opts(image_ctx->md_ctx, true); + + return (opts.find(name) != opts.end()); +} + +template <typename I> +int Config<I>::list(I *image_ctx, std::vector<config_option_t> *options) { + CephContext *cct = image_ctx->cct; + Options opts(image_ctx->md_ctx, true); + + int r = opts.init(); + if (r < 0) { + return r; + } + + std::string last_key = ImageCtx::METADATA_CONF_PREFIX; + bool more_results = true; + + while (more_results) { + std::map<std::string, bufferlist> pairs; + + r = cls_client::metadata_list(&image_ctx->md_ctx, image_ctx->header_oid, + last_key, MAX_KEYS, &pairs); + if (r < 0) { + lderr(cct) << "failed reading image metadata: " << cpp_strerror(r) + << dendl; + return r; + } + + if (pairs.empty()) { + break; + } + + more_results = (pairs.size() == MAX_KEYS); + last_key = pairs.rbegin()->first; + + for (auto kv : pairs) { + std::string key; + if (!util::is_metadata_config_override(kv.first, &key)) { + more_results = false; + break; + } + auto it = opts.find(key); + if (it != opts.end()) { + it->second = {{kv.second.c_str(), kv.second.length()}, + RBD_CONFIG_SOURCE_IMAGE}; + } + } + } + + for (auto &it : opts) { + options->push_back({it.first, it.second.first, it.second.second}); + } + + return 0; +} + +template <typename I> +void Config<I>::apply_pool_overrides(librados::IoCtx& io_ctx, + ConfigProxy* config) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + Options opts(io_ctx, false); + int r = opts.init(); + if (r < 0) { + lderr(cct) << "failed to read pool 
/**
 * Static helpers for inspecting and applying overridable "rbd_" config
 * options at pool and image scope.
 */
template <typename ImageCtxT = librbd::ImageCtx>
class Config {
public:
  /// @return true if `name` is an overridable option at pool scope
  static bool is_option_name(librados::IoCtx& io_ctx, const std::string &name);
  /// Append every pool-scope option (config value overlaid with pool
  /// metadata overrides) to `options`.
  /// @return 0 on success, negative errno on failure
  static int list(librados::IoCtx& io_ctx,
                  std::vector<config_option_t> *options);

  /// @return true if `name` is an overridable option at image scope
  static bool is_option_name(ImageCtxT *image_ctx, const std::string &name);
  /// Append every image-scope option (config value overlaid with pool and
  /// then image metadata overrides) to `options`.
  /// @return 0 on success, negative errno on failure
  static int list(ImageCtxT *image_ctx, std::vector<config_option_t> *options);

  /// Set every pool-sourced override on `config`; failures are logged and
  /// otherwise ignored.
  static void apply_pool_overrides(librados::IoCtx& io_ctx,
                                   ConfigProxy* config);
};
/**
 * State shared by all per-object diff requests of one diff-iterate run.
 */
struct DiffContext {
  // user callback invoked for each reported extent, and its opaque argument
  DiffIterate<>::Callback callback;
  void *callback_arg;
  // when true, whole object extents are reported instead of exact ranges
  bool whole_object;
  // snapshot range the diff is computed over
  uint64_t from_snap_id;
  uint64_t end_snap_id;
  // extents present in the parent image; consulted when an object has no
  // data of its own and the diff starts from the beginning of time
  interval_set<uint64_t> parent_diff;
  // bounds the number of in-flight per-object requests
  OrderedThrottle throttle;

  // The throttle limit is read from the "rbd_concurrent_management_ops"
  // option of image_ctx.
  // NOTE(review): meaning of the trailing `true` passed to OrderedThrottle
  // is not visible here -- confirm against OrderedThrottle's constructor.
  template <typename I>
  DiffContext(I &image_ctx, DiffIterate<>::Callback callback,
              void *callback_arg, bool _whole_object, uint64_t _from_snap_id,
              uint64_t _end_snap_id)
    : callback(callback), callback_arg(callback_arg),
      whole_object(_whole_object), from_snap_id(_from_snap_id),
      end_snap_id(_end_snap_id),
      throttle(image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"), true) {
  }
};
util::create_rados_callback(ctx); + + librados::ObjectReadOperation op; + op.list_snaps(&m_snap_set, &m_snap_ret); + + int r = m_head_ctx.aio_operate(m_oid, rados_completion, &op, NULL); + ceph_assert(r == 0); + rados_completion->release(); + } + +protected: + typedef boost::tuple<uint64_t, size_t, bool> Diff; + typedef std::list<Diff> Diffs; + + void finish(int r) override { + CephContext *cct = m_cct; + if (r == 0 && m_snap_ret < 0) { + r = m_snap_ret; + } + + Diffs diffs; + if (r == 0) { + ldout(cct, 20) << "object " << m_oid << ": list_snaps complete" << dendl; + compute_diffs(&diffs); + } else if (r == -ENOENT) { + ldout(cct, 20) << "object " << m_oid << ": list_snaps (not found)" + << dendl; + r = 0; + compute_parent_overlap(&diffs); + } else { + ldout(cct, 20) << "object " << m_oid << ": list_snaps failed: " + << cpp_strerror(r) << dendl; + } + + if (r == 0) { + for (Diffs::const_iterator d = diffs.begin(); d != diffs.end(); ++d) { + r = m_diff_context.callback(d->get<0>(), d->get<1>(), d->get<2>(), + m_diff_context.callback_arg); + if (r < 0) { + break; + } + } + } + m_diff_context.throttle.end_op(r); + } + +private: + CephContext *m_cct; + librados::IoCtx &m_head_ctx; + DiffContext &m_diff_context; + std::string m_oid; + uint64_t m_offset; + std::vector<ObjectExtent> m_object_extents; + + librados::snap_set_t m_snap_set; + int m_snap_ret; + + void compute_diffs(Diffs *diffs) { + CephContext *cct = m_cct; + + // calc diff from from_snap_id -> to_snap_id + interval_set<uint64_t> diff; + uint64_t end_size; + bool end_exists; + librados::snap_t clone_end_snap_id; + bool whole_object; + calc_snap_set_diff(cct, m_snap_set, m_diff_context.from_snap_id, + m_diff_context.end_snap_id, &diff, &end_size, + &end_exists, &clone_end_snap_id, &whole_object); + if (whole_object) { + ldout(cct, 1) << "object " << m_oid << ": need to provide full object" + << dendl; + } + ldout(cct, 20) << " diff " << diff << " end_exists=" << end_exists + << dendl; + if (diff.empty() && 
!whole_object) { + if (m_diff_context.from_snap_id == 0 && !end_exists) { + compute_parent_overlap(diffs); + } + return; + } else if (m_diff_context.whole_object || whole_object) { + // provide the full object extents to the callback + for (vector<ObjectExtent>::iterator q = m_object_extents.begin(); + q != m_object_extents.end(); ++q) { + diffs->push_back(boost::make_tuple(m_offset + q->offset, q->length, + end_exists)); + } + return; + } + + for (vector<ObjectExtent>::iterator q = m_object_extents.begin(); + q != m_object_extents.end(); ++q) { + ldout(cct, 20) << "diff_iterate object " << m_oid << " extent " + << q->offset << "~" << q->length << " from " + << q->buffer_extents << dendl; + uint64_t opos = q->offset; + for (vector<pair<uint64_t,uint64_t> >::iterator r = + q->buffer_extents.begin(); + r != q->buffer_extents.end(); ++r) { + interval_set<uint64_t> overlap; // object extents + overlap.insert(opos, r->second); + overlap.intersection_of(diff); + ldout(cct, 20) << " opos " << opos + << " buf " << r->first << "~" << r->second + << " overlap " << overlap << dendl; + for (interval_set<uint64_t>::iterator s = overlap.begin(); + s != overlap.end(); ++s) { + uint64_t su_off = s.get_start() - opos; + uint64_t logical_off = m_offset + r->first + su_off; + ldout(cct, 20) << " overlap extent " << s.get_start() << "~" + << s.get_len() << " logical " << logical_off << "~" + << s.get_len() << dendl; + diffs->push_back(boost::make_tuple(logical_off, s.get_len(), + end_exists)); + } + opos += r->second; + } + ceph_assert(opos == q->offset + q->length); + } + } + + void compute_parent_overlap(Diffs *diffs) { + if (m_diff_context.from_snap_id == 0 && + !m_diff_context.parent_diff.empty()) { + // report parent diff instead + for (vector<ObjectExtent>::iterator q = m_object_extents.begin(); + q != m_object_extents.end(); ++q) { + for (vector<pair<uint64_t,uint64_t> >::iterator r = + q->buffer_extents.begin(); + r != q->buffer_extents.end(); ++r) { + interval_set<uint64_t> 
o; + o.insert(m_offset + r->first, r->second); + o.intersection_of(m_diff_context.parent_diff); + ldout(m_cct, 20) << " reporting parent overlap " << o << dendl; + for (interval_set<uint64_t>::iterator s = o.begin(); s != o.end(); + ++s) { + diffs->push_back(boost::make_tuple(s.get_start(), s.get_len(), + true)); + } + } + } + } + } +}; + +int simple_diff_cb(uint64_t off, size_t len, int exists, void *arg) { + // it's possible for a discard to create a hole in the parent image -- ignore + if (exists) { + interval_set<uint64_t> *diff = static_cast<interval_set<uint64_t> *>(arg); + diff->insert(off, len); + } + return 0; +} + +} // anonymous namespace + +template <typename I> +int DiffIterate<I>::diff_iterate(I *ictx, + const cls::rbd::SnapshotNamespace& from_snap_namespace, + const char *fromsnapname, + uint64_t off, uint64_t len, + bool include_parent, bool whole_object, + int (*cb)(uint64_t, size_t, int, void *), + void *arg) +{ + ldout(ictx->cct, 20) << "diff_iterate " << ictx << " off = " << off + << " len = " << len << dendl; + + if (!ictx->data_ctx.is_valid()) { + return -ENODEV; + } + + // ensure previous writes are visible to listsnaps + C_SaferCond flush_ctx; + { + RWLock::RLocker owner_locker(ictx->owner_lock); + auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, ictx, + io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec<I>::create_flush_request( + *ictx, aio_comp, io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); + delete req; + } + int r = flush_ctx.wait(); + if (r < 0) { + return r; + } + + r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + ictx->snap_lock.get_read(); + r = clip_io(ictx, off, &len); + ictx->snap_lock.put_read(); + if (r < 0) { + return r; + } + + DiffIterate command(*ictx, from_snap_namespace, fromsnapname, off, len, + include_parent, whole_object, cb, arg); + r = command.execute(); + return r; +} + +template <typename I> +int DiffIterate<I>::execute() { + CephContext* cct = m_image_ctx.cct; + + 
ceph_assert(m_image_ctx.data_ctx.is_valid()); + + librados::IoCtx head_ctx; + librados::snap_t from_snap_id = 0; + librados::snap_t end_snap_id; + uint64_t from_size = 0; + uint64_t end_size; + { + RWLock::RLocker md_locker(m_image_ctx.md_lock); + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + head_ctx.dup(m_image_ctx.data_ctx); + if (m_from_snap_name) { + from_snap_id = m_image_ctx.get_snap_id(m_from_snap_namespace, m_from_snap_name); + from_size = m_image_ctx.get_image_size(from_snap_id); + } + end_snap_id = m_image_ctx.snap_id; + end_size = m_image_ctx.get_image_size(end_snap_id); + } + + if (from_snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + if (from_snap_id == end_snap_id) { + // no diff. + return 0; + } + if (from_snap_id >= end_snap_id) { + return -EINVAL; + } + + int r; + bool fast_diff_enabled = false; + BitVector<2> object_diff_state; + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + if (m_whole_object && (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) { + r = diff_object_map(from_snap_id, end_snap_id, &object_diff_state); + if (r < 0) { + ldout(cct, 5) << "fast diff disabled" << dendl; + } else { + ldout(cct, 5) << "fast diff enabled" << dendl; + fast_diff_enabled = true; + } + } + } + + // we must list snaps via the head, not end snap + head_ctx.snap_set_read(CEPH_SNAPDIR); + + ldout(cct, 5) << "diff_iterate from " << from_snap_id << " to " + << end_snap_id << " size from " << from_size + << " to " << end_size << dendl; + + // check parent overlap only if we are comparing to the beginning of time + DiffContext diff_context(m_image_ctx, m_callback, m_callback_arg, + m_whole_object, from_snap_id, end_snap_id); + if (m_include_parent && from_snap_id == 0) { + RWLock::RLocker l(m_image_ctx.snap_lock); + RWLock::RLocker l2(m_image_ctx.parent_lock); + uint64_t overlap = 0; + m_image_ctx.get_parent_overlap(m_image_ctx.snap_id, &overlap); + r = 0; + if (m_image_ctx.parent && overlap > 0) { + ldout(cct, 10) << " first getting parent 
diff" << dendl; + DiffIterate diff_parent(*m_image_ctx.parent, {}, + nullptr, 0, overlap, + m_include_parent, m_whole_object, + &simple_diff_cb, + &diff_context.parent_diff); + r = diff_parent.execute(); + } + if (r < 0) { + return r; + } + } + + uint64_t period = m_image_ctx.get_stripe_period(); + uint64_t off = m_offset; + uint64_t left = m_length; + + while (left > 0) { + uint64_t period_off = off - (off % period); + uint64_t read_len = min(period_off + period - off, left); + + // map to extents + map<object_t,vector<ObjectExtent> > object_extents; + Striper::file_to_extents(cct, m_image_ctx.format_string, + &m_image_ctx.layout, off, read_len, 0, + object_extents, 0); + + // get snap info for each object + for (map<object_t,vector<ObjectExtent> >::iterator p = + object_extents.begin(); + p != object_extents.end(); ++p) { + ldout(cct, 20) << "object " << p->first << dendl; + + if (fast_diff_enabled) { + const uint64_t object_no = p->second.front().objectno; + if (object_diff_state[object_no] == OBJECT_DIFF_STATE_NONE && + from_snap_id == 0 && !diff_context.parent_diff.empty()) { + // no data in child object -- report parent diff instead + for (auto& oe : p->second) { + for (auto& be : oe.buffer_extents) { + interval_set<uint64_t> o; + o.insert(off + be.first, be.second); + o.intersection_of(diff_context.parent_diff); + ldout(cct, 20) << " reporting parent overlap " << o << dendl; + for (auto e = o.begin(); e != o.end(); ++e) { + r = m_callback(e.get_start(), e.get_len(), true, + m_callback_arg); + if (r < 0) { + return r; + } + } + } + } + } else if (object_diff_state[object_no] != OBJECT_DIFF_STATE_NONE) { + bool updated = (object_diff_state[object_no] == + OBJECT_DIFF_STATE_UPDATED); + for (std::vector<ObjectExtent>::iterator q = p->second.begin(); + q != p->second.end(); ++q) { + r = m_callback(off + q->offset, q->length, updated, m_callback_arg); + if (r < 0) { + return r; + } + } + } + } else { + C_DiffObject *diff_object = new C_DiffObject(m_image_ctx, 
// Computes a per-object diff state by walking the object maps of successive
// snapshots, starting at from_snap_id and stopping once the next snapshot
// would be past to_snap_id (or there is no further snapshot).
//
// @param from_snap_id      snapshot to start from; 0 requests a diff "from
//                          the start" and is replaced below by
//                          m_image_ctx.snaps.back() (or CEPH_NOSNAP when the
//                          image has no snapshots)
// @param to_snap_id        snapshot id bounding the walk
// @param object_diff_state output bit vector; cleared first, then resized to
//                          the object count of each processed object map
// @return 0 on success; the error from get_flags()/object_map_load() on
//         failure; -EINVAL if the fast-diff flag is invalid or a loaded
//         object map is smaller than the expected object count
//
// The caller must hold m_image_ctx.snap_lock (asserted).
template <typename I>
int DiffIterate<I>::diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id,
                                    BitVector<2>* object_diff_state) {
  ceph_assert(m_image_ctx.snap_lock.is_locked());
  CephContext* cct = m_image_ctx.cct;

  // remember the "diff from start" request before from_snap_id is rewritten
  bool diff_from_start = (from_snap_id == 0);
  if (from_snap_id == 0) {
    if (!m_image_ctx.snaps.empty()) {
      // presumably the oldest snapshot -- ordering of snaps is not visible
      // here; verify against ImageCtx
      from_snap_id = m_image_ctx.snaps.back();
    } else {
      from_snap_id = CEPH_NOSNAP;
    }
  }

  object_diff_state->clear();
  uint64_t current_snap_id = from_snap_id;
  uint64_t next_snap_id = to_snap_id;
  // object map of the previously processed snapshot (empty on first pass)
  BitVector<2> prev_object_map;
  bool prev_object_map_valid = false;
  while (true) {
    // size of the image at current_snap_id; HEAD size unless a snapshot
    uint64_t current_size = m_image_ctx.size;
    if (current_snap_id != CEPH_NOSNAP) {
      std::map<librados::snap_t, SnapInfo>::const_iterator snap_it =
        m_image_ctx.snap_info.find(current_snap_id);
      ceph_assert(snap_it != m_image_ctx.snap_info.end());
      current_size = snap_it->second.size;

      // the following map entry (if any) is the next snapshot to visit;
      // otherwise continue with CEPH_NOSNAP
      ++snap_it;
      if (snap_it != m_image_ctx.snap_info.end()) {
        next_snap_id = snap_it->first;
      } else {
        next_snap_id = CEPH_NOSNAP;
      }
    }

    // NOTE(review): flags are queried for from_snap_id on every iteration,
    // not for current_snap_id -- confirm this is intentional
    uint64_t flags;
    int r = m_image_ctx.get_flags(from_snap_id, &flags);
    if (r < 0) {
      lderr(cct) << "diff_object_map: failed to retrieve image flags" << dendl;
      return r;
    }
    if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
      ldout(cct, 1) << "diff_object_map: cannot perform fast diff on invalid "
                    << "object map" << dendl;
      return -EINVAL;
    }

    // load the object map stored for current_snap_id
    BitVector<2> object_map;
    std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id,
                                                 current_snap_id));
    r = cls_client::object_map_load(&m_image_ctx.md_ctx, oid, &object_map);
    if (r < 0) {
      lderr(cct) << "diff_object_map: failed to load object map " << oid
                 << dendl;
      return r;
    }
    ldout(cct, 20) << "diff_object_map: loaded object map " << oid << dendl;

    // a map shorter than the expected object count is rejected; a longer
    // one is truncated to the expected count
    uint64_t num_objs = Striper::get_num_objects(m_image_ctx.layout,
                                                 current_size);
    if (object_map.size() < num_objs) {
      ldout(cct, 1) << "diff_object_map: object map too small: "
                    << object_map.size() << " < " << num_objs << dendl;
      return -EINVAL;
    }
    object_map.resize(num_objs);
    object_diff_state->resize(object_map.size());

    // first pass: objects present in both the previous and the current map
    uint64_t overlap = std::min(object_map.size(), prev_object_map.size());
    auto it = object_map.begin();
    auto overlap_end_it = it + overlap;
    auto pre_it = prev_object_map.begin();
    auto diff_it = object_diff_state->begin();
    uint64_t i = 0;
    for (; it != overlap_end_it; ++it, ++pre_it, ++diff_it, ++i) {
      ldout(cct, 20) << __func__ << ": object state: " << i << " "
                     << static_cast<uint32_t>(*pre_it)
                     << "->" << static_cast<uint32_t>(*it) << dendl;
      if (*it == OBJECT_NONEXISTENT) {
        // object disappeared since the previous map
        if (*pre_it != OBJECT_NONEXISTENT) {
          *diff_it = OBJECT_DIFF_STATE_HOLE;
        }
      } else if (*it == OBJECT_EXISTS ||
                 (*pre_it != *it &&
                  !(*pre_it == OBJECT_EXISTS &&
                    *it == OBJECT_EXISTS_CLEAN))) {
        // any state change counts as an update, except the
        // EXISTS -> EXISTS_CLEAN transition
        *diff_it = OBJECT_DIFF_STATE_UPDATED;
      }
    }
    ldout(cct, 20) << "diff_object_map: computed overlap diffs" << dendl;
    // second pass: objects beyond the previous map's size (image grew); only
    // evaluated for a diff from the start or once a previous map was loaded
    auto end_it = object_map.end();
    if (object_map.size() > prev_object_map.size() &&
        (diff_from_start || prev_object_map_valid)) {
      for (; it != end_it; ++it,++diff_it, ++i) {
        ldout(cct, 20) << __func__ << ": object state: " << i << " "
                       << "->" << static_cast<uint32_t>(*it) << dendl;
        if (*it == OBJECT_NONEXISTENT) {
          *diff_it = OBJECT_DIFF_STATE_NONE;
        } else {
          *diff_it = OBJECT_DIFF_STATE_UPDATED;
        }
      }
    }
    ldout(cct, 20) << "diff_object_map: computed resize diffs" << dendl;

    // stop when there is no further snapshot or the next one lies past the
    // requested end
    if (current_snap_id == next_snap_id || next_snap_id > to_snap_id) {
      break;
    }
    current_snap_id = next_snap_id;
    prev_object_map = object_map;
    prev_object_map_valid = true;
  }
  return 0;
}
librbd + +extern template class librbd::api::DiffIterate<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_API_DIFF_ITERATE_H diff --git a/src/librbd/api/Group.cc b/src/librbd/api/Group.cc new file mode 100644 index 00000000..8b75bd94 --- /dev/null +++ b/src/librbd/api/Group.cc @@ -0,0 +1,1207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" + +#include "librbd/ExclusiveLock.h" +#include "librbd/api/Group.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Group: " << __func__ << ": " + +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; +// list binds to list() here, so std::list is explicitly used below + +using ceph::bufferlist; +using librados::snap_t; +using librados::IoCtx; +using librados::Rados; + + +namespace librbd { +namespace api { + +namespace { + +template <typename I> +snap_t get_group_snap_id(I* ictx, + const cls::rbd::SnapshotNamespace& in_snap_namespace) { + ceph_assert(ictx->snap_lock.is_locked()); + auto it = ictx->snap_ids.lower_bound({in_snap_namespace, ""}); + if (it != ictx->snap_ids.end() && it->first.first == in_snap_namespace) { + return it->second; + } + return CEPH_NOSNAP; +} + +string generate_uuid(librados::IoCtx& io_ctx) +{ + Rados rados(io_ctx); + uint64_t bid = rados.get_instance_id(); + + uint32_t extra = rand() % 0xFFFFFFFF; + ostringstream bid_ss; + bid_ss << std::hex << bid << std::hex << extra; + return bid_ss.str(); +} + +int group_snap_list(librados::IoCtx& group_ioctx, const char *group_name, + std::vector<cls::rbd::GroupSnapshot> *cls_snaps) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + vector<string> ind_snap_names; + + int r = 
// Derives the name used for the per-image snapshot that backs a group
// snapshot: ".group.<pool id in hex>_<group id>_<snap id>".
std::string calc_ind_image_snap_name(uint64_t pool_id,
                                     const std::string &group_id,
                                     const std::string &snap_id)
{
  std::ostringstream name;
  name << ".group.";
  // only the numeric pool id is affected by the hex base; ids are strings
  name << std::hex << pool_id;
  name << "_" << group_id;
  name << "_" << snap_id;
  return name.str();
}
lderr(cct) << "error reading image list from group: " + << cpp_strerror(-r) << dendl; + return r; + } + image_ids->insert(image_ids->end(), + image_ids_page.begin(), image_ids_page.end()); + + if (image_ids_page.size() > 0) + start_last = image_ids_page.rbegin()->spec; + + r = image_ids_page.size(); + } while (r == max_read); + + return 0; +} + +int group_image_remove(librados::IoCtx& group_ioctx, string group_id, + librados::IoCtx& image_ioctx, string image_id) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_header_oid = util::group_header_name(group_id); + + string image_header_oid = util::header_name(image_id); + + ldout(cct, 20) << "removing image " << image_id + << " image id " << image_header_oid << dendl; + + cls::rbd::GroupSpec group_spec(group_id, group_ioctx.get_id()); + + cls::rbd::GroupImageStatus incomplete_st(image_id, image_ioctx.get_id(), + cls::rbd::GROUP_IMAGE_LINK_STATE_INCOMPLETE); + + cls::rbd::GroupImageSpec spec(image_id, image_ioctx.get_id()); + + int r = cls_client::group_image_set(&group_ioctx, group_header_oid, + incomplete_st); + + if (r < 0) { + lderr(cct) << "couldn't put image into removing state: " + << cpp_strerror(-r) << dendl; + return r; + } + + r = cls_client::image_group_remove(&image_ioctx, image_header_oid, + group_spec); + if ((r < 0) && (r != -ENOENT)) { + lderr(cct) << "couldn't remove group reference from image" + << cpp_strerror(-r) << dendl; + return r; + } else if (r >= 0) { + ImageWatcher<>::notify_header_update(image_ioctx, image_header_oid); + } + + r = cls_client::group_image_remove(&group_ioctx, group_header_oid, spec); + if (r < 0) { + lderr(cct) << "couldn't remove image from group" + << cpp_strerror(-r) << dendl; + return r; + } + + return 0; +} + +int group_snap_remove_by_record(librados::IoCtx& group_ioctx, + const cls::rbd::GroupSnapshot& group_snap, + const std::string& group_id, + const std::string& group_header_oid) { + + CephContext *cct = (CephContext *)group_ioctx.cct(); + 
std::vector<C_SaferCond*> on_finishes; + int r, ret_code; + + std::vector<librbd::ImageCtx*> ictxs; + + cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id, + group_snap.id}; + + ldout(cct, 20) << "Removing snapshots" << dendl; + int snap_count = group_snap.snaps.size(); + + for (int i = 0; i < snap_count; ++i) { + librbd::IoCtx image_io_ctx; + r = util::create_ioctx(group_ioctx, "image", group_snap.snaps[i].pool, {}, + &image_io_ctx); + if (r < 0) { + return r; + } + + librbd::ImageCtx* image_ctx = new ImageCtx("", group_snap.snaps[i].image_id, + nullptr, image_io_ctx, false); + + C_SaferCond* on_finish = new C_SaferCond; + + image_ctx->state->open(0, on_finish); + + ictxs.push_back(image_ctx); + on_finishes.push_back(on_finish); + } + + ret_code = 0; + for (int i = 0; i < snap_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0) { + delete ictxs[i]; + ictxs[i] = nullptr; + ret_code = r; + } + } + if (ret_code != 0) { + goto finish; + } + + ldout(cct, 20) << "Opened participating images. " << + "Deleting snapshots themselves." << dendl; + + for (int i = 0; i < snap_count; ++i) { + ImageCtx *ictx = ictxs[i]; + on_finishes[i] = new C_SaferCond; + + std::string snap_name; + ictx->snap_lock.get_read(); + snap_t snap_id = get_group_snap_id(ictx, ne); + r = ictx->get_snap_name(snap_id, &snap_name); + ictx->snap_lock.put_read(); + + if (r >= 0) { + ldout(cct, 20) << "removing individual snapshot from image " << ictx->name + << dendl; + ictx->operations->snap_remove(ne, snap_name, on_finishes[i]); + } else { + // We are ok to ignore missing image snapshots. The snapshot could have + // been inconsistent in the first place. 
+ on_finishes[i]->complete(0); + } + } + + for (int i = 0; i < snap_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0 && r != -ENOENT) { + // if previous attempts to remove this snapshot failed then the image's + // snapshot may not exist + lderr(cct) << "Failed deleting image snapshot. Ret code: " << r << dendl; + ret_code = r; + } + } + + if (ret_code != 0) { + goto finish; + } + + ldout(cct, 20) << "Removed images snapshots removing snapshot record." + << dendl; + + r = cls_client::group_snap_remove(&group_ioctx, group_header_oid, + group_snap.id); + if (r < 0) { + ret_code = r; + goto finish; + } + +finish: + for (int i = 0; i < snap_count; ++i) { + if (ictxs[i] != nullptr) { + ictxs[i]->state->close(); + } + } + return ret_code; +} + +int group_snap_rollback_by_record(librados::IoCtx& group_ioctx, + const cls::rbd::GroupSnapshot& group_snap, + const std::string& group_id, + const std::string& group_header_oid, + ProgressContext& pctx) { + CephContext *cct = (CephContext *)group_ioctx.cct(); + std::vector<C_SaferCond*> on_finishes; + int r, ret_code; + + std::vector<librbd::ImageCtx*> ictxs; + + cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id, + group_snap.id}; + + ldout(cct, 20) << "Rolling back snapshots" << dendl; + int snap_count = group_snap.snaps.size(); + + for (int i = 0; i < snap_count; ++i) { + librados::IoCtx image_io_ctx; + r = util::create_ioctx(group_ioctx, "image", group_snap.snaps[i].pool, {}, + &image_io_ctx); + if (r < 0) { + return r; + } + + librbd::ImageCtx* image_ctx = new ImageCtx("", group_snap.snaps[i].image_id, + nullptr, image_io_ctx, false); + + C_SaferCond* on_finish = new C_SaferCond; + + image_ctx->state->open(0, on_finish); + + ictxs.push_back(image_ctx); + on_finishes.push_back(on_finish); + } + + ret_code = 0; + for (int i = 0; i < snap_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0) { + delete ictxs[i]; + ictxs[i] = nullptr; + ret_code = r; + } 
+ } + if (ret_code != 0) { + goto finish; + } + + ldout(cct, 20) << "Requesting exclusive locks for images" << dendl; + for (auto ictx: ictxs) { + RWLock::RLocker owner_lock(ictx->owner_lock); + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->block_requests(-EBUSY); + } + } + for (int i = 0; i < snap_count; ++i) { + ImageCtx *ictx = ictxs[i]; + RWLock::RLocker owner_lock(ictx->owner_lock); + + on_finishes[i] = new C_SaferCond; + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->acquire_lock(on_finishes[i]); + } + } + + ret_code = 0; + for (int i = 0; i < snap_count; ++i) { + r = 0; + ImageCtx *ictx = ictxs[i]; + if (ictx->exclusive_lock != nullptr) { + r = on_finishes[i]->wait(); + } + delete on_finishes[i]; + if (r < 0) { + ret_code = r; + } + } + if (ret_code != 0) { + goto finish; + } + + for (int i = 0; i < snap_count; ++i) { + ImageCtx *ictx = ictxs[i]; + on_finishes[i] = new C_SaferCond; + + RWLock::RLocker owner_locker(ictx->owner_lock); + std::string snap_name; + ictx->snap_lock.get_read(); + snap_t snap_id = get_group_snap_id(ictx, ne); + r = ictx->get_snap_name(snap_id, &snap_name); + ictx->snap_lock.put_read(); + + if (r >= 0) { + ldout(cct, 20) << "rolling back to individual snapshot for image " << ictx->name + << dendl; + ictx->operations->execute_snap_rollback(ne, snap_name, pctx, on_finishes[i]); + } else { + on_finishes[i]->complete(r); + } + } + + for (int i = 0; i < snap_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0 && r != -ENOENT) { + lderr(cct) << "Failed rolling back group to snapshot. 
Ret code: " << r << dendl; + ret_code = r; + } + } + +finish: + for (int i = 0; i < snap_count; ++i) { + if (ictxs[i] != nullptr) { + ictxs[i]->state->close(); + } + } + return ret_code; +} + +} // anonymous namespace + +template <typename I> +int Group<I>::image_remove_by_id(librados::IoCtx& group_ioctx, + const char *group_name, + librados::IoCtx& image_ioctx, + const char *image_id) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + ldout(cct, 20) << "io_ctx=" << &group_ioctx + << " group name " << group_name << " image " + << &image_ioctx << " id " << image_id << dendl; + + string group_id; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name, + &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 20) << "removing image from group name " << group_name + << " group id " << group_id << dendl; + + return group_image_remove(group_ioctx, group_id, image_ioctx, string(image_id)); +} + +template <typename I> +int Group<I>::create(librados::IoCtx& io_ctx, const char *group_name) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + + string id = generate_uuid(io_ctx); + + ldout(cct, 2) << "adding group to directory..." 
<< dendl; + + int r = cls_client::group_dir_add(&io_ctx, RBD_GROUP_DIRECTORY, group_name, + id); + if (r < 0) { + lderr(cct) << "error adding group to directory: " + << cpp_strerror(r) + << dendl; + return r; + } + string header_oid = util::group_header_name(id); + + r = io_ctx.create(header_oid, true); + if (r < 0) { + lderr(cct) << "error creating group header: " << cpp_strerror(r) << dendl; + goto err_remove_from_dir; + } + + return 0; + +err_remove_from_dir: + int remove_r = cls_client::group_dir_remove(&io_ctx, RBD_GROUP_DIRECTORY, + group_name, id); + if (remove_r < 0) { + lderr(cct) << "error cleaning up group from rbd_directory " + << "object after creation failed: " << cpp_strerror(remove_r) + << dendl; + } + + return r; +} + +template <typename I> +int Group<I>::remove(librados::IoCtx& io_ctx, const char *group_name) +{ + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "group_remove " << &io_ctx << " " << group_name << dendl; + + std::string group_id; + int r = cls_client::dir_get_id(&io_ctx, RBD_GROUP_DIRECTORY, + std::string(group_name), &group_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error getting id of group" << dendl; + return r; + } + string group_header_oid = util::group_header_name(group_id); + + std::vector<cls::rbd::GroupSnapshot> snaps; + r = group_snap_list(io_ctx, group_name, &snaps); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing group snapshots" << dendl; + return r; + } + + for (auto &snap : snaps) { + r = group_snap_remove_by_record(io_ctx, snap, group_id, group_header_oid); + if (r < 0) { + return r; + } + } + + std::vector<cls::rbd::GroupImageStatus> images; + r = group_image_list(io_ctx, group_name, &images); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing group images" << dendl; + return r; + } + + for (auto image : images) { + IoCtx image_ioctx; + r = util::create_ioctx(io_ctx, "image", image.spec.pool_id, {}, + &image_ioctx); + if (r < 0) { + return r; + } + + r = 
group_image_remove(io_ctx, group_id, image_ioctx, image.spec.image_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error removing image from a group" << dendl; + return r; + } + } + + string header_oid = util::group_header_name(group_id); + + r = io_ctx.remove(header_oid); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error removing header: " << cpp_strerror(-r) << dendl; + return r; + } + + r = cls_client::group_dir_remove(&io_ctx, RBD_GROUP_DIRECTORY, + group_name, group_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error removing group from directory" << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Group<I>::list(IoCtx& io_ctx, vector<string> *names) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 20) << "io_ctx=" << &io_ctx << dendl; + + int max_read = 1024; + string last_read = ""; + int r; + do { + map<string, string> groups; + r = cls_client::group_dir_list(&io_ctx, RBD_GROUP_DIRECTORY, last_read, + max_read, &groups); + if (r == -ENOENT) { + return 0; // Ignore missing rbd group directory. It means we don't have any groups yet. 
+ } + if (r < 0) { + if (r != -ENOENT) { + lderr(cct) << "error listing group in directory: " + << cpp_strerror(r) << dendl; + } else { + r = 0; + } + return r; + } + for (pair<string, string> group : groups) { + names->push_back(group.first); + } + if (!groups.empty()) { + last_read = groups.rbegin()->first; + } + r = groups.size(); + } while (r == max_read); + + return 0; +} + +template <typename I> +int Group<I>::image_add(librados::IoCtx& group_ioctx, const char *group_name, + librados::IoCtx& image_ioctx, const char *image_name) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + ldout(cct, 20) << "io_ctx=" << &group_ioctx + << " group name " << group_name << " image " + << &image_ioctx << " name " << image_name << dendl; + + if (group_ioctx.get_namespace() != image_ioctx.get_namespace()) { + lderr(cct) << "group and image cannot be in different namespaces" << dendl; + return -EINVAL; + } + + string group_id; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name, + &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + string group_header_oid = util::group_header_name(group_id); + + + ldout(cct, 20) << "adding image to group name " << group_name + << " group id " << group_header_oid << dendl; + + string image_id; + + r = cls_client::dir_get_id(&image_ioctx, RBD_DIRECTORY, image_name, + &image_id); + if (r < 0) { + lderr(cct) << "error reading image id object: " + << cpp_strerror(-r) << dendl; + return r; + } + + string image_header_oid = util::header_name(image_id); + + ldout(cct, 20) << "adding image " << image_name + << " image id " << image_header_oid << dendl; + + cls::rbd::GroupImageStatus incomplete_st( + image_id, image_ioctx.get_id(), + cls::rbd::GROUP_IMAGE_LINK_STATE_INCOMPLETE); + cls::rbd::GroupImageStatus attached_st( + image_id, image_ioctx.get_id(), cls::rbd::GROUP_IMAGE_LINK_STATE_ATTACHED); + + r = 
cls_client::group_image_set(&group_ioctx, group_header_oid, + incomplete_st); + + cls::rbd::GroupSpec group_spec(group_id, group_ioctx.get_id()); + + if (r < 0) { + lderr(cct) << "error adding image reference to group: " + << cpp_strerror(-r) << dendl; + return r; + } + + r = cls_client::image_group_add(&image_ioctx, image_header_oid, group_spec); + if (r < 0) { + lderr(cct) << "error adding group reference to image: " + << cpp_strerror(-r) << dendl; + cls::rbd::GroupImageSpec spec(image_id, image_ioctx.get_id()); + cls_client::group_image_remove(&group_ioctx, group_header_oid, spec); + // Ignore errors in the clean up procedure. + return r; + } + ImageWatcher<>::notify_header_update(image_ioctx, image_header_oid); + + r = cls_client::group_image_set(&group_ioctx, group_header_oid, + attached_st); + + return r; +} + +template <typename I> +int Group<I>::image_remove(librados::IoCtx& group_ioctx, const char *group_name, + librados::IoCtx& image_ioctx, const char *image_name) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + ldout(cct, 20) << "io_ctx=" << &group_ioctx + << " group name " << group_name << " image " + << &image_ioctx << " name " << image_name << dendl; + + if (group_ioctx.get_namespace() != image_ioctx.get_namespace()) { + lderr(cct) << "group and image cannot be in different namespaces" << dendl; + return -EINVAL; + } + + string group_id; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name, + &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 20) << "removing image from group name " << group_name + << " group id " << group_id << dendl; + + string image_id; + r = cls_client::dir_get_id(&image_ioctx, RBD_DIRECTORY, image_name, + &image_id); + if (r < 0) { + lderr(cct) << "error reading image id object: " + << cpp_strerror(-r) << dendl; + return r; + } + + r = group_image_remove(group_ioctx, group_id, image_ioctx, image_id); + 
+ return r; +} + +template <typename I> +int Group<I>::image_list(librados::IoCtx& group_ioctx, + const char *group_name, + std::vector<group_image_info_t>* images) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + ldout(cct, 20) << "io_ctx=" << &group_ioctx + << " group name " << group_name << dendl; + + std::vector<cls::rbd::GroupImageStatus> image_ids; + + group_image_list(group_ioctx, group_name, &image_ids); + + for (auto image_id : image_ids) { + IoCtx ioctx; + int r = util::create_ioctx(group_ioctx, "image", image_id.spec.pool_id, {}, + &ioctx); + if (r < 0) { + return r; + } + + std::string image_name; + r = cls_client::dir_get_name(&ioctx, RBD_DIRECTORY, + image_id.spec.image_id, &image_name); + if (r < 0) { + return r; + } + + images->push_back( + group_image_info_t { + image_name, + ioctx.get_id(), + static_cast<group_image_state_t>(image_id.state)}); + } + + return 0; +} + +template <typename I> +int Group<I>::rename(librados::IoCtx& io_ctx, const char *src_name, + const char *dest_name) +{ + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "group_rename " << &io_ctx << " " << src_name + << " -> " << dest_name << dendl; + + std::string group_id; + int r = cls_client::dir_get_id(&io_ctx, RBD_GROUP_DIRECTORY, + std::string(src_name), &group_id); + if (r < 0) { + if (r != -ENOENT) + lderr(cct) << "error getting id of group" << dendl; + return r; + } + + r = cls_client::group_dir_rename(&io_ctx, RBD_GROUP_DIRECTORY, + src_name, dest_name, group_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error renaming group from directory" << dendl; + return r; + } + + return 0; +} + + +template <typename I> +int Group<I>::image_get_group(I *ictx, group_info_t *group_info) +{ + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + if (RBD_GROUP_INVALID_POOL != ictx->group_spec.pool_id) { + IoCtx ioctx; + r = util::create_ioctx(ictx->md_ctx, "group", ictx->group_spec.pool_id, {}, + &ioctx); + if (r < 0) { + return r; + 
} + + std::string group_name; + r = cls_client::dir_get_name(&ioctx, RBD_GROUP_DIRECTORY, + ictx->group_spec.group_id, &group_name); + if (r < 0) + return r; + group_info->pool = ioctx.get_id(); + group_info->name = group_name; + } else { + group_info->pool = RBD_GROUP_INVALID_POOL; + group_info->name = ""; + } + + return 0; +} + +template <typename I> +int Group<I>::snap_create(librados::IoCtx& group_ioctx, + const char *group_name, const char *snap_name) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + cls::rbd::GroupSnapshot group_snap; + vector<cls::rbd::ImageSnapshotSpec> image_snaps; + std::string ind_snap_name; + + std::vector<librbd::ImageCtx*> ictxs; + std::vector<C_SaferCond*> on_finishes; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + + std::vector<cls::rbd::GroupImageStatus> images; + r = group_image_list(group_ioctx, group_name, &images); + if (r < 0) { + return r; + } + int image_count = images.size(); + + ldout(cct, 20) << "Found " << image_count << " images in group" << dendl; + + image_snaps = vector<cls::rbd::ImageSnapshotSpec>(image_count, + cls::rbd::ImageSnapshotSpec()); + + for (int i = 0; i < image_count; ++i) { + image_snaps[i].pool = images[i].spec.pool_id; + image_snaps[i].image_id = images[i].spec.image_id; + } + + string group_header_oid = util::group_header_name(group_id); + + group_snap.id = generate_uuid(group_ioctx); + group_snap.name = string(snap_name); + group_snap.state = cls::rbd::GROUP_SNAPSHOT_STATE_INCOMPLETE; + group_snap.snaps = image_snaps; + + cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id, + group_snap.id}; + + r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap); + if (r == -EEXIST) { + lderr(cct) << "snapshot with this name already exists: " + << cpp_strerror(r) + << dendl; + } + int 
ret_code = 0; + if (r < 0) { + ret_code = r; + goto finish; + } + + for (auto image: images) { + librbd::IoCtx image_io_ctx; + r = util::create_ioctx(group_ioctx, "image", image.spec.pool_id, {}, + &image_io_ctx); + if (r < 0) { + ret_code = r; + goto finish; + } + + ldout(cct, 20) << "Opening image with id " << image.spec.image_id << dendl; + + librbd::ImageCtx* image_ctx = new ImageCtx("", image.spec.image_id.c_str(), + nullptr, image_io_ctx, false); + + C_SaferCond* on_finish = new C_SaferCond; + + image_ctx->state->open(0, on_finish); + + ictxs.push_back(image_ctx); + on_finishes.push_back(on_finish); + } + ldout(cct, 20) << "Issued open request waiting for the completion" << dendl; + ret_code = 0; + for (int i = 0; i < image_count; ++i) { + + ldout(cct, 20) << "Waiting for completion on on_finish: " << + on_finishes[i] << dendl; + + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0) { + delete ictxs[i]; + ictxs[i] = nullptr; + ret_code = r; + } + } + if (ret_code != 0) { + goto remove_record; + } + ldout(cct, 20) << "Requesting exclusive locks for images" << dendl; + + for (auto ictx: ictxs) { + RWLock::RLocker owner_lock(ictx->owner_lock); + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->block_requests(-EBUSY); + } + } + for (int i = 0; i < image_count; ++i) { + ImageCtx *ictx = ictxs[i]; + RWLock::RLocker owner_lock(ictx->owner_lock); + + on_finishes[i] = new C_SaferCond; + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->acquire_lock(on_finishes[i]); + } + } + + ret_code = 0; + for (int i = 0; i < image_count; ++i) { + r = 0; + ImageCtx *ictx = ictxs[i]; + if (ictx->exclusive_lock != nullptr) { + r = on_finishes[i]->wait(); + } + delete on_finishes[i]; + if (r < 0) { + ret_code = r; + } + } + if (ret_code != 0) { + goto remove_record; + } + + ind_snap_name = calc_ind_image_snap_name(group_ioctx.get_id(), group_id, + group_snap.id); + + for (int i = 0; i < image_count; ++i) { + ImageCtx *ictx = ictxs[i]; + + 
C_SaferCond* on_finish = new C_SaferCond; + + ictx->operations->snap_create(ne, ind_snap_name.c_str(), on_finish); + + on_finishes[i] = on_finish; + } + + ret_code = 0; + for (int i = 0; i < image_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0) { + ret_code = r; + } else { + ImageCtx *ictx = ictxs[i]; + ictx->snap_lock.get_read(); + snap_t snap_id = get_group_snap_id(ictx, ne); + ictx->snap_lock.put_read(); + if (snap_id == CEPH_NOSNAP) { + ldout(cct, 20) << "Couldn't find created snapshot with namespace: " + << ne << dendl; + ret_code = -ENOENT; + } else { + image_snaps[i].snap_id = snapid_t(snap_id); + image_snaps[i].pool = ictx->md_ctx.get_id(); + image_snaps[i].image_id = ictx->id; + } + } + } + if (ret_code != 0) { + goto remove_image_snaps; + } + + group_snap.snaps = image_snaps; + group_snap.state = cls::rbd::GROUP_SNAPSHOT_STATE_COMPLETE; + + r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap); + if (r < 0) { + ret_code = r; + goto remove_image_snaps; + } + + goto finish; + +remove_image_snaps: + + for (int i = 0; i < image_count; ++i) { + ImageCtx *ictx = ictxs[i]; + ldout(cct, 20) << "Removing individual snapshot with name: " << + ind_snap_name << dendl; + + on_finishes[i] = new C_SaferCond; + std::string snap_name; + ictx->snap_lock.get_read(); + snap_t snap_id = get_group_snap_id(ictx, ne); + r = ictx->get_snap_name(snap_id, &snap_name); + ictx->snap_lock.put_read(); + if (r >= 0) { + ictx->operations->snap_remove(ne, snap_name.c_str(), on_finishes[i]); + } else { + // Ignore missing image snapshots. The whole snapshot could have been + // inconsistent. + on_finishes[i]->complete(0); + } + } + + for (int i = 0, n = on_finishes.size(); i < n; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0 && r != -ENOENT) { // if previous attempts to remove this snapshot failed then the image's snapshot may not exist + lderr(cct) << "Failed cleaning up image snapshot. 
Ret code: " << r << dendl; + // just report error, but don't abort the process + } + } + +remove_record: + r = cls_client::group_snap_remove(&group_ioctx, group_header_oid, + group_snap.id); + if (r < 0) { + lderr(cct) << "error while cleaning up group snapshot" << dendl; + // we ignore return value in clean up + } + +finish: + for (int i = 0, n = ictxs.size(); i < n; ++i) { + if (ictxs[i] != nullptr) { + ictxs[i]->state->close(); + } + } + return ret_code; +} + +template <typename I> +int Group<I>::snap_remove(librados::IoCtx& group_ioctx, const char *group_name, + const char *snap_name) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + + std::vector<cls::rbd::GroupSnapshot> snaps; + r = group_snap_list(group_ioctx, group_name, &snaps); + if (r < 0) { + return r; + } + + cls::rbd::GroupSnapshot *group_snap = nullptr; + for (auto &snap : snaps) { + if (snap.name == string(snap_name)) { + group_snap = &snap; + break; + } + } + if (group_snap == nullptr) { + return -ENOENT; + } + + string group_header_oid = util::group_header_name(group_id); + r = group_snap_remove_by_record(group_ioctx, *group_snap, group_id, + group_header_oid); + return r; +} + +template <typename I> +int Group<I>::snap_rename(librados::IoCtx& group_ioctx, const char *group_name, + const char *old_snap_name, + const char *new_snap_name) { + CephContext *cct = (CephContext *)group_ioctx.cct(); + if (0 == strcmp(old_snap_name, new_snap_name)) + return -EEXIST; + + std::string group_id; + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "error reading group id object: " << cpp_strerror(r) << dendl; + return r; + } + + 
std::vector<cls::rbd::GroupSnapshot> group_snaps; + r = group_snap_list(group_ioctx, group_name, &group_snaps); + if (r < 0) { + return r; + } + + cls::rbd::GroupSnapshot group_snap; + for (auto &snap : group_snaps) { + if (snap.name == old_snap_name) { + group_snap = snap; + break; + } + } + + if (group_snap.id.empty()) { + return -ENOENT; + } + + std::string group_header_oid = util::group_header_name(group_id); + group_snap.name = new_snap_name; + r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Group<I>::snap_list(librados::IoCtx& group_ioctx, const char *group_name, + std::vector<group_snap_info_t> *snaps) +{ + std::vector<cls::rbd::GroupSnapshot> cls_snaps; + + int r = group_snap_list(group_ioctx, group_name, &cls_snaps); + if (r < 0) { + return r; + } + + for (auto snap : cls_snaps) { + snaps->push_back( + group_snap_info_t { + snap.name, + static_cast<group_snap_state_t>(snap.state)}); + + } + return 0; +} + +template <typename I> +int Group<I>::snap_rollback(librados::IoCtx& group_ioctx, + const char *group_name, const char *snap_name, + ProgressContext& pctx) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) << dendl; + return r; + } + + std::vector<cls::rbd::GroupSnapshot> snaps; + r = group_snap_list(group_ioctx, group_name, &snaps); + if (r < 0) { + return r; + } + + cls::rbd::GroupSnapshot *group_snap = nullptr; + for (auto &snap : snaps) { + if (snap.name == string(snap_name)) { + group_snap = &snap; + break; + } + } + if (group_snap == nullptr) { + return -ENOENT; + } + + string group_header_oid = util::group_header_name(group_id); + r = group_snap_rollback_by_record(group_ioctx, *group_snap, group_id, + group_header_oid, pctx); + return r; 
+} + +} // namespace api +} // namespace librbd + +template class librbd::api::Group<librbd::ImageCtx>; diff --git a/src/librbd/api/Group.h b/src/librbd/api/Group.h new file mode 100644 index 00000000..59b978a8 --- /dev/null +++ b/src/librbd/api/Group.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_GROUP_H +#define CEPH_LIBRBD_API_GROUP_H + +#include "include/rbd/librbd.hpp" +#include "include/rados/librados_fwd.hpp" +#include <string> +#include <vector> + +namespace librbd { + +struct ImageCtx; + +namespace api { + +template <typename ImageCtxT = librbd::ImageCtx> +struct Group { + + static int create(librados::IoCtx& io_ctx, const char *group_name); + static int remove(librados::IoCtx& io_ctx, const char *group_name); + static int list(librados::IoCtx& io_ctx, std::vector<std::string> *names); + static int rename(librados::IoCtx& io_ctx, const char *src_group_name, + const char *dest_group_name); + + static int image_add(librados::IoCtx& group_ioctx, const char *group_name, + librados::IoCtx& image_ioctx, const char *image_name); + static int image_remove(librados::IoCtx& group_ioctx, const char *group_name, + librados::IoCtx& image_ioctx, const char *image_name); + static int image_remove_by_id(librados::IoCtx& group_ioctx, + const char *group_name, + librados::IoCtx& image_ioctx, + const char *image_id); + static int image_list(librados::IoCtx& group_ioctx, const char *group_name, + std::vector<group_image_info_t> *images); + + static int image_get_group(ImageCtxT *ictx, group_info_t *group_info); + + static int snap_create(librados::IoCtx& group_ioctx, + const char *group_name, const char *snap_name); + static int snap_remove(librados::IoCtx& group_ioctx, + const char *group_name, const char *snap_name); + static int snap_rename(librados::IoCtx& group_ioctx, const char *group_name, + const char *old_snap_name, const char *new_snap_name); + static int 
snap_list(librados::IoCtx& group_ioctx, const char *group_name, + std::vector<group_snap_info_t> *snaps); + static int snap_rollback(librados::IoCtx& group_ioctx, + const char *group_name, const char *snap_name, + ProgressContext& pctx); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Group<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_API_GROUP_H diff --git a/src/librbd/api/Image.cc b/src/librbd/api/Image.cc new file mode 100644 index 00000000..aa427535 --- /dev/null +++ b/src/librbd/api/Image.cc @@ -0,0 +1,836 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Image.h" +#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/DeepCopyRequest.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/api/Trash.h" +#include "librbd/image/CloneRequest.h" +#include "librbd/image/RemoveRequest.h" +#include "librbd/image/PreRemoveRequest.h" +#include <boost/scope_exit.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Image: " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +bool compare_by_pool(const librbd::linked_image_spec_t& lhs, + const librbd::linked_image_spec_t& rhs) +{ + if (lhs.pool_id != rhs.pool_id) { + return lhs.pool_id < rhs.pool_id; + } else if (lhs.pool_namespace != rhs.pool_namespace) { + return lhs.pool_namespace < rhs.pool_namespace; + } + return false; +} + +bool compare(const librbd::linked_image_spec_t& lhs, + const librbd::linked_image_spec_t& rhs) +{ + if (lhs.pool_name != rhs.pool_name) { + return lhs.pool_name < rhs.pool_name; + } else if (lhs.pool_id != rhs.pool_id) { + return 
lhs.pool_id < rhs.pool_id; + } else if (lhs.pool_namespace != rhs.pool_namespace) { + return lhs.pool_namespace < rhs.pool_namespace; + } else if (lhs.image_name != rhs.image_name) { + return lhs.image_name < rhs.image_name; + } else if (lhs.image_id != rhs.image_id) { + return lhs.image_id < rhs.image_id; + } + return false; +} + +template <typename I> +int pre_remove_image(librados::IoCtx& io_ctx, const std::string& image_id) { + I *image_ctx = I::create("", image_id, nullptr, io_ctx, false); + int r = image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + auto req = image::PreRemoveRequest<I>::create(image_ctx, false, &ctx); + req->send(); + + r = ctx.wait(); + image_ctx->state->close(); + return r; +} + +} // anonymous namespace + +template <typename I> +int64_t Image<I>::get_data_pool_id(I *ictx) { + if (ictx->data_ctx.is_valid()) { + return ictx->data_ctx.get_id(); + } + + int64_t pool_id; + int r = cls_client::get_data_pool(&ictx->md_ctx, ictx->header_oid, &pool_id); + if (r < 0) { + CephContext *cct = ictx->cct; + lderr(cct) << "error getting data pool ID: " << cpp_strerror(r) << dendl; + return r; + } + + return pool_id; +} + +template <typename I> +int Image<I>::get_op_features(I *ictx, uint64_t *op_features) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "image_ctx=" << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + RWLock::RLocker snap_locker(ictx->snap_lock); + *op_features = ictx->op_features; + return 0; +} + +template <typename I> +int Image<I>::list_images(librados::IoCtx& io_ctx, + std::vector<image_spec_t> *images) { + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 20) << "list " << &io_ctx << dendl; + + int r; + images->clear(); + + if (io_ctx.get_namespace().empty()) { + bufferlist bl; + r = io_ctx.read(RBD_DIRECTORY, bl, 0, 0); + if (r == -ENOENT) { + return 0; + } else if (r < 0) { + lderr(cct) << "error listing v1 
images: " << cpp_strerror(r) << dendl; + return r; + } + + // V1 format images are in a tmap + if (bl.length()) { + auto p = bl.cbegin(); + bufferlist header; + std::map<std::string, bufferlist> m; + decode(header, p); + decode(m, p); + for (auto& it : m) { + images->push_back({.id ="", .name = it.first}); + } + } + } + + // V2 format images + std::map<std::string, std::string> image_names_to_ids; + r = list_images_v2(io_ctx, &image_names_to_ids); + if (r < 0) { + lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& img_pair : image_names_to_ids) { + images->push_back({.id = img_pair.second, + .name = img_pair.first}); + } + + // include V2 images in a partially removed state + std::vector<librbd::trash_image_info_t> trash_images; + r = Trash<I>::list(io_ctx, trash_images, false); + if (r < 0 && r != -EOPNOTSUPP) { + lderr(cct) << "error listing trash images: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& trash_image : trash_images) { + if (trash_image.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + images->push_back({.id = trash_image.id, + .name = trash_image.name}); + + } + } + + return 0; +} + +template <typename I> +int Image<I>::list_images_v2(librados::IoCtx& io_ctx, ImageNameToIds *images) { + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 20) << "io_ctx=" << &io_ctx << dendl; + + // new format images are accessed by class methods + int r; + int max_read = 1024; + string last_read = ""; + do { + map<string, string> images_page; + r = cls_client::dir_list(&io_ctx, RBD_DIRECTORY, last_read, max_read, + &images_page); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing image in directory: " + << cpp_strerror(r) << dendl; + return r; + } else if (r == -ENOENT) { + break; + } + for (map<string, string>::const_iterator it = images_page.begin(); + it != images_page.end(); ++it) { + images->insert(*it); + } + if (!images_page.empty()) { + last_read = 
images_page.rbegin()->first; + } + r = images_page.size(); + } while (r == max_read); + + return 0; +} + +template <typename I> +int Image<I>::get_parent(I *ictx, + librbd::linked_image_spec_t *parent_image, + librbd::snap_spec_t *parent_snap) { + auto cct = ictx->cct; + ldout(cct, 20) << "image_ctx=" << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + RWLock::RLocker snap_locker(ictx->snap_lock); + RWLock::RLocker parent_locker(ictx->parent_lock); + + bool release_parent_locks = false; + BOOST_SCOPE_EXIT_ALL(ictx, &release_parent_locks) { + if (release_parent_locks) { + ictx->parent->parent_lock.put_read(); + ictx->parent->snap_lock.put_read(); + } + }; + + // if a migration is in-progress, the true parent is the parent + // of the migration source image + auto parent = ictx->parent; + if (!ictx->migration_info.empty() && ictx->parent != nullptr) { + release_parent_locks = true; + ictx->parent->snap_lock.get_read(); + ictx->parent->parent_lock.get_read(); + + parent = ictx->parent->parent; + } + + if (parent == nullptr) { + return -ENOENT; + } + + parent_image->pool_id = parent->md_ctx.get_id(); + parent_image->pool_name = parent->md_ctx.get_pool_name(); + parent_image->pool_namespace = parent->md_ctx.get_namespace(); + + RWLock::RLocker parent_snap_locker(parent->snap_lock); + parent_snap->id = parent->snap_id; + parent_snap->namespace_type = RBD_SNAP_NAMESPACE_TYPE_USER; + if (parent->snap_id != CEPH_NOSNAP) { + auto snap_info = parent->get_snap_info(parent->snap_id); + if (snap_info == nullptr) { + lderr(cct) << "error finding parent snap name: " << cpp_strerror(r) + << dendl; + return -ENOENT; + } + + parent_snap->namespace_type = static_cast<snap_namespace_type_t>( + cls::rbd::get_snap_namespace_type(snap_info->snap_namespace)); + parent_snap->name = snap_info->name; + } + + parent_image->image_id = parent->id; + parent_image->image_name = parent->name; + parent_image->trash = true; + + 
librbd::trash_image_info_t trash_info; + r = Trash<I>::get(parent->md_ctx, parent->id, &trash_info); + if (r == -ENOENT || r == -EOPNOTSUPP) { + parent_image->trash = false; + } else if (r < 0) { + lderr(cct) << "error looking up trash status: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Image<I>::list_children(I *ictx, + std::vector<librbd::linked_image_spec_t> *images) { + images->clear(); + return list_descendants(ictx, 1, images); +} + +template <typename I> +int Image<I>::list_children(I *ictx, + const cls::rbd::ParentImageSpec &parent_spec, + std::vector<librbd::linked_image_spec_t> *images) { + images->clear(); + return list_descendants(ictx, parent_spec, 1, images); +} + +template <typename I> +int Image<I>::list_descendants( + librados::IoCtx& io_ctx, const std::string &image_id, + const std::optional<size_t> &max_level, + std::vector<librbd::linked_image_spec_t> *images) { + ImageCtx *ictx = new librbd::ImageCtx("", image_id, nullptr, + io_ctx, true); + int r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + if (r < 0) { + if (r == -ENOENT) { + return 0; + } + lderr(ictx->cct) << "failed to open descendant " << image_id + << " from pool " << io_ctx.get_pool_name() << ":" + << cpp_strerror(r) << dendl; + return r; + } + + r = list_descendants(ictx, max_level, images); + + int r1 = ictx->state->close(); + if (r1 < 0) { + lderr(ictx->cct) << "error when closing descendant " << image_id + << " from pool " << io_ctx.get_pool_name() << ":" + << cpp_strerror(r) << dendl; + } + + return r; +} + +template <typename I> +int Image<I>::list_descendants( + I *ictx, const std::optional<size_t> &max_level, + std::vector<librbd::linked_image_spec_t> *images) { + RWLock::RLocker l(ictx->snap_lock); + std::vector<librados::snap_t> snap_ids; + if (ictx->snap_id != CEPH_NOSNAP) { + snap_ids.push_back(ictx->snap_id); + } else { + snap_ids = ictx->snaps; + } + for (auto snap_id : snap_ids) { + cls::rbd::ParentImageSpec 
parent_spec{ictx->md_ctx.get_id(), + ictx->md_ctx.get_namespace(), + ictx->id, snap_id}; + int r = list_descendants(ictx, parent_spec, max_level, images); + if (r < 0) { + return r; + } + } + return 0; +} + +template <typename I> +int Image<I>::list_descendants( + I *ictx, const cls::rbd::ParentImageSpec &parent_spec, + const std::optional<size_t> &max_level, + std::vector<librbd::linked_image_spec_t> *images) { + auto child_max_level = max_level; + if (child_max_level) { + if (child_max_level == 0) { + return 0; + } + (*child_max_level)--; + } + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + // no children for non-layered or old format image + if (!ictx->test_features(RBD_FEATURE_LAYERING, ictx->snap_lock)) { + return 0; + } + + librados::Rados rados(ictx->md_ctx); + + // search all pools for clone v1 children dependent on this snapshot + std::list<std::pair<int64_t, std::string> > pools; + int r = rados.pool_list2(pools); + if (r < 0) { + lderr(cct) << "error listing pools: " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& it : pools) { + int64_t base_tier; + r = rados.pool_get_base_tier(it.first, &base_tier); + if (r == -ENOENT) { + ldout(cct, 1) << "pool " << it.second << " no longer exists" << dendl; + continue; + } else if (r < 0) { + lderr(cct) << "error retrieving base tier for pool " << it.second + << dendl; + return r; + } + if (it.first != base_tier) { + // pool is a cache; skip it + continue; + } + + IoCtx ioctx; + r = util::create_ioctx(ictx->md_ctx, "child image", it.first, {}, &ioctx); + if (r == -ENOENT) { + continue; + } else if (r < 0) { + return r; + } + + std::set<std::string> image_ids; + r = cls_client::get_children(&ioctx, RBD_CHILDREN, parent_spec, + image_ids); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error reading list of children from pool " << it.second + << dendl; + return r; + } + + for (auto& image_id : image_ids) { + images->push_back({ + it.first, "", ictx->md_ctx.get_namespace(), 
image_id, "", false}); + r = list_descendants(ioctx, image_id, child_max_level, images); + if (r < 0) { + return r; + } + } + } + + // retrieve clone v2 children attached to this snapshot + IoCtx parent_io_ctx; + r = util::create_ioctx(ictx->md_ctx, "parent image", parent_spec.pool_id, + parent_spec.pool_namespace, &parent_io_ctx); + if (r < 0) { + return r; + } + + cls::rbd::ChildImageSpecs child_images; + r = cls_client::children_list(&parent_io_ctx, + util::header_name(parent_spec.image_id), + parent_spec.snap_id, &child_images); + if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) { + lderr(cct) << "error retrieving children: " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& child_image : child_images) { + images->push_back({ + child_image.pool_id, "", child_image.pool_namespace, + child_image.image_id, "", false}); + if (!child_max_level || *child_max_level > 0) { + IoCtx ioctx; + r = util::create_ioctx(ictx->md_ctx, "child image", child_image.pool_id, + child_image.pool_namespace, &ioctx); + if (r == -ENOENT) { + continue; + } else if (r < 0) { + return r; + } + r = list_descendants(ioctx, child_image.image_id, child_max_level, + images); + if (r < 0) { + return r; + } + } + } + + // batch lookups by pool + namespace + std::sort(images->begin(), images->end(), compare_by_pool); + + int64_t child_pool_id = -1; + librados::IoCtx child_io_ctx; + std::map<std::string, std::pair<std::string, bool>> child_image_id_to_info; + for (auto& image : *images) { + if (child_pool_id == -1 || child_pool_id != image.pool_id || + child_io_ctx.get_namespace() != image.pool_namespace) { + r = util::create_ioctx(ictx->md_ctx, "child image", image.pool_id, + image.pool_namespace, &child_io_ctx); + if (r == -ENOENT) { + image.pool_name = ""; + image.image_name = ""; + continue; + } else if (r < 0) { + return r; + } + child_pool_id = image.pool_id; + + child_image_id_to_info.clear(); + + std::map<std::string, std::string> image_names_to_ids; + r = 
list_images_v2(child_io_ctx, &image_names_to_ids); + if (r < 0) { + lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& [name, id] : image_names_to_ids) { + child_image_id_to_info.insert({id, {name, false}}); + } + + std::vector<librbd::trash_image_info_t> trash_images; + r = Trash<I>::list(child_io_ctx, trash_images, false); + if (r < 0 && r != -EOPNOTSUPP) { + lderr(cct) << "error listing trash images: " << cpp_strerror(r) + << dendl; + return r; + } + + for (auto& it : trash_images) { + child_image_id_to_info.insert({ + it.id, + {it.name, + it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING ? false : true}}); + } + } + + auto it = child_image_id_to_info.find(image.image_id); + if (it == child_image_id_to_info.end()) { + lderr(cct) << "error looking up name for image id " + << image.image_id << " in pool " + << child_io_ctx.get_pool_name() + << (image.pool_namespace.empty() ? + "" : "/" + image.pool_namespace) << dendl; + return -ENOENT; + } + + image.pool_name = child_io_ctx.get_pool_name(); + image.image_name = it->second.first; + image.trash = it->second.second; + } + + // final sort by pool + image names + std::sort(images->begin(), images->end(), compare); + return 0; +} + +template <typename I> +int Image<I>::deep_copy(I *src, librados::IoCtx& dest_md_ctx, + const char *destname, ImageOptions& opts, + ProgressContext &prog_ctx) { + CephContext *cct = (CephContext *)dest_md_ctx.cct(); + ldout(cct, 20) << src->name + << (src->snap_name.length() ? 
"@" + src->snap_name : "") + << " -> " << destname << " opts = " << opts << dendl; + + uint64_t features; + uint64_t src_size; + { + RWLock::RLocker snap_locker(src->snap_lock); + + if (!src->migration_info.empty()) { + lderr(cct) << "cannot deep copy migrating image" << dendl; + return -EBUSY; + } + + features = src->features; + src_size = src->get_image_size(src->snap_id); + } + uint64_t format = 2; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) { + opts.set(RBD_IMAGE_OPTION_FORMAT, format); + } + if (format == 1) { + lderr(cct) << "old format not supported for destination image" << dendl; + return -EINVAL; + } + uint64_t stripe_unit = src->stripe_unit; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + } + uint64_t stripe_count = src->stripe_count; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + } + uint64_t order = src->order; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + opts.set(RBD_IMAGE_OPTION_ORDER, order); + } + if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) { + opts.set(RBD_IMAGE_OPTION_FEATURES, features); + } + if (features & ~RBD_FEATURES_ALL) { + lderr(cct) << "librbd does not support requested features" << dendl; + return -ENOSYS; + } + + uint64_t flatten = 0; + if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) { + opts.unset(RBD_IMAGE_OPTION_FLATTEN); + } + + cls::rbd::ParentImageSpec parent_spec; + if (flatten > 0) { + parent_spec.pool_id = -1; + } else { + RWLock::RLocker snap_locker(src->snap_lock); + RWLock::RLocker parent_locker(src->parent_lock); + + // use oldest snapshot or HEAD for parent spec + if (!src->snap_info.empty()) { + parent_spec = src->snap_info.begin()->second.parent.spec; + } else { + parent_spec = src->parent_md.spec; + } + } + + int r; + if (parent_spec.pool_id == -1) { + r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false); 
+ } else { + librados::IoCtx parent_io_ctx; + r = util::create_ioctx(src->md_ctx, "parent image", parent_spec.pool_id, + parent_spec.pool_namespace, &parent_io_ctx); + if (r < 0) { + return r; + } + + ConfigProxy config{cct->_conf}; + api::Config<I>::apply_pool_overrides(dest_md_ctx, &config); + + C_SaferCond ctx; + std::string dest_id = util::generate_image_id(dest_md_ctx); + auto *req = image::CloneRequest<I>::create( + config, parent_io_ctx, parent_spec.image_id, "", parent_spec.snap_id, + dest_md_ctx, destname, dest_id, opts, "", "", src->op_work_queue, &ctx); + req->send(); + r = ctx.wait(); + } + if (r < 0) { + lderr(cct) << "header creation failed" << dendl; + return r; + } + opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order)); + + auto dest = new I(destname, "", nullptr, dest_md_ctx, false); + r = dest->state->open(0); + if (r < 0) { + lderr(cct) << "failed to read newly created header" << dendl; + return r; + } + + C_SaferCond lock_ctx; + { + RWLock::WLocker locker(dest->owner_lock); + + if (dest->exclusive_lock == nullptr || + dest->exclusive_lock->is_lock_owner()) { + lock_ctx.complete(0); + } else { + dest->exclusive_lock->acquire_lock(&lock_ctx); + } + } + + r = lock_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r) + << dendl; + dest->state->close(); + return r; + } + + r = deep_copy(src, dest, flatten > 0, prog_ctx); + + int close_r = dest->state->close(); + if (r == 0 && close_r < 0) { + r = close_r; + } + return r; +} + +template <typename I> +int Image<I>::deep_copy(I *src, I *dest, bool flatten, + ProgressContext &prog_ctx) { + CephContext *cct = src->cct; + librados::snap_t snap_id_start = 0; + librados::snap_t snap_id_end; + { + RWLock::RLocker snap_locker(src->snap_lock); + snap_id_end = src->snap_id; + } + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + C_SaferCond cond; + SnapSeqs snap_seqs; + auto 
req = DeepCopyRequest<I>::create(src, dest, snap_id_start, snap_id_end, + 0U, flatten, boost::none, op_work_queue, + &snap_seqs, &prog_ctx, &cond); + req->send(); + int r = cond.wait(); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Image<I>::snap_set(I *ictx, + const cls::rbd::SnapshotNamespace &snap_namespace, + const char *snap_name) { + ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = " + << (snap_name ? snap_name : "NULL") << dendl; + + // ignore return value, since we may be set to a non-existent + // snapshot and the user is trying to fix that + ictx->state->refresh_if_required(); + + uint64_t snap_id = CEPH_NOSNAP; + std::string name(snap_name == nullptr ? "" : snap_name); + if (!name.empty()) { + RWLock::RLocker snap_locker(ictx->snap_lock); + snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace{}, + snap_name); + if (snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + } + + return snap_set(ictx, snap_id); +} + +template <typename I> +int Image<I>::snap_set(I *ictx, uint64_t snap_id) { + ldout(ictx->cct, 20) << "snap_set " << ictx << " " + << "snap_id=" << snap_id << dendl; + + // ignore return value, since we may be set to a non-existent + // snapshot and the user is trying to fix that + ictx->state->refresh_if_required(); + + C_SaferCond ctx; + ictx->state->snap_set(snap_id, &ctx); + int r = ctx.wait(); + if (r < 0) { + if (r != -ENOENT) { + lderr(ictx->cct) << "failed to " << (snap_id == CEPH_NOSNAP ? 
"un" : "") + << "set snapshot: " << cpp_strerror(r) << dendl; + } + return r; + } + + return 0; +} + +template <typename I> +int Image<I>::remove(IoCtx& io_ctx, const std::string &image_name, + ProgressContext& prog_ctx) +{ + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "name=" << image_name << dendl; + + // look up the V2 image id based on the image name + std::string image_id; + int r = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY, image_name, + &image_id); + if (r == -ENOENT) { + // check if it already exists in trash from an aborted trash remove attempt + std::vector<trash_image_info_t> trash_entries; + r = Trash<I>::list(io_ctx, trash_entries, false); + if (r < 0) { + return r; + } else if (r >= 0) { + for (auto& entry : trash_entries) { + if (entry.name == image_name && + entry.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + return Trash<I>::remove(io_ctx, entry.id, true, prog_ctx); + } + } + } + + // fall-through if we failed to locate the image in the V2 directory and + // trash + } else if (r < 0) { + lderr(cct) << "failed to retrieve image id: " << cpp_strerror(r) << dendl; + return r; + } else { + // attempt to move the image to the trash (and optionally immediately + // delete the image) + ConfigProxy config(cct->_conf); + Config<I>::apply_pool_overrides(io_ctx, &config); + + rbd_trash_image_source_t trash_image_source = + RBD_TRASH_IMAGE_SOURCE_REMOVING; + uint64_t expire_seconds = 0; + if (config.get_val<bool>("rbd_move_to_trash_on_remove")) { + // keep the image in the trash upon remove requests + trash_image_source = RBD_TRASH_IMAGE_SOURCE_USER; + expire_seconds = config.get_val<uint64_t>( + "rbd_move_to_trash_on_remove_expire_seconds"); + } else { + // attempt to pre-validate the removal before moving to trash and + // removing + r = pre_remove_image<I>(io_ctx, image_id); + if (r < 0 && r != -ENOENT) { + return r; + } + } + + r = Trash<I>::move(io_ctx, trash_image_source, image_name, image_id, + expire_seconds); + if (r >= 0) 
{ + if (trash_image_source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + // proceed with attempting to immediately remove the image + r = Trash<I>::remove(io_ctx, image_id, true, prog_ctx); + + if (r == -ENOTEMPTY || r == -EBUSY || r == -EMLINK) { + // best-effort try to restore the image if the removal + // failed for possible expected reasons + Trash<I>::restore(io_ctx, {cls::rbd::TRASH_IMAGE_SOURCE_REMOVING}, + image_id, image_name); + } + } + return r; + } else if (r < 0 && r != -EOPNOTSUPP) { + return r; + } + + // fall-through if trash isn't supported + } + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + // might be a V1 image format that cannot be moved to the trash + // and would not have been listed in the V2 directory -- or the OSDs + // are too old and don't support the trash feature + C_SaferCond cond; + auto req = librbd::image::RemoveRequest<I>::create( + io_ctx, image_name, "", false, false, prog_ctx, op_work_queue, &cond); + req->send(); + + return cond.wait(); +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Image<librbd::ImageCtx>; diff --git a/src/librbd/api/Image.h b/src/librbd/api/Image.h new file mode 100644 index 00000000..af928c84 --- /dev/null +++ b/src/librbd/api/Image.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_API_IMAGE_H +#define LIBRBD_API_IMAGE_H + +#include "include/rbd/librbd.hpp" +#include "include/rados/librados_fwd.hpp" +#include "librbd/Types.h" +#include <map> +#include <set> +#include <string> + +namespace librbd { + +class ImageOptions; +class ProgressContext; + +struct ImageCtx; + +namespace api { + +template <typename ImageCtxT = librbd::ImageCtx> +struct Image { + typedef std::map<std::string, std::string> ImageNameToIds; + + static int64_t get_data_pool_id(ImageCtxT *ictx); + + static int get_op_features(ImageCtxT *ictx, 
uint64_t *op_features); + + static int list_images(librados::IoCtx& io_ctx, + std::vector<image_spec_t> *images); + static int list_images_v2(librados::IoCtx& io_ctx, + ImageNameToIds *images); + + static int get_parent(ImageCtxT *ictx, + librbd::linked_image_spec_t *parent_image, + librbd::snap_spec_t *parent_snap); + + static int list_children(ImageCtxT *ictx, + std::vector<librbd::linked_image_spec_t> *images); + static int list_children(ImageCtxT *ictx, + const cls::rbd::ParentImageSpec &parent_spec, + std::vector<librbd::linked_image_spec_t> *images); + + static int list_descendants(IoCtx& io_ctx, const std::string &image_id, + const std::optional<size_t> &max_level, + std::vector<librbd::linked_image_spec_t> *images); + static int list_descendants(ImageCtxT *ictx, + const std::optional<size_t> &max_level, + std::vector<librbd::linked_image_spec_t> *images); + static int list_descendants(ImageCtxT *ictx, + const cls::rbd::ParentImageSpec &parent_spec, + const std::optional<size_t> &max_level, + std::vector<librbd::linked_image_spec_t> *images); + + static int deep_copy(ImageCtxT *ictx, librados::IoCtx& dest_md_ctx, + const char *destname, ImageOptions& opts, + ProgressContext &prog_ctx); + static int deep_copy(ImageCtxT *src, ImageCtxT *dest, bool flatten, + ProgressContext &prog_ctx); + + static int snap_set(ImageCtxT *ictx, + const cls::rbd::SnapshotNamespace &snap_namespace, + const char *snap_name); + static int snap_set(ImageCtxT *ictx, uint64_t snap_id); + + static int remove(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext& prog_ctx); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Image<librbd::ImageCtx>; + +#endif // LIBRBD_API_IMAGE_H diff --git a/src/librbd/api/Migration.cc b/src/librbd/api/Migration.cc new file mode 100644 index 00000000..ae09d407 --- /dev/null +++ b/src/librbd/api/Migration.cc @@ -0,0 +1,1885 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Migration.h" +#include "include/rados/librados.hpp" +#include "include/stringify.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/api/Group.h" +#include "librbd/api/Image.h" +#include "librbd/api/Snapshot.h" +#include "librbd/api/Trash.h" +#include "librbd/deep_copy/ImageCopyRequest.h" +#include "librbd/deep_copy/MetadataCopyRequest.h" +#include "librbd/deep_copy/SnapshotCopyRequest.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/image/AttachChildRequest.h" +#include "librbd/image/AttachParentRequest.h" +#include "librbd/image/CloneRequest.h" +#include "librbd/image/CreateRequest.h" +#include "librbd/image/DetachChildRequest.h" +#include "librbd/image/DetachParentRequest.h" +#include "librbd/image/ListWatchersRequest.h" +#include "librbd/image/RemoveRequest.h" +#include "librbd/internal.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/mirror/DisableRequest.h" +#include "librbd/mirror/EnableRequest.h" + +#include <boost/scope_exit.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Migration: " << __func__ << ": " + +namespace librbd { + +inline bool operator==(const linked_image_spec_t& rhs, + const linked_image_spec_t& lhs) { + bool result = (rhs.pool_id == lhs.pool_id && + rhs.pool_namespace == lhs.pool_namespace && + rhs.image_id == lhs.image_id); + return result; +} + +namespace api { + +using util::create_rados_callback; + +namespace { + +class MigrationProgressContext : public ProgressContext { +public: + MigrationProgressContext(librados::IoCtx& io_ctx, + const std::string &header_oid, + cls::rbd::MigrationState state, + ProgressContext *prog_ctx) + : m_io_ctx(io_ctx), 
m_header_oid(header_oid), m_state(state), + m_prog_ctx(prog_ctx), m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())), + m_lock(util::unique_lock_name("librbd::api::MigrationProgressContext", + this)) { + ceph_assert(m_prog_ctx != nullptr); + } + + ~MigrationProgressContext() { + wait_for_in_flight_updates(); + } + + int update_progress(uint64_t offset, uint64_t total) override { + ldout(m_cct, 20) << "offset=" << offset << ", total=" << total << dendl; + + m_prog_ctx->update_progress(offset, total); + + std::string description = stringify(offset * 100 / total) + "% complete"; + + send_state_description_update(description); + + return 0; + } + +private: + librados::IoCtx& m_io_ctx; + std::string m_header_oid; + cls::rbd::MigrationState m_state; + ProgressContext *m_prog_ctx; + + CephContext* m_cct; + mutable Mutex m_lock; + Cond m_cond; + std::string m_state_description; + bool m_pending_update = false; + int m_in_flight_state_updates = 0; + + void send_state_description_update(const std::string &description) { + Mutex::Locker locker(m_lock); + + if (description == m_state_description) { + return; + } + + m_state_description = description; + + if (m_in_flight_state_updates > 0) { + m_pending_update = true; + return; + } + + set_state_description(); + } + + void set_state_description() { + ldout(m_cct, 20) << "state_description=" << m_state_description << dendl; + + ceph_assert(m_lock.is_locked()); + + librados::ObjectWriteOperation op; + cls_client::migration_set_state(&op, m_state, m_state_description); + + using klass = MigrationProgressContext; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_set_state_description>(this); + int r = m_io_ctx.aio_operate(m_header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + + m_in_flight_state_updates++; + } + + void handle_set_state_description(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + + m_in_flight_state_updates--; + + if (r < 0) { + 
lderr(m_cct) << "failed to update migration state: " << cpp_strerror(r) + << dendl; + } else if (m_pending_update) { + set_state_description(); + m_pending_update = false; + } else { + m_cond.Signal(); + } + } + + void wait_for_in_flight_updates() { + Mutex::Locker locker(m_lock); + + ldout(m_cct, 20) << "m_in_flight_state_updates=" + << m_in_flight_state_updates << dendl; + + m_pending_update = false; + while (m_in_flight_state_updates > 0) { + m_cond.Wait(m_lock); + } + } +}; + +int trash_search(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, std::string *image_id) { + std::vector<trash_image_info_t> entries; + + int r = Trash<>::list(io_ctx, entries, false); + if (r < 0) { + return r; + } + + for (auto &entry : entries) { + if (entry.source == source && entry.name == image_name) { + *image_id = entry.id; + return 0; + } + } + + return -ENOENT; +} + +template <typename I> +int open_source_image(librados::IoCtx& io_ctx, const std::string &image_name, + I **src_image_ctx, librados::IoCtx *dst_io_ctx, + std::string *dst_image_name, std::string *dst_image_id, + bool *flatten, bool *mirroring, + cls::rbd::MigrationState *state, + std::string *state_description) { + CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + librados::IoCtx src_io_ctx; + std::string src_image_name; + std::string src_image_id; + cls::rbd::MigrationSpec migration_spec; + I *image_ctx = I::create(image_name, "", nullptr, io_ctx, false); + + ldout(cct, 10) << "trying to open image by name " << io_ctx.get_pool_name() + << "/" << image_name << dendl; + + int r = image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING); + if (r < 0) { + if (r != -ENOENT) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + return r; + } + image_ctx = nullptr; + } + + BOOST_SCOPE_EXIT_TPL(&r, &image_ctx) { + if (r != 0 && image_ctx != nullptr) { + image_ctx->state->close(); + } + } BOOST_SCOPE_EXIT_END; + + if (r == 0) { + // The opened image 
is either a source (then just proceed) or a + // destination (then look for the source image id in the migration + // header). + + r = cls_client::migration_get(&image_ctx->md_ctx, image_ctx->header_oid, + &migration_spec); + + if (r < 0) { + lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 10) << "migration spec: " << migration_spec << dendl; + + if (migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_SRC && + migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST) { + lderr(cct) << "unexpected migration header type: " + << migration_spec.header_type << dendl; + r = -EINVAL; + return r; + } + + if (migration_spec.header_type == cls::rbd::MIGRATION_HEADER_TYPE_DST) { + ldout(cct, 10) << "the destination image is opened" << dendl; + + // Close and look for the source image. + r = image_ctx->state->close(); + image_ctx = nullptr; + if (r < 0) { + lderr(cct) << "failed closing image: " << cpp_strerror(r) + << dendl; + return r; + } + + r = util::create_ioctx(io_ctx, "source image", migration_spec.pool_id, + migration_spec.pool_namespace, &src_io_ctx); + if (r < 0) { + return r; + } + + src_image_name = migration_spec.image_name; + src_image_id = migration_spec.image_id; + } else { + ldout(cct, 10) << "the source image is opened" << dendl; + } + } else { + assert (r == -ENOENT); + + ldout(cct, 10) << "source image is not found. 
Trying trash" << dendl; + + r = trash_search(io_ctx, RBD_TRASH_IMAGE_SOURCE_MIGRATION, image_name, + &src_image_id); + if (r < 0) { + lderr(cct) << "failed to determine image id: " << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 10) << "source image id from trash: " << src_image_id << dendl; + + src_io_ctx.dup(io_ctx); + } + + if (image_ctx == nullptr) { + int flags = OPEN_FLAG_IGNORE_MIGRATING; + + if (src_image_id.empty()) { + ldout(cct, 20) << "trying to open v1 image by name " + << src_io_ctx.get_pool_name() << "/" << src_image_name + << dendl; + + flags |= OPEN_FLAG_OLD_FORMAT; + } else { + ldout(cct, 20) << "trying to open v2 image by id " + << src_io_ctx.get_pool_name() << "/" << src_image_id + << dendl; + } + + image_ctx = I::create(src_image_name, src_image_id, nullptr, src_io_ctx, + false); + r = image_ctx->state->open(flags); + if (r < 0) { + lderr(cct) << "failed to open source image " << src_io_ctx.get_pool_name() + << "/" << (src_image_id.empty() ? src_image_name : src_image_id) + << ": " << cpp_strerror(r) << dendl; + image_ctx = nullptr; + return r; + } + + r = cls_client::migration_get(&image_ctx->md_ctx, image_ctx->header_oid, + &migration_spec); + if (r < 0) { + lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 20) << "migration spec: " << migration_spec << dendl; + } + + r = util::create_ioctx(image_ctx->md_ctx, "source image", + migration_spec.pool_id, migration_spec.pool_namespace, + dst_io_ctx); + if (r < 0) { + return r; + } + + *src_image_ctx = image_ctx; + *dst_image_name = migration_spec.image_name; + *dst_image_id = migration_spec.image_id; + *flatten = migration_spec.flatten; + *mirroring = migration_spec.mirroring; + *state = migration_spec.state; + *state_description = migration_spec.state_description; + + return 0; +} + +class SteppedProgressContext : public ProgressContext { +public: + SteppedProgressContext(ProgressContext* progress_ctx, size_t 
total_steps) + : m_progress_ctx(progress_ctx), m_total_steps(total_steps) { + } + + void next_step() { + ceph_assert(m_current_step < m_total_steps); + ++m_current_step; + } + + int update_progress(uint64_t object_number, + uint64_t object_count) override { + return m_progress_ctx->update_progress( + object_number + (object_count * (m_current_step - 1)), + object_count * m_total_steps); + } + +private: + ProgressContext* m_progress_ctx; + size_t m_total_steps; + size_t m_current_step = 1; +}; + +} // anonymous namespace + +template <typename I> +int Migration<I>::prepare(librados::IoCtx& io_ctx, + const std::string &image_name, + librados::IoCtx& dest_io_ctx, + const std::string &dest_image_name_, + ImageOptions& opts) { + CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + std::string dest_image_name = dest_image_name_.empty() ? image_name : + dest_image_name_; + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << " -> " + << dest_io_ctx.get_pool_name() << "/" << dest_image_name + << ", opts=" << opts << dendl; + + auto image_ctx = I::create(image_name, "", nullptr, io_ctx, false); + int r = image_ctx->state->open(0); + if (r < 0) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + return r; + } + BOOST_SCOPE_EXIT_TPL(image_ctx) { + image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + std::list<obj_watch_t> watchers; + int flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE | + librbd::image::LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES; + C_SaferCond on_list_watchers; + auto list_watchers_request = librbd::image::ListWatchersRequest<I>::create( + *image_ctx, flags, &watchers, &on_list_watchers); + list_watchers_request->send(); + r = on_list_watchers.wait(); + if (r < 0) { + lderr(cct) << "failed listing watchers:" << cpp_strerror(r) << dendl; + return r; + } + if (!watchers.empty()) { + lderr(cct) << "image has watchers - not migrating" << dendl; + return -EBUSY; + } + + uint64_t format = 2; + if 
(opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) { + opts.set(RBD_IMAGE_OPTION_FORMAT, format); + } + if (format != 2) { + lderr(cct) << "unsupported destination image format: " << format << dendl; + return -EINVAL; + } + + uint64_t features; + { + RWLock::RLocker snap_locker(image_ctx->snap_lock); + features = image_ctx->features; + } + opts.get(RBD_IMAGE_OPTION_FEATURES, &features); + if ((features & ~RBD_FEATURES_ALL) != 0) { + lderr(cct) << "librbd does not support requested features" << dendl; + return -ENOSYS; + } + features &= ~RBD_FEATURES_INTERNAL; + opts.set(RBD_IMAGE_OPTION_FEATURES, features); + + uint64_t order = image_ctx->order; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + opts.set(RBD_IMAGE_OPTION_ORDER, order); + } + r = image::CreateRequest<I>::validate_order(cct, order); + if (r < 0) { + return r; + } + + uint64_t stripe_unit = image_ctx->stripe_unit; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + } + uint64_t stripe_count = image_ctx->stripe_count; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + } + + uint64_t flatten = 0; + if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) { + opts.unset(RBD_IMAGE_OPTION_FLATTEN); + } + + ldout(cct, 20) << "updated opts=" << opts << dendl; + + Migration migration(image_ctx, dest_io_ctx, dest_image_name, "", opts, flatten > 0, + false, cls::rbd::MIGRATION_STATE_PREPARING, "", nullptr); + r = migration.prepare(); + + return r; +} + +template <typename I> +int Migration<I>::execute(librados::IoCtx& io_ctx, + const std::string &image_name, + ProgressContext &prog_ctx) { + CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl; + + I *image_ctx; + librados::IoCtx dest_io_ctx; + std::string dest_image_name; + std::string dest_image_id; + bool flatten; + bool 
mirroring; + cls::rbd::MigrationState state; + std::string state_description; + + int r = open_source_image(io_ctx, image_name, &image_ctx, &dest_io_ctx, + &dest_image_name, &dest_image_id, &flatten, + &mirroring, &state, &state_description); + if (r < 0) { + return r; + } + + BOOST_SCOPE_EXIT_TPL(image_ctx) { + image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + if (state != cls::rbd::MIGRATION_STATE_PREPARED) { + lderr(cct) << "current migration state is '" << state << "'" + << " (should be 'prepared')" << dendl; + return -EINVAL; + } + + ldout(cct, 5) << "migrating " << image_ctx->md_ctx.get_pool_name() << "/" + << image_ctx->name << " -> " << dest_io_ctx.get_pool_name() + << "/" << dest_image_name << dendl; + + ImageOptions opts; + Migration migration(image_ctx, dest_io_ctx, dest_image_name, dest_image_id, + opts, flatten, mirroring, state, state_description, + &prog_ctx); + r = migration.execute(); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::abort(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext &prog_ctx) { + CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl; + + I *image_ctx; + librados::IoCtx dest_io_ctx; + std::string dest_image_name; + std::string dest_image_id; + bool flatten; + bool mirroring; + cls::rbd::MigrationState state; + std::string state_description; + + int r = open_source_image(io_ctx, image_name, &image_ctx, &dest_io_ctx, + &dest_image_name, &dest_image_id, &flatten, + &mirroring, &state, &state_description); + if (r < 0) { + return r; + } + + ldout(cct, 5) << "canceling incomplete migration " + << image_ctx->md_ctx.get_pool_name() << "/" << image_ctx->name + << " -> " << dest_io_ctx.get_pool_name() << "/" << dest_image_name + << dendl; + + ImageOptions opts; + Migration migration(image_ctx, dest_io_ctx, dest_image_name, dest_image_id, + opts, flatten, mirroring, state, 
state_description, + &prog_ctx); + r = migration.abort(); + + image_ctx->state->close(); + + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::commit(librados::IoCtx& io_ctx, + const std::string &image_name, + ProgressContext &prog_ctx) { + CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl; + + I *image_ctx; + librados::IoCtx dest_io_ctx; + std::string dest_image_name; + std::string dest_image_id; + bool flatten; + bool mirroring; + cls::rbd::MigrationState state; + std::string state_description; + + int r = open_source_image(io_ctx, image_name, &image_ctx, &dest_io_ctx, + &dest_image_name, &dest_image_id, &flatten, + &mirroring, &state, &state_description); + if (r < 0) { + return r; + } + + if (state != cls::rbd::MIGRATION_STATE_EXECUTED) { + lderr(cct) << "current migration state is '" << state << "'" + << " (should be 'executed')" << dendl; + image_ctx->state->close(); + return -EINVAL; + } + + ldout(cct, 5) << "migrating " << image_ctx->md_ctx.get_pool_name() << "/" + << image_ctx->name << " -> " << dest_io_ctx.get_pool_name() + << "/" << dest_image_name << dendl; + + ImageOptions opts; + Migration migration(image_ctx, dest_io_ctx, dest_image_name, dest_image_id, + opts, flatten, mirroring, state, state_description, + &prog_ctx); + r = migration.commit(); + + // image_ctx is closed in commit when removing src image + + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::status(librados::IoCtx& io_ctx, + const std::string &image_name, + image_migration_status_t *status) { + CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl; + + I *image_ctx; + librados::IoCtx dest_io_ctx; + std::string dest_image_name; + std::string dest_image_id; + bool flatten; + bool mirroring; + cls::rbd::MigrationState state; + 
std::string state_description; + + int r = open_source_image(io_ctx, image_name, &image_ctx, &dest_io_ctx, + &dest_image_name, &dest_image_id, &flatten, + &mirroring, &state, &state_description); + if (r < 0) { + return r; + } + + ldout(cct, 5) << "migrating " << image_ctx->md_ctx.get_pool_name() << "/" + << image_ctx->name << " -> " << dest_io_ctx.get_pool_name() + << "/" << dest_image_name << dendl; + + ImageOptions opts; + Migration migration(image_ctx, dest_io_ctx, dest_image_name, dest_image_id, + opts, flatten, mirroring, state, state_description, + nullptr); + r = migration.status(status); + + image_ctx->state->close(); + + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +Migration<I>::Migration(I *src_image_ctx, librados::IoCtx& dst_io_ctx, + const std::string &dstname, + const std::string &dst_image_id, + ImageOptions& opts, bool flatten, bool mirroring, + cls::rbd::MigrationState state, + const std::string &state_description, + ProgressContext *prog_ctx) + : m_cct(static_cast<CephContext *>(dst_io_ctx.cct())), + m_src_image_ctx(src_image_ctx), m_dst_io_ctx(dst_io_ctx), + m_src_old_format(m_src_image_ctx->old_format), + m_src_image_name(m_src_image_ctx->old_format ? m_src_image_ctx->name : ""), + m_src_image_id(m_src_image_ctx->id), + m_src_header_oid(m_src_image_ctx->header_oid), m_dst_image_name(dstname), + m_dst_image_id(dst_image_id.empty() ? 
+ util::generate_image_id(m_dst_io_ctx) : dst_image_id), + m_dst_header_oid(util::header_name(m_dst_image_id)), m_image_options(opts), + m_flatten(flatten), m_mirroring(mirroring), m_prog_ctx(prog_ctx), + m_src_migration_spec(cls::rbd::MIGRATION_HEADER_TYPE_SRC, + m_dst_io_ctx.get_id(), m_dst_io_ctx.get_namespace(), + m_dst_image_name, m_dst_image_id, {}, 0, flatten, + mirroring, state, state_description), + m_dst_migration_spec(cls::rbd::MIGRATION_HEADER_TYPE_DST, + src_image_ctx->md_ctx.get_id(), + src_image_ctx->md_ctx.get_namespace(), + m_src_image_ctx->name, m_src_image_ctx->id, {}, 0, + flatten, mirroring, state, state_description) { + m_src_io_ctx.dup(src_image_ctx->md_ctx); +} + +template <typename I> +int Migration<I>::prepare() { + ldout(m_cct, 10) << dendl; + + int r = validate_src_snaps(); + if (r < 0) { + return r; + } + + r = disable_mirroring(m_src_image_ctx, &m_mirroring); + if (r < 0) { + return r; + } + + r = unlink_src_image(); + if (r < 0) { + enable_mirroring(m_src_image_ctx, m_mirroring); + return r; + } + + r = set_migration(); + if (r < 0) { + relink_src_image(); + enable_mirroring(m_src_image_ctx, m_mirroring); + return r; + } + + r = create_dst_image(); + if (r < 0) { + abort(); + return r; + } + + ldout(m_cct, 10) << "succeeded" << dendl; + + return 0; +} + +template <typename I> +int Migration<I>::execute() { + ldout(m_cct, 10) << dendl; + + auto dst_image_ctx = I::create(m_dst_image_name, m_dst_image_id, nullptr, + m_dst_io_ctx, false); + int r = dst_image_ctx->state->open(0); + if (r < 0) { + lderr(m_cct) << "failed to open destination image: " << cpp_strerror(r) + << dendl; + return r; + } + + BOOST_SCOPE_EXIT_TPL(dst_image_ctx) { + dst_image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + r = set_state(cls::rbd::MIGRATION_STATE_EXECUTING, ""); + if (r < 0) { + return r; + } + + while (true) { + MigrationProgressContext prog_ctx(m_src_io_ctx, m_src_header_oid, + cls::rbd::MIGRATION_STATE_EXECUTING, + m_prog_ctx); + r = 
dst_image_ctx->operations->migrate(prog_ctx); + if (r == -EROFS) { + RWLock::RLocker owner_locker(dst_image_ctx->owner_lock); + if (dst_image_ctx->exclusive_lock != nullptr && + !dst_image_ctx->exclusive_lock->accept_ops()) { + ldout(m_cct, 5) << "lost exclusive lock, retrying remote" << dendl; + continue; + } + } + break; + } + if (r < 0) { + lderr(m_cct) << "migration failed: " << cpp_strerror(r) << dendl; + return r; + } + + r = set_state(cls::rbd::MIGRATION_STATE_EXECUTED, ""); + if (r < 0) { + return r; + } + + dst_image_ctx->notify_update(); + + ldout(m_cct, 10) << "succeeded" << dendl; + + return 0; +} + +template <typename I> +int Migration<I>::abort() { + ldout(m_cct, 10) << dendl; + + int r; + + m_src_image_ctx->owner_lock.get_read(); + if (m_src_image_ctx->exclusive_lock != nullptr && + !m_src_image_ctx->exclusive_lock->is_lock_owner()) { + C_SaferCond ctx; + m_src_image_ctx->exclusive_lock->acquire_lock(&ctx); + m_src_image_ctx->owner_lock.put_read(); + r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "error acquiring exclusive lock: " << cpp_strerror(r) + << dendl; + return r; + } + } else { + m_src_image_ctx->owner_lock.put_read(); + } + + group_info_t group_info; + group_info.pool = -1; + + auto dst_image_ctx = I::create(m_dst_image_name, m_dst_image_id, nullptr, + m_dst_io_ctx, false); + r = dst_image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING); + if (r < 0) { + ldout(m_cct, 1) << "failed to open destination image: " << cpp_strerror(r) + << dendl; + } else { + BOOST_SCOPE_EXIT_TPL(&dst_image_ctx) { + if (dst_image_ctx != nullptr) { + dst_image_ctx->state->close(); + } + } BOOST_SCOPE_EXIT_END; + + std::list<obj_watch_t> watchers; + int flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE | + librbd::image::LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES; + C_SaferCond on_list_watchers; + auto list_watchers_request = librbd::image::ListWatchersRequest<I>::create( + *dst_image_ctx, flags, &watchers, &on_list_watchers); + 
list_watchers_request->send(); + r = on_list_watchers.wait(); + if (r < 0) { + lderr(m_cct) << "failed listing watchers:" << cpp_strerror(r) << dendl; + return r; + } + if (!watchers.empty()) { + lderr(m_cct) << "image has watchers - cannot abort migration" << dendl; + return -EBUSY; + } + + // ensure destination image is now read-only + r = set_state(cls::rbd::MIGRATION_STATE_ABORTING, ""); + if (r < 0) { + return r; + } + + // copy dst HEAD -> src HEAD + SteppedProgressContext progress_ctx(m_prog_ctx, 2); + revert_data(dst_image_ctx, m_src_image_ctx, &progress_ctx); + progress_ctx.next_step(); + + ldout(m_cct, 10) << "relinking children" << dendl; + r = relink_children(dst_image_ctx, m_src_image_ctx); + if (r < 0) { + return r; + } + + ldout(m_cct, 10) << "removing dst image snapshots" << dendl; + + std::vector<librbd::snap_info_t> snaps; + r = snap_list(dst_image_ctx, snaps); + if (r < 0) { + lderr(m_cct) << "failed listing snapshots: " << cpp_strerror(r) + << dendl; + return r; + } + + for (auto &snap : snaps) { + librbd::NoOpProgressContext prog_ctx; + int r = snap_remove(dst_image_ctx, snap.name.c_str(), + RBD_SNAP_REMOVE_UNPROTECT, prog_ctx); + if (r < 0) { + lderr(m_cct) << "failed removing snapshot: " << cpp_strerror(r) + << dendl; + return r; + } + } + + ldout(m_cct, 10) << "removing group" << dendl; + + r = remove_group(dst_image_ctx, &group_info); + if (r < 0 && r != -ENOENT) { + return r; + } + + ldout(m_cct, 10) << "removing dst image" << dendl; + + ceph_assert(dst_image_ctx->ignore_migrating); + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(m_cct, &thread_pool, &op_work_queue); + C_SaferCond on_remove; + auto req = librbd::image::RemoveRequest<>::create( + m_dst_io_ctx, dst_image_ctx, false, false, progress_ctx, op_work_queue, + &on_remove); + req->send(); + r = on_remove.wait(); + + dst_image_ctx = nullptr; + + if (r < 0) { + lderr(m_cct) << "failed removing destination image '" + << 
m_dst_io_ctx.get_pool_name() << "/" << m_dst_image_name + << " (" << m_dst_image_id << ")': " << cpp_strerror(r) + << dendl; + return r; + } + } + + r = relink_src_image(); + if (r < 0) { + return r; + } + + r = add_group(m_src_image_ctx, group_info); + if (r < 0) { + return r; + } + + r = remove_migration(m_src_image_ctx); + if (r < 0) { + return r; + } + + r = enable_mirroring(m_src_image_ctx, m_mirroring); + if (r < 0) { + return r; + } + + ldout(m_cct, 10) << "succeeded" << dendl; + + return 0; +} + +template <typename I> +int Migration<I>::commit() { + ldout(m_cct, 10) << dendl; + + BOOST_SCOPE_EXIT_TPL(&m_src_image_ctx) { + if (m_src_image_ctx != nullptr) { + m_src_image_ctx->state->close(); + } + } BOOST_SCOPE_EXIT_END; + + auto dst_image_ctx = I::create(m_dst_image_name, m_dst_image_id, nullptr, + m_dst_io_ctx, false); + int r = dst_image_ctx->state->open(0); + if (r < 0) { + lderr(m_cct) << "failed to open destination image: " << cpp_strerror(r) + << dendl; + return r; + } + + BOOST_SCOPE_EXIT_TPL(dst_image_ctx) { + dst_image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + r = remove_migration(dst_image_ctx); + if (r < 0) { + return r; + } + + r = remove_src_image(); + if (r < 0) { + return r; + } + + r = enable_mirroring(dst_image_ctx, m_mirroring); + if (r < 0) { + return r; + } + + ldout(m_cct, 10) << "succeeded" << dendl; + + return 0; +} + +template <typename I> +int Migration<I>::status(image_migration_status_t *status) { + ldout(m_cct, 10) << dendl; + + status->source_pool_id = m_dst_migration_spec.pool_id; + status->source_pool_namespace = m_dst_migration_spec.pool_namespace; + status->source_image_name = m_dst_migration_spec.image_name; + status->source_image_id = m_dst_migration_spec.image_id; + status->dest_pool_id = m_src_migration_spec.pool_id; + status->dest_pool_namespace = m_src_migration_spec.pool_namespace; + status->dest_image_name = m_src_migration_spec.image_name; + status->dest_image_id = m_src_migration_spec.image_id; + + switch 
(m_src_migration_spec.state) { + case cls::rbd::MIGRATION_STATE_ERROR: + status->state = RBD_IMAGE_MIGRATION_STATE_ERROR; + break; + case cls::rbd::MIGRATION_STATE_PREPARING: + status->state = RBD_IMAGE_MIGRATION_STATE_PREPARING; + break; + case cls::rbd::MIGRATION_STATE_PREPARED: + status->state = RBD_IMAGE_MIGRATION_STATE_PREPARED; + break; + case cls::rbd::MIGRATION_STATE_EXECUTING: + status->state = RBD_IMAGE_MIGRATION_STATE_EXECUTING; + break; + case cls::rbd::MIGRATION_STATE_EXECUTED: + status->state = RBD_IMAGE_MIGRATION_STATE_EXECUTED; + break; + default: + status->state = RBD_IMAGE_MIGRATION_STATE_UNKNOWN; + break; + } + + status->state_description = m_src_migration_spec.state_description; + + return 0; +} + +template <typename I> +int Migration<I>::set_state(cls::rbd::MigrationState state, + const std::string &description) { + int r = cls_client::migration_set_state(&m_src_io_ctx, m_src_header_oid, + state, description); + if (r < 0) { + lderr(m_cct) << "failed to set source migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + r = cls_client::migration_set_state(&m_dst_io_ctx, m_dst_header_oid, state, + description); + if (r < 0) { + lderr(m_cct) << "failed to set destination migration header: " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::list_src_snaps(std::vector<librbd::snap_info_t> *snaps) { + ldout(m_cct, 10) << dendl; + + int r = snap_list(m_src_image_ctx, *snaps); + if (r < 0) { + lderr(m_cct) << "failed listing snapshots: " << cpp_strerror(r) << dendl; + return r; + } + + for (auto &snap : *snaps) { + librbd::snap_namespace_type_t namespace_type; + r = Snapshot<I>::get_namespace_type(m_src_image_ctx, snap.id, + &namespace_type); + if (r < 0) { + lderr(m_cct) << "error getting snap namespace type: " << cpp_strerror(r) + << dendl; + return r; + } + + if (namespace_type != RBD_SNAP_NAMESPACE_TYPE_USER) { + if (namespace_type == RBD_SNAP_NAMESPACE_TYPE_TRASH) { + 
lderr(m_cct) << "image has snapshots with linked clones that must be " + << "deleted or flattened before the image can be migrated" + << dendl; + } else { + lderr(m_cct) << "image has non-user type snapshots " + << "that are not supported by migration" << dendl; + } + return -EBUSY; + } + } + + return 0; +} + +template <typename I> +int Migration<I>::validate_src_snaps() { + ldout(m_cct, 10) << dendl; + + std::vector<librbd::snap_info_t> snaps; + int r = list_src_snaps(&snaps); + if (r < 0) { + return r; + } + + uint64_t dst_features = 0; + r = m_image_options.get(RBD_IMAGE_OPTION_FEATURES, &dst_features); + ceph_assert(r == 0); + + if (!m_src_image_ctx->test_features(RBD_FEATURE_LAYERING)) { + return 0; + } + + for (auto &snap : snaps) { + RWLock::RLocker snap_locker(m_src_image_ctx->snap_lock); + cls::rbd::ParentImageSpec parent_spec{m_src_image_ctx->md_ctx.get_id(), + m_src_image_ctx->md_ctx.get_namespace(), + m_src_image_ctx->id, snap.id}; + std::vector<librbd::linked_image_spec_t> child_images; + r = api::Image<I>::list_children(m_src_image_ctx, parent_spec, + &child_images); + if (r < 0) { + lderr(m_cct) << "failed listing children: " << cpp_strerror(r) + << dendl; + return r; + } + if (!child_images.empty()) { + ldout(m_cct, 1) << m_src_image_ctx->name << "@" << snap.name + << " has children" << dendl; + + if ((dst_features & RBD_FEATURE_LAYERING) == 0) { + lderr(m_cct) << "can't migrate to destination without layering feature: " + << "image has children" << dendl; + return -EINVAL; + } + } + } + + return 0; +} + + +template <typename I> +int Migration<I>::set_migration() { + ldout(m_cct, 10) << dendl; + + m_src_image_ctx->ignore_migrating = true; + + int r = cls_client::migration_set(&m_src_io_ctx, m_src_header_oid, + m_src_migration_spec); + if (r < 0) { + lderr(m_cct) << "failed to set migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + m_src_image_ctx->notify_update(); + + return 0; +} + +template <typename I> +int 
Migration<I>::remove_migration(I *image_ctx) { + ldout(m_cct, 10) << dendl; + + int r; + + r = cls_client::migration_remove(&image_ctx->md_ctx, image_ctx->header_oid); + if (r == -ENOENT) { + r = 0; + } + if (r < 0) { + lderr(m_cct) << "failed removing migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + image_ctx->notify_update(); + + return 0; +} + +template <typename I> +int Migration<I>::unlink_src_image() { + if (m_src_old_format) { + return v1_unlink_src_image(); + } else { + return v2_unlink_src_image(); + } +} + +template <typename I> +int Migration<I>::v1_unlink_src_image() { + ldout(m_cct, 10) << dendl; + + int r = tmap_rm(m_src_io_ctx, m_src_image_name); + if (r < 0) { + lderr(m_cct) << "failed removing " << m_src_image_name << " from tmap: " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::v2_unlink_src_image() { + ldout(m_cct, 10) << dendl; + + m_src_image_ctx->owner_lock.get_read(); + if (m_src_image_ctx->exclusive_lock != nullptr && + m_src_image_ctx->exclusive_lock->is_lock_owner()) { + C_SaferCond ctx; + m_src_image_ctx->exclusive_lock->release_lock(&ctx); + m_src_image_ctx->owner_lock.put_read(); + int r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "error releasing exclusive lock: " << cpp_strerror(r) + << dendl; + return r; + } + } else { + m_src_image_ctx->owner_lock.put_read(); + } + + int r = Trash<I>::move(m_src_io_ctx, RBD_TRASH_IMAGE_SOURCE_MIGRATION, + m_src_image_ctx->name, 0); + if (r < 0) { + lderr(m_cct) << "failed moving image to trash: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::relink_src_image() { + if (m_src_old_format) { + return v1_relink_src_image(); + } else { + return v2_relink_src_image(); + } +} + +template <typename I> +int Migration<I>::v1_relink_src_image() { + ldout(m_cct, 10) << dendl; + + int r = tmap_set(m_src_io_ctx, m_src_image_name); + if (r < 0) { + lderr(m_cct) << 
"failed adding " << m_src_image_name << " to tmap: " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::v2_relink_src_image() { + ldout(m_cct, 10) << dendl; + + int r = Trash<I>::restore(m_src_io_ctx, + {cls::rbd::TRASH_IMAGE_SOURCE_MIGRATION}, + m_src_image_ctx->id, m_src_image_ctx->name); + if (r < 0) { + lderr(m_cct) << "failed restoring image from trash: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::create_dst_image() { + ldout(m_cct, 10) << dendl; + + uint64_t size; + cls::rbd::ParentImageSpec parent_spec; + { + RWLock::RLocker snap_locker(m_src_image_ctx->snap_lock); + RWLock::RLocker parent_locker(m_src_image_ctx->parent_lock); + size = m_src_image_ctx->size; + + // use oldest snapshot or HEAD for parent spec + if (!m_src_image_ctx->snap_info.empty()) { + parent_spec = m_src_image_ctx->snap_info.begin()->second.parent.spec; + } else { + parent_spec = m_src_image_ctx->parent_md.spec; + } + } + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(m_cct, &thread_pool, &op_work_queue); + + ConfigProxy config{m_cct->_conf}; + api::Config<I>::apply_pool_overrides(m_dst_io_ctx, &config); + + int r; + C_SaferCond on_create; + librados::IoCtx parent_io_ctx; + if (parent_spec.pool_id == -1) { + auto *req = image::CreateRequest<I>::create( + config, m_dst_io_ctx, m_dst_image_name, m_dst_image_id, size, + m_image_options, "", "", true /* skip_mirror_enable */, op_work_queue, + &on_create); + req->send(); + } else { + r = util::create_ioctx(m_src_image_ctx->md_ctx, "destination image", + parent_spec.pool_id, parent_spec.pool_namespace, + &parent_io_ctx); + if (r < 0) { + return r; + } + + auto *req = image::CloneRequest<I>::create( + config, parent_io_ctx, parent_spec.image_id, "", parent_spec.snap_id, + m_dst_io_ctx, m_dst_image_name, m_dst_image_id, m_image_options, "", "", + op_work_queue, &on_create); + 
req->send(); + } + + r = on_create.wait(); + if (r < 0) { + lderr(m_cct) << "header creation failed: " << cpp_strerror(r) << dendl; + return r; + } + + auto dst_image_ctx = I::create(m_dst_image_name, m_dst_image_id, nullptr, + m_dst_io_ctx, false); + + r = dst_image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING); + if (r < 0) { + lderr(m_cct) << "failed to open newly created header: " << cpp_strerror(r) + << dendl; + return r; + } + + BOOST_SCOPE_EXIT_TPL(dst_image_ctx) { + dst_image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + { + RWLock::RLocker owner_locker(dst_image_ctx->owner_lock); + r = dst_image_ctx->operations->prepare_image_update( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true); + if (r < 0) { + lderr(m_cct) << "cannot obtain exclusive lock" << dendl; + return r; + } + if (dst_image_ctx->exclusive_lock != nullptr) { + dst_image_ctx->exclusive_lock->block_requests(0); + } + } + + SnapSeqs snap_seqs; + + C_SaferCond on_snapshot_copy; + auto snapshot_copy_req = librbd::deep_copy::SnapshotCopyRequest<I>::create( + m_src_image_ctx, dst_image_ctx, 0, CEPH_NOSNAP, 0, m_flatten, + m_src_image_ctx->op_work_queue, &snap_seqs, &on_snapshot_copy); + snapshot_copy_req->send(); + r = on_snapshot_copy.wait(); + if (r < 0) { + lderr(m_cct) << "failed to copy snapshots: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond on_metadata_copy; + auto metadata_copy_req = librbd::deep_copy::MetadataCopyRequest<I>::create( + m_src_image_ctx, dst_image_ctx, &on_metadata_copy); + metadata_copy_req->send(); + r = on_metadata_copy.wait(); + if (r < 0) { + lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl; + return r; + } + + m_dst_migration_spec = {cls::rbd::MIGRATION_HEADER_TYPE_DST, + m_src_io_ctx.get_id(), m_src_io_ctx.get_namespace(), + m_src_image_name, m_src_image_id, snap_seqs, size, + m_flatten, m_mirroring, + cls::rbd::MIGRATION_STATE_PREPARING, ""}; + + r = cls_client::migration_set(&m_dst_io_ctx, m_dst_header_oid, + 
m_dst_migration_spec); + if (r < 0) { + lderr(m_cct) << "failed to set migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + r = update_group(m_src_image_ctx, dst_image_ctx); + if (r < 0) { + return r; + } + + r = set_state(cls::rbd::MIGRATION_STATE_PREPARED, ""); + if (r < 0) { + return r; + } + + r = dst_image_ctx->state->refresh(); + if (r < 0) { + lderr(m_cct) << "failed to refresh destination image: " << cpp_strerror(r) + << dendl; + return r; + } + + r = relink_children(m_src_image_ctx, dst_image_ctx); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::remove_group(I *image_ctx, group_info_t *group_info) { + int r = librbd::api::Group<I>::image_get_group(image_ctx, group_info); + if (r < 0) { + lderr(m_cct) << "failed to get image group: " << cpp_strerror(r) << dendl; + return r; + } + + if (group_info->pool == -1) { + return -ENOENT; + } + + ceph_assert(!image_ctx->id.empty()); + + ldout(m_cct, 10) << dendl; + + IoCtx group_ioctx; + r = util::create_ioctx(image_ctx->md_ctx, "group", group_info->pool, {}, + &group_ioctx); + if (r < 0) { + return r; + } + + r = librbd::api::Group<I>::image_remove_by_id(group_ioctx, + group_info->name.c_str(), + image_ctx->md_ctx, + image_ctx->id.c_str()); + if (r < 0) { + lderr(m_cct) << "failed to remove image from group: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::add_group(I *image_ctx, group_info_t &group_info) { + if (group_info.pool == -1) { + return 0; + } + + ldout(m_cct, 10) << dendl; + + IoCtx group_ioctx; + int r = util::create_ioctx(image_ctx->md_ctx, "group", group_info.pool, {}, + &group_ioctx); + if (r < 0) { + return r; + } + + r = librbd::api::Group<I>::image_add(group_ioctx, group_info.name.c_str(), + image_ctx->md_ctx, + image_ctx->name.c_str()); + if (r < 0) { + lderr(m_cct) << "failed to add image to group: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template 
<typename I> +int Migration<I>::update_group(I *from_image_ctx, I *to_image_ctx) { + ldout(m_cct, 10) << dendl; + + group_info_t group_info; + + int r = remove_group(from_image_ctx, &group_info); + if (r < 0) { + return r == -ENOENT ? 0 : r; + } + + r = add_group(to_image_ctx, group_info); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::disable_mirroring(I *image_ctx, bool *was_enabled) { + *was_enabled = false; + + if (!image_ctx->test_features(RBD_FEATURE_JOURNALING)) { + return 0; + } + + cls::rbd::MirrorImage mirror_image; + int r = cls_client::mirror_image_get(&image_ctx->md_ctx, image_ctx->id, + &mirror_image); + if (r == -ENOENT) { + ldout(m_cct, 10) << "mirroring is not enabled for this image" << dendl; + return 0; + } + + if (r < 0) { + lderr(m_cct) << "failed to retrieve mirror image: " << cpp_strerror(r) + << dendl; + return r; + } + + if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + *was_enabled = true; + } + + ldout(m_cct, 10) << dendl; + + C_SaferCond ctx; + auto req = mirror::DisableRequest<I>::create(image_ctx, false, true, &ctx); + req->send(); + r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "failed to disable mirroring: " << cpp_strerror(r) + << dendl; + return r; + } + + m_src_migration_spec.mirroring = true; + + return 0; +} + +template <typename I> +int Migration<I>::enable_mirroring(I *image_ctx, bool was_enabled) { + + if (!image_ctx->test_features(RBD_FEATURE_JOURNALING)) { + return 0; + } + + cls::rbd::MirrorMode mirror_mode; + int r = cls_client::mirror_mode_get(&image_ctx->md_ctx, &mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + return r; + } + + if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + ldout(m_cct, 10) << "mirroring is not enabled for destination pool" + << dendl; + return 0; + } + if (mirror_mode == cls::rbd::MIRROR_MODE_IMAGE && !was_enabled) { + ldout(m_cct, 10) << 
"mirroring is not enabled for image" << dendl; + return 0; + } + + ldout(m_cct, 10) << dendl; + + C_SaferCond ctx; + auto req = mirror::EnableRequest<I>::create(image_ctx->md_ctx, image_ctx->id, + "", image_ctx->op_work_queue, + &ctx); + req->send(); + r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +// When relinking children we should be careful as it my be interrupted +// at any moment by some reason and we may end up in an inconsistent +// state, which we have to be able to fix with "migration abort". Below +// are all possible states during migration (P1 - sourse parent, P2 - +// destination parent, C - child): +// +// P1 P2 P1 P2 P1 P2 P1 P2 +// ^\ \ ^ \ /^ /^ +// \v v/ v/ v/ +// C C C C +// +// 1 2 3 4 +// +// (1) and (4) are the initial and the final consistent states. (2) +// and (3) are intermediate inconsistent states that have to be fixed +// by relink_children running in "migration abort" mode. For this, it +// scans P2 for all children attached and relinks (fixes) states (3) +// and (4) to state (1). Then it scans P1 for remaining children and +// fixes the states (2). + +template <typename I> +int Migration<I>::relink_children(I *from_image_ctx, I *to_image_ctx) { + ldout(m_cct, 10) << dendl; + + std::vector<librbd::snap_info_t> snaps; + int r = list_src_snaps(&snaps); + if (r < 0) { + return r; + } + + bool migration_abort = (to_image_ctx == m_src_image_ctx); + + for (auto it = snaps.begin(); it != snaps.end(); it++) { + auto &snap = *it; + std::vector<librbd::linked_image_spec_t> src_child_images; + + if (from_image_ctx != m_src_image_ctx) { + ceph_assert(migration_abort); + + // We run list snaps against the src image to get only those snapshots + // that are migrated. If the "from" image is not the src image + // (abort migration case), we need to remap snap ids. 
+ // Also collect the list of the children currently attached to the + // source, so we could make a proper decision later about relinking. + + RWLock::RLocker src_snap_locker(to_image_ctx->snap_lock); + cls::rbd::ParentImageSpec src_parent_spec{to_image_ctx->md_ctx.get_id(), + to_image_ctx->md_ctx.get_namespace(), + to_image_ctx->id, snap.id}; + r = api::Image<I>::list_children(to_image_ctx, src_parent_spec, + &src_child_images); + if (r < 0) { + lderr(m_cct) << "failed listing children: " << cpp_strerror(r) + << dendl; + return r; + } + + RWLock::RLocker snap_locker(from_image_ctx->snap_lock); + snap.id = from_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace(), + snap.name); + if (snap.id == CEPH_NOSNAP) { + ldout(m_cct, 5) << "skipping snapshot " << snap.name << dendl; + continue; + } + } + + std::vector<librbd::linked_image_spec_t> child_images; + { + RWLock::RLocker snap_locker(from_image_ctx->snap_lock); + cls::rbd::ParentImageSpec parent_spec{from_image_ctx->md_ctx.get_id(), + from_image_ctx->md_ctx.get_namespace(), + from_image_ctx->id, snap.id}; + r = api::Image<I>::list_children(from_image_ctx, parent_spec, + &child_images); + if (r < 0) { + lderr(m_cct) << "failed listing children: " << cpp_strerror(r) + << dendl; + return r; + } + } + + for (auto &child_image : child_images) { + r = relink_child(from_image_ctx, to_image_ctx, snap, child_image, + migration_abort, true); + if (r < 0) { + return r; + } + + src_child_images.erase(std::remove(src_child_images.begin(), + src_child_images.end(), child_image), + src_child_images.end()); + } + + for (auto &child_image : src_child_images) { + r = relink_child(from_image_ctx, to_image_ctx, snap, child_image, + migration_abort, false); + if (r < 0) { + return r; + } + } + } + + return 0; +} + +template <typename I> +int Migration<I>::relink_child(I *from_image_ctx, I *to_image_ctx, + const librbd::snap_info_t &from_snap, + const librbd::linked_image_spec_t &child_image, + bool migration_abort, bool 
reattach_child) { + ldout(m_cct, 10) << from_snap.name << " " << child_image.pool_name << "/" + << child_image.pool_namespace << "/" + << child_image.image_name << " (migration_abort=" + << migration_abort << ", reattach_child=" << reattach_child + << ")" << dendl; + + librados::snap_t to_snap_id; + { + RWLock::RLocker snap_locker(to_image_ctx->snap_lock); + to_snap_id = to_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace(), + from_snap.name); + if (to_snap_id == CEPH_NOSNAP) { + lderr(m_cct) << "no snapshot " << from_snap.name << " on destination image" + << dendl; + return -ENOENT; + } + } + + librados::IoCtx child_io_ctx; + int r = util::create_ioctx(to_image_ctx->md_ctx, + "child image " + child_image.image_name, + child_image.pool_id, child_image.pool_namespace, + &child_io_ctx); + if (r < 0) { + return r; + } + + I *child_image_ctx = I::create("", child_image.image_id, nullptr, + child_io_ctx, false); + r = child_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + if (r < 0) { + lderr(m_cct) << "failed to open child image: " << cpp_strerror(r) << dendl; + return r; + } + BOOST_SCOPE_EXIT_TPL(child_image_ctx) { + child_image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + uint32_t clone_format = 1; + if (child_image_ctx->test_op_features(RBD_OPERATION_FEATURE_CLONE_CHILD)) { + clone_format = 2; + } + + cls::rbd::ParentImageSpec parent_spec; + uint64_t parent_overlap; + { + RWLock::RLocker snap_locker(child_image_ctx->snap_lock); + RWLock::RLocker parent_locker(child_image_ctx->parent_lock); + + // use oldest snapshot or HEAD for parent spec + if (!child_image_ctx->snap_info.empty()) { + parent_spec = child_image_ctx->snap_info.begin()->second.parent.spec; + parent_overlap = child_image_ctx->snap_info.begin()->second.parent.overlap; + } else { + parent_spec = child_image_ctx->parent_md.spec; + parent_overlap = child_image_ctx->parent_md.overlap; + } + } + + if (migration_abort && + parent_spec.pool_id == to_image_ctx->md_ctx.get_id() && + 
parent_spec.pool_namespace == to_image_ctx->md_ctx.get_namespace() && + parent_spec.image_id == to_image_ctx->id && + parent_spec.snap_id == to_snap_id) { + ldout(m_cct, 10) << "no need for parent re-attach" << dendl; + } else { + if (parent_spec.pool_id != from_image_ctx->md_ctx.get_id() || + parent_spec.pool_namespace != from_image_ctx->md_ctx.get_namespace() || + parent_spec.image_id != from_image_ctx->id || + parent_spec.snap_id != from_snap.id) { + lderr(m_cct) << "parent is not source image: " << parent_spec.pool_id + << "/" << parent_spec.pool_namespace << "/" + << parent_spec.image_id << "@" << parent_spec.snap_id + << dendl; + return -ESTALE; + } + + parent_spec.pool_id = to_image_ctx->md_ctx.get_id(); + parent_spec.pool_namespace = to_image_ctx->md_ctx.get_namespace(); + parent_spec.image_id = to_image_ctx->id; + parent_spec.snap_id = to_snap_id; + + C_SaferCond on_reattach_parent; + auto reattach_parent_req = image::AttachParentRequest<I>::create( + *child_image_ctx, parent_spec, parent_overlap, true, &on_reattach_parent); + reattach_parent_req->send(); + r = on_reattach_parent.wait(); + if (r < 0) { + lderr(m_cct) << "failed to re-attach parent: " << cpp_strerror(r) << dendl; + return r; + } + } + + if (reattach_child) { + C_SaferCond on_reattach_child; + auto reattach_child_req = image::AttachChildRequest<I>::create( + child_image_ctx, to_image_ctx, to_snap_id, from_image_ctx, from_snap.id, + clone_format, &on_reattach_child); + reattach_child_req->send(); + r = on_reattach_child.wait(); + if (r < 0) { + lderr(m_cct) << "failed to re-attach child: " << cpp_strerror(r) << dendl; + return r; + } + } + + child_image_ctx->notify_update(); + + return 0; +} + +template <typename I> +int Migration<I>::remove_src_image() { + ldout(m_cct, 10) << dendl; + + std::vector<librbd::snap_info_t> snaps; + int r = list_src_snaps(&snaps); + if (r < 0) { + return r; + } + + for (auto it = snaps.rbegin(); it != snaps.rend(); it++) { + auto &snap = *it; + + 
librbd::NoOpProgressContext prog_ctx; + int r = snap_remove(m_src_image_ctx, snap.name.c_str(), + RBD_SNAP_REMOVE_UNPROTECT, prog_ctx); + if (r < 0) { + lderr(m_cct) << "failed removing source image snapshot '" << snap.name + << "': " << cpp_strerror(r) << dendl; + return r; + } + } + + ceph_assert(m_src_image_ctx->ignore_migrating); + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(m_cct, &thread_pool, &op_work_queue); + C_SaferCond on_remove; + auto req = librbd::image::RemoveRequest<I>::create( + m_src_io_ctx, m_src_image_ctx, false, true, *m_prog_ctx, op_work_queue, + &on_remove); + req->send(); + r = on_remove.wait(); + + m_src_image_ctx = nullptr; + + // For old format image it will return -ENOENT due to expected + // tmap_rm failure at the end. + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed removing source image: " << cpp_strerror(r) + << dendl; + return r; + } + + if (!m_src_image_id.empty()) { + r = cls_client::trash_remove(&m_src_io_ctx, m_src_image_id); + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing image " << m_src_image_id + << " from rbd_trash object" << dendl; + } + } + + return 0; +} + +template <typename I> +int Migration<I>::revert_data(I* src_image_ctx, I* dst_image_ctx, + ProgressContext* prog_ctx) { + ldout(m_cct, 10) << dendl; + + cls::rbd::MigrationSpec migration_spec; + int r = cls_client::migration_get(&src_image_ctx->md_ctx, + src_image_ctx->header_oid, + &migration_spec); + + if (r < 0) { + lderr(m_cct) << "failed retrieving migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + if (migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST) { + lderr(m_cct) << "unexpected migration header type: " + << migration_spec.header_type << dendl; + return -EINVAL; + } + + uint64_t src_snap_id_start = 0; + uint64_t src_snap_id_end = CEPH_NOSNAP; + uint64_t dst_snap_id_start = 0; + if (!migration_spec.snap_seqs.empty()) { + src_snap_id_start = 
migration_spec.snap_seqs.rbegin()->second; + } + + // we only care about the HEAD revision so only add a single mapping to + // represent the most recent state + SnapSeqs snap_seqs; + snap_seqs[CEPH_NOSNAP] = CEPH_NOSNAP; + + ldout(m_cct, 20) << "src_snap_id_start=" << src_snap_id_start << ", " + << "src_snap_id_end=" << src_snap_id_end << ", " + << "snap_seqs=" << snap_seqs << dendl; + + C_SaferCond ctx; + auto request = deep_copy::ImageCopyRequest<I>::create( + src_image_ctx, dst_image_ctx, src_snap_id_start, src_snap_id_end, + dst_snap_id_start, false, {}, snap_seqs, prog_ctx, &ctx); + request->send(); + + r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "error reverting destination image data blocks back to " + << "source image: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Migration<librbd::ImageCtx>; diff --git a/src/librbd/api/Migration.h b/src/librbd/api/Migration.h new file mode 100644 index 00000000..52a4517b --- /dev/null +++ b/src/librbd/api/Migration.h @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_MIGRATION_H +#define CEPH_LIBRBD_API_MIGRATION_H + +#include "include/int_types.h" +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.hpp" +#include "cls/rbd/cls_rbd_types.h" + +#include <vector> + +namespace librbd { + +class ImageCtx; + +namespace api { + +template <typename ImageCtxT = librbd::ImageCtx> +class Migration { +public: + static int prepare(librados::IoCtx& io_ctx, const std::string &image_name, + librados::IoCtx& dest_io_ctx, + const std::string &dest_image_name, ImageOptions& opts); + static int execute(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext &prog_ctx); + static int abort(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext &prog_ctx); + static int commit(librados::IoCtx& 
io_ctx, const std::string &image_name, + ProgressContext &prog_ctx); + static int status(librados::IoCtx& io_ctx, const std::string &image_name, + image_migration_status_t *status); + +private: + CephContext* m_cct; + ImageCtxT *m_src_image_ctx; + librados::IoCtx m_src_io_ctx; + librados::IoCtx &m_dst_io_ctx; + bool m_src_old_format; + std::string m_src_image_name; + std::string m_src_image_id; + std::string m_src_header_oid; + std::string m_dst_image_name; + std::string m_dst_image_id; + std::string m_dst_header_oid; + ImageOptions &m_image_options; + bool m_flatten; + bool m_mirroring; + ProgressContext *m_prog_ctx; + + cls::rbd::MigrationSpec m_src_migration_spec; + cls::rbd::MigrationSpec m_dst_migration_spec; + + Migration(ImageCtxT *image_ctx, librados::IoCtx& dest_io_ctx, + const std::string &dest_image_name, const std::string &dst_image_id, + ImageOptions& opts, bool flatten, bool mirroring, + cls::rbd::MigrationState state, const std::string &state_desc, + ProgressContext *prog_ctx); + + int prepare(); + int execute(); + int abort(); + int commit(); + int status(image_migration_status_t *status); + + int set_state(cls::rbd::MigrationState state, const std::string &description); + + int list_src_snaps(std::vector<librbd::snap_info_t> *snaps); + int validate_src_snaps(); + int disable_mirroring(ImageCtxT *image_ctx, bool *was_enabled); + int enable_mirroring(ImageCtxT *image_ctx, bool was_enabled); + int set_migration(); + int unlink_src_image(); + int relink_src_image(); + int create_dst_image(); + int remove_group(ImageCtxT *image_ctx, group_info_t *group_info); + int add_group(ImageCtxT *image_ctx, group_info_t &group_info); + int update_group(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx); + int remove_migration(ImageCtxT *image_ctx); + int relink_children(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx); + int remove_src_image(); + + int v1_set_migration(); + int v2_set_migration(); + int v1_unlink_src_image(); + int v2_unlink_src_image(); + int 
v1_relink_src_image(); + int v2_relink_src_image(); + + int relink_child(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx, + const librbd::snap_info_t &src_snap, + const librbd::linked_image_spec_t &child_image, + bool migration_abort, bool reattach_child); + + int revert_data(ImageCtxT* src_image_ctx, ImageCtxT* dst_image_ctx, + ProgressContext *prog_ctx); +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Migration<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_API_MIGRATION_H diff --git a/src/librbd/api/Mirror.cc b/src/librbd/api/Mirror.cc new file mode 100644 index 00000000..4e7b7dc8 --- /dev/null +++ b/src/librbd/api/Mirror.cc @@ -0,0 +1,1541 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Mirror.h" +#include "include/rados/librados.hpp" +#include "include/stringify.h" +#include "common/ceph_json.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/api/Image.h" +#include "librbd/mirror/DemoteRequest.h" +#include "librbd/mirror/DisableRequest.h" +#include "librbd/mirror/EnableRequest.h" +#include "librbd/mirror/GetInfoRequest.h" +#include "librbd/mirror/GetStatusRequest.h" +#include "librbd/mirror/PromoteRequest.h" +#include "librbd/mirror/Types.h" +#include "librbd/MirroringWatcher.h" +#include <boost/algorithm/string/trim.hpp> +#include <boost/algorithm/string/replace.hpp> +#include <boost/scope_exit.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Mirror: " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +int get_config_key(librados::Rados& rados, const std::string& key, + std::string* value) { + std::string cmd = + "{" + "\"prefix\": \"config-key 
get\", " + "\"key\": \"" + key + "\"" + "}"; + + bufferlist in_bl; + bufferlist out_bl; + + int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr); + if (r == -EINVAL) { + return -EOPNOTSUPP; + } else if (r < 0 && r != -ENOENT) { + return r; + } + + *value = out_bl.to_str(); + return 0; +} + +int set_config_key(librados::Rados& rados, const std::string& key, + const std::string& value) { + std::string cmd; + if (value.empty()) { + cmd = "{" + "\"prefix\": \"config-key rm\", " + "\"key\": \"" + key + "\"" + "}"; + } else { + cmd = "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"" + key + "\", " + "\"val\": \"" + value + "\"" + "}"; + } + bufferlist in_bl; + bufferlist out_bl; + + int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr); + if (r == -EINVAL) { + return -EOPNOTSUPP; + } else if (r < 0) { + return r; + } + + return 0; +} + +std::string get_peer_config_key_name(int64_t pool_id, + const std::string& peer_uuid) { + return RBD_MIRROR_PEER_CONFIG_KEY_PREFIX + stringify(pool_id) + "/" + + peer_uuid; +} + +int remove_peer_config_key(librados::IoCtx& io_ctx, + const std::string& peer_uuid) { + int64_t pool_id = io_ctx.get_id(); + auto key = get_peer_config_key_name(pool_id, peer_uuid); + + librados::Rados rados(io_ctx); + int r = set_config_key(rados, key, ""); + if (r < 0 && r != -ENOENT && r != -EPERM) { + return r; + } + return 0; +} + +int create_bootstrap_user(CephContext* cct, librados::Rados& rados, + std::string* peer_client_id, std::string* cephx_key) { + ldout(cct, 20) << dendl; + + // retrieve peer CephX user from config-key + int r = get_config_key(rados, RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY, + peer_client_id); + if (r == -EACCES) { + ldout(cct, 5) << "insufficient permissions to get peer-client-id " + << "config-key" << dendl; + return r; + } else if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve peer client id key: " + << cpp_strerror(r) << dendl; + return r; + } else if (r == -ENOENT || peer_client_id->empty()) { + 
ldout(cct, 20) << "creating new peer-client-id config-key" << dendl; + + *peer_client_id = "rbd-mirror-peer"; + r = set_config_key(rados, RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY, + *peer_client_id); + if (r == -EACCES) { + ldout(cct, 5) << "insufficient permissions to update peer-client-id " + << "config-key" << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to update peer client id key: " + << cpp_strerror(r) << dendl; + return r; + } + } + ldout(cct, 20) << "peer_client_id=" << *peer_client_id << dendl; + + // create peer client user + std::string cmd = + R"({)" \ + R"( "prefix": "auth get-or-create",)" \ + R"( "entity": "client.)" + *peer_client_id + R"(",)" \ + R"( "caps": [)" \ + R"( "mon", "profile rbd-mirror-peer",)" \ + R"( "osd", "profile rbd"],)" \ + R"( "format": "json")" \ + R"(})"; + + bufferlist in_bl; + bufferlist out_bl; + + r = rados.mon_command(cmd, in_bl, &out_bl, nullptr); + if (r == -EINVAL) { + ldout(cct, 5) << "caps mismatch for existing user" << dendl; + return -EEXIST; + } else if (r == -EACCES) { + ldout(cct, 5) << "insufficient permissions to create user" << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to create or update RBD mirroring bootstrap user: " + << cpp_strerror(r) << dendl; + return r; + } + + // extract key from response + bool json_valid = false; + json_spirit::mValue json_root; + if(json_spirit::read(out_bl.to_str(), json_root)) { + try { + auto& json_obj = json_root.get_array()[0].get_obj(); + *cephx_key = json_obj["key"].get_str(); + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + lderr(cct) << "invalid auth keyring JSON received" << dendl; + return -EBADMSG; + } + + return 0; +} + +int create_bootstrap_peer(CephContext* cct, librados::IoCtx& io_ctx, + const std::string& site_name, const std::string& fsid, + const std::string& client_id, const std::string& key, + const std::string& mon_host, + const std::string& cluster1, + const std::string& cluster2) 
{ + ldout(cct, 20) << dendl; + + std::string peer_uuid; + std::vector<mirror_peer_t> peers; + int r = Mirror<>::peer_list(io_ctx, &peers); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list mirror peers: " << cpp_strerror(r) << dendl; + return r; + } + + if (peers.empty()) { + r = Mirror<>::peer_add(io_ctx, &peer_uuid, site_name, + "client." + client_id); + if (r < 0) { + lderr(cct) << "failed to add " << cluster1 << " peer to " + << cluster2 << " " << "cluster: " << cpp_strerror(r) << dendl; + return r; + } + } else if (peers[0].cluster_name != site_name && + peers[0].cluster_name != fsid) { + // only support a single peer + lderr(cct) << "multiple peers are not currently supported" << dendl; + return -EINVAL; + } else { + peer_uuid = peers[0].uuid; + + if (peers[0].cluster_name != site_name) { + r = Mirror<>::peer_set_cluster(io_ctx, peer_uuid, site_name); + if (r < 0) { + // non-fatal attempt to update site name + lderr(cct) << "failed to update peer site name" << dendl; + } + } + } + + Mirror<>::Attributes attributes { + {"mon_host", mon_host}, + {"key", key}}; + r = Mirror<>::peer_set_attributes(io_ctx, peer_uuid, attributes); + if (r < 0) { + lderr(cct) << "failed to update " << cluster1 << " cluster connection " + << "attributes in " << cluster2 << " cluster: " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template <typename I> +int validate_mirroring_enabled(I *ictx) { + CephContext *cct = ictx->cct; + cls::rbd::MirrorImage mirror_image_internal; + int r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, + &mirror_image_internal); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + return r; + } else if (mirror_image_internal.state != + cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(cct) << "mirroring is not currently enabled" << dendl; + return -EINVAL; + } + return 0; +} + +int list_mirror_images(librados::IoCtx& io_ctx, + std::set<std::string>& 
mirror_image_ids) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + std::string last_read = ""; + int max_read = 1024; + int r; + do { + std::map<std::string, std::string> mirror_images; + r = cls_client::mirror_image_list(&io_ctx, last_read, max_read, + &mirror_images); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing mirrored image directory: " + << cpp_strerror(r) << dendl; + return r; + } + for (auto it = mirror_images.begin(); it != mirror_images.end(); ++it) { + mirror_image_ids.insert(it->first); + } + if (!mirror_images.empty()) { + last_read = mirror_images.rbegin()->first; + } + r = mirror_images.size(); + } while (r == max_read); + + return 0; +} + +struct C_ImageGetInfo : public Context { + mirror_image_info_t *mirror_image_info; + Context *on_finish; + + cls::rbd::MirrorImage mirror_image; + mirror::PromotionState promotion_state = mirror::PROMOTION_STATE_PRIMARY; + + C_ImageGetInfo(mirror_image_info_t *mirror_image_info, Context *on_finish) + : mirror_image_info(mirror_image_info), on_finish(on_finish) { + } + + void finish(int r) override { + if (r < 0) { + on_finish->complete(r); + return; + } + + mirror_image_info->global_id = mirror_image.global_image_id; + mirror_image_info->state = static_cast<rbd_mirror_image_state_t>( + mirror_image.state); + mirror_image_info->primary = ( + promotion_state == mirror::PROMOTION_STATE_PRIMARY); + on_finish->complete(0); + } +}; + +struct C_ImageGetStatus : public C_ImageGetInfo { + std::string image_name; + mirror_image_status_t *mirror_image_status; + + cls::rbd::MirrorImageStatus mirror_image_status_internal; + + C_ImageGetStatus(const std::string &image_name, + mirror_image_status_t *mirror_image_status, + Context *on_finish) + : C_ImageGetInfo(&mirror_image_status->info, on_finish), + image_name(image_name), mirror_image_status(mirror_image_status) { + } + + void finish(int r) override { + if (r < 0) { + on_finish->complete(r); + return; + } + + mirror_image_status->name 
= image_name; + mirror_image_status->state = static_cast<mirror_image_status_state_t>( + mirror_image_status_internal.state); + mirror_image_status->description = mirror_image_status_internal.description; + mirror_image_status->last_update = + mirror_image_status_internal.last_update.sec(); + mirror_image_status->up = mirror_image_status_internal.up; + C_ImageGetInfo::finish(0); + } +}; + +} // anonymous namespace + +template <typename I> +int Mirror<I>::image_enable(I *ictx, bool relax_same_pool_parent_check) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + // TODO + if (!ictx->md_ctx.get_namespace().empty()) { + lderr(cct) << "namespaces are not supported" << dendl; + return -EINVAL; + } + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + cls::rbd::MirrorMode mirror_mode; + r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode); + if (r < 0) { + lderr(cct) << "cannot enable mirroring: failed to retrieve mirror mode: " + << cpp_strerror(r) << dendl; + return r; + } + + if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) { + lderr(cct) << "cannot enable mirroring in the current pool mirroring mode" + << dendl; + return -EINVAL; + } + + // is mirroring not enabled for the parent? 
+ { + RWLock::RLocker l(ictx->parent_lock); + ImageCtx *parent = ictx->parent; + if (parent) { + if (relax_same_pool_parent_check && + parent->md_ctx.get_id() == ictx->md_ctx.get_id()) { + if (!parent->test_features(RBD_FEATURE_JOURNALING)) { + lderr(cct) << "journaling is not enabled for the parent" << dendl; + return -EINVAL; + } + } else { + cls::rbd::MirrorImage mirror_image_internal; + r = cls_client::mirror_image_get(&(parent->md_ctx), parent->id, + &mirror_image_internal); + if (r == -ENOENT) { + lderr(cct) << "mirroring is not enabled for the parent" << dendl; + return -EINVAL; + } + } + } + } + + if ((ictx->features & RBD_FEATURE_JOURNALING) == 0) { + lderr(cct) << "cannot enable mirroring: journaling is not enabled" << dendl; + return -EINVAL; + } + + C_SaferCond ctx; + auto req = mirror::EnableRequest<ImageCtx>::create(ictx, &ctx); + req->send(); + + r = ctx.wait(); + if (r < 0) { + lderr(cct) << "cannot enable mirroring: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Mirror<I>::image_disable(I *ictx, bool force) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + cls::rbd::MirrorMode mirror_mode; + r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode); + if (r < 0) { + lderr(cct) << "cannot disable mirroring: failed to retrieve pool " + "mirroring mode: " << cpp_strerror(r) << dendl; + return r; + } + + if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) { + lderr(cct) << "cannot disable mirroring in the current pool mirroring " + "mode" << dendl; + return -EINVAL; + } + + // is mirroring enabled for the child? 
+ cls::rbd::MirrorImage mirror_image_internal; + r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, + &mirror_image_internal); + if (r == -ENOENT) { + // mirroring is not enabled for this image + ldout(cct, 20) << "ignoring disable command: mirroring is not enabled for " + << "this image" << dendl; + return 0; + } else if (r == -EOPNOTSUPP) { + ldout(cct, 5) << "mirroring not supported by OSD" << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to retrieve mirror image metadata: " + << cpp_strerror(r) << dendl; + return r; + } + + mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING; + r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id, + mirror_image_internal); + if (r < 0) { + lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl; + return r; + } else { + bool rollback = false; + BOOST_SCOPE_EXIT_ALL(ictx, &mirror_image_internal, &rollback) { + if (rollback) { + CephContext *cct = ictx->cct; + mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_ENABLED; + int r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id, + mirror_image_internal); + if (r < 0) { + lderr(cct) << "failed to re-enable image mirroring: " + << cpp_strerror(r) << dendl; + } + } + }; + + { + RWLock::RLocker l(ictx->snap_lock); + map<librados::snap_t, SnapInfo> snap_info = ictx->snap_info; + for (auto &info : snap_info) { + cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(), + ictx->md_ctx.get_namespace(), + ictx->id, info.first}; + std::vector<librbd::linked_image_spec_t> child_images; + r = Image<I>::list_children(ictx, parent_spec, &child_images); + if (r < 0) { + rollback = true; + return r; + } + + if (child_images.empty()) { + continue; + } + + librados::IoCtx child_io_ctx; + int64_t child_pool_id = -1; + for (auto &child_image : child_images){ + std::string pool = child_image.pool_name; + if (child_pool_id == -1 || + child_pool_id != child_image.pool_id || + child_io_ctx.get_namespace() != 
child_image.pool_namespace) { + r = util::create_ioctx(ictx->md_ctx, "child image", + child_image.pool_id, + child_image.pool_namespace, + &child_io_ctx); + if (r < 0) { + rollback = true; + return r; + } + + child_pool_id = child_image.pool_id; + } + + cls::rbd::MirrorImage mirror_image_internal; + r = cls_client::mirror_image_get(&child_io_ctx, child_image.image_id, + &mirror_image_internal); + if (r != -ENOENT) { + rollback = true; + lderr(cct) << "mirroring is enabled on one or more children " + << dendl; + return -EBUSY; + } + } + } + } + + C_SaferCond ctx; + auto req = mirror::DisableRequest<ImageCtx>::create(ictx, force, true, + &ctx); + req->send(); + + r = ctx.wait(); + if (r < 0) { + lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl; + rollback = true; + return r; + } + } + + return 0; +} + +template <typename I> +int Mirror<I>::image_promote(I *ictx, bool force) { + CephContext *cct = ictx->cct; + + C_SaferCond ctx; + Mirror<I>::image_promote(ictx, force, &ctx); + int r = ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to promote image" << dendl; + return r; + } + + return 0; +} + +template <typename I> +void Mirror<I>::image_promote(I *ictx, bool force, Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << ", " + << "force=" << force << dendl; + + auto req = mirror::PromoteRequest<>::create(*ictx, force, on_finish); + req->send(); +} + +template <typename I> +int Mirror<I>::image_demote(I *ictx) { + CephContext *cct = ictx->cct; + + C_SaferCond ctx; + Mirror<I>::image_demote(ictx, &ctx); + int r = ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to demote image" << dendl; + return r; + } + + return 0; +} + +template <typename I> +void Mirror<I>::image_demote(I *ictx, Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + auto req = mirror::DemoteRequest<>::create(*ictx, on_finish); + req->send(); +} + +template <typename I> +int 
Mirror<I>::image_resync(I *ictx) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond tag_owner_ctx; + bool is_tag_owner; + Journal<I>::is_tag_owner(ictx, &is_tag_owner, &tag_owner_ctx); + r = tag_owner_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to determine tag ownership: " << cpp_strerror(r) + << dendl; + return r; + } else if (is_tag_owner) { + lderr(cct) << "image is primary, cannot resync to itself" << dendl; + return -EINVAL; + } + + // flag the journal indicating that we want to rebuild the local image + r = Journal<I>::request_resync(ictx); + if (r < 0) { + lderr(cct) << "failed to request resync: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template <typename I> +void Mirror<I>::image_get_info(I *ictx, mirror_image_info_t *mirror_image_info, + Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + auto on_refresh = new FunctionContext( + [ictx, mirror_image_info, on_finish](int r) { + if (r < 0) { + lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + auto ctx = new C_ImageGetInfo(mirror_image_info, on_finish); + auto req = mirror::GetInfoRequest<I>::create(*ictx, &ctx->mirror_image, + &ctx->promotion_state, + ctx); + req->send(); + }); + + if (ictx->state->is_refresh_required()) { + ictx->state->refresh(on_refresh); + } else { + on_refresh->complete(0); + } +} + +template <typename I> +int Mirror<I>::image_get_info(I *ictx, mirror_image_info_t *mirror_image_info) { + C_SaferCond ctx; + image_get_info(ictx, mirror_image_info, &ctx); + + int r = ctx.wait(); + if (r < 0) { + return r; + } + return 0; +} + +template <typename I> +void Mirror<I>::image_get_status(I *ictx, mirror_image_status_t *status, + Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + 
+ auto ctx = new C_ImageGetStatus(ictx->name, status, on_finish); + auto req = mirror::GetStatusRequest<I>::create( + *ictx, &ctx->mirror_image_status_internal, &ctx->mirror_image, + &ctx->promotion_state, ctx); + req->send(); +} + +template <typename I> +int Mirror<I>::image_get_status(I *ictx, mirror_image_status_t *status) { + C_SaferCond ctx; + image_get_status(ictx, status, &ctx); + + int r = ctx.wait(); + if (r < 0) { + return r; + } + return 0; +} + +template <typename I> +int Mirror<I>::image_get_instance_id(I *ictx, std::string *instance_id) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + cls::rbd::MirrorImage mirror_image; + int r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, &mirror_image); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + return r; + } else if (mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(cct) << "mirroring is not currently enabled" << dendl; + return -EINVAL; + } + + entity_inst_t instance; + r = cls_client::mirror_image_instance_get(&ictx->md_ctx, + mirror_image.global_image_id, + &instance); + if (r < 0) { + if (r != -ENOENT && r != -ESTALE) { + lderr(cct) << "failed to get mirror image instance: " << cpp_strerror(r) + << dendl; + } + return r; + } + + *instance_id = stringify(instance.name.num()); + return 0; +} + +template <typename I> +int Mirror<I>::site_name_get(librados::Rados& rados, std::string* name) { + CephContext *cct = reinterpret_cast<CephContext *>(rados.cct()); + ldout(cct, 20) << dendl; + + int r = get_config_key(rados, RBD_MIRROR_SITE_NAME_CONFIG_KEY, name); + if (r == -EOPNOTSUPP) { + return r; + } else if (r == -ENOENT || name->empty()) { + // default to the cluster fsid + r = rados.cluster_fsid(name); + if (r < 0) { + lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r) + << dendl; + } + return r; + } else if (r < 0) { + lderr(cct) << "failed to retrieve site 
name: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Mirror<I>::site_name_set(librados::Rados& rados, const std::string& name) { + CephContext *cct = reinterpret_cast<CephContext *>(rados.cct()); + + std::string site_name{name}; + boost::algorithm::trim(site_name); + ldout(cct, 20) << "site_name=" << site_name << dendl; + + int r = set_config_key(rados, RBD_MIRROR_SITE_NAME_CONFIG_KEY, name); + if (r == -EOPNOTSUPP) { + return r; + } else if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to update site name: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Mirror<I>::mode_get(librados::IoCtx& io_ctx, + rbd_mirror_mode_t *mirror_mode) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + cls::rbd::MirrorMode mirror_mode_internal; + int r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode_internal); + if (r < 0) { + lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + return r; + } + + switch (mirror_mode_internal) { + case cls::rbd::MIRROR_MODE_DISABLED: + case cls::rbd::MIRROR_MODE_IMAGE: + case cls::rbd::MIRROR_MODE_POOL: + *mirror_mode = static_cast<rbd_mirror_mode_t>(mirror_mode_internal); + break; + default: + lderr(cct) << "unknown mirror mode (" + << static_cast<uint32_t>(mirror_mode_internal) << ")" + << dendl; + return -EINVAL; + } + return 0; +} + +template <typename I> +int Mirror<I>::mode_set(librados::IoCtx& io_ctx, + rbd_mirror_mode_t mirror_mode) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + // TODO + if (!io_ctx.get_namespace().empty()) { + lderr(cct) << "namespaces are not supported" << dendl; + return -EINVAL; + } + + cls::rbd::MirrorMode next_mirror_mode; + switch (mirror_mode) { + case RBD_MIRROR_MODE_DISABLED: + case RBD_MIRROR_MODE_IMAGE: + case RBD_MIRROR_MODE_POOL: + next_mirror_mode = 
static_cast<cls::rbd::MirrorMode>(mirror_mode); + break; + default: + lderr(cct) << "unknown mirror mode (" + << static_cast<uint32_t>(mirror_mode) << ")" << dendl; + return -EINVAL; + } + + int r; + if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + // fail early if pool still has peers registered and attempting to disable + std::vector<cls::rbd::MirrorPeer> mirror_peers; + r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list peers: " << cpp_strerror(r) << dendl; + return r; + } else if (!mirror_peers.empty()) { + lderr(cct) << "mirror peers still registered" << dendl; + return -EBUSY; + } + } + + cls::rbd::MirrorMode current_mirror_mode; + r = cls_client::mirror_mode_get(&io_ctx, ¤t_mirror_mode); + if (r < 0) { + lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + return r; + } + + if (current_mirror_mode == next_mirror_mode) { + return 0; + } else if (current_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + r = cls_client::mirror_uuid_set(&io_ctx, uuid_gen.to_string()); + if (r < 0) { + lderr(cct) << "failed to allocate mirroring uuid: " << cpp_strerror(r) + << dendl; + return r; + } + } + + if (current_mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) { + r = cls_client::mirror_mode_set(&io_ctx, cls::rbd::MIRROR_MODE_IMAGE); + if (r < 0) { + lderr(cct) << "failed to set mirror mode to image: " + << cpp_strerror(r) << dendl; + return r; + } + + r = MirroringWatcher<>::notify_mode_updated(io_ctx, + cls::rbd::MIRROR_MODE_IMAGE); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + } + + if (next_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) { + return 0; + } + + if (next_mirror_mode == cls::rbd::MIRROR_MODE_POOL) { + map<string, string> images; + r = Image<I>::list_images_v2(io_ctx, &images); + if (r < 0) { + lderr(cct) << "failed listing images: " << 
cpp_strerror(r) << dendl; + return r; + } + + for (const auto& img_pair : images) { + uint64_t features; + uint64_t incompatible_features; + r = cls_client::get_features(&io_ctx, util::header_name(img_pair.second), + true, &features, &incompatible_features); + if (r < 0) { + lderr(cct) << "error getting features for image " << img_pair.first + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if ((features & RBD_FEATURE_JOURNALING) != 0) { + I *img_ctx = I::create("", img_pair.second, nullptr, io_ctx, false); + r = img_ctx->state->open(0); + if (r < 0) { + lderr(cct) << "error opening image "<< img_pair.first << ": " + << cpp_strerror(r) << dendl; + return r; + } + + r = image_enable(img_ctx, true); + int close_r = img_ctx->state->close(); + if (r < 0) { + lderr(cct) << "error enabling mirroring for image " + << img_pair.first << ": " << cpp_strerror(r) << dendl; + return r; + } else if (close_r < 0) { + lderr(cct) << "failed to close image " << img_pair.first << ": " + << cpp_strerror(close_r) << dendl; + return close_r; + } + } + } + } else if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + while (true) { + bool retry_busy = false; + bool pending_busy = false; + + std::set<std::string> image_ids; + r = list_mirror_images(io_ctx, image_ids); + if (r < 0) { + lderr(cct) << "failed listing images: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& img_id : image_ids) { + if (current_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) { + cls::rbd::MirrorImage mirror_image; + r = cls_client::mirror_image_get(&io_ctx, img_id, &mirror_image); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring state for image id " + << img_id << ": " << cpp_strerror(r) << dendl; + return r; + } + if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(cct) << "failed to disable mirror mode: there are still " + << "images with mirroring enabled" << dendl; + return -EINVAL; + } + } else { + I *img_ctx = I::create("", 
img_id, nullptr, io_ctx, false); + r = img_ctx->state->open(0); + if (r < 0) { + lderr(cct) << "error opening image id "<< img_id << ": " + << cpp_strerror(r) << dendl; + return r; + } + + r = image_disable(img_ctx, false); + int close_r = img_ctx->state->close(); + if (r == -EBUSY) { + pending_busy = true; + } else if (r < 0) { + lderr(cct) << "error disabling mirroring for image id " << img_id + << cpp_strerror(r) << dendl; + return r; + } else if (close_r < 0) { + lderr(cct) << "failed to close image id " << img_id << ": " + << cpp_strerror(close_r) << dendl; + return close_r; + } else if (pending_busy) { + // at least one mirrored image was successfully disabled, so we can + // retry any failures caused by busy parent/child relationships + retry_busy = true; + } + } + } + + if (!retry_busy && pending_busy) { + lderr(cct) << "error disabling mirroring for one or more images" + << dendl; + return -EBUSY; + } else if (!retry_busy) { + break; + } + } + } + + r = cls_client::mirror_mode_set(&io_ctx, next_mirror_mode); + if (r < 0) { + lderr(cct) << "failed to set mirror mode: " << cpp_strerror(r) << dendl; + return r; + } + + r = MirroringWatcher<>::notify_mode_updated(io_ctx, next_mirror_mode); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + return 0; +} + +template <typename I> +int Mirror<I>::peer_bootstrap_create(librados::IoCtx& io_ctx, + std::string* token) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + auto mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + int r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring mode: " << cpp_strerror(r) + << dendl; + return r; + } else if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + return -EINVAL; + } + + // retrieve the cluster fsid + std::string fsid; + librados::Rados rados(io_ctx); + r = rados.cluster_fsid(&fsid); + if (r < 
0) { + lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string peer_client_id; + std::string cephx_key; + r = create_bootstrap_user(cct, rados, &peer_client_id, &cephx_key); + if (r < 0) { + return r; + } + + std::string mon_host = cct->_conf.get_val<std::string>("mon_host"); + ldout(cct, 20) << "mon_host=" << mon_host << dendl; + + // format the token response + bufferlist token_bl; + token_bl.append( + R"({)" \ + R"("fsid":")" + fsid + R"(",)" + \ + R"("client_id":")" + peer_client_id + R"(",)" + \ + R"("key":")" + cephx_key + R"(",)" + \ + R"("mon_host":")" + \ + boost::replace_all_copy(mon_host, "\"", "\\\"") + R"(")" + \ + R"(})"); + ldout(cct, 20) << "token=" << token_bl.to_str() << dendl; + + bufferlist base64_bl; + token_bl.encode_base64(base64_bl); + *token = base64_bl.to_str(); + + return 0; +} + +template <typename I> +int Mirror<I>::peer_bootstrap_import(librados::IoCtx& io_ctx, + rbd_mirror_peer_direction_t direction, + const std::string& token) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + if (direction != RBD_MIRROR_PEER_DIRECTION_RX && + direction != RBD_MIRROR_PEER_DIRECTION_RX_TX) { + lderr(cct) << "invalid mirror peer direction" << dendl; + return -EINVAL; + } + + bufferlist token_bl; + try { + bufferlist base64_bl; + base64_bl.append(token); + token_bl.decode_base64(base64_bl); + } catch (buffer::error& err) { + lderr(cct) << "failed to decode base64" << dendl; + return -EINVAL; + } + + ldout(cct, 20) << "token=" << token_bl.to_str() << dendl; + + bool json_valid = false; + std::string expected_remote_fsid; + std::string remote_client_id; + std::string remote_key; + std::string remote_mon_host; + + json_spirit::mValue json_root; + if(json_spirit::read(token_bl.to_str(), json_root)) { + try { + auto& json_obj = json_root.get_obj(); + expected_remote_fsid = json_obj["fsid"].get_str(); + remote_client_id = json_obj["client_id"].get_str(); 
+ remote_key = json_obj["key"].get_str(); + remote_mon_host = json_obj["mon_host"].get_str(); + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + lderr(cct) << "invalid bootstrap token JSON received" << dendl; + return -EINVAL; + } + + // sanity check import process + std::string local_fsid; + librados::Rados rados(io_ctx); + int r = rados.cluster_fsid(&local_fsid); + if (r < 0) { + lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string local_site_name; + r = site_name_get(rados, &local_site_name); + if (r < 0) { + lderr(cct) << "failed to retrieve cluster site name: " << cpp_strerror(r) + << dendl; + return r; + } + + // attempt to connect to remote cluster + librados::Rados remote_rados; + remote_rados.init(remote_client_id.c_str()); + + auto remote_cct = reinterpret_cast<CephContext*>(remote_rados.cct()); + remote_cct->_conf.set_val("mon_host", remote_mon_host); + remote_cct->_conf.set_val("key", remote_key); + + r = remote_rados.connect(); + if (r < 0) { + lderr(cct) << "failed to connect to peer cluster: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string remote_fsid; + r = remote_rados.cluster_fsid(&remote_fsid); + if (r < 0) { + lderr(cct) << "failed to retrieve remote cluster fsid: " + << cpp_strerror(r) << dendl; + return r; + } else if (local_fsid == remote_fsid) { + lderr(cct) << "cannot import token for local cluster" << dendl; + return -EINVAL; + } else if (expected_remote_fsid != remote_fsid) { + lderr(cct) << "unexpected remote cluster fsid" << dendl; + return -EINVAL; + } + + std::string remote_site_name; + r = site_name_get(remote_rados, &remote_site_name); + if (r < 0) { + lderr(cct) << "failed to retrieve remote cluster site name: " + << cpp_strerror(r) << dendl; + return r; + } else if (local_site_name == remote_site_name) { + lderr(cct) << "cannot import token for duplicate site name" << dendl; + return -EINVAL; + } + + librados::IoCtx 
remote_io_ctx; + r = remote_rados.ioctx_create(io_ctx.get_pool_name().c_str(), remote_io_ctx); + if (r == -ENOENT) { + ldout(cct, 10) << "remote pool does not exist" << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to open remote pool '" << io_ctx.get_pool_name() + << "': " << cpp_strerror(r) << dendl; + return r; + } + + auto remote_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + r = cls_client::mirror_mode_get(&remote_io_ctx, &remote_mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve remote mirroring mode: " + << cpp_strerror(r) << dendl; + return r; + } else if (remote_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + return -ENOSYS; + } + + auto local_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + r = cls_client::mirror_mode_get(&io_ctx, &local_mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve local mirroring mode: " << cpp_strerror(r) + << dendl; + return r; + } else if (local_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + // copy mirror mode from remote peer + r = mode_set(io_ctx, static_cast<rbd_mirror_mode_t>(remote_mirror_mode)); + if (r < 0) { + return r; + } + } + + if (direction == RBD_MIRROR_PEER_DIRECTION_RX_TX) { + // create a local mirror peer user and export it to the remote cluster + std::string local_client_id; + std::string local_key; + r = create_bootstrap_user(cct, rados, &local_client_id, &local_key); + if (r < 0) { + return r; + } + + std::string local_mon_host = cct->_conf.get_val<std::string>("mon_host"); + + // create local cluster peer in remote cluster + r = create_bootstrap_peer(cct, remote_io_ctx, local_site_name, local_fsid, + local_client_id, local_key, local_mon_host, + "local", "remote"); + if (r < 0) { + return r; + } + } + + // create remote cluster peer in local cluster + r = create_bootstrap_peer(cct, io_ctx, remote_site_name, remote_fsid, + remote_client_id, remote_key, remote_mon_host, + "remote", "local"); + if (r < 0) { + return r; + } + 
+ return 0; +} + +template <typename I> +int Mirror<I>::peer_add(librados::IoCtx& io_ctx, std::string *uuid, + const std::string &cluster_name, + const std::string &client_name) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << "name=" << cluster_name << ", " + << "client=" << client_name << dendl; + + // TODO + if (!io_ctx.get_namespace().empty()) { + lderr(cct) << "namespaces are not supported" << dendl; + return -EINVAL; + } + + if (cct->_conf->cluster == cluster_name) { + lderr(cct) << "cannot add self as remote peer" << dendl; + return -EINVAL; + } + + int r; + do { + uuid_d uuid_gen; + uuid_gen.generate_random(); + + *uuid = uuid_gen.to_string(); + r = cls_client::mirror_peer_add(&io_ctx, *uuid, cluster_name, + client_name); + if (r == -ESTALE) { + ldout(cct, 5) << "duplicate UUID detected, retrying" << dendl; + } else if (r < 0) { + lderr(cct) << "failed to add mirror peer '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + } while (r == -ESTALE); + return 0; +} + +template <typename I> +int Mirror<I>::peer_remove(librados::IoCtx& io_ctx, const std::string &uuid) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << dendl; + + int r = remove_peer_config_key(io_ctx, uuid); + if (r < 0) { + lderr(cct) << "failed to remove peer attributes '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + + r = cls_client::mirror_peer_remove(&io_ctx, uuid); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to remove peer '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template <typename I> +int Mirror<I>::peer_list(librados::IoCtx& io_ctx, + std::vector<mirror_peer_t> *peers) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << dendl; + + std::vector<cls::rbd::MirrorPeer> mirror_peers; + int r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers); + if (r < 0 && r != 
-ENOENT) { + lderr(cct) << "failed to list peers: " << cpp_strerror(r) << dendl; + return r; + } + + peers->clear(); + peers->reserve(mirror_peers.size()); + for (auto &mirror_peer : mirror_peers) { + mirror_peer_t peer; + peer.uuid = mirror_peer.uuid; + peer.cluster_name = mirror_peer.cluster_name; + peer.client_name = mirror_peer.client_name; + peers->push_back(peer); + } + return 0; +} + +template <typename I> +int Mirror<I>::peer_set_client(librados::IoCtx& io_ctx, const std::string &uuid, + const std::string &client_name) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << ", " + << "client=" << client_name << dendl; + + int r = cls_client::mirror_peer_set_client(&io_ctx, uuid, client_name); + if (r < 0) { + lderr(cct) << "failed to update client '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template <typename I> +int Mirror<I>::peer_set_cluster(librados::IoCtx& io_ctx, + const std::string &uuid, + const std::string &cluster_name) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << ", " + << "cluster=" << cluster_name << dendl; + + if (cct->_conf->cluster == cluster_name) { + lderr(cct) << "cannot set self as remote peer" << dendl; + return -EINVAL; + } + + int r = cls_client::mirror_peer_set_cluster(&io_ctx, uuid, cluster_name); + if (r < 0) { + lderr(cct) << "failed to update cluster '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template <typename I> +int Mirror<I>::peer_get_attributes(librados::IoCtx& io_ctx, + const std::string &uuid, + Attributes* attributes) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << dendl; + + attributes->clear(); + + librados::Rados rados(io_ctx); + std::string value; + int r = get_config_key(rados, get_peer_config_key_name(io_ctx.get_id(), uuid), + &value); + if (r == -ENOENT || 
value.empty()) { + return -ENOENT; + } else if (r < 0) { + lderr(cct) << "failed to retrieve peer attributes: " << cpp_strerror(r) + << dendl; + return r; + } + + bool json_valid = false; + json_spirit::mValue json_root; + if(json_spirit::read(value, json_root)) { + try { + auto& json_obj = json_root.get_obj(); + for (auto& pairs : json_obj) { + (*attributes)[pairs.first] = pairs.second.get_str(); + } + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + lderr(cct) << "invalid peer attributes JSON received" << dendl; + return -EINVAL; + } + return 0; +} + +template <typename I> +int Mirror<I>::peer_set_attributes(librados::IoCtx& io_ctx, + const std::string &uuid, + const Attributes& attributes) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << ", " + << "attributes=" << attributes << dendl; + + std::vector<mirror_peer_t> mirror_peers; + int r = peer_list(io_ctx, &mirror_peers); + if (r < 0) { + return r; + } + + if (std::find_if(mirror_peers.begin(), mirror_peers.end(), + [&uuid](const librbd::mirror_peer_t& peer) { + return uuid == peer.uuid; + }) == mirror_peers.end()) { + ldout(cct, 5) << "mirror peer uuid " << uuid << " does not exist" << dendl; + return -ENOENT; + } + + std::stringstream ss; + ss << "{"; + for (auto& pair : attributes) { + ss << "\\\"" << pair.first << "\\\": " + << "\\\"" << pair.second << "\\\""; + if (&pair != &(*attributes.rbegin())) { + ss << ", "; + } + } + ss << "}"; + + librados::Rados rados(io_ctx); + r = set_config_key(rados, get_peer_config_key_name(io_ctx.get_id(), uuid), + ss.str()); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to update peer attributes: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template <typename I> +int Mirror<I>::image_status_list(librados::IoCtx& io_ctx, + const std::string &start_id, size_t max, + IdToMirrorImageStatus *images) { + CephContext *cct = reinterpret_cast<CephContext 
*>(io_ctx.cct()); + int r; + + map<string, string> id_to_name; + { + map<string, string> name_to_id; + r = Image<I>::list_images_v2(io_ctx, &name_to_id); + if (r < 0) { + return r; + } + for (auto it : name_to_id) { + id_to_name[it.second] = it.first; + } + } + + map<std::string, cls::rbd::MirrorImage> images_; + map<std::string, cls::rbd::MirrorImageStatus> statuses_; + + r = librbd::cls_client::mirror_image_status_list(&io_ctx, start_id, max, + &images_, &statuses_); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list mirror image statuses: " + << cpp_strerror(r) << dendl; + return r; + } + + cls::rbd::MirrorImageStatus unknown_status( + cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN, "status not found"); + + for (auto it = images_.begin(); it != images_.end(); ++it) { + auto &image_id = it->first; + auto &info = it->second; + if (info.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLED) { + continue; + } + + auto &image_name = id_to_name[image_id]; + if (image_name.empty()) { + lderr(cct) << "failed to find image name for image " << image_id << ", " + << "using image id as name" << dendl; + image_name = image_id; + } + auto s_it = statuses_.find(image_id); + auto &s = s_it != statuses_.end() ? s_it->second : unknown_status; + (*images)[image_id] = mirror_image_status_t{ + image_name, + mirror_image_info_t{ + info.global_image_id, + static_cast<mirror_image_state_t>(info.state), + false}, // XXX: To set "primary" right would require an additional call. 
+ static_cast<mirror_image_status_state_t>(s.state), + s.description, + s.last_update.sec(), + s.up}; + } + + return 0; +} + +template <typename I> +int Mirror<I>::image_status_summary(librados::IoCtx& io_ctx, + MirrorImageStatusStates *states) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + + std::map<cls::rbd::MirrorImageStatusState, int> states_; + int r = cls_client::mirror_image_status_get_summary(&io_ctx, &states_); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to get mirror status summary: " + << cpp_strerror(r) << dendl; + return r; + } + for (auto &s : states_) { + (*states)[static_cast<mirror_image_status_state_t>(s.first)] = s.second; + } + return 0; +} + +template <typename I> +int Mirror<I>::image_instance_id_list( + librados::IoCtx& io_ctx, const std::string &start_image_id, size_t max, + std::map<std::string, std::string> *instance_ids) { + CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct()); + std::map<std::string, entity_inst_t> instances; + + int r = librbd::cls_client::mirror_image_instance_list( + &io_ctx, start_image_id, max, &instances); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list mirror image instances: " << cpp_strerror(r) + << dendl; + return r; + } + + for (auto it : instances) { + (*instance_ids)[it.first] = stringify(it.second.name.num()); + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Mirror<librbd::ImageCtx>; diff --git a/src/librbd/api/Mirror.h b/src/librbd/api/Mirror.h new file mode 100644 index 00000000..94768acd --- /dev/null +++ b/src/librbd/api/Mirror.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_API_MIRROR_H +#define LIBRBD_API_MIRROR_H + +#include "include/rbd/librbd.hpp" +#include <map> +#include <string> +#include <vector> + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace api { + +template <typename 
ImageCtxT = librbd::ImageCtx> +struct Mirror { + typedef std::map<std::string, std::string> Attributes; + typedef std::map<std::string, mirror_image_status_t> IdToMirrorImageStatus; + typedef std::map<mirror_image_status_state_t, int> MirrorImageStatusStates; + + static int site_name_get(librados::Rados& rados, std::string* name); + static int site_name_set(librados::Rados& rados, const std::string& name); + + static int mode_get(librados::IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode); + static int mode_set(librados::IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode); + + static int peer_bootstrap_create(librados::IoCtx& io_ctx, std::string* token); + static int peer_bootstrap_import(librados::IoCtx& io_ctx, + rbd_mirror_peer_direction_t direction, + const std::string& token); + + static int peer_add(librados::IoCtx& io_ctx, std::string *uuid, + const std::string &cluster_name, + const std::string &client_name); + static int peer_remove(librados::IoCtx& io_ctx, const std::string &uuid); + static int peer_list(librados::IoCtx& io_ctx, + std::vector<mirror_peer_t> *peers); + static int peer_set_client(librados::IoCtx& io_ctx, const std::string &uuid, + const std::string &client_name); + static int peer_set_cluster(librados::IoCtx& io_ctx, const std::string &uuid, + const std::string &cluster_name); + static int peer_get_attributes(librados::IoCtx& io_ctx, + const std::string &uuid, + Attributes* attributes); + static int peer_set_attributes(librados::IoCtx& io_ctx, + const std::string &uuid, + const Attributes& attributes); + + static int image_status_list(librados::IoCtx& io_ctx, + const std::string &start_id, size_t max, + IdToMirrorImageStatus *images); + static int image_status_summary(librados::IoCtx& io_ctx, + MirrorImageStatusStates *states); + static int image_instance_id_list(librados::IoCtx& io_ctx, + const std::string &start_image_id, + size_t max, + std::map<std::string, std::string> *ids); + + static int image_enable(ImageCtxT *ictx, bool 
relax_same_pool_parent_check); + static int image_disable(ImageCtxT *ictx, bool force); + static int image_promote(ImageCtxT *ictx, bool force); + static void image_promote(ImageCtxT *ictx, bool force, Context *on_finish); + static int image_demote(ImageCtxT *ictx); + static void image_demote(ImageCtxT *ictx, Context *on_finish); + static int image_resync(ImageCtxT *ictx); + static int image_get_info(ImageCtxT *ictx, + mirror_image_info_t *mirror_image_info); + static void image_get_info(ImageCtxT *ictx, + mirror_image_info_t *mirror_image_info, + Context *on_finish); + static int image_get_status(ImageCtxT *ictx, mirror_image_status_t *status); + static void image_get_status(ImageCtxT *ictx, mirror_image_status_t *status, + Context *on_finish); + static int image_get_instance_id(ImageCtxT *ictx, std::string *instance_id); +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Mirror<librbd::ImageCtx>; + +#endif // LIBRBD_API_MIRROR_H diff --git a/src/librbd/api/Namespace.cc b/src/librbd/api/Namespace.cc new file mode 100644 index 00000000..ed7f8e28 --- /dev/null +++ b/src/librbd/api/Namespace.cc @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/api/Namespace.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Namespace: " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +const std::list<std::string> POOL_OBJECTS { + RBD_CHILDREN, + RBD_GROUP_DIRECTORY, + RBD_INFO, + RBD_MIRRORING, + RBD_TASK, + RBD_TRASH, + RBD_DIRECTORY +}; + +} // anonymous namespace + +template <typename I> +int Namespace<I>::create(librados::IoCtx& io_ctx, const std::string& name) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 5) << "name=" << name << dendl; + + if (name.empty()) { + 
return -EINVAL; + } + + librados::Rados rados(io_ctx); + int8_t require_osd_release; + int r = rados.get_min_compatible_osd(&require_osd_release); + if (r < 0) { + lderr(cct) << "failed to retrieve min OSD release: " << cpp_strerror(r) + << dendl; + return r; + } + + if (require_osd_release < CEPH_RELEASE_NAUTILUS) { + ldout(cct, 1) << "namespace support requires nautilus or later OSD" + << dendl; + return -ENOSYS; + } + + + librados::IoCtx default_ns_ctx; + default_ns_ctx.dup(io_ctx); + default_ns_ctx.set_namespace(""); + + r = cls_client::namespace_add(&default_ns_ctx, name); + if (r < 0) { + lderr(cct) << "failed to add namespace: " << cpp_strerror(r) << dendl; + return r; + } + + librados::IoCtx ns_ctx; + ns_ctx.dup(io_ctx); + ns_ctx.set_namespace(name); + + r = cls_client::dir_state_set(&ns_ctx, RBD_DIRECTORY, + cls::rbd::DIRECTORY_STATE_READY); + if (r < 0) { + lderr(cct) << "failed to initialize image directory: " << cpp_strerror(r) + << dendl; + goto rollback; + } + + return 0; + +rollback: + int ret_val = cls_client::namespace_remove(&default_ns_ctx, name); + if (ret_val < 0) { + lderr(cct) << "failed to remove namespace: " << cpp_strerror(ret_val) << dendl; + } + + return r; +} + +template <typename I> +int Namespace<I>::remove(librados::IoCtx& io_ctx, const std::string& name) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 5) << "name=" << name << dendl; + + if (name.empty()) { + return -EINVAL; + } + + librados::IoCtx default_ns_ctx; + default_ns_ctx.dup(io_ctx); + default_ns_ctx.set_namespace(""); + + librados::IoCtx ns_ctx; + ns_ctx.dup(io_ctx); + ns_ctx.set_namespace(name); + + std::map<std::string, cls::rbd::TrashImageSpec> trash_entries; + + librados::ObjectWriteOperation dir_op; + librbd::cls_client::dir_state_set( + &dir_op, cls::rbd::DIRECTORY_STATE_ADD_DISABLED); + dir_op.remove(); + + int r = ns_ctx.operate(RBD_DIRECTORY, &dir_op); + if (r == -EBUSY) { + ldout(cct, 5) << "image directory not empty" << dendl; + goto rollback; 
+ } else if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to disable the namespace: " << cpp_strerror(r) + << dendl; + return r; + } + + r = cls_client::trash_list(&ns_ctx, "", 1, &trash_entries); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list trash directory: " << cpp_strerror(r) + << dendl; + return r; + } else if (!trash_entries.empty()) { + ldout(cct, 5) << "image trash not empty" << dendl; + goto rollback; + } + + for (auto& oid : POOL_OBJECTS) { + r = ns_ctx.remove(oid); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to remove object '" << oid << "': " + << cpp_strerror(r) << dendl; + return r; + } + } + + r = cls_client::namespace_remove(&default_ns_ctx, name); + if (r < 0) { + lderr(cct) << "failed to remove namespace: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; + +rollback: + + r = librbd::cls_client::dir_state_set( + &ns_ctx, RBD_DIRECTORY, cls::rbd::DIRECTORY_STATE_READY); + if (r < 0) { + lderr(cct) << "failed to restore directory state: " << cpp_strerror(r) + << dendl; + } + + return -EBUSY; +} + +template <typename I> +int Namespace<I>::list(IoCtx& io_ctx, vector<string> *names) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 5) << dendl; + + librados::IoCtx default_ns_ctx; + default_ns_ctx.dup(io_ctx); + default_ns_ctx.set_namespace(""); + + int r; + int max_read = 1024; + std::string last_read = ""; + do { + std::list<std::string> name_list; + r = cls_client::namespace_list(&default_ns_ctx, last_read, max_read, + &name_list); + if (r == -ENOENT) { + return 0; + } else if (r < 0) { + lderr(cct) << "error listing namespaces: " << cpp_strerror(r) << dendl; + return r; + } + + names->insert(names->end(), name_list.begin(), name_list.end()); + if (!name_list.empty()) { + last_read = name_list.back(); + } + r = name_list.size(); + } while (r == max_read); + + return 0; +} + +template <typename I> +int Namespace<I>::exists(librados::IoCtx& io_ctx, const std::string& name, bool *exists) +{ + 
CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 5) << "name=" << name << dendl; + + *exists = false; + if (name.empty()) { + return -EINVAL; + } + + librados::IoCtx ns_ctx; + ns_ctx.dup(io_ctx); + ns_ctx.set_namespace(name); + + int r = librbd::cls_client::dir_state_assert(&ns_ctx, RBD_DIRECTORY, + cls::rbd::DIRECTORY_STATE_READY); + if (r == 0) { + *exists = true; + } else if (r != -ENOENT) { + lderr(cct) << "error asserting namespace: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Namespace<librbd::ImageCtx>; diff --git a/src/librbd/api/Namespace.h b/src/librbd/api/Namespace.h new file mode 100644 index 00000000..220eb28f --- /dev/null +++ b/src/librbd/api/Namespace.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_NAMESPACE_H +#define CEPH_LIBRBD_API_NAMESPACE_H + +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.hpp" +#include <string> +#include <vector> + +namespace librbd { + +struct ImageCtx; + +namespace api { + +template <typename ImageCtxT = librbd::ImageCtx> +struct Namespace { + + static int create(librados::IoCtx& io_ctx, const std::string& name); + static int remove(librados::IoCtx& io_ctx, const std::string& name); + static int list(librados::IoCtx& io_ctx, std::vector<std::string>* names); + static int exists(librados::IoCtx& io_ctx, const std::string& name, bool *exists); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Namespace<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_API_NAMESPACE_H diff --git a/src/librbd/api/Pool.cc b/src/librbd/api/Pool.cc new file mode 100644 index 00000000..88adf561 --- /dev/null +++ b/src/librbd/api/Pool.cc @@ -0,0 +1,377 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Pool.h" 
+#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Throttle.h" +#include "cls/rbd/cls_rbd_client.h" +#include "osd/osd_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/api/Image.h" +#include "librbd/api/Trash.h" +#include "librbd/image/ValidatePoolRequest.h" + +#define dout_subsys ceph_subsys_rbd + +namespace librbd { +namespace api { + +namespace { + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Pool::ImageStatRequest: " \ + << __func__ << " " << this << ": " \ + << "(id=" << m_image_id << "): " + +template <typename I> +class ImageStatRequest { +public: + ImageStatRequest(librados::IoCtx& io_ctx, SimpleThrottle& throttle, + const std::string& image_id, bool scan_snaps, + std::atomic<uint64_t>* bytes, + std::atomic<uint64_t>* max_bytes, + std::atomic<uint64_t>* snaps) + : m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())), + m_io_ctx(io_ctx), m_throttle(throttle), m_image_id(image_id), + m_scan_snaps(scan_snaps), m_bytes(bytes), m_max_bytes(max_bytes), + m_snaps(snaps) { + m_throttle.start_op(); + } + + void send() { + get_head(); + } + +protected: + void finish(int r) { + (*m_max_bytes) += m_max_size; + m_throttle.end_op(r); + + delete this; + } + +private: + CephContext* m_cct; + librados::IoCtx& m_io_ctx; + SimpleThrottle& m_throttle; + const std::string& m_image_id; + bool m_scan_snaps; + std::atomic<uint64_t>* m_bytes; + std::atomic<uint64_t>* m_max_bytes; + std::atomic<uint64_t>* m_snaps; + bufferlist m_out_bl; + + uint64_t m_max_size = 0; + ::SnapContext m_snapc; + + void get_head() { + ldout(m_cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::get_size_start(&op, CEPH_NOSNAP); + if (m_scan_snaps) { + cls_client::get_snapcontext_start(&op); + } + + m_out_bl.clear(); + auto aio_comp = util::create_rados_callback< + ImageStatRequest<I>, &ImageStatRequest<I>::handle_get_head>(this); + int r = 
m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); + } + + void handle_get_head(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + auto it = m_out_bl.cbegin(); + if (r == 0) { + uint8_t order; + r = cls_client::get_size_finish(&it, &m_max_size, &order); + if (r == 0) { + (*m_bytes) += m_max_size; + } + } + if (m_scan_snaps && r == 0) { + r = cls_client::get_snapcontext_finish(&it, &m_snapc); + if (r == 0) { + (*m_snaps) += m_snapc.snaps.size(); + } + } + + if (r == -ENOENT) { + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to stat image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (!m_snapc.is_valid()) { + lderr(m_cct) << "snap context is invalid" << dendl; + finish(-EIO); + return; + } + + get_snaps(); + } + + void get_snaps() { + if (!m_scan_snaps || m_snapc.snaps.empty()) { + finish(0); + return; + } + + ldout(m_cct, 15) << dendl; + librados::ObjectReadOperation op; + for (auto snap_seq : m_snapc.snaps) { + cls_client::get_size_start(&op, snap_seq); + } + + m_out_bl.clear(); + auto aio_comp = util::create_rados_callback< + ImageStatRequest<I>, &ImageStatRequest<I>::handle_get_snaps>(this); + int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); + } + + void handle_get_snaps(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + auto it = m_out_bl.cbegin(); + for ([[maybe_unused]] auto snap_seq : m_snapc.snaps) { + uint64_t size; + if (r == 0) { + uint8_t order; + r = cls_client::get_size_finish(&it, &size, &order); + } + if (r == 0 && m_max_size < size) { + m_max_size = size; + } + } + + if (r == -ENOENT) { + ldout(m_cct, 15) << "out-of-sync metadata" << dendl; + get_head(); + } else if (r < 0) { + lderr(m_cct) << "failed to retrieve snap size: " << cpp_strerror(r) + << dendl; + finish(r); + } else { + finish(0); + } + } + +}; + +template <typename I> +void 
get_pool_stat_option_value(typename Pool<I>::StatOptions* stat_options, + rbd_pool_stat_option_t option, + uint64_t** value) { + auto it = stat_options->find(option); + if (it == stat_options->end()) { + *value = nullptr; + } else { + *value = it->second; + } +} + +template <typename I> +int get_pool_stats(librados::IoCtx& io_ctx, const ConfigProxy& config, + const std::vector<std::string>& image_ids, uint64_t* image_count, + uint64_t* provisioned_bytes, uint64_t* max_provisioned_bytes, + uint64_t* snapshot_count) { + + bool scan_snaps = ((max_provisioned_bytes != nullptr) || + (snapshot_count != nullptr)); + + SimpleThrottle throttle( + config.template get_val<uint64_t>("rbd_concurrent_management_ops"), true); + std::atomic<uint64_t> bytes{0}; + std::atomic<uint64_t> max_bytes{0}; + std::atomic<uint64_t> snaps{0}; + for (auto& image_id : image_ids) { + if (throttle.pending_error()) { + break; + } + + auto req = new ImageStatRequest<I>(io_ctx, throttle, image_id, + scan_snaps, &bytes, &max_bytes, &snaps); + req->send(); + } + + int r = throttle.wait_for_ret(); + if (r < 0) { + return r; + } + + if (image_count != nullptr) { + *image_count = image_ids.size(); + } + if (provisioned_bytes != nullptr) { + *provisioned_bytes = bytes.load(); + } + if (max_provisioned_bytes != nullptr) { + *max_provisioned_bytes = max_bytes.load(); + } + if (snapshot_count != nullptr) { + *snapshot_count = snaps.load(); + } + + return 0; +} + +} // anonymous namespace + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Pool: " << __func__ << ": " + +template <typename I> +int Pool<I>::init(librados::IoCtx& io_ctx, bool force) { + auto cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + ldout(cct, 10) << dendl; + + int r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RBD, force); + if (r < 0) { + return r; + } + + ConfigProxy config{cct->_conf}; + api::Config<I>::apply_pool_overrides(io_ctx, &config); + if (!config.get_val<bool>("rbd_validate_pool")) { + return 0; 
+ } + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + C_SaferCond ctx; + auto req = image::ValidatePoolRequest<I>::create(io_ctx, op_work_queue, &ctx); + req->send(); + + return ctx.wait(); +} + +template <typename I> +int Pool<I>::add_stat_option(StatOptions* stat_options, + rbd_pool_stat_option_t option, + uint64_t* value) { + switch (option) { + case RBD_POOL_STAT_OPTION_IMAGES: + case RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES: + case RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES: + case RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS: + case RBD_POOL_STAT_OPTION_TRASH_IMAGES: + case RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES: + case RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES: + case RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS: + stat_options->emplace(option, value); + return 0; + default: + break; + } + return -ENOENT; +} + +template <typename I> +int Pool<I>::get_stats(librados::IoCtx& io_ctx, StatOptions* stat_options) { + auto cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + ldout(cct, 10) << dendl; + + ConfigProxy config{cct->_conf}; + api::Config<I>::apply_pool_overrides(io_ctx, &config); + + uint64_t* image_count; + uint64_t* provisioned_bytes; + uint64_t* max_provisioned_bytes; + uint64_t* snapshot_count; + + std::vector<trash_image_info_t> trash_entries; + int r = Trash<I>::list(io_ctx, trash_entries, false); + if (r < 0 && r != -EOPNOTSUPP) { + return r; + } + + get_pool_stat_option_value<I>( + stat_options, RBD_POOL_STAT_OPTION_IMAGES, &image_count); + get_pool_stat_option_value<I>( + stat_options, RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES, + &provisioned_bytes); + get_pool_stat_option_value<I>( + stat_options, RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES, + &max_provisioned_bytes); + get_pool_stat_option_value<I>( + stat_options, RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, &snapshot_count); + if (image_count != nullptr || provisioned_bytes != nullptr || + 
max_provisioned_bytes != nullptr || snapshot_count != nullptr) { + typename Image<I>::ImageNameToIds images; + int r = Image<I>::list_images_v2(io_ctx, &images); + if (r < 0) { + return r; + } + + std::vector<std::string> image_ids; + image_ids.reserve(images.size() + trash_entries.size()); + for (auto& it : images) { + image_ids.push_back(std::move(it.second)); + } + for (auto& it : trash_entries) { + if (it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + image_ids.push_back(std::move(it.id)); + } + } + + r = get_pool_stats<I>(io_ctx, config, image_ids, image_count, + provisioned_bytes, max_provisioned_bytes, + snapshot_count); + if (r < 0) { + return r; + } + } + + get_pool_stat_option_value<I>( + stat_options, RBD_POOL_STAT_OPTION_TRASH_IMAGES, &image_count); + get_pool_stat_option_value<I>( + stat_options, RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES, + &provisioned_bytes); + get_pool_stat_option_value<I>( + stat_options, RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES, + &max_provisioned_bytes); + get_pool_stat_option_value<I>( + stat_options, RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS, &snapshot_count); + if (image_count != nullptr || provisioned_bytes != nullptr || + max_provisioned_bytes != nullptr || snapshot_count != nullptr) { + + std::vector<std::string> image_ids; + image_ids.reserve(trash_entries.size()); + for (auto& it : trash_entries) { + if (it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + continue; + } + image_ids.push_back(std::move(it.id)); + } + + r = get_pool_stats<I>(io_ctx, config, image_ids, image_count, + provisioned_bytes, max_provisioned_bytes, + snapshot_count); + if (r < 0) { + return r; + } + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Pool<librbd::ImageCtx>; diff --git a/src/librbd/api/Pool.h b/src/librbd/api/Pool.h new file mode 100644 index 00000000..7b607ab6 --- /dev/null +++ b/src/librbd/api/Pool.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t 
-*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_POOL_H +#define CEPH_LIBRBD_API_POOL_H + +#include "include/int_types.h" +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.h" +#include <map> + +namespace librbd { + +struct ImageCtx; + +namespace api { + +template <typename ImageCtxT = librbd::ImageCtx> +class Pool { +public: + typedef std::map<rbd_pool_stat_option_t, uint64_t*> StatOptions; + + static int init(librados::IoCtx& io_ctx, bool force); + + static int add_stat_option(StatOptions* stat_options, + rbd_pool_stat_option_t option, + uint64_t* value); + + static int get_stats(librados::IoCtx& io_ctx, StatOptions* stat_options); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Pool<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_API_POOL_H diff --git a/src/librbd/api/PoolMetadata.cc b/src/librbd/api/PoolMetadata.cc new file mode 100644 index 00000000..f3d53944 --- /dev/null +++ b/src/librbd/api/PoolMetadata.cc @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/PoolMetadata.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::PoolMetadata: " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +void update_pool_timestamp(librados::IoCtx& io_ctx) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + auto now = ceph_clock_now(); + std::string cmd = + R"({)" + R"("prefix": "config set", )" + R"("who": "global", )" + R"("name": "rbd_config_pool_override_update_timestamp", )" + R"("value": ")" + stringify(now.sec()) + R"(")" + R"(})"; + + librados::Rados rados(io_ctx); + bufferlist in_bl; + std::string ss; + int r = rados.mon_command(cmd, in_bl, nullptr, &ss); + if (r < 0) { + lderr(cct) << 
"failed to notify clients of pool config update: " + << cpp_strerror(r) << dendl; + } +} + +} // anonymous namespace + +template <typename I> +int PoolMetadata<I>::get(librados::IoCtx& io_ctx, + const std::string &key, std::string *value) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + int r = cls_client::metadata_get(&io_ctx, RBD_INFO, key, value); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed reading metadata " << key << ": " << cpp_strerror(r) + << dendl; + } + + return r; +} + +template <typename I> +int PoolMetadata<I>::set(librados::IoCtx& io_ctx, const std::string &key, + const std::string &value) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + bool need_update_pool_timestamp = false; + + std::string config_key; + if (util::is_metadata_config_override(key, &config_key)) { + if (!librbd::api::Config<I>::is_option_name(io_ctx, config_key)) { + lderr(cct) << "validation for " << key + << " failed: not allowed pool level override" << dendl; + return -EINVAL; + } + int r = ConfigProxy{false}.set_val(config_key.c_str(), value); + if (r < 0) { + lderr(cct) << "validation for " << key << " failed: " << cpp_strerror(r) + << dendl; + return -EINVAL; + } + + need_update_pool_timestamp = true; + } + + ceph::bufferlist bl; + bl.append(value); + + int r = cls_client::metadata_set(&io_ctx, RBD_INFO, {{key, bl}}); + if (r < 0) { + lderr(cct) << "failed setting metadata " << key << ": " << cpp_strerror(r) + << dendl; + return r; + } + + if (need_update_pool_timestamp) { + update_pool_timestamp(io_ctx); + } + + return 0; +} + +template <typename I> +int PoolMetadata<I>::remove(librados::IoCtx& io_ctx, const std::string &key) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + std::string value; + int r = cls_client::metadata_get(&io_ctx, RBD_INFO, key, &value); + if (r < 0) { + if (r == -ENOENT) { + ldout(cct, 1) << "metadata " << key << " does not exist" << dendl; + } else { + lderr(cct) << "failed reading metadata " << key << ": " << 
cpp_strerror(r) + << dendl; + } + return r; + } + + r = cls_client::metadata_remove(&io_ctx, RBD_INFO, key); + if (r < 0) { + lderr(cct) << "failed removing metadata " << key << ": " << cpp_strerror(r) + << dendl; + return r; + } + + std::string config_key; + if (util::is_metadata_config_override(key, &config_key)) { + update_pool_timestamp(io_ctx); + } + + return 0; +} + +template <typename I> +int PoolMetadata<I>::list(librados::IoCtx& io_ctx, const std::string &start, + uint64_t max, + std::map<std::string, ceph::bufferlist> *pairs) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + int r = cls_client::metadata_list(&io_ctx, RBD_INFO, start, max, pairs); + if (r == -ENOENT) { + r = 0; + } else if (r < 0) { + lderr(cct) << "failed listing metadata: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::PoolMetadata<librbd::ImageCtx>; diff --git a/src/librbd/api/PoolMetadata.h b/src/librbd/api/PoolMetadata.h new file mode 100644 index 00000000..c0a81735 --- /dev/null +++ b/src/librbd/api/PoolMetadata.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_POOL_METADATA_H +#define CEPH_LIBRBD_API_POOL_METADATA_H + +#include "include/buffer_fwd.h" +#include "include/rados/librados_fwd.hpp" + +#include <map> +#include <string> + +namespace librbd { + +class ImageCtx; + +namespace api { + +template <typename ImageCtxT = librbd::ImageCtx> +class PoolMetadata { +public: + static int get(librados::IoCtx& io_ctx, const std::string &key, + std::string *value); + static int set(librados::IoCtx& io_ctx, const std::string &key, + const std::string &value); + static int remove(librados::IoCtx& io_ctx, const std::string &key); + static int list(librados::IoCtx& io_ctx, const std::string &start, + uint64_t max, std::map<std::string, ceph::bufferlist> *pairs); +}; + +} // namespace api +} // namespace 
librbd + +extern template class librbd::api::PoolMetadata<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_API_POOL_METADATA_H diff --git a/src/librbd/api/Snapshot.cc b/src/librbd/api/Snapshot.cc new file mode 100644 index 00000000..3d29cfb2 --- /dev/null +++ b/src/librbd/api/Snapshot.cc @@ -0,0 +1,196 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Snapshot.h" +#include "cls/rbd/cls_rbd_types.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include <boost/variant.hpp> +#include "include/Context.h" +#include "common/Cond.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Snapshot: " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +class GetGroupVisitor : public boost::static_visitor<int> { +public: + CephContext* cct; + librados::IoCtx *image_ioctx; + snap_group_namespace_t *group_snap; + + explicit GetGroupVisitor(CephContext* cct, librados::IoCtx *_image_ioctx, + snap_group_namespace_t *group_snap) + : cct(cct), image_ioctx(_image_ioctx), group_snap(group_snap) {}; + + template <typename T> + inline int operator()(const T&) const { + // ignore other than GroupSnapshotNamespace types. 
+ return -EINVAL; + } + + inline int operator()( + const cls::rbd::GroupSnapshotNamespace& snap_namespace) { + IoCtx group_ioctx; + int r = util::create_ioctx(*image_ioctx, "group", snap_namespace.group_pool, + {}, &group_ioctx); + if (r < 0) { + return r; + } + + cls::rbd::GroupSnapshot group_snapshot; + + std::string group_name; + r = cls_client::dir_get_name(&group_ioctx, RBD_GROUP_DIRECTORY, + snap_namespace.group_id, &group_name); + if (r < 0) { + lderr(cct) << "failed to retrieve group name: " << cpp_strerror(r) + << dendl; + return r; + } + + string group_header_oid = util::group_header_name(snap_namespace.group_id); + r = cls_client::group_snap_get_by_id(&group_ioctx, + group_header_oid, + snap_namespace.group_snapshot_id, + &group_snapshot); + if (r < 0) { + lderr(cct) << "failed to retrieve group snapshot: " << cpp_strerror(r) + << dendl; + return r; + } + + group_snap->group_pool = group_ioctx.get_id(); + group_snap->group_name = group_name; + group_snap->group_snap_name = group_snapshot.name; + return 0; + } +}; + +class GetTrashVisitor : public boost::static_visitor<int> { +public: + std::string* original_name; + + explicit GetTrashVisitor(std::string* original_name) + : original_name(original_name) { + } + + template <typename T> + inline int operator()(const T&) const { + return -EINVAL; + } + + inline int operator()( + const cls::rbd::TrashSnapshotNamespace& snap_namespace) { + *original_name = snap_namespace.original_name; + return 0; + } +}; + +} // anonymous namespace + +template <typename I> +int Snapshot<I>::get_group_namespace(I *ictx, uint64_t snap_id, + snap_group_namespace_t *group_snap) { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + RWLock::RLocker snap_locker(ictx->snap_lock); + auto snap_info = ictx->get_snap_info(snap_id); + if (snap_info == nullptr) { + return -ENOENT; + } + + GetGroupVisitor ggv = GetGroupVisitor(ictx->cct, &ictx->md_ctx, group_snap); + r = boost::apply_visitor(ggv, 
snap_info->snap_namespace); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Snapshot<I>::get_trash_namespace(I *ictx, uint64_t snap_id, + std::string* original_name) { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + RWLock::RLocker snap_locker(ictx->snap_lock); + auto snap_info = ictx->get_snap_info(snap_id); + if (snap_info == nullptr) { + return -ENOENT; + } + + auto visitor = GetTrashVisitor(original_name); + r = boost::apply_visitor(visitor, snap_info->snap_namespace); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Snapshot<I>::get_namespace_type(I *ictx, uint64_t snap_id, + snap_namespace_type_t *namespace_type) { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + RWLock::RLocker l(ictx->snap_lock); + auto snap_info = ictx->get_snap_info(snap_id); + if (snap_info == nullptr) { + return -ENOENT; + } + + *namespace_type = static_cast<snap_namespace_type_t>( + cls::rbd::get_snap_namespace_type(snap_info->snap_namespace)); + return 0; +} + +template <typename I> +int Snapshot<I>::remove(I *ictx, uint64_t snap_id) { + ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_id << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + cls::rbd::SnapshotNamespace snapshot_namespace; + std::string snapshot_name; + { + RWLock::RLocker snap_locker(ictx->snap_lock); + auto it = ictx->snap_info.find(snap_id); + if (it == ictx->snap_info.end()) { + return -ENOENT; + } + + snapshot_namespace = it->second.snap_namespace; + snapshot_name = it->second.name; + } + + C_SaferCond ctx; + ictx->operations->snap_remove(snapshot_namespace, snapshot_name, &ctx); + r = ctx.wait(); + return r; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Snapshot<librbd::ImageCtx>; diff --git a/src/librbd/api/Snapshot.h b/src/librbd/api/Snapshot.h new file mode 100644 index 00000000..453861c4 --- /dev/null 
+++ b/src/librbd/api/Snapshot.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_SNAPSHOT_H +#define CEPH_LIBRBD_API_SNAPSHOT_H + +#include "include/rbd/librbd.hpp" +#include <string> + +namespace librbd { + +struct ImageCtx; + +namespace api { + +template <typename ImageCtxT = librbd::ImageCtx> +struct Snapshot { + + static int get_group_namespace(ImageCtxT *ictx, uint64_t snap_id, + snap_group_namespace_t *group_snap); + + static int get_trash_namespace(ImageCtxT *ictx, uint64_t snap_id, + std::string *original_name); + + static int get_namespace_type(ImageCtxT *ictx, uint64_t snap_id, + snap_namespace_type_t *namespace_type); + + static int remove(ImageCtxT *ictx, uint64_t snap_id); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Snapshot<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_API_SNAPSHOT_H diff --git a/src/librbd/api/Trash.cc b/src/librbd/api/Trash.cc new file mode 100644 index 00000000..0664461e --- /dev/null +++ b/src/librbd/api/Trash.cc @@ -0,0 +1,681 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Trash.h" +#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Operations.h" +#include "librbd/TrashWatcher.h" +#include "librbd/Utils.h" +#include "librbd/api/DiffIterate.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/image/RemoveRequest.h" +#include "librbd/mirror/DisableRequest.h" +#include "librbd/mirror/EnableRequest.h" +#include "librbd/trash/MoveRequest.h" +#include "librbd/trash/RemoveRequest.h" +#include <json_spirit/json_spirit.h> +#include "librbd/journal/DisabledPolicy.h" 
+#include "librbd/image/ListWatchersRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Trash: " << __func__ << ": " + +namespace librbd { +namespace api { + +template <typename I> +const typename Trash<I>::TrashImageSources Trash<I>::RESTORE_SOURCE_WHITELIST { + cls::rbd::TRASH_IMAGE_SOURCE_USER, + cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING + }; + +namespace { + +template <typename I> +int disable_mirroring(I *ictx) { + if (!ictx->test_features(RBD_FEATURE_JOURNALING)) { + return 0; + } + + cls::rbd::MirrorImage mirror_image; + int r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, &mirror_image); + if (r == -ENOENT) { + ldout(ictx->cct, 10) << "mirroring is not enabled for this image" << dendl; + return 0; + } + + if (r < 0) { + lderr(ictx->cct) << "failed to retrieve mirror image: " << cpp_strerror(r) + << dendl; + return r; + } + + ldout(ictx->cct, 10) << dendl; + + C_SaferCond ctx; + auto req = mirror::DisableRequest<I>::create(ictx, false, true, &ctx); + req->send(); + r = ctx.wait(); + if (r < 0) { + lderr(ictx->cct) << "failed to disable mirroring: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template <typename I> +int enable_mirroring(IoCtx &io_ctx, const std::string &image_id) { + auto cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + + uint64_t features; + uint64_t incompatible_features; + int r = cls_client::get_features(&io_ctx, util::header_name(image_id), true, + &features, &incompatible_features); + if (r < 0) { + lderr(cct) << "failed to retrieve features: " << cpp_strerror(r) << dendl; + return r; + } + + if ((features & RBD_FEATURE_JOURNALING) == 0) { + return 0; + } + + cls::rbd::MirrorMode mirror_mode; + r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + return r; + } + + if (mirror_mode != cls::rbd::MIRROR_MODE_POOL) { + ldout(cct, 
10) << "not pool mirroring mode" << dendl; + return 0; + } + + ldout(cct, 10) << dendl; + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + C_SaferCond ctx; + auto req = mirror::EnableRequest<I>::create(io_ctx, image_id, "", + op_work_queue, &ctx); + req->send(); + r = ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to enable mirroring: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +} // anonymous namespace + +template <typename I> +int Trash<I>::move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, const std::string &image_id, + uint64_t delay) { + ceph_assert(!image_name.empty() && !image_id.empty()); + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << &io_ctx << " name=" << image_name << ", id=" << image_id + << dendl; + + auto ictx = new I("", image_id, nullptr, io_ctx, false); + int r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + return r; + } + + if (r == 0) { + if (ictx->test_features(RBD_FEATURE_JOURNALING)) { + RWLock::WLocker snap_locker(ictx->snap_lock); + ictx->set_journal_policy(new journal::DisabledPolicy()); + } + + ictx->owner_lock.get_read(); + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->block_requests(0); + + r = ictx->operations->prepare_image_update( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, false); + if (r < 0) { + lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl; + ictx->owner_lock.put_read(); + ictx->state->close(); + return -EBUSY; + } + } + ictx->owner_lock.put_read(); + + ictx->snap_lock.get_read(); + if (!ictx->migration_info.empty()) { + lderr(cct) << "cannot move migrating image to trash" << dendl; + ictx->snap_lock.put_read(); + ictx->state->close(); + return -EBUSY; + } + ictx->snap_lock.put_read(); + + r = 
disable_mirroring<I>(ictx); + if (r < 0) { + ictx->state->close(); + return r; + } + + ictx->state->close(); + } + + utime_t delete_time{ceph_clock_now()}; + utime_t deferment_end_time{delete_time}; + deferment_end_time += delay; + cls::rbd::TrashImageSpec trash_image_spec{ + static_cast<cls::rbd::TrashImageSource>(source), image_name, + delete_time, deferment_end_time}; + + trash_image_spec.state = cls::rbd::TRASH_IMAGE_STATE_MOVING; + C_SaferCond ctx; + auto req = trash::MoveRequest<I>::create(io_ctx, image_id, trash_image_spec, + &ctx); + req->send(); + + r = ctx.wait(); + trash_image_spec.state = cls::rbd::TRASH_IMAGE_STATE_NORMAL; + int ret = cls_client::trash_state_set(&io_ctx, image_id, + trash_image_spec.state, + cls::rbd::TRASH_IMAGE_STATE_MOVING); + if (ret < 0 && ret != -EOPNOTSUPP) { + lderr(cct) << "error setting trash image state: " + << cpp_strerror(ret) << dendl; + return ret; + } + if (r < 0) { + return r; + } + + C_SaferCond notify_ctx; + TrashWatcher<I>::notify_image_added(io_ctx, image_id, trash_image_spec, + ¬ify_ctx); + r = notify_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + + return 0; +} + +template <typename I> +int Trash<I>::move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, uint64_t delay) { + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << &io_ctx << " name=" << image_name << dendl; + + // try to get image id from the directory + std::string image_id; + int r = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY, image_name, + &image_id); + if (r == -ENOENT) { + r = io_ctx.stat(util::old_header_name(image_name), nullptr, nullptr); + if (r == 0) { + // cannot move V1 image to trash + ldout(cct, 10) << "cannot move v1 image to trash" << dendl; + return -EOPNOTSUPP; + } + + // image doesn't exist -- perhaps already in the trash since removing + // from the directory is the last step + return -ENOENT; + } else 
if (r < 0) { + lderr(cct) << "failed to retrieve image id: " << cpp_strerror(r) << dendl; + return r; + } + + ceph_assert(!image_name.empty() && !image_id.empty()); + return Trash<I>::move(io_ctx, source, image_name, image_id, delay); +} + +template <typename I> +int Trash<I>::get(IoCtx &io_ctx, const std::string &id, + trash_image_info_t *info) { + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << __func__ << " " << &io_ctx << dendl; + + cls::rbd::TrashImageSpec spec; + int r = cls_client::trash_get(&io_ctx, id, &spec); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "error retrieving trash entry: " << cpp_strerror(r) + << dendl; + return r; + } + + rbd_trash_image_source_t source = static_cast<rbd_trash_image_source_t>( + spec.source); + *info = trash_image_info_t{id, spec.name, source, spec.deletion_time.sec(), + spec.deferment_end_time.sec()}; + return 0; +} + +template <typename I> +int Trash<I>::list(IoCtx &io_ctx, vector<trash_image_info_t> &entries, + bool exclude_user_remove_source) { + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "trash_list " << &io_ctx << dendl; + + bool more_entries; + uint32_t max_read = 1024; + std::string last_read = ""; + do { + map<string, cls::rbd::TrashImageSpec> trash_entries; + int r = cls_client::trash_list(&io_ctx, last_read, max_read, + &trash_entries); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing rbd trash entries: " << cpp_strerror(r) + << dendl; + return r; + } else if (r == -ENOENT) { + break; + } + + if (trash_entries.empty()) { + break; + } + + for (const auto &entry : trash_entries) { + rbd_trash_image_source_t source = + static_cast<rbd_trash_image_source_t>(entry.second.source); + if (exclude_user_remove_source && + source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + continue; + } + entries.push_back({entry.first, entry.second.name, source, + entry.second.deletion_time.sec(), + entry.second.deferment_end_time.sec()}); + } + last_read = 
trash_entries.rbegin()->first; + more_entries = (trash_entries.size() >= max_read); + } while (more_entries); + + return 0; +} + +template <typename I> +int Trash<I>::purge(IoCtx& io_ctx, time_t expire_ts, + float threshold, ProgressContext& pctx) { + auto *cct((CephContext *) io_ctx.cct()); + ldout(cct, 20) << &io_ctx << dendl; + + std::vector<librbd::trash_image_info_t> trash_entries; + int r = librbd::api::Trash<I>::list(io_ctx, trash_entries, true); + if (r < 0) { + return r; + } + + trash_entries.erase( + std::remove_if(trash_entries.begin(), trash_entries.end(), + [](librbd::trash_image_info_t info) { + return info.source != RBD_TRASH_IMAGE_SOURCE_USER; + }), + trash_entries.end()); + + std::set<std::string> to_be_removed; + if (threshold != -1) { + if (threshold < 0 || threshold > 1) { + lderr(cct) << "argument 'threshold' is out of valid range" + << dendl; + return -EINVAL; + } + + librados::bufferlist inbl; + librados::bufferlist outbl; + std::string pool_name = io_ctx.get_pool_name(); + + librados::Rados rados(io_ctx); + rados.mon_command(R"({"prefix": "df", "format": "json"})", inbl, + &outbl, nullptr); + + json_spirit::mValue json; + if (!json_spirit::read(outbl.to_str(), json)) { + lderr(cct) << "ceph df json output could not be parsed" + << dendl; + return -EBADMSG; + } + + json_spirit::mArray arr = json.get_obj()["pools"].get_array(); + + double pool_percent_used = 0; + uint64_t pool_total_bytes = 0; + + std::map<std::string, std::vector<std::string>> datapools; + + std::sort(trash_entries.begin(), trash_entries.end(), + [](librbd::trash_image_info_t a, librbd::trash_image_info_t b) { + return a.deferment_end_time < b.deferment_end_time; + } + ); + + for (const auto &entry : trash_entries) { + int64_t data_pool_id = -1; + r = cls_client::get_data_pool(&io_ctx, util::header_name(entry.id), + &data_pool_id); + if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) { + lderr(cct) << "failed to query data pool: " << cpp_strerror(r) << dendl; + return r; + } else 
if (data_pool_id == -1) { + data_pool_id = io_ctx.get_id(); + } + + if (data_pool_id != io_ctx.get_id()) { + librados::IoCtx data_io_ctx; + r = util::create_ioctx(io_ctx, "image", data_pool_id, + {}, &data_io_ctx); + if (r < 0) { + lderr(cct) << "error accessing data pool" << dendl; + continue; + } + auto data_pool = data_io_ctx.get_pool_name(); + datapools[data_pool].push_back(entry.id); + } else { + datapools[pool_name].push_back(entry.id); + } + } + + uint64_t bytes_to_free = 0; + + for (uint8_t i = 0; i < arr.size(); ++i) { + json_spirit::mObject obj = arr[i].get_obj(); + std::string name = obj.find("name")->second.get_str(); + auto img = datapools.find(name); + if (img != datapools.end()) { + json_spirit::mObject stats = arr[i].get_obj()["stats"].get_obj(); + pool_percent_used = stats["percent_used"].get_real(); + if (pool_percent_used <= threshold) continue; + + bytes_to_free = 0; + + pool_total_bytes = stats["max_avail"].get_uint64() + + stats["bytes_used"].get_uint64(); + + auto bytes_threshold = (uint64_t) (pool_total_bytes * + (pool_percent_used - threshold)); + + for (const auto &it : img->second) { + auto ictx = new I("", it, nullptr, io_ctx, false); + r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + if (r == -ENOENT) { + continue; + } else if (r < 0) { + lderr(cct) << "failed to open image " << it << ": " + << cpp_strerror(r) << dendl; + } + + r = librbd::api::DiffIterate<I>::diff_iterate( + ictx, cls::rbd::UserSnapshotNamespace(), nullptr, 0, ictx->size, + false, true, + [](uint64_t offset, size_t len, int exists, void *arg) { + auto *to_free = reinterpret_cast<uint64_t *>(arg); + if (exists) + (*to_free) += len; + return 0; + }, &bytes_to_free); + + ictx->state->close(); + if (r < 0) { + lderr(cct) << "failed to calculate disk usage for image " << it + << ": " << cpp_strerror(r) << dendl; + continue; + } + + to_be_removed.insert(it); + if (bytes_to_free >= bytes_threshold) { + break; + } + } + } + } + + if (bytes_to_free == 0) { + ldout(cct, 10) 
<< "pool usage is lower than or equal to " + << (threshold * 100) + << "%" << dendl; + return 0; + } + } + + if (expire_ts == 0) { + struct timespec now; + clock_gettime(CLOCK_REALTIME, &now); + expire_ts = now.tv_sec; + } + + for (const auto &entry : trash_entries) { + if (expire_ts >= entry.deferment_end_time) { + to_be_removed.insert(entry.id); + } + } + + NoOpProgressContext remove_pctx; + uint64_t list_size = to_be_removed.size(), i = 0; + for (const auto &entry_id : to_be_removed) { + r = librbd::api::Trash<I>::remove(io_ctx, entry_id, true, remove_pctx); + if (r < 0) { + if (r == -ENOTEMPTY) { + ldout(cct, 5) << "image has snapshots - these must be deleted " + << "with 'rbd snap purge' before the image can be " + << "removed." << dendl; + } else if (r == -EBUSY) { + ldout(cct, 5) << "error: image still has watchers" << std::endl + << "This means the image is still open or the client " + << "using it crashed. Try again after closing/unmapping " + << "it or waiting 30s for the crashed client to timeout." + << dendl; + } else if (r == -EMLINK) { + ldout(cct, 5) << "Remove the image from the group and try again." + << dendl; + } else { + lderr(cct) << "remove error: " << cpp_strerror(r) << dendl; + } + return r; + } + pctx.update_progress(++i, list_size); + } + + return 0; +} + +template <typename I> +int Trash<I>::remove(IoCtx &io_ctx, const std::string &image_id, bool force, + ProgressContext& prog_ctx) { + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "trash_remove " << &io_ctx << " " << image_id + << " " << force << dendl; + + cls::rbd::TrashImageSpec trash_spec; + int r = cls_client::trash_get(&io_ctx, image_id, &trash_spec); + if (r < 0) { + lderr(cct) << "error getting image id " << image_id + << " info from trash: " << cpp_strerror(r) << dendl; + return r; + } + + utime_t now = ceph_clock_now(); + if (now < trash_spec.deferment_end_time && !force) { + lderr(cct) << "error: deferment time has not expired." 
<< dendl; + return -EPERM; + } + if (trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL && + trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_REMOVING) { + lderr(cct) << "error: image is pending restoration." << dendl; + return -EBUSY; + } + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + C_SaferCond cond; + auto req = librbd::trash::RemoveRequest<I>::create( + io_ctx, image_id, op_work_queue, force, prog_ctx, &cond); + req->send(); + + r = cond.wait(); + if (r < 0) { + return r; + } + + C_SaferCond notify_ctx; + TrashWatcher<I>::notify_image_removed(io_ctx, image_id, ¬ify_ctx); + r = notify_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + + return 0; +} + +template <typename I> +int Trash<I>::restore(librados::IoCtx &io_ctx, + const TrashImageSources& trash_image_sources, + const std::string &image_id, + const std::string &image_new_name) { + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "trash_restore " << &io_ctx << " " << image_id << " " + << image_new_name << dendl; + + cls::rbd::TrashImageSpec trash_spec; + int r = cls_client::trash_get(&io_ctx, image_id, &trash_spec); + if (r < 0) { + lderr(cct) << "error getting image id " << image_id + << " info from trash: " << cpp_strerror(r) << dendl; + return r; + } + + if (trash_image_sources.count(trash_spec.source) == 0) { + lderr(cct) << "Current trash source '" << trash_spec.source << "' " + << "does not match expected: " + << trash_image_sources << dendl; + return -EINVAL; + } + + std::string image_name = image_new_name; + if (trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL && + trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_RESTORING) { + lderr(cct) << "error restoring image id " << image_id + << ", which is pending deletion" << dendl; + return -EBUSY; + } + r = cls_client::trash_state_set(&io_ctx, image_id, + 
cls::rbd::TRASH_IMAGE_STATE_RESTORING, + cls::rbd::TRASH_IMAGE_STATE_NORMAL); + if (r < 0 && r != -EOPNOTSUPP) { + lderr(cct) << "error setting trash image state: " + << cpp_strerror(r) << dendl; + return r; + } + + if (image_name.empty()) { + // if user didn't specify a new name, let's try using the old name + image_name = trash_spec.name; + ldout(cct, 20) << "restoring image id " << image_id << " with name " + << image_name << dendl; + } + + // check if no image exists with the same name + bool create_id_obj = true; + std::string existing_id; + r = cls_client::get_id(&io_ctx, util::id_obj_name(image_name), &existing_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error checking if image " << image_name << " exists: " + << cpp_strerror(r) << dendl; + int ret = cls_client::trash_state_set(&io_ctx, image_id, + cls::rbd::TRASH_IMAGE_STATE_NORMAL, + cls::rbd::TRASH_IMAGE_STATE_RESTORING); + if (ret < 0 && ret != -EOPNOTSUPP) { + lderr(cct) << "error setting trash image state: " + << cpp_strerror(ret) << dendl; + } + return r; + } else if (r != -ENOENT){ + // checking if we are recovering from an incomplete restore + if (existing_id != image_id) { + ldout(cct, 2) << "an image with the same name already exists" << dendl; + int r2 = cls_client::trash_state_set(&io_ctx, image_id, + cls::rbd::TRASH_IMAGE_STATE_NORMAL, + cls::rbd::TRASH_IMAGE_STATE_RESTORING); + if (r2 < 0 && r2 != -EOPNOTSUPP) { + lderr(cct) << "error setting trash image state: " + << cpp_strerror(r2) << dendl; + } + return -EEXIST; + } + create_id_obj = false; + } + + if (create_id_obj) { + ldout(cct, 2) << "adding id object" << dendl; + librados::ObjectWriteOperation op; + op.create(true); + cls_client::set_id(&op, image_id); + r = io_ctx.operate(util::id_obj_name(image_name), &op); + if (r < 0) { + lderr(cct) << "error adding id object for image " << image_name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + + ldout(cct, 2) << "adding rbd image to v2 directory..." 
<< dendl; + r = cls_client::dir_add_image(&io_ctx, RBD_DIRECTORY, image_name, + image_id); + if (r < 0 && r != -EEXIST) { + lderr(cct) << "error adding image to v2 directory: " + << cpp_strerror(r) << dendl; + return r; + } + + r = enable_mirroring<I>(io_ctx, image_id); + if (r < 0) { + // not fatal -- ignore + } + + ldout(cct, 2) << "removing image from trash..." << dendl; + r = cls_client::trash_remove(&io_ctx, image_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error removing image id " << image_id << " from trash: " + << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond notify_ctx; + TrashWatcher<I>::notify_image_removed(io_ctx, image_id, ¬ify_ctx); + r = notify_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Trash<librbd::ImageCtx>; diff --git a/src/librbd/api/Trash.h b/src/librbd/api/Trash.h new file mode 100644 index 00000000..65b6b8bc --- /dev/null +++ b/src/librbd/api/Trash.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_API_TRASH_H +#define LIBRBD_API_TRASH_H + +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include <set> +#include <string> +#include <vector> + +namespace librbd { + +class ProgressContext; + +struct ImageCtx; + +namespace api { + +template <typename ImageCtxT = librbd::ImageCtx> +struct Trash { + typedef std::set<cls::rbd::TrashImageSource> TrashImageSources; + static const TrashImageSources RESTORE_SOURCE_WHITELIST; + + static int move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, uint64_t delay); + static int move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, const std::string &image_id, + uint64_t delay); + static int 
get(librados::IoCtx &io_ctx, const std::string &id, + trash_image_info_t *info); + static int list(librados::IoCtx &io_ctx, + std::vector<trash_image_info_t> &entries, + bool exclude_user_remove_source); + static int purge(IoCtx& io_ctx, time_t expire_ts, + float threshold, ProgressContext& pctx); + static int remove(librados::IoCtx &io_ctx, const std::string &image_id, + bool force, ProgressContext& prog_ctx); + static int restore(librados::IoCtx &io_ctx, + const TrashImageSources& trash_image_sources, + const std::string &image_id, + const std::string &image_new_name); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Trash<librbd::ImageCtx>; + +#endif // LIBRBD_API_TRASH_H diff --git a/src/librbd/cache/ImageCache.h b/src/librbd/cache/ImageCache.h new file mode 100644 index 00000000..71af11f2 --- /dev/null +++ b/src/librbd/cache/ImageCache.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_IMAGE_CACHE +#define CEPH_LIBRBD_CACHE_IMAGE_CACHE + +#include "include/buffer_fwd.h" +#include "include/int_types.h" +#include <vector> + +class Context; + +namespace librbd { +namespace cache { + +/** + * client-side, image extent cache interface + */ +struct ImageCache { + typedef std::vector<std::pair<uint64_t,uint64_t> > Extents; + + virtual ~ImageCache() { + } + + /// client AIO methods + virtual void aio_read(Extents&& image_extents, ceph::bufferlist* bl, + int fadvise_flags, Context *on_finish) = 0; + virtual void aio_write(Extents&& image_extents, ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) = 0; + virtual void aio_discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish) = 0; + virtual void aio_flush(Context *on_finish) = 0; + virtual void aio_writesame(uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) = 0; + virtual void 
aio_compare_and_write(Extents&& image_extents, + ceph::bufferlist&& cmp_bl, + ceph::bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, + Context *on_finish) = 0; + + /// internal state methods + virtual void init(Context *on_finish) = 0; + virtual void shut_down(Context *on_finish) = 0; + + virtual void invalidate(Context *on_finish) = 0; + virtual void flush(Context *on_finish) = 0; + +}; + +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_IMAGE_CACHE diff --git a/src/librbd/cache/ImageWriteback.cc b/src/librbd/cache/ImageWriteback.cc new file mode 100644 index 00000000..ad479fbd --- /dev/null +++ b/src/librbd/cache/ImageWriteback.cc @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ImageWriteback.h" +#include "include/buffer.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ReadResult.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ImageWriteback: " << __func__ << ": " + +namespace librbd { +namespace cache { + +template <typename I> +ImageWriteback<I>::ImageWriteback(I &image_ctx) : m_image_ctx(image_ctx) { +} + +template <typename I> +void ImageWriteback<I>::aio_read(Extents &&image_extents, bufferlist *bl, + int fadvise_flags, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx, + io::AIO_TYPE_READ); + io::ImageReadRequest<I> req(m_image_ctx, aio_comp, std::move(image_extents), + io::ReadResult{bl}, fadvise_flags, {}); + req.set_bypass_image_cache(); + req.send(); +} + +template <typename I> +void ImageWriteback<I>::aio_write(Extents &&image_extents, + ceph::bufferlist&& bl, + int fadvise_flags, 
Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx, + io::AIO_TYPE_WRITE); + io::ImageWriteRequest<I> req(m_image_ctx, aio_comp, std::move(image_extents), + std::move(bl), fadvise_flags, {}); + req.set_bypass_image_cache(); + req.send(); +} + +template <typename I> +void ImageWriteback<I>::aio_discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "offset=" << offset << ", " + << "length=" << length << ", " + << "on_finish=" << on_finish << dendl; + + auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx, + io::AIO_TYPE_DISCARD); + io::ImageDiscardRequest<I> req(m_image_ctx, aio_comp, {{offset, length}}, + discard_granularity_bytes, {}); + req.set_bypass_image_cache(); + req.send(); +} + +template <typename I> +void ImageWriteback<I>::aio_flush(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "on_finish=" << on_finish << dendl; + + auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx, + io::AIO_TYPE_FLUSH); + io::ImageFlushRequest<I> req(m_image_ctx, aio_comp, io::FLUSH_SOURCE_INTERNAL, + {}); + req.set_bypass_image_cache(); + req.send(); +} + +template <typename I> +void ImageWriteback<I>::aio_writesame(uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "offset=" << offset << ", " + << "length=" << length << ", " + << "data_len=" << bl.length() << ", " + << "on_finish=" << on_finish << dendl; + + auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx, + io::AIO_TYPE_WRITESAME); + io::ImageWriteSameRequest<I> req(m_image_ctx, aio_comp, {{offset, length}}, + 
std::move(bl), fadvise_flags, {}); + req.set_bypass_image_cache(); + req.send(); +} + +template <typename I> +void ImageWriteback<I>::aio_compare_and_write(Extents &&image_extents, + ceph::bufferlist&& cmp_bl, + ceph::bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx, + io::AIO_TYPE_COMPARE_AND_WRITE); + io::ImageCompareAndWriteRequest<I> req(m_image_ctx, aio_comp, + std::move(image_extents), + std::move(cmp_bl), std::move(bl), + mismatch_offset, fadvise_flags, {}); + req.set_bypass_image_cache(); + req.send(); +} + +} // namespace cache +} // namespace librbd + +template class librbd::cache::ImageWriteback<librbd::ImageCtx>; + diff --git a/src/librbd/cache/ImageWriteback.h b/src/librbd/cache/ImageWriteback.h new file mode 100644 index 00000000..382c57c1 --- /dev/null +++ b/src/librbd/cache/ImageWriteback.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK +#define CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK + +#include "include/buffer_fwd.h" +#include "include/int_types.h" +#include <vector> + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace cache { + +/** + * client-side, image extent cache writeback handler + */ +template <typename ImageCtxT = librbd::ImageCtx> +class ImageWriteback { +public: + typedef std::vector<std::pair<uint64_t,uint64_t> > Extents; + + explicit ImageWriteback(ImageCtxT &image_ctx); + + void aio_read(Extents &&image_extents, ceph::bufferlist *bl, + int fadvise_flags, Context *on_finish); + void aio_write(Extents &&image_extents, ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish); + void aio_discard(uint64_t offset, uint64_t length, + uint32_t 
discard_granularity_bytes, Context *on_finish); + void aio_flush(Context *on_finish); + void aio_writesame(uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish); + void aio_compare_and_write(Extents &&image_extents, + ceph::bufferlist&& cmp_bl, + ceph::bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, Context *on_finish); +private: + ImageCtxT &m_image_ctx; + +}; + +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::ImageWriteback<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK diff --git a/src/librbd/cache/ObjectCacherObjectDispatch.cc b/src/librbd/cache/ObjectCacherObjectDispatch.cc new file mode 100644 index 00000000..5bced71b --- /dev/null +++ b/src/librbd/cache/ObjectCacherObjectDispatch.cc @@ -0,0 +1,408 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/ObjectCacherObjectDispatch.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/LibrbdWriteback.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/io/Utils.h" +#include "osd/osd_types.h" +#include "osdc/WritebackHandler.h" +#include <vector> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::ObjectCacherObjectDispatch: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { + +namespace { + +typedef std::vector<ObjectExtent> ObjectExtents; + +} // anonymous namespace + +template <typename I> +struct ObjectCacherObjectDispatch<I>::C_InvalidateCache : public Context { + ObjectCacherObjectDispatch* dispatcher; + bool purge_on_error; + Context *on_finish; + + C_InvalidateCache(ObjectCacherObjectDispatch* dispatcher, + bool purge_on_error, Context *on_finish) + : 
dispatcher(dispatcher), purge_on_error(purge_on_error), + on_finish(on_finish) { + } + + void finish(int r) override { + ceph_assert(dispatcher->m_cache_lock.is_locked()); + auto cct = dispatcher->m_image_ctx->cct; + + if (r == -EBLACKLISTED) { + lderr(cct) << "blacklisted during flush (purging)" << dendl; + dispatcher->m_object_cacher->purge_set(dispatcher->m_object_set); + } else if (r < 0 && purge_on_error) { + lderr(cct) << "failed to invalidate cache (purging): " + << cpp_strerror(r) << dendl; + dispatcher->m_object_cacher->purge_set(dispatcher->m_object_set); + } else if (r != 0) { + lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r) << dendl; + } + + auto unclean = dispatcher->m_object_cacher->release_set( + dispatcher->m_object_set); + if (unclean == 0) { + r = 0; + } else { + lderr(cct) << "could not release all objects from cache: " + << unclean << " bytes remain" << dendl; + if (r == 0) { + r = -EBUSY; + } + } + + on_finish->complete(r); + } +}; + +template <typename I> +ObjectCacherObjectDispatch<I>::ObjectCacherObjectDispatch( + I* image_ctx) + : m_image_ctx(image_ctx), + m_cache_lock(util::unique_lock_name( + "librbd::cache::ObjectCacherObjectDispatch::cache_lock", this)) { + ceph_assert(m_image_ctx->data_ctx.is_valid()); +} + +template <typename I> +ObjectCacherObjectDispatch<I>::~ObjectCacherObjectDispatch() { + delete m_object_cacher; + delete m_object_set; + + delete m_writeback_handler; +} + +template <typename I> +void ObjectCacherObjectDispatch<I>::init() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + m_cache_lock.Lock(); + ldout(cct, 5) << "enabling caching..." 
<< dendl; + m_writeback_handler = new LibrbdWriteback(m_image_ctx, m_cache_lock); + + uint64_t init_max_dirty = m_image_ctx->cache_max_dirty; + if (m_image_ctx->cache_writethrough_until_flush) { + init_max_dirty = 0; + } + + auto cache_size = + m_image_ctx->config.template get_val<Option::size_t>("rbd_cache_size"); + auto target_dirty = + m_image_ctx->config.template get_val<Option::size_t>("rbd_cache_target_dirty"); + auto max_dirty_age = + m_image_ctx->config.template get_val<double>("rbd_cache_max_dirty_age"); + auto block_writes_upfront = + m_image_ctx->config.template get_val<bool>("rbd_cache_block_writes_upfront"); + auto max_dirty_object = + m_image_ctx->config.template get_val<uint64_t>("rbd_cache_max_dirty_object"); + + ldout(cct, 5) << "Initial cache settings:" + << " size=" << cache_size + << " num_objects=" << 10 + << " max_dirty=" << init_max_dirty + << " target_dirty=" << target_dirty + << " max_dirty_age=" << max_dirty_age << dendl; + + m_object_cacher = new ObjectCacher(cct, m_image_ctx->perfcounter->get_name(), + *m_writeback_handler, m_cache_lock, + nullptr, nullptr, cache_size, + 10, /* reset this in init */ + init_max_dirty, target_dirty, + max_dirty_age, block_writes_upfront); + + // size object cache appropriately + if (max_dirty_object == 0) { + max_dirty_object = std::min<uint64_t>( + 2000, std::max<uint64_t>(10, cache_size / 100 / + sizeof(ObjectCacher::Object))); + } + ldout(cct, 5) << " cache bytes " << cache_size + << " -> about " << max_dirty_object << " objects" << dendl; + m_object_cacher->set_max_objects(max_dirty_object); + + m_object_set = new ObjectCacher::ObjectSet(nullptr, + m_image_ctx->data_ctx.get_id(), 0); + m_object_cacher->start(); + m_cache_lock.Unlock(); + + // add ourself to the IO object dispatcher chain + m_image_ctx->io_object_dispatcher->register_object_dispatch(this); +} + +template <typename I> +void ObjectCacherObjectDispatch<I>::shut_down(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << 
dendl; + + // chain shut down in reverse order + + // shut down the cache + on_finish = new FunctionContext([this, on_finish](int r) { + m_object_cacher->stop(); + on_finish->complete(r); + }); + + // ensure we aren't holding the cache lock post-flush + on_finish = util::create_async_context_callback(*m_image_ctx, on_finish); + + // invalidate any remaining cache entries + on_finish = new C_InvalidateCache(this, true, on_finish); + + // flush all pending writeback state + m_cache_lock.Lock(); + m_object_cacher->release_set(m_object_set); + m_object_cacher->flush_set(m_object_set, on_finish); + m_cache_lock.Unlock(); +} + +template <typename I> +bool ObjectCacherObjectDispatch<I>::read( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, librados::snap_t snap_id, int op_flags, + const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data, + io::ExtentMap* extent_map, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + // IO chained in reverse order + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~" + << object_len << dendl; + + // ensure we aren't holding the cache lock post-read + on_dispatched = util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + m_image_ctx->snap_lock.get_read(); + auto rd = m_object_cacher->prepare_read(snap_id, read_data, op_flags); + m_image_ctx->snap_lock.put_read(); + + ObjectExtent extent(oid, object_no, object_off, object_len, 0); + extent.oloc.pool = m_image_ctx->data_ctx.get_id(); + extent.buffer_extents.push_back({0, object_len}); + rd->extents.push_back(extent); + + ZTracer::Trace trace(parent_trace); + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + + m_cache_lock.Lock(); + int r = m_object_cacher->readx(rd, m_object_set, on_dispatched, &trace); + m_cache_lock.Unlock(); + if (r != 0) { + on_dispatched->complete(r); + } + return true; +} + +template 
<typename I> +bool ObjectCacherObjectDispatch<I>::discard( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, const ::SnapContext &snapc, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~" + << object_len << dendl; + + ObjectExtents object_extents; + object_extents.emplace_back(oid, object_no, object_off, object_len, 0); + + // discard the cache state after changes are committed to disk (and to + // prevent races w/ readahead) + auto ctx = *on_finish; + *on_finish = new FunctionContext( + [this, object_extents, ctx](int r) { + m_cache_lock.Lock(); + m_object_cacher->discard_set(m_object_set, object_extents); + m_cache_lock.Unlock(); + + ctx->complete(r); + }); + + // ensure we aren't holding the cache lock post-write + on_dispatched = util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + + // ensure any in-flight writeback is complete before advancing + // the discard request + m_cache_lock.Lock(); + m_object_cacher->discard_writeback(m_object_set, object_extents, + on_dispatched); + m_cache_lock.Unlock(); + return true; +} + +template <typename I> +bool ObjectCacherObjectDispatch<I>::write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~" + << data.length() << dendl; + + // ensure we aren't holding the cache lock post-write + on_dispatched = 
util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + m_image_ctx->snap_lock.get_read(); + ObjectCacher::OSDWrite *wr = m_object_cacher->prepare_write( + snapc, data, ceph::real_time::min(), op_flags, *journal_tid); + m_image_ctx->snap_lock.put_read(); + + ObjectExtent extent(oid, 0, object_off, data.length(), 0); + extent.oloc.pool = m_image_ctx->data_ctx.get_id(); + extent.buffer_extents.push_back({0, data.length()}); + wr->extents.push_back(extent); + + ZTracer::Trace trace(parent_trace); + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + + m_cache_lock.Lock(); + m_object_cacher->writex(wr, m_object_set, on_dispatched, &trace); + m_cache_lock.Unlock(); + return true; +} + +template <typename I> +bool ObjectCacherObjectDispatch<I>::write_same( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, io::Extents&& buffer_extents, ceph::bufferlist&& data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~" + << object_len << dendl; + + // ObjectCacher doesn't support write-same so convert to regular write + ObjectExtent extent(oid, 0, object_off, object_len, 0); + extent.buffer_extents = std::move(buffer_extents); + + bufferlist ws_data; + io::util::assemble_write_same_extent(extent, data, &ws_data, true); + + return write(oid, object_no, object_off, std::move(ws_data), snapc, + op_flags, parent_trace, object_dispatch_flags, journal_tid, + dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool ObjectCacherObjectDispatch<I>::compare_and_write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data, + const ::SnapContext &snapc, int op_flags, + const 
ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~" + << cmp_data.length() << dendl; + + // pass-through the compare-and-write request since it's not a supported + // operation of the ObjectCacher + + // ensure we aren't holding the cache lock post-flush + on_dispatched = util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + // flush any pending writes from the cache + ZTracer::Trace trace(parent_trace); + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + + ObjectExtents object_extents; + object_extents.emplace_back(oid, object_no, object_off, cmp_data.length(), + 0); + + Mutex::Locker cache_locker(m_cache_lock); + m_object_cacher->flush_set(m_object_set, object_extents, &trace, + on_dispatched); + return true; +} + +template <typename I> +bool ObjectCacherObjectDispatch<I>::flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + // ensure we aren't holding the cache lock post-flush + on_dispatched = util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + m_cache_lock.Lock(); + if (flush_source == io::FLUSH_SOURCE_USER && !m_user_flushed && + m_image_ctx->cache_writethrough_until_flush && + m_image_ctx->cache_max_dirty > 0) { + m_user_flushed = true; + m_object_cacher->set_max_dirty(m_image_ctx->cache_max_dirty); + ldout(cct, 5) << "saw first user flush, enabling writeback" << dendl; + } + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + m_object_cacher->flush_set(m_object_set, on_dispatched); + m_cache_lock.Unlock(); + return true; +} + +template <typename I> +bool 
ObjectCacherObjectDispatch<I>::invalidate_cache(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + // ensure we aren't holding the cache lock post-flush + on_finish = util::create_async_context_callback(*m_image_ctx, on_finish); + + // invalidate any remaining cache entries + on_finish = new C_InvalidateCache(this, false, on_finish); + + m_cache_lock.Lock(); + m_object_cacher->release_set(m_object_set); + m_object_cacher->flush_set(m_object_set, on_finish); + m_cache_lock.Unlock(); + return true; +} + +template <typename I> +bool ObjectCacherObjectDispatch<I>::reset_existence_cache( + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + m_cache_lock.Lock(); + m_object_cacher->clear_nonexistence(m_object_set); + m_cache_lock.Unlock(); + + return false; +} + +} // namespace cache +} // namespace librbd + +template class librbd::cache::ObjectCacherObjectDispatch<librbd::ImageCtx>; diff --git a/src/librbd/cache/ObjectCacherObjectDispatch.h b/src/librbd/cache/ObjectCacherObjectDispatch.h new file mode 100644 index 00000000..cb145681 --- /dev/null +++ b/src/librbd/cache/ObjectCacherObjectDispatch.h @@ -0,0 +1,112 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H + +#include "librbd/io/ObjectDispatchInterface.h" +#include "common/Mutex.h" +#include "osdc/ObjectCacher.h" + +struct WritebackHandler; + +namespace librbd { + +class ImageCtx; + +namespace cache { + +/** + * Facade around the OSDC object cacher to make it align with + * the object dispatcher interface + */ +template <typename ImageCtxT = ImageCtx> +class ObjectCacherObjectDispatch : public io::ObjectDispatchInterface { +public: + static ObjectCacherObjectDispatch* create(ImageCtxT* image_ctx) { + return new ObjectCacherObjectDispatch(image_ctx); + } + + 
ObjectCacherObjectDispatch(ImageCtxT* image_ctx); + ~ObjectCacherObjectDispatch() override; + + io::ObjectDispatchLayer get_object_dispatch_layer() const override { + return io::OBJECT_DISPATCH_LAYER_CACHE; + } + + void init(); + void shut_down(Context* on_finish) override; + + bool read( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, librados::snap_t snap_id, int op_flags, + const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data, + io::ExtentMap* extent_map, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, const ::SnapContext &snapc, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, io::Extents&& buffer_extents, + ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* 
object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool invalidate_cache(Context* on_finish) override; + bool reset_existence_cache(Context* on_finish) override; + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) { + } + +private: + struct C_InvalidateCache; + + ImageCtxT* m_image_ctx; + + Mutex m_cache_lock; + ObjectCacher *m_object_cacher = nullptr; + ObjectCacher::ObjectSet *m_object_set = nullptr; + + WritebackHandler *m_writeback_handler = nullptr; + + bool m_user_flushed = false; + +}; + +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::ObjectCacherObjectDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H diff --git a/src/librbd/cache/PassthroughImageCache.cc b/src/librbd/cache/PassthroughImageCache.cc new file mode 100644 index 00000000..c3672f53 --- /dev/null +++ b/src/librbd/cache/PassthroughImageCache.cc @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "PassthroughImageCache.h" +#include "include/buffer.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::PassthroughImageCache: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { + +template <typename I> +PassthroughImageCache<I>::PassthroughImageCache(ImageCtx &image_ctx) + : m_image_ctx(image_ctx), m_image_writeback(image_ctx) { +} + +template <typename I> +void PassthroughImageCache<I>::aio_read(Extents &&image_extents, bufferlist *bl, + int fadvise_flags, 
Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + m_image_writeback.aio_read(std::move(image_extents), bl, fadvise_flags, + on_finish); +} + +template <typename I> +void PassthroughImageCache<I>::aio_write(Extents &&image_extents, + bufferlist&& bl, + int fadvise_flags, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + m_image_writeback.aio_write(std::move(image_extents), std::move(bl), + fadvise_flags, on_finish); +} + +template <typename I> +void PassthroughImageCache<I>::aio_discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "offset=" << offset << ", " + << "length=" << length << ", " + << "on_finish=" << on_finish << dendl; + + m_image_writeback.aio_discard(offset, length, discard_granularity_bytes, + on_finish); +} + +template <typename I> +void PassthroughImageCache<I>::aio_flush(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "on_finish=" << on_finish << dendl; + + m_image_writeback.aio_flush(on_finish); +} + +template <typename I> +void PassthroughImageCache<I>::aio_writesame(uint64_t offset, uint64_t length, + bufferlist&& bl, int fadvise_flags, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "offset=" << offset << ", " + << "length=" << length << ", " + << "data_len=" << bl.length() << ", " + << "on_finish=" << on_finish << dendl; + + m_image_writeback.aio_writesame(offset, length, std::move(bl), fadvise_flags, + on_finish); +} + +template <typename I> +void PassthroughImageCache<I>::aio_compare_and_write(Extents &&image_extents, + bufferlist&& cmp_bl, + bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, + Context *on_finish) { + 
CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + m_image_writeback.aio_compare_and_write( + std::move(image_extents), std::move(cmp_bl), std::move(bl), mismatch_offset, + fadvise_flags, on_finish); +} + +template <typename I> +void PassthroughImageCache<I>::init(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + on_finish->complete(0); +} + +template <typename I> +void PassthroughImageCache<I>::shut_down(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + on_finish->complete(0); +} + +template <typename I> +void PassthroughImageCache<I>::invalidate(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + // dump cache contents (don't have anything) + on_finish->complete(0); +} + +template <typename I> +void PassthroughImageCache<I>::flush(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + // internal flush -- nothing to writeback but make sure + // in-flight IO is flushed + aio_flush(on_finish); +} + +} // namespace cache +} // namespace librbd + +template class librbd::cache::PassthroughImageCache<librbd::ImageCtx>; diff --git a/src/librbd/cache/PassthroughImageCache.h b/src/librbd/cache/PassthroughImageCache.h new file mode 100644 index 00000000..4be69a50 --- /dev/null +++ b/src/librbd/cache/PassthroughImageCache.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PASSTHROUGH_IMAGE_CACHE +#define CEPH_LIBRBD_CACHE_PASSTHROUGH_IMAGE_CACHE + +#include "ImageCache.h" +#include "ImageWriteback.h" + +namespace librbd { + +struct ImageCtx; + +namespace cache { + +/** + * Example passthrough client-side, image extent cache + */ +template <typename ImageCtxT = librbd::ImageCtx> +class PassthroughImageCache : public ImageCache { 
+public: + explicit PassthroughImageCache(ImageCtx &image_ctx); + + /// client AIO methods + void aio_read(Extents&& image_extents, ceph::bufferlist *bl, + int fadvise_flags, Context *on_finish) override; + void aio_write(Extents&& image_extents, ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) override; + void aio_discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish) override; + void aio_flush(Context *on_finish) override; + void aio_writesame(uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) override; + void aio_compare_and_write(Extents&& image_extents, + ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl, + uint64_t *mismatch_offset,int fadvise_flags, + Context *on_finish) override; + + /// internal state methods + void init(Context *on_finish) override; + void shut_down(Context *on_finish) override; + + void invalidate(Context *on_finish) override; + void flush(Context *on_finish) override; + +private: + ImageCtxT &m_image_ctx; + ImageWriteback<ImageCtxT> m_image_writeback; + +}; + +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::PassthroughImageCache<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_CACHE_PASSTHROUGH_IMAGE_CACHE diff --git a/src/librbd/deep_copy/ImageCopyRequest.cc b/src/librbd/deep_copy/ImageCopyRequest.cc new file mode 100644 index 00000000..62e5409c --- /dev/null +++ b/src/librbd/deep_copy/ImageCopyRequest.cc @@ -0,0 +1,187 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ImageCopyRequest.h" +#include "ObjectCopyRequest.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "librbd/deep_copy/Utils.h" +#include "librbd/image/CloseRequest.h" +#include "librbd/image/OpenRequest.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << 
"librbd::deep_copy::ImageCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +using librbd::util::create_context_callback; +using librbd::util::unique_lock_name; + +template <typename I> +ImageCopyRequest<I>::ImageCopyRequest(I *src_image_ctx, I *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, + const ObjectNumber &object_number, + const SnapSeqs &snap_seqs, + ProgressContext *prog_ctx, + Context *on_finish) + : RefCountedObject(dst_image_ctx->cct, 1), m_src_image_ctx(src_image_ctx), + m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start), + m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start), + m_flatten(flatten), m_object_number(object_number), m_snap_seqs(snap_seqs), + m_prog_ctx(prog_ctx), m_on_finish(on_finish), m_cct(dst_image_ctx->cct), + m_lock(unique_lock_name("ImageCopyRequest::m_lock", this)) { +} + +template <typename I> +void ImageCopyRequest<I>::send() { + m_dst_image_ctx->snap_lock.get_read(); + util::compute_snap_map(m_dst_image_ctx->cct, m_src_snap_id_start, + m_src_snap_id_end, m_dst_image_ctx->snaps, m_snap_seqs, + &m_snap_map); + m_dst_image_ctx->snap_lock.put_read(); + + if (m_snap_map.empty()) { + lderr(m_cct) << "failed to map snapshots within boundary" << dendl; + finish(-EINVAL); + return; + } + + send_object_copies(); +} + +template <typename I> +void ImageCopyRequest<I>::cancel() { + Mutex::Locker locker(m_lock); + + ldout(m_cct, 20) << dendl; + m_canceled = true; +} + +template <typename I> +void ImageCopyRequest<I>::send_object_copies() { + m_object_no = 0; + if (m_object_number) { + m_object_no = *m_object_number + 1; + } + + uint64_t size; + { + RWLock::RLocker snap_locker(m_src_image_ctx->snap_lock); + size = m_src_image_ctx->get_image_size(CEPH_NOSNAP); + for (auto snap_id : m_src_image_ctx->snaps) { + size = std::max(size, 
m_src_image_ctx->get_image_size(snap_id)); + } + } + m_end_object_no = Striper::get_num_objects(m_dst_image_ctx->layout, size); + + ldout(m_cct, 20) << "start_object=" << m_object_no << ", " + << "end_object=" << m_end_object_no << dendl; + + bool complete; + { + Mutex::Locker locker(m_lock); + for (uint64_t i = 0; + i < m_src_image_ctx->config.template get_val<uint64_t>("rbd_concurrent_management_ops"); + ++i) { + send_next_object_copy(); + if (m_ret_val < 0 && m_current_ops == 0) { + break; + } + } + complete = (m_current_ops == 0) && !m_updating_progress; + } + + if (complete) { + finish(m_ret_val); + } +} + +template <typename I> +void ImageCopyRequest<I>::send_next_object_copy() { + ceph_assert(m_lock.is_locked()); + + if (m_canceled && m_ret_val == 0) { + ldout(m_cct, 10) << "image copy canceled" << dendl; + m_ret_val = -ECANCELED; + } + + if (m_ret_val < 0 || m_object_no >= m_end_object_no) { + return; + } + + uint64_t ono = m_object_no++; + + ldout(m_cct, 20) << "object_num=" << ono << dendl; + + ++m_current_ops; + + Context *ctx = new FunctionContext( + [this, ono](int r) { + handle_object_copy(ono, r); + }); + ObjectCopyRequest<I> *req = ObjectCopyRequest<I>::create( + m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_dst_snap_id_start, + m_snap_map, ono, m_flatten, ctx); + req->send(); +} + +template <typename I> +void ImageCopyRequest<I>::handle_object_copy(uint64_t object_no, int r) { + ldout(m_cct, 20) << "object_no=" << object_no << ", r=" << r << dendl; + + bool complete; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_current_ops > 0); + --m_current_ops; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "object copy failed: " << cpp_strerror(r) << dendl; + if (m_ret_val == 0) { + m_ret_val = r; + } + } else { + m_copied_objects.push(object_no); + while (!m_updating_progress && !m_copied_objects.empty() && + m_copied_objects.top() == + (m_object_number ? 
*m_object_number + 1 : 0)) { + m_object_number = m_copied_objects.top(); + m_copied_objects.pop(); + uint64_t progress_object_no = *m_object_number + 1; + m_updating_progress = true; + m_lock.Unlock(); + m_prog_ctx->update_progress(progress_object_no, m_end_object_no); + m_lock.Lock(); + ceph_assert(m_updating_progress); + m_updating_progress = false; + } + } + + send_next_object_copy(); + complete = (m_current_ops == 0) && !m_updating_progress; + } + + if (complete) { + finish(m_ret_val); + } +} + +template <typename I> +void ImageCopyRequest<I>::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + put(); +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::ImageCopyRequest<librbd::ImageCtx>; diff --git a/src/librbd/deep_copy/ImageCopyRequest.h b/src/librbd/deep_copy/ImageCopyRequest.h new file mode 100644 index 00000000..189cb237 --- /dev/null +++ b/src/librbd/deep_copy/ImageCopyRequest.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "common/Mutex.h" +#include "common/RefCountedObj.h" +#include "librbd/Types.h" +#include "librbd/deep_copy/Types.h" +#include <functional> +#include <map> +#include <queue> +#include <vector> +#include <boost/optional.hpp> + +class Context; + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace deep_copy { + +template <typename ImageCtxT = ImageCtx> +class ImageCopyRequest : public RefCountedObject { +public: + static ImageCopyRequest* create(ImageCtxT *src_image_ctx, + ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, + const ObjectNumber &object_number, + const 
SnapSeqs &snap_seqs, + ProgressContext *prog_ctx, + Context *on_finish) { + return new ImageCopyRequest(src_image_ctx, dst_image_ctx, src_snap_id_start, + src_snap_id_end, dst_snap_id_start, flatten, + object_number, snap_seqs, prog_ctx, on_finish); + } + + ImageCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, const ObjectNumber &object_number, + const SnapSeqs &snap_seqs, ProgressContext *prog_ctx, + Context *on_finish); + + void send(); + void cancel(); + +private: + /** + * @verbatim + * + * <start> + * | . . . . . + * | . . (parallel execution of + * v v . multiple objects at once) + * COPY_OBJECT . . . . + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT *m_src_image_ctx; + ImageCtxT *m_dst_image_ctx; + librados::snap_t m_src_snap_id_start; + librados::snap_t m_src_snap_id_end; + librados::snap_t m_dst_snap_id_start; + bool m_flatten; + ObjectNumber m_object_number; + SnapSeqs m_snap_seqs; + ProgressContext *m_prog_ctx; + Context *m_on_finish; + + CephContext *m_cct; + Mutex m_lock; + bool m_canceled = false; + + uint64_t m_object_no = 0; + uint64_t m_end_object_no = 0; + uint64_t m_current_ops = 0; + std::priority_queue< + uint64_t, std::vector<uint64_t>, std::greater<uint64_t>> m_copied_objects; + bool m_updating_progress = false; + SnapMap m_snap_map; + int m_ret_val = 0; + + void send_object_copies(); + void send_next_object_copy(); + void handle_object_copy(uint64_t object_no, int r); + + void finish(int r); +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class librbd::deep_copy::ImageCopyRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H diff --git a/src/librbd/deep_copy/MetadataCopyRequest.cc b/src/librbd/deep_copy/MetadataCopyRequest.cc new file mode 100644 index 00000000..cbce9a19 --- /dev/null +++ 
b/src/librbd/deep_copy/MetadataCopyRequest.cc @@ -0,0 +1,123 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MetadataCopyRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::MetadataCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +namespace { + +const uint64_t MAX_METADATA_ITEMS = 128; + +} // anonymous namespace + +using librbd::util::create_rados_callback; + +template <typename I> +MetadataCopyRequest<I>::MetadataCopyRequest(I *src_image_ctx, I *dst_image_ctx, + Context *on_finish) + : m_src_image_ctx(src_image_ctx), m_dst_image_ctx(dst_image_ctx), + m_on_finish(on_finish), m_cct(dst_image_ctx->cct) { +} + +template <typename I> +void MetadataCopyRequest<I>::send() { + list_src_metadata(); +} + +template <typename I> +void MetadataCopyRequest<I>::list_src_metadata() { + ldout(m_cct, 20) << "start_key=" << m_last_metadata_key << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::metadata_list_start(&op, m_last_metadata_key, MAX_METADATA_ITEMS); + + librados::AioCompletion *aio_comp = create_rados_callback< + MetadataCopyRequest<I>, + &MetadataCopyRequest<I>::handle_list_src_metadata>(this); + m_out_bl.clear(); + m_src_image_ctx->md_ctx.aio_operate(m_src_image_ctx->header_oid, + aio_comp, &op, &m_out_bl); + aio_comp->release(); +} + +template <typename I> +void MetadataCopyRequest<I>::handle_list_src_metadata(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + Metadata metadata; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = librbd::cls_client::metadata_list_finish(&it, &metadata); + } + + if (r < 0) { + lderr(m_cct) << "failed to retrieve metadata: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (metadata.empty()) { + 
finish(0); + return; + } + + m_last_metadata_key = metadata.rbegin()->first; + m_more_metadata = (metadata.size() >= MAX_METADATA_ITEMS); + set_dst_metadata(metadata); +} + +template <typename I> +void MetadataCopyRequest<I>::set_dst_metadata(const Metadata& metadata) { + ldout(m_cct, 20) << "count=" << metadata.size() << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::metadata_set(&op, metadata); + + librados::AioCompletion *aio_comp = create_rados_callback< + MetadataCopyRequest<I>, + &MetadataCopyRequest<I>::handle_set_dst_metadata>(this); + m_dst_image_ctx->md_ctx.aio_operate(m_dst_image_ctx->header_oid, aio_comp, + &op); + aio_comp->release(); +} + +template <typename I> +void MetadataCopyRequest<I>::handle_set_dst_metadata(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to set metadata: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (m_more_metadata) { + list_src_metadata(); + return; + } + + finish(0); +} + +template <typename I> +void MetadataCopyRequest<I>::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + m_on_finish->complete(r); + delete this; +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::MetadataCopyRequest<librbd::ImageCtx>; diff --git a/src/librbd/deep_copy/MetadataCopyRequest.h b/src/librbd/deep_copy/MetadataCopyRequest.h new file mode 100644 index 00000000..5bd69d8b --- /dev/null +++ b/src/librbd/deep_copy/MetadataCopyRequest.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "librbd/ImageCtx.h" +#include <map> +#include <string> + +class Context; + +namespace librbd { +namespace deep_copy { + +template <typename ImageCtxT = 
librbd::ImageCtx> +class MetadataCopyRequest { +public: + static MetadataCopyRequest* create(ImageCtxT *src_image_ctx, + ImageCtxT *dst_image_ctx, + Context *on_finish) { + return new MetadataCopyRequest(src_image_ctx, dst_image_ctx, on_finish); + } + + MetadataCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * LIST_SRC_METADATA <------\ + * | | (repeat if additional + * v | metadata) + * SET_DST_METADATA --------/ + * | + * v + * <finish> + * + * @endverbatim + */ + typedef std::map<std::string, bufferlist> Metadata; + + ImageCtxT *m_src_image_ctx; + ImageCtxT *m_dst_image_ctx; + Context *m_on_finish; + + CephContext *m_cct; + bufferlist m_out_bl; + + std::string m_last_metadata_key; + bool m_more_metadata = false; + + void list_src_metadata(); + void handle_list_src_metadata(int r); + + void set_dst_metadata(const Metadata& metadata); + void handle_set_dst_metadata(int r); + + void finish(int r); + +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class librbd::deep_copy::MetadataCopyRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H diff --git a/src/librbd/deep_copy/ObjectCopyRequest.cc b/src/librbd/deep_copy/ObjectCopyRequest.cc new file mode 100644 index 00000000..2400757a --- /dev/null +++ b/src/librbd/deep_copy/ObjectCopyRequest.cc @@ -0,0 +1,998 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ObjectCopyRequest.h" +#include "common/errno.h" +#include "librados/snap_set_diff.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ReadResult.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix 
*_dout << "librbd::deep_copy::ObjectCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librados { + +inline bool operator==(const clone_info_t& rhs, const clone_info_t& lhs) { + return (rhs.cloneid == lhs.cloneid && + rhs.snaps == lhs.snaps && + rhs.overlap == lhs.overlap && + rhs.size == lhs.size); +} + +inline bool operator==(const snap_set_t& rhs, const snap_set_t& lhs) { + return (rhs.clones == lhs.clones && + rhs.seq == lhs.seq); +} + +} // namespace librados + +namespace librbd { +namespace deep_copy { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +ObjectCopyRequest<I>::ObjectCopyRequest(I *src_image_ctx, + I *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t dst_snap_id_start, + const SnapMap &snap_map, + uint64_t dst_object_number, + bool flatten, Context *on_finish) + : m_src_image_ctx(src_image_ctx), + m_dst_image_ctx(dst_image_ctx), m_cct(dst_image_ctx->cct), + m_src_snap_id_start(src_snap_id_start), + m_dst_snap_id_start(dst_snap_id_start), m_snap_map(snap_map), + m_dst_object_number(dst_object_number), m_flatten(flatten), + m_on_finish(on_finish) { + ceph_assert(src_image_ctx->data_ctx.is_valid()); + ceph_assert(dst_image_ctx->data_ctx.is_valid()); + ceph_assert(!m_snap_map.empty()); + + m_src_async_op = new io::AsyncOperation(); + m_src_async_op->start_op(*util::get_image_ctx(m_src_image_ctx)); + + m_src_io_ctx.dup(m_src_image_ctx->data_ctx); + m_dst_io_ctx.dup(m_dst_image_ctx->data_ctx); + + m_dst_oid = m_dst_image_ctx->get_object_name(dst_object_number); + + ldout(m_cct, 20) << "dst_oid=" << m_dst_oid << dendl; + + compute_src_object_extents(); +} + +template <typename I> +void ObjectCopyRequest<I>::send() { + send_list_snaps(); +} + +template <typename I> +void ObjectCopyRequest<I>::send_list_snaps() { + ceph_assert(!m_src_objects.empty()); + m_src_ono = *m_src_objects.begin(); + m_src_oid = m_src_image_ctx->get_object_name(m_src_ono); + + 
ldout(m_cct, 20) << "src_oid=" << m_src_oid << dendl; + + librados::AioCompletion *rados_completion = create_rados_callback< + ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_list_snaps>(this); + + librados::ObjectReadOperation op; + m_snap_set = {}; + m_snap_ret = 0; + op.list_snaps(&m_snap_set, &m_snap_ret); + + m_src_io_ctx.snap_set_read(CEPH_SNAPDIR); + int r = m_src_io_ctx.aio_operate(m_src_oid, rados_completion, &op, + nullptr); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void ObjectCopyRequest<I>::handle_list_snaps(int r) { + if (r == 0 && m_snap_ret < 0) { + r = m_snap_ret; + } + + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to list snaps: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (m_retry_missing_read) { + if (m_snap_set == m_retry_snap_set) { + lderr(m_cct) << "read encountered missing object using up-to-date snap set" + << dendl; + finish(-ENOENT); + return; + } + + ldout(m_cct, 20) << "retrying using updated snap set" << dendl; + m_retry_missing_read = false; + m_retry_snap_set = {}; + } + + if (r == -ENOENT) { + for (auto &it : m_src_object_extents) { + auto &e = it.second; + if (e.object_no == m_src_ono) { + e.noent = true; + } + } + m_read_ops = {}; + m_read_snaps = {}; + m_zero_interval = {}; + } else { + compute_read_ops(); + } + send_read_object(); +} + +template <typename I> +void ObjectCopyRequest<I>::send_read_object() { + + if (m_read_snaps.empty()) { + // all snapshots have been read + merge_write_ops(); + + ceph_assert(!m_src_objects.empty()); + m_src_objects.erase(m_src_objects.begin()); + + if (!m_src_objects.empty()) { + send_list_snaps(); + return; + } + + // all objects have been read + send_read_from_parent(); + return; + } + + auto index = *m_read_snaps.begin(); + auto src_snap_seq = index.second; + + bool read_required = false; + librados::ObjectReadOperation op; + + for (auto &copy_op : m_read_ops[index]) { + if 
(!read_required) { + // map the copy op start snap id back to the necessary read snap id + m_src_io_ctx.snap_set_read(src_snap_seq); + + ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << dendl; + read_required = true; + } + ldout(m_cct, 20) << "read op: " << copy_op.src_offset << "~" + << copy_op.length << dendl; + op.sparse_read(copy_op.src_offset, copy_op.length, &copy_op.src_extent_map, + &copy_op.out_bl, nullptr); + op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | + LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + } + + if (!read_required) { + // nothing written to this object for this snapshot (must be trunc/remove) + handle_read_object(0); + return; + } + + auto ctx = create_context_callback< + ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_read_object>(this); + auto comp = create_rados_callback(ctx); + + ldout(m_cct, 20) << "read " << m_src_oid << dendl; + + int r = m_src_io_ctx.aio_operate(m_src_oid, comp, &op, nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void ObjectCopyRequest<I>::handle_read_object(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == -ENOENT) { + m_retry_snap_set = m_snap_set; + m_retry_missing_read = true; + + ldout(m_cct, 5) << "object missing potentially due to removed snapshot" + << dendl; + send_list_snaps(); + return; + } + + if (r < 0) { + lderr(m_cct) << "failed to read from source object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + ceph_assert(!m_read_snaps.empty()); + m_read_snaps.erase(m_read_snaps.begin()); + + send_read_object(); +} + +template <typename I> +void ObjectCopyRequest<I>::send_read_from_parent() { + m_src_image_ctx->snap_lock.get_read(); + m_src_image_ctx->parent_lock.get_read(); + io::Extents image_extents; + compute_read_from_parent_ops(&image_extents); + m_src_image_ctx->snap_lock.put_read(); + + if (image_extents.empty()) { + m_src_image_ctx->parent_lock.put_read(); + handle_read_from_parent(0); + return; + } + + ldout(m_cct, 20) << dendl; + + 
ceph_assert(m_src_image_ctx->parent != nullptr); + + auto ctx = create_context_callback< + ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_read_from_parent>(this); + auto comp = io::AioCompletion::create_and_start( + ctx, util::get_image_ctx(m_src_image_ctx->parent), io::AIO_TYPE_READ); + ldout(m_cct, 20) << "completion " << comp << ", extents " << image_extents + << dendl; + + auto src_image_ctx = m_src_image_ctx; + io::ImageRequest<I>::aio_read(src_image_ctx->parent, comp, + std::move(image_extents), + io::ReadResult{&m_read_from_parent_data}, 0, + ZTracer::Trace()); + src_image_ctx->parent_lock.put_read(); +} + +template <typename I> +void ObjectCopyRequest<I>::handle_read_from_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to read from parent: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (!m_read_ops.empty()) { + ceph_assert(m_read_ops.size() == 1); + auto src_snap_seq = m_read_ops.begin()->first.first; + auto &copy_ops = m_read_ops.begin()->second; + uint64_t offset = 0; + for (auto it = copy_ops.begin(); it != copy_ops.end(); ) { + it->out_bl.substr_of(m_read_from_parent_data, offset, it->length); + offset += it->length; + if (it->out_bl.is_zero()) { + m_zero_interval[src_snap_seq].insert(it->dst_offset, it->length); + it = copy_ops.erase(it); + } else { + it++; + } + } + merge_write_ops(); + } + + compute_dst_object_may_exist(); + compute_zero_ops(); + + if (m_write_ops.empty()) { + // nothing to copy + finish(-ENOENT); + return; + } + + send_write_object(); + return; +} + +template <typename I> +void ObjectCopyRequest<I>::send_write_object() { + ceph_assert(!m_write_ops.empty()); + auto& copy_ops = m_write_ops.begin()->second; + + // retrieve the destination snap context for the op + SnapIds dst_snap_ids; + librados::snap_t dst_snap_seq = 0; + librados::snap_t src_snap_seq = m_write_ops.begin()->first; + if (src_snap_seq != 0) { + auto snap_map_it = m_snap_map.find(src_snap_seq); + 
ceph_assert(snap_map_it != m_snap_map.end()); + + auto dst_snap_id = snap_map_it->second.front(); + auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_id); + ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end()); + if (!dst_may_exist_it->second && !copy_ops.empty()) { + // if the object cannot exist, the only valid op is to remove it + ceph_assert(copy_ops.size() == 1U); + ceph_assert(copy_ops.begin()->type == COPY_OP_TYPE_REMOVE); + } + + // write snapshot context should be before actual snapshot + ceph_assert(!snap_map_it->second.empty()); + auto dst_snap_ids_it = snap_map_it->second.begin(); + ++dst_snap_ids_it; + + dst_snap_ids = SnapIds{dst_snap_ids_it, snap_map_it->second.end()}; + if (!dst_snap_ids.empty()) { + dst_snap_seq = dst_snap_ids.front(); + } + ceph_assert(dst_snap_seq != CEPH_NOSNAP); + } + + ldout(m_cct, 20) << "dst_snap_seq=" << dst_snap_seq << ", " + << "dst_snaps=" << dst_snap_ids << dendl; + + librados::ObjectWriteOperation op; + uint64_t buffer_offset; + + if (!m_dst_image_ctx->migration_info.empty()) { + cls_client::assert_snapc_seq(&op, dst_snap_seq, + cls::rbd::ASSERT_SNAPC_SEQ_GT_SNAPSET_SEQ); + } + + for (auto &copy_op : copy_ops) { + switch (copy_op.type) { + case COPY_OP_TYPE_WRITE: + buffer_offset = 0; + for (auto &e : copy_op.dst_extent_map) { + ldout(m_cct, 20) << "write op: " << e.first << "~" << e.second + << dendl; + bufferlist tmpbl; + tmpbl.substr_of(copy_op.out_bl, buffer_offset, e.second); + op.write(e.first, tmpbl); + op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | + LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + buffer_offset += e.second; + } + break; + case COPY_OP_TYPE_ZERO: + ldout(m_cct, 20) << "zero op: " << copy_op.dst_offset << "~" + << copy_op.length << dendl; + op.zero(copy_op.dst_offset, copy_op.length); + break; + case COPY_OP_TYPE_REMOVE_TRUNC: + ldout(m_cct, 20) << "create op" << dendl; + op.create(false); + // fall through + case COPY_OP_TYPE_TRUNC: + ldout(m_cct, 20) << "trunc op: " << 
copy_op.dst_offset << dendl; + op.truncate(copy_op.dst_offset); + break; + case COPY_OP_TYPE_REMOVE: + ldout(m_cct, 20) << "remove op" << dendl; + op.remove(); + break; + default: + ceph_abort(); + } + } + + if (op.size() == (m_dst_image_ctx->migration_info.empty() ? 0 : 1)) { + handle_write_object(0); + return; + } + + int r; + Context *finish_op_ctx; + { + RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock); + finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r); + } + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_write_object(r); + finish_op_ctx->complete(0); + }); + librados::AioCompletion *comp = create_rados_callback(ctx); + r = m_dst_io_ctx.aio_operate(m_dst_oid, comp, &op, dst_snap_seq, dst_snap_ids, + nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void ObjectCopyRequest<I>::handle_write_object(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == -ENOENT) { + r = 0; + } else if (r == -ERANGE) { + ldout(m_cct, 10) << "concurrent deep copy" << dendl; + r = 0; + } + if (r < 0) { + lderr(m_cct) << "failed to write to destination object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + m_write_ops.erase(m_write_ops.begin()); + if (!m_write_ops.empty()) { + send_write_object(); + return; + } + + send_update_object_map(); +} + +template <typename I> +void ObjectCopyRequest<I>::send_update_object_map() { + if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP) || + m_dst_object_state.empty()) { + finish(0); + return; + } + + m_dst_image_ctx->owner_lock.get_read(); + m_dst_image_ctx->snap_lock.get_read(); + if (m_dst_image_ctx->object_map == nullptr) { + // possible that exclusive lock was lost in background + lderr(m_cct) << "object map is not initialized" << dendl; + + m_dst_image_ctx->snap_lock.put_read(); + m_dst_image_ctx->owner_lock.put_read(); 
+ finish(-EINVAL); + return; + } + + auto &dst_object_state = *m_dst_object_state.begin(); + auto it = m_snap_map.find(dst_object_state.first); + ceph_assert(it != m_snap_map.end()); + auto dst_snap_id = it->second.front(); + auto object_state = dst_object_state.second; + m_dst_object_state.erase(m_dst_object_state.begin()); + + ldout(m_cct, 20) << "dst_snap_id=" << dst_snap_id << ", object_state=" + << static_cast<uint32_t>(object_state) << dendl; + + int r; + auto finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + m_dst_image_ctx->snap_lock.put_read(); + m_dst_image_ctx->owner_lock.put_read(); + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_update_object_map(r); + finish_op_ctx->complete(0); + }); + + auto dst_image_ctx = m_dst_image_ctx; + dst_image_ctx->object_map_lock.get_write(); + bool sent = dst_image_ctx->object_map->template aio_update< + Context, &Context::complete>(dst_snap_id, m_dst_object_number, object_state, + {}, {}, false, ctx); + + // NOTE: state machine might complete before we reach here + dst_image_ctx->object_map_lock.put_write(); + dst_image_ctx->snap_lock.put_read(); + dst_image_ctx->owner_lock.put_read(); + if (!sent) { + ceph_assert(dst_snap_id == CEPH_NOSNAP); + ctx->complete(0); + } +} + +template <typename I> +void ObjectCopyRequest<I>::handle_update_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to update object map: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (!m_dst_object_state.empty()) { + send_update_object_map(); + return; + } + finish(0); +} + +template <typename I> +Context *ObjectCopyRequest<I>::start_lock_op(RWLock &owner_lock, int* r) { + ceph_assert(m_dst_image_ctx->owner_lock.is_locked()); + if (m_dst_image_ctx->exclusive_lock == nullptr) { + return new FunctionContext([](int r) {}); + } + 
return m_dst_image_ctx->exclusive_lock->start_op(r); +} + +template <typename I> +uint64_t ObjectCopyRequest<I>::src_to_dst_object_offset(uint64_t objectno, + uint64_t offset) { + std::vector<std::pair<uint64_t, uint64_t>> image_extents; + Striper::extent_to_file(m_cct, &m_src_image_ctx->layout, objectno, offset, 1, + image_extents); + ceph_assert(image_extents.size() == 1); + auto dst_object_offset = image_extents.begin()->first; + + std::map<object_t, std::vector<ObjectExtent>> dst_object_extents; + Striper::file_to_extents(m_cct, m_dst_image_ctx->format_string, + &m_dst_image_ctx->layout, dst_object_offset, 1, 0, + dst_object_extents); + ceph_assert(dst_object_extents.size() == 1); + ceph_assert(dst_object_extents.begin()->second.size() == 1); + auto &e = *dst_object_extents.begin()->second.begin(); + ceph_assert(e.objectno == m_dst_object_number); + + return e.offset; +} + +template <typename I> +void ObjectCopyRequest<I>::compute_src_object_extents() { + std::vector<std::pair<uint64_t, uint64_t>> image_extents; + Striper::extent_to_file(m_cct, &m_dst_image_ctx->layout, m_dst_object_number, + 0, m_dst_image_ctx->layout.object_size, image_extents); + + size_t total = 0; + for (auto &e : image_extents) { + std::map<object_t, std::vector<ObjectExtent>> src_object_extents; + Striper::file_to_extents(m_cct, m_src_image_ctx->format_string, + &m_src_image_ctx->layout, e.first, e.second, 0, + src_object_extents); + auto stripe_unit = std::min(m_src_image_ctx->layout.stripe_unit, + m_dst_image_ctx->layout.stripe_unit); + for (auto &p : src_object_extents) { + for (auto &s : p.second) { + m_src_objects.insert(s.objectno); + total += s.length; + while (s.length > 0) { + ceph_assert(s.length >= stripe_unit); + auto dst_object_offset = src_to_dst_object_offset(s.objectno, s.offset); + m_src_object_extents[dst_object_offset] = {s.objectno, s.offset, + stripe_unit}; + s.offset += stripe_unit; + s.length -= stripe_unit; + } + } + } + } + + ceph_assert(total == 
m_dst_image_ctx->layout.object_size); + + ldout(m_cct, 20) << m_src_object_extents.size() << " src extents" << dendl; +} + +template <typename I> +void ObjectCopyRequest<I>::compute_read_ops() { + m_read_ops = {}; + m_read_snaps = {}; + m_zero_interval = {}; + + m_src_image_ctx->parent_lock.get_read(); + bool hide_parent = (m_src_image_ctx->parent != nullptr); + m_src_image_ctx->parent_lock.put_read(); + + librados::snap_t src_copy_point_snap_id = m_snap_map.rbegin()->first; + bool prev_exists = (hide_parent || m_src_snap_id_start > 0); + uint64_t prev_end_size = prev_exists ? + m_src_image_ctx->layout.object_size : 0; + librados::snap_t start_src_snap_id = m_src_snap_id_start; + + for (auto &pair : m_snap_map) { + ceph_assert(!pair.second.empty()); + librados::snap_t end_src_snap_id = pair.first; + librados::snap_t end_dst_snap_id = pair.second.front(); + + interval_set<uint64_t> diff; + uint64_t end_size; + bool exists; + librados::snap_t clone_end_snap_id; + calc_snap_set_diff(m_cct, m_snap_set, start_src_snap_id, + end_src_snap_id, &diff, &end_size, &exists, + &clone_end_snap_id, &m_read_whole_object); + + if (m_read_whole_object) { + ldout(m_cct, 1) << "need to read full object" << dendl; + diff.insert(0, m_src_image_ctx->layout.object_size); + exists = true; + end_size = m_src_image_ctx->layout.object_size; + clone_end_snap_id = end_src_snap_id; + } else if (!exists) { + end_size = 0; + if (hide_parent && end_src_snap_id == m_snap_map.begin()->first && + m_snap_set.clones.empty()) { + ldout(m_cct, 20) << "no clones for existing object" << dendl; + exists = true; + diff.insert(0, m_src_image_ctx->layout.object_size); + clone_end_snap_id = end_src_snap_id; + } + } + + ldout(m_cct, 20) << "start_src_snap_id=" << start_src_snap_id << ", " + << "end_src_snap_id=" << end_src_snap_id << ", " + << "clone_end_snap_id=" << clone_end_snap_id << ", " + << "end_dst_snap_id=" << end_dst_snap_id << ", " + << "diff=" << diff << ", " + << "end_size=" << end_size << ", " + << 
"exists=" << exists << dendl; + + m_zero_interval[end_src_snap_id] = {}; + + if (exists || prev_exists) { + // clip diff to size of object (in case it was truncated) + if (end_size < prev_end_size) { + interval_set<uint64_t> trunc; + trunc.insert(end_size, prev_end_size); + trunc.intersection_of(diff); + diff.subtract(trunc); + ldout(m_cct, 20) << "clearing truncate diff: " << trunc << dendl; + } + + if (exists) { + // reads should be issued against the newest (existing) snapshot within + // the associated snapshot object clone. writes should be issued + // against the oldest snapshot in the snap_map. + ceph_assert(clone_end_snap_id >= end_src_snap_id); + if (clone_end_snap_id > src_copy_point_snap_id) { + // do not read past the copy point snapshot + clone_end_snap_id = src_copy_point_snap_id; + } + } + + for (auto &it : m_src_object_extents) { + auto dst_object_offset = it.first; + auto &e = it.second; + + if (e.object_no != m_src_ono) { + continue; + } + + interval_set<uint64_t> read_interval; + read_interval.insert(e.offset, e.length); + + if (end_size < prev_end_size) { + interval_set<uint64_t> zero_interval; + zero_interval.insert(end_size, prev_end_size - end_size); + zero_interval.intersection_of(read_interval); + if (!zero_interval.empty()) { + auto it = zero_interval.begin(); + auto offset = it.get_start() - e.offset; + m_zero_interval[end_src_snap_id].insert(dst_object_offset + offset, + it.get_len()); + ldout(m_cct, 20) << "extent " << e.offset << "~" << e.length + << " intersects truncation " << end_size << "~" + << prev_end_size - end_size << ", inserting zero " + << dst_object_offset + offset << "~" + << it.get_len() << dendl; + } + } + + // limit read interval to diff + read_interval.intersection_of(diff); + + ldout(m_cct, 20) << "src_object_extent: " << e.offset << "~" << e.length + << ", dst_object_offset=" << dst_object_offset + << ", read: " << read_interval << dendl; + + ceph_assert(exists || read_interval.empty()); + + for (auto it = 
read_interval.begin(); it != read_interval.end(); + it++) { + ceph_assert(it.get_start() >= e.offset); + auto offset = it.get_start() - e.offset; + ldout(m_cct, 20) << "read/write op: " << it.get_start() << "~" + << it.get_len() << " dst: " + << dst_object_offset + offset << dendl; + m_read_ops[{end_src_snap_id, clone_end_snap_id}] + .emplace_back(COPY_OP_TYPE_WRITE, it.get_start(), + dst_object_offset + offset, it.get_len()); + } + } + } + + prev_end_size = end_size; + prev_exists = exists; + if (hide_parent && prev_exists && prev_end_size == 0) { + // hide parent + prev_end_size = m_src_image_ctx->layout.object_size; + } + start_src_snap_id = end_src_snap_id; + } + + for (auto &it : m_read_ops) { + m_read_snaps.push_back(it.first); + } +} + +template <typename I> +void ObjectCopyRequest<I>::compute_read_from_parent_ops( + io::Extents *parent_image_extents) { + assert(m_src_image_ctx->snap_lock.is_locked()); + assert(m_src_image_ctx->parent_lock.is_locked()); + + m_read_ops = {}; + m_zero_interval = {}; + parent_image_extents->clear(); + + if (m_src_image_ctx->parent == nullptr) { + ldout(m_cct, 20) << "no parent" << dendl; + return; + } + + size_t noent_count = 0; + for (auto &it : m_src_object_extents) { + if (it.second.noent) { + noent_count++; + } + } + + if (noent_count == 0) { + ldout(m_cct, 20) << "no extents need read from parent" << dendl; + return; + } + + if (noent_count == m_src_object_extents.size() && !m_flatten) { + ldout(m_cct, 20) << "reading all extents skipped when no flatten" + << dendl; + return; + } + + ldout(m_cct, 20) << dendl; + + auto src_snap_seq = m_snap_map.begin()->first; + + uint64_t parent_overlap; + int r = m_src_image_ctx->get_parent_overlap(src_snap_seq, &parent_overlap); + if (r < 0) { + ldout(m_cct, 5) << "failed getting parent overlap for snap_id: " + << src_snap_seq << ": " << cpp_strerror(r) << dendl; + return; + } + if (parent_overlap == 0) { + ldout(m_cct, 20) << "no parent overlap" << dendl; + return; + } + + for (auto 
&it : m_src_object_extents) { + auto dst_object_offset = it.first; + auto &e = it.second; + + if (!e.noent) { + continue; + } + + std::vector<std::pair<uint64_t, uint64_t>> image_extents; + Striper::extent_to_file(m_cct, &m_src_image_ctx->layout, e.object_no, + e.offset, e.length, image_extents); + + uint64_t overlap = m_src_image_ctx->prune_parent_extents(image_extents, + parent_overlap); + if (overlap == 0) { + ldout(m_cct, 20) << "no parent overlap for object_no " << e.object_no + << " extent " << e.offset << "~" << e.length << dendl; + continue; + } + + ldout(m_cct, 20) << "object_no " << e.object_no << " extent " << e.offset + << "~" << e.length << " overlap " << parent_overlap + << " parent extents " << image_extents << dendl; + + ceph_assert(image_extents.size() == 1); + + auto src_image_offset = image_extents.begin()->first; + auto length = image_extents.begin()->second; + m_read_ops[{src_snap_seq, 0}].emplace_back(COPY_OP_TYPE_WRITE, e.offset, + dst_object_offset, length); + m_read_ops[{src_snap_seq, 0}].rbegin()->src_extent_map[e.offset] = length; + parent_image_extents->emplace_back(src_image_offset, length); + } + + if (!parent_image_extents->empty()) { + m_dst_object_state[src_snap_seq] = OBJECT_EXISTS; + } +} + +template <typename I> +void ObjectCopyRequest<I>::merge_write_ops() { + ldout(m_cct, 20) << dendl; + + for (auto &it : m_zero_interval) { + m_dst_zero_interval[it.first].insert(it.second); + } + + for (auto &it : m_read_ops) { + auto src_snap_seq = it.first.first; + auto &copy_ops = it.second; + for (auto &copy_op : copy_ops) { + uint64_t src_offset = copy_op.src_offset; + uint64_t dst_offset = copy_op.dst_offset; + for (auto &e : copy_op.src_extent_map) { + uint64_t zero_len = e.first - src_offset; + if (zero_len > 0) { + ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq + << ", inserting zero " << dst_offset << "~" + << zero_len << dendl; + m_dst_zero_interval[src_snap_seq].insert(dst_offset, zero_len); + src_offset += zero_len; + dst_offset += 
zero_len; + } + copy_op.dst_extent_map[dst_offset] = e.second; + src_offset += e.second; + dst_offset += e.second; + } + if (dst_offset < copy_op.dst_offset + copy_op.length) { + uint64_t zero_len = copy_op.dst_offset + copy_op.length - dst_offset; + ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq + << ", inserting zero " << dst_offset << "~" + << zero_len << dendl; + m_dst_zero_interval[src_snap_seq].insert(dst_offset, zero_len); + } else { + ceph_assert(dst_offset == copy_op.dst_offset + copy_op.length); + } + m_write_ops[src_snap_seq].emplace_back(std::move(copy_op)); + } + } +} + +template <typename I> +void ObjectCopyRequest<I>::compute_zero_ops() { + ldout(m_cct, 20) << dendl; + + bool fast_diff = m_dst_image_ctx->test_features(RBD_FEATURE_FAST_DIFF); + uint64_t prev_end_size = 0; + + m_src_image_ctx->parent_lock.get_read(); + bool hide_parent = (m_src_image_ctx->parent != nullptr); + m_src_image_ctx->parent_lock.put_read(); + + for (auto &it : m_dst_zero_interval) { + auto src_snap_seq = it.first; + auto &zero_interval = it.second; + + auto snap_map_it = m_snap_map.find(src_snap_seq); + ceph_assert(snap_map_it != m_snap_map.end()); + auto dst_snap_seq = snap_map_it->second.front(); + + auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_seq); + ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end()); + if (!dst_may_exist_it->second && prev_end_size > 0) { + ldout(m_cct, 5) << "object DNE for snap_id: " << dst_snap_seq << dendl; + m_write_ops[src_snap_seq].emplace_back(COPY_OP_TYPE_REMOVE, 0, 0, 0); + prev_end_size = 0; + continue; + } + + if (hide_parent) { + RWLock::RLocker snap_locker(m_dst_image_ctx->snap_lock); + RWLock::RLocker parent_locker(m_dst_image_ctx->parent_lock); + uint64_t parent_overlap = 0; + int r = m_dst_image_ctx->get_parent_overlap(dst_snap_seq, &parent_overlap); + if (r < 0) { + ldout(m_cct, 5) << "failed getting parent overlap for snap_id: " + << dst_snap_seq << ": " << cpp_strerror(r) << dendl; + } + if 
(parent_overlap == 0) { + ldout(m_cct, 20) << "no parent overlap" << dendl; + hide_parent = false; + } else { + std::vector<std::pair<uint64_t, uint64_t>> image_extents; + Striper::extent_to_file(m_cct, &m_dst_image_ctx->layout, + m_dst_object_number, 0, + m_dst_image_ctx->layout.object_size, + image_extents); + uint64_t overlap = m_dst_image_ctx->prune_parent_extents(image_extents, + parent_overlap); + if (overlap == 0) { + ldout(m_cct, 20) << "no parent overlap" << dendl; + hide_parent = false; + } else if (src_snap_seq == m_dst_zero_interval.begin()->first) { + for (auto e : image_extents) { + prev_end_size += e.second; + } + ceph_assert(prev_end_size <= m_dst_image_ctx->layout.object_size); + } + } + } + + uint64_t end_size = prev_end_size; + + // update end_size if there are writes into higher offsets + auto iter = m_write_ops.find(src_snap_seq); + if (iter != m_write_ops.end()) { + for (auto &copy_op : iter->second) { + for (auto &e : copy_op.dst_extent_map) { + end_size = std::max(end_size, e.first + e.second); + } + } + } + + for (auto z = zero_interval.begin(); z != zero_interval.end(); z++) { + if (z.get_start() + z.get_len() >= end_size) { + // zero interval at the object end + if (z.get_start() == 0 && hide_parent) { + m_write_ops[src_snap_seq] + .emplace_back(COPY_OP_TYPE_REMOVE_TRUNC, 0, 0, 0); + ldout(m_cct, 20) << "COPY_OP_TYPE_REMOVE_TRUNC" << dendl; + } else if (z.get_start() < prev_end_size) { + if (z.get_start() == 0) { + m_write_ops[src_snap_seq] + .emplace_back(COPY_OP_TYPE_REMOVE, 0, 0, 0); + ldout(m_cct, 20) << "COPY_OP_TYPE_REMOVE" << dendl; + } else { + m_write_ops[src_snap_seq] + .emplace_back(COPY_OP_TYPE_TRUNC, 0, z.get_start(), 0); + ldout(m_cct, 20) << "COPY_OP_TYPE_TRUNC " << z.get_start() << dendl; + } + } + end_size = std::min(end_size, z.get_start()); + } else { + // zero interval inside the object + m_write_ops[src_snap_seq] + .emplace_back(COPY_OP_TYPE_ZERO, 0, z.get_start(), z.get_len()); + ldout(m_cct, 20) << "COPY_OP_TYPE_ZERO " 
<< z.get_start() << "~" + << z.get_len() << dendl; + } + } + ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", end_size=" + << end_size << dendl; + if (end_size > 0 || hide_parent) { + m_dst_object_state[src_snap_seq] = OBJECT_EXISTS; + if (fast_diff && end_size == prev_end_size && + m_write_ops[src_snap_seq].empty()) { + m_dst_object_state[src_snap_seq] = OBJECT_EXISTS_CLEAN; + } + } + prev_end_size = end_size; + } +} + +template <typename I> +void ObjectCopyRequest<I>::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + // ensure IoCtxs are closed prior to proceeding + auto on_finish = m_on_finish; + + m_src_async_op->finish_op(); + delete m_src_async_op; + delete this; + + on_finish->complete(r); +} + +template <typename I> +void ObjectCopyRequest<I>::compute_dst_object_may_exist() { + RWLock::RLocker snap_locker(m_dst_image_ctx->snap_lock); + + auto snap_ids = m_dst_image_ctx->snaps; + snap_ids.push_back(CEPH_NOSNAP); + + for (auto snap_id : snap_ids) { + m_dst_object_may_exist[snap_id] = + (m_dst_object_number < m_dst_image_ctx->get_object_count(snap_id)); + } +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::ObjectCopyRequest<librbd::ImageCtx>; diff --git a/src/librbd/deep_copy/ObjectCopyRequest.h b/src/librbd/deep_copy/ObjectCopyRequest.h new file mode 100644 index 00000000..a0445f3b --- /dev/null +++ b/src/librbd/deep_copy/ObjectCopyRequest.h @@ -0,0 +1,210 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H + +#include "include/int_types.h" +#include "include/interval_set.h" +#include "include/rados/librados.hpp" +#include "common/snap_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/deep_copy/Types.h" +#include "librbd/io/Types.h" +#include <list> +#include <map> +#include <string> + +class Context; +class RWLock; + +namespace librbd 
{ + +namespace io { class AsyncOperation; } + +namespace deep_copy { + +template <typename ImageCtxT = librbd::ImageCtx> +class ObjectCopyRequest { +public: + static ObjectCopyRequest* create(ImageCtxT *src_image_ctx, + ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t dst_snap_id_start, + const SnapMap &snap_map, + uint64_t object_number, bool flatten, + Context *on_finish) { + return new ObjectCopyRequest(src_image_ctx, dst_image_ctx, + src_snap_id_start, dst_snap_id_start, snap_map, + object_number, flatten, on_finish); + } + + ObjectCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t dst_snap_id_start, const SnapMap &snap_map, + uint64_t object_number, bool flatten, Context *on_finish); + + void send(); + + // testing support + inline librados::IoCtx &get_src_io_ctx() { + return m_src_io_ctx; + } + inline librados::IoCtx &get_dst_io_ctx() { + return m_dst_io_ctx; + } + +private: + /** + * @verbatim + * + * <start> + * | /----------------------\ + * | | | + * v v | (repeat for each src object) + * LIST_SNAPS < * * * | + * | * (-ENOENT and snap set stale) + * | * * * * * * | + * | * /-----------\ | + * | * | | (repeat for each snapshot) + * v * v | | + * READ_OBJECT ---------/ | + * | | | + * | \----------------------/ + * v + * READ_FROM_PARENT (skip if not needed) + * | + * | /-----------\ + * | | | (repeat for each snapshot) + * v v | + * WRITE_OBJECT --------/ + * | + * | /-----------\ + * | | | (repeat for each snapshot) + * v v | + * UPDATE_OBJECT_MAP ---/ (skip if object + * | map disabled) + * v + * <finish> + * + * @endverbatim + */ + + struct SrcObjectExtent { + uint64_t object_no = 0; + uint64_t offset = 0; + uint64_t length = 0; + bool noent = false; + + SrcObjectExtent() { + } + SrcObjectExtent(uint64_t object_no, uint64_t offset, uint64_t length) + : object_no(object_no), offset(offset), length(length) { + } + }; + + typedef std::map<uint64_t, 
SrcObjectExtent> SrcObjectExtents; + + enum CopyOpType { + COPY_OP_TYPE_WRITE, + COPY_OP_TYPE_ZERO, + COPY_OP_TYPE_TRUNC, + COPY_OP_TYPE_REMOVE, + COPY_OP_TYPE_REMOVE_TRUNC, + }; + + typedef std::map<uint64_t, uint64_t> ExtentMap; + + struct CopyOp { + CopyOp(CopyOpType type, uint64_t src_offset, uint64_t dst_offset, + uint64_t length) + : type(type), src_offset(src_offset), dst_offset(dst_offset), + length(length) { + } + + CopyOpType type; + uint64_t src_offset; + uint64_t dst_offset; + uint64_t length; + + ExtentMap src_extent_map; + ExtentMap dst_extent_map; + bufferlist out_bl; + }; + + typedef std::list<CopyOp> CopyOps; + typedef std::pair<librados::snap_t, librados::snap_t> WriteReadSnapIds; + typedef std::map<librados::snap_t, uint8_t> SnapObjectStates; + typedef std::map<librados::snap_t, std::map<uint64_t, uint64_t>> SnapObjectSizes; + + ImageCtxT *m_src_image_ctx; + ImageCtxT *m_dst_image_ctx; + CephContext *m_cct; + librados::snap_t m_src_snap_id_start; + librados::snap_t m_dst_snap_id_start; + SnapMap m_snap_map; + uint64_t m_dst_object_number; + bool m_flatten; + Context *m_on_finish; + + decltype(m_src_image_ctx->data_ctx) m_src_io_ctx; + decltype(m_dst_image_ctx->data_ctx) m_dst_io_ctx; + std::string m_dst_oid; + + std::set<uint64_t> m_src_objects; + uint64_t m_src_ono; + std::string m_src_oid; + SrcObjectExtents m_src_object_extents; + librados::snap_set_t m_snap_set; + int m_snap_ret = 0; + bool m_retry_missing_read = false; + librados::snap_set_t m_retry_snap_set; + bool m_read_whole_object = false; + + std::map<WriteReadSnapIds, CopyOps> m_read_ops; + std::list<WriteReadSnapIds> m_read_snaps; + std::map<librados::snap_t, CopyOps> m_write_ops; + std::map<librados::snap_t, interval_set<uint64_t>> m_zero_interval; + std::map<librados::snap_t, interval_set<uint64_t>> m_dst_zero_interval; + std::map<librados::snap_t, uint8_t> m_dst_object_state; + std::map<librados::snap_t, bool> m_dst_object_may_exist; + bufferlist m_read_from_parent_data; + + 
io::AsyncOperation* m_src_async_op = nullptr; + + void send_list_snaps(); + void handle_list_snaps(int r); + + void send_read_object(); + void handle_read_object(int r); + + void send_read_from_parent(); + void handle_read_from_parent(int r); + + void send_write_object(); + void handle_write_object(int r); + + void send_update_object_map(); + void handle_update_object_map(int r); + + Context *start_lock_op(RWLock &owner_lock, int* r); + + uint64_t src_to_dst_object_offset(uint64_t objectno, uint64_t offset); + + void compute_src_object_extents(); + void compute_read_ops(); + void compute_read_from_parent_ops(io::Extents *image_extents); + void merge_write_ops(); + void compute_zero_ops(); + + void compute_dst_object_may_exist(); + + void finish(int r); +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class librbd::deep_copy::ObjectCopyRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H diff --git a/src/librbd/deep_copy/SetHeadRequest.cc b/src/librbd/deep_copy/SetHeadRequest.cc new file mode 100644 index 00000000..5670ba07 --- /dev/null +++ b/src/librbd/deep_copy/SetHeadRequest.cc @@ -0,0 +1,224 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "SetHeadRequest.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/Utils.h" +#include "librbd/image/AttachParentRequest.h" +#include "librbd/image/DetachParentRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::SetHeadRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +SetHeadRequest<I>::SetHeadRequest(I *image_ctx, uint64_t size, + const cls::rbd::ParentImageSpec &spec, + 
uint64_t parent_overlap, + Context *on_finish) + : m_image_ctx(image_ctx), m_size(size), m_parent_spec(spec), + m_parent_overlap(parent_overlap), m_on_finish(on_finish), + m_cct(image_ctx->cct) { + ceph_assert(m_parent_overlap <= m_size); +} + +template <typename I> +void SetHeadRequest<I>::send() { + send_set_size(); +} + +template <typename I> +void SetHeadRequest<I>::send_set_size() { + m_image_ctx->snap_lock.get_read(); + if (m_image_ctx->size == m_size) { + m_image_ctx->snap_lock.put_read(); + send_detach_parent(); + return; + } + m_image_ctx->snap_lock.put_read(); + + ldout(m_cct, 20) << dendl; + + // Change the image size on disk so that the snapshot picks up + // the expected size. We can do this because the last snapshot + // we process is the sync snapshot which was created to match the + // image size. We also don't need to worry about trimming because + // we track the highest possible object number within the sync record + librados::ObjectWriteOperation op; + librbd::cls_client::set_size(&op, m_size); + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_set_size(r); + finish_op_ctx->complete(0); + }); + librados::AioCompletion *comp = create_rados_callback(ctx); + r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void SetHeadRequest<I>::handle_set_size(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to update image size: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + { + // adjust in-memory image size now that it's updated on disk + RWLock::WLocker snap_locker(m_image_ctx->snap_lock); + if (m_image_ctx->size > m_size) { + RWLock::WLocker parent_locker(m_image_ctx->parent_lock); + if 
(m_image_ctx->parent_md.spec.pool_id != -1 && + m_image_ctx->parent_md.overlap > m_size) { + m_image_ctx->parent_md.overlap = m_size; + } + } + m_image_ctx->size = m_size; + } + + send_detach_parent(); +} + +template <typename I> +void SetHeadRequest<I>::send_detach_parent() { + m_image_ctx->parent_lock.get_read(); + if (m_image_ctx->parent_md.spec.pool_id == -1 || + (m_image_ctx->parent_md.spec == m_parent_spec && + m_image_ctx->parent_md.overlap == m_parent_overlap)) { + m_image_ctx->parent_lock.put_read(); + send_attach_parent(); + return; + } + m_image_ctx->parent_lock.put_read(); + + ldout(m_cct, 20) << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_detach_parent(r); + finish_op_ctx->complete(0); + }); + auto req = image::DetachParentRequest<I>::create(*m_image_ctx, ctx); + req->send(); +} + +template <typename I> +void SetHeadRequest<I>::handle_detach_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to remove parent: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + { + // adjust in-memory parent now that it's updated on disk + RWLock::WLocker parent_locker(m_image_ctx->parent_lock); + m_image_ctx->parent_md.spec = {}; + m_image_ctx->parent_md.overlap = 0; + } + + send_attach_parent(); +} + +// ATTACH_PARENT step: finishes successfully right away when the in-memory +// parent spec and overlap already equal the requested ones; otherwise sends +// an AttachParentRequest whose completion is routed to handle_attach_parent(). +template <typename I> +void SetHeadRequest<I>::send_attach_parent() { + m_image_ctx->parent_lock.get_read(); + if (m_image_ctx->parent_md.spec == m_parent_spec && + m_image_ctx->parent_md.overlap == m_parent_overlap) { + m_image_ctx->parent_lock.put_read(); + finish(0); + return; + } + m_image_ctx->parent_lock.put_read(); + + ldout(m_cct, 20) << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } 
+ + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_attach_parent(r); + finish_op_ctx->complete(0); + }); + auto req = image::AttachParentRequest<I>::create( + *m_image_ctx, m_parent_spec, m_parent_overlap, false, ctx); + req->send(); +} + +template <typename I> +void SetHeadRequest<I>::handle_attach_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + { + // adjust in-memory parent now that it's updated on disk + RWLock::WLocker parent_locker(m_image_ctx->parent_lock); + m_image_ctx->parent_md.spec = m_parent_spec; + m_image_ctx->parent_md.overlap = m_parent_overlap; + } + + finish(0); +} + +// Returns a no-op context when the image has no exclusive lock object; +// otherwise forwards to exclusive_lock->start_op(r). owner_lock is read-held +// for the duration of the call. Callers treat a nullptr result as failure. +template <typename I> +Context *SetHeadRequest<I>::start_lock_op(int* r) { + RWLock::RLocker owner_locker(m_image_ctx->owner_lock); + if (m_image_ctx->exclusive_lock == nullptr) { + return new FunctionContext([](int r) {}); + } + return m_image_ctx->exclusive_lock->start_op(r); +} + +// Completes m_on_finish with r and then destroys this request; the object +// must not be used after this call. +template <typename I> +void SetHeadRequest<I>::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::SetHeadRequest<librbd::ImageCtx>; diff --git a/src/librbd/deep_copy/SetHeadRequest.h b/src/librbd/deep_copy/SetHeadRequest.h new file mode 100644 index 00000000..9a17c9fd --- /dev/null +++ b/src/librbd/deep_copy/SetHeadRequest.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "common/snap_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Types.h" +#include <map> +#include <set> +#include <string> +#include <tuple> + +class Context; + +namespace librbd { 
+namespace deep_copy { + +template <typename ImageCtxT = librbd::ImageCtx> +class SetHeadRequest { +public: + static SetHeadRequest* create(ImageCtxT *image_ctx, uint64_t size, + const cls::rbd::ParentImageSpec &parent_spec, + uint64_t parent_overlap, + Context *on_finish) { + return new SetHeadRequest(image_ctx, size, parent_spec, parent_overlap, + on_finish); + } + + SetHeadRequest(ImageCtxT *image_ctx, uint64_t size, + const cls::rbd::ParentImageSpec &parent_spec, + uint64_t parent_overlap, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v (skip if not needed) + * SET_SIZE + * | + * v (skip if not needed) + * DETACH_PARENT + * | + * v (skip if not needed) + * ATTACH_PARENT + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + uint64_t m_size; + cls::rbd::ParentImageSpec m_parent_spec; + uint64_t m_parent_overlap; + Context *m_on_finish; + + CephContext *m_cct; + + void send_set_size(); + void handle_set_size(int r); + + void send_detach_parent(); + void handle_detach_parent(int r); + + void send_attach_parent(); + void handle_attach_parent(int r); + + Context *start_lock_op(int* r); + + void finish(int r); +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class librbd::deep_copy::SetHeadRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H diff --git a/src/librbd/deep_copy/SnapshotCopyRequest.cc b/src/librbd/deep_copy/SnapshotCopyRequest.cc new file mode 100644 index 00000000..d1375a42 --- /dev/null +++ b/src/librbd/deep_copy/SnapshotCopyRequest.cc @@ -0,0 +1,730 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "SnapshotCopyRequest.h" +#include "SetHeadRequest.h" +#include "SnapshotCreateRequest.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ObjectMap.h" +#include "librbd/Operations.h" +#include 
"librbd/Utils.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::SnapshotCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +namespace { + +template <typename I> +const std::string &get_snapshot_name(I *image_ctx, librados::snap_t snap_id) { + auto snap_it = std::find_if(image_ctx->snap_ids.begin(), + image_ctx->snap_ids.end(), + [snap_id]( + const std::pair< + std::pair<cls::rbd::SnapshotNamespace, + std::string>, + librados::snap_t> &pair) { + return pair.second == snap_id; + }); + ceph_assert(snap_it != image_ctx->snap_ids.end()); + return snap_it->first.second; +} + +} // anonymous namespace + +using librbd::util::create_context_callback; +using librbd::util::unique_lock_name; + +// Copies the source and destination snap id lists into ordered sets (each +// under the owning image's snap_lock) and, unless src_snap_id_end is +// CEPH_NOSNAP, drops every source snap id greater than src_snap_id_end. +// The working copy m_snap_seqs starts out as a copy of *snap_seqs. +template <typename I> +SnapshotCopyRequest<I>::SnapshotCopyRequest(I *src_image_ctx, + I *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, ContextWQ *work_queue, + SnapSeqs *snap_seqs, + Context *on_finish) + : RefCountedObject(dst_image_ctx->cct, 1), m_src_image_ctx(src_image_ctx), + m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start), + m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start), + m_flatten(flatten), m_work_queue(work_queue), m_snap_seqs_result(snap_seqs), + m_snap_seqs(*snap_seqs), m_on_finish(on_finish), m_cct(dst_image_ctx->cct), + m_lock(unique_lock_name("SnapshotCopyRequest::m_lock", this)) { + // the start snap ids must be either both zero or both non-zero + ceph_assert((m_src_snap_id_start == 0 && m_dst_snap_id_start == 0) || + (m_src_snap_id_start > 0 && m_dst_snap_id_start > 0)); + + // snap ids ordered from oldest to newest + m_src_image_ctx->snap_lock.get_read(); + m_src_snap_ids.insert(src_image_ctx->snaps.begin(), + src_image_ctx->snaps.end()); + m_src_image_ctx->snap_lock.put_read(); + + m_dst_image_ctx->snap_lock.get_read(); + 
m_dst_snap_ids.insert(dst_image_ctx->snaps.begin(), + dst_image_ctx->snaps.end()); + m_dst_image_ctx->snap_lock.put_read(); + + if (m_src_snap_id_end != CEPH_NOSNAP) { + m_src_snap_ids.erase(m_src_snap_ids.upper_bound(m_src_snap_id_end), + m_src_snap_ids.end()); + } +} + +template <typename I> +void SnapshotCopyRequest<I>::send() { + cls::rbd::ParentImageSpec src_parent_spec; + int r = validate_parent(m_src_image_ctx, &src_parent_spec); + if (r < 0) { + lderr(m_cct) << "source image parent spec mismatch" << dendl; + error(r); + return; + } + + r = validate_parent(m_dst_image_ctx, &m_dst_parent_spec); + if (r < 0) { + lderr(m_cct) << "destination image parent spec mismatch" << dendl; + error(r); + return; + } + + send_snap_unprotect(); +} + +template <typename I> +void SnapshotCopyRequest<I>::cancel() { + Mutex::Locker locker(m_lock); + + ldout(m_cct, 20) << dendl; + m_canceled = true; +} + +template <typename I> +void SnapshotCopyRequest<I>::send_snap_unprotect() { + + SnapIdSet::iterator snap_id_it = m_dst_snap_ids.begin(); + if (m_prev_snap_id != CEPH_NOSNAP) { + snap_id_it = m_dst_snap_ids.upper_bound(m_prev_snap_id); + } else if (m_dst_snap_id_start > 0) { + snap_id_it = m_dst_snap_ids.upper_bound(m_dst_snap_id_start); + } + + for (; snap_id_it != m_dst_snap_ids.end(); ++snap_id_it) { + librados::snap_t dst_snap_id = *snap_id_it; + + m_dst_image_ctx->snap_lock.get_read(); + + bool dst_unprotected; + int r = m_dst_image_ctx->is_snap_unprotected(dst_snap_id, &dst_unprotected); + if (r < 0) { + lderr(m_cct) << "failed to retrieve destination snap unprotect status: " + << cpp_strerror(r) << dendl; + m_dst_image_ctx->snap_lock.put_read(); + finish(r); + return; + } + m_dst_image_ctx->snap_lock.put_read(); + + if (dst_unprotected) { + // snap is already unprotected -- check next snap + continue; + } + + // if destination snapshot is protected and (1) it isn't in our mapping + // table, or (2) the source snapshot isn't protected, unprotect it + // (m_snap_seqs is keyed by source snap id with the destination snap id + // as value, hence the search by value) + auto snap_seq_it = 
std::find_if( + m_snap_seqs.begin(), m_snap_seqs.end(), + [dst_snap_id](const SnapSeqs::value_type& pair) { + return pair.second == dst_snap_id; + }); + + if (snap_seq_it != m_snap_seqs.end()) { + m_src_image_ctx->snap_lock.get_read(); + bool src_unprotected; + r = m_src_image_ctx->is_snap_unprotected(snap_seq_it->first, + &src_unprotected); + ldout(m_cct, 20) << "m_src_image_ctx->is_snap_unprotected(" + << snap_seq_it->first << "): r=" << r + << ", src_unprotected=" << src_unprotected << dendl; + if (r == -ENOENT) { + src_unprotected = true; + r = 0; + } + if (r < 0) { + lderr(m_cct) << "failed to retrieve source snap unprotect status: " + << cpp_strerror(r) << dendl; + m_src_image_ctx->snap_lock.put_read(); + finish(r); + return; + } + m_src_image_ctx->snap_lock.put_read(); + + if (src_unprotected) { + // source is unprotected -- unprotect destination snap + break; + } + } else { + // source snapshot doesn't exist -- unprotect destination snap + break; + } + } + + if (snap_id_it == m_dst_snap_ids.end()) { + // no destination snapshots to unprotect + m_prev_snap_id = CEPH_NOSNAP; + send_snap_remove(); + return; + } + + m_prev_snap_id = *snap_id_it; + m_snap_name = get_snapshot_name(m_dst_image_ctx, m_prev_snap_id); + + ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_prev_snap_id << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_snap_unprotect(r); + finish_op_ctx->complete(0); + }); + RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock); + m_dst_image_ctx->operations->execute_snap_unprotect( + cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx); +} + +template <typename I> +void SnapshotCopyRequest<I>::handle_snap_unprotect(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to 
unprotect snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + { + // avoid the need to refresh to delete the newly unprotected snapshot + RWLock::RLocker snap_locker(m_dst_image_ctx->snap_lock); + auto snap_info_it = m_dst_image_ctx->snap_info.find(m_prev_snap_id); + if (snap_info_it != m_dst_image_ctx->snap_info.end()) { + snap_info_it->second.protection_status = + RBD_PROTECTION_STATUS_UNPROTECTED; + } + } + + if (handle_cancellation()) { + return; + } + + send_snap_unprotect(); +} + +template <typename I> +void SnapshotCopyRequest<I>::send_snap_remove() { + SnapIdSet::iterator snap_id_it = m_dst_snap_ids.begin(); + if (m_prev_snap_id != CEPH_NOSNAP) { + snap_id_it = m_dst_snap_ids.upper_bound(m_prev_snap_id); + } else if (m_dst_snap_id_start > 0) { + snap_id_it = m_dst_snap_ids.upper_bound(m_dst_snap_id_start); + } + + for (; snap_id_it != m_dst_snap_ids.end(); ++snap_id_it) { + librados::snap_t dst_snap_id = *snap_id_it; + + cls::rbd::SnapshotNamespace snap_namespace; + m_dst_image_ctx->snap_lock.get_read(); + int r = m_dst_image_ctx->get_snap_namespace(dst_snap_id, &snap_namespace); + m_dst_image_ctx->snap_lock.put_read(); + if (r < 0) { + lderr(m_cct) << "failed to retrieve destination snap namespace: " + << m_snap_name << dendl; + finish(r); + return; + } + + if (boost::get<cls::rbd::UserSnapshotNamespace>(&snap_namespace) == + nullptr) { + continue; + } + + // if the destination snapshot isn't in our mapping table, remove it + auto snap_seq_it = std::find_if( + m_snap_seqs.begin(), m_snap_seqs.end(), + [dst_snap_id](const SnapSeqs::value_type& pair) { + return pair.second == dst_snap_id; + }); + + if (snap_seq_it == m_snap_seqs.end()) { + break; + } + } + + if (snap_id_it == m_dst_snap_ids.end()) { + // no destination snapshots to delete + m_prev_snap_id = CEPH_NOSNAP; + send_snap_create(); + return; + } + + m_prev_snap_id = *snap_id_it; + m_snap_name = get_snapshot_name(m_dst_image_ctx, m_prev_snap_id); + + 
ldout(m_cct, 20) << "" + << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_prev_snap_id << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_snap_remove(r); + finish_op_ctx->complete(0); + }); + RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock); + m_dst_image_ctx->operations->execute_snap_remove( + cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx); +} + +template <typename I> +void SnapshotCopyRequest<I>::handle_snap_remove(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to remove snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + if (handle_cancellation()) { + return; + } + + send_snap_remove(); +} + +template <typename I> +void SnapshotCopyRequest<I>::send_snap_create() { + SnapIdSet::iterator snap_id_it = m_src_snap_ids.begin(); + if (m_prev_snap_id != CEPH_NOSNAP) { + snap_id_it = m_src_snap_ids.upper_bound(m_prev_snap_id); + } else if (m_src_snap_id_start > 0) { + snap_id_it = m_src_snap_ids.upper_bound(m_src_snap_id_start); + } + + for (; snap_id_it != m_src_snap_ids.end(); ++snap_id_it) { + librados::snap_t src_snap_id = *snap_id_it; + + cls::rbd::SnapshotNamespace snap_namespace; + m_src_image_ctx->snap_lock.get_read(); + int r = m_src_image_ctx->get_snap_namespace(src_snap_id, &snap_namespace); + m_src_image_ctx->snap_lock.put_read(); + if (r < 0) { + lderr(m_cct) << "failed to retrieve source snap namespace: " + << m_snap_name << dendl; + finish(r); + return; + } + + if (m_snap_seqs.find(src_snap_id) == m_snap_seqs.end()) { + // the source snapshot is not in our mapping table, ... + if (boost::get<cls::rbd::UserSnapshotNamespace>(&snap_namespace) != + nullptr) { + // ... 
create it since it's a user snapshot + break; + } else if (src_snap_id == m_src_snap_id_end) { + // ... map it to destination HEAD since it's not a user snapshot that we + // will create (e.g. MirrorPrimarySnapshotNamespace) + m_snap_seqs[src_snap_id] = CEPH_NOSNAP; + } + } + } + + if (snap_id_it == m_src_snap_ids.end()) { + // no source snapshots to create + m_prev_snap_id = CEPH_NOSNAP; + send_snap_protect(); + return; + } + + m_prev_snap_id = *snap_id_it; + m_snap_name = get_snapshot_name(m_src_image_ctx, m_prev_snap_id); + + m_src_image_ctx->snap_lock.get_read(); + auto snap_info_it = m_src_image_ctx->snap_info.find(m_prev_snap_id); + if (snap_info_it == m_src_image_ctx->snap_info.end()) { + m_src_image_ctx->snap_lock.put_read(); + lderr(m_cct) << "failed to retrieve source snap info: " << m_snap_name + << dendl; + finish(-ENOENT); + return; + } + + // copy what is needed out of snap_info while snap_lock is still read-held + uint64_t size = snap_info_it->second.size; + m_snap_namespace = snap_info_it->second.snap_namespace; + cls::rbd::ParentImageSpec parent_spec; + uint64_t parent_overlap = 0; + // unless flattening, a source snapshot that has a parent is created with + // the destination's parent spec and the source snapshot's overlap + if (!m_flatten && snap_info_it->second.parent.spec.pool_id != -1) { + parent_spec = m_dst_parent_spec; + parent_overlap = snap_info_it->second.parent.overlap; + } + m_src_image_ctx->snap_lock.put_read(); + + ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_prev_snap_id << ", " + << "size=" << size << ", " + << "parent_info=[" + << "pool_id=" << parent_spec.pool_id << ", " + << "image_id=" << parent_spec.image_id << ", " + << "snap_id=" << parent_spec.snap_id << ", " + << "overlap=" << parent_overlap << "]" << dendl; + + int r; + Context *finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_snap_create(r); + finish_op_ctx->complete(0); + }); + SnapshotCreateRequest<I> *req = SnapshotCreateRequest<I>::create( + m_dst_image_ctx, m_snap_name, 
m_snap_namespace, size, parent_spec, + parent_overlap, ctx); + req->send(); +} + +template <typename I> +void SnapshotCopyRequest<I>::handle_snap_create(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + if (handle_cancellation()) { + return; + } + + ceph_assert(m_prev_snap_id != CEPH_NOSNAP); + + auto snap_it = m_dst_image_ctx->snap_ids.find( + {cls::rbd::UserSnapshotNamespace(), m_snap_name}); + ceph_assert(snap_it != m_dst_image_ctx->snap_ids.end()); + librados::snap_t dst_snap_id = snap_it->second; + + ldout(m_cct, 20) << "mapping source snap id " << m_prev_snap_id << " to " + << dst_snap_id << dendl; + m_snap_seqs[m_prev_snap_id] = dst_snap_id; + + send_snap_create(); +} + +template <typename I> +void SnapshotCopyRequest<I>::send_snap_protect() { + SnapIdSet::iterator snap_id_it = m_src_snap_ids.begin(); + if (m_prev_snap_id != CEPH_NOSNAP) { + snap_id_it = m_src_snap_ids.upper_bound(m_prev_snap_id); + } else if (m_src_snap_id_start > 0) { + snap_id_it = m_src_snap_ids.upper_bound(m_src_snap_id_start); + } + + for (; snap_id_it != m_src_snap_ids.end(); ++snap_id_it) { + librados::snap_t src_snap_id = *snap_id_it; + + m_src_image_ctx->snap_lock.get_read(); + + bool src_protected; + int r = m_src_image_ctx->is_snap_protected(src_snap_id, &src_protected); + if (r < 0) { + lderr(m_cct) << "failed to retrieve source snap protect status: " + << cpp_strerror(r) << dendl; + m_src_image_ctx->snap_lock.put_read(); + finish(r); + return; + } + m_src_image_ctx->snap_lock.put_read(); + + if (!src_protected) { + // snap is not protected -- check next snap + continue; + } + + // if destination snapshot is not protected, protect it + // (a protected source snapshot without a mapping entry trips the + // assertion below) + auto snap_seq_it = m_snap_seqs.find(src_snap_id); + ceph_assert(snap_seq_it != m_snap_seqs.end()); + if (snap_seq_it->second == CEPH_NOSNAP) { + // implies src end snapshot is mapped to a 
non-copyable snapshot + ceph_assert(src_snap_id == m_src_snap_id_end); + break; + } + + m_dst_image_ctx->snap_lock.get_read(); + bool dst_protected; + r = m_dst_image_ctx->is_snap_protected(snap_seq_it->second, &dst_protected); + if (r < 0) { + lderr(m_cct) << "failed to retrieve destination snap protect status: " + << cpp_strerror(r) << dendl; + m_dst_image_ctx->snap_lock.put_read(); + finish(r); + return; + } + m_dst_image_ctx->snap_lock.put_read(); + + if (!dst_protected) { + break; + } + } + + if (snap_id_it == m_src_snap_ids.end()) { + // no destination snapshots to protect + m_prev_snap_id = CEPH_NOSNAP; + send_set_head(); + return; + } + + m_prev_snap_id = *snap_id_it; + m_snap_name = get_snapshot_name(m_src_image_ctx, m_prev_snap_id); + + ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_prev_snap_id << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_snap_protect(r); + finish_op_ctx->complete(0); + }); + RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock); + m_dst_image_ctx->operations->execute_snap_protect( + cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx); +} + +// Completion of one PROTECT_SNAP iteration: fail on error, stop if canceled, +// otherwise look for the next snapshot to protect. +template <typename I> +void SnapshotCopyRequest<I>::handle_snap_protect(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to protect snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + if (handle_cancellation()) { + return; + } + + send_snap_protect(); +} + +// SET_HEAD step: finishes successfully right away unless the end snapshot is +// CEPH_NOSNAP or is mapped to CEPH_NOSNAP in m_snap_seqs. +template <typename I> +void SnapshotCopyRequest<I>::send_set_head() { + auto snap_seq_it = m_snap_seqs.find(m_src_snap_id_end); + if (m_src_snap_id_end != CEPH_NOSNAP && + (snap_seq_it == m_snap_seqs.end() || + snap_seq_it->second != CEPH_NOSNAP)) { + // not copying to src nor dst HEAD revision + finish(0); + return; 
+ } + + ldout(m_cct, 20) << dendl; + + uint64_t size; + cls::rbd::ParentImageSpec parent_spec; + uint64_t parent_overlap = 0; + { + RWLock::RLocker src_locker(m_src_image_ctx->snap_lock); + auto snap_info_it = m_src_image_ctx->snap_info.find(m_src_snap_id_end); + if (snap_info_it != m_src_image_ctx->snap_info.end()) { + auto& snap_info = snap_info_it->second; + size = snap_info.size; + if (!m_flatten) { + parent_spec = snap_info.parent.spec; + parent_overlap = snap_info.parent.overlap; + } + } else { + size = m_src_image_ctx->size; + if (!m_flatten) { + parent_spec = m_src_image_ctx->parent_md.spec; + parent_overlap = m_src_image_ctx->parent_md.overlap; + } + } + } + + auto ctx = create_context_callback< + SnapshotCopyRequest<I>, &SnapshotCopyRequest<I>::handle_set_head>(this); + auto req = SetHeadRequest<I>::create(m_dst_image_ctx, size, parent_spec, + parent_overlap, ctx); + req->send(); +} + +template <typename I> /* completion of SET_HEAD: fail on error, stop if canceled, else go on to RESIZE_OBJECT_MAP */ +void SnapshotCopyRequest<I>::handle_set_head(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to set head: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (handle_cancellation()) { + return; + } + + send_resize_object_map(); +} + +template <typename I> /* resizes the destination object map only when the object-map feature is on, an object map is loaded and its size differs from the image's object count; otherwise finishes with r */ +void SnapshotCopyRequest<I>::send_resize_object_map() { + int r = 0; + + if (m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) { + RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock); + RWLock::RLocker snap_locker(m_dst_image_ctx->snap_lock); + + if (m_dst_image_ctx->object_map != nullptr && + Striper::get_num_objects(m_dst_image_ctx->layout, + m_dst_image_ctx->size) != + m_dst_image_ctx->object_map->size()) { + + ldout(m_cct, 20) << dendl; + + auto finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r); + if (finish_op_ctx != nullptr) { + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_resize_object_map(r); + finish_op_ctx->complete(0); + }); + + 
m_dst_image_ctx->object_map->aio_resize(m_dst_image_ctx->size, + OBJECT_NONEXISTENT, ctx); + return; + } + + lderr(m_cct) << "lost exclusive lock" << dendl; + } + } + + finish(r); +} + +template <typename I> +void SnapshotCopyRequest<I>::handle_resize_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to resize object map: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +bool SnapshotCopyRequest<I>::handle_cancellation() { + { + Mutex::Locker locker(m_lock); + if (!m_canceled) { + return false; + } + } + ldout(m_cct, 10) << "snapshot copy canceled" << dendl; + finish(-ECANCELED); + return true; +} + +template <typename I> +void SnapshotCopyRequest<I>::error(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_work_queue->queue(new FunctionContext([this, r](int r1) { finish(r); })); +} + +template <typename I> +int SnapshotCopyRequest<I>::validate_parent(I *image_ctx, + cls::rbd::ParentImageSpec *spec) { + RWLock::RLocker owner_locker(image_ctx->owner_lock); + RWLock::RLocker snap_locker(image_ctx->snap_lock); + + // ensure source image's parent specs are still consistent + // (*spec starts as the head's parent spec; the first snapshot with a + // parent seeds it if the head has none, and any differing snapshot parent + // spec yields -EINVAL) + *spec = image_ctx->parent_md.spec; + for (auto &snap_info_pair : image_ctx->snap_info) { + auto &parent_spec = snap_info_pair.second.parent.spec; + if (parent_spec.pool_id == -1) { + continue; + } else if (spec->pool_id == -1) { + *spec = parent_spec; + continue; + } + + if (*spec != parent_spec) { + return -EINVAL; + } + } + return 0; +} + +// Convenience overload: takes the destination owner_lock for read and +// forwards to the overload that expects it to be held already. +template <typename I> +Context *SnapshotCopyRequest<I>::start_lock_op(int* r) { + RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock); + return start_lock_op(m_dst_image_ctx->owner_lock, r); +} + +// Requires the destination owner_lock to be held (asserted). The owner_lock +// argument itself is not used in the body. Returns a no-op context when the +// image has no exclusive lock object, otherwise exclusive_lock->start_op(r). +template <typename I> +Context *SnapshotCopyRequest<I>::start_lock_op(RWLock &owner_lock, int* r) { + ceph_assert(m_dst_image_ctx->owner_lock.is_locked()); + if (m_dst_image_ctx->exclusive_lock == nullptr) { + return new FunctionContext([](int r) 
{}); + } + return m_dst_image_ctx->exclusive_lock->start_op(r); +} + +template <typename I> +void SnapshotCopyRequest<I>::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == 0) { + *m_snap_seqs_result = m_snap_seqs; + } + + m_on_finish->complete(r); + put(); +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::SnapshotCopyRequest<librbd::ImageCtx>; diff --git a/src/librbd/deep_copy/SnapshotCopyRequest.h b/src/librbd/deep_copy/SnapshotCopyRequest.h new file mode 100644 index 00000000..12068c9a --- /dev/null +++ b/src/librbd/deep_copy/SnapshotCopyRequest.h @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "common/RefCountedObj.h" +#include "common/snap_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Types.h" +#include <map> +#include <set> +#include <string> +#include <tuple> + +class Context; + +namespace librbd { +namespace deep_copy { + +// Reference-counted asynchronous request that brings the destination image's +// snapshots in line with the source image's: it unprotects and removes +// destination snapshots as needed, creates and protects snapshots found on +// the source, then updates the destination head and object map. On success +// the resulting source-to-destination snap id mapping is written back +// through the snap_seqs pointer. +template <typename ImageCtxT = librbd::ImageCtx> +class SnapshotCopyRequest : public RefCountedObject { +public: + static SnapshotCopyRequest* create(ImageCtxT *src_image_ctx, + ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, ContextWQ *work_queue, + SnapSeqs *snap_seqs, Context *on_finish) { + return new SnapshotCopyRequest(src_image_ctx, dst_image_ctx, + src_snap_id_start, src_snap_id_end, + dst_snap_id_start, flatten, work_queue, + snap_seqs, on_finish); + } + + SnapshotCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, ContextWQ *work_queue, SnapSeqs *snap_seqs, 
+ Context *on_finish); + + void send(); + void cancel(); + +private: + /** + * @verbatim + * + * <start> + * | + * | /-----------\ + * | | | + * v v | (repeat as needed) + * UNPROTECT_SNAP ----/ + * | + * | /-----------\ + * | | | + * v v | (repeat as needed) + * REMOVE_SNAP -------/ + * | + * | /-----------\ + * | | | + * v v | (repeat as needed) + * CREATE_SNAP -------/ + * | + * | /-----------\ + * | | | + * v v | (repeat as needed) + * PROTECT_SNAP ------/ + * | + * v + * SET_HEAD (skip if not needed) + * | + * v + * RESIZE_OBJECT_MAP (skip if not needed) + * | + * v + * <finish> + * + * @endverbatim + */ + + typedef std::set<librados::snap_t> SnapIdSet; + + ImageCtxT *m_src_image_ctx; + ImageCtxT *m_dst_image_ctx; + librados::snap_t m_src_snap_id_start; + librados::snap_t m_src_snap_id_end; + librados::snap_t m_dst_snap_id_start; + bool m_flatten; + ContextWQ *m_work_queue; + SnapSeqs *m_snap_seqs_result; + SnapSeqs m_snap_seqs; + Context *m_on_finish; + + CephContext *m_cct; + SnapIdSet m_src_snap_ids; + SnapIdSet m_dst_snap_ids; + librados::snap_t m_prev_snap_id = CEPH_NOSNAP; + + std::string m_snap_name; + cls::rbd::SnapshotNamespace m_snap_namespace; + + cls::rbd::ParentImageSpec m_dst_parent_spec; + + Mutex m_lock; + bool m_canceled = false; + + void send_snap_unprotect(); + void handle_snap_unprotect(int r); + + void send_snap_remove(); + void handle_snap_remove(int r); + + void send_snap_create(); + void handle_snap_create(int r); + + void send_snap_protect(); + void handle_snap_protect(int r); + + void send_set_head(); + void handle_set_head(int r); + + void send_resize_object_map(); + void handle_resize_object_map(int r); + + bool handle_cancellation(); + + void error(int r); + + int validate_parent(ImageCtxT *image_ctx, cls::rbd::ParentImageSpec *spec); + + Context *start_lock_op(int* r); + Context *start_lock_op(RWLock &owner_locki, int* r); + + void finish(int r); +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class 
librbd::deep_copy::SnapshotCopyRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H diff --git a/src/librbd/deep_copy/SnapshotCreateRequest.cc b/src/librbd/deep_copy/SnapshotCreateRequest.cc new file mode 100644 index 00000000..0ff9c832 --- /dev/null +++ b/src/librbd/deep_copy/SnapshotCreateRequest.cc @@ -0,0 +1,187 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "SetHeadRequest.h" +#include "SnapshotCreateRequest.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ObjectMap.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::SnapshotCreateRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +// Stores the destination image, the snapshot's name and namespace, the size +// and parent spec/overlap the snapshot should have, and the completion +// context. +template <typename I> +SnapshotCreateRequest<I>::SnapshotCreateRequest( + I *dst_image_ctx, const std::string &snap_name, + const cls::rbd::SnapshotNamespace &snap_namespace, + uint64_t size, const cls::rbd::ParentImageSpec &spec, + uint64_t parent_overlap, Context *on_finish) + : m_dst_image_ctx(dst_image_ctx), m_snap_name(snap_name), + m_snap_namespace(snap_namespace), m_size(size), + m_parent_spec(spec), m_parent_overlap(parent_overlap), + m_on_finish(on_finish), m_cct(dst_image_ctx->cct) { +} + +// Entry point: starts with the SET_HEAD step. +template <typename I> +void SnapshotCreateRequest<I>::send() { + send_set_head(); +} + +// Runs a SetHeadRequest on the destination image with the stored size and +// parent spec/overlap; its completion is routed to handle_set_head(). +template <typename I> +void SnapshotCreateRequest<I>::send_set_head() { + ldout(m_cct, 20) << dendl; + + auto ctx = create_context_callback< + SnapshotCreateRequest<I>, &SnapshotCreateRequest<I>::handle_set_head>(this); + auto req = SetHeadRequest<I>::create(m_dst_image_ctx, m_size, m_parent_spec, + m_parent_overlap, 
ctx); + req->send(); +} + +template <typename I> +void SnapshotCreateRequest<I>::handle_set_head(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to set head: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_create_snap(); +} + +template <typename I> /* creates the snapshot on the destination image via operations->execute_snap_create(), after obtaining a context from start_lock_op() */ +void SnapshotCreateRequest<I>::send_create_snap() { + ldout(m_cct, 20) << "snap_name=" << m_snap_name << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_create_snap(r); + finish_op_ctx->complete(0); + }); + RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock); + m_dst_image_ctx->operations->execute_snap_create(m_snap_namespace, + m_snap_name.c_str(), + ctx, + 0U, true); +} + +template <typename I> +void SnapshotCreateRequest<I>::handle_create_snap(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_create_object_map(); +} +template <typename I> /* finishes successfully right away when the object-map feature is disabled; fails with -ENOENT when the new snapshot is not found under the user namespace in snap_ids */ +void SnapshotCreateRequest<I>::send_create_object_map() { + + if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) { + finish(0); + return; + } + + m_dst_image_ctx->snap_lock.get_read(); + auto snap_it = m_dst_image_ctx->snap_ids.find( + {cls::rbd::UserSnapshotNamespace(), m_snap_name}); + if (snap_it == m_dst_image_ctx->snap_ids.end()) { + lderr(m_cct) << "failed to locate snap: " << m_snap_name << dendl; + m_dst_image_ctx->snap_lock.put_read(); + finish(-ENOENT); + return; + } + librados::snap_t local_snap_id = snap_it->second; + m_dst_image_ctx->snap_lock.put_read(); + + std::string object_map_oid(librbd::ObjectMap<>::object_map_name( + m_dst_image_ctx->id, local_snap_id)); + uint64_t object_count = Striper::get_num_objects(m_dst_image_ctx->layout, + 
m_size); + ldout(m_cct, 20) << "object_map_oid=" << object_map_oid << ", " + << "object_count=" << object_count << dendl; + + // initialize an empty object map of the correct size (object sync + // will populate the object map) + librados::ObjectWriteOperation op; + librbd::cls_client::object_map_resize(&op, object_count, OBJECT_NONEXISTENT); + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_create_object_map(r); + finish_op_ctx->complete(0); + }); + librados::AioCompletion *comp = create_rados_callback(ctx); + r = m_dst_image_ctx->md_ctx.aio_operate(object_map_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void SnapshotCreateRequest<I>::handle_create_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create object map: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +Context *SnapshotCreateRequest<I>::start_lock_op(int* r) { + RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock); + if (m_dst_image_ctx->exclusive_lock == nullptr) { + return new FunctionContext([](int r) {}); + } + return m_dst_image_ctx->exclusive_lock->start_op(r); +} + +template <typename I> +void SnapshotCreateRequest<I>::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::SnapshotCreateRequest<librbd::ImageCtx>; diff --git a/src/librbd/deep_copy/SnapshotCreateRequest.h b/src/librbd/deep_copy/SnapshotCreateRequest.h new file mode 100644 index 00000000..8c228063 --- /dev/null +++ b/src/librbd/deep_copy/SnapshotCreateRequest.h @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// 
/**
 * Asynchronous request that sets the head of the destination image, creates
 * a snapshot on it and, when the object-map feature is enabled, creates an
 * empty object map for that snapshot.
 *
 * Instances are heap allocated (see create()) and delete themselves after
 * completing on_finish.
 */
template <typename ImageCtxT = librbd::ImageCtx>
class SnapshotCreateRequest {
public:
  /// Allocate a new request; it is started with send().
  static SnapshotCreateRequest* create(ImageCtxT *dst_image_ctx,
                                       const std::string &snap_name,
                                       const cls::rbd::SnapshotNamespace &snap_namespace,
                                       uint64_t size,
                                       const cls::rbd::ParentImageSpec &parent_spec,
                                       uint64_t parent_overlap,
                                       Context *on_finish) {
    return new SnapshotCreateRequest(dst_image_ctx, snap_name, snap_namespace, size,
                                     parent_spec, parent_overlap, on_finish);
  }

  SnapshotCreateRequest(ImageCtxT *dst_image_ctx,
                        const std::string &snap_name,
                        const cls::rbd::SnapshotNamespace &snap_namespace,
                        uint64_t size,
                        const cls::rbd::ParentImageSpec &parent_spec,
                        uint64_t parent_overlap, Context *on_finish);

  /// Kick off the state machine shown below.
  void send();

private:
  /**
   * @verbatim
   *
   * <start>
   *    |
   *    v
   * SET_HEAD
   *    |
   *    v
   * CREATE_SNAP
   *    |
   *    v (skip if not needed)
   * CREATE_OBJECT_MAP
   *    |
   *    v
   * <finish>
   *
   * @endverbatim
   */

  ImageCtxT *m_dst_image_ctx;                    // destination image
  std::string m_snap_name;                       // snapshot to create
  cls::rbd::SnapshotNamespace m_snap_namespace;  // namespace used on create
  uint64_t m_size;                               // head size / object map size
  cls::rbd::ParentImageSpec m_parent_spec;       // forwarded to SET_HEAD
  uint64_t m_parent_overlap;                     // forwarded to SET_HEAD
  Context *m_on_finish;                          // completed in finish()

  CephContext *m_cct;                            // logging context

  void send_set_head();
  void handle_set_head(int r);

  void send_create_snap();
  void handle_create_snap(int r);

  void send_create_object_map();
  void handle_create_object_map(int r);

  // returns nullptr (with *r set by the lock) when the op cannot be started
  Context *start_lock_op(int* r);

  void finish(int r);
};
librbd::deep_copy::SnapshotCreateRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H diff --git a/src/librbd/deep_copy/Types.h b/src/librbd/deep_copy/Types.h new file mode 100644 index 00000000..10d3c7c1 --- /dev/null +++ b/src/librbd/deep_copy/Types.h @@ -0,0 +1,22 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_TYPES_H +#define CEPH_LIBRBD_DEEP_COPY_TYPES_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include <boost/optional.hpp> + +namespace librbd { +namespace deep_copy { + +typedef std::vector<librados::snap_t> SnapIds; +typedef std::map<librados::snap_t, SnapIds> SnapMap; + +typedef boost::optional<uint64_t> ObjectNumber; + +} // namespace deep_copy +} // namespace librbd + +#endif // CEPH_LIBRBD_DEEP_COPY_TYPES_H diff --git a/src/librbd/deep_copy/Utils.cc b/src/librbd/deep_copy/Utils.cc new file mode 100644 index 00000000..c2dd2502 --- /dev/null +++ b/src/librbd/deep_copy/Utils.cc @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "Utils.h" +#include <set> + +namespace librbd { +namespace deep_copy { +namespace util { + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::util::" << __func__ << ": " + +void compute_snap_map(CephContext* cct, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + const SnapIds& dst_snap_ids, + const SnapSeqs &snap_seqs, + SnapMap *snap_map) { + std::set<librados::snap_t> ordered_dst_snap_ids{ + dst_snap_ids.begin(), dst_snap_ids.end()}; + auto dst_snap_id_it = ordered_dst_snap_ids.begin(); + + SnapIds snap_ids; + for (auto &it : snap_seqs) { + // ensure all dst snap ids are included in the mapping table since + // deep copy will skip non-user snapshots + while (dst_snap_id_it != 
ordered_dst_snap_ids.end()) { + if (*dst_snap_id_it < it.second) { + snap_ids.insert(snap_ids.begin(), *dst_snap_id_it); + } else if (*dst_snap_id_it > it.second) { + break; + } + ++dst_snap_id_it; + } + + // we should only have the HEAD revision in the the last snap seq + ceph_assert(snap_ids.empty() || snap_ids[0] != CEPH_NOSNAP); + snap_ids.insert(snap_ids.begin(), it.second); + + if (it.first < src_snap_id_start) { + continue; + } else if (it.first > src_snap_id_end) { + break; + } + + (*snap_map)[it.first] = snap_ids; + } + + ldout(cct, 10) << "src_snap_id_start=" << src_snap_id_start << ", " + << "src_snap_id_end=" << src_snap_id_end << ", " + << "dst_snap_ids=" << dst_snap_ids << ", " + << "snap_seqs=" << snap_seqs << ", " + << "snap_map=" << *snap_map << dendl; +} + +} // namespace util +} // namespace deep_copy +} // namespace librbd diff --git a/src/librbd/deep_copy/Utils.h b/src/librbd/deep_copy/Utils.h new file mode 100644 index 00000000..0681548d --- /dev/null +++ b/src/librbd/deep_copy/Utils.h @@ -0,0 +1,30 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_UTILS_H +#define CEPH_LIBRBD_DEEP_COPY_UTILS_H + +#include "include/rados/librados.hpp" +#include "librbd/Types.h" +#include "librbd/deep_copy/Types.h" + +#include <boost/optional.hpp> + +struct CephContext; + +namespace librbd { +namespace deep_copy { +namespace util { + +void compute_snap_map(CephContext* cct, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + const SnapIds& dst_snap_ids, + const SnapSeqs &snap_seqs, + SnapMap *snap_map); + +} // namespace util +} // namespace deep_copy +} // namespace librbd + +#endif // CEPH_LIBRBD_DEEP_COPY_UTILS_H diff --git a/src/librbd/exclusive_lock/AutomaticPolicy.cc b/src/librbd/exclusive_lock/AutomaticPolicy.cc new file mode 100644 index 00000000..4d5f48b1 --- /dev/null +++ b/src/librbd/exclusive_lock/AutomaticPolicy.cc @@ -0,0 +1,29 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/exclusive_lock/AutomaticPolicy.h" +#include "librbd/ImageCtx.h" +#include "librbd/ExclusiveLock.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ExclusiveLock::AutomaticPolicy " + +namespace librbd { +namespace exclusive_lock { + +int AutomaticPolicy::lock_requested(bool force) { + ceph_assert(m_image_ctx->owner_lock.is_locked()); + ceph_assert(m_image_ctx->exclusive_lock != nullptr); + + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << ": force=" << force + << dendl; + + // release the lock upon request (ignore forced requests) + m_image_ctx->exclusive_lock->release_lock(nullptr); + return 0; +} + +} // namespace exclusive_lock +} // namespace librbd + diff --git a/src/librbd/exclusive_lock/AutomaticPolicy.h b/src/librbd/exclusive_lock/AutomaticPolicy.h new file mode 100644 index 00000000..12ba9b6c --- /dev/null +++ b/src/librbd/exclusive_lock/AutomaticPolicy.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H + +#include "librbd/exclusive_lock/Policy.h" + +namespace librbd { + +struct ImageCtx; + +namespace exclusive_lock { + +class AutomaticPolicy : public Policy { +public: + AutomaticPolicy(ImageCtx *image_ctx) : m_image_ctx(image_ctx) { + } + + bool may_auto_request_lock() override { + return true; + } + + int lock_requested(bool force) override; + +private: + ImageCtx *m_image_ctx; + +}; + +} // namespace exclusive_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H diff --git a/src/librbd/exclusive_lock/Policy.h b/src/librbd/exclusive_lock/Policy.h new file mode 100644 index 00000000..8dcbf444 --- /dev/null +++ b/src/librbd/exclusive_lock/Policy.h @@ -0,0 +1,30 @@ +// -*- 
namespace librbd {
namespace exclusive_lock {

/// Kind of operation passed to Policy::accept_blocked_request().
enum OperationRequestType {
  OPERATION_REQUEST_TYPE_GENERAL = 0,
  OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE = 1,
};

/**
 * Interface for exclusive-lock policies.
 *
 * Implementations decide whether the lock may be requested automatically
 * and how to react when the lock is requested.
 */
struct Policy {
  virtual ~Policy() = default;

  /// Whether the lock may be requested automatically.
  virtual bool may_auto_request_lock() = 0;

  /// React to a lock request; `force` flags a forced request.
  virtual int lock_requested(bool force) = 0;

  /// Default: no blocked request is accepted, whatever its type.
  virtual bool accept_blocked_request(OperationRequestType /*request_type*/) {
    return false;
  }
};

} // namespace exclusive_lock
} // namespace librbd
// Factory: allocate a request; it deletes itself in finish().
template <typename I>
PostAcquireRequest<I>* PostAcquireRequest<I>::create(I &image_ctx,
                                                     Context *on_acquire,
                                                     Context *on_finish) {
  return new PostAcquireRequest(image_ctx, on_acquire, on_finish);
}

// on_acquire is completed once the lock is known to be owned (see
// send_open_journal); on_finish is wrapped in an async callback and
// completed with the saved error result.
template <typename I>
PostAcquireRequest<I>::PostAcquireRequest(I &image_ctx, Context *on_acquire,
                                          Context *on_finish)
  : m_image_ctx(image_ctx),
    m_on_acquire(on_acquire),
    m_on_finish(create_async_context_callback(image_ctx, on_finish)),
    m_object_map(nullptr), m_journal(nullptr), m_error_result(0) {
}

// If apply() never ran, the image state is still told that lock preparation
// has completed.  An un-fired on_acquire context is deleted, not completed.
template <typename I>
PostAcquireRequest<I>::~PostAcquireRequest() {
  if (!m_prepare_lock_completed) {
    m_image_ctx.state->handle_prepare_lock_complete();
  }
  delete m_on_acquire;
}

// Entry point of the state machine.
template <typename I>
void PostAcquireRequest<I>::send() {
  send_refresh();
}

// REFRESH: refresh the image if the image state reports it is required,
// otherwise skip straight to OPEN_OBJECT_MAP.
template <typename I>
void PostAcquireRequest<I>::send_refresh() {
  if (!m_image_ctx.state->is_refresh_required()) {
    send_open_object_map();
    return;
  }

  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PostAcquireRequest<I>;
  Context *ctx = create_async_context_callback(
    m_image_ctx, create_context_callback<klass, &klass::handle_refresh>(this));

  // ImageState is blocked waiting for lock to complete -- safe to directly
  // refresh
  image::RefreshRequest<I> *req = image::RefreshRequest<I>::create(
    m_image_ctx, true, false, ctx);
  req->send();
}

// -ERESTART is tolerated; any other error aborts the request.
template <typename I>
void PostAcquireRequest<I>::handle_refresh(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  if (r == -ERESTART) {
    // next issued IO or op will (re)-refresh the image and shut down lock
    ldout(cct, 5) << "exclusive lock dynamically disabled" << dendl;
    r = 0;
  } else if (r < 0) {
    lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl;
    save_result(r);
    revert();
    finish();
    return;
  }

  send_open_object_map();
}

// OPEN_JOURNAL: notify the caller of lock ownership, then open the journal
// when journaling is enabled by both the feature bit and the journal policy.
template <typename I>
void PostAcquireRequest<I>::send_open_journal() {
  // alert caller that we now own the exclusive lock
  m_on_acquire->complete(0);
  m_on_acquire = nullptr;

  bool journal_enabled;
  {
    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
    journal_enabled = (m_image_ctx.test_features(RBD_FEATURE_JOURNALING,
                                                 m_image_ctx.snap_lock) &&
                       !m_image_ctx.get_journal_policy()->journal_disabled());
  }
  if (!journal_enabled) {
    // nothing else to open: publish the object map (if any) and finish
    apply();
    finish();
    return;
  }

  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PostAcquireRequest<I>;
  Context *ctx = create_context_callback<klass, &klass::handle_open_journal>(
    this);
  m_journal = m_image_ctx.create_journal();

  // journal playback requires object map (if enabled) and itself
  apply();

  m_journal->open(ctx);
}

// On failure the journal is closed again, which leads to revert().
template <typename I>
void PostAcquireRequest<I>::handle_open_journal(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  save_result(r);
  if (r < 0) {
    lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl;
    send_close_journal();
    return;
  }

  send_allocate_journal_tag();
}

// ALLOCATE_JOURNAL_TAG: delegated to the journal policy while holding the
// snap lock for read.
template <typename I>
void PostAcquireRequest<I>::send_allocate_journal_tag() {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
  using klass = PostAcquireRequest<I>;
  Context *ctx = create_context_callback<
    klass, &klass::handle_allocate_journal_tag>(this);
  m_image_ctx.get_journal_policy()->allocate_tag_on_lock(ctx);
}

// Success ends the request; failure unwinds through CLOSE_JOURNAL.
template <typename I>
void PostAcquireRequest<I>::handle_allocate_journal_tag(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  save_result(r);
  if (r < 0) {
    lderr(cct) << "failed to allocate journal tag: " << cpp_strerror(r)
               << dendl;
    send_close_journal();
    return;
  }

  finish();
}

// CLOSE_JOURNAL: error path only.
template <typename I>
void PostAcquireRequest<I>::send_close_journal() {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PostAcquireRequest<I>;
  Context *ctx = create_context_callback<klass, &klass::handle_close_journal>(
    this);
  m_journal->close(ctx);
}

// A close failure is recorded (if no earlier error was) and logged; the
// unwind continues regardless.
template <typename I>
void PostAcquireRequest<I>::handle_close_journal(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  save_result(r);
  if (r < 0) {
    lderr(cct) << "failed to close journal: " << cpp_strerror(r) << dendl;
  }

  send_close_object_map();
}

// OPEN_OBJECT_MAP: skipped when the object-map feature is disabled.
template <typename I>
void PostAcquireRequest<I>::send_open_object_map() {
  if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
    send_open_journal();
    return;
  }

  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PostAcquireRequest<I>;
  Context *ctx = create_context_callback<klass, &klass::handle_open_object_map>(
    this);

  m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP);
  m_object_map->open(ctx);
}

// Any failure discards the object map.  -EFBIG is not treated as fatal: the
// request continues without an object map.  Other errors abort the request.
template <typename I>
void PostAcquireRequest<I>::handle_open_object_map(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  if (r < 0) {
    lderr(cct) << "failed to open object map: " << cpp_strerror(r) << dendl;
    delete m_object_map;
    m_object_map = nullptr;

    if (r != -EFBIG) {
      save_result(r);
      revert();
      finish();
      return;
    }
  }

  send_open_journal();
}

// CLOSE_OBJECT_MAP: error path only; nothing to close when no map was opened.
template <typename I>
void PostAcquireRequest<I>::send_close_object_map() {
  if (m_object_map == nullptr) {
    revert();
    finish();
    return;
  }

  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PostAcquireRequest<I>;
  Context *ctx = create_context_callback<
    klass, &klass::handle_close_object_map>(this);
  m_object_map->close(ctx);
}

// The close result is only logged; it is not saved as the request result.
template <typename I>
void PostAcquireRequest<I>::handle_close_object_map(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  if (r < 0) {
    lderr(cct) << "failed to close object map: " << cpp_strerror(r) << dendl;
  }

  revert();
  finish();
}

// Publish the object map and journal pointers on the image context (both
// must currently be unset there) and signal that lock preparation is done.
template <typename I>
void PostAcquireRequest<I>::apply() {
  {
    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
    ceph_assert(m_image_ctx.object_map == nullptr);
    m_image_ctx.object_map = m_object_map;

    ceph_assert(m_image_ctx.journal == nullptr);
    m_image_ctx.journal = m_journal;
  }

  m_prepare_lock_completed = true;
  m_image_ctx.state->handle_prepare_lock_complete();
}

// Undo apply(): clear the image context pointers and destroy the objects
// this request created.  Only valid after an error has been saved.
template <typename I>
void PostAcquireRequest<I>::revert() {
  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
  m_image_ctx.object_map = nullptr;
  m_image_ctx.journal = nullptr;

  delete m_object_map;
  delete m_journal;

  ceph_assert(m_error_result < 0);
}

// Complete on_finish with the first saved error (or 0) and self-destruct.
template <typename I>
void PostAcquireRequest<I>::finish() {
  m_on_finish->complete(m_error_result);
  delete this;
}
+ * | * + * | * * * * * * * * + * v * + * ALLOCATE_JOURNAL_TAG * + * | * * + * | * * + * | v v + * | CLOSE_JOURNAL + * | | + * | v + * | CLOSE_OBJECT_MAP + * | | + * v | + * <finish> <----------/ + * + * @endverbatim + */ + + PostAcquireRequest(ImageCtxT &image_ctx, Context *on_acquire, + Context *on_finish); + + ImageCtxT &m_image_ctx; + Context *m_on_acquire; + Context *m_on_finish; + + decltype(m_image_ctx.object_map) m_object_map; + decltype(m_image_ctx.journal) m_journal; + + bool m_prepare_lock_completed = false; + int m_error_result; + + void send_refresh(); + void handle_refresh(int r); + + void send_open_journal(); + void handle_open_journal(int r); + + void send_allocate_journal_tag(); + void handle_allocate_journal_tag(int r); + + void send_open_object_map(); + void handle_open_object_map(int r); + + void send_close_journal(); + void handle_close_journal(int r); + + void send_close_object_map(); + void handle_close_object_map(int r); + + void apply(); + void revert(); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace exclusive_lock +} // namespace librbd + +extern template class librbd::exclusive_lock::PostAcquireRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H diff --git a/src/librbd/exclusive_lock/PreAcquireRequest.cc b/src/librbd/exclusive_lock/PreAcquireRequest.cc new file mode 100644 index 00000000..ba3da1a2 --- /dev/null +++ b/src/librbd/exclusive_lock/PreAcquireRequest.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/exclusive_lock/PreAcquireRequest.h" +#include "librbd/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ImageState.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << 
"librbd::exclusive_lock::PreAcquireRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace exclusive_lock { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +PreAcquireRequest<I>* PreAcquireRequest<I>::create(I &image_ctx, + Context *on_finish) { + return new PreAcquireRequest(image_ctx, on_finish); +} + +template <typename I> +PreAcquireRequest<I>::PreAcquireRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), + m_on_finish(create_async_context_callback(image_ctx, on_finish)), + m_error_result(0) { +} + +template <typename I> +PreAcquireRequest<I>::~PreAcquireRequest() { +} + +template <typename I> +void PreAcquireRequest<I>::send() { + send_prepare_lock(); +} + +template <typename I> +void PreAcquireRequest<I>::send_prepare_lock() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + // acquire the lock if the image is not busy performing other actions + Context *ctx = create_context_callback< + PreAcquireRequest<I>, &PreAcquireRequest<I>::handle_prepare_lock>(this); + m_image_ctx.state->prepare_lock(ctx); +} + +template <typename I> +void PreAcquireRequest<I>::handle_prepare_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + send_flush_notifies(); +} + +template <typename I> +void PreAcquireRequest<I>::send_flush_notifies() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PreAcquireRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_flush_notifies>( + this); + m_image_ctx.image_watcher->flush(ctx); +} + +template <typename I> +void PreAcquireRequest<I>::handle_flush_notifies(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + ceph_assert(r == 0); + finish(); +} + +template <typename I> +void PreAcquireRequest<I>::finish() { + m_on_finish->complete(m_error_result); + delete this; 
+} + +} // namespace exclusive_lock +} // namespace librbd + +template class librbd::exclusive_lock::PreAcquireRequest<librbd::ImageCtx>; diff --git a/src/librbd/exclusive_lock/PreAcquireRequest.h b/src/librbd/exclusive_lock/PreAcquireRequest.h new file mode 100644 index 00000000..15d4b2c1 --- /dev/null +++ b/src/librbd/exclusive_lock/PreAcquireRequest.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "librbd/ImageCtx.h" +#include "msg/msg_types.h" +#include <string> + +class Context; + +namespace librbd { + +namespace exclusive_lock { + +template <typename ImageCtxT = ImageCtx> +class PreAcquireRequest { +public: + static PreAcquireRequest* create(ImageCtxT &image_ctx, Context *on_finish); + + ~PreAcquireRequest(); + void send(); + +private: + + /** + * @verbatim + * + * <start> + * | + * v + * PREPARE_LOCK + * | + * v + * FLUSH_NOTIFIES + * | + * | + * | + v + * <finish> + * + * @endverbatim + */ + + PreAcquireRequest(ImageCtxT &image_ctx, Context *on_finish); + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + int m_error_result; + + void send_prepare_lock(); + void handle_prepare_lock(int r); + + void send_flush_notifies(); + void handle_flush_notifies(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace exclusive_lock +} // namespace librbd + +extern template class librbd::exclusive_lock::PreAcquireRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_ACQUIRE_REQUEST_H diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.cc b/src/librbd/exclusive_lock/PreReleaseRequest.cc new file mode 100644 index 00000000..7dbae6c5 --- /dev/null +++ 
namespace librbd {
namespace exclusive_lock {

using util::create_async_context_callback;
using util::create_context_callback;

// Factory: allocate a request; it deletes itself in finish().
template <typename I>
PreReleaseRequest<I>* PreReleaseRequest<I>::create(
    I &image_ctx, bool shutting_down, AsyncOpTracker &async_op_tracker,
    Context *on_finish) {
  return new PreReleaseRequest(image_ctx, shutting_down, async_op_tracker,
                               on_finish);
}

// on_finish is wrapped in an async callback before being stored.
template <typename I>
PreReleaseRequest<I>::PreReleaseRequest(I &image_ctx, bool shutting_down,
                                        AsyncOpTracker &async_op_tracker,
                                        Context *on_finish)
  : m_image_ctx(image_ctx), m_shutting_down(shutting_down),
    m_async_op_tracker(async_op_tracker),
    m_on_finish(create_async_context_callback(image_ctx, on_finish)) {
}

// PREPARE_LOCK is skipped when shutting down (see send_prepare_lock), so the
// matching completion notification is skipped in that case as well.
template <typename I>
PreReleaseRequest<I>::~PreReleaseRequest() {
  if (!m_shutting_down) {
    m_image_ctx.state->handle_prepare_lock_complete();
  }
}

// Entry point of the state machine.
template <typename I>
void PreReleaseRequest<I>::send() {
  send_prepare_lock();
}

// PREPARE_LOCK: not performed when shutting down.
template <typename I>
void PreReleaseRequest<I>::send_prepare_lock() {
  if (m_shutting_down) {
    send_cancel_op_requests();
    return;
  }

  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  // release the lock if the image is not busy performing other actions
  Context *ctx = create_context_callback<
    PreReleaseRequest<I>, &PreReleaseRequest<I>::handle_prepare_lock>(this);
  m_image_ctx.state->prepare_lock(ctx);
}

// The result is only logged; the request always proceeds.
template <typename I>
void PreReleaseRequest<I>::handle_prepare_lock(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  send_cancel_op_requests();
}

// CANCEL_OP_REQUESTS: cancel the image's outstanding async requests.
template <typename I>
void PreReleaseRequest<I>::send_cancel_op_requests() {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PreReleaseRequest<I>;
  Context *ctx = create_context_callback<
    klass, &klass::handle_cancel_op_requests>(this);
  m_image_ctx.cancel_async_requests(ctx);
}

// Cancellation is required to succeed.
template <typename I>
void PreReleaseRequest<I>::handle_cancel_op_requests(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  ceph_assert(r == 0);

  send_block_writes();
}

// BLOCK_WRITES: mark the lock as required for IO and block writes.  Reads
// also require the lock when copy-on-read or journaling is enabled.
template <typename I>
void PreReleaseRequest<I>::send_block_writes() {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PreReleaseRequest<I>;
  Context *ctx = create_context_callback<
    klass, &klass::handle_block_writes>(this);

  {
    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
    // setting the lock as required will automatically cause the IO
    // queue to re-request the lock if any IO is queued
    if (m_image_ctx.clone_copy_on_read ||
        m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
      m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_BOTH, true);
    } else {
      m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_WRITE, true);
    }
    m_image_ctx.io_work_queue->block_writes(ctx);
  }
}

// -EBLACKLISTED is logged but does not stop the release; any other error
// unblocks writes again and aborts the request.
template <typename I>
void PreReleaseRequest<I>::handle_block_writes(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  if (r == -EBLACKLISTED) {
    // allow clean shut down if blacklisted
    lderr(cct) << "failed to block writes because client is blacklisted"
               << dendl;
  } else if (r < 0) {
    lderr(cct) << "failed to block writes: " << cpp_strerror(r) << dendl;
    m_image_ctx.io_work_queue->unblock_writes();
    save_result(r);
    finish();
    return;
  }

  send_wait_for_ops();
}

// WAIT_FOR_OPS: wait on the op tracker supplied at construction.
template <typename I>
void PreReleaseRequest<I>::send_wait_for_ops() {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  Context *ctx = create_context_callback<
    PreReleaseRequest<I>, &PreReleaseRequest<I>::handle_wait_for_ops>(this);
  m_async_op_tracker.wait_for_ops(ctx);
}

// The result is ignored; the request always proceeds.
template <typename I>
void PreReleaseRequest<I>::handle_wait_for_ops(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  send_invalidate_cache();
}

// INVALIDATE_CACHE: issued through the object dispatcher while holding the
// owner lock for read.
template <typename I>
void PreReleaseRequest<I>::send_invalidate_cache() {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
  Context *ctx = create_context_callback<
      PreReleaseRequest<I>,
      &PreReleaseRequest<I>::handle_invalidate_cache>(this);
  m_image_ctx.io_object_dispatcher->invalidate_cache(ctx);
}

// -EBLACKLISTED and -EBUSY are tolerated; any other error unblocks writes
// again and aborts the request.
template <typename I>
void PreReleaseRequest<I>::handle_invalidate_cache(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  if (r < 0 && r != -EBLACKLISTED && r != -EBUSY) {
    lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r)
               << dendl;
    m_image_ctx.io_work_queue->unblock_writes();
    save_result(r);
    finish();
    return;
  }

  send_flush_notifies();
}

// FLUSH_NOTIFIES: flush the image watcher.
template <typename I>
void PreReleaseRequest<I>::send_flush_notifies() {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PreReleaseRequest<I>;
  Context *ctx =
    create_context_callback<klass, &klass::handle_flush_notifies>(this);
  m_image_ctx.image_watcher->flush(ctx);
}

// The flush is required to succeed.
template <typename I>
void PreReleaseRequest<I>::handle_flush_notifies(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  ceph_assert(r == 0);
  send_close_journal();
}

// CLOSE_JOURNAL: take the journal pointer away from the image context
// (m_journal starts out null, so the swap leaves the image without one).
template <typename I>
void PreReleaseRequest<I>::send_close_journal() {
  {
    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
    std::swap(m_journal, m_image_ctx.journal);
  }

  if (m_journal == nullptr) {
    send_close_object_map();
    return;
  }

  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PreReleaseRequest<I>;
  Context *ctx = create_context_callback<klass, &klass::handle_close_journal>(
    this);
  m_journal->close(ctx);
}

// A close failure is logged only; the journal is destroyed either way.
template <typename I>
void PreReleaseRequest<I>::handle_close_journal(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  if (r < 0) {
    // error implies some journal events were not flushed -- continue
    lderr(cct) << "failed to close journal: " << cpp_strerror(r) << dendl;
  }

  delete m_journal;

  send_close_object_map();
}

// CLOSE_OBJECT_MAP: same ownership hand-over as for the journal.
template <typename I>
void PreReleaseRequest<I>::send_close_object_map() {
  {
    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
    std::swap(m_object_map, m_image_ctx.object_map);
  }

  if (m_object_map == nullptr) {
    send_unlock();
    return;
  }

  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  using klass = PreReleaseRequest<I>;
  Context *ctx = create_context_callback<
    klass, &klass::handle_close_object_map>(this);
  m_object_map->close(ctx);
}

// A close failure is logged only; the object map is destroyed either way.
template <typename I>
void PreReleaseRequest<I>::handle_close_object_map(int r) {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << "r=" << r << dendl;

  if (r < 0) {
    lderr(cct) << "failed to close object map: " << cpp_strerror(r) << dendl;
  }

  delete m_object_map;
  send_unlock();
}

// Logs and finishes; no unlock call is issued from this function.
template <typename I>
void PreReleaseRequest<I>::send_unlock() {
  CephContext *cct = m_image_ctx.cct;
  ldout(cct, 10) << dendl;

  finish();
}

// Complete on_finish with the first saved error (or 0) and self-destruct.
template <typename I>
void PreReleaseRequest<I>::finish() {
  m_on_finish->complete(m_error_result);
  delete this;
}

} // namespace exclusive_lock
} // namespace librbd
librbd::exclusive_lock::PreReleaseRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.h b/src/librbd/exclusive_lock/PreReleaseRequest.h new file mode 100644 index 00000000..e5b85a88 --- /dev/null +++ b/src/librbd/exclusive_lock/PreReleaseRequest.h @@ -0,0 +1,117 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H
#define CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H

#include "librbd/ImageCtx.h"
#include <string>

class AsyncOpTracker;
class Context;

namespace librbd {

struct ImageCtx;

namespace exclusive_lock {

// Asynchronous request that walks the states in the diagram below.
// The request deletes itself in finish(), after completing on_finish with
// the first error recorded through save_result() (0 when none was recorded).
template <typename ImageCtxT = ImageCtx>
class PreReleaseRequest {
public:
  // Factory for a new request; call send() on the result to start it.
  static PreReleaseRequest* create(ImageCtxT &image_ctx, bool shutting_down,
                                   AsyncOpTracker &async_op_tracker,
                                   Context *on_finish);

  ~PreReleaseRequest();

  // Starts the state machine.
  void send();

private:
  /**
   * @verbatim
   *
   * <start>
   *    |
   *    v
   * PREPARE_LOCK
   *    |
   *    v
   * CANCEL_OP_REQUESTS
   *    |
   *    v
   * BLOCK_WRITES
   *    |
   *    v
   * WAIT_FOR_OPS
   *    |
   *    v
   * INVALIDATE_CACHE
   *    |
   *    v
   * FLUSH_NOTIFIES . . . . . . . . . . . . . .
   *    |                                     .
   *    v                                     .
   * CLOSE_JOURNAL                            .
   *    |                (journal disabled,   .
   *    v                 object map enabled) .
   * CLOSE_OBJECT_MAP < . . . . . . . . . . . .
   *    |                                     .
   *    v               (object map disabled) .
   * <finish> < . . . . . . . . . . . . . . . . .
*
   * @endverbatim
   */

  // Private: instances are obtained through create().
  PreReleaseRequest(ImageCtxT &image_ctx, bool shutting_down,
                    AsyncOpTracker &async_op_tracker, Context *on_finish);

  ImageCtxT &m_image_ctx;
  bool m_shutting_down;
  AsyncOpTracker &m_async_op_tracker;
  Context *m_on_finish;

  // first negative result passed to save_result(); 0 means no error so far
  int m_error_result = 0;

  // hold the image's object map / journal pointers once they have been
  // swapped out of the image context by the CLOSE_* states
  decltype(m_image_ctx.object_map) m_object_map = nullptr;
  decltype(m_image_ctx.journal) m_journal = nullptr;

  // One send_*/handle_* pair per state in the diagram above.
  void send_prepare_lock();
  void handle_prepare_lock(int r);

  void send_cancel_op_requests();
  void handle_cancel_op_requests(int r);

  void send_block_writes();
  void handle_block_writes(int r);

  void send_wait_for_ops();
  void handle_wait_for_ops(int r);

  void send_invalidate_cache();
  void handle_invalidate_cache(int r);

  void send_flush_notifies();
  void handle_flush_notifies(int r);

  void send_close_journal();
  void handle_close_journal(int r);

  void send_close_object_map();
  void handle_close_object_map(int r);

  void send_unlock();

  void finish();

  // Remembers the first negative result only; later errors and
  // non-negative values are ignored.
  void save_result(int result) {
    if (m_error_result == 0 && result < 0) {
      m_error_result = result;
    }
  }

};

} // namespace exclusive_lock
} // namespace librbd

#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H
diff --git a/src/librbd/exclusive_lock/StandardPolicy.cc b/src/librbd/exclusive_lock/StandardPolicy.cc new file mode 100644 index 00000000..6bdb313b --- /dev/null +++ b/src/librbd/exclusive_lock/StandardPolicy.cc @@ -0,0 +1,27 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/exclusive_lock/StandardPolicy.h"
#include "librbd/ImageCtx.h"
#include "librbd/ExclusiveLock.h"

#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
#define dout_prefix *_dout << "librbd::ExclusiveLock::StandardPolicy "

namespace librbd {
namespace exclusive_lock {

// Called when the lock is requested; the caller must hold owner_lock and
// the image must have an exclusive lock object (both asserted).
int StandardPolicy::lock_requested(bool force) {
  ceph_assert(m_image_ctx->owner_lock.is_locked());
ceph_assert(m_image_ctx->exclusive_lock != nullptr); + + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << ": force=" << force + << dendl; + + return -EROFS; +} + +} // namespace exclusive_lock +} // namespace librbd + diff --git a/src/librbd/exclusive_lock/StandardPolicy.h b/src/librbd/exclusive_lock/StandardPolicy.h new file mode 100644 index 00000000..c756db4f --- /dev/null +++ b/src/librbd/exclusive_lock/StandardPolicy.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H + +#include "librbd/exclusive_lock/Policy.h" + +namespace librbd { + +struct ImageCtx; + +namespace exclusive_lock { + +class StandardPolicy : public Policy { +public: + StandardPolicy(ImageCtx *image_ctx) : m_image_ctx(image_ctx) { + } + + bool may_auto_request_lock() override { + return false; + } + + int lock_requested(bool force) override; + +private: + ImageCtx *m_image_ctx; + +}; + +} // namespace exclusive_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H diff --git a/src/librbd/image/AttachChildRequest.cc b/src/librbd/image/AttachChildRequest.cc new file mode 100644 index 00000000..f3631bf3 --- /dev/null +++ b/src/librbd/image/AttachChildRequest.cc @@ -0,0 +1,261 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/AttachChildRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/image/RefreshRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::AttachChildRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using 
util::create_rados_callback; + +template <typename I> +AttachChildRequest<I>::AttachChildRequest( + I *image_ctx, I *parent_image_ctx, const librados::snap_t &parent_snap_id, + I *old_parent_image_ctx, const librados::snap_t &old_parent_snap_id, + uint32_t clone_format, Context* on_finish) + : m_image_ctx(image_ctx), m_parent_image_ctx(parent_image_ctx), + m_parent_snap_id(parent_snap_id), + m_old_parent_image_ctx(old_parent_image_ctx), + m_old_parent_snap_id(old_parent_snap_id), m_clone_format(clone_format), + m_on_finish(on_finish), m_cct(m_image_ctx->cct) { +} + +template <typename I> +void AttachChildRequest<I>::send() { + if (m_clone_format == 1) { + v1_add_child(); + } else { + v2_set_op_feature(); + } +} + +template <typename I> +void AttachChildRequest<I>::v1_add_child() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::add_child(&op, {m_parent_image_ctx->md_ctx.get_id(), "", + m_parent_image_ctx->id, + m_parent_snap_id}, m_image_ctx->id); + + using klass = AttachChildRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_v1_add_child>(this); + int r = m_image_ctx->md_ctx.aio_operate(RBD_CHILDREN, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void AttachChildRequest<I>::handle_v1_add_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + if (r == -EEXIST && m_old_parent_image_ctx != nullptr) { + ldout(m_cct, 5) << "child already exists" << dendl; + } else { + lderr(m_cct) << "couldn't add child: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + } + + v1_refresh(); +} + +template <typename I> +void AttachChildRequest<I>::v1_refresh() { + ldout(m_cct, 15) << dendl; + + using klass = AttachChildRequest<I>; + RefreshRequest<I> *req = RefreshRequest<I>::create( + *m_parent_image_ctx, false, false, + create_context_callback<klass, &klass::handle_v1_refresh>(this)); + req->send(); +} + +template <typename I> +void 
AttachChildRequest<I>::handle_v1_refresh(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + bool snap_protected = false; + if (r == 0) { + RWLock::RLocker snap_locker(m_parent_image_ctx->snap_lock); + r = m_parent_image_ctx->is_snap_protected(m_parent_snap_id, + &snap_protected); + } + + if (r < 0 || !snap_protected) { + lderr(m_cct) << "validate protected failed" << dendl; + finish(-EINVAL); + return; + } + + v1_remove_child_from_old_parent(); +} + +template <typename I> +void AttachChildRequest<I>::v1_remove_child_from_old_parent() { + if (m_old_parent_image_ctx == nullptr) { + finish(0); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::remove_child(&op, {m_old_parent_image_ctx->md_ctx.get_id(), + m_old_parent_image_ctx->md_ctx.get_namespace(), + m_old_parent_image_ctx->id, + m_old_parent_snap_id}, m_image_ctx->id); + + using klass = AttachChildRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v1_remove_child_from_old_parent>(this); + int r = m_image_ctx->md_ctx.aio_operate(RBD_CHILDREN, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void AttachChildRequest<I>::handle_v1_remove_child_from_old_parent(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "couldn't remove child: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void AttachChildRequest<I>::v2_set_op_feature() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::op_features_set(&op, RBD_OPERATION_FEATURE_CLONE_CHILD, + RBD_OPERATION_FEATURE_CLONE_CHILD); + + using klass = AttachChildRequest<I>; + auto aio_comp = create_rados_callback< + klass, &klass::handle_v2_set_op_feature>(this); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, + &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> 
+void AttachChildRequest<I>::handle_v2_set_op_feature(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to enable clone v2: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + v2_child_attach(); +} + +template <typename I> +void AttachChildRequest<I>::v2_child_attach() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::child_attach(&op, m_parent_snap_id, + {m_image_ctx->md_ctx.get_id(), + m_image_ctx->md_ctx.get_namespace(), + m_image_ctx->id}); + + using klass = AttachChildRequest<I>; + auto aio_comp = create_rados_callback< + klass, &klass::handle_v2_child_attach>(this); + int r = m_parent_image_ctx->md_ctx.aio_operate(m_parent_image_ctx->header_oid, + aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void AttachChildRequest<I>::handle_v2_child_attach(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + if (r == -EEXIST && m_old_parent_image_ctx != nullptr) { + ldout(m_cct, 5) << "child already exists" << dendl; + } else { + lderr(m_cct) << "failed to attach child image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + } + + v2_child_detach_from_old_parent(); +} + +template <typename I> +void AttachChildRequest<I>::v2_child_detach_from_old_parent() { + if (m_old_parent_image_ctx == nullptr) { + finish(0); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::child_detach(&op, m_old_parent_snap_id, + {m_image_ctx->md_ctx.get_id(), + m_image_ctx->md_ctx.get_namespace(), + m_image_ctx->id}); + + using klass = AttachChildRequest<I>; + auto aio_comp = create_rados_callback< + klass, &klass::handle_v2_child_detach_from_old_parent>(this); + int r = m_old_parent_image_ctx->md_ctx.aio_operate( + m_old_parent_image_ctx->header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void 
AttachChildRequest<I>::handle_v2_child_detach_from_old_parent(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to detach child image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void AttachChildRequest<I>::finish(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::AttachChildRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/AttachChildRequest.h b/src/librbd/image/AttachChildRequest.h new file mode 100644 index 00000000..d1450e8d --- /dev/null +++ b/src/librbd/image/AttachChildRequest.h @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H +#define CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" + +class CephContext; +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template <typename ImageCtxT = ImageCtx> +class AttachChildRequest { +public: + static AttachChildRequest* create(ImageCtxT *image_ctx, + ImageCtxT *parent_image_ctx, + const librados::snap_t &parent_snap_id, + ImageCtxT *old_parent_image_ctx, + const librados::snap_t &old_parent_snap_id, + uint32_t clone_format, + Context* on_finish) { + return new AttachChildRequest(image_ctx, parent_image_ctx, parent_snap_id, + old_parent_image_ctx, old_parent_snap_id, + clone_format, on_finish); + } + + AttachChildRequest(ImageCtxT *image_ctx, + ImageCtxT *parent_image_ctx, + const librados::snap_t &parent_snap_id, + ImageCtxT *old_parent_image_ctx, + const librados::snap_t &old_parent_snap_id, + uint32_t clone_format, + Context* on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * (clone v1) | (clone v2) + * /----------------/ 
\---------------\
   *     |                                  |
   *     v                                  v
   *  V1 ADD CHILD                      V2 SET CLONE
   *     |                                  |
   *     v                                  v
   *  V1 VALIDATE PROTECTED             V2 ATTACH CHILD
   *     |                                  |
   *     |                                  v
   *  V1 REMOVE CHILD FROM OLD PARENT   V2 DETACH CHILD FROM OLD PARENT
   *     |                                  |
   *     \----------------\ /---------------/
   *                       |
   *                       v
   *                    <finish>
   *
   * @endverbatim
   *
   * NOTE: on the v1 path the parent image is refreshed (v1_refresh) before
   * the protected-snapshot validation.
   */

  ImageCtxT *m_image_ctx;             // child image being attached
  ImageCtxT *m_parent_image_ctx;      // new parent image
  librados::snap_t m_parent_snap_id;  // snapshot of the new parent
  ImageCtxT *m_old_parent_image_ctx;  // previous parent, nullptr if none
  librados::snap_t m_old_parent_snap_id;
  uint32_t m_clone_format;            // 1 selects the v1 path
  Context* m_on_finish;

  CephContext *m_cct;

  void v1_add_child();
  void handle_v1_add_child(int r);

  void v1_refresh();
  void handle_v1_refresh(int r);

  void v1_remove_child_from_old_parent();
  void handle_v1_remove_child_from_old_parent(int r);

  void v2_set_op_feature();
  void handle_v2_set_op_feature(int r);

  void v2_child_attach();
  void handle_v2_child_attach(int r);

  void v2_child_detach_from_old_parent();
  void handle_v2_child_detach_from_old_parent(int r);

  void finish(int r);
};

} // namespace image
} // namespace librbd

extern template class librbd::image::AttachChildRequest<librbd::ImageCtx>;

#endif // CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H
diff --git a/src/librbd/image/AttachParentRequest.cc b/src/librbd/image/AttachParentRequest.cc new file mode 100644 index 00000000..19e69594 --- /dev/null +++ b/src/librbd/image/AttachParentRequest.cc @@ -0,0 +1,91 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/image/AttachParentRequest.h"
#include "common/dout.h"
#include "common/errno.h"
#include "common/WorkQueue.h"
#include "cls/rbd/cls_rbd_client.h"
#include "librbd/ImageCtx.h"
#include "librbd/Utils.h"

#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
#define dout_prefix *_dout << "librbd::image::AttachParentRequest: " << this \
                           << " " << __func__ << ": "

namespace librbd {
namespace image {

using
util::create_rados_callback; + +template <typename I> +void AttachParentRequest<I>::send() { + attach_parent(); +} + +template <typename I> +void AttachParentRequest<I>::attach_parent() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + if (!m_legacy_parent) { + librbd::cls_client::parent_attach(&op, m_parent_image_spec, + m_parent_overlap, m_reattach); + } else { + librbd::cls_client::set_parent(&op, m_parent_image_spec, m_parent_overlap); + } + + auto aio_comp = create_rados_callback< + AttachParentRequest<I>, + &AttachParentRequest<I>::handle_attach_parent>(this); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void AttachParentRequest<I>::handle_attach_parent(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + if (!m_legacy_parent && r == -EOPNOTSUPP && !m_reattach) { + if (m_parent_image_spec.pool_namespace == + m_image_ctx.md_ctx.get_namespace()) { + m_parent_image_spec.pool_namespace = ""; + } + if (m_parent_image_spec.pool_namespace.empty()) { + ldout(cct, 10) << "retrying using legacy parent method" << dendl; + m_legacy_parent = true; + attach_parent(); + return; + } + + // namespaces require newer OSDs + r = -EXDEV; + } + + if (r < 0) { + lderr(cct) << "attach parent encountered an error: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void AttachParentRequest<I>::finish(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::AttachParentRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/AttachParentRequest.h b/src/librbd/image/AttachParentRequest.h new file mode 100644 index 00000000..482e0327 --- /dev/null +++ b/src/librbd/image/AttachParentRequest.h @@ -0,0 +1,79 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H +#define CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "librbd/Types.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template <typename ImageCtxT = ImageCtx> +class AttachParentRequest { +public: + static AttachParentRequest* create(ImageCtxT& image_ctx, + const cls::rbd::ParentImageSpec& pspec, + uint64_t parent_overlap, + bool reattach, + Context* on_finish) { + return new AttachParentRequest(image_ctx, pspec, parent_overlap, reattach, + on_finish); + } + + AttachParentRequest(ImageCtxT& image_ctx, + const cls::rbd::ParentImageSpec& pspec, + uint64_t parent_overlap, bool reattach, + Context* on_finish) + : m_image_ctx(image_ctx), m_parent_image_spec(pspec), + m_parent_overlap(parent_overlap), m_reattach(reattach), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | * * * * * * + * | * * -EOPNOTSUPP + * v v * + * ATTACH_PARENT * * * + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT& m_image_ctx; + cls::rbd::ParentImageSpec m_parent_image_spec; + uint64_t m_parent_overlap; + bool m_reattach; + Context* m_on_finish; + + bool m_legacy_parent = false; + + void attach_parent(); + void handle_attach_parent(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::AttachParentRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H diff --git a/src/librbd/image/CloneRequest.cc b/src/librbd/image/CloneRequest.cc new file mode 100644 index 00000000..b7b07410 --- /dev/null +++ b/src/librbd/image/CloneRequest.cc @@ -0,0 +1,647 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + 
+#include "cls/rbd/cls_rbd_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/image/AttachChildRequest.h" +#include "librbd/image/AttachParentRequest.h" +#include "librbd/image/CloneRequest.h" +#include "librbd/image/CreateRequest.h" +#include "librbd/image/RemoveRequest.h" +#include "librbd/mirror/EnableRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::CloneRequest: " << this << " " \ + << __func__ << ": " + +#define MAX_KEYS 64 + +namespace librbd { +namespace image { + +using util::create_rados_callback; +using util::create_context_callback; +using util::create_async_context_callback; + +template <typename I> +CloneRequest<I>::CloneRequest(ConfigProxy& config, + IoCtx& parent_io_ctx, + const std::string& parent_image_id, + const std::string& parent_snap_name, + uint64_t parent_snap_id, + IoCtx &c_ioctx, + const std::string &c_name, + const std::string &c_id, + ImageOptions c_options, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + ContextWQ *op_work_queue, Context *on_finish) + : m_config(config), m_parent_io_ctx(parent_io_ctx), + m_parent_image_id(parent_image_id), m_parent_snap_name(parent_snap_name), + m_parent_snap_id(parent_snap_id), m_ioctx(c_ioctx), m_name(c_name), + m_id(c_id), m_opts(c_options), + m_non_primary_global_image_id(non_primary_global_image_id), + m_primary_mirror_uuid(primary_mirror_uuid), + m_op_work_queue(op_work_queue), m_on_finish(on_finish), + m_use_p_features(true) { + + m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + + bool default_format_set; + m_opts.is_set(RBD_IMAGE_OPTION_FORMAT, &default_format_set); + if (!default_format_set) { + m_opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2)); + } + + ldout(m_cct, 20) << "parent_pool_id=" << 
parent_io_ctx.get_id() << ", " + << "parent_image_id=" << parent_image_id << ", " + << "parent_snap=" << parent_snap_name << "/" + << parent_snap_id << " clone to " + << "pool_id=" << m_ioctx.get_id() << ", " + << "name=" << m_name << ", " + << "opts=" << m_opts << dendl; +} + +template <typename I> +void CloneRequest<I>::send() { + ldout(m_cct, 20) << dendl; + validate_options(); +} + +template <typename I> +void CloneRequest<I>::validate_options() { + ldout(m_cct, 20) << dendl; + + uint64_t format = 0; + m_opts.get(RBD_IMAGE_OPTION_FORMAT, &format); + if (format < 2) { + lderr(m_cct) << "format 2 or later required for clone" << dendl; + complete(-EINVAL); + return; + } + + if (m_opts.get(RBD_IMAGE_OPTION_FEATURES, &m_features) == 0) { + if (m_features & ~RBD_FEATURES_ALL) { + lderr(m_cct) << "librbd does not support requested features" << dendl; + complete(-ENOSYS); + return; + } + m_use_p_features = false; + } + + if (m_opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &m_clone_format) < 0) { + std::string default_clone_format = m_config.get_val<std::string>( + "rbd_default_clone_format"); + if (default_clone_format == "1") { + m_clone_format = 1; + } else if (default_clone_format == "auto") { + librados::Rados rados(m_ioctx); + int8_t min_compat_client; + int8_t require_min_compat_client; + int r = rados.get_min_compatible_client(&min_compat_client, + &require_min_compat_client); + if (r < 0) { + complete(r); + return; + } + if (std::max(min_compat_client, require_min_compat_client) < + CEPH_RELEASE_MIMIC) { + m_clone_format = 1; + } + } + } + + if (m_clone_format == 1 && + m_parent_io_ctx.get_namespace() != m_ioctx.get_namespace()) { + ldout(m_cct, 1) << "clone v2 required for cross-namespace clones" << dendl; + complete(-EXDEV); + return; + } + + open_parent(); +} + +template <typename I> +void CloneRequest<I>::open_parent() { + ldout(m_cct, 20) << dendl; + ceph_assert(m_parent_snap_name.empty() ^ (m_parent_snap_id == CEPH_NOSNAP)); + + if (m_parent_snap_id != 
CEPH_NOSNAP) { + m_parent_image_ctx = I::create("", m_parent_image_id, m_parent_snap_id, + m_parent_io_ctx, true); + } else { + m_parent_image_ctx = I::create("", m_parent_image_id, + m_parent_snap_name.c_str(), m_parent_io_ctx, + true); + } + + Context *ctx = create_context_callback< + CloneRequest<I>, &CloneRequest<I>::handle_open_parent>(this); + m_parent_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx); +} + +template <typename I> +void CloneRequest<I>::handle_open_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_parent_image_ctx->destroy(); + m_parent_image_ctx = nullptr; + + lderr(m_cct) << "failed to open parent image: " << cpp_strerror(r) << dendl; + complete(r); + return; + } + + m_parent_snap_id = m_parent_image_ctx->snap_id; + m_pspec = {m_parent_io_ctx.get_id(), m_parent_io_ctx.get_namespace(), + m_parent_image_id, m_parent_snap_id}; + validate_parent(); +} + +template <typename I> +void CloneRequest<I>::validate_parent() { + ldout(m_cct, 20) << dendl; + + if (m_parent_image_ctx->operations_disabled) { + lderr(m_cct) << "image operations disabled due to unsupported op features" + << dendl; + m_r_saved = -EROFS; + close_parent(); + return; + } + + if (m_parent_image_ctx->snap_id == CEPH_NOSNAP) { + lderr(m_cct) << "image to be cloned must be a snapshot" << dendl; + m_r_saved = -EINVAL; + close_parent(); + return; + } + + if (m_parent_image_ctx->old_format) { + lderr(m_cct) << "parent image must be in new format" << dendl; + m_r_saved = -EINVAL; + close_parent(); + return; + } + + m_parent_image_ctx->snap_lock.get_read(); + uint64_t p_features = m_parent_image_ctx->features; + m_size = m_parent_image_ctx->get_image_size(m_parent_image_ctx->snap_id); + + bool snap_protected; + int r = m_parent_image_ctx->is_snap_protected(m_parent_image_ctx->snap_id, &snap_protected); + m_parent_image_ctx->snap_lock.put_read(); + + if ((p_features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) { + lderr(m_cct) << "parent image must 
support layering" << dendl; + m_r_saved = -ENOSYS; + close_parent(); + return; + } + if (m_use_p_features) { + m_features = (p_features & ~RBD_FEATURES_IMPLICIT_ENABLE); + } + + if (r < 0) { + lderr(m_cct) << "unable to locate parent's snapshot" << dendl; + m_r_saved = r; + close_parent(); + return; + } + + if (m_clone_format == 1 && !snap_protected) { + lderr(m_cct) << "parent snapshot must be protected" << dendl; + m_r_saved = -EINVAL; + close_parent(); + return; + } + + validate_child(); +} + +template <typename I> +void CloneRequest<I>::validate_child() { + ldout(m_cct, 15) << dendl; + + if ((m_features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) { + lderr(m_cct) << "cloning image must support layering" << dendl; + m_r_saved = -ENOSYS; + close_parent(); + return; + } + + using klass = CloneRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_validate_child>(this); + + librados::ObjectReadOperation op; + op.stat(NULL, NULL, NULL); + + int r = m_ioctx.aio_operate(util::old_header_name(m_name), comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void CloneRequest<I>::handle_validate_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r != -ENOENT) { + lderr(m_cct) << "rbd image " << m_name << " already exists" << dendl; + m_r_saved = r; + close_parent(); + return; + } + + create_child(); +} + +template <typename I> +void CloneRequest<I>::create_child() { + ldout(m_cct, 15) << dendl; + + uint64_t order = m_parent_image_ctx->order; + if (m_opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + m_opts.set(RBD_IMAGE_OPTION_ORDER, order); + } + m_opts.set(RBD_IMAGE_OPTION_FEATURES, m_features); + + using klass = CloneRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_create_child>(this); + + RWLock::RLocker snap_locker(m_parent_image_ctx->snap_lock); + CreateRequest<I> *req = CreateRequest<I>::create( + m_config, m_ioctx, m_name, m_id, m_size, 
m_opts, + m_non_primary_global_image_id, m_primary_mirror_uuid, true, + m_op_work_queue, ctx); + req->send(); +} + +template <typename I> +void CloneRequest<I>::handle_create_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EBADF) { + ldout(m_cct, 5) << "image id already in-use" << dendl; + complete(r); + return; + } else if (r < 0) { + lderr(m_cct) << "error creating child: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_parent(); + return; + } + open_child(); +} + +template <typename I> +void CloneRequest<I>::open_child() { + ldout(m_cct, 15) << dendl; + + m_imctx = I::create(m_name, "", nullptr, m_ioctx, false); + + using klass = CloneRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_open_child>(this); + + uint64_t flags = OPEN_FLAG_SKIP_OPEN_PARENT; + if ((m_features & RBD_FEATURE_MIGRATING) != 0) { + flags |= OPEN_FLAG_IGNORE_MIGRATING; + } + + m_imctx->state->open(flags, ctx); +} + +template <typename I> +void CloneRequest<I>::handle_open_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + m_imctx->destroy(); + m_imctx = nullptr; + + lderr(m_cct) << "Error opening new image: " << cpp_strerror(r) << dendl; + m_r_saved = r; + remove_child(); + return; + } + + attach_parent(); +} + +template <typename I> +void CloneRequest<I>::attach_parent() { + ldout(m_cct, 15) << dendl; + + auto ctx = create_context_callback< + CloneRequest<I>, &CloneRequest<I>::handle_attach_parent>(this); + auto req = AttachParentRequest<I>::create( + *m_imctx, m_pspec, m_size, false, ctx); + req->send(); +} + +template <typename I> +void CloneRequest<I>::handle_attach_parent(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_child(); + return; + } + + attach_child(); +} + +template <typename I> +void CloneRequest<I>::attach_child() { + ldout(m_cct, 15) << dendl; + + auto ctx = 
create_context_callback< + CloneRequest<I>, &CloneRequest<I>::handle_attach_child>(this); + auto req = AttachChildRequest<I>::create( + m_imctx, m_parent_image_ctx, m_parent_image_ctx->snap_id, nullptr, 0, + m_clone_format, ctx); + req->send(); +} + +template <typename I> +void CloneRequest<I>::handle_attach_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_child(); + return; + } + + metadata_list(); +} + +template <typename I> +void CloneRequest<I>::metadata_list() { + ldout(m_cct, 15) << "start_key=" << m_last_metadata_key << dendl; + + librados::ObjectReadOperation op; + cls_client::metadata_list_start(&op, m_last_metadata_key, 0); + + using klass = CloneRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_metadata_list>(this); + m_out_bl.clear(); + m_parent_image_ctx->md_ctx.aio_operate(m_parent_image_ctx->header_oid, + comp, &op, &m_out_bl); + comp->release(); +} + +template <typename I> +void CloneRequest<I>::handle_metadata_list(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + map<string, bufferlist> metadata; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::metadata_list_finish(&it, &metadata); + } + + if (r < 0) { + if (r == -EOPNOTSUPP || r == -EIO) { + ldout(m_cct, 10) << "config metadata not supported by OSD" << dendl; + get_mirror_mode(); + } else { + lderr(m_cct) << "couldn't list metadata: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_child(); + } + return; + } + + if (!metadata.empty()) { + m_pairs.insert(metadata.begin(), metadata.end()); + m_last_metadata_key = m_pairs.rbegin()->first; + } + + if (metadata.size() == MAX_KEYS) { + metadata_list(); + } else { + metadata_set(); + } +} + +template <typename I> +void CloneRequest<I>::metadata_set() { + if (m_pairs.empty()) { + get_mirror_mode(); + return; + } + + ldout(m_cct, 15) << dendl; + + 
librados::ObjectWriteOperation op; + cls_client::metadata_set(&op, m_pairs); + + using klass = CloneRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_metadata_set>(this); + int r = m_ioctx.aio_operate(m_imctx->header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void CloneRequest<I>::handle_metadata_set(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "couldn't set metadata: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_child(); + } else { + get_mirror_mode(); + } +} + +template <typename I> +void CloneRequest<I>::get_mirror_mode() { + ldout(m_cct, 15) << dendl; + + if (!m_imctx->test_features(RBD_FEATURE_JOURNALING)) { + close_child(); + return; + } + + librados::ObjectReadOperation op; + cls_client::mirror_mode_get_start(&op); + + using klass = CloneRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_get_mirror_mode>(this); + m_out_bl.clear(); + m_imctx->md_ctx.aio_operate(RBD_MIRRORING, + comp, &op, &m_out_bl); + comp->release(); +} + +template <typename I> +void CloneRequest<I>::handle_get_mirror_mode(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode); + } + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + + m_r_saved = r; + close_child(); + } else { + if (m_mirror_mode == cls::rbd::MIRROR_MODE_POOL || + !m_non_primary_global_image_id.empty()) { + enable_mirror(); + } else { + close_child(); + } + } +} + +template <typename I> +void CloneRequest<I>::enable_mirror() { + ldout(m_cct, 15) << dendl; + + using klass = CloneRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_enable_mirror>(this); + + mirror::EnableRequest<I> *req = mirror::EnableRequest<I>::create( + m_imctx->md_ctx, m_id, 
m_non_primary_global_image_id, + m_imctx->op_work_queue, ctx); + req->send(); +} + +template <typename I> +void CloneRequest<I>::handle_enable_mirror(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r) + << dendl; + m_r_saved = r; + } + close_child(); +} + +template <typename I> +void CloneRequest<I>::close_child() { + ldout(m_cct, 15) << dendl; + + ceph_assert(m_imctx != nullptr); + + using klass = CloneRequest<I>; + Context *ctx = create_async_context_callback( + *m_imctx, create_context_callback< + klass, &klass::handle_close_child>(this)); + m_imctx->state->close(ctx); +} + +template <typename I> +void CloneRequest<I>::handle_close_child(int r) { + ldout(m_cct, 15) << dendl; + + m_imctx->destroy(); + m_imctx = nullptr; + + if (r < 0) { + lderr(m_cct) << "couldn't close image: " << cpp_strerror(r) << dendl; + if (m_r_saved == 0) { + m_r_saved = r; + } + } + + if (m_r_saved < 0) { + remove_child(); + return; + } + + close_parent(); +} + +template <typename I> +void CloneRequest<I>::remove_child() { + ldout(m_cct, 15) << dendl; + + using klass = CloneRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_remove_child>(this); + + auto req = librbd::image::RemoveRequest<I>::create( + m_ioctx, m_name, m_id, false, false, m_no_op, m_op_work_queue, ctx); + req->send(); +} + +template <typename I> +void CloneRequest<I>::handle_remove_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "Error removing failed clone: " + << cpp_strerror(r) << dendl; + } + + close_parent(); +} + +template <typename I> +void CloneRequest<I>::close_parent() { + ldout(m_cct, 20) << dendl; + ceph_assert(m_parent_image_ctx != nullptr); + + Context *ctx = create_async_context_callback( + *m_parent_image_ctx, create_context_callback< + CloneRequest<I>, &CloneRequest<I>::handle_close_parent>(this)); + m_parent_image_ctx->state->close(ctx); +} + +template 
<typename I> +void CloneRequest<I>::handle_close_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_parent_image_ctx->destroy(); + m_parent_image_ctx = nullptr; + + if (r < 0) { + lderr(m_cct) << "failed to close parent image: " + << cpp_strerror(r) << dendl; + if (m_r_saved == 0) { + m_r_saved = r; + } + } + + complete(m_r_saved); +} + +template <typename I> +void CloneRequest<I>::complete(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} //namespace image +} //namespace librbd + +template class librbd::image::CloneRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/CloneRequest.h b/src/librbd/image/CloneRequest.h new file mode 100644 index 00000000..640de9fe --- /dev/null +++ b/src/librbd/image/CloneRequest.h @@ -0,0 +1,178 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H + +#include "include/rbd/librbd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/internal.h" + +class ConfigProxy; +class Context; + +using librados::IoCtx; + +namespace librbd { +namespace image { + +template <typename ImageCtxT = ImageCtx> +class CloneRequest { +public: + static CloneRequest *create(ConfigProxy& config, IoCtx& parent_io_ctx, + const std::string& parent_image_id, + const std::string& parent_snap_name, + uint64_t parent_snap_id, + IoCtx &c_ioctx, const std::string &c_name, + const std::string &c_id, ImageOptions c_options, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + ContextWQ *op_work_queue, Context *on_finish) { + return new CloneRequest(config, parent_io_ctx, parent_image_id, + parent_snap_name, parent_snap_id, c_ioctx, c_name, + c_id, c_options, non_primary_global_image_id, + primary_mirror_uuid, op_work_queue, on_finish); + } + + CloneRequest(ConfigProxy& config, IoCtx& parent_io_ctx, + const std::string& 
parent_image_id, + const std::string& parent_snap_name, + uint64_t parent_snap_id, + IoCtx &c_ioctx, const std::string &c_name, + const std::string &c_id, ImageOptions c_options, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + ContextWQ *op_work_queue, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * OPEN PARENT + * | + * v + * VALIDATE CHILD <finish> + * | ^ + * v | + * CREATE CHILD * * * * * * * * * > CLOSE PARENT + * | ^ + * v | + * OPEN CHILD * * * * * * * * * * > REMOVE CHILD + * | ^ + * v | + * ATTACH PARENT * * * * * * * * > CLOSE CHILD + * | ^ + * v * + * ATTACH CHILD * * * * * * * * * * * * + * | * + * v * + * GET PARENT META * * * * * * * * * ^ + * | * + * v (skip if not needed) * + * SET CHILD META * * * * * * * * * * ^ + * | * + * v (skip if not needed) * + * GET MIRROR MODE * * * * * * * * * ^ + * | * + * v (skip if not needed) * + * SET MIRROR ENABLED * * * * * * * * * + * | + * v + * CLOSE CHILD + * | + * v + * CLOSE PARENT + * | + * v + * <finish> + * + * @endverbatim + */ + + ConfigProxy& m_config; + IoCtx &m_parent_io_ctx; + std::string m_parent_image_id; + std::string m_parent_snap_name; + uint64_t m_parent_snap_id; + ImageCtxT *m_parent_image_ctx; + + IoCtx &m_ioctx; + std::string m_name; + std::string m_id; + ImageOptions m_opts; + cls::rbd::ParentImageSpec m_pspec; + ImageCtxT *m_imctx; + cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + const std::string m_non_primary_global_image_id; + const std::string m_primary_mirror_uuid; + NoOpProgressContext m_no_op; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + uint64_t m_clone_format = 2; + bool m_use_p_features; + uint64_t m_features; + map<string, bufferlist> m_pairs; + std::string m_last_metadata_key; + bufferlist m_out_bl; + uint64_t m_size; + int m_r_saved = 0; + + void validate_options(); + + void open_parent(); + void handle_open_parent(int 
r); + + void validate_parent(); + + void validate_child(); + void handle_validate_child(int r); + + void create_child(); + void handle_create_child(int r); + + void open_child(); + void handle_open_child(int r); + + void attach_parent(); + void handle_attach_parent(int r); + + void attach_child(); + void handle_attach_child(int r); + + void metadata_list(); + void handle_metadata_list(int r); + + void metadata_set(); + void handle_metadata_set(int r); + + void get_mirror_mode(); + void handle_get_mirror_mode(int r); + + void enable_mirror(); + void handle_enable_mirror(int r); + + void close_child(); + void handle_close_child(int r); + + void remove_child(); + void handle_remove_child(int r); + + void close_parent(); + void handle_close_parent(int r); + + void complete(int r); +}; + +} //namespace image +} //namespace librbd + +extern template class librbd::image::CloneRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H diff --git a/src/librbd/image/CloseRequest.cc b/src/librbd/image/CloseRequest.cc new file mode 100644 index 00000000..c43e1833 --- /dev/null +++ b/src/librbd/image/CloseRequest.cc @@ -0,0 +1,342 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/CloseRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ConfigWatcher.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/io/ObjectDispatcher.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::CloseRequest: " + +namespace librbd { +namespace image { + +using util::create_async_context_callback; +using util::create_context_callback; + +template 
<typename I> +CloseRequest<I>::CloseRequest(I *image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish), m_error_result(0), + m_exclusive_lock(nullptr) { + ceph_assert(image_ctx != nullptr); +} + +template <typename I> +void CloseRequest<I>::send() { + if (m_image_ctx->config_watcher != nullptr) { + m_image_ctx->config_watcher->shut_down(); + + delete m_image_ctx->config_watcher; + m_image_ctx->config_watcher = nullptr; + } + + send_block_image_watcher(); +} + +template <typename I> +void CloseRequest<I>::send_block_image_watcher() { + if (m_image_ctx->image_watcher == nullptr) { + send_shut_down_update_watchers(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // prevent incoming requests from our peers + m_image_ctx->image_watcher->block_notifies(create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_block_image_watcher>(this)); +} + +template <typename I> +void CloseRequest<I>::handle_block_image_watcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + send_shut_down_update_watchers(); +} + +template <typename I> +void CloseRequest<I>::send_shut_down_update_watchers() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->state->shut_down_update_watchers(create_async_context_callback( + *m_image_ctx, create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_shut_down_update_watchers>(this))); +} + +template <typename I> +void CloseRequest<I>::handle_shut_down_update_watchers(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to shut down update watchers: " << cpp_strerror(r) + << dendl; + } + + send_shut_down_io_queue(); +} + +template <typename I> +void CloseRequest<I>::send_shut_down_io_queue() { 
+ CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + RWLock::RLocker owner_locker(m_image_ctx->owner_lock); + m_image_ctx->io_work_queue->shut_down(create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_shut_down_io_queue>(this)); +} + +template <typename I> +void CloseRequest<I>::handle_shut_down_io_queue(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + send_shut_down_exclusive_lock(); +} + +template <typename I> +void CloseRequest<I>::send_shut_down_exclusive_lock() { + { + RWLock::WLocker owner_locker(m_image_ctx->owner_lock); + m_exclusive_lock = m_image_ctx->exclusive_lock; + + // if reading a snapshot -- possible object map is open + RWLock::WLocker snap_locker(m_image_ctx->snap_lock); + if (m_exclusive_lock == nullptr) { + delete m_image_ctx->object_map; + m_image_ctx->object_map = nullptr; + } + } + + if (m_exclusive_lock == nullptr) { + send_flush(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // in-flight IO will be flushed and in-flight requests will be canceled + // before releasing lock + m_exclusive_lock->shut_down(create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_shut_down_exclusive_lock>(this)); +} + +template <typename I> +void CloseRequest<I>::handle_shut_down_exclusive_lock(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + { + RWLock::RLocker owner_locker(m_image_ctx->owner_lock); + ceph_assert(m_image_ctx->exclusive_lock == nullptr); + + // object map and journal closed during exclusive lock shutdown + RWLock::RLocker snap_locker(m_image_ctx->snap_lock); + ceph_assert(m_image_ctx->journal == nullptr); + ceph_assert(m_image_ctx->object_map == nullptr); + } + + delete m_exclusive_lock; + m_exclusive_lock = nullptr; + + save_result(r); + if (r < 0) { + lderr(cct) 
<< "failed to shut down exclusive lock: " << cpp_strerror(r) + << dendl; + } + + send_unregister_image_watcher(); +} + +template <typename I> +void CloseRequest<I>::send_flush() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + RWLock::RLocker owner_locker(m_image_ctx->owner_lock); + auto ctx = create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_flush>(this); + auto aio_comp = io::AioCompletion::create_and_start(ctx, m_image_ctx, + io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec<I>::create_flush_request( + *m_image_ctx, aio_comp, io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); + delete req; +} + +template <typename I> +void CloseRequest<I>::handle_flush(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to flush IO: " << cpp_strerror(r) << dendl; + } + send_unregister_image_watcher(); +} + +template <typename I> +void CloseRequest<I>::send_unregister_image_watcher() { + if (m_image_ctx->image_watcher == nullptr) { + send_flush_readahead(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->image_watcher->unregister_watch(create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_unregister_image_watcher>(this)); +} + +template <typename I> +void CloseRequest<I>::handle_unregister_image_watcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to unregister image watcher: " << cpp_strerror(r) + << dendl; + } + + send_flush_readahead(); +} + +template <typename I> +void CloseRequest<I>::send_flush_readahead() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->readahead.wait_for_pending(create_async_context_callback( + 
*m_image_ctx, create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_flush_readahead>(this))); +} + +template <typename I> +void CloseRequest<I>::handle_flush_readahead(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + send_shut_down_object_dispatcher(); +} + +template <typename I> +void CloseRequest<I>::send_shut_down_object_dispatcher() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->io_object_dispatcher->shut_down(create_context_callback< + CloseRequest<I>, + &CloseRequest<I>::handle_shut_down_object_dispatcher>(this)); +} + +template <typename I> +void CloseRequest<I>::handle_shut_down_object_dispatcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to shut down object dispatcher: " + << cpp_strerror(r) << dendl; + } + send_flush_op_work_queue(); +} + +template <typename I> +void CloseRequest<I>::send_flush_op_work_queue() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->op_work_queue->queue(create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_flush_op_work_queue>(this), 0); +} + +template <typename I> +void CloseRequest<I>::handle_flush_op_work_queue(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + send_close_parent(); +} + +template <typename I> +void CloseRequest<I>::send_close_parent() { + if (m_image_ctx->parent == nullptr) { + send_flush_image_watcher(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->parent->state->close(create_async_context_callback( + *m_image_ctx, create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_close_parent>(this))); 
+} + +template <typename I> +void CloseRequest<I>::handle_close_parent(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + delete m_image_ctx->parent; + m_image_ctx->parent = nullptr; + save_result(r); + if (r < 0) { + lderr(cct) << "error closing parent image: " << cpp_strerror(r) << dendl; + } + send_flush_image_watcher(); +} + +template <typename I> +void CloseRequest<I>::send_flush_image_watcher() { + if (m_image_ctx->image_watcher == nullptr) { + finish(); + return; + } + + m_image_ctx->image_watcher->flush(create_context_callback< + CloseRequest<I>, &CloseRequest<I>::handle_flush_image_watcher>(this)); +} + +template <typename I> +void CloseRequest<I>::handle_flush_image_watcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "error flushing image watcher: " << cpp_strerror(r) << dendl; + } + save_result(r); + finish(); +} + +template <typename I> +void CloseRequest<I>::finish() { + m_image_ctx->shutdown(); + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::CloseRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/CloseRequest.h b/src/librbd/image/CloseRequest.h new file mode 100644 index 00000000..fd101bee --- /dev/null +++ b/src/librbd/image/CloseRequest.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H + +#include "librbd/ImageCtx.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template <typename ImageCtxT = ImageCtx> +class CloseRequest { +public: + static CloseRequest *create(ImageCtxT *image_ctx, Context *on_finish) { + return new CloseRequest(image_ctx, on_finish); + } + + void send(); + +private: + 
/** + * @verbatim + * + * <start> + * | + * v + * BLOCK_IMAGE_WATCHER (skip if R/O) + * | + * v + * SHUT_DOWN_UPDATE_WATCHERS + * | + * v + * SHUT_DOWN_AIO_WORK_QUEUE . . . + * | . (exclusive lock disabled) + * v v + * SHUT_DOWN_EXCLUSIVE_LOCK FLUSH + * | . + * | . . . . . . . . . . . + * | . + * v v + * UNREGISTER_IMAGE_WATCHER (skip if R/O) + * | + * v + * FLUSH_READAHEAD + * | + * v + * SHUT_DOWN_OBJECT_DISPATCHER + * | + * v + * FLUSH_OP_WORK_QUEUE + * | + * v (skip if no parent) + * CLOSE_PARENT + * | + * v + * FLUSH_IMAGE_WATCHER + * | + * v + * <finish> + * + * @endverbatim + */ + + CloseRequest(ImageCtxT *image_ctx, Context *on_finish); + + ImageCtxT *m_image_ctx; + Context *m_on_finish; + + int m_error_result; + + decltype(m_image_ctx->exclusive_lock) m_exclusive_lock; + + void send_block_image_watcher(); + void handle_block_image_watcher(int r); + + void send_shut_down_update_watchers(); + void handle_shut_down_update_watchers(int r); + + void send_shut_down_io_queue(); + void handle_shut_down_io_queue(int r); + + void send_shut_down_exclusive_lock(); + void handle_shut_down_exclusive_lock(int r); + + void send_flush(); + void handle_flush(int r); + + void send_unregister_image_watcher(); + void handle_unregister_image_watcher(int r); + + void send_flush_readahead(); + void handle_flush_readahead(int r); + + void send_shut_down_object_dispatcher(); + void handle_shut_down_object_dispatcher(int r); + + void send_flush_op_work_queue(); + void handle_flush_op_work_queue(int r); + + void send_close_parent(); + void handle_close_parent(int r); + + void send_flush_image_watcher(); + void handle_flush_image_watcher(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::CloseRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H diff --git 
a/src/librbd/image/CreateRequest.cc b/src/librbd/image/CreateRequest.cc new file mode 100644 index 00000000..2dfdc6b3 --- /dev/null +++ b/src/librbd/image/CreateRequest.cc @@ -0,0 +1,830 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/CreateRequest.h" +#include "include/ceph_assert.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/ceph_context.h" +#include "cls/rbd/cls_rbd_client.h" +#include "osdc/Striper.h" +#include "librbd/Features.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/image/ValidatePoolRequest.h" +#include "librbd/journal/CreateRequest.h" +#include "librbd/journal/RemoveRequest.h" +#include "librbd/mirror/EnableRequest.h" +#include "journal/Journaler.h" + + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::CreateRequest: " << __func__ \ + << ": " + +namespace librbd { +namespace image { + +using util::create_rados_callback; +using util::create_context_callback; + +namespace { + +int validate_features(CephContext *cct, uint64_t features, + bool force_non_primary) { + if (features & ~RBD_FEATURES_ALL) { + lderr(cct) << "librbd does not support requested features." 
<< dendl; + return -ENOSYS; + } + if ((features & RBD_FEATURES_INTERNAL) != 0) { + lderr(cct) << "cannot use internally controlled features" << dendl; + return -EINVAL; + } + if ((features & RBD_FEATURE_FAST_DIFF) != 0 && + (features & RBD_FEATURE_OBJECT_MAP) == 0) { + lderr(cct) << "cannot use fast diff without object map" << dendl; + return -EINVAL; + } + if ((features & RBD_FEATURE_OBJECT_MAP) != 0 && + (features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) { + lderr(cct) << "cannot use object map without exclusive lock" << dendl; + return -EINVAL; + } + if ((features & RBD_FEATURE_JOURNALING) != 0) { + if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) { + lderr(cct) << "cannot use journaling without exclusive lock" << dendl; + return -EINVAL; + } + } else if (force_non_primary) { + ceph_abort(); + } + + return 0; +} + +int validate_striping(CephContext *cct, uint8_t order, uint64_t stripe_unit, + uint64_t stripe_count) { + if ((stripe_unit && !stripe_count) || + (!stripe_unit && stripe_count)) { + lderr(cct) << "must specify both (or neither) of stripe-unit and " + << "stripe-count" << dendl; + return -EINVAL; + } else if (stripe_unit || stripe_count) { + if ((1ull << order) % stripe_unit || stripe_unit > (1ull << order)) { + lderr(cct) << "stripe unit is not a factor of the object size" << dendl; + return -EINVAL; + } + } + return 0; +} + +bool validate_layout(CephContext *cct, uint64_t size, file_layout_t &layout) { + if (!librbd::ObjectMap<>::is_compatible(layout, size)) { + lderr(cct) << "image size not compatible with object map" << dendl; + return false; + } + + return true; +} + +int get_image_option(const ImageOptions &image_options, int option, + uint8_t *value) { + uint64_t large_value; + int r = image_options.get(option, &large_value); + if (r < 0) { + return r; + } + *value = static_cast<uint8_t>(large_value); + return 0; +} + +} // anonymous namespace + +template<typename I> +int CreateRequest<I>::validate_order(CephContext *cct, uint8_t order) { + if 
(order > 25 || order < 12) { + lderr(cct) << "order must be in the range [12, 25]" << dendl; + return -EDOM; + } + return 0; +} + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::CreateRequest: " << this << " " \ + << __func__ << ": " + +template<typename I> +CreateRequest<I>::CreateRequest(const ConfigProxy& config, IoCtx &ioctx, + const std::string &image_name, + const std::string &image_id, uint64_t size, + const ImageOptions &image_options, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + bool skip_mirror_enable, + ContextWQ *op_work_queue, Context *on_finish) + : m_config(config), m_image_name(image_name), m_image_id(image_id), + m_size(size), m_non_primary_global_image_id(non_primary_global_image_id), + m_primary_mirror_uuid(primary_mirror_uuid), + m_skip_mirror_enable(skip_mirror_enable), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + + m_io_ctx.dup(ioctx); + m_cct = reinterpret_cast<CephContext *>(m_io_ctx.cct()); + + m_id_obj = util::id_obj_name(m_image_name); + m_header_obj = util::header_name(m_image_id); + m_objmap_name = ObjectMap<>::object_map_name(m_image_id, CEPH_NOSNAP); + m_force_non_primary = !non_primary_global_image_id.empty(); + + if (image_options.get(RBD_IMAGE_OPTION_FEATURES, &m_features) != 0) { + m_features = librbd::rbd_features_from_string( + m_config.get_val<std::string>("rbd_default_features"), nullptr); + m_negotiate_features = true; + } + + uint64_t features_clear = 0; + uint64_t features_set = 0; + image_options.get(RBD_IMAGE_OPTION_FEATURES_CLEAR, &features_clear); + image_options.get(RBD_IMAGE_OPTION_FEATURES_SET, &features_set); + + uint64_t features_conflict = features_clear & features_set; + features_clear &= ~features_conflict; + features_set &= ~features_conflict; + m_features |= features_set; + m_features &= ~features_clear; + + m_features &= ~RBD_FEATURES_IMPLICIT_ENABLE; + if ((m_features & RBD_FEATURE_OBJECT_MAP) == RBD_FEATURE_OBJECT_MAP) { + 
m_features |= RBD_FEATURE_FAST_DIFF; + } + + if (image_options.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &m_stripe_unit) != 0 || + m_stripe_unit == 0) { + m_stripe_unit = m_config.get_val<Option::size_t>("rbd_default_stripe_unit"); + } + if (image_options.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &m_stripe_count) != 0 || + m_stripe_count == 0) { + m_stripe_count = m_config.get_val<uint64_t>("rbd_default_stripe_count"); + } + if (get_image_option(image_options, RBD_IMAGE_OPTION_ORDER, &m_order) != 0 || + m_order == 0) { + m_order = config.get_val<uint64_t>("rbd_default_order"); + } + if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_ORDER, + &m_journal_order) != 0) { + m_journal_order = m_config.get_val<uint64_t>("rbd_journal_order"); + } + if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, + &m_journal_splay_width) != 0) { + m_journal_splay_width = m_config.get_val<uint64_t>( + "rbd_journal_splay_width"); + } + if (image_options.get(RBD_IMAGE_OPTION_JOURNAL_POOL, &m_journal_pool) != 0) { + m_journal_pool = m_config.get_val<std::string>("rbd_journal_pool"); + } + if (image_options.get(RBD_IMAGE_OPTION_DATA_POOL, &m_data_pool) != 0) { + m_data_pool = m_config.get_val<std::string>("rbd_default_data_pool"); + } + + m_layout.object_size = 1ull << m_order; + if (m_stripe_unit == 0 || m_stripe_count == 0) { + m_layout.stripe_unit = m_layout.object_size; + m_layout.stripe_count = 1; + } else { + m_layout.stripe_unit = m_stripe_unit; + m_layout.stripe_count = m_stripe_count; + } + + if (!m_data_pool.empty() && m_data_pool != ioctx.get_pool_name()) { + m_features |= RBD_FEATURE_DATA_POOL; + } else { + m_data_pool.clear(); + } + + if ((m_stripe_unit != 0 && m_stripe_unit != (1ULL << m_order)) || + (m_stripe_count != 0 && m_stripe_count != 1)) { + m_features |= RBD_FEATURE_STRIPINGV2; + } + + ldout(m_cct, 10) << "name=" << m_image_name << ", " + << "id=" << m_image_id << ", " + << "size=" << m_size << ", " + << "features=" << m_features << ", " + << "order=" 
<< (uint64_t)m_order << ", " + << "stripe_unit=" << m_stripe_unit << ", " + << "stripe_count=" << m_stripe_count << ", " + << "journal_order=" << (uint64_t)m_journal_order << ", " + << "journal_splay_width=" + << (uint64_t)m_journal_splay_width << ", " + << "journal_pool=" << m_journal_pool << ", " + << "data_pool=" << m_data_pool << dendl; +} + +template<typename I> +void CreateRequest<I>::send() { + ldout(m_cct, 20) << dendl; + + int r = validate_features(m_cct, m_features, m_force_non_primary); + if (r < 0) { + complete(r); + return; + } + + r = validate_order(m_cct, m_order); + if (r < 0) { + complete(r); + return; + } + + r = validate_striping(m_cct, m_order, m_stripe_unit, m_stripe_count); + if (r < 0) { + complete(r); + return; + } + + if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) && + (!validate_layout(m_cct, m_size, m_layout))) { + complete(-EINVAL); + return; + } + + validate_data_pool(); +} + +template <typename I> +void CreateRequest<I>::validate_data_pool() { + m_data_io_ctx = m_io_ctx; + if ((m_features & RBD_FEATURE_DATA_POOL) != 0) { + librados::Rados rados(m_io_ctx); + int r = rados.ioctx_create(m_data_pool.c_str(), m_data_io_ctx); + if (r < 0) { + lderr(m_cct) << "data pool " << m_data_pool << " does not exist" << dendl; + complete(r); + return; + } + m_data_pool_id = m_data_io_ctx.get_id(); + m_data_io_ctx.set_namespace(m_io_ctx.get_namespace()); + } + + if (!m_config.get_val<bool>("rbd_validate_pool")) { + add_image_to_directory(); + return; + } + + ldout(m_cct, 15) << dendl; + + auto ctx = create_context_callback< + CreateRequest<I>, &CreateRequest<I>::handle_validate_data_pool>(this); + auto req = ValidatePoolRequest<I>::create(m_data_io_ctx, m_op_work_queue, + ctx); + req->send(); +} + +template <typename I> +void CreateRequest<I>::handle_validate_data_pool(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EINVAL) { + lderr(m_cct) << "pool does not support RBD images" << dendl; + complete(r); + return; + } else if (r < 0) { 
+ lderr(m_cct) << "failed to validate pool: " << cpp_strerror(r) << dendl; + complete(r); + return; + } + + add_image_to_directory(); +} + +template<typename I> +void CreateRequest<I>::add_image_to_directory() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + if (!m_io_ctx.get_namespace().empty()) { + cls_client::dir_state_assert(&op, cls::rbd::DIRECTORY_STATE_READY); + } + cls_client::dir_add_image(&op, m_image_name, m_image_id); + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_add_image_to_directory>(this); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_add_image_to_directory(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EEXIST) { + ldout(m_cct, 5) << "directory entry for image " << m_image_name + << " already exists" << dendl; + complete(r); + return; + } else if (!m_io_ctx.get_namespace().empty() && r == -ENOENT) { + ldout(m_cct, 5) << "namespace " << m_io_ctx.get_namespace() + << " does not exist" << dendl; + complete(r); + return; + } else if (r < 0) { + lderr(m_cct) << "error adding image to directory: " << cpp_strerror(r) + << dendl; + complete(r); + return; + } + + create_id_object(); +} + +template<typename I> +void CreateRequest<I>::create_id_object() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + op.create(true); + cls_client::set_id(&op, m_image_id); + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_create_id_object>(this); + int r = m_io_ctx.aio_operate(m_id_obj, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_create_id_object(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EEXIST) { + ldout(m_cct, 5) << "id object for " << m_image_name << " already exists" + << 
dendl; + m_r_saved = r; + remove_from_dir(); + return; + } else if (r < 0) { + lderr(m_cct) << "error creating RBD id object: " << cpp_strerror(r) + << dendl; + m_r_saved = r; + remove_from_dir(); + return; + } + + negotiate_features(); +} + +template<typename I> +void CreateRequest<I>::negotiate_features() { + if (!m_negotiate_features) { + create_image(); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::get_all_features_start(&op); + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_negotiate_features>(this); + + m_outbl.clear(); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_outbl); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_negotiate_features(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + uint64_t all_features; + if (r >= 0) { + auto it = m_outbl.cbegin(); + r = cls_client::get_all_features_finish(&it, &all_features); + } + if (r < 0) { + ldout(m_cct, 10) << "error retrieving server supported features set: " + << cpp_strerror(r) << dendl; + } else if ((m_features & all_features) != m_features) { + m_features &= all_features; + ldout(m_cct, 10) << "limiting default features set to server supported: " + << m_features << dendl; + } + + create_image(); +} + +template<typename I> +void CreateRequest<I>::create_image() { + ldout(m_cct, 15) << dendl; + ceph_assert(m_data_pool.empty() || m_data_pool_id != -1); + + ostringstream oss; + oss << RBD_DATA_PREFIX; + if (m_data_pool_id != -1) { + oss << stringify(m_io_ctx.get_id()) << "."; + } + oss << m_image_id; + if (oss.str().length() > RBD_MAX_BLOCK_NAME_PREFIX_LENGTH) { + lderr(m_cct) << "object prefix '" << oss.str() << "' too large" << dendl; + m_r_saved = -EINVAL; + remove_id_object(); + return; + } + + librados::ObjectWriteOperation op; + op.create(true); + cls_client::create_image(&op, m_size, m_order, m_features, oss.str(), + 
m_data_pool_id); + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_create_image>(this); + int r = m_io_ctx.aio_operate(m_header_obj, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_create_image(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EEXIST) { + ldout(m_cct, 5) << "image id already in-use" << dendl; + complete(-EBADF); + return; + } else if (r < 0) { + lderr(m_cct) << "error writing header: " << cpp_strerror(r) << dendl; + m_r_saved = r; + remove_id_object(); + return; + } + + set_stripe_unit_count(); +} + +template<typename I> +void CreateRequest<I>::set_stripe_unit_count() { + if ((!m_stripe_unit && !m_stripe_count) || + ((m_stripe_count == 1) && (m_stripe_unit == (1ull << m_order)))) { + object_map_resize(); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::set_stripe_unit_count(&op, m_stripe_unit, m_stripe_count); + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_set_stripe_unit_count>(this); + int r = m_io_ctx.aio_operate(m_header_obj, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_set_stripe_unit_count(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error setting stripe unit/count: " + << cpp_strerror(r) << dendl; + m_r_saved = r; + remove_header_object(); + return; + } + + object_map_resize(); +} + +template<typename I> +void CreateRequest<I>::object_map_resize() { + if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) { + fetch_mirror_mode(); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::object_map_resize(&op, Striper::get_num_objects(m_layout, m_size), + OBJECT_NONEXISTENT); + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = 
+ create_rados_callback<klass, &klass::handle_object_map_resize>(this); + int r = m_io_ctx.aio_operate(m_objmap_name, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_object_map_resize(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error creating initial object map: " + << cpp_strerror(r) << dendl; + + m_r_saved = r; + remove_header_object(); + return; + } + + fetch_mirror_mode(); +} + +template<typename I> +void CreateRequest<I>::fetch_mirror_mode() { + if ((m_features & RBD_FEATURE_JOURNALING) == 0) { + complete(0); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_mode_get_start(&op); + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_fetch_mirror_mode>(this); + m_outbl.clear(); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_outbl); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_fetch_mirror_mode(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if ((r < 0) && (r != -ENOENT)) { + lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + + m_r_saved = r; + remove_object_map(); + return; + } + + cls::rbd::MirrorMode mirror_mode_internal = cls::rbd::MIRROR_MODE_DISABLED; + if (r == 0) { + auto it = m_outbl.cbegin(); + r = cls_client::mirror_mode_get_finish(&it, &mirror_mode_internal); + if (r < 0) { + lderr(m_cct) << "Failed to retrieve mirror mode" << dendl; + + m_r_saved = r; + remove_object_map(); + return; + } + } + + // TODO: remove redundant code... 
+ switch (mirror_mode_internal) { + case cls::rbd::MIRROR_MODE_DISABLED: + case cls::rbd::MIRROR_MODE_IMAGE: + case cls::rbd::MIRROR_MODE_POOL: + m_mirror_mode = static_cast<rbd_mirror_mode_t>(mirror_mode_internal); + break; + default: + lderr(m_cct) << "Unknown mirror mode (" + << static_cast<uint32_t>(mirror_mode_internal) << ")" << dendl; + r = -EINVAL; + remove_object_map(); + return; + } + + journal_create(); +} + +template<typename I> +void CreateRequest<I>::journal_create() { + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_journal_create>( + this); + + librbd::journal::TagData tag_data; + tag_data.mirror_uuid = (m_force_non_primary ? m_primary_mirror_uuid : + librbd::Journal<I>::LOCAL_MIRROR_UUID); + + librbd::journal::CreateRequest<I> *req = + librbd::journal::CreateRequest<I>::create( + m_io_ctx, m_image_id, m_journal_order, m_journal_splay_width, + m_journal_pool, cls::journal::Tag::TAG_CLASS_NEW, tag_data, + librbd::Journal<I>::IMAGE_CLIENT_ID, m_op_work_queue, ctx); + req->send(); +} + +template<typename I> +void CreateRequest<I>::handle_journal_create(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error creating journal: " << cpp_strerror(r) + << dendl; + + m_r_saved = r; + remove_object_map(); + return; + } + + mirror_image_enable(); +} + +template<typename I> +void CreateRequest<I>::mirror_image_enable() { + if (((m_mirror_mode != RBD_MIRROR_MODE_POOL) && !m_force_non_primary) || + m_skip_mirror_enable) { + complete(0); + return; + } + + ldout(m_cct, 15) << dendl; + auto ctx = create_context_callback< + CreateRequest<I>, &CreateRequest<I>::handle_mirror_image_enable>(this); + auto req = mirror::EnableRequest<I>::create(m_io_ctx, m_image_id, + m_non_primary_global_image_id, + m_op_work_queue, ctx); + req->send(); +} + +template<typename I> +void CreateRequest<I>::handle_mirror_image_enable(int r) { + ldout(m_cct, 15) << "r=" << r << 
dendl; + + if (r < 0) { + lderr(m_cct) << "cannot enable mirroring: " << cpp_strerror(r) + << dendl; + + m_r_saved = r; + journal_remove(); + return; + } + + complete(0); +} + +template<typename I> +void CreateRequest<I>::complete(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_data_io_ctx.close(); + auto on_finish = m_on_finish; + delete this; + on_finish->complete(r); +} + +// cleanup +template<typename I> +void CreateRequest<I>::journal_remove() { + if ((m_features & RBD_FEATURE_JOURNALING) == 0) { + remove_object_map(); + return; + } + + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_journal_remove>( + this); + + librbd::journal::RemoveRequest<I> *req = + librbd::journal::RemoveRequest<I>::create( + m_io_ctx, m_image_id, librbd::Journal<I>::IMAGE_CLIENT_ID, m_op_work_queue, + ctx); + req->send(); +} + +template<typename I> +void CreateRequest<I>::handle_journal_remove(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up journal after creation failed: " + << cpp_strerror(r) << dendl; + } + + remove_object_map(); +} + +template<typename I> +void CreateRequest<I>::remove_object_map() { + if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) { + remove_header_object(); + return; + } + + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_remove_object_map>(this); + int r = m_io_ctx.aio_remove(m_objmap_name, comp); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_remove_object_map(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up object map after creation failed: " + << cpp_strerror(r) << dendl; + } + + remove_header_object(); +} + +template<typename I> +void CreateRequest<I>::remove_header_object() { + ldout(m_cct, 15) << dendl; + + using 
klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_remove_header_object>(this); + int r = m_io_ctx.aio_remove(m_header_obj, comp); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_remove_header_object(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up image header after creation failed: " + << cpp_strerror(r) << dendl; + } + + remove_id_object(); +} + +template<typename I> +void CreateRequest<I>::remove_id_object() { + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_remove_id_object>(this); + int r = m_io_ctx.aio_remove(m_id_obj, comp); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_remove_id_object(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up id object after creation failed: " + << cpp_strerror(r) << dendl; + } + + remove_from_dir(); +} + +template<typename I> +void CreateRequest<I>::remove_from_dir() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::dir_remove_image(&op, m_image_name, m_image_id); + + using klass = CreateRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_remove_from_dir>(this); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template<typename I> +void CreateRequest<I>::handle_remove_from_dir(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up image from rbd_directory object " + << "after creation failed: " << cpp_strerror(r) << dendl; + } + + complete(m_r_saved); +} + +} //namespace image +} //namespace librbd + +template class librbd::image::CreateRequest<librbd::ImageCtx>; diff --git 
a/src/librbd/image/CreateRequest.h b/src/librbd/image/CreateRequest.h new file mode 100644 index 00000000..6d5b641d --- /dev/null +++ b/src/librbd/image/CreateRequest.h @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ImageCtx.h" + +class ConfigProxy; +class Context; +class ContextWQ; + +using librados::IoCtx; + +namespace journal { class Journaler; } + +namespace librbd { +namespace image { + +template <typename ImageCtxT = ImageCtx> +class CreateRequest { +public: + static CreateRequest *create(const ConfigProxy& config, IoCtx &ioctx, + const std::string &image_name, + const std::string &image_id, uint64_t size, + const ImageOptions &image_options, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + bool skip_mirror_enable, + ContextWQ *op_work_queue, Context *on_finish) { + return new CreateRequest(config, ioctx, image_name, image_id, size, + image_options, non_primary_global_image_id, + primary_mirror_uuid, skip_mirror_enable, + op_work_queue, on_finish); + } + + static int validate_order(CephContext *cct, uint8_t order); + + void send(); + +private: + /** + * @verbatim + * + * <start> . . . . > . . . . . + * | . + * v . + * VALIDATE DATA POOL v (pool validation + * | . disabled) + * v . + * (error: bottom up) ADD IMAGE TO DIRECTORY < . . . . + * _______<_______ | + * | | v + * | | CREATE ID OBJECT + * | | / | + * | REMOVE FROM DIR <-------/ v + * | | NEGOTIATE FEATURES (when using default features) + * | | | + * | | v (stripingv2 disabled) + * | | CREATE IMAGE. . . . > . . . . + * v | / | . + * | REMOVE ID OBJ <---------/ v . + * | | SET STRIPE UNIT COUNT . + * | | / | \ . . 
. . . > . . . . + * | REMOVE HEADER OBJ<------/ v /. (object-map + * | |\ OBJECT MAP RESIZE . . < . . * v disabled) + * | | \ / | \ . . . . . > . . . . + * | | *<-----------/ v /. (journaling + * | | FETCH MIRROR MODE. . < . . * v disabled) + * | | / | . + * | REMOVE OBJECT MAP<--------/ v . + * | |\ JOURNAL CREATE . + * | | \ / | . + * v | *<------------/ v . + * | | MIRROR IMAGE ENABLE . + * | | / | . + * | JOURNAL REMOVE*<-------/ | . + * | v . + * |_____________>___________________<finish> . . . . < . . . . + * + * @endverbatim + */ + + CreateRequest(const ConfigProxy& config, IoCtx &ioctx, + const std::string &image_name, + const std::string &image_id, uint64_t size, + const ImageOptions &image_options, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + bool skip_mirror_enable, + ContextWQ *op_work_queue, Context *on_finish); + + const ConfigProxy& m_config; + IoCtx m_io_ctx; + IoCtx m_data_io_ctx; + std::string m_image_name; + std::string m_image_id; + uint64_t m_size; + uint8_t m_order = 0; + uint64_t m_features = 0; + uint64_t m_stripe_unit = 0; + uint64_t m_stripe_count = 0; + uint8_t m_journal_order = 0; + uint8_t m_journal_splay_width = 0; + std::string m_journal_pool; + std::string m_data_pool; + int64_t m_data_pool_id = -1; + const std::string m_non_primary_global_image_id; + const std::string m_primary_mirror_uuid; + bool m_skip_mirror_enable; + bool m_negotiate_features = false; + + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + int m_r_saved = 0; // used to return actual error after cleanup + bool m_force_non_primary; + file_layout_t m_layout; + std::string m_id_obj, m_header_obj, m_objmap_name; + + bufferlist m_outbl; + rbd_mirror_mode_t m_mirror_mode = RBD_MIRROR_MODE_DISABLED; + cls::rbd::MirrorImage m_mirror_image_internal; + + void validate_data_pool(); + void handle_validate_data_pool(int r); + + void add_image_to_directory(); + void handle_add_image_to_directory(int r); 
+ + void create_id_object(); + void handle_create_id_object(int r); + + void negotiate_features(); + void handle_negotiate_features(int r); + + void create_image(); + void handle_create_image(int r); + + void set_stripe_unit_count(); + void handle_set_stripe_unit_count(int r); + + void object_map_resize(); + void handle_object_map_resize(int r); + + void fetch_mirror_mode(); + void handle_fetch_mirror_mode(int r); + + void journal_create(); + void handle_journal_create(int r); + + void mirror_image_enable(); + void handle_mirror_image_enable(int r); + + void complete(int r); + + // cleanup + void journal_remove(); + void handle_journal_remove(int r); + + void remove_object_map(); + void handle_remove_object_map(int r); + + void remove_header_object(); + void handle_remove_header_object(int r); + + void remove_id_object(); + void handle_remove_id_object(int r); + + void remove_from_dir(); + void handle_remove_from_dir(int r); + +}; + +} //namespace image +} //namespace librbd + +extern template class librbd::image::CreateRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H diff --git a/src/librbd/image/DetachChildRequest.cc b/src/librbd/image/DetachChildRequest.cc new file mode 100644 index 00000000..86cd4a88 --- /dev/null +++ b/src/librbd/image/DetachChildRequest.cc @@ -0,0 +1,303 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/DetachChildRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/journal/DisabledPolicy.h" +#include <string> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::DetachChildRequest: " << this \ + << " " << __func__ << ": " + +namespace 
librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +DetachChildRequest<I>::~DetachChildRequest() { + ceph_assert(m_parent_image_ctx == nullptr); +} + +template <typename I> +void DetachChildRequest<I>::send() { + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + RWLock::RLocker parent_locker(m_image_ctx.parent_lock); + + // use oldest snapshot or HEAD for parent spec + if (!m_image_ctx.snap_info.empty()) { + m_parent_spec = m_image_ctx.snap_info.begin()->second.parent.spec; + } else { + m_parent_spec = m_image_ctx.parent_md.spec; + } + } + + if (m_parent_spec.pool_id == -1) { + // ignore potential race with parent disappearing + m_image_ctx.op_work_queue->queue(create_context_callback< + DetachChildRequest<I>, + &DetachChildRequest<I>::finish>(this), 0); + return; + } else if (!m_image_ctx.test_op_features(RBD_OPERATION_FEATURE_CLONE_CHILD)) { + clone_v1_remove_child(); + return; + } + + clone_v2_child_detach(); +} + +template <typename I> +void DetachChildRequest<I>::clone_v2_child_detach() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + cls_client::child_detach(&op, m_parent_spec.snap_id, + {m_image_ctx.md_ctx.get_id(), + m_image_ctx.md_ctx.get_namespace(), + m_image_ctx.id}); + + int r = util::create_ioctx(m_image_ctx.md_ctx, "parent image", + m_parent_spec.pool_id, + m_parent_spec.pool_namespace, &m_parent_io_ctx); + if (r < 0) { + finish(r); + return; + } + + m_parent_header_name = util::header_name(m_parent_spec.image_id); + + auto aio_comp = create_rados_callback< + DetachChildRequest<I>, + &DetachChildRequest<I>::handle_clone_v2_child_detach>(this); + r = m_parent_io_ctx.aio_operate(m_parent_header_name, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void DetachChildRequest<I>::handle_clone_v2_child_detach(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << 
dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error detaching child from parent: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + clone_v2_get_snapshot(); +} + +template <typename I> +void DetachChildRequest<I>::clone_v2_get_snapshot() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + cls_client::snapshot_get_start(&op, m_parent_spec.snap_id); + + m_out_bl.clear(); + auto aio_comp = create_rados_callback< + DetachChildRequest<I>, + &DetachChildRequest<I>::handle_clone_v2_get_snapshot>(this); + int r = m_parent_io_ctx.aio_operate(m_parent_header_name, aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void DetachChildRequest<I>::handle_clone_v2_get_snapshot(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + bool remove_snapshot = false; + if (r == 0) { + cls::rbd::SnapshotInfo snap_info; + auto it = m_out_bl.cbegin(); + r = cls_client::snapshot_get_finish(&it, &snap_info); + if (r == 0) { + m_parent_snap_namespace = snap_info.snapshot_namespace; + m_parent_snap_name = snap_info.name; + + if (cls::rbd::get_snap_namespace_type(m_parent_snap_namespace) == + cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH && + snap_info.child_count == 0) { + // snapshot is in trash w/ zero children, so remove it + remove_snapshot = true; + } + } + } + + if (r < 0) { + ldout(cct, 5) << "failed to retrieve snapshot: " << cpp_strerror(r) + << dendl; + } + + if (!remove_snapshot) { + finish(0); + return; + } + + clone_v2_open_parent(); +} + +template<typename I> +void DetachChildRequest<I>::clone_v2_open_parent() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + m_parent_image_ctx = I::create("", m_parent_spec.image_id, nullptr, + m_parent_io_ctx, false); + + auto ctx = create_context_callback< + DetachChildRequest<I>, + &DetachChildRequest<I>::handle_clone_v2_open_parent>(this); + 
m_parent_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx); +} + +template<typename I> +void DetachChildRequest<I>::handle_clone_v2_open_parent(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + ldout(cct, 5) << "failed to open parent for read/write: " + << cpp_strerror(r) << dendl; + m_parent_image_ctx->destroy(); + m_parent_image_ctx = nullptr; + finish(0); + return; + } + + // do not attempt to open the parent journal when removing the trash + // snapshot, because the parent may be not promoted + if (m_parent_image_ctx->test_features(RBD_FEATURE_JOURNALING)) { + RWLock::WLocker snap_locker(m_parent_image_ctx->snap_lock); + m_parent_image_ctx->set_journal_policy(new journal::DisabledPolicy()); + } + + // disallow any proxied maintenance operations + { + RWLock::RLocker owner_lock(m_parent_image_ctx->owner_lock); + if (m_parent_image_ctx->exclusive_lock != nullptr) { + m_parent_image_ctx->exclusive_lock->block_requests(0); + } + } + + clone_v2_remove_snapshot(); +} + +template<typename I> +void DetachChildRequest<I>::clone_v2_remove_snapshot() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + DetachChildRequest<I>, + &DetachChildRequest<I>::handle_clone_v2_remove_snapshot>(this); + m_parent_image_ctx->operations->snap_remove(m_parent_snap_namespace, + m_parent_snap_name, ctx); +} + +template<typename I> +void DetachChildRequest<I>::handle_clone_v2_remove_snapshot(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + ldout(cct, 5) << "failed to remove trashed clone snapshot: " + << cpp_strerror(r) << dendl; + } + + clone_v2_close_parent(); +} + +template<typename I> +void DetachChildRequest<I>::clone_v2_close_parent() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + DetachChildRequest<I>, + &DetachChildRequest<I>::handle_clone_v2_close_parent>(this); + 
m_parent_image_ctx->state->close(ctx); +} + +template<typename I> +void DetachChildRequest<I>::handle_clone_v2_close_parent(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + ldout(cct, 5) << "failed to close parent image: " << cpp_strerror(r) + << dendl; + } + + m_parent_image_ctx->destroy(); + m_parent_image_ctx = nullptr; + finish(0); +} + +template<typename I> +void DetachChildRequest<I>::clone_v1_remove_child() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + m_parent_spec.pool_namespace = ""; + + librados::ObjectWriteOperation op; + librbd::cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id); + + auto aio_comp = create_rados_callback< + DetachChildRequest<I>, + &DetachChildRequest<I>::handle_clone_v1_remove_child>(this); + int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template<typename I> +void DetachChildRequest<I>::handle_clone_v1_remove_child(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -ENOENT) { + r = 0; + } else if (r < 0) { + lderr(cct) << "failed to remove child from children list: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void DetachChildRequest<I>::finish(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::DetachChildRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/DetachChildRequest.h b/src/librbd/image/DetachChildRequest.h new file mode 100644 index 00000000..bb0c81d2 --- /dev/null +++ b/src/librbd/image/DetachChildRequest.h @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H +#define CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H + 
+#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "librbd/Types.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template <typename ImageCtxT = ImageCtx> +class DetachChildRequest { +public: + static DetachChildRequest* create(ImageCtxT& image_ctx, Context* on_finish) { + return new DetachChildRequest(image_ctx, on_finish); + } + + DetachChildRequest(ImageCtxT& image_ctx, Context* on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { + } + ~DetachChildRequest(); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * (v1) | (v2) + * /--------------/ \--------------\ + * | | + * v v + * REMOVE_CHILD CHILD_DETACH + * | | + * | v + * | GET_SNAPSHOT + * | (snapshot in-use) . | + * |/. . . . . . . . . . . . . . . | + * | v + * | OPEN_PARENT + * | | + * | v + * | REMOVE_SNAPSHOT + * | | + * | v + * | CLOSE_PARENT + * | | + * |/------------------------------/ + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT& m_image_ctx; + Context* m_on_finish; + + librados::IoCtx m_parent_io_ctx; + cls::rbd::ParentImageSpec m_parent_spec; + std::string m_parent_header_name; + + cls::rbd::SnapshotNamespace m_parent_snap_namespace; + std::string m_parent_snap_name; + + ImageCtxT* m_parent_image_ctx = nullptr; + + ceph::bufferlist m_out_bl; + + void clone_v2_child_detach(); + void handle_clone_v2_child_detach(int r); + + void clone_v2_get_snapshot(); + void handle_clone_v2_get_snapshot(int r); + + void clone_v2_open_parent(); + void handle_clone_v2_open_parent(int r); + + void clone_v2_remove_snapshot(); + void handle_clone_v2_remove_snapshot(int r); + + void clone_v2_close_parent(); + void handle_clone_v2_close_parent(int r); + + void clone_v1_remove_child(); + void handle_clone_v1_remove_child(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class 
librbd::image::DetachChildRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H diff --git a/src/librbd/image/DetachParentRequest.cc b/src/librbd/image/DetachParentRequest.cc new file mode 100644 index 00000000..40ff44b7 --- /dev/null +++ b/src/librbd/image/DetachParentRequest.cc @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/DetachParentRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::DetachParentRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +void DetachParentRequest<I>::send() { + detach_parent(); +} + +template <typename I> +void DetachParentRequest<I>::detach_parent() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + if (!m_legacy_parent) { + librbd::cls_client::parent_detach(&op); + } else { + librbd::cls_client::remove_parent(&op); + } + + auto aio_comp = create_rados_callback< + DetachParentRequest<I>, + &DetachParentRequest<I>::handle_detach_parent>(this); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void DetachParentRequest<I>::handle_detach_parent(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + if (!m_legacy_parent && r == -EOPNOTSUPP) { + ldout(cct, 10) << "retrying using legacy parent method" << dendl; + m_legacy_parent = true; + detach_parent(); + return; + } + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "detach parent encountered an error: " << cpp_strerror(r) + << 
dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void DetachParentRequest<I>::finish(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::DetachParentRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/DetachParentRequest.h b/src/librbd/image/DetachParentRequest.h new file mode 100644 index 00000000..17c86aaa --- /dev/null +++ b/src/librbd/image/DetachParentRequest.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H +#define CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "librbd/Types.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template <typename ImageCtxT = ImageCtx> +class DetachParentRequest { +public: + static DetachParentRequest* create(ImageCtxT& image_ctx, Context* on_finish) { + return new DetachParentRequest(image_ctx, on_finish); + } + + DetachParentRequest(ImageCtxT& image_ctx, Context* on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | * * * * * * + * | * * -EOPNOTSUPP + * v v * + * DETACH_PARENT * * * + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT& m_image_ctx; + Context* m_on_finish; + + bool m_legacy_parent = false; + + void detach_parent(); + void handle_detach_parent(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::DetachParentRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H diff --git a/src/librbd/image/ListWatchersRequest.cc b/src/librbd/image/ListWatchersRequest.cc new file mode 100644 
index 00000000..e93fffc1 --- /dev/null +++ b/src/librbd/image/ListWatchersRequest.cc @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ListWatchersRequest.h" +#include "common/RWLock.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Utils.h" + +#include <algorithm> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::ListWatchersRequest: " << this \ + << " " << __func__ << ": " + +static std::ostream& operator<<(std::ostream& os, const obj_watch_t& watch) { + os << "{addr=" << watch.addr << ", " + << "watcher_id=" << watch.watcher_id << ", " + << "cookie=" << watch.cookie << "}"; + return os; +} + +namespace librbd { +namespace image { + +using librados::IoCtx; +using util::create_rados_callback; + +template<typename I> +ListWatchersRequest<I>::ListWatchersRequest(I &image_ctx, int flags, + std::list<obj_watch_t> *watchers, + Context *on_finish) + : m_image_ctx(image_ctx), m_flags(flags), m_watchers(watchers), + m_on_finish(on_finish), m_cct(m_image_ctx.cct) { +} + +template<typename I> +void ListWatchersRequest<I>::send() { + ldout(m_cct, 20) << dendl; + + list_image_watchers(); +} + +template<typename I> +void ListWatchersRequest<I>::list_image_watchers() { + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + op.list_watchers(&m_object_watchers, &m_ret_val); + + using klass = ListWatchersRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_list_image_watchers>(this); + + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, + rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void ListWatchersRequest<I>::handle_list_image_watchers(int r) { + ldout(m_cct, 20) << "r=" << 
r << dendl; + + if (r == 0 && m_ret_val < 0) { + r = m_ret_val; + } + if (r < 0) { + lderr(m_cct) << "error listing image watchers: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + ldout(m_cct, 20) << "object_watchers=" << m_object_watchers << dendl; + list_mirror_watchers(); +} + +template<typename I> +void ListWatchersRequest<I>::list_mirror_watchers() { + if ((m_object_watchers.empty()) || + (m_flags & LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES) == 0 || + (m_image_ctx.features & RBD_FEATURE_JOURNALING) == 0) { + finish(0); + return; + } + + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + op.list_watchers(&m_mirror_watchers, &m_ret_val); + + using klass = ListWatchersRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_list_mirror_watchers>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, rados_completion, + &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void ListWatchersRequest<I>::handle_list_mirror_watchers(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == 0 && m_ret_val < 0) { + r = m_ret_val; + } + if (r < 0 && r != -ENOENT) { + ldout(m_cct, 1) << "error listing mirror watchers: " << cpp_strerror(r) + << dendl; + } + + ldout(m_cct, 20) << "mirror_watchers=" << m_mirror_watchers << dendl; + finish(0); +} + +template<typename I> +void ListWatchersRequest<I>::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == 0) { + m_watchers->clear(); + + if (m_object_watchers.size() > 0) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + uint64_t watch_handle = m_image_ctx.image_watcher != nullptr ? 
+ m_image_ctx.image_watcher->get_watch_handle() : 0; + + for (auto &w : m_object_watchers) { + if ((m_flags & LIST_WATCHERS_FILTER_OUT_MY_INSTANCE) != 0) { + if (w.cookie == watch_handle) { + ldout(m_cct, 20) << "filtering out my instance: " << w << dendl; + continue; + } + } + if ((m_flags & LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES) != 0) { + auto it = std::find_if(m_mirror_watchers.begin(), + m_mirror_watchers.end(), + [w] (obj_watch_t &watcher) { + return (strncmp(w.addr, watcher.addr, + sizeof(w.addr)) == 0); + }); + if (it != m_mirror_watchers.end()) { + ldout(m_cct, 20) << "filtering out mirror instance: " << w << dendl; + continue; + } + } + m_watchers->push_back(w); + } + } + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::ListWatchersRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/ListWatchersRequest.h b/src/librbd/image/ListWatchersRequest.h new file mode 100644 index 00000000..941bf728 --- /dev/null +++ b/src/librbd/image/ListWatchersRequest.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H +#define CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H + +#include "include/rados/rados_types.hpp" + +#include <list> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +enum { + LIST_WATCHERS_FILTER_OUT_MY_INSTANCE = 1 << 0, + LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES = 1 << 1, +}; + +template<typename ImageCtxT = ImageCtx> +class ListWatchersRequest { +public: + static ListWatchersRequest *create(ImageCtxT &image_ctx, int flags, + std::list<obj_watch_t> *watchers, + Context *on_finish) { + return new ListWatchersRequest(image_ctx, flags, watchers, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * LIST_IMAGE_WATCHERS + * | + * v + * LIST_MIRROR_WATCHERS (skip if not needed) + * | + * 
v + * <finish> + * + * @endverbatim + */ + + ListWatchersRequest(ImageCtxT &image_ctx, int flags, std::list<obj_watch_t> *watchers, + Context *on_finish); + + ImageCtxT& m_image_ctx; + int m_flags; + std::list<obj_watch_t> *m_watchers; + Context *m_on_finish; + + CephContext *m_cct; + int m_ret_val; + bufferlist m_out_bl; + std::list<obj_watch_t> m_object_watchers; + std::list<obj_watch_t> m_mirror_watchers; + + void list_image_watchers(); + void handle_list_image_watchers(int r); + + void list_mirror_watchers(); + void handle_list_mirror_watchers(int r); + + void finish(int r); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::ListWatchersRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H diff --git a/src/librbd/image/OpenRequest.cc b/src/librbd/image/OpenRequest.cc new file mode 100644 index 00000000..6f5f3083 --- /dev/null +++ b/src/librbd/image/OpenRequest.cc @@ -0,0 +1,662 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/OpenRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ConfigWatcher.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/cache/ObjectCacherObjectDispatch.h" +#include "librbd/image/CloseRequest.h" +#include "librbd/image/RefreshRequest.h" +#include "librbd/image/SetSnapRequest.h" +#include <boost/algorithm/string/predicate.hpp> +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::OpenRequest: " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +OpenRequest<I>::OpenRequest(I *image_ctx, uint64_t flags, + Context *on_finish) + : m_image_ctx(image_ctx), + m_skip_open_parent_image(flags & OPEN_FLAG_SKIP_OPEN_PARENT), + 
m_on_finish(on_finish), m_error_result(0) { + if ((flags & OPEN_FLAG_OLD_FORMAT) != 0) { + m_image_ctx->old_format = true; + } + if ((flags & OPEN_FLAG_IGNORE_MIGRATING) != 0) { + m_image_ctx->ignore_migrating = true; + } +} + +template <typename I> +void OpenRequest<I>::send() { + if (m_image_ctx->old_format) { + send_v1_detect_header(); + } else { + send_v2_detect_header(); + } +} + +template <typename I> +void OpenRequest<I>::send_v1_detect_header() { + librados::ObjectReadOperation op; + op.stat(NULL, NULL, NULL); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_v1_detect_header>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(util::old_header_name(m_image_ctx->name), + comp, &op, &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v1_detect_header(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + if (*result != -ENOENT) { + lderr(cct) << "failed to stat image header: " << cpp_strerror(*result) + << dendl; + } + send_close_image(*result); + } else { + ldout(cct, 1) << "RBD image format 1 is deprecated. " + << "Please copy this image to image format 2." 
<< dendl; + + m_image_ctx->old_format = true; + m_image_ctx->header_oid = util::old_header_name(m_image_ctx->name); + m_image_ctx->apply_metadata({}, true); + + send_refresh(); + } + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_detect_header() { + if (m_image_ctx->id.empty()) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + op.stat(NULL, NULL, NULL); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_v2_detect_header>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name), + comp, &op, &m_out_bl); + comp->release(); + } else { + send_v2_get_name(); + } +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_detect_header(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result == -ENOENT) { + send_v1_detect_header(); + } else if (*result < 0) { + lderr(cct) << "failed to stat v2 image header: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + } else { + m_image_ctx->old_format = false; + send_v2_get_id(); + } + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_get_id() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_id_start(&op); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_v2_get_id>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name), + comp, &op, &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_get_id(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = 
m_out_bl.cbegin(); + *result = cls_client::get_id_finish(&it, &m_image_ctx->id); + } + if (*result < 0) { + lderr(cct) << "failed to retrieve image id: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + } else { + send_v2_get_initial_metadata(); + } + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_get_name() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::dir_get_name_start(&op, m_image_ctx->id); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_name>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_get_name(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::dir_get_name_finish(&it, &m_image_ctx->name); + } + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to retrieve name: " + << cpp_strerror(*result) << dendl; + send_close_image(*result); + } else if (*result == -ENOENT) { + // image does not exist in directory, look in the trash bin + ldout(cct, 10) << "image id " << m_image_ctx->id << " does not exist in " + << "rbd directory, searching in rbd trash..." 
<< dendl; + send_v2_get_name_from_trash(); + } else { + send_v2_get_initial_metadata(); + } + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_get_name_from_trash() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::trash_get_start(&op, m_image_ctx->id); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_name_from_trash>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(RBD_TRASH, comp, &op, &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_get_name_from_trash(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + cls::rbd::TrashImageSpec trash_spec; + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::trash_get_finish(&it, &trash_spec); + m_image_ctx->name = trash_spec.name; + } + if (*result < 0) { + if (*result == -EOPNOTSUPP) { + *result = -ENOENT; + } + if (*result == -ENOENT) { + ldout(cct, 5) << "failed to retrieve name for image id " + << m_image_ctx->id << dendl; + } else { + lderr(cct) << "failed to retrieve name from trash: " + << cpp_strerror(*result) << dendl; + } + send_close_image(*result); + } else { + send_v2_get_initial_metadata(); + } + + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_get_initial_metadata() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->old_format = false; + m_image_ctx->header_oid = util::header_name(m_image_ctx->id); + + librados::ObjectReadOperation op; + cls_client::get_size_start(&op, CEPH_NOSNAP); + cls_client::get_object_prefix_start(&op); + cls_client::get_features_start(&op, true); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, 
&klass::handle_v2_get_initial_metadata>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_get_initial_metadata(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + auto it = m_out_bl.cbegin(); + if (*result >= 0) { + uint64_t size; + *result = cls_client::get_size_finish(&it, &size, &m_image_ctx->order); + } + + if (*result >= 0) { + *result = cls_client::get_object_prefix_finish(&it, + &m_image_ctx->object_prefix); + } + + if (*result >= 0) { + uint64_t incompatible_features; + *result = cls_client::get_features_finish(&it, &m_image_ctx->features, + &incompatible_features); + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve initial metadata: " + << cpp_strerror(*result) << dendl; + send_close_image(*result); + return nullptr; + } + + if (m_image_ctx->test_features(RBD_FEATURE_STRIPINGV2)) { + send_v2_get_stripe_unit_count(); + } else { + send_v2_get_create_timestamp(); + } + + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_get_stripe_unit_count() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_stripe_unit_count_start(&op); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_stripe_unit_count>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_get_stripe_unit_count(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_stripe_unit_count_finish( + &it, 
&m_image_ctx->stripe_unit, &m_image_ctx->stripe_count); + } + + if (*result == -ENOEXEC || *result == -EINVAL) { + *result = 0; + } + + if (*result < 0) { + lderr(cct) << "failed to read striping metadata: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + send_v2_get_create_timestamp(); + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_get_create_timestamp() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_create_timestamp_start(&op); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_create_timestamp>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_get_create_timestamp(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_create_timestamp_finish(&it, + &m_image_ctx->create_timestamp); + } + if (*result < 0 && *result != -EOPNOTSUPP) { + lderr(cct) << "failed to retrieve create_timestamp: " + << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + send_v2_get_access_modify_timestamp(); + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_get_access_modify_timestamp() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_access_timestamp_start(&op); + cls_client::get_modify_timestamp_start(&op); + //TODO: merge w/ create timestamp query after luminous EOLed + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, 
&klass::handle_v2_get_access_modify_timestamp>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_get_access_modify_timestamp(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_access_timestamp_finish(&it, + &m_image_ctx->access_timestamp); + if (*result == 0) + *result = cls_client::get_modify_timestamp_finish(&it, + &m_image_ctx->modify_timestamp); + } + if (*result < 0 && *result != -EOPNOTSUPP) { + lderr(cct) << "failed to retrieve access/modify_timestamp: " + << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + send_v2_get_data_pool(); + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_v2_get_data_pool() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_data_pool_start(&op); + + using klass = OpenRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_data_pool>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template <typename I> +Context *OpenRequest<I>::handle_v2_get_data_pool(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + int64_t data_pool_id = -1; + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_data_pool_finish(&it, &data_pool_id); + } else if (*result == -EOPNOTSUPP) { + *result = 0; + } + + if (*result < 0) { + lderr(cct) << "failed to read data pool: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + if 
(data_pool_id != -1) { + *result = util::create_ioctx(m_image_ctx->md_ctx, "data pool", data_pool_id, + {}, &m_image_ctx->data_ctx); + if (*result < 0) { + if (*result != -ENOENT) { + send_close_image(*result); + return nullptr; + } + m_image_ctx->data_ctx.close(); + } else { + m_image_ctx->data_ctx.set_namespace(m_image_ctx->md_ctx.get_namespace()); + } + } else { + data_pool_id = m_image_ctx->md_ctx.get_id(); + } + + m_image_ctx->init_layout(data_pool_id); + send_refresh(); + return nullptr; +} + +template <typename I> +void OpenRequest<I>::send_refresh() { + m_image_ctx->init(); + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->config_watcher = ConfigWatcher<I>::create(*m_image_ctx); + m_image_ctx->config_watcher->init(); + + using klass = OpenRequest<I>; + RefreshRequest<I> *req = RefreshRequest<I>::create( + *m_image_ctx, false, m_skip_open_parent_image, + create_context_callback<klass, &klass::handle_refresh>(this)); + req->send(); +} + +template <typename I> +Context *OpenRequest<I>::handle_refresh(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to refresh image: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + return send_init_cache(result); +} + +template <typename I> +Context *OpenRequest<I>::send_init_cache(int *result) { + // cache is disabled or parent image context + if (!m_image_ctx->cache || m_image_ctx->child != nullptr || + !m_image_ctx->data_ctx.is_valid()) { + return send_register_watch(result); + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + auto cache = cache::ObjectCacherObjectDispatch<I>::create(m_image_ctx); + cache->init(); + + // readahead requires the cache + m_image_ctx->readahead.set_trigger_requests( + m_image_ctx->config.template 
get_val<uint64_t>("rbd_readahead_trigger_requests")); + m_image_ctx->readahead.set_max_readahead_size( + m_image_ctx->config.template get_val<Option::size_t>("rbd_readahead_max_bytes")); + + return send_register_watch(result); +} + +template <typename I> +Context *OpenRequest<I>::send_register_watch(int *result) { + if (m_image_ctx->read_only) { + return send_set_snap(result); + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = OpenRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_register_watch>(this); + m_image_ctx->register_watch(ctx); + return nullptr; +} + +template <typename I> +Context *OpenRequest<I>::handle_register_watch(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -EPERM) { + ldout(cct, 5) << "user does not have write permission" << dendl; + send_close_image(*result); + return nullptr; + } else if (*result < 0) { + lderr(cct) << "failed to register watch: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + return send_set_snap(result); +} + +template <typename I> +Context *OpenRequest<I>::send_set_snap(int *result) { + if (m_image_ctx->snap_name.empty() && + m_image_ctx->open_snap_id == CEPH_NOSNAP) { + *result = 0; + return m_on_finish; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + uint64_t snap_id = CEPH_NOSNAP; + std::swap(m_image_ctx->open_snap_id, snap_id); + if (snap_id == CEPH_NOSNAP) { + RWLock::RLocker snap_locker(m_image_ctx->snap_lock); + snap_id = m_image_ctx->get_snap_id(m_image_ctx->snap_namespace, + m_image_ctx->snap_name); + } + if (snap_id == CEPH_NOSNAP) { + lderr(cct) << "failed to find snapshot " << m_image_ctx->snap_name << dendl; + send_close_image(-ENOENT); + return nullptr; + } + + using klass = OpenRequest<I>; + SetSnapRequest<I> *req = 
SetSnapRequest<I>::create( + *m_image_ctx, snap_id, + create_context_callback<klass, &klass::handle_set_snap>(this)); + req->send(); + return nullptr; +} + +template <typename I> +Context *OpenRequest<I>::handle_set_snap(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to set image snapshot: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + return m_on_finish; +} + +template <typename I> +void OpenRequest<I>::send_close_image(int error_result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_error_result = error_result; + + using klass = OpenRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_close_image>( + this); + CloseRequest<I> *req = CloseRequest<I>::create(m_image_ctx, ctx); + req->send(); +} + +template <typename I> +Context *OpenRequest<I>::handle_close_image(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to close image: " << cpp_strerror(*result) << dendl; + } + if (m_error_result < 0) { + *result = m_error_result; + } + return m_on_finish; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::OpenRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/OpenRequest.h b/src/librbd/image/OpenRequest.h new file mode 100644 index 00000000..3476832a --- /dev/null +++ b/src/librbd/image/OpenRequest.h @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H +#define CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H + +#include "include/buffer.h" +#include <map> +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template <typename ImageCtxT = ImageCtx> +class 
OpenRequest { +public: + static OpenRequest *create(ImageCtxT *image_ctx, uint64_t flags, + Context *on_finish) { + return new OpenRequest(image_ctx, flags, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * | (v1) + * |-----> V1_DETECT_HEADER + * | | + * | \-------------------------------\ + * | (v2) | + * \-----> V2_DETECT_HEADER | + * | | + * v | + * V2_GET_ID|NAME | + * | | + * v (skip if have name) | + * V2_GET_NAME_FROM_TRASH | + * | | + * v | + * V2_GET_INITIAL_METADATA | + * | | + * v | + * V2_GET_STRIPE_UNIT_COUNT (skip if | + * | disabled) | + * v | + * V2_GET_CREATE_TIMESTAMP | + * | | + * v | + * V2_GET_ACCESS_MODIFIY_TIMESTAMP | + * | | + * v | + * V2_GET_DATA_POOL --------------> REFRESH + * | + * v + * INIT_CACHE + * | + * v + * REGISTER_WATCH (skip if + * | read-only) + * v + * SET_SNAP (skip if no snap) + * | + * v + * <finish> + * ^ + * (on error) | + * * * * * * * > CLOSE ------------------------/ + * + * @endverbatim + */ + + OpenRequest(ImageCtxT *image_ctx, uint64_t flags, Context *on_finish); + + ImageCtxT *m_image_ctx; + bool m_skip_open_parent_image; + Context *m_on_finish; + + bufferlist m_out_bl; + int m_error_result; + + void send_v1_detect_header(); + Context *handle_v1_detect_header(int *result); + + void send_v2_detect_header(); + Context *handle_v2_detect_header(int *result); + + void send_v2_get_id(); + Context *handle_v2_get_id(int *result); + + void send_v2_get_name(); + Context *handle_v2_get_name(int *result); + + void send_v2_get_name_from_trash(); + Context *handle_v2_get_name_from_trash(int *result); + + void send_v2_get_initial_metadata(); + Context *handle_v2_get_initial_metadata(int *result); + + void send_v2_get_stripe_unit_count(); + Context *handle_v2_get_stripe_unit_count(int *result); + + void send_v2_get_create_timestamp(); + Context *handle_v2_get_create_timestamp(int *result); + + void send_v2_get_access_modify_timestamp(); + Context *handle_v2_get_access_modify_timestamp(int 
*result); + + void send_v2_get_data_pool(); + Context *handle_v2_get_data_pool(int *result); + + void send_refresh(); + Context *handle_refresh(int *result); + + Context *send_init_cache(int *result); + + Context *send_register_watch(int *result); + Context *handle_register_watch(int *result); + + Context *send_set_snap(int *result); + Context *handle_set_snap(int *result); + + void send_close_image(int error_result); + Context *handle_close_image(int *result); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::OpenRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H diff --git a/src/librbd/image/PreRemoveRequest.cc b/src/librbd/image/PreRemoveRequest.cc new file mode 100644 index 00000000..3326c143 --- /dev/null +++ b/src/librbd/image/PreRemoveRequest.cc @@ -0,0 +1,321 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/PreRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/Utils.h" +#include "librbd/image/ListWatchersRequest.h" +#include "librbd/journal/DisabledPolicy.h" +#include "librbd/operation/SnapshotRemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::PreRemoveRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace image { + +namespace { + +bool auto_delete_snapshot(const SnapInfo& snap_info) { + auto snap_namespace_type = cls::rbd::get_snap_namespace_type( + snap_info.snap_namespace); + switch (snap_namespace_type) { + case cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH: + return true; + default: + return false; + } +} + +} // anonymous namespace + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +void PreRemoveRequest<I>::send() { + auto cct = m_image_ctx->cct; + if 
(m_image_ctx->operations_disabled) { + lderr(cct) << "image operations disabled due to unsupported op features" + << dendl; + finish(-EROFS); + return; + } + + acquire_exclusive_lock(); +} + +template <typename I> +void PreRemoveRequest<I>::acquire_exclusive_lock() { + RWLock::RLocker owner_lock(m_image_ctx->owner_lock); + if (m_image_ctx->exclusive_lock == nullptr) { + validate_image_removal(); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + // do not attempt to open the journal when removing the image in case + // it's corrupt + if (m_image_ctx->test_features(RBD_FEATURE_JOURNALING)) { + RWLock::WLocker snap_locker(m_image_ctx->snap_lock); + m_image_ctx->set_journal_policy(new journal::DisabledPolicy()); + } + + auto ctx = create_context_callback< + PreRemoveRequest<I>, &PreRemoveRequest<I>::handle_exclusive_lock>(this); + + m_image_ctx->exclusive_lock->try_acquire_lock(ctx); +} + +template <typename I> +void PreRemoveRequest<I>::handle_exclusive_lock(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 || !m_image_ctx->exclusive_lock->is_lock_owner()) { + if (!m_force) { + lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl; + finish(-EBUSY); + } else { + ldout(cct, 5) << "cannot obtain exclusive lock - " + << "proceeding due to force flag set" << dendl; + shut_down_exclusive_lock(); + } + return; + } + + validate_image_removal(); +} + +template <typename I> +void PreRemoveRequest<I>::shut_down_exclusive_lock() { + RWLock::RLocker owner_lock(m_image_ctx->owner_lock); + if (m_image_ctx->exclusive_lock == nullptr) { + validate_image_removal(); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + PreRemoveRequest<I>, + &PreRemoveRequest<I>::handle_shut_down_exclusive_lock>(this); + + m_exclusive_lock = m_image_ctx->exclusive_lock; + m_exclusive_lock->shut_down(ctx); +} + +template <typename I> +void 
PreRemoveRequest<I>::handle_shut_down_exclusive_lock(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + delete m_exclusive_lock; + m_exclusive_lock = nullptr; + + if (r < 0) { + lderr(cct) << "error shutting down exclusive lock: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + ceph_assert(m_image_ctx->exclusive_lock == nullptr); + validate_image_removal(); +} + +template <typename I> +void PreRemoveRequest<I>::validate_image_removal() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + if (!m_image_ctx->ignore_migrating && + m_image_ctx->test_features(RBD_FEATURE_MIGRATING)) { + lderr(cct) << "image in migration state - not removing" << dendl; + finish(-EBUSY); + return; + } + + check_image_snaps(); +} + +template <typename I> +void PreRemoveRequest<I>::check_image_snaps() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + m_image_ctx->snap_lock.get_read(); + for (auto& snap_info : m_image_ctx->snap_info) { + if (auto_delete_snapshot(snap_info.second)) { + m_snap_infos.insert(snap_info); + } else { + m_image_ctx->snap_lock.put_read(); + + lderr(cct) << "image has snapshots - not removing" << dendl; + finish(-ENOTEMPTY); + return; + } + } + m_image_ctx->snap_lock.put_read(); + + list_image_watchers(); +} + +template <typename I> +void PreRemoveRequest<I>::list_image_watchers() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + int flags = LIST_WATCHERS_FILTER_OUT_MY_INSTANCE | + LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES; + auto ctx = create_context_callback< + PreRemoveRequest<I>, + &PreRemoveRequest<I>::handle_list_image_watchers>(this); + auto req = ListWatchersRequest<I>::create(*m_image_ctx, flags, &m_watchers, + ctx); + req->send(); +} + +template <typename I> +void PreRemoveRequest<I>::handle_list_image_watchers(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "error listing image watchers: " << cpp_strerror(r) << 
dendl; + finish(r); + return; + } + + check_image_watchers(); +} + +template <typename I> +void PreRemoveRequest<I>::check_image_watchers() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + if (!m_watchers.empty()) { + lderr(cct) << "image has watchers - not removing" << dendl; + finish(-EBUSY); + return; + } + + check_group(); +} + +template <typename I> +void PreRemoveRequest<I>::check_group() { + if (m_image_ctx->old_format) { + finish(0); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::image_group_get_start(&op); + + auto rados_completion = create_rados_callback< + PreRemoveRequest<I>, &PreRemoveRequest<I>::handle_check_group>(this); + m_out_bl.clear(); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, + rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void PreRemoveRequest<I>::handle_check_group(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + cls::rbd::GroupSpec s; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = librbd::cls_client::image_group_get_finish(&it, &s); + } + if (r < 0 && r != -EOPNOTSUPP) { + lderr(cct) << "error fetching group for image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (s.is_valid()) { + lderr(cct) << "image is in a group - not removing" << dendl; + finish(-EMLINK); + return; + } + + remove_snapshot(); +} + +template <typename I> +void PreRemoveRequest<I>::remove_snapshot() { + if (m_snap_infos.empty()) { + finish(0); + return; + } + + auto cct = m_image_ctx->cct; + auto snap_id = m_snap_infos.begin()->first; + auto& snap_info = m_snap_infos.begin()->second; + ldout(cct, 20) << "snap_id=" << snap_id << ", " + << "snap_name=" << snap_info.name << dendl; + + RWLock::RLocker owner_lock(m_image_ctx->owner_lock); + auto ctx = create_context_callback< + PreRemoveRequest<I>, 
&PreRemoveRequest<I>::handle_remove_snapshot>(this); + auto req = librbd::operation::SnapshotRemoveRequest<I>::create( + *m_image_ctx, snap_info.snap_namespace, snap_info.name, + snap_id, ctx); + req->send(); + +} + +template <typename I> +void PreRemoveRequest<I>::handle_remove_snapshot(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + auto snap_id = m_snap_infos.begin()->first; + lderr(cct) << "failed to auto-prune snapshot " << snap_id << ": " + << cpp_strerror(r) << dendl; + + if (r == -EBUSY) { + r = -ENOTEMPTY; + } + finish(r); + return; + } + + ceph_assert(!m_snap_infos.empty()); + m_snap_infos.erase(m_snap_infos.begin()); + + remove_snapshot(); +} + +template <typename I> +void PreRemoveRequest<I>::finish(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::PreRemoveRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/PreRemoveRequest.h b/src/librbd/image/PreRemoveRequest.h new file mode 100644 index 00000000..30f16b36 --- /dev/null +++ b/src/librbd/image/PreRemoveRequest.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "include/buffer.h" +#include "librbd/ImageCtx.h" +#include <list> +#include <map> + +class Context; + +namespace librbd { +namespace image { + +template <typename ImageCtxT> +class PreRemoveRequest { +public: + + static PreRemoveRequest *create(ImageCtxT *image_ctx, bool force, + Context *on_finish) { + return new PreRemoveRequest(image_ctx, force, on_finish); + } + + PreRemoveRequest(ImageCtxT *image_ctx, bool force, Context *on_finish) + : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish) { + } 
+ + void send(); + +private: + /** + * @verbatim + * + * <start> + * | (skip if + * v not needed) (error) + * ACQUIRE EXCLUSIVE LOCK * * * * * * > SHUT DOWN EXCLUSIVE LOCK + * | | + * v | + * CHECK IMAGE WATCHERS <------------------/ + * | + * v + * CHECK GROUP + * | + * | /------\ + * | | | + * v v | + * REMOVE SNAPS ----/ + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT* m_image_ctx; + bool m_force; + Context* m_on_finish; + + decltype(m_image_ctx->exclusive_lock) m_exclusive_lock = nullptr; + + bufferlist m_out_bl; + std::list<obj_watch_t> m_watchers; + + std::map<uint64_t, SnapInfo> m_snap_infos; + + void acquire_exclusive_lock(); + void handle_exclusive_lock(int r); + + void shut_down_exclusive_lock(); + void handle_shut_down_exclusive_lock(int r); + + void validate_image_removal(); + void check_image_snaps(); + + void list_image_watchers(); + void handle_list_image_watchers(int r); + + void check_image_watchers(); + + void check_group(); + void handle_check_group(int r); + + void remove_snapshot(); + void handle_remove_snapshot(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::PreRemoveRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H diff --git a/src/librbd/image/RefreshParentRequest.cc b/src/librbd/image/RefreshParentRequest.cc new file mode 100644 index 00000000..668ad56b --- /dev/null +++ b/src/librbd/image/RefreshParentRequest.cc @@ -0,0 +1,245 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/RefreshParentRequest.h" +#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/image/CloseRequest.h" +#include "librbd/image/OpenRequest.h" +#include "librbd/io/ObjectDispatcher.h" + +#define dout_subsys ceph_subsys_rbd +#undef 
dout_prefix +#define dout_prefix *_dout << "librbd::image::RefreshParentRequest: " + +namespace librbd { +namespace image { + +using util::create_async_context_callback; +using util::create_context_callback; + +template <typename I> +RefreshParentRequest<I>::RefreshParentRequest( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info, Context *on_finish) + : m_child_image_ctx(child_image_ctx), m_parent_md(parent_md), + m_migration_info(migration_info), m_on_finish(on_finish), + m_parent_image_ctx(nullptr), m_parent_snap_id(CEPH_NOSNAP), + m_error_result(0) { +} + +template <typename I> +bool RefreshParentRequest<I>::is_refresh_required( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info) { + ceph_assert(child_image_ctx.snap_lock.is_locked()); + ceph_assert(child_image_ctx.parent_lock.is_locked()); + return (is_open_required(child_image_ctx, parent_md, migration_info) || + is_close_required(child_image_ctx, parent_md, migration_info)); +} + +template <typename I> +bool RefreshParentRequest<I>::is_close_required( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info) { + return (child_image_ctx.parent != nullptr && + !does_parent_exist(child_image_ctx, parent_md, migration_info)); +} + +template <typename I> +bool RefreshParentRequest<I>::is_open_required( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info) { + return (does_parent_exist(child_image_ctx, parent_md, migration_info) && + (child_image_ctx.parent == nullptr || + child_image_ctx.parent->md_ctx.get_id() != parent_md.spec.pool_id || + child_image_ctx.parent->md_ctx.get_namespace() != + parent_md.spec.pool_namespace || + child_image_ctx.parent->id != parent_md.spec.image_id || + child_image_ctx.parent->snap_id != parent_md.spec.snap_id)); +} + +template <typename I> +bool RefreshParentRequest<I>::does_parent_exist( + I &child_image_ctx, const 
ParentImageInfo &parent_md, + const MigrationInfo &migration_info) { + if (child_image_ctx.child != nullptr && + child_image_ctx.child->migration_info.empty() && parent_md.overlap == 0) { + // intermediate, non-migrating images should only open their parent if they + // overlap + return false; + } + + return (parent_md.spec.pool_id > -1 && parent_md.overlap > 0) || + !migration_info.empty(); +} + +template <typename I> +void RefreshParentRequest<I>::send() { + if (is_open_required(m_child_image_ctx, m_parent_md, m_migration_info)) { + send_open_parent(); + } else { + // parent will be closed (if necessary) during finalize + send_complete(0); + } +} + +template <typename I> +void RefreshParentRequest<I>::apply() { + ceph_assert(m_child_image_ctx.snap_lock.is_wlocked()); + ceph_assert(m_child_image_ctx.parent_lock.is_wlocked()); + std::swap(m_child_image_ctx.parent, m_parent_image_ctx); +} + +template <typename I> +void RefreshParentRequest<I>::finalize(Context *on_finish) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_on_finish = on_finish; + if (m_parent_image_ctx != nullptr) { + send_close_parent(); + } else { + send_complete(0); + } +} + +template <typename I> +void RefreshParentRequest<I>::send_open_parent() { + ceph_assert(m_parent_md.spec.pool_id >= 0); + + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::IoCtx parent_io_ctx; + int r = util::create_ioctx(m_child_image_ctx.md_ctx, "parent image", + m_parent_md.spec.pool_id, + m_parent_md.spec.pool_namespace, &parent_io_ctx); + if (r < 0) { + send_complete(r); + return; + } + + std::string image_name; + uint64_t flags = 0; + if (!m_migration_info.empty() && !m_migration_info.image_name.empty()) { + image_name = m_migration_info.image_name; + flags |= OPEN_FLAG_OLD_FORMAT; + } + + m_parent_image_ctx = new I(image_name, m_parent_md.spec.image_id, + m_parent_md.spec.snap_id, parent_io_ctx, true); 
+ m_parent_image_ctx->child = &m_child_image_ctx; + + // set rados flags for reading the parent image + if (m_child_image_ctx.config.template get_val<bool>("rbd_balance_parent_reads")) { + m_parent_image_ctx->set_read_flag(librados::OPERATION_BALANCE_READS); + } else if (m_child_image_ctx.config.template get_val<bool>("rbd_localize_parent_reads")) { + m_parent_image_ctx->set_read_flag(librados::OPERATION_LOCALIZE_READS); + } + + using klass = RefreshParentRequest<I>; + Context *ctx = create_async_context_callback( + m_child_image_ctx, create_context_callback< + klass, &klass::handle_open_parent, false>(this)); + OpenRequest<I> *req = OpenRequest<I>::create(m_parent_image_ctx, flags, ctx); + req->send(); +} + +template <typename I> +Context *RefreshParentRequest<I>::handle_open_parent(int *result) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl; + + save_result(result); + if (*result < 0) { + lderr(cct) << "failed to open parent image: " << cpp_strerror(*result) + << dendl; + + // image already closed by open state machine + delete m_parent_image_ctx; + m_parent_image_ctx = nullptr; + } + + return m_on_finish; +} + +template <typename I> +void RefreshParentRequest<I>::send_close_parent() { + ceph_assert(m_parent_image_ctx != nullptr); + + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshParentRequest<I>; + Context *ctx = create_async_context_callback( + m_child_image_ctx, create_context_callback< + klass, &klass::handle_close_parent, false>(this)); + CloseRequest<I> *req = CloseRequest<I>::create(m_parent_image_ctx, ctx); + req->send(); +} + +template <typename I> +Context *RefreshParentRequest<I>::handle_close_parent(int *result) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl; + + delete m_parent_image_ctx; + m_parent_image_ctx = nullptr; + + if (*result 
< 0) { + lderr(cct) << "failed to close parent image: " << cpp_strerror(*result) + << dendl; + } + + send_reset_existence_cache(); + return nullptr; +} + +template <typename I> +void RefreshParentRequest<I>::send_reset_existence_cache() { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + Context *ctx = create_async_context_callback( + m_child_image_ctx, create_context_callback< + RefreshParentRequest<I>, + &RefreshParentRequest<I>::handle_reset_existence_cache, false>(this)); + m_child_image_ctx.io_object_dispatcher->reset_existence_cache(ctx); +} + +template <typename I> +Context *RefreshParentRequest<I>::handle_reset_existence_cache(int *result) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to reset object existence cache: " + << cpp_strerror(*result) << dendl; + } + + if (m_error_result < 0) { + // propagate errors from opening the image + *result = m_error_result; + } else { + *result = 0; + } + return m_on_finish; +} + +template <typename I> +void RefreshParentRequest<I>::send_complete(int r) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_on_finish->complete(r); +} + +} // namespace image +} // namespace librbd + +template class librbd::image::RefreshParentRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/RefreshParentRequest.h b/src/librbd/image/RefreshParentRequest.h new file mode 100644 index 00000000..086d8ec1 --- /dev/null +++ b/src/librbd/image/RefreshParentRequest.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H +#define CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H + +#include "include/int_types.h" +#include "librbd/Types.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace 
image { + +template <typename ImageCtxT = ImageCtx> +class RefreshParentRequest { +public: + static RefreshParentRequest *create(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info, + Context *on_finish) { + return new RefreshParentRequest(child_image_ctx, parent_md, migration_info, + on_finish); + } + + static bool is_refresh_required(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info); + + void send(); + void apply(); + void finalize(Context *on_finish); + +private: + /** + * @verbatim + * + * <start> + * | + * | (open required) + * |----------------> OPEN_PARENT * * * * * * * * * * * * * * * + * | | * + * | v (on error) * + * \----------------> <apply> * + * | * + * | (close required) * + * |-----------------> CLOSE_PARENT * + * | | * + * | v * + * | RESET_EXISTENCE * + * | | * + * | v * + * \-----------------> <finish> < * * * * + * + * @endverbatim + */ + + RefreshParentRequest(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info, Context *on_finish); + + ImageCtxT &m_child_image_ctx; + ParentImageInfo m_parent_md; + MigrationInfo m_migration_info; + Context *m_on_finish; + + ImageCtxT *m_parent_image_ctx; + uint64_t m_parent_snap_id; + + int m_error_result; + + static bool is_close_required(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info); + static bool is_open_required(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info); + static bool does_parent_exist(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info); + + void send_open_parent(); + Context *handle_open_parent(int *result); + + void send_close_parent(); + Context *handle_close_parent(int *result); + + void send_reset_existence_cache(); + Context *handle_reset_existence_cache(int *result); + + void 
send_complete(int r); + + void save_result(int *result) { + if (m_error_result == 0 && *result < 0) { + m_error_result = *result; + } + } + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::RefreshParentRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H diff --git a/src/librbd/image/RefreshRequest.cc b/src/librbd/image/RefreshRequest.cc new file mode 100644 index 00000000..3cc1b79e --- /dev/null +++ b/src/librbd/image/RefreshRequest.cc @@ -0,0 +1,1551 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/algorithm/string/predicate.hpp> +#include "include/ceph_assert.h" + +#include "librbd/image/RefreshRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/deep_copy/Utils.h" +#include "librbd/image/RefreshParentRequest.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/journal/Policy.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::RefreshRequest: " + +namespace librbd { +namespace image { + +namespace { + +const uint64_t MAX_METADATA_ITEMS = 128; + +} + +using util::create_rados_callback; +using util::create_async_context_callback; +using util::create_context_callback; + +template <typename I> +RefreshRequest<I>::RefreshRequest(I &image_ctx, bool acquiring_lock, + bool skip_open_parent, Context *on_finish) + : m_image_ctx(image_ctx), m_acquiring_lock(acquiring_lock), + m_skip_open_parent_image(skip_open_parent), + m_on_finish(create_async_context_callback(m_image_ctx, on_finish)), + 
m_error_result(0), m_flush_aio(false), m_exclusive_lock(nullptr), + m_object_map(nullptr), m_journal(nullptr), m_refresh_parent(nullptr) { + m_pool_metadata_io_ctx.dup(image_ctx.md_ctx); + m_pool_metadata_io_ctx.set_namespace(""); +} + +template <typename I> +RefreshRequest<I>::~RefreshRequest() { + // these require state machine to close + ceph_assert(m_exclusive_lock == nullptr); + ceph_assert(m_object_map == nullptr); + ceph_assert(m_journal == nullptr); + ceph_assert(m_refresh_parent == nullptr); + ceph_assert(!m_blocked_writes); +} + +template <typename I> +void RefreshRequest<I>::send() { + if (m_image_ctx.old_format) { + send_v1_read_header(); + } else { + send_v2_get_mutable_metadata(); + } +} + +template <typename I> +void RefreshRequest<I>::send_get_migration_header() { + if (m_image_ctx.ignore_migrating) { + if (m_image_ctx.old_format) { + send_v1_get_snapshots(); + } else { + send_v2_get_metadata(); + } + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::migration_get_start(&op); + + using klass = RefreshRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_get_migration_header>(this); + m_out_bl.clear(); + m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_get_migration_header(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::migration_get_finish(&it, &m_migration_spec); + } else if (*result == -ENOENT) { + ldout(cct, 5) << this << " " << __func__ << ": no migration header found" + << ", retrying" << dendl; + send(); + return nullptr; + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve migration header: " + << cpp_strerror(*result) << 
dendl; + return m_on_finish; + } + + switch(m_migration_spec.header_type) { + case cls::rbd::MIGRATION_HEADER_TYPE_SRC: + if (!m_image_ctx.read_only) { + lderr(cct) << "image being migrated" << dendl; + *result = -EROFS; + return m_on_finish; + } + ldout(cct, 1) << this << " " << __func__ << ": migrating to: " + << m_migration_spec << dendl; + break; + case cls::rbd::MIGRATION_HEADER_TYPE_DST: + ldout(cct, 1) << this << " " << __func__ << ": migrating from: " + << m_migration_spec << dendl; + switch (m_migration_spec.state) { + case cls::rbd::MIGRATION_STATE_PREPARING: + ldout(cct, 5) << this << " " << __func__ << ": current migration state: " + << m_migration_spec.state << ", retrying" << dendl; + send(); + return nullptr; + case cls::rbd::MIGRATION_STATE_PREPARED: + case cls::rbd::MIGRATION_STATE_EXECUTING: + case cls::rbd::MIGRATION_STATE_EXECUTED: + break; + case cls::rbd::MIGRATION_STATE_ABORTING: + if (!m_image_ctx.read_only) { + lderr(cct) << this << " " << __func__ << ": migration is being aborted" + << dendl; + *result = -EROFS; + return m_on_finish; + } + break; + default: + lderr(cct) << this << " " << __func__ << ": migration is in an " + << "unexpected state" << dendl; + *result = -EINVAL; + return m_on_finish; + } + break; + default: + ldout(cct, 1) << this << " " << __func__ << ": migration type " + << m_migration_spec.header_type << dendl; + *result = -EBADMSG; + return m_on_finish; + } + + if (m_image_ctx.old_format) { + send_v1_get_snapshots(); + } else { + send_v2_get_metadata(); + } + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v1_read_header() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + op.read(0, 0, nullptr, nullptr); + + using klass = RefreshRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v1_read_header>(this); + m_out_bl.clear(); + int r = 
m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v1_read_header(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl; + + rbd_obj_header_ondisk v1_header; + bool migrating = false; + if (*result < 0) { + return m_on_finish; + } else if (m_out_bl.length() < sizeof(v1_header)) { + lderr(cct) << "v1 header too small" << dendl; + *result = -EIO; + return m_on_finish; + } else if (memcmp(RBD_HEADER_TEXT, m_out_bl.c_str(), + sizeof(RBD_HEADER_TEXT)) != 0) { + if (memcmp(RBD_MIGRATE_HEADER_TEXT, m_out_bl.c_str(), + sizeof(RBD_MIGRATE_HEADER_TEXT)) == 0) { + ldout(cct, 1) << this << " " << __func__ << ": migration v1 header detected" + << dendl; + migrating = true; + } else { + lderr(cct) << "unrecognized v1 header" << dendl; + *result = -ENXIO; + return m_on_finish; + } + } + + memcpy(&v1_header, m_out_bl.c_str(), sizeof(v1_header)); + m_order = v1_header.options.order; + m_size = v1_header.image_size; + m_object_prefix = v1_header.block_name; + if (migrating) { + send_get_migration_header(); + } else { + send_v1_get_snapshots(); + } + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v1_get_snapshots() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::old_snapshot_list_start(&op); + + using klass = RefreshRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v1_get_snapshots>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v1_get_snapshots(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << 
__func__ << ": " << "r=" << *result << dendl; + + std::vector<std::string> snap_names; + std::vector<uint64_t> snap_sizes; + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::old_snapshot_list_finish(&it, &snap_names, + &snap_sizes, &m_snapc); + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve v1 snapshots: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + if (!m_snapc.is_valid()) { + lderr(cct) << "v1 image snap context is invalid" << dendl; + *result = -EIO; + return m_on_finish; + } + + m_snap_infos.clear(); + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + m_snap_infos.push_back({m_snapc.snaps[i], + {cls::rbd::UserSnapshotNamespace{}}, + snap_names[i], snap_sizes[i], {}, 0}); + } + + send_v1_get_locks(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v1_get_locks() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); + + using klass = RefreshRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v1_get_locks>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v1_get_locks(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + ClsLockType lock_type; + *result = rados::cls::lock::get_lock_info_finish(&it, &m_lockers, + &lock_type, &m_lock_tag); + if (*result == 0) { + m_exclusive_locked = (lock_type == LOCK_EXCLUSIVE); + } + } + if (*result < 0) { + lderr(cct) << "failed to retrieve locks: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_v1_apply(); + return nullptr; +} 
+ +template <typename I> +void RefreshRequest<I>::send_v1_apply() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // ensure we are not in a rados callback when applying updates + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v1_apply>(this); + m_image_ctx.op_work_queue->queue(ctx, 0); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v1_apply(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + apply(); + return send_flush_aio(); +} + +template <typename I> +void RefreshRequest<I>::send_v2_get_mutable_metadata() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + uint64_t snap_id; + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + snap_id = m_image_ctx.snap_id; + } + + bool read_only = m_image_ctx.read_only || snap_id != CEPH_NOSNAP; + librados::ObjectReadOperation op; + cls_client::get_size_start(&op, CEPH_NOSNAP); + cls_client::get_features_start(&op, read_only); + cls_client::get_flags_start(&op, CEPH_NOSNAP); + cls_client::get_snapcontext_start(&op); + rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); + + using klass = RefreshRequest<I>; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_mutable_metadata>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_get_mutable_metadata(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "r=" << *result << dendl; + + auto it = m_out_bl.cbegin(); + if (*result >= 0) { + uint8_t order; + *result = cls_client::get_size_finish(&it, &m_size, &order); + } + + if (*result >= 0) { + *result = cls_client::get_features_finish(&it, 
&m_features, + &m_incompatible_features); + } + + if (*result >= 0) { + *result = cls_client::get_flags_finish(&it, &m_flags); + } + + if (*result >= 0) { + *result = cls_client::get_snapcontext_finish(&it, &m_snapc); + } + + if (*result >= 0) { + ClsLockType lock_type = LOCK_NONE; + *result = rados::cls::lock::get_lock_info_finish(&it, &m_lockers, + &lock_type, &m_lock_tag); + if (*result == 0) { + m_exclusive_locked = (lock_type == LOCK_EXCLUSIVE); + } + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve mutable metadata: " + << cpp_strerror(*result) << dendl; + return m_on_finish; + } + + uint64_t unsupported = m_incompatible_features & ~RBD_FEATURES_ALL; + if (unsupported != 0ULL) { + lderr(cct) << "Image uses unsupported features: " << unsupported << dendl; + *result = -ENOSYS; + return m_on_finish; + } + + if (!m_snapc.is_valid()) { + lderr(cct) << "image snap context is invalid!" << dendl; + *result = -EIO; + return m_on_finish; + } + + if (m_acquiring_lock && (m_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) { + ldout(cct, 5) << "ignoring dynamically disabled exclusive lock" << dendl; + m_features |= RBD_FEATURE_EXCLUSIVE_LOCK; + m_incomplete_update = true; + } + + send_v2_get_parent(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_get_parent() { + // NOTE: remove support when Mimic is EOLed + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": legacy=" << m_legacy_parent + << dendl; + + librados::ObjectReadOperation op; + if (!m_legacy_parent) { + cls_client::parent_get_start(&op); + cls_client::parent_overlap_get_start(&op, CEPH_NOSNAP); + } else { + cls_client::get_parent_start(&op, CEPH_NOSNAP); + } + + auto aio_comp = create_rados_callback< + RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_parent>(this); + m_out_bl.clear(); + m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op, + &m_out_bl); + aio_comp->release(); +} + +template <typename I> +Context 
*RefreshRequest<I>::handle_v2_get_parent(int *result) { + // NOTE: remove support when Mimic is EOLed + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + auto it = m_out_bl.cbegin(); + if (!m_legacy_parent) { + if (*result == 0) { + *result = cls_client::parent_get_finish(&it, &m_parent_md.spec); + } + + std::optional<uint64_t> parent_overlap; + if (*result == 0) { + *result = cls_client::parent_overlap_get_finish(&it, &parent_overlap); + } + + if (*result == 0 && parent_overlap) { + m_parent_md.overlap = *parent_overlap; + m_head_parent_overlap = true; + } + } else if (*result == 0) { + *result = cls_client::get_parent_finish(&it, &m_parent_md.spec, + &m_parent_md.overlap); + m_head_parent_overlap = true; + } + + if (*result == -EOPNOTSUPP && !m_legacy_parent) { + ldout(cct, 10) << "retrying using legacy parent method" << dendl; + m_legacy_parent = true; + send_v2_get_parent(); + return nullptr; + } if (*result < 0) { + lderr(cct) << "failed to retrieve parent: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + if ((m_features & RBD_FEATURE_MIGRATING) != 0) { + ldout(cct, 1) << "migrating feature set" << dendl; + send_get_migration_header(); + return nullptr; + } + + send_v2_get_metadata(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_get_metadata() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "start_key=" << m_last_metadata_key << dendl; + + librados::ObjectReadOperation op; + cls_client::metadata_list_start(&op, m_last_metadata_key, MAX_METADATA_ITEMS); + + using klass = RefreshRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_v2_get_metadata>(this); + m_out_bl.clear(); + m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_get_metadata(int 
*result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + std::map<std::string, bufferlist> metadata; + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::metadata_list_finish(&it, &metadata); + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve metadata: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + if (!metadata.empty()) { + m_metadata.insert(metadata.begin(), metadata.end()); + m_last_metadata_key = metadata.rbegin()->first; + if (boost::starts_with(m_last_metadata_key, + ImageCtx::METADATA_CONF_PREFIX)) { + send_v2_get_metadata(); + return nullptr; + } + } + + m_last_metadata_key.clear(); + send_v2_get_pool_metadata(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_get_pool_metadata() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "start_key=" << m_last_metadata_key << dendl; + + librados::ObjectReadOperation op; + cls_client::metadata_list_start(&op, m_last_metadata_key, MAX_METADATA_ITEMS); + + using klass = RefreshRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_v2_get_pool_metadata>(this); + m_out_bl.clear(); + m_pool_metadata_io_ctx.aio_operate(RBD_INFO, comp, &op, &m_out_bl); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_get_pool_metadata(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + std::map<std::string, bufferlist> metadata; + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::metadata_list_finish(&it, &metadata); + } + + if (*result == -EOPNOTSUPP || *result == -ENOENT) { + ldout(cct, 10) << "pool metadata not supported by OSD" << dendl; + } else if (*result < 0) { + lderr(cct) << "failed to retrieve pool metadata: " << cpp_strerror(*result) + << dendl; + 
return m_on_finish; + } + + if (!metadata.empty()) { + m_metadata.insert(metadata.begin(), metadata.end()); + m_last_metadata_key = metadata.rbegin()->first; + if (boost::starts_with(m_last_metadata_key, + ImageCtx::METADATA_CONF_PREFIX)) { + send_v2_get_pool_metadata(); + return nullptr; + } + } + + bool thread_safe = m_image_ctx.image_watcher->is_unregistered(); + m_image_ctx.apply_metadata(m_metadata, thread_safe); + + send_v2_get_op_features(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_get_op_features() { + if ((m_features & RBD_FEATURE_OPERATIONS) == 0LL) { + send_v2_get_group(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::op_features_get_start(&op); + + librados::AioCompletion *comp = create_rados_callback< + RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_op_features>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_get_op_features(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "r=" << *result << dendl; + + // -EOPNOTSUPP handler not required since feature bit implies OSD + // supports the method + if (*result == 0) { + auto it = m_out_bl.cbegin(); + cls_client::op_features_get_finish(&it, &m_op_features); + } else if (*result < 0) { + lderr(cct) << "failed to retrieve op features: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_v2_get_group(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_get_group() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::image_group_get_start(&op); + + using klass = RefreshRequest<I>; + 
librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_group>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_get_group(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + cls_client::image_group_get_finish(&it, &m_group_spec); + } + if (*result < 0 && *result != -EOPNOTSUPP) { + lderr(cct) << "failed to retrieve group: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_v2_get_snapshots(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_get_snapshots() { + m_snap_infos.resize(m_snapc.snaps.size()); + m_snap_flags.resize(m_snapc.snaps.size()); + m_snap_parents.resize(m_snapc.snaps.size()); + m_snap_protection.resize(m_snapc.snaps.size()); + + if (m_snapc.snaps.empty()) { + send_v2_refresh_parent(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + for (auto snap_id : m_snapc.snaps) { + if (m_legacy_snapshot != LEGACY_SNAPSHOT_DISABLED) { + /// NOTE: remove after Luminous is retired + cls_client::get_snapshot_name_start(&op, snap_id); + cls_client::get_size_start(&op, snap_id); + if (m_legacy_snapshot != LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP) { + cls_client::get_snapshot_timestamp_start(&op, snap_id); + } + } else { + cls_client::snapshot_get_start(&op, snap_id); + } + + if (m_legacy_parent) { + cls_client::get_parent_start(&op, snap_id); + } else { + cls_client::parent_overlap_get_start(&op, snap_id); + } + + cls_client::get_flags_start(&op, snap_id); + cls_client::get_protection_status_start(&op, snap_id); + } + + using klass = RefreshRequest<I>; + librados::AioCompletion 
*comp = create_rados_callback< + klass, &klass::handle_v2_get_snapshots>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_get_snapshots(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl; + + auto it = m_out_bl.cbegin(); + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + if (m_legacy_snapshot != LEGACY_SNAPSHOT_DISABLED) { + /// NOTE: remove after Luminous is retired + std::string snap_name; + if (*result >= 0) { + *result = cls_client::get_snapshot_name_finish(&it, &snap_name); + } + + uint64_t snap_size; + if (*result >= 0) { + uint8_t order; + *result = cls_client::get_size_finish(&it, &snap_size, &order); + } + + utime_t snap_timestamp; + if (*result >= 0 && + m_legacy_snapshot != LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP) { + /// NOTE: remove after Jewel is retired + *result = cls_client::get_snapshot_timestamp_finish(&it, + &snap_timestamp); + } + + if (*result >= 0) { + m_snap_infos[i] = {m_snapc.snaps[i], + {cls::rbd::UserSnapshotNamespace{}}, + snap_name, snap_size, snap_timestamp, 0}; + } + } else if (*result >= 0) { + *result = cls_client::snapshot_get_finish(&it, &m_snap_infos[i]); + } + + if (*result == 0) { + if (m_legacy_parent) { + *result = cls_client::get_parent_finish(&it, &m_snap_parents[i].spec, + &m_snap_parents[i].overlap); + } else { + std::optional<uint64_t> parent_overlap; + *result = cls_client::parent_overlap_get_finish(&it, &parent_overlap); + if (*result == 0 && parent_overlap && m_parent_md.spec.pool_id > -1) { + m_snap_parents[i].spec = m_parent_md.spec; + m_snap_parents[i].overlap = *parent_overlap; + } + } + } + + if (*result >= 0) { + *result = cls_client::get_flags_finish(&it, &m_snap_flags[i]); + } + + if (*result >= 0) { + *result = cls_client::get_protection_status_finish( 
+ &it, &m_snap_protection[i]); + } + + if (*result < 0) { + break; + } + } + + if (*result == -ENOENT) { + ldout(cct, 10) << "out-of-sync snapshot state detected" << dendl; + send_v2_get_mutable_metadata(); + return nullptr; + } else if (m_legacy_snapshot == LEGACY_SNAPSHOT_DISABLED && + *result == -EOPNOTSUPP) { + ldout(cct, 10) << "retrying using legacy snapshot methods" << dendl; + m_legacy_snapshot = LEGACY_SNAPSHOT_ENABLED; + send_v2_get_snapshots(); + return nullptr; + } else if (m_legacy_snapshot == LEGACY_SNAPSHOT_ENABLED && + *result == -EOPNOTSUPP) { + ldout(cct, 10) << "retrying using legacy snapshot methods (jewel)" << dendl; + m_legacy_snapshot = LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP; + send_v2_get_snapshots(); + return nullptr; + } else if (*result < 0) { + lderr(cct) << "failed to retrieve snapshots: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_v2_refresh_parent(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_refresh_parent() { + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + RWLock::RLocker parent_locker(m_image_ctx.parent_lock); + + ParentImageInfo parent_md; + MigrationInfo migration_info; + int r = get_parent_info(m_image_ctx.snap_id, &parent_md, &migration_info); + if (!m_skip_open_parent_image && (r < 0 || + RefreshParentRequest<I>::is_refresh_required(m_image_ctx, parent_md, + migration_info))) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_refresh_parent>(this); + m_refresh_parent = RefreshParentRequest<I>::create( + m_image_ctx, parent_md, migration_info, ctx); + } + } + + if (m_refresh_parent != nullptr) { + m_refresh_parent->send(); + } else { + send_v2_init_exclusive_lock(); + } +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + 
ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to refresh parent image: " << cpp_strerror(*result) + << dendl; + save_result(result); + send_v2_apply(); + return nullptr; + } + + send_v2_init_exclusive_lock(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_init_exclusive_lock() { + if ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0 || + m_image_ctx.read_only || !m_image_ctx.snap_name.empty() || + m_image_ctx.exclusive_lock != nullptr) { + send_v2_open_object_map(); + return; + } + + // implies exclusive lock dynamically enabled or image open in-progress + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // TODO need safe shut down + m_exclusive_lock = m_image_ctx.create_exclusive_lock(); + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_init_exclusive_lock>(this); + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + m_exclusive_lock->init(m_features, ctx); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_init_exclusive_lock(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to initialize exclusive lock: " + << cpp_strerror(*result) << dendl; + save_result(result); + } + + // object map and journal will be opened when exclusive lock is + // acquired (if features are enabled) + send_v2_apply(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_open_journal() { + bool journal_disabled = ( + (m_features & RBD_FEATURE_JOURNALING) == 0 || + m_image_ctx.read_only || + !m_image_ctx.snap_name.empty() || + m_image_ctx.journal != nullptr || + m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->is_lock_owner()); + bool journal_disabled_by_policy; + { + RWLock::RLocker 
snap_locker(m_image_ctx.snap_lock); + journal_disabled_by_policy = ( + !journal_disabled && + m_image_ctx.get_journal_policy()->journal_disabled()); + } + + if (journal_disabled || journal_disabled_by_policy) { + // journal dynamically enabled -- doesn't own exclusive lock + if ((m_features & RBD_FEATURE_JOURNALING) != 0 && + !journal_disabled_by_policy && + m_image_ctx.exclusive_lock != nullptr && + m_image_ctx.journal == nullptr) { + m_image_ctx.io_work_queue->set_require_lock(librbd::io::DIRECTION_BOTH, + true); + } + send_v2_block_writes(); + return; + } + + // implies journal dynamically enabled since ExclusiveLock will init + // the journal upon acquiring the lock + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_open_journal>(this); + + // TODO need safe close + m_journal = m_image_ctx.create_journal(); + m_journal->open(ctx); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_open_journal(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to initialize journal: " << cpp_strerror(*result) + << dendl; + save_result(result); + } + + send_v2_block_writes(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_block_writes() { + bool disabled_journaling = false; + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + disabled_journaling = ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 && + (m_features & RBD_FEATURE_JOURNALING) == 0 && + m_image_ctx.journal != nullptr); + } + + if (!disabled_journaling) { + send_v2_apply(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // we need to block writes temporarily to avoid in-flight journal + // writes + m_blocked_writes = true; + Context 
*ctx = create_context_callback< + RefreshRequest<I>, &RefreshRequest<I>::handle_v2_block_writes>(this); + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + m_image_ctx.io_work_queue->block_writes(ctx); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_block_writes(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) + << dendl; + save_result(result); + } + send_v2_apply(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_open_object_map() { + if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0 || + m_image_ctx.object_map != nullptr || + (m_image_ctx.snap_name.empty() && + (m_image_ctx.read_only || + m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->is_lock_owner()))) { + send_v2_open_journal(); + return; + } + + // implies object map dynamically enabled or image open in-progress + // since SetSnapRequest loads the object map for a snapshot and + // ExclusiveLock loads the object map for HEAD + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + if (m_image_ctx.snap_name.empty()) { + m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP); + } else { + for (size_t snap_idx = 0; snap_idx < m_snap_infos.size(); ++snap_idx) { + if (m_snap_infos[snap_idx].name == m_image_ctx.snap_name) { + m_object_map = m_image_ctx.create_object_map( + m_snapc.snaps[snap_idx].val); + break; + } + } + + if (m_object_map == nullptr) { + lderr(cct) << "failed to locate snapshot: " << m_image_ctx.snap_name + << dendl; + send_v2_open_journal(); + return; + } + } + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_open_object_map>(this); + m_object_map->open(ctx); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_open_object_map(int *result) { + 
CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to open object map: " << cpp_strerror(*result) + << dendl; + delete m_object_map; + m_object_map = nullptr; + + if (*result != -EFBIG) { + save_result(result); + } + } + + send_v2_open_journal(); + return nullptr; +} + +template <typename I> +void RefreshRequest<I>::send_v2_apply() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // ensure we are not in a rados callback when applying updates + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_apply>(this); + m_image_ctx.op_work_queue->queue(ctx, 0); +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_apply(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + apply(); + + return send_v2_finalize_refresh_parent(); +} + +template <typename I> +Context *RefreshRequest<I>::send_v2_finalize_refresh_parent() { + if (m_refresh_parent == nullptr) { + return send_v2_shut_down_exclusive_lock(); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_finalize_refresh_parent>(this); + m_refresh_parent->finalize(ctx); + return nullptr; +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_finalize_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + ceph_assert(m_refresh_parent != nullptr); + delete m_refresh_parent; + m_refresh_parent = nullptr; + + return send_v2_shut_down_exclusive_lock(); +} + +template <typename I> +Context *RefreshRequest<I>::send_v2_shut_down_exclusive_lock() { + if (m_exclusive_lock == nullptr) { + return 
send_v2_close_journal(); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // exclusive lock feature was dynamically disabled. in-flight IO will be + // flushed and in-flight requests will be canceled before releasing lock + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_shut_down_exclusive_lock>(this); + m_exclusive_lock->shut_down(ctx); + return nullptr; +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_shut_down_exclusive_lock(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to shut down exclusive lock: " + << cpp_strerror(*result) << dendl; + save_result(result); + } + + { + RWLock::WLocker owner_locker(m_image_ctx.owner_lock); + ceph_assert(m_image_ctx.exclusive_lock == nullptr); + } + + ceph_assert(m_exclusive_lock != nullptr); + delete m_exclusive_lock; + m_exclusive_lock = nullptr; + + return send_v2_close_journal(); +} + +template <typename I> +Context *RefreshRequest<I>::send_v2_close_journal() { + if (m_journal == nullptr) { + return send_v2_close_object_map(); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // journal feature was dynamically disabled + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_close_journal>(this); + m_journal->close(ctx); + return nullptr; +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_close_journal(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + save_result(result); + lderr(cct) << "failed to close journal: " << cpp_strerror(*result) + << dendl; + } + + ceph_assert(m_journal != nullptr); + delete m_journal; + m_journal = nullptr; + + 
ceph_assert(m_blocked_writes); + m_blocked_writes = false; + + m_image_ctx.io_work_queue->unblock_writes(); + return send_v2_close_object_map(); +} + +template <typename I> +Context *RefreshRequest<I>::send_v2_close_object_map() { + if (m_object_map == nullptr) { + return send_flush_aio(); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // object map was dynamically disabled + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_close_object_map>(this); + m_object_map->close(ctx); + return nullptr; +} + +template <typename I> +Context *RefreshRequest<I>::handle_v2_close_object_map(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to close object map: " << cpp_strerror(*result) + << dendl; + } + + ceph_assert(m_object_map != nullptr); + delete m_object_map; + m_object_map = nullptr; + + return send_flush_aio(); +} + +template <typename I> +Context *RefreshRequest<I>::send_flush_aio() { + if (m_incomplete_update && m_error_result == 0) { + // if this was a partial refresh, notify ImageState + m_error_result = -ERESTART; + } + + if (m_flush_aio) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + auto ctx = create_context_callback< + RefreshRequest<I>, &RefreshRequest<I>::handle_flush_aio>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec<I>::create_flush_request( + m_image_ctx, aio_comp, io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); + delete req; + return nullptr; + } else if (m_error_result < 0) { + // propagate saved error back to caller + Context *ctx = create_context_callback< + RefreshRequest<I>, 
&RefreshRequest<I>::handle_error>(this); + m_image_ctx.op_work_queue->queue(ctx, 0); + return nullptr; + } + + return m_on_finish; +} + +template <typename I> +Context *RefreshRequest<I>::handle_flush_aio(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to flush pending AIO: " << cpp_strerror(*result) + << dendl; + } + + return handle_error(result); +} + +template <typename I> +Context *RefreshRequest<I>::handle_error(int *result) { + if (m_error_result < 0) { + *result = m_error_result; + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + } + return m_on_finish; +} + +template <typename I> +void RefreshRequest<I>::apply() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + RWLock::WLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker md_locker(m_image_ctx.md_lock); + + { + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + RWLock::WLocker parent_locker(m_image_ctx.parent_lock); + + m_image_ctx.size = m_size; + m_image_ctx.lockers = m_lockers; + m_image_ctx.lock_tag = m_lock_tag; + m_image_ctx.exclusive_locked = m_exclusive_locked; + + std::map<uint64_t, uint64_t> migration_reverse_snap_seq; + + if (m_image_ctx.old_format) { + m_image_ctx.order = m_order; + m_image_ctx.features = 0; + m_image_ctx.flags = 0; + m_image_ctx.op_features = 0; + m_image_ctx.operations_disabled = false; + m_image_ctx.object_prefix = std::move(m_object_prefix); + m_image_ctx.init_layout(m_image_ctx.md_ctx.get_id()); + } else { + // HEAD revision doesn't have a defined overlap so it's only + // applicable to snapshots + if (!m_head_parent_overlap) { + m_parent_md = {}; + } + + m_image_ctx.features = m_features; + m_image_ctx.flags = m_flags; + m_image_ctx.op_features = m_op_features; + m_image_ctx.operations_disabled = ( + (m_op_features & 
~RBD_OPERATION_FEATURES_ALL) != 0ULL); + m_image_ctx.group_spec = m_group_spec; + + bool migration_info_valid; + int r = get_migration_info(&m_image_ctx.parent_md, + &m_image_ctx.migration_info, + &migration_info_valid); + ceph_assert(r == 0); // validated in refresh parent step + + if (migration_info_valid) { + for (auto it : m_image_ctx.migration_info.snap_map) { + migration_reverse_snap_seq[it.second.front()] = it.first; + } + } else { + m_image_ctx.parent_md = m_parent_md; + m_image_ctx.migration_info = {}; + } + } + + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + std::vector<librados::snap_t>::const_iterator it = std::find( + m_image_ctx.snaps.begin(), m_image_ctx.snaps.end(), + m_snapc.snaps[i].val); + if (it == m_image_ctx.snaps.end()) { + m_flush_aio = true; + ldout(cct, 20) << "new snapshot id=" << m_snapc.snaps[i].val + << " name=" << m_snap_infos[i].name + << " size=" << m_snap_infos[i].image_size + << dendl; + } + } + + m_image_ctx.snaps.clear(); + m_image_ctx.snap_info.clear(); + m_image_ctx.snap_ids.clear(); + auto overlap = m_image_ctx.parent_md.overlap; + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + uint64_t flags = m_image_ctx.old_format ? 0 : m_snap_flags[i]; + uint8_t protection_status = m_image_ctx.old_format ? 
+ static_cast<uint8_t>(RBD_PROTECTION_STATUS_UNPROTECTED) : + m_snap_protection[i]; + ParentImageInfo parent; + if (!m_image_ctx.old_format) { + if (!m_image_ctx.migration_info.empty()) { + parent = m_image_ctx.parent_md; + auto it = migration_reverse_snap_seq.find(m_snapc.snaps[i].val); + if (it != migration_reverse_snap_seq.end()) { + parent.spec.snap_id = it->second; + parent.overlap = m_snap_infos[i].image_size; + } else { + overlap = std::min(overlap, m_snap_infos[i].image_size); + parent.overlap = overlap; + } + } else { + parent = m_snap_parents[i]; + } + } + m_image_ctx.add_snap(m_snap_infos[i].snapshot_namespace, + m_snap_infos[i].name, m_snapc.snaps[i].val, + m_snap_infos[i].image_size, parent, + protection_status, flags, + m_snap_infos[i].timestamp); + } + m_image_ctx.parent_md.overlap = std::min(overlap, m_image_ctx.size); + m_image_ctx.snapc = m_snapc; + + if (m_image_ctx.snap_id != CEPH_NOSNAP && + m_image_ctx.get_snap_id(m_image_ctx.snap_namespace, + m_image_ctx.snap_name) != m_image_ctx.snap_id) { + lderr(cct) << "tried to read from a snapshot that no longer exists: " + << m_image_ctx.snap_name << dendl; + m_image_ctx.snap_exists = false; + } + + if (m_refresh_parent != nullptr) { + m_refresh_parent->apply(); + } + if (m_image_ctx.data_ctx.is_valid()) { + m_image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(m_image_ctx.snapc.seq, + m_image_ctx.snaps); + } + + // handle dynamically enabled / disabled features + if (m_image_ctx.exclusive_lock != nullptr && + !m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK, + m_image_ctx.snap_lock)) { + // disabling exclusive lock will automatically handle closing + // object map and journaling + ceph_assert(m_exclusive_lock == nullptr); + m_exclusive_lock = m_image_ctx.exclusive_lock; + } else { + if (m_exclusive_lock != nullptr) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr); + std::swap(m_exclusive_lock, m_image_ctx.exclusive_lock); + } + if (!m_image_ctx.test_features(RBD_FEATURE_JOURNALING, + 
m_image_ctx.snap_lock)) { + if (!m_image_ctx.clone_copy_on_read && m_image_ctx.journal != nullptr) { + m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_READ, + false); + } + std::swap(m_journal, m_image_ctx.journal); + } else if (m_journal != nullptr) { + std::swap(m_journal, m_image_ctx.journal); + } + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.snap_lock) || + m_object_map != nullptr) { + std::swap(m_object_map, m_image_ctx.object_map); + } + } + } +} + +template <typename I> +int RefreshRequest<I>::get_parent_info(uint64_t snap_id, + ParentImageInfo *parent_md, + MigrationInfo *migration_info) { + bool migration_info_valid; + int r = get_migration_info(parent_md, migration_info, &migration_info_valid); + if (r < 0) { + return r; + } + + if (migration_info_valid) { + return 0; + } else if (snap_id == CEPH_NOSNAP) { + *parent_md = m_parent_md; + *migration_info = {}; + return 0; + } else { + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + if (m_snapc.snaps[i].val == snap_id) { + *parent_md = m_snap_parents[i]; + *migration_info = {}; + return 0; + } + } + } + return -ENOENT; +} + +template <typename I> +int RefreshRequest<I>::get_migration_info(ParentImageInfo *parent_md, + MigrationInfo *migration_info, + bool* migration_info_valid) { + CephContext *cct = m_image_ctx.cct; + if (m_migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST || + (m_migration_spec.state != cls::rbd::MIGRATION_STATE_PREPARED && + m_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTING && + m_migration_spec.state != cls::rbd::MIGRATION_STATE_ABORTING)) { + if (m_migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_SRC && + m_migration_spec.pool_id != -1 && + m_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTED) { + lderr(cct) << this << " " << __func__ << ": invalid migration spec" + << dendl; + return -EINVAL; + } + + *migration_info_valid = false; + return 0; + } + + parent_md->spec.pool_id = 
m_migration_spec.pool_id; + parent_md->spec.pool_namespace = m_migration_spec.pool_namespace; + parent_md->spec.image_id = m_migration_spec.image_id; + parent_md->spec.snap_id = CEPH_NOSNAP; + parent_md->overlap = std::min(m_size, m_migration_spec.overlap); + + auto snap_seqs = m_migration_spec.snap_seqs; + // If new snapshots have been created on destination image after + // migration started, map the source CEPH_NOSNAP to the earliest of + // these snapshots. + snapid_t snap_id = snap_seqs.empty() ? 0 : snap_seqs.rbegin()->second; + auto it = std::upper_bound(m_snapc.snaps.rbegin(), m_snapc.snaps.rend(), + snap_id); + if (it != m_snapc.snaps.rend()) { + snap_seqs[CEPH_NOSNAP] = *it; + } else { + snap_seqs[CEPH_NOSNAP] = CEPH_NOSNAP; + } + + std::set<uint64_t> snap_ids; + for (auto& it : snap_seqs) { + snap_ids.insert(it.second); + } + uint64_t overlap = snap_ids.find(CEPH_NOSNAP) != snap_ids.end() ? + parent_md->overlap : 0; + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + if (snap_ids.find(m_snapc.snaps[i].val) != snap_ids.end()) { + overlap = std::max(overlap, m_snap_infos[i].image_size); + } + } + + *migration_info = {m_migration_spec.pool_id, m_migration_spec.pool_namespace, + m_migration_spec.image_name, m_migration_spec.image_id, {}, + overlap, m_migration_spec.flatten}; + *migration_info_valid = true; + + deep_copy::util::compute_snap_map(m_image_ctx.cct, 0, CEPH_NOSNAP, {}, + snap_seqs, &migration_info->snap_map); + return 0; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::RefreshRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/RefreshRequest.h b/src/librbd/image/RefreshRequest.h new file mode 100644 index 00000000..0a0f1e8d --- /dev/null +++ b/src/librbd/image/RefreshRequest.h @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H +#define CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H + +#include 
"include/int_types.h" +#include "include/buffer.h" +#include "include/utime.h" +#include "common/snap_types.h" +#include "cls/lock/cls_lock_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Types.h" +#include <string> +#include <vector> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template<typename> class RefreshParentRequest; + +template<typename ImageCtxT = ImageCtx> +class RefreshRequest { +public: + static RefreshRequest *create(ImageCtxT &image_ctx, bool acquiring_lock, + bool skip_open_parent, Context *on_finish) { + return new RefreshRequest(image_ctx, acquiring_lock, skip_open_parent, + on_finish); + } + + RefreshRequest(ImageCtxT &image_ctx, bool acquiring_lock, + bool skip_open_parent, Context *on_finish); + ~RefreshRequest(); + + void send(); + +private: + /** + * @verbatim + * + * <start> < * * * * * * * * * * * * * * * * * * * * * * * * * * (ENOENT) + * ^ | * + * * | (v1) * + * * |-----> V1_READ_HEADER -------------> GET_MIGRATION_HEADER (skip if not + * * | | migrating) + * * | (v2) v + * * \-----> V2_GET_MUTABLE_METADATA V1_GET_SNAPSHOTS + * * | | + * * | -EOPNOTSUPP v + * * | * * * V1_GET_LOCKS + * * | * * | + * * v v * v + * * V2_GET_PARENT <apply> + * * | | + * * v | + * * * * * * GET_MIGRATION_HEADER (skip if not | + * (ENOENT) | migrating) | + * v | + * V2_GET_METADATA | + * | | + * v | + * V2_GET_POOL_METADATA | + * | | + * v (skip if not enabled) | + * V2_GET_OP_FEATURES | + * | | + * v | + * V2_GET_GROUP | + * | | + * | -EOPNOTSUPP | + * | * * * * | + * | * * | + * v v * | + * V2_GET_SNAPSHOTS (skip if no snaps) | + * | | + * v | + * V2_REFRESH_PARENT (skip if no parent or | + * | refresh not needed) | + * v | + * V2_INIT_EXCLUSIVE_LOCK (skip if lock | + * | active or disabled) | + * v | + * V2_OPEN_OBJECT_MAP (skip if map | + * | active or disabled) | + * v | + * V2_OPEN_JOURNAL (skip if journal | + * | active or disabled) | + * v | + * V2_BLOCK_WRITES (skip if journal not | + * | disabled) | + * 
v | + * <apply> | + * | | + * v | + * V2_FINALIZE_REFRESH_PARENT (skip if refresh | + * | not needed) | + * (error) v | + * * * * * > V2_SHUT_DOWN_EXCLUSIVE_LOCK (skip if lock | + * | active or enabled) | + * v | + * V2_CLOSE_JOURNAL (skip if journal inactive | + * | or enabled) | + * v | + * V2_CLOSE_OBJECT_MAP (skip if map inactive | + * | or enabled) | + * | | + * \-------------------\/--------------------/ + * | + * v + * FLUSH (skip if no new + * | snapshots) + * v + * <finish> + * + * @endverbatim + */ + + enum LegacySnapshot { + LEGACY_SNAPSHOT_DISABLED, + LEGACY_SNAPSHOT_ENABLED, + LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP + }; + + ImageCtxT &m_image_ctx; + bool m_acquiring_lock; + bool m_skip_open_parent_image; + Context *m_on_finish; + + cls::rbd::MigrationSpec m_migration_spec; + int m_error_result; + bool m_flush_aio; + decltype(m_image_ctx.exclusive_lock) m_exclusive_lock; + decltype(m_image_ctx.object_map) m_object_map; + decltype(m_image_ctx.journal) m_journal; + RefreshParentRequest<ImageCtxT> *m_refresh_parent; + + bufferlist m_out_bl; + + bool m_legacy_parent = false; + LegacySnapshot m_legacy_snapshot = LEGACY_SNAPSHOT_DISABLED; + + uint8_t m_order = 0; + uint64_t m_size = 0; + uint64_t m_features = 0; + uint64_t m_incompatible_features = 0; + uint64_t m_flags = 0; + uint64_t m_op_features = 0; + + librados::IoCtx m_pool_metadata_io_ctx; + std::string m_last_metadata_key; + std::map<std::string, bufferlist> m_metadata; + + std::string m_object_prefix; + ParentImageInfo m_parent_md; + bool m_head_parent_overlap = false; + cls::rbd::GroupSpec m_group_spec; + + ::SnapContext m_snapc; + std::vector<cls::rbd::SnapshotInfo> m_snap_infos; + std::vector<ParentImageInfo> m_snap_parents; + std::vector<uint8_t> m_snap_protection; + std::vector<uint64_t> m_snap_flags; + + std::map<rados::cls::lock::locker_id_t, + rados::cls::lock::locker_info_t> m_lockers; + std::string m_lock_tag; + bool m_exclusive_locked = false; + + bool m_blocked_writes = false; + bool 
m_incomplete_update = false; + + void send_get_migration_header(); + Context *handle_get_migration_header(int *result); + + void send_v1_read_header(); + Context *handle_v1_read_header(int *result); + + void send_v1_get_snapshots(); + Context *handle_v1_get_snapshots(int *result); + + void send_v1_get_locks(); + Context *handle_v1_get_locks(int *result); + + void send_v1_apply(); + Context *handle_v1_apply(int *result); + + void send_v2_get_mutable_metadata(); + Context *handle_v2_get_mutable_metadata(int *result); + + void send_v2_get_parent(); + Context *handle_v2_get_parent(int *result); + + void send_v2_get_metadata(); + Context *handle_v2_get_metadata(int *result); + + void send_v2_get_pool_metadata(); + Context *handle_v2_get_pool_metadata(int *result); + + void send_v2_get_op_features(); + Context *handle_v2_get_op_features(int *result); + + void send_v2_get_group(); + Context *handle_v2_get_group(int *result); + + void send_v2_get_snapshots(); + Context *handle_v2_get_snapshots(int *result); + + void send_v2_get_snapshots_legacy(); + Context *handle_v2_get_snapshots_legacy(int *result); + + void send_v2_refresh_parent(); + Context *handle_v2_refresh_parent(int *result); + + void send_v2_init_exclusive_lock(); + Context *handle_v2_init_exclusive_lock(int *result); + + void send_v2_open_journal(); + Context *handle_v2_open_journal(int *result); + + void send_v2_block_writes(); + Context *handle_v2_block_writes(int *result); + + void send_v2_open_object_map(); + Context *handle_v2_open_object_map(int *result); + + void send_v2_apply(); + Context *handle_v2_apply(int *result); + + Context *send_v2_finalize_refresh_parent(); + Context *handle_v2_finalize_refresh_parent(int *result); + + Context *send_v2_shut_down_exclusive_lock(); + Context *handle_v2_shut_down_exclusive_lock(int *result); + + Context *send_v2_close_journal(); + Context *handle_v2_close_journal(int *result); + + Context *send_v2_close_object_map(); + Context *handle_v2_close_object_map(int 
*result); + + Context *send_flush_aio(); + Context *handle_flush_aio(int *result); + + Context *handle_error(int *result); + + void save_result(int *result) { + if (m_error_result == 0 && *result < 0) { + m_error_result = *result; + } + } + + void apply(); + int get_parent_info(uint64_t snap_id, ParentImageInfo *parent_md, + MigrationInfo *migration_info); + int get_migration_info(ParentImageInfo *parent_md, + MigrationInfo *migration_info, + bool* migration_info_valid); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::RefreshRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H diff --git a/src/librbd/image/RemoveRequest.cc b/src/librbd/image/RemoveRequest.cc new file mode 100644 index 00000000..8e029f8b --- /dev/null +++ b/src/librbd/image/RemoveRequest.cc @@ -0,0 +1,605 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/RemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/internal.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/MirroringWatcher.h" +#include "librbd/image/DetachChildRequest.h" +#include "librbd/image/PreRemoveRequest.h" +#include "librbd/journal/RemoveRequest.h" +#include "librbd/mirror/DisableRequest.h" +#include "librbd/operation/TrimRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::RemoveRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace image { + +using librados::IoCtx; +using util::create_context_callback; +using util::create_async_context_callback; +using util::create_rados_callback; + +template<typename I> +RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, const std::string &image_name, + const std::string &image_id, bool force, + bool from_trash_remove, + ProgressContext &prog_ctx, + ContextWQ 
*op_work_queue, Context *on_finish) + : m_ioctx(ioctx), m_image_name(image_name), m_image_id(image_id), + m_force(force), m_from_trash_remove(from_trash_remove), + m_prog_ctx(prog_ctx), m_op_work_queue(op_work_queue), + m_on_finish(on_finish) { + m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); +} + +template<typename I> +RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, I *image_ctx, bool force, + bool from_trash_remove, + ProgressContext &prog_ctx, + ContextWQ *op_work_queue, Context *on_finish) + : m_ioctx(ioctx), m_image_name(image_ctx->name), m_image_id(image_ctx->id), + m_image_ctx(image_ctx), m_force(force), + m_from_trash_remove(from_trash_remove), m_prog_ctx(prog_ctx), + m_op_work_queue(op_work_queue), m_on_finish(on_finish), + m_cct(image_ctx->cct), m_header_oid(image_ctx->header_oid), + m_old_format(image_ctx->old_format), m_unknown_format(false) { +} + +template<typename I> +void RemoveRequest<I>::send() { + ldout(m_cct, 20) << dendl; + + open_image(); +} + +template<typename I> +void RemoveRequest<I>::open_image() { + if (m_image_ctx != nullptr) { + pre_remove_image(); + return; + } + + m_image_ctx = I::create(m_image_id.empty() ? 
m_image_name : "", m_image_id, + nullptr, m_ioctx, false); + + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_open_image>( + this); + + m_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx); +} + +template<typename I> +void RemoveRequest<I>::handle_open_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_image_ctx->destroy(); + m_image_ctx = nullptr; + + if (r != -ENOENT) { + lderr(m_cct) << "error opening image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + remove_image(); + return; + } + + m_image_id = m_image_ctx->id; + m_image_name = m_image_ctx->name; + m_header_oid = m_image_ctx->header_oid; + m_old_format = m_image_ctx->old_format; + m_unknown_format = false; + + pre_remove_image(); +} + +template<typename I> +void RemoveRequest<I>::pre_remove_image() { + ldout(m_cct, 5) << dendl; + + auto ctx = create_context_callback< + RemoveRequest<I>, &RemoveRequest<I>::handle_pre_remove_image>(this); + auto req = PreRemoveRequest<I>::create(m_image_ctx, m_force, ctx); + req->send(); +} + +template<typename I> +void RemoveRequest<I>::handle_pre_remove_image(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r < 0) { + send_close_image(r); + return; + } + + if (!m_image_ctx->data_ctx.is_valid()) { + detach_child(); + return; + } + + trim_image(); +} + +template<typename I> +void RemoveRequest<I>::trim_image() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest<I>; + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + klass, &klass::handle_trim_image>(this)); + + RWLock::RLocker owner_lock(m_image_ctx->owner_lock); + auto req = librbd::operation::TrimRequest<I>::create( + *m_image_ctx, ctx, m_image_ctx->size, 0, m_prog_ctx); + req->send(); +} + +template<typename I> +void RemoveRequest<I>::handle_trim_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << 
"failed to remove some object(s): " + << cpp_strerror(r) << dendl; + send_close_image(r); + return; + } + + if (m_old_format) { + send_close_image(r); + return; + } + + detach_child(); +} + +template<typename I> +void RemoveRequest<I>::detach_child() { + ldout(m_cct, 20) << dendl; + + auto ctx = create_context_callback< + RemoveRequest<I>, &RemoveRequest<I>::handle_detach_child>(this); + auto req = DetachChildRequest<I>::create(*m_image_ctx, ctx); + req->send(); +} + +template<typename I> +void RemoveRequest<I>::handle_detach_child(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to detach child from parent: " + << cpp_strerror(r) << dendl; + send_close_image(r); + return; + } + + send_disable_mirror(); +} + +template<typename I> +void RemoveRequest<I>::send_disable_mirror() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_disable_mirror>(this); + + mirror::DisableRequest<I> *req = + mirror::DisableRequest<I>::create(m_image_ctx, m_force, !m_force, ctx); + req->send(); +} + +template<typename I> +void RemoveRequest<I>::handle_disable_mirror(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + r = 0; + } else if (r < 0) { + lderr(m_cct) << "error disabling image mirroring: " + << cpp_strerror(r) << dendl; + } + + send_close_image(r); +} + +template<typename I> +void RemoveRequest<I>::send_close_image(int r) { + ldout(m_cct, 20) << dendl; + + m_ret_val = r; + using klass = RemoveRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_send_close_image>(this); + + m_image_ctx->state->close(ctx); +} + +template<typename I> +void RemoveRequest<I>::handle_send_close_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error encountered while closing image: " + << cpp_strerror(r) << dendl; + } + + m_image_ctx->destroy(); + m_image_ctx = nullptr; + if 
(m_ret_val < 0) { + r = m_ret_val; + finish(r); + return; + } + + remove_header(); +} + +template<typename I> +void RemoveRequest<I>::remove_header() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_remove_header>(this); + int r = m_ioctx.aio_remove(m_header_oid, rados_completion); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void RemoveRequest<I>::handle_remove_header(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing header: " << cpp_strerror(r) << dendl; + m_ret_val = r; + } + + remove_image(); +} + +template<typename I> +void RemoveRequest<I>::remove_header_v2() { + ldout(m_cct, 20) << dendl; + + if (m_header_oid.empty()) { + m_header_oid = util::header_name(m_image_id); + } + + using klass = RemoveRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_remove_header_v2>(this); + int r = m_ioctx.aio_remove(m_header_oid, rados_completion); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void RemoveRequest<I>::handle_remove_header_v2(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing header: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_journal_remove(); +} + +template<typename I> +void RemoveRequest<I>::send_journal_remove() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_journal_remove>(this); + + journal::RemoveRequest<I> *req = journal::RemoveRequest<I>::create( + m_ioctx, m_image_id, Journal<>::IMAGE_CLIENT_ID, m_op_work_queue, ctx); + req->send(); +} + +template<typename I> +void RemoveRequest<I>::handle_journal_remove(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != 
-ENOENT) { + lderr(m_cct) << "failed to remove image journal: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else { + r = 0; + } + + send_object_map_remove(); +} + +template<typename I> +void RemoveRequest<I>::send_object_map_remove() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_object_map_remove>(this); + + int r = ObjectMap<>::aio_remove(m_ioctx, + m_image_id, + rados_completion); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void RemoveRequest<I>::handle_object_map_remove(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to remove image journal: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else { + r = 0; + } + + mirror_image_remove(); +} + +template<typename I> +void RemoveRequest<I>::mirror_image_remove() { + ldout(m_cct, 20) << dendl; + + librados::ObjectWriteOperation op; + cls_client::mirror_image_remove(&op, m_image_id); + + using klass = RemoveRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_mirror_image_remove>(this); + int r = m_ioctx.aio_operate(RBD_MIRRORING, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void RemoveRequest<I>::handle_mirror_image_remove(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) { + lderr(m_cct) << "failed to remove mirror image state: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (m_from_trash_remove) { + // both the id object and the directory entry have been removed in + // a previous call to trash_move. 
+ finish(0); + return; + } + + remove_id_object(); +} + +template<typename I> +void RemoveRequest<I>::remove_image() { + ldout(m_cct, 20) << dendl; + + if (m_old_format || m_unknown_format) { + remove_v1_image(); + } else { + remove_v2_image(); + } +} + +template<typename I> +void RemoveRequest<I>::remove_v1_image() { + ldout(m_cct, 20) << dendl; + + Context *ctx = new FunctionContext([this] (int r) { + r = tmap_rm(m_ioctx, m_image_name); + handle_remove_v1_image(r); + }); + + m_op_work_queue->queue(ctx, 0); +} + +template<typename I> +void RemoveRequest<I>::handle_remove_v1_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_old_format = (r == 0); + if (r == 0 || (r < 0 && !m_unknown_format)) { + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing image from v1 directory: " + << cpp_strerror(r) << dendl; + } + + m_on_finish->complete(r); + delete this; + return; + } + + if (!m_old_format) { + remove_v2_image(); + } +} + +template<typename I> +void RemoveRequest<I>::remove_v2_image() { + ldout(m_cct, 20) << dendl; + + if (m_image_id.empty()) { + dir_get_image_id(); + return; + } else if (m_image_name.empty()) { + dir_get_image_name(); + return; + } + + remove_header_v2(); + return; +} + +template<typename I> +void RemoveRequest<I>::dir_get_image_id() { + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::dir_get_id_start(&op, m_image_name); + + using klass = RemoveRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_dir_get_image_id>(this); + m_out_bl.clear(); + int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void RemoveRequest<I>::handle_dir_get_image_id(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error fetching image id: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if 
(r == 0) { + auto iter = m_out_bl.cbegin(); + r = librbd::cls_client::dir_get_id_finish(&iter, &m_image_id); + if (r < 0) { + finish(r); + return; + } + } + + remove_header_v2(); +} + +template<typename I> +void RemoveRequest<I>::dir_get_image_name() { + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::dir_get_name_start(&op, m_image_id); + + using klass = RemoveRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_dir_get_image_name>(this); + m_out_bl.clear(); + int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void RemoveRequest<I>::handle_dir_get_image_name(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error fetching image name: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = librbd::cls_client::dir_get_name_finish(&iter, &m_image_name); + if (r < 0) { + finish(r); + return; + } + } + + remove_header_v2(); +} + +template<typename I> +void RemoveRequest<I>::remove_id_object() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_remove_id_object>(this); + int r = m_ioctx.aio_remove(util::id_obj_name(m_image_name), rados_completion); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void RemoveRequest<I>::handle_remove_id_object(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing id object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + dir_remove_image(); +} + +template<typename I> +void RemoveRequest<I>::dir_remove_image() { + ldout(m_cct, 20) << dendl; + + librados::ObjectWriteOperation op; + 
librbd::cls_client::dir_remove_image(&op, m_image_name, m_image_id); + + using klass = RemoveRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_dir_remove_image>(this); + int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template<typename I> +void RemoveRequest<I>::handle_dir_remove_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing image from v2 directory: " + << cpp_strerror(r) << dendl; + } + + finish(r); +} + +template<typename I> +void RemoveRequest<I>::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::RemoveRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/RemoveRequest.h b/src/librbd/image/RemoveRequest.h new file mode 100644 index 00000000..98d59764 --- /dev/null +++ b/src/librbd/image/RemoveRequest.h @@ -0,0 +1,198 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "librbd/ImageCtx.h" +#include "librbd/image/TypeTraits.h" + +#include <list> + +class Context; +class ContextWQ; +class SafeTimer; + +namespace librbd { + +class ProgressContext; + +namespace image { + +template<typename ImageCtxT = ImageCtx> +class RemoveRequest { +private: + // mock unit testing support + typedef ::librbd::image::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::ContextWQ ContextWQ; +public: + static RemoveRequest *create(librados::IoCtx &ioctx, + const std::string &image_name, + const std::string &image_id, + bool force, bool from_trash_remove, + ProgressContext &prog_ctx, + ContextWQ *op_work_queue, + Context *on_finish) { + return 
new RemoveRequest(ioctx, image_name, image_id, force, + from_trash_remove, prog_ctx, op_work_queue, + on_finish); + } + + static RemoveRequest *create(librados::IoCtx &ioctx, ImageCtxT *image_ctx, + bool force, bool from_trash_remove, + ProgressContext &prog_ctx, + ContextWQ *op_work_queue, + Context *on_finish) { + return new RemoveRequest(ioctx, image_ctx, force, from_trash_remove, + prog_ctx, op_work_queue, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * (skip if already opened) OPEN IMAGE------------------\ + * | | + * v | + * PRE REMOVE IMAGE * * * | + * | * | + * v * | + * (skip if invalid data pool) TRIM IMAGE * * * * * | + * | * | + * v * | + * DETACH CHILD * | + * | * | + * v * v + * CLOSE IMAGE < * * * * | + * | | + * error v | + * /------<--------\ REMOVE HEADER<--------------/ + * | | / | + * | |-------<-------/ | + * | | v + * | | REMOVE JOURNAL + * | | / | + * | |-------<-------/ | + * | | v + * v ^ REMOVE OBJECTMAP + * | | / | + * | |-------<-------/ | + * | | v + * | | REMOVE MIRROR IMAGE + * | | / | + * | |-------<-------/ | + * | | v + * | | REMOVE ID OBJECT + * | | / | + * | |-------<-------/ | + * | | v + * | | REMOVE IMAGE + * | | / | + * | \-------<-------/ | + * | v + * \------------------>------------<finish> + * + * @endverbatim + */ + + RemoveRequest(librados::IoCtx &ioctx, const std::string &image_name, + const std::string &image_id, bool force, bool from_trash_remove, + ProgressContext &prog_ctx, ContextWQ *op_work_queue, + Context *on_finish); + + RemoveRequest(librados::IoCtx &ioctx, ImageCtxT *image_ctx, bool force, + bool from_trash_remove, ProgressContext &prog_ctx, + ContextWQ *op_work_queue, Context *on_finish); + + librados::IoCtx &m_ioctx; + std::string m_image_name; + std::string m_image_id; + ImageCtxT *m_image_ctx = nullptr; + bool m_force; + bool m_from_trash_remove; + ProgressContext &m_prog_ctx; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + 
std::string m_header_oid; + bool m_old_format = false; + bool m_unknown_format = true; + + librados::IoCtx m_parent_io_ctx; + + decltype(m_image_ctx->exclusive_lock) m_exclusive_lock = nullptr; + + int m_ret_val = 0; + bufferlist m_out_bl; + std::list<obj_watch_t> m_watchers; + + std::map<uint64_t, SnapInfo> m_snap_infos; + + void open_image(); + void handle_open_image(int r); + + void send_journal_remove(); + void handle_journal_remove(int r); + + void send_object_map_remove(); + void handle_object_map_remove(int r); + + void mirror_image_remove(); + void handle_mirror_image_remove(int r); + + void pre_remove_image(); + void handle_pre_remove_image(int r); + + void trim_image(); + void handle_trim_image(int r); + + void detach_child(); + void handle_detach_child(int r); + + void send_disable_mirror(); + void handle_disable_mirror(int r); + + void send_close_image(int r); + void handle_send_close_image(int r); + + void remove_header(); + void handle_remove_header(int r); + + void remove_header_v2(); + void handle_remove_header_v2(int r); + + void remove_image(); + + void remove_v1_image(); + void handle_remove_v1_image(int r); + + void remove_v2_image(); + + void dir_get_image_id(); + void handle_dir_get_image_id(int r); + + void dir_get_image_name(); + void handle_dir_get_image_name(int r); + + void remove_id_object(); + void handle_remove_id_object(int r); + + void dir_remove_image(); + void handle_dir_remove_image(int r); + + void finish(int r); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::RemoveRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H diff --git a/src/librbd/image/SetFlagsRequest.cc b/src/librbd/image/SetFlagsRequest.cc new file mode 100644 index 00000000..a18c7499 --- /dev/null +++ b/src/librbd/image/SetFlagsRequest.cc @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/SetFlagsRequest.h" 
+#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::SetFlagsRequest: " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +SetFlagsRequest<I>::SetFlagsRequest(I *image_ctx, uint64_t flags, + uint64_t mask, Context *on_finish) + : m_image_ctx(image_ctx), m_flags(flags), m_mask(mask), + m_on_finish(on_finish) { +} + +template <typename I> +void SetFlagsRequest<I>::send() { + send_set_flags(); +} + +template <typename I> +void SetFlagsRequest<I>::send_set_flags() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + RWLock::WLocker snap_locker(m_image_ctx->snap_lock); + std::vector<uint64_t> snap_ids; + snap_ids.push_back(CEPH_NOSNAP); + for (auto it : m_image_ctx->snap_info) { + snap_ids.push_back(it.first); + } + + Context *ctx = create_context_callback< + SetFlagsRequest<I>, &SetFlagsRequest<I>::handle_set_flags>(this); + C_Gather *gather_ctx = new C_Gather(cct, ctx); + + for (auto snap_id : snap_ids) { + librados::ObjectWriteOperation op; + cls_client::set_flags(&op, snap_id, m_flags, m_mask); + + librados::AioCompletion *comp = + create_rados_callback(gather_ctx->new_sub()); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + gather_ctx->activate(); +} + +template <typename I> +Context *SetFlagsRequest<I>::handle_set_flags(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "set_flags failed: " << cpp_strerror(*result) + << dendl; + } + return m_on_finish; +} + +} // namespace image +} // namespace librbd + +template class 
/**
 * Asynchronous request that applies a flags/mask pair to an image.
 *
 * send() starts the request; on_finish is returned from the final handler
 * once the SET_FLAGS state has completed.
 */
template <typename ImageCtxT = ImageCtx>
class SetFlagsRequest {
public:
  /// Heap-allocates a new request; call send() on the result to start it.
  static SetFlagsRequest *create(ImageCtxT *image_ctx, uint64_t flags,
                                 uint64_t mask, Context *on_finish) {
    return new SetFlagsRequest(image_ctx, flags, mask, on_finish);
  }

  /// Starts the state machine.
  void send();

private:
  /**
   * @verbatim
   *
   * <start>
   *    |     . . .
   *    v     v   .
   * SET_FLAGS    . (for every snapshot)
   *    |         .
   *    v . . . . .
   * <finish>
   *
   * @endverbatim
   */

  SetFlagsRequest(ImageCtxT *image_ctx, uint64_t flags, uint64_t mask,
                  Context *on_finish);

  ImageCtxT *m_image_ctx;   // image the flags are applied to
  uint64_t m_flags;         // flag values to set
  uint64_t m_mask;          // mask passed alongside m_flags to set_flags
  Context *m_on_finish;     // completion returned by handle_set_flags()

  void send_set_flags();
  Context *handle_set_flags(int *result);
};
SetSnapRequest<I>::send_init_exclusive_lock() { + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + if (m_image_ctx.exclusive_lock != nullptr) { + ceph_assert(m_image_ctx.snap_id == CEPH_NOSNAP); + send_complete(); + return; + } + } + + if (m_image_ctx.read_only || + !m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) { + int r = 0; + if (send_refresh_parent(&r) != nullptr) { + send_complete(); + } + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + m_exclusive_lock = ExclusiveLock<I>::create(m_image_ctx); + + using klass = SetSnapRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_init_exclusive_lock>(this); + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + m_exclusive_lock->init(m_image_ctx.features, ctx); +} + +template <typename I> +Context *SetSnapRequest<I>::handle_init_exclusive_lock(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to initialize exclusive lock: " + << cpp_strerror(*result) << dendl; + finalize(); + return m_on_finish; + } + return send_refresh_parent(result); +} + +template <typename I> +void SetSnapRequest<I>::send_block_writes() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + m_writes_blocked = true; + + using klass = SetSnapRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_block_writes>(this); + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + m_image_ctx.io_work_queue->block_writes(ctx); +} + +template <typename I> +Context *SetSnapRequest<I>::handle_block_writes(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) + << dendl; + finalize(); + return m_on_finish; + } + + { + RWLock::RLocker 
snap_locker(m_image_ctx.snap_lock); + auto it = m_image_ctx.snap_info.find(m_snap_id); + if (it == m_image_ctx.snap_info.end()) { + ldout(cct, 5) << "failed to locate snapshot '" << m_snap_id << "'" + << dendl; + + *result = -ENOENT; + finalize(); + return m_on_finish; + } + } + + return send_shut_down_exclusive_lock(result); +} + +template <typename I> +Context *SetSnapRequest<I>::send_shut_down_exclusive_lock(int *result) { + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + m_exclusive_lock = m_image_ctx.exclusive_lock; + } + + if (m_exclusive_lock == nullptr) { + return send_refresh_parent(result); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + using klass = SetSnapRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_shut_down_exclusive_lock>(this); + m_exclusive_lock->shut_down(ctx); + return nullptr; +} + +template <typename I> +Context *SetSnapRequest<I>::handle_shut_down_exclusive_lock(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to shut down exclusive lock: " + << cpp_strerror(*result) << dendl; + finalize(); + return m_on_finish; + } + + return send_refresh_parent(result); +} + +template <typename I> +Context *SetSnapRequest<I>::send_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + + ParentImageInfo parent_md; + bool refresh_parent; + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + RWLock::RLocker parent_locker(m_image_ctx.parent_lock); + + const auto parent_info = m_image_ctx.get_parent_info(m_snap_id); + if (parent_info == nullptr) { + *result = -ENOENT; + lderr(cct) << "failed to retrieve snapshot parent info" << dendl; + finalize(); + return m_on_finish; + } + + parent_md = *parent_info; + refresh_parent = RefreshParentRequest<I>::is_refresh_required( + m_image_ctx, parent_md, m_image_ctx.migration_info); + } + + if (!refresh_parent) { 
+ if (m_snap_id == CEPH_NOSNAP) { + // object map is loaded when exclusive lock is acquired + *result = apply(); + finalize(); + return m_on_finish; + } else { + // load snapshot object map + return send_open_object_map(result); + } + } + + ldout(cct, 10) << __func__ << dendl; + + using klass = SetSnapRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_refresh_parent>(this); + m_refresh_parent = RefreshParentRequest<I>::create(m_image_ctx, parent_md, + m_image_ctx.migration_info, + ctx); + m_refresh_parent->send(); + return nullptr; +} + +template <typename I> +Context *SetSnapRequest<I>::handle_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to refresh snapshot parent: " << cpp_strerror(*result) + << dendl; + finalize(); + return m_on_finish; + } + + if (m_snap_id == CEPH_NOSNAP) { + // object map is loaded when exclusive lock is acquired + *result = apply(); + if (*result < 0) { + finalize(); + return m_on_finish; + } + + return send_finalize_refresh_parent(result); + } else { + // load snapshot object map + return send_open_object_map(result); + } +} + +template <typename I> +Context *SetSnapRequest<I>::send_open_object_map(int *result) { + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + *result = apply(); + if (*result < 0) { + finalize(); + return m_on_finish; + } + + return send_finalize_refresh_parent(result); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + using klass = SetSnapRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_open_object_map>(this); + m_object_map = ObjectMap<I>::create(m_image_ctx, m_snap_id); + m_object_map->open(ctx); + return nullptr; +} + +template <typename I> +Context *SetSnapRequest<I>::handle_open_object_map(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << 
*result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to open object map: " << cpp_strerror(*result) + << dendl; + delete m_object_map; + m_object_map = nullptr; + } + + *result = apply(); + if (*result < 0) { + finalize(); + return m_on_finish; + } + + return send_finalize_refresh_parent(result); +} + +template <typename I> +Context *SetSnapRequest<I>::send_finalize_refresh_parent(int *result) { + if (m_refresh_parent == nullptr) { + finalize(); + return m_on_finish; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = SetSnapRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_finalize_refresh_parent>(this); + m_refresh_parent->finalize(ctx); + return nullptr; +} + +template <typename I> +Context *SetSnapRequest<I>::handle_finalize_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to close parent image: " << cpp_strerror(*result) + << dendl; + } + finalize(); + return m_on_finish; +} + +template <typename I> +int SetSnapRequest<I>::apply() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + RWLock::WLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + RWLock::WLocker parent_locker(m_image_ctx.parent_lock); + if (m_snap_id != CEPH_NOSNAP) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr); + int r = m_image_ctx.snap_set(m_snap_id); + if (r < 0) { + return r; + } + } else { + std::swap(m_image_ctx.exclusive_lock, m_exclusive_lock); + m_image_ctx.snap_unset(); + } + + if (m_refresh_parent != nullptr) { + m_refresh_parent->apply(); + } + + std::swap(m_object_map, m_image_ctx.object_map); + return 0; +} + +template <typename I> +void SetSnapRequest<I>::finalize() { + if (m_writes_blocked) { + m_image_ctx.io_work_queue->unblock_writes(); + 
m_writes_blocked = false; + } +} + +template <typename I> +void SetSnapRequest<I>::send_complete() { + finalize(); + m_on_finish->complete(0); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::SetSnapRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/SetSnapRequest.h b/src/librbd/image/SetSnapRequest.h new file mode 100644 index 00000000..c12ea9f2 --- /dev/null +++ b/src/librbd/image/SetSnapRequest.h @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H +#define CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H + +#include "cls/rbd/cls_rbd_client.h" +#include <string> + +class Context; + +namespace librbd { + +template <typename> class ExclusiveLock; +class ImageCtx; +template <typename> class ObjectMap; + +namespace image { + +template <typename> class RefreshParentRequest; + +template <typename ImageCtxT = ImageCtx> +class SetSnapRequest { +public: + static SetSnapRequest *create(ImageCtxT &image_ctx, uint64_t snap_id, + Context *on_finish) { + return new SetSnapRequest(image_ctx, snap_id, on_finish); + } + + ~SetSnapRequest(); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * | (set snap) + * |-----------> BLOCK_WRITES + * | | + * | v + * | SHUTDOWN_EXCLUSIVE_LOCK (skip if lock inactive + * | | or disabled) + * | v + * | REFRESH_PARENT (skip if no parent + * | | or refresh not needed) + * | v + * | OPEN_OBJECT_MAP (skip if map disabled) + * | | + * | v + * | <apply> + * | | + * | v + * | FINALIZE_REFRESH_PARENT (skip if no parent + * | | or refresh not needed) + * | v + * | <finish> + * | + * \-----------> INIT_EXCLUSIVE_LOCK (skip if active or + * | disabled) + * v + * REFRESH_PARENT (skip if no parent + * | or refresh not needed) + * v + * <apply> + * | + * v + * FINALIZE_REFRESH_PARENT (skip if no parent + * | or refresh not needed) + * v + * <finish> + * + * @endverbatim + */ + + 
SetSnapRequest(ImageCtxT &image_ctx, uint64_t snap_id, Context *on_finish); + + ImageCtxT &m_image_ctx; + uint64_t m_snap_id; + Context *m_on_finish; + + ExclusiveLock<ImageCtxT> *m_exclusive_lock; + ObjectMap<ImageCtxT> *m_object_map; + RefreshParentRequest<ImageCtxT> *m_refresh_parent; + + bool m_writes_blocked; + + void send_block_writes(); + Context *handle_block_writes(int *result); + + void send_init_exclusive_lock(); + Context *handle_init_exclusive_lock(int *result); + + Context *send_shut_down_exclusive_lock(int *result); + Context *handle_shut_down_exclusive_lock(int *result); + + Context *send_refresh_parent(int *result); + Context *handle_refresh_parent(int *result); + + Context *send_open_object_map(int *result); + Context *handle_open_object_map(int *result); + + Context *send_finalize_refresh_parent(int *result); + Context *handle_finalize_refresh_parent(int *result); + + int apply(); + void finalize(); + void send_complete(); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::SetSnapRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H diff --git a/src/librbd/image/TypeTraits.h b/src/librbd/image/TypeTraits.h new file mode 100644 index 00000000..17d84fc8 --- /dev/null +++ b/src/librbd/image/TypeTraits.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H +#define CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H + +class ContextWQ; + +namespace librbd { +namespace image { + +template <typename ImageCtxT> +struct TypeTraits { + typedef ::ContextWQ ContextWQ; +}; + +} // namespace image +} // namespace librbd + +#endif // CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H diff --git a/src/librbd/image/ValidatePoolRequest.cc b/src/librbd/image/ValidatePoolRequest.cc new file mode 100644 index 00000000..2214f81b --- /dev/null +++ b/src/librbd/image/ValidatePoolRequest.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/ValidatePoolRequest.h" +#include "include/rados/librados.hpp" +#include "include/ceph_assert.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::ValidatePoolRequest: " \ + << __func__ << ": " + +namespace librbd { +namespace image { + +namespace { + +const std::string OVERWRITE_VALIDATED("overwrite validated"); +const std::string VALIDATE("validate"); + +} // anonymous namespace + +using util::create_rados_callback; +using util::create_context_callback; +using util::create_async_context_callback; + +template <typename I> +ValidatePoolRequest<I>::ValidatePoolRequest(librados::IoCtx& io_ctx, + ContextWQ *op_work_queue, + Context *on_finish) + : m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + // validatation should occur in default namespace + m_io_ctx.dup(io_ctx); + m_io_ctx.set_namespace(""); + } + +template <typename I> +void ValidatePoolRequest<I>::send() { + read_rbd_info(); +} + +template <typename I> +void ValidatePoolRequest<I>::read_rbd_info() { + ldout(m_cct, 5) << dendl; + + auto comp = create_rados_callback< + ValidatePoolRequest<I>, + &ValidatePoolRequest<I>::handle_read_rbd_info>(this); + + librados::ObjectReadOperation op; + op.read(0, 0, nullptr, nullptr); + + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void ValidatePoolRequest<I>::handle_read_rbd_info(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r >= 0) { + bufferlist validated_bl; + validated_bl.append(OVERWRITE_VALIDATED); + + bufferlist validate_bl; + validate_bl.append(VALIDATE); + + if (m_out_bl.contents_equal(validated_bl)) 
{ + // already validated pool + finish(0); + return; + } else if (m_out_bl.contents_equal(validate_bl)) { + // implies snapshot was already successfully created + overwrite_rbd_info(); + return; + } + } else if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to read RBD info: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + create_snapshot(); +} + +template <typename I> +void ValidatePoolRequest<I>::create_snapshot() { + ldout(m_cct, 5) << dendl; + + // allocate a self-managed snapshot id if this a new pool to force + // self-managed snapshot mode + auto ctx = new FunctionContext([this](int r) { + r = m_io_ctx.selfmanaged_snap_create(&m_snap_id); + handle_create_snapshot(r); + }); + m_op_work_queue->queue(ctx, 0); +} + +template <typename I> +void ValidatePoolRequest<I>::handle_create_snapshot(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r == -EINVAL) { + lderr(m_cct) << "pool not configured for self-managed RBD snapshot support" + << dendl; + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to allocate self-managed snapshot: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + write_rbd_info(); +} + +template <typename I> +void ValidatePoolRequest<I>::write_rbd_info() { + ldout(m_cct, 5) << dendl; + + bufferlist bl; + bl.append(VALIDATE); + + librados::ObjectWriteOperation op; + op.create(true); + op.write(0, bl); + + auto comp = create_rados_callback< + ValidatePoolRequest<I>, + &ValidatePoolRequest<I>::handle_write_rbd_info>(this); + int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void ValidatePoolRequest<I>::handle_write_rbd_info(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + lderr(m_cct) << "pool missing required overwrite support" << dendl; + m_ret_val = -EINVAL; + } else if (r < 0 && r != -EEXIST) { + lderr(m_cct) << "failed to write RBD info: " << cpp_strerror(r) << dendl; + m_ret_val = r; + 
} + + remove_snapshot(); +} + +template <typename I> +void ValidatePoolRequest<I>::remove_snapshot() { + ldout(m_cct, 5) << dendl; + + auto ctx = new FunctionContext([this](int r) { + r = m_io_ctx.selfmanaged_snap_remove(m_snap_id); + handle_remove_snapshot(r); + }); + m_op_work_queue->queue(ctx, 0); +} + +template <typename I> +void ValidatePoolRequest<I>::handle_remove_snapshot(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r < 0) { + // not a fatal error + lderr(m_cct) << "failed to remove validation snapshot: " << cpp_strerror(r) + << dendl; + } + + if (m_ret_val < 0) { + finish(m_ret_val); + return; + } + + overwrite_rbd_info(); +} + +template <typename I> +void ValidatePoolRequest<I>::overwrite_rbd_info() { + ldout(m_cct, 5) << dendl; + + bufferlist bl; + bl.append(OVERWRITE_VALIDATED); + + librados::ObjectWriteOperation op; + op.write(0, bl); + + auto comp = create_rados_callback< + ValidatePoolRequest<I>, + &ValidatePoolRequest<I>::handle_overwrite_rbd_info>(this); + int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void ValidatePoolRequest<I>::handle_overwrite_rbd_info(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + lderr(m_cct) << "pool missing required overwrite support" << dendl; + finish(-EINVAL); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to validate overwrite support: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void ValidatePoolRequest<I>::finish(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::ValidatePoolRequest<librbd::ImageCtx>; diff --git a/src/librbd/image/ValidatePoolRequest.h b/src/librbd/image/ValidatePoolRequest.h new file mode 100644 index 00000000..8ea6e4a0 --- /dev/null +++ b/src/librbd/image/ValidatePoolRequest.h @@ -0,0 
+1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H +#define CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H + +#include "include/rados/librados.hpp" +#include "include/buffer.h" + +class CephContext; +class Context; +class ContextWQ; + +namespace librbd { + +struct ImageCtx; + +namespace image { + +template <typename ImageCtxT> +class ValidatePoolRequest { +public: + static ValidatePoolRequest* create(librados::IoCtx& io_ctx, + ContextWQ *op_work_queue, + Context *on_finish) { + return new ValidatePoolRequest(io_ctx, op_work_queue, on_finish); + } + + ValidatePoolRequest(librados::IoCtx& io_ctx, ContextWQ *op_work_queue, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v (overwrites validated) + * READ RBD INFO . . . . . . . . . + * | . . + * | . (snapshots validated) . + * | . . . . . . . . . . . + * v . . + * CREATE SNAPSHOT . . + * | . . + * v . . + * WRITE RBD INFO . . + * | . . + * v . . + * REMOVE SNAPSHOT . . + * | . . + * v . . + * OVERWRITE RBD INFO < . . . . + * | . + * v . + * <finish> < . . . . . . . . . 
.` + * + * @endverbatim + */ + + librados::IoCtx m_io_ctx; + CephContext* m_cct; + ContextWQ* m_op_work_queue; + Context* m_on_finish; + + int m_ret_val = 0; + bufferlist m_out_bl; + uint64_t m_snap_id = 0; + + void read_rbd_info(); + void handle_read_rbd_info(int r); + + void create_snapshot(); + void handle_create_snapshot(int r); + + void write_rbd_info(); + void handle_write_rbd_info(int r); + + void remove_snapshot(); + void handle_remove_snapshot(int r); + + void overwrite_rbd_info(); + void handle_overwrite_rbd_info(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::ValidatePoolRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H diff --git a/src/librbd/image_watcher/NotifyLockOwner.cc b/src/librbd/image_watcher/NotifyLockOwner.cc new file mode 100644 index 00000000..ead5f214 --- /dev/null +++ b/src/librbd/image_watcher/NotifyLockOwner.cc @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image_watcher/NotifyLockOwner.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/WatchNotifyTypes.h" +#include "librbd/watcher/Notifier.h" +#include <map> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image_watcher::NotifyLockOwner: " \ + << this << " " << __func__ + +namespace librbd { + +namespace image_watcher { + +using namespace watch_notify; +using util::create_context_callback; + +NotifyLockOwner::NotifyLockOwner(ImageCtx &image_ctx, + watcher::Notifier ¬ifier, + bufferlist &&bl, Context *on_finish) + : m_image_ctx(image_ctx), m_notifier(notifier), m_bl(std::move(bl)), + m_on_finish(on_finish) { +} + +void NotifyLockOwner::send() { + send_notify(); +} + +void NotifyLockOwner::send_notify() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + 
ceph_assert(m_image_ctx.owner_lock.is_locked()); + m_notifier.notify(m_bl, &m_notify_response, create_context_callback< + NotifyLockOwner, &NotifyLockOwner::handle_notify>(this)); +} + +void NotifyLockOwner::handle_notify(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": r=" << r << dendl; + + if (r < 0 && r != -ETIMEDOUT) { + lderr(cct) << ": lock owner notification failed: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + bufferlist response; + bool lock_owner_responded = false; + for (auto &it : m_notify_response.acks) { + if (it.second.length() > 0) { + if (lock_owner_responded) { + lderr(cct) << ": duplicate lock owners detected" << dendl; + finish(-EINVAL); + return; + } + lock_owner_responded = true; + response.claim(it.second); + } + } + + if (!lock_owner_responded) { + ldout(cct, 1) << ": no lock owners detected" << dendl; + finish(-ETIMEDOUT); + return; + } + + try { + auto iter = response.cbegin(); + + ResponseMessage response_message; + using ceph::decode; + decode(response_message, iter); + + r = response_message.result; + } catch (const buffer::error &err) { + r = -EINVAL; + } + finish(r); +} + +void NotifyLockOwner::finish(int r) { + m_on_finish->complete(r); + delete this; +} + +} // namespace image_watcher +} // namespace librbd diff --git a/src/librbd/image_watcher/NotifyLockOwner.h b/src/librbd/image_watcher/NotifyLockOwner.h new file mode 100644 index 00000000..6249bc12 --- /dev/null +++ b/src/librbd/image_watcher/NotifyLockOwner.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H +#define CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H + +#include "include/buffer.h" +#include "librbd/watcher/Types.h" + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace watcher { class Notifier; } + +namespace image_watcher { + +class NotifyLockOwner { +public: + static NotifyLockOwner 
*create(ImageCtx &image_ctx, + watcher::Notifier ¬ifier, + bufferlist &&bl, Context *on_finish) { + return new NotifyLockOwner(image_ctx, notifier, std::move(bl), on_finish); + } + + NotifyLockOwner(ImageCtx &image_ctx, watcher::Notifier ¬ifier, + bufferlist &&bl, Context *on_finish); + + void send(); + +private: + ImageCtx &m_image_ctx; + watcher::Notifier &m_notifier; + + bufferlist m_bl; + watcher::NotifyResponse m_notify_response; + Context *m_on_finish; + + void send_notify(); + void handle_notify(int r); + + void finish(int r); +}; + +} // namespace image_watcher +} // namespace librbd + +#endif // CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc new file mode 100644 index 00000000..ebf2603f --- /dev/null +++ b/src/librbd/internal.cc @@ -0,0 +1,2021 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "include/int_types.h" + +#include <errno.h> +#include <limits.h> + +#include "include/types.h" +#include "include/uuid.h" +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Throttle.h" +#include "common/event_socket.h" +#include "common/perf_counters.h" +#include "osdc/Striper.h" +#include "include/stringify.h" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rbd/cls_rbd.h" +#include "cls/rbd/cls_rbd_types.h" +#include "cls/rbd/cls_rbd_client.h" +#include "cls/journal/cls_journal_types.h" +#include "cls/journal/cls_journal_client.h" + +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Operations.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/api/Image.h" +#include "librbd/exclusive_lock/AutomaticPolicy.h" +#include "librbd/exclusive_lock/StandardPolicy.h" +#include 
"librbd/image/CloneRequest.h" +#include "librbd/image/CreateRequest.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/ReadResult.h" +#include "librbd/journal/Types.h" +#include "librbd/managed_lock/Types.h" +#include "librbd/mirror/EnableRequest.h" +#include "librbd/operation/TrimRequest.h" + +#include "journal/Journaler.h" + +#include <boost/scope_exit.hpp> +#include <boost/variant.hpp> +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd: " + +#define rbd_howmany(x, y) (((x) + (y) - 1) / (y)) + +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; +// list binds to list() here, so std::list is explicitly used below + +using ceph::bufferlist; +using librados::snap_t; +using librados::IoCtx; +using librados::Rados; + +namespace librbd { + +namespace { + +int validate_pool(IoCtx &io_ctx, CephContext *cct) { + if (!cct->_conf.get_val<bool>("rbd_validate_pool")) { + return 0; + } + + int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL); + if (r == 0) { + return 0; + } else if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl; + return r; + } + + // allocate a self-managed snapshot id if this a new pool to force + // self-managed snapshot mode + uint64_t snap_id; + r = io_ctx.selfmanaged_snap_create(&snap_id); + if (r == -EINVAL) { + lderr(cct) << "pool not configured for self-managed RBD snapshot support" + << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to allocate self-managed snapshot: " + << cpp_strerror(r) << dendl; + return r; + } + + r = io_ctx.selfmanaged_snap_remove(snap_id); + if (r < 0) { + lderr(cct) << "failed to release self-managed snapshot " << snap_id + << ": " << 
// Reports whether an offset is still covered by a parent image.
//
// @param parent_pool_id pool id of the parent; -1 means "no parent"
// @param off            offset to test
// @param overlap        parent overlap (inclusive upper bound for off)
// @return true when a parent exists and off does not exceed overlap
bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap)
{
  if (parent_pool_id == -1) {
    return false;
  }
  return !(off > overlap);
}
// Extracts the object number from a data object name.
//
// The name is expected to be "<object_prefix><separator><hex number>"; the
// prefix and one separator character are skipped and the remainder is parsed
// as a hexadecimal integer.
//
// @param oid           full object name
// @param object_prefix prefix shared by the image's data objects
// @return the parsed object number, or 0 when the name holds no number after
//         the prefix (e.g. it is shorter than the prefix plus separator)
uint64_t oid_to_object_no(const std::string& oid,
                          const std::string& object_prefix)
{
  std::istringstream iss(oid);
  // skip object prefix and separator
  iss.ignore(object_prefix.length() + 1);

  // Must be initialized: if the stream is already exhausted or failed, the
  // extraction below does not touch its target and the original code
  // returned an indeterminate value.
  uint64_t num = 0;
  iss >> std::hex >> num;
  return num;
}
cpp_strerror(r) << dendl; + } + } + + int read_header_bl(IoCtx& io_ctx, const string& header_oid, + bufferlist& header, uint64_t *ver) + { + int r; + uint64_t off = 0; +#define READ_SIZE 4096 + do { + bufferlist bl; + r = io_ctx.read(header_oid, bl, READ_SIZE, off); + if (r < 0) + return r; + header.claim_append(bl); + off += r; + } while (r == READ_SIZE); + + static_assert(sizeof(RBD_HEADER_TEXT) == sizeof(RBD_MIGRATE_HEADER_TEXT), + "length of rbd headers must be the same"); + + if (header.length() < sizeof(RBD_HEADER_TEXT) || + (memcmp(RBD_HEADER_TEXT, header.c_str(), + sizeof(RBD_HEADER_TEXT)) != 0 && + memcmp(RBD_MIGRATE_HEADER_TEXT, header.c_str(), + sizeof(RBD_MIGRATE_HEADER_TEXT)) != 0)) { + CephContext *cct = (CephContext *)io_ctx.cct(); + lderr(cct) << "unrecognized header format" << dendl; + return -ENXIO; + } + + if (ver) + *ver = io_ctx.get_last_version(); + + return 0; + } + + int read_header(IoCtx& io_ctx, const string& header_oid, + struct rbd_obj_header_ondisk *header, uint64_t *ver) + { + bufferlist header_bl; + int r = read_header_bl(io_ctx, header_oid, header_bl, ver); + if (r < 0) + return r; + if (header_bl.length() < (int)sizeof(*header)) + return -EIO; + memcpy(header, header_bl.c_str(), sizeof(*header)); + + return 0; + } + + int tmap_set(IoCtx& io_ctx, const string& imgname) + { + bufferlist cmdbl, emptybl; + __u8 c = CEPH_OSD_TMAP_SET; + encode(c, cmdbl); + encode(imgname, cmdbl); + encode(emptybl, cmdbl); + return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl); + } + + int tmap_rm(IoCtx& io_ctx, const string& imgname) + { + bufferlist cmdbl; + __u8 c = CEPH_OSD_TMAP_RM; + encode(c, cmdbl); + encode(imgname, cmdbl); + return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl); + } + + typedef boost::variant<std::string,uint64_t> image_option_value_t; + typedef std::map<int,image_option_value_t> image_options_t; + typedef std::shared_ptr<image_options_t> image_options_ref; + + enum image_option_type_t { + STR, + UINT64, + }; + + const std::map<int, 
image_option_type_t> IMAGE_OPTIONS_TYPE_MAPPING = { + {RBD_IMAGE_OPTION_FORMAT, UINT64}, + {RBD_IMAGE_OPTION_FEATURES, UINT64}, + {RBD_IMAGE_OPTION_ORDER, UINT64}, + {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64}, + {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64}, + {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64}, + {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64}, + {RBD_IMAGE_OPTION_JOURNAL_POOL, STR}, + {RBD_IMAGE_OPTION_FEATURES_SET, UINT64}, + {RBD_IMAGE_OPTION_FEATURES_CLEAR, UINT64}, + {RBD_IMAGE_OPTION_DATA_POOL, STR}, + {RBD_IMAGE_OPTION_FLATTEN, UINT64}, + {RBD_IMAGE_OPTION_CLONE_FORMAT, UINT64}, + }; + + std::string image_option_name(int optname) { + switch (optname) { + case RBD_IMAGE_OPTION_FORMAT: + return "format"; + case RBD_IMAGE_OPTION_FEATURES: + return "features"; + case RBD_IMAGE_OPTION_ORDER: + return "order"; + case RBD_IMAGE_OPTION_STRIPE_UNIT: + return "stripe_unit"; + case RBD_IMAGE_OPTION_STRIPE_COUNT: + return "stripe_count"; + case RBD_IMAGE_OPTION_JOURNAL_ORDER: + return "journal_order"; + case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH: + return "journal_splay_width"; + case RBD_IMAGE_OPTION_JOURNAL_POOL: + return "journal_pool"; + case RBD_IMAGE_OPTION_FEATURES_SET: + return "features_set"; + case RBD_IMAGE_OPTION_FEATURES_CLEAR: + return "features_clear"; + case RBD_IMAGE_OPTION_DATA_POOL: + return "data_pool"; + case RBD_IMAGE_OPTION_FLATTEN: + return "flatten"; + case RBD_IMAGE_OPTION_CLONE_FORMAT: + return "clone_format"; + default: + return "unknown (" + stringify(optname) + ")"; + } + } + + void image_options_create(rbd_image_options_t* opts) + { + image_options_ref* opts_ = new image_options_ref(new image_options_t()); + + *opts = static_cast<rbd_image_options_t>(opts_); + } + + void image_options_create_ref(rbd_image_options_t* opts, + rbd_image_options_t orig) + { + image_options_ref* orig_ = static_cast<image_options_ref*>(orig); + image_options_ref* opts_ = new image_options_ref(*orig_); + + *opts = static_cast<rbd_image_options_t>(opts_); + } + + void 
image_options_copy(rbd_image_options_t* opts, + const ImageOptions &orig) + { + image_options_ref* opts_ = new image_options_ref(new image_options_t()); + + *opts = static_cast<rbd_image_options_t>(opts_); + + std::string str_val; + uint64_t uint64_val; + for (auto &i : IMAGE_OPTIONS_TYPE_MAPPING) { + switch (i.second) { + case STR: + if (orig.get(i.first, &str_val) == 0) { + image_options_set(*opts, i.first, str_val); + } + continue; + case UINT64: + if (orig.get(i.first, &uint64_val) == 0) { + image_options_set(*opts, i.first, uint64_val); + } + continue; + } + } + } + + void image_options_destroy(rbd_image_options_t opts) + { + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + + delete opts_; + } + + int image_options_set(rbd_image_options_t opts, int optname, + const std::string& optval) + { + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + + std::map<int, image_option_type_t>::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) { + return -EINVAL; + } + + (*opts_->get())[optname] = optval; + return 0; + } + + int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval) + { + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + + std::map<int, image_option_type_t>::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) { + return -EINVAL; + } + + (*opts_->get())[optname] = optval; + return 0; + } + + int image_options_get(rbd_image_options_t opts, int optname, + std::string* optval) + { + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + + std::map<int, image_option_type_t>::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) { + return -EINVAL; + } + + image_options_t::const_iterator j = (*opts_)->find(optname); + + if (j == (*opts_)->end()) { 
+ return -ENOENT; + } + + *optval = boost::get<std::string>(j->second); + return 0; + } + + int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval) + { + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + + std::map<int, image_option_type_t>::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) { + return -EINVAL; + } + + image_options_t::const_iterator j = (*opts_)->find(optname); + + if (j == (*opts_)->end()) { + return -ENOENT; + } + + *optval = boost::get<uint64_t>(j->second); + return 0; + } + + int image_options_is_set(rbd_image_options_t opts, int optname, + bool* is_set) + { + if (IMAGE_OPTIONS_TYPE_MAPPING.find(optname) == + IMAGE_OPTIONS_TYPE_MAPPING.end()) { + return -EINVAL; + } + + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + *is_set = ((*opts_)->find(optname) != (*opts_)->end()); + return 0; + } + + int image_options_unset(rbd_image_options_t opts, int optname) + { + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + + std::map<int, image_option_type_t>::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end()) { + ceph_assert((*opts_)->find(optname) == (*opts_)->end()); + return -EINVAL; + } + + image_options_t::const_iterator j = (*opts_)->find(optname); + + if (j == (*opts_)->end()) { + return -ENOENT; + } + + (*opts_)->erase(j); + return 0; + } + + void image_options_clear(rbd_image_options_t opts) + { + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + + (*opts_)->clear(); + } + + bool image_options_is_empty(rbd_image_options_t opts) + { + image_options_ref* opts_ = static_cast<image_options_ref*>(opts); + + return (*opts_)->empty(); + } + + int flatten_children(ImageCtx *ictx, const char* snap_name, + ProgressContext& pctx) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "children flatten " << ictx->name << 
dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + RWLock::RLocker l(ictx->snap_lock); + snap_t snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(), + snap_name); + + cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(), + ictx->md_ctx.get_namespace(), + ictx->id, snap_id}; + std::vector<librbd::linked_image_spec_t> child_images; + r = api::Image<>::list_children(ictx, parent_spec, &child_images); + if (r < 0) { + return r; + } + + size_t size = child_images.size(); + if (size == 0) { + return 0; + } + + librados::IoCtx child_io_ctx; + int64_t child_pool_id = -1; + size_t i = 0; + for (auto &child_image : child_images){ + std::string pool = child_image.pool_name; + if (child_pool_id == -1 || + child_pool_id != child_image.pool_id || + child_io_ctx.get_namespace() != child_image.pool_namespace) { + r = util::create_ioctx(ictx->md_ctx, "child image", + child_image.pool_id, child_image.pool_namespace, + &child_io_ctx); + if (r < 0) { + return r; + } + + child_pool_id = child_image.pool_id; + } + + ImageCtx *imctx = new ImageCtx("", child_image.image_id, nullptr, + child_io_ctx, false); + r = imctx->state->open(0); + if (r < 0) { + lderr(cct) << "error opening image: " << cpp_strerror(r) << dendl; + return r; + } + + if ((imctx->features & RBD_FEATURE_DEEP_FLATTEN) == 0 && + !imctx->snaps.empty()) { + lderr(cct) << "snapshot in-use by " << pool << "/" << imctx->name + << dendl; + imctx->state->close(); + return -EBUSY; + } + + librbd::NoOpProgressContext prog_ctx; + r = imctx->operations->flatten(prog_ctx); + if (r < 0) { + lderr(cct) << "error flattening image: " << pool << "/" + << (child_image.pool_namespace.empty() ? 
+ "" : "/" + child_image.pool_namespace) + << child_image.image_name << cpp_strerror(r) << dendl; + imctx->state->close(); + return r; + } + + r = imctx->state->close(); + if (r < 0) { + lderr(cct) << "failed to close image: " << cpp_strerror(r) << dendl; + return r; + } + + pctx.update_progress(++i, size); + ceph_assert(i <= size); + } + + return 0; + } + + int get_snap_namespace(ImageCtx *ictx, + const char *snap_name, + cls::rbd::SnapshotNamespace *snap_namespace) { + ldout(ictx->cct, 20) << "get_snap_namespace " << ictx << " " << snap_name + << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + RWLock::RLocker l(ictx->snap_lock); + snap_t snap_id = ictx->get_snap_id(*snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) + return -ENOENT; + r = ictx->get_snap_namespace(snap_id, snap_namespace); + return r; + } + + int snap_is_protected(ImageCtx *ictx, const char *snap_name, bool *is_protected) + { + ldout(ictx->cct, 20) << "snap_is_protected " << ictx << " " << snap_name + << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + RWLock::RLocker l(ictx->snap_lock); + snap_t snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(), snap_name); + if (snap_id == CEPH_NOSNAP) + return -ENOENT; + bool is_unprotected; + r = ictx->is_snap_unprotected(snap_id, &is_unprotected); + // consider both PROTECTED or UNPROTECTING to be 'protected', + // since in either state they can't be deleted + *is_protected = !is_unprotected; + return r; + } + + int create_v1(IoCtx& io_ctx, const char *imgname, uint64_t size, int order) + { + CephContext *cct = (CephContext *)io_ctx.cct(); + + ldout(cct, 20) << __func__ << " " << &io_ctx << " name = " << imgname + << " size = " << size << " order = " << order << dendl; + int r = validate_pool(io_ctx, cct); + if (r < 0) { + return r; + } + + if (!io_ctx.get_namespace().empty()) { + lderr(cct) << "attempting to add v1 image to namespace" << dendl; + return -EINVAL; + } + + 
ldout(cct, 2) << "adding rbd image to directory..." << dendl; + r = tmap_set(io_ctx, imgname); + if (r < 0) { + lderr(cct) << "error adding image to directory: " << cpp_strerror(r) + << dendl; + return r; + } + + Rados rados(io_ctx); + uint64_t bid = rados.get_instance_id(); + + ldout(cct, 2) << "creating rbd image..." << dendl; + struct rbd_obj_header_ondisk header; + init_rbd_header(header, size, order, bid); + + bufferlist bl; + bl.append((const char *)&header, sizeof(header)); + + string header_oid = util::old_header_name(imgname); + r = io_ctx.write(header_oid, bl, bl.length(), 0); + if (r < 0) { + lderr(cct) << "Error writing image header: " << cpp_strerror(r) + << dendl; + int remove_r = tmap_rm(io_ctx, imgname); + if (remove_r < 0) { + lderr(cct) << "Could not remove image from directory after " + << "header creation failed: " + << cpp_strerror(remove_r) << dendl; + } + return r; + } + + ldout(cct, 2) << "done." << dendl; + return 0; + } + + int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size, + int *order) + { + uint64_t order_ = *order; + ImageOptions opts; + + int r = opts.set(RBD_IMAGE_OPTION_ORDER, order_); + ceph_assert(r == 0); + + r = create(io_ctx, imgname, "", size, opts, "", "", false); + + int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_); + ceph_assert(r1 == 0); + *order = order_; + + return r; + } + + int create(IoCtx& io_ctx, const char *imgname, uint64_t size, + bool old_format, uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count) + { + if (!order) + return -EINVAL; + + uint64_t order_ = *order; + uint64_t format = old_format ? 
1 : 2; + ImageOptions opts; + int r; + + r = opts.set(RBD_IMAGE_OPTION_FORMAT, format); + ceph_assert(r == 0); + r = opts.set(RBD_IMAGE_OPTION_FEATURES, features); + ceph_assert(r == 0); + r = opts.set(RBD_IMAGE_OPTION_ORDER, order_); + ceph_assert(r == 0); + r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + ceph_assert(r == 0); + r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + ceph_assert(r == 0); + + r = create(io_ctx, imgname, "", size, opts, "", "", false); + + int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_); + ceph_assert(r1 == 0); + *order = order_; + + return r; + } + + int create(IoCtx& io_ctx, const std::string &image_name, + const std::string &image_id, uint64_t size, + ImageOptions& opts, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + bool skip_mirror_enable) + { + std::string id(image_id); + if (id.empty()) { + id = util::generate_image_id(io_ctx); + } + + CephContext *cct = (CephContext *)io_ctx.cct(); + uint64_t option; + if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) { + lderr(cct) << "create does not support 'flatten' image option" << dendl; + return -EINVAL; + } + if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) { + lderr(cct) << "create does not support 'clone_format' image option" + << dendl; + return -EINVAL; + } + + ldout(cct, 10) << __func__ << " name=" << image_name << ", " + << "id= " << id << ", " + << "size=" << size << ", opts=" << opts << dendl; + + uint64_t format; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) + format = cct->_conf.get_val<uint64_t>("rbd_default_format"); + bool old_format = format == 1; + + // make sure it doesn't already exist, in either format + int r = detect_format(io_ctx, image_name, NULL, NULL); + if (r != -ENOENT) { + if (r) { + lderr(cct) << "Could not tell if " << image_name << " already exists" + << dendl; + return r; + } + lderr(cct) << "rbd image " << image_name << " already exists" << dendl; + return -EEXIST; 
+ } + + uint64_t order = 0; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) { + order = cct->_conf.get_val<uint64_t>("rbd_default_order"); + } + r = image::CreateRequest<>::validate_order(cct, order); + if (r < 0) { + return r; + } + + if (old_format) { + if ( !getenv("RBD_FORCE_ALLOW_V1") ) { + lderr(cct) << "Format 1 image creation unsupported. " << dendl; + return -EINVAL; + } + lderr(cct) << "Forced V1 image creation. " << dendl; + r = create_v1(io_ctx, image_name.c_str(), size, order); + } else { + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + ConfigProxy config{cct->_conf}; + api::Config<>::apply_pool_overrides(io_ctx, &config); + + C_SaferCond cond; + image::CreateRequest<> *req = image::CreateRequest<>::create( + config, io_ctx, image_name, id, size, opts, non_primary_global_image_id, + primary_mirror_uuid, skip_mirror_enable, op_work_queue, &cond); + req->send(); + + r = cond.wait(); + } + + int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order); + ceph_assert(r1 == 0); + + return r; + }

  /*
   * Parent may be in different pool, hence different IoCtx
   */
  /**
   * Clone a parent snapshot into a new child image, packing the individual
   * arguments into ImageOptions and forwarding to the option-based clone()
   * with null parent/child ids and empty mirroring identifiers.
   *
   * @param p_ioctx pool context of the parent image
   * @param p_name parent image name
   * @param p_snap_name parent snapshot name
   * @param c_ioctx pool context of the child image
   * @param c_name child image name
   * @param features value stored as RBD_IMAGE_OPTION_FEATURES
   * @param c_order in: requested order; out: the order stored in the options
   *                after the clone call
   * @param stripe_unit value stored as RBD_IMAGE_OPTION_STRIPE_UNIT
   * @param stripe_count value stored as RBD_IMAGE_OPTION_STRIPE_COUNT
   * @returns -EINVAL if c_order is null, otherwise the result of the
   *          forwarded clone() call
   */
  int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
            IoCtx& c_ioctx, const char *c_name,
            uint64_t features, int *c_order,
            uint64_t stripe_unit, int stripe_count)
  {
    // reject a null out-parameter up front, as the create() overloads that
    // take an order pointer do, instead of dereferencing it
    if (!c_order)
      return -EINVAL;

    uint64_t order = *c_order;

    ImageOptions opts;
    opts.set(RBD_IMAGE_OPTION_FEATURES, features);
    opts.set(RBD_IMAGE_OPTION_ORDER, order);
    opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
    opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);

    int r = clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, nullptr,
                  c_name, opts, "", "");

    // report back whatever order the options hold after the call
    opts.get(RBD_IMAGE_OPTION_ORDER, &order);
    *c_order = order;
    return r;
  }

+ int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name, + const char *p_snap_name, IoCtx& c_ioctx, const char *c_id, + const char *c_name, ImageOptions& c_opts, + const
std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid) + { + ceph_assert((p_id == nullptr) ^ (p_name == nullptr)); + + CephContext *cct = (CephContext *)p_ioctx.cct(); + if (p_snap_name == nullptr) { + lderr(cct) << "image to be cloned must be a snapshot" << dendl; + return -EINVAL; + } + + uint64_t flatten; + if (c_opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) { + lderr(cct) << "clone does not support 'flatten' image option" << dendl; + return -EINVAL; + } + + int r; + std::string parent_id; + if (p_id == nullptr) { + r = cls_client::dir_get_id(&p_ioctx, RBD_DIRECTORY, p_name, + &parent_id); + if (r < 0) { + if (r != -ENOENT) { + lderr(cct) << "failed to retrieve parent image id: " + << cpp_strerror(r) << dendl; + } + return r; + } + } else { + parent_id = p_id; + } + + std::string clone_id; + if (c_id == nullptr) { + clone_id = util::generate_image_id(c_ioctx); + } else { + clone_id = c_id; + } + + ldout(cct, 10) << __func__ << " " + << "c_name=" << c_name << ", " + << "c_id= " << clone_id << ", " + << "c_opts=" << c_opts << dendl; + + ConfigProxy config{reinterpret_cast<CephContext *>(c_ioctx.cct())->_conf}; + api::Config<>::apply_pool_overrides(c_ioctx, &config); + + ThreadPool *thread_pool; + ContextWQ *op_work_queue; + ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue); + + C_SaferCond cond; + auto *req = image::CloneRequest<>::create( + config, p_ioctx, parent_id, p_snap_name, CEPH_NOSNAP, c_ioctx, c_name, + clone_id, c_opts, non_primary_global_image_id, primary_mirror_uuid, + op_work_queue, &cond); + req->send(); + + r = cond.wait(); + if (r < 0) { + return r; + } + + return 0; + } + + int rename(IoCtx& io_ctx, const char *srcname, const char *dstname) + { + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> " + << dstname << dendl; + + ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false); + int r = ictx->state->open(0); + if (r < 0) 
{ + lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl; + return r; + } + BOOST_SCOPE_EXIT((ictx)) { + ictx->state->close(); + } BOOST_SCOPE_EXIT_END + + return ictx->operations->rename(dstname); + } + + int info(ImageCtx *ictx, image_info_t& info, size_t infosize) + { + ldout(ictx->cct, 20) << "info " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + image_info(ictx, info, infosize); + return 0; + } + + int get_old_format(ImageCtx *ictx, uint8_t *old) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + *old = ictx->old_format; + return 0; + } + + int get_size(ImageCtx *ictx, uint64_t *size) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + RWLock::RLocker l2(ictx->snap_lock); + *size = ictx->get_image_size(ictx->snap_id); + return 0; + } + + int get_features(ImageCtx *ictx, uint64_t *features) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + RWLock::RLocker l(ictx->snap_lock); + *features = ictx->features; + return 0; + } + + int get_overlap(ImageCtx *ictx, uint64_t *overlap) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + RWLock::RLocker l(ictx->snap_lock); + RWLock::RLocker l2(ictx->parent_lock); + return ictx->get_parent_overlap(ictx->snap_id, overlap); + } + + int get_flags(ImageCtx *ictx, uint64_t *flags) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + RWLock::RLocker l2(ictx->snap_lock); + return ictx->get_flags(ictx->snap_id, flags); + } + + int set_image_notification(ImageCtx *ictx, int fd, int type) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type" << type << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (ictx->event_socket.is_valid()) + return -EINVAL; + return ictx->event_socket.init(fd, type); + } + + int is_exclusive_lock_owner(ImageCtx 
*ictx, bool *is_owner) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl; + *is_owner = false; + + RWLock::RLocker owner_locker(ictx->owner_lock); + if (ictx->exclusive_lock == nullptr) { + return 0; + } + + // might have been blacklisted by peer -- ensure we still own + // the lock by pinging the OSD + int r = ictx->exclusive_lock->assert_header_locked(); + if (r == -EBUSY || r == -ENOENT) { + return 0; + } else if (r < 0) { + return r; + } + + *is_owner = true; + return 0; + }

  /**
   * Acquire the exclusive lock on behalf of the caller.
   *
   * @param ictx image context
   * @param lock_mode only RBD_LOCK_MODE_EXCLUSIVE is accepted
   * @returns -EOPNOTSUPP for any other lock mode, -EINVAL if the image has
   *          no exclusive_lock object (before or after the request), the
   *          negative result of the acquire request, the lock's
   *          get_unlocked_op_error() if ownership was not obtained, or 0
   */
  int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode)
  {
    CephContext *cct = ictx->cct;
    ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
                   << "lock_mode=" << lock_mode << dendl;

    if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
      return -EOPNOTSUPP;
    }

    C_SaferCond acquire_cond;
    {
      // the request is issued under the owner_lock write lock; the wait
      // below happens after it has been dropped
      RWLock::WLocker owner_wlocker(ictx->owner_lock);

      if (ictx->exclusive_lock == nullptr) {
        lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
        return -EINVAL;
      }

      // swap in a StandardPolicy when the current policy may auto-request
      // the lock
      if (ictx->get_exclusive_lock_policy()->may_auto_request_lock()) {
        ictx->set_exclusive_lock_policy(
          new exclusive_lock::StandardPolicy(ictx));
      }

      if (ictx->exclusive_lock->is_lock_owner()) {
        return 0;
      }

      ictx->exclusive_lock->acquire_lock(&acquire_cond);
    }

    const int acquire_r = acquire_cond.wait();
    if (acquire_r < 0) {
      lderr(cct) << "failed to request exclusive lock: "
                 << cpp_strerror(acquire_r) << dendl;
      return acquire_r;
    }

    // re-check ownership under the read lock now that the request finished
    RWLock::RLocker owner_rlocker(ictx->owner_lock);
    if (ictx->exclusive_lock == nullptr) {
      return -EINVAL;
    }
    if (ictx->exclusive_lock->is_lock_owner()) {
      return 0;
    }

    lderr(cct) << "failed to acquire exclusive lock" << dendl;
    return ictx->exclusive_lock->get_unlocked_op_error();
  }

+ int lock_release(ImageCtx *ictx) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl; + + C_SaferCond lock_ctx; + { + RWLock::WLocker l(ictx->owner_lock); + + if (ictx->exclusive_lock == nullptr || +
!ictx->exclusive_lock->is_lock_owner()) { + lderr(cct) << "not exclusive lock owner" << dendl; + return -EINVAL; + } + + ictx->exclusive_lock->release_lock(&lock_ctx); + } + + int r = lock_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r) + << dendl; + return r; + } + return 0; + } + + int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode, + std::list<std::string> *lock_owners) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl; + + if (!ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) { + lderr(cct) << "exclusive-lock feature is not enabled" << dendl; + return -EINVAL; + } + + managed_lock::Locker locker; + C_SaferCond get_owner_ctx; + ExclusiveLock<>(*ictx).get_locker(&locker, &get_owner_ctx); + int r = get_owner_ctx.wait(); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "failed to determine current lock owner: " + << cpp_strerror(r) << dendl; + return r; + } + + *lock_mode = RBD_LOCK_MODE_EXCLUSIVE; + lock_owners->clear(); + lock_owners->emplace_back(locker.address); + return 0; + } + + int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode, + const std::string &lock_owner) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", " + << "lock_mode=" << lock_mode << ", " + << "lock_owner=" << lock_owner << dendl; + + if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) { + return -EOPNOTSUPP; + } + + if (ictx->read_only) { + return -EROFS; + } + + managed_lock::Locker locker; + C_SaferCond get_owner_ctx; + { + RWLock::RLocker l(ictx->owner_lock); + + if (ictx->exclusive_lock == nullptr) { + lderr(cct) << "exclusive-lock feature is not enabled" << dendl; + return -EINVAL; + } + + ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx); + } + int r = get_owner_ctx.wait(); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "failed to determine current lock owner: " + << cpp_strerror(r) << dendl; 
+ return r; + } + + if (locker.address != lock_owner) { + return -EBUSY; + } + + C_SaferCond break_ctx; + { + RWLock::RLocker l(ictx->owner_lock); + + if (ictx->exclusive_lock == nullptr) { + lderr(cct) << "exclusive-lock feature is not enabled" << dendl; + return -EINVAL; + } + + ictx->exclusive_lock->break_lock(locker, true, &break_ctx); + } + r = break_ctx.wait(); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "failed to break lock: " << cpp_strerror(r) << dendl; + return r; + } + return 0; + } + + int snap_list(ImageCtx *ictx, vector<snap_info_t>& snaps) + { + ldout(ictx->cct, 20) << "snap_list " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + RWLock::RLocker l(ictx->snap_lock); + for (map<snap_t, SnapInfo>::iterator it = ictx->snap_info.begin(); + it != ictx->snap_info.end(); ++it) { + snap_info_t info; + info.name = it->second.name; + info.id = it->first; + info.size = it->second.size; + snaps.push_back(info); + } + + return 0; + } + + int snap_exists(ImageCtx *ictx, const cls::rbd::SnapshotNamespace& snap_namespace, + const char *snap_name, bool *exists) + { + ldout(ictx->cct, 20) << "snap_exists " << ictx << " " << snap_name << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + RWLock::RLocker l(ictx->snap_lock); + *exists = ictx->get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP; + return 0; + } + + int snap_remove(ImageCtx *ictx, const char *snap_name, uint32_t flags, + ProgressContext& pctx) + { + ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_name << " flags: " << flags << dendl; + + int r = 0; + + r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + if (flags & RBD_SNAP_REMOVE_FLATTEN) { + r = flatten_children(ictx, snap_name, pctx); + if (r < 0) { + return r; + } + } + + bool is_protected; + r = snap_is_protected(ictx, snap_name, &is_protected); + if (r < 0) { + return r; + } + + if (is_protected && flags & 
RBD_SNAP_REMOVE_UNPROTECT) { + r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name); + if (r < 0) { + lderr(ictx->cct) << "failed to unprotect snapshot: " << snap_name << dendl; + return r; + } + + r = snap_is_protected(ictx, snap_name, &is_protected); + if (r < 0) { + return r; + } + if (is_protected) { + lderr(ictx->cct) << "snapshot is still protected after unprotection" << dendl; + ceph_abort(); + } + } + + C_SaferCond ctx; + ictx->operations->snap_remove(cls::rbd::UserSnapshotNamespace(), snap_name, &ctx); + + r = ctx.wait(); + return r; + } + + int snap_get_timestamp(ImageCtx *ictx, uint64_t snap_id, struct timespec *timestamp) + { + std::map<librados::snap_t, SnapInfo>::iterator snap_it = ictx->snap_info.find(snap_id); + ceph_assert(snap_it != ictx->snap_info.end()); + utime_t time = snap_it->second.timestamp; + time.to_timespec(timestamp); + return 0; + } + + int snap_get_limit(ImageCtx *ictx, uint64_t *limit) + { + int r = cls_client::snapshot_get_limit(&ictx->md_ctx, ictx->header_oid, + limit); + if (r == -EOPNOTSUPP) { + *limit = UINT64_MAX; + r = 0; + } + return r; + } + + int snap_set_limit(ImageCtx *ictx, uint64_t limit) + { + return ictx->operations->snap_set_limit(limit); + } + + struct CopyProgressCtx { + explicit CopyProgressCtx(ProgressContext &p) + : destictx(NULL), src_size(0), prog_ctx(p) + { } + + ImageCtx *destictx; + uint64_t src_size; + ProgressContext &prog_ctx; + }; + + int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size) + { + CephContext *cct = (CephContext *)dest_md_ctx.cct(); + uint64_t option; + if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) { + lderr(cct) << "copy does not support 'flatten' image option" << dendl; + return -EINVAL; + } + if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) { + lderr(cct) << "copy does not support 'clone_format' image option" + << dendl; + return -EINVAL; + } + + ldout(cct, 
20) << "copy " << src->name + << (src->snap_name.length() ? "@" + src->snap_name : "") + << " -> " << destname << " opts = " << opts << dendl; + + src->snap_lock.get_read(); + uint64_t features = src->features; + uint64_t src_size = src->get_image_size(src->snap_id); + src->snap_lock.put_read(); + uint64_t format = 2; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) { + opts.set(RBD_IMAGE_OPTION_FORMAT, format); + } + uint64_t stripe_unit = src->stripe_unit; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + } + uint64_t stripe_count = src->stripe_count; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + } + uint64_t order = src->order; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + opts.set(RBD_IMAGE_OPTION_ORDER, order); + } + if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) { + opts.set(RBD_IMAGE_OPTION_FEATURES, features); + } + if (features & ~RBD_FEATURES_ALL) { + lderr(cct) << "librbd does not support requested features" << dendl; + return -ENOSYS; + } + + int r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false); + if (r < 0) { + lderr(cct) << "header creation failed" << dendl; + return r; + } + opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order)); + + ImageCtx *dest = new librbd::ImageCtx(destname, "", nullptr, dest_md_ctx, + false); + r = dest->state->open(0); + if (r < 0) { + lderr(cct) << "failed to read newly created header" << dendl; + return r; + } + + r = copy(src, dest, prog_ctx, sparse_size); + + int close_r = dest->state->close(); + if (r == 0 && close_r < 0) { + r = close_r; + } + return r; + }

  /**
   * Completion for one copy write: deletes the bufferlist it was given,
   * then completes the wrapped context with the same result code.
   */
  class C_CopyWrite : public Context {
  public:
    C_CopyWrite(bufferlist *write_bl, Context* on_finish)
      : m_bl(write_bl),
        m_ctx(on_finish)
    {
    }

    void finish(int r) override
    {
      // free the buffer first, then pass the result on
      delete m_bl;
      m_ctx->complete(r);
    }

  private:
    bufferlist *m_bl;
    Context *m_ctx;
  };

+ class
C_CopyRead : public Context { + public: + C_CopyRead(SimpleThrottle *throttle, ImageCtx *dest, uint64_t offset, + bufferlist *bl, size_t sparse_size) + : m_throttle(throttle), m_dest(dest), m_offset(offset), m_bl(bl), + m_sparse_size(sparse_size) { + m_throttle->start_op(); + } + void finish(int r) override { + if (r < 0) { + lderr(m_dest->cct) << "error reading from source image at offset " + << m_offset << ": " << cpp_strerror(r) << dendl; + delete m_bl; + m_throttle->end_op(r); + return; + } + ceph_assert(m_bl->length() == (size_t)r); + + if (m_bl->is_zero()) { + delete m_bl; + m_throttle->end_op(r); + return; + } + + if (!m_sparse_size) { + m_sparse_size = (1 << m_dest->order); + } + + auto *throttle = m_throttle; + auto *end_op_ctx = new FunctionContext([throttle](int r) { + throttle->end_op(r); + }); + auto gather_ctx = new C_Gather(m_dest->cct, end_op_ctx); + + m_bl->rebuild(buffer::ptr_node::create(m_bl->length())); + size_t write_offset = 0; + size_t write_length = 0; + size_t offset = 0; + size_t length = m_bl->length(); + const auto& m_ptr = m_bl->front(); + while (offset < length) { + if (util::calc_sparse_extent(m_ptr, + m_sparse_size, + length, + &write_offset, + &write_length, + &offset)) { + bufferlist *write_bl = new bufferlist(); + write_bl->push_back( + buffer::ptr_node::create(m_ptr, write_offset, write_length)); + Context *ctx = new C_CopyWrite(write_bl, gather_ctx->new_sub()); + auto comp = io::AioCompletion::create(ctx); + + // coordinate through AIO WQ to ensure lock is acquired if needed + m_dest->io_work_queue->aio_write(comp, m_offset + write_offset, + write_length, + std::move(*write_bl), + LIBRADOS_OP_FLAG_FADVISE_DONTNEED, + std::move(read_trace)); + write_offset = offset; + write_length = 0; + } + } + delete m_bl; + ceph_assert(gather_ctx->get_sub_created_count() > 0); + gather_ctx->activate(); + } + + ZTracer::Trace read_trace; + + private: + SimpleThrottle *m_throttle; + ImageCtx *m_dest; + uint64_t m_offset; + bufferlist *m_bl; + 
size_t m_sparse_size; + }; + + int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size) + { + src->snap_lock.get_read(); + uint64_t src_size = src->get_image_size(src->snap_id); + src->snap_lock.put_read(); + + dest->snap_lock.get_read(); + uint64_t dest_size = dest->get_image_size(dest->snap_id); + dest->snap_lock.put_read(); + + CephContext *cct = src->cct; + if (dest_size < src_size) { + lderr(cct) << " src size " << src_size << " > dest size " + << dest_size << dendl; + return -EINVAL; + } + int r; + const uint32_t MAX_KEYS = 64; + map<string, bufferlist> pairs; + std::string last_key = ""; + bool more_results = true; + + while (more_results) { + r = cls_client::metadata_list(&src->md_ctx, src->header_oid, last_key, 0, &pairs); + if (r < 0 && r != -EOPNOTSUPP && r != -EIO) { + lderr(cct) << "couldn't list metadata: " << cpp_strerror(r) << dendl; + return r; + } else if (r == 0 && !pairs.empty()) { + r = cls_client::metadata_set(&dest->md_ctx, dest->header_oid, pairs); + if (r < 0) { + lderr(cct) << "couldn't set metadata: " << cpp_strerror(r) << dendl; + return r; + } + + last_key = pairs.rbegin()->first; + } + + more_results = (pairs.size() == MAX_KEYS); + pairs.clear(); + } + + ZTracer::Trace trace; + if (src->blkin_trace_all) { + trace.init("copy", &src->trace_endpoint); + } + + RWLock::RLocker owner_lock(src->owner_lock); + SimpleThrottle throttle(src->config.get_val<uint64_t>("rbd_concurrent_management_ops"), false); + uint64_t period = src->get_stripe_period(); + unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | + LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + uint64_t object_id = 0; + for (uint64_t offset = 0; offset < src_size; offset += period) { + if (throttle.pending_error()) { + return throttle.wait_for_ret(); + } + + { + RWLock::RLocker snap_locker(src->snap_lock); + if (src->object_map != nullptr) { + bool skip = true; + // each period is related to src->stripe_count objects, check them all + for (uint64_t i=0; i < 
src->stripe_count; i++) { + if (object_id < src->object_map->size() && + src->object_map->object_may_exist(object_id)) { + skip = false; + } + ++object_id; + } + + if (skip) continue; + } else { + object_id += src->stripe_count; + } + } + + uint64_t len = min(period, src_size - offset); + bufferlist *bl = new bufferlist(); + auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size); + auto comp = io::AioCompletion::create_and_start<Context>( + ctx, src, io::AIO_TYPE_READ); + + io::ImageReadRequest<> req(*src, comp, {{offset, len}}, + io::ReadResult{bl}, fadvise_flags, + std::move(trace)); + ctx->read_trace = req.get_trace(); + + req.send(); + prog_ctx.update_progress(offset, src_size); + } + + r = throttle.wait_for_ret(); + if (r >= 0) + prog_ctx.update_progress(src_size, src_size); + return r; + } + + int list_lockers(ImageCtx *ictx, + std::list<locker_t> *lockers, + bool *exclusive, + string *tag) + { + ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + RWLock::RLocker locker(ictx->md_lock); + if (exclusive) + *exclusive = ictx->exclusive_locked; + if (tag) + *tag = ictx->lock_tag; + if (lockers) { + lockers->clear(); + map<rados::cls::lock::locker_id_t, + rados::cls::lock::locker_info_t>::const_iterator it; + for (it = ictx->lockers.begin(); it != ictx->lockers.end(); ++it) { + locker_t locker; + locker.client = stringify(it->first.locker); + locker.cookie = it->first.cookie; + locker.address = it->second.addr.get_legacy_str(); + lockers->push_back(locker); + } + } + + return 0; + } + + int lock(ImageCtx *ictx, bool exclusive, const string& cookie, + const string& tag) + { + ldout(ictx->cct, 20) << "lock image " << ictx << " exclusive=" << exclusive + << " cookie='" << cookie << "' tag='" << tag << "'" + << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + /** + * If we wanted we could do something more intelligent, like local + 
* checks that we think we will succeed. But for now, let's not + * duplicate that code. + */ + { + RWLock::RLocker locker(ictx->md_lock); + r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME, + exclusive ? LOCK_EXCLUSIVE : LOCK_SHARED, + cookie, tag, "", utime_t(), 0); + if (r < 0) { + return r; + } + } + + ictx->notify_update(); + return 0; + } + + int unlock(ImageCtx *ictx, const string& cookie) + { + ldout(ictx->cct, 20) << "unlock image " << ictx + << " cookie='" << cookie << "'" << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + { + RWLock::RLocker locker(ictx->md_lock); + r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid, + RBD_LOCK_NAME, cookie); + if (r < 0) { + return r; + } + } + + ictx->notify_update(); + return 0; + } + + int break_lock(ImageCtx *ictx, const string& client, + const string& cookie) + { + ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client + << "' cookie='" << cookie << "'" << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + entity_name_t lock_client; + if (!lock_client.parse(client)) { + lderr(ictx->cct) << "Unable to parse client '" << client + << "'" << dendl; + return -EINVAL; + } + + if (ictx->config.get_val<bool>("rbd_blacklist_on_break_lock")) { + typedef std::map<rados::cls::lock::locker_id_t, + rados::cls::lock::locker_info_t> Lockers; + Lockers lockers; + ClsLockType lock_type; + std::string lock_tag; + r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid, + RBD_LOCK_NAME, &lockers, &lock_type, + &lock_tag); + if (r < 0) { + lderr(ictx->cct) << "unable to retrieve lock info: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string client_address; + for (Lockers::iterator it = lockers.begin(); + it != lockers.end(); ++it) { + if (it->first.locker == lock_client) { + client_address = it->second.addr.get_legacy_str(); + break; + } + } + if (client_address.empty()) { + return 
-ENOENT; + } + + RWLock::RLocker locker(ictx->md_lock); + librados::Rados rados(ictx->md_ctx); + r = rados.blacklist_add( + client_address, + ictx->config.get_val<uint64_t>("rbd_blacklist_expire_seconds")); + if (r < 0) { + lderr(ictx->cct) << "unable to blacklist client: " << cpp_strerror(r) + << dendl; + return r; + } + } + + r = rados::cls::lock::break_lock(&ictx->md_ctx, ictx->header_oid, + RBD_LOCK_NAME, cookie, lock_client); + if (r < 0) + return r; + ictx->notify_update(); + return 0; + }

  /**
   * Completion callback that bridges an AIO completion to a Context.
   *
   * @param cb the completion handle, reinterpreted as io::AioCompletion*
   * @param arg the user argument, reinterpreted as Context*
   *
   * The context is completed with the completion's return value, and the
   * completion is released afterwards.
   */
  void rbd_ctx_cb(completion_t cb, void *arg)
  {
    auto *aio_comp = reinterpret_cast<io::AioCompletion *>(cb);
    Context *on_finish = reinterpret_cast<Context *>(arg);

    on_finish->complete(aio_comp->get_return_value());
    aio_comp->release();
  }

+ int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg) + { + coarse_mono_time start_time; + ceph::timespan elapsed; + + ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off + << " len = " << len << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + uint64_t mylen = len; + ictx->snap_lock.get_read(); + r = clip_io(ictx, off, &mylen); + ictx->snap_lock.put_read(); + if (r < 0) + return r; + + int64_t total_read = 0; + uint64_t period = ictx->get_stripe_period(); + uint64_t left = mylen; + + ZTracer::Trace trace; + if (ictx->blkin_trace_all) { + trace.init("read_iterate", &ictx->trace_endpoint); + } + + RWLock::RLocker owner_locker(ictx->owner_lock); + start_time = coarse_mono_clock::now(); + while (left > 0) { + uint64_t period_off = off - (off % period); + uint64_t read_len = min(period_off + period - off, left); + + bufferlist bl; + + C_SaferCond ctx; + auto c = io::AioCompletion::create_and_start(&ctx, ictx, + io::AIO_TYPE_READ); + io::ImageRequest<>::aio_read(ictx, c, {{off, read_len}}, + io::ReadResult{&bl}, 0, std::move(trace)); + + int ret = ctx.wait(); + if (ret < 0) { + return ret; + } + + r = cb(total_read, ret,
bl.c_str(), arg); + if (r < 0) { + return r; + } + + total_read += ret; + left -= ret; + off += ret; + } + + elapsed = coarse_mono_clock::now() - start_time; + ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); + ictx->perfcounter->inc(l_librbd_rd); + ictx->perfcounter->inc(l_librbd_rd_bytes, mylen); + return total_read; + } + + // validate extent against image size; clip to image size if necessary + int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len) + { + ceph_assert(ictx->snap_lock.is_locked()); + uint64_t image_size = ictx->get_image_size(ictx->snap_id); + bool snap_exists = ictx->snap_exists; + + if (!snap_exists) + return -ENOENT; + + // special-case "len == 0" requests: always valid + if (*len == 0) + return 0; + + // can't start past end + if (off >= image_size) + return -EINVAL; + + // clip requests that extend past end to just end + if ((off + *len) > image_size) + *len = (size_t)(image_size - off); + + return 0; + } + + int invalidate_cache(ImageCtx *ictx) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "invalidate_cache " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + { + RWLock::RLocker owner_locker(ictx->owner_lock); + ictx->io_object_dispatcher->invalidate_cache(&ctx); + } + r = ctx.wait(); + ictx->perfcounter->inc(l_librbd_invalidate_cache); + return r; + } + + int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp) + { + if (numcomp <= 0) + return -EINVAL; + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp + << dendl; + int i = 0; + Mutex::Locker l(ictx->completed_reqs_lock); + numcomp = std::min(numcomp, (int)ictx->completed_reqs.size()); + while (i < numcomp) { + comps[i++] = ictx->completed_reqs.front(); + ictx->completed_reqs.pop_front(); + } + return i; + } + + int metadata_get(ImageCtx *ictx, const string &key, string *value) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << 
"metadata_get " << ictx << " key=" << key << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value); + } + + int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map<string, bufferlist> *pairs) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "metadata_list " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + return cls_client::metadata_list(&ictx->md_ctx, ictx->header_oid, start, max, pairs); + } + + struct C_RBD_Readahead : public Context { + ImageCtx *ictx; + object_t oid; + uint64_t offset; + uint64_t length; + + bufferlist read_data; + io::ExtentMap extent_map; + + C_RBD_Readahead(ImageCtx *ictx, object_t oid, uint64_t offset, uint64_t length) + : ictx(ictx), oid(oid), offset(offset), length(length) { + ictx->readahead.inc_pending(); + } + + void finish(int r) override { + ldout(ictx->cct, 20) << "C_RBD_Readahead on " << oid << ": " + << offset << "~" << length << dendl; + ictx->readahead.dec_pending(); + } + }; + + void readahead(ImageCtx *ictx, + const vector<pair<uint64_t,uint64_t> >& image_extents) + { + uint64_t total_bytes = 0; + for (vector<pair<uint64_t,uint64_t> >::const_iterator p = image_extents.begin(); + p != image_extents.end(); + ++p) { + total_bytes += p->second; + } + + ictx->md_lock.get_write(); + bool abort = ictx->readahead_disable_after_bytes != 0 && + ictx->total_bytes_read > ictx->readahead_disable_after_bytes; + if (abort) { + ictx->md_lock.put_write(); + return; + } + ictx->total_bytes_read += total_bytes; + ictx->snap_lock.get_read(); + uint64_t image_size = ictx->get_image_size(ictx->snap_id); + auto snap_id = ictx->snap_id; + ictx->snap_lock.put_read(); + ictx->md_lock.put_write(); + + pair<uint64_t, uint64_t> readahead_extent = ictx->readahead.update(image_extents, image_size); + uint64_t readahead_offset = readahead_extent.first; + uint64_t 
readahead_length = readahead_extent.second; + + if (readahead_length > 0) { + ldout(ictx->cct, 20) << "(readahead logical) " << readahead_offset << "~" << readahead_length << dendl; + map<object_t,vector<ObjectExtent> > readahead_object_extents; + Striper::file_to_extents(ictx->cct, ictx->format_string, &ictx->layout, + readahead_offset, readahead_length, 0, readahead_object_extents); + for (map<object_t,vector<ObjectExtent> >::iterator p = readahead_object_extents.begin(); p != readahead_object_extents.end(); ++p) { + for (vector<ObjectExtent>::iterator q = p->second.begin(); q != p->second.end(); ++q) { + ldout(ictx->cct, 20) << "(readahead) oid " << q->oid << " " << q->offset << "~" << q->length << dendl; + + auto req_comp = new C_RBD_Readahead(ictx, q->oid, q->offset, + q->length); + auto req = io::ObjectDispatchSpec::create_read( + ictx, io::OBJECT_DISPATCH_LAYER_NONE, q->oid.name, q->objectno, + q->offset, q->length, snap_id, 0, {}, &req_comp->read_data, + &req_comp->extent_map, req_comp); + req->send(); + } + } + ictx->perfcounter->inc(l_librbd_readahead); + ictx->perfcounter->inc(l_librbd_readahead_bytes, readahead_length); + } + } + + int list_watchers(ImageCtx *ictx, + std::list<librbd::image_watcher_t> &watchers) + { + int r; + std::string header_oid; + std::list<obj_watch_t> obj_watchers; + + if (ictx->old_format) { + header_oid = util::old_header_name(ictx->name); + } else { + header_oid = util::header_name(ictx->id); + } + + r = ictx->md_ctx.list_watchers(header_oid, &obj_watchers); + if (r < 0) { + return r; + } + + for (auto i = obj_watchers.begin(); i != obj_watchers.end(); ++i) { + librbd::image_watcher_t watcher; + watcher.addr = i->addr; + watcher.id = i->watcher_id; + watcher.cookie = i->cookie; + + watchers.push_back(watcher); + } + + return 0; + } + +} + +std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts) { + os << "["; + + const char *delimiter = ""; + for (auto &i : librbd::IMAGE_OPTIONS_TYPE_MAPPING) { + if 
(i.second == librbd::STR) { + std::string val; + if (opts.get(i.first, &val) == 0) { + os << delimiter << librbd::image_option_name(i.first) << "=" << val; + delimiter = ", "; + } + } else if (i.second == librbd::UINT64) { + uint64_t val; + if (opts.get(i.first, &val) == 0) { + os << delimiter << librbd::image_option_name(i.first) << "=" << val; + delimiter = ", "; + } + } + } + + os << "]"; + + return os; +} diff --git a/src/librbd/internal.h b/src/librbd/internal.h new file mode 100644 index 00000000..1e1864f3 --- /dev/null +++ b/src/librbd/internal.h @@ -0,0 +1,155 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_INTERNAL_H +#define CEPH_LIBRBD_INTERNAL_H + +#include "include/int_types.h" + +#include <map> +#include <set> +#include <string> +#include <vector> + +#include "include/buffer_fwd.h" +#include "include/rbd/librbd.hpp" +#include "include/rbd_types.h" +#include "cls/rbd/cls_rbd_types.h" +#include "common/WorkQueue.h" +#include "common/ceph_time.h" +#include "librbd/Types.h" + +namespace librbd { + + struct ImageCtx; + namespace io { struct AioCompletion; } + + class NoOpProgressContext : public ProgressContext + { + public: + NoOpProgressContext() + { + } + int update_progress(uint64_t offset, uint64_t src_size) override + { + return 0; + } + }; + + int detect_format(librados::IoCtx &io_ctx, const std::string &name, + bool *old_format, uint64_t *size); + + bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap); + + std::string image_option_name(int optname); + void image_options_create(rbd_image_options_t* opts); + void image_options_create_ref(rbd_image_options_t* opts, + rbd_image_options_t orig); + void image_options_copy(rbd_image_options_t *opts, + const ImageOptions &orig); + void image_options_destroy(rbd_image_options_t opts); + int image_options_set(rbd_image_options_t opts, int optname, + const std::string& optval); + int 
image_options_set(rbd_image_options_t opts, int optname, uint64_t optval); + int image_options_get(rbd_image_options_t opts, int optname, + std::string* optval); + int image_options_get(rbd_image_options_t opts, int optname, + uint64_t* optval); + int image_options_is_set(rbd_image_options_t opts, int optname, + bool* is_set); + int image_options_unset(rbd_image_options_t opts, int optname); + void image_options_clear(rbd_image_options_t opts); + bool image_options_is_empty(rbd_image_options_t opts); + + int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size, + int *order); + int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size, + bool old_format, uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count); + int create(IoCtx& io_ctx, const std::string &image_name, + const std::string &image_id, uint64_t size, ImageOptions& opts, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + bool skip_mirror_enable); + int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, + uint64_t features, int *c_order, + uint64_t stripe_unit, int stripe_count); + int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name, + const char *p_snap_name, IoCtx& c_ioctx, const char *c_id, + const char *c_name, ImageOptions& c_opts, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid); + int rename(librados::IoCtx& io_ctx, const char *srcname, const char *dstname); + int info(ImageCtx *ictx, image_info_t& info, size_t image_size); + int get_old_format(ImageCtx *ictx, uint8_t *old); + int get_size(ImageCtx *ictx, uint64_t *size); + int get_features(ImageCtx *ictx, uint64_t *features); + int get_overlap(ImageCtx *ictx, uint64_t *overlap); + int get_flags(ImageCtx *ictx, uint64_t *flags); + int set_image_notification(ImageCtx *ictx, int fd, int type); + int is_exclusive_lock_owner(ImageCtx *ictx, bool 
*is_owner); + int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode); + int lock_release(ImageCtx *ictx); + int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode, + std::list<std::string> *lock_owners); + int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode, + const std::string &lock_owner); + + int snap_list(ImageCtx *ictx, std::vector<snap_info_t>& snaps); + int snap_exists(ImageCtx *ictx, const cls::rbd::SnapshotNamespace& snap_namespace, + const char *snap_name, bool *exists); + int snap_get_limit(ImageCtx *ictx, uint64_t *limit); + int snap_set_limit(ImageCtx *ictx, uint64_t limit); + int snap_get_timestamp(ImageCtx *ictx, uint64_t snap_id, struct timespec *timestamp); + int snap_remove(ImageCtx *ictx, const char *snap_name, uint32_t flags, ProgressContext& pctx); + int snap_is_protected(ImageCtx *ictx, const char *snap_name, + bool *is_protected); + int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size); + int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size); + + /* cooperative locking */ + int list_lockers(ImageCtx *ictx, + std::list<locker_t> *locks, + bool *exclusive, + std::string *tag); + int lock(ImageCtx *ictx, bool exclusive, const std::string& cookie, + const std::string& tag); + int lock_shared(ImageCtx *ictx, const std::string& cookie, + const std::string& tag); + int unlock(ImageCtx *ictx, const std::string& cookie); + int break_lock(ImageCtx *ictx, const std::string& client, + const std::string& cookie); + + void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx); + + int read_header_bl(librados::IoCtx& io_ctx, const std::string& md_oid, + ceph::bufferlist& header, uint64_t *ver); + int read_header(librados::IoCtx& io_ctx, const std::string& md_oid, + struct rbd_obj_header_ondisk *header, uint64_t *ver); + int tmap_set(librados::IoCtx& io_ctx, const std::string& imgname); + int 
tmap_rm(librados::IoCtx& io_ctx, const std::string& imgname); + void image_info(const ImageCtx *ictx, image_info_t& info, size_t info_size); + uint64_t oid_to_object_no(const std::string& oid, + const std::string& object_prefix); + int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len); + void init_rbd_header(struct rbd_obj_header_ondisk& ondisk, + uint64_t size, int order, uint64_t bid); + + int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg); + void readahead(ImageCtx *ictx, + const vector<pair<uint64_t,uint64_t> >& image_extents); + + int invalidate_cache(ImageCtx *ictx); + int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp); + int metadata_list(ImageCtx *ictx, const string &last, uint64_t max, map<string, bufferlist> *pairs); + int metadata_get(ImageCtx *ictx, const std::string &key, std::string *value); + + int list_watchers(ImageCtx *ictx, std::list<librbd::image_watcher_t> &watchers); +} + +std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts); + +#endif diff --git a/src/librbd/io/AioCompletion.cc b/src/librbd/io/AioCompletion.cc new file mode 100644 index 00000000..73c58167 --- /dev/null +++ b/src/librbd/io/AioCompletion.cc @@ -0,0 +1,216 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/AioCompletion.h" +#include <errno.h> + +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/WorkQueue.h" + +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include "librbd/Types.h" + +#ifdef WITH_LTTNG +#include "tracing/librbd.h" +#else +#define tracepoint(...) 
+#endif + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::AioCompletion: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +int AioCompletion::wait_for_complete() { + tracepoint(librbd, aio_wait_for_complete_enter, this); + lock.Lock(); + while (state != AIO_STATE_COMPLETE) + cond.Wait(lock); + lock.Unlock(); + tracepoint(librbd, aio_wait_for_complete_exit, 0); + return 0; +} + +void AioCompletion::finalize(ssize_t rval) +{ + ceph_assert(lock.is_locked()); + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + ldout(cct, 20) << "r=" << rval << dendl; + if (rval >= 0 && aio_type == AIO_TYPE_READ) { + read_result.assemble_result(cct); + } +} + +void AioCompletion::complete() { + ceph_assert(lock.is_locked()); + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + tracepoint(librbd, aio_complete_enter, this, rval); + if (ictx->perfcounter != nullptr) { + ceph::timespan elapsed = coarse_mono_clock::now() - start_time; + switch (aio_type) { + case AIO_TYPE_GENERIC: + case AIO_TYPE_OPEN: + case AIO_TYPE_CLOSE: + break; + case AIO_TYPE_READ: + ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); break; + case AIO_TYPE_WRITE: + ictx->perfcounter->tinc(l_librbd_wr_latency, elapsed); break; + case AIO_TYPE_DISCARD: + ictx->perfcounter->tinc(l_librbd_discard_latency, elapsed); break; + case AIO_TYPE_FLUSH: + ictx->perfcounter->tinc(l_librbd_flush_latency, elapsed); break; + case AIO_TYPE_WRITESAME: + ictx->perfcounter->tinc(l_librbd_ws_latency, elapsed); break; + case AIO_TYPE_COMPARE_AND_WRITE: + ictx->perfcounter->tinc(l_librbd_cmp_latency, elapsed); break; + default: + lderr(cct) << "completed invalid aio_type: " << aio_type << dendl; + break; + } + } + + if ((aio_type == AIO_TYPE_CLOSE) || + (aio_type == AIO_TYPE_OPEN && rval < 0)) { + // must destroy ImageCtx prior to invoking callback + delete ictx; + ictx = nullptr; + } + + state = AIO_STATE_CALLBACK; + if 
(complete_cb) { + lock.Unlock(); + complete_cb(rbd_comp, complete_arg); + lock.Lock(); + } + + if (ictx != nullptr && event_notify && ictx->event_socket.is_valid()) { + ictx->completed_reqs_lock.Lock(); + ictx->completed_reqs.push_back(&m_xlist_item); + ictx->completed_reqs_lock.Unlock(); + ictx->event_socket.notify(); + } + + state = AIO_STATE_COMPLETE; + cond.Signal(); + + // note: possible for image to be closed after op marked finished + if (async_op.started()) { + async_op.finish_op(); + } + tracepoint(librbd, aio_complete_exit); +} + +void AioCompletion::init_time(ImageCtx *i, aio_type_t t) { + Mutex::Locker locker(lock); + if (ictx == nullptr) { + ictx = i; + aio_type = t; + start_time = coarse_mono_clock::now(); + } +} + +void AioCompletion::start_op() { + Mutex::Locker locker(lock); + ceph_assert(ictx != nullptr); + + if (aio_type == AIO_TYPE_OPEN || aio_type == AIO_TYPE_CLOSE) { + // no need to track async open/close operations + return; + } + + ceph_assert(!async_op.started()); + async_op.start_op(*ictx); +} + +void AioCompletion::fail(int r) +{ + lock.Lock(); + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + lderr(cct) << cpp_strerror(r) << dendl; + ceph_assert(pending_count == 0); + rval = r; + complete(); + put_unlock(); +} + +void AioCompletion::set_request_count(uint32_t count) { + lock.Lock(); + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + ldout(cct, 20) << "pending=" << count << dendl; + ceph_assert(pending_count == 0); + + if (count > 0) { + pending_count = count; + lock.Unlock(); + } else { + pending_count = 1; + lock.Unlock(); + + // ensure completion fires in clean lock context + ictx->op_work_queue->queue(new C_AioRequest(this), 0); + } +} + +void AioCompletion::complete_request(ssize_t r) +{ + lock.Lock(); + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + if (rval >= 0) { + if (r < 0 && r != -EEXIST) + rval = r; + else if (r > 0) + rval += r; + } + ceph_assert(pending_count); + int 
count = --pending_count; + + ldout(cct, 20) << "cb=" << complete_cb << ", " + << "pending=" << pending_count << dendl; + if (!count) { + finalize(rval); + complete(); + } + put_unlock(); +} + +bool AioCompletion::is_complete() { + tracepoint(librbd, aio_is_complete_enter, this); + bool done; + { + Mutex::Locker l(lock); + done = this->state == AIO_STATE_COMPLETE; + } + tracepoint(librbd, aio_is_complete_exit, done); + return done; +} + +ssize_t AioCompletion::get_return_value() { + tracepoint(librbd, aio_get_return_value_enter, this); + lock.Lock(); + ssize_t r = rval; + lock.Unlock(); + tracepoint(librbd, aio_get_return_value_exit, r); + return r; +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/AioCompletion.h b/src/librbd/io/AioCompletion.h new file mode 100644 index 00000000..f3551a02 --- /dev/null +++ b/src/librbd/io/AioCompletion.h @@ -0,0 +1,203 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_AIO_COMPLETION_H +#define CEPH_LIBRBD_IO_AIO_COMPLETION_H + +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/ceph_time.h" +#include "include/Context.h" +#include "include/utime.h" +#include "include/rbd/librbd.hpp" + +#include "librbd/ImageCtx.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" + +class CephContext; + +namespace librbd { +namespace io { + + +/** + * AioCompletion is the overall completion for a single + * rbd I/O request. It may be composed of many AioObjectRequests, + * which each go to a single object. + * + * The retrying of individual requests is handled at a lower level, + * so all AioCompletion cares about is the count of outstanding + * requests. The number of expected individual requests should be + * set initially using set_request_count() prior to issuing the + * requests. 
This ensures that the completion will not be completed + * within the caller's thread of execution (instead via a librados + * context or via a thread pool context for cache read hits). + */ +struct AioCompletion { + typedef enum { + AIO_STATE_PENDING = 0, + AIO_STATE_CALLBACK, + AIO_STATE_COMPLETE, + } aio_state_t; + + mutable Mutex lock; + Cond cond; + aio_state_t state; + ssize_t rval; + callback_t complete_cb; + void *complete_arg; + rbd_completion_t rbd_comp; + uint32_t pending_count; ///< number of requests + int ref; + bool released; + ImageCtx *ictx; + coarse_mono_time start_time; + aio_type_t aio_type; + + ReadResult read_result; + + AsyncOperation async_op; + + xlist<AioCompletion*>::item m_xlist_item; + bool event_notify; + + template <typename T, void (T::*MF)(int)> + static void callback_adapter(completion_t cb, void *arg) { + AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb); + T *t = reinterpret_cast<T *>(arg); + (t->*MF)(comp->get_return_value()); + comp->release(); + } + + static AioCompletion *create(void *cb_arg, callback_t cb_complete, + rbd_completion_t rbd_comp) { + AioCompletion *comp = new AioCompletion(); + comp->set_complete_cb(cb_arg, cb_complete); + comp->rbd_comp = (rbd_comp != nullptr ? 
rbd_comp : comp); + return comp; + } + + template <typename T, void (T::*MF)(int) = &T::complete> + static AioCompletion *create(T *obj) { + AioCompletion *comp = new AioCompletion(); + comp->set_complete_cb(obj, &callback_adapter<T, MF>); + comp->rbd_comp = comp; + return comp; + } + + template <typename T, void (T::*MF)(int) = &T::complete> + static AioCompletion *create_and_start(T *obj, ImageCtx *image_ctx, + aio_type_t type) { + AioCompletion *comp = create<T, MF>(obj); + comp->init_time(image_ctx, type); + comp->start_op(); + return comp; + } + + AioCompletion() : lock("AioCompletion::lock", true, false), + state(AIO_STATE_PENDING), rval(0), complete_cb(NULL), + complete_arg(NULL), rbd_comp(NULL), + pending_count(0), ref(1), released(false), ictx(NULL), + aio_type(AIO_TYPE_NONE), m_xlist_item(this), + event_notify(false) { + } + + ~AioCompletion() { + } + + int wait_for_complete(); + + void finalize(ssize_t rval); + + inline bool is_initialized(aio_type_t type) const { + Mutex::Locker locker(lock); + return ((ictx != nullptr) && (aio_type == type)); + } + inline bool is_started() const { + Mutex::Locker locker(lock); + return async_op.started(); + } + + void init_time(ImageCtx *i, aio_type_t t); + void start_op(); + void fail(int r); + + void complete(); + + void set_complete_cb(void *cb_arg, callback_t cb) { + complete_cb = cb; + complete_arg = cb_arg; + } + + void set_request_count(uint32_t num); + void add_request() { + lock.Lock(); + ceph_assert(pending_count > 0); + lock.Unlock(); + get(); + } + void complete_request(ssize_t r); + + bool is_complete(); + + ssize_t get_return_value(); + + void get() { + lock.Lock(); + ceph_assert(ref > 0); + ref++; + lock.Unlock(); + } + void release() { + lock.Lock(); + ceph_assert(!released); + released = true; + put_unlock(); + } + void put() { + lock.Lock(); + put_unlock(); + } + void put_unlock() { + ceph_assert(ref > 0); + int n = --ref; + lock.Unlock(); + if (!n) { + if (ictx != nullptr && event_notify) { + 
ictx->completed_reqs_lock.Lock(); + m_xlist_item.remove_myself(); + ictx->completed_reqs_lock.Unlock(); + } + delete this; + } + } + + void set_event_notify(bool s) { + Mutex::Locker l(lock); + event_notify = s; + } + + void *get_arg() { + return complete_arg; + } +}; + +class C_AioRequest : public Context { +public: + C_AioRequest(AioCompletion *completion) : m_completion(completion) { + m_completion->add_request(); + } + ~C_AioRequest() override {} + void finish(int r) override { + m_completion->complete_request(r); + } +protected: + AioCompletion *m_completion; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_AIO_COMPLETION_H diff --git a/src/librbd/io/AsyncOperation.cc b/src/librbd/io/AsyncOperation.cc new file mode 100644 index 00000000..c5a3bc93 --- /dev/null +++ b/src/librbd/io/AsyncOperation.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/AsyncOperation.h" +#include "librbd/ImageCtx.h" +#include "common/dout.h" +#include "common/WorkQueue.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::AsyncOperation: " + +namespace librbd { +namespace io { + +namespace { + +struct C_CompleteFlushes : public Context { + ImageCtx *image_ctx; + std::list<Context *> flush_contexts; + + explicit C_CompleteFlushes(ImageCtx *image_ctx, + std::list<Context *> &&flush_contexts) + : image_ctx(image_ctx), flush_contexts(std::move(flush_contexts)) { + } + void finish(int r) override { + RWLock::RLocker owner_locker(image_ctx->owner_lock); + while (!flush_contexts.empty()) { + Context *flush_ctx = flush_contexts.front(); + flush_contexts.pop_front(); + + ldout(image_ctx->cct, 20) << "completed flush: " << flush_ctx << dendl; + flush_ctx->complete(0); + } + } +}; + +} // anonymous namespace + +void AsyncOperation::start_op(ImageCtx &image_ctx) { + ceph_assert(m_image_ctx == 
NULL); + m_image_ctx = &image_ctx; + + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl; + Mutex::Locker l(m_image_ctx->async_ops_lock); + m_image_ctx->async_ops.push_front(&m_xlist_item); +} + +void AsyncOperation::finish_op() { + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl; + + { + Mutex::Locker l(m_image_ctx->async_ops_lock); + xlist<AsyncOperation *>::iterator iter(&m_xlist_item); + ++iter; + ceph_assert(m_xlist_item.remove_myself()); + + // linked list stored newest -> oldest ops + if (!iter.end() && !m_flush_contexts.empty()) { + ldout(m_image_ctx->cct, 20) << "moving flush contexts to previous op: " + << *iter << dendl; + (*iter)->m_flush_contexts.insert((*iter)->m_flush_contexts.end(), + m_flush_contexts.begin(), + m_flush_contexts.end()); + return; + } + } + + if (!m_flush_contexts.empty()) { + C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx, + std::move(m_flush_contexts)); + m_image_ctx->op_work_queue->queue(ctx); + } +} + +void AsyncOperation::flush(Context* on_finish) { + { + Mutex::Locker locker(m_image_ctx->async_ops_lock); + xlist<AsyncOperation *>::iterator iter(&m_xlist_item); + ++iter; + + // linked list stored newest -> oldest ops + if (!iter.end()) { + (*iter)->m_flush_contexts.push_back(on_finish); + return; + } + } + + m_image_ctx->op_work_queue->queue(on_finish); +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/AsyncOperation.h b/src/librbd/io/AsyncOperation.h new file mode 100644 index 00000000..b0a37c4b --- /dev/null +++ b/src/librbd/io/AsyncOperation.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_IO_ASYNC_OPERATION_H +#define LIBRBD_IO_ASYNC_OPERATION_H + +#include "include/ceph_assert.h" +#include "include/xlist.h" +#include <list> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace io { + +class AsyncOperation { +public: + + AsyncOperation() + : 
m_image_ctx(NULL), m_xlist_item(this) + { + } + + ~AsyncOperation() + { + ceph_assert(!m_xlist_item.is_on_list()); + } + + inline bool started() const { + return m_xlist_item.is_on_list(); + } + + void start_op(ImageCtx &image_ctx); + void finish_op(); + + void flush(Context *on_finish); + +private: + + ImageCtx *m_image_ctx; + xlist<AsyncOperation *>::item m_xlist_item; + std::list<Context *> m_flush_contexts; + +}; + +} // namespace io +} // namespace librbd + +#endif // LIBRBD_IO_ASYNC_OPERATION_H diff --git a/src/librbd/io/CopyupRequest.cc b/src/librbd/io/CopyupRequest.cc new file mode 100644 index 00000000..c2ebb10f --- /dev/null +++ b/src/librbd/io/CopyupRequest.cc @@ -0,0 +1,625 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/CopyupRequest.h" +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Mutex.h" +#include "common/WorkQueue.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/deep_copy/ObjectCopyRequest.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/ReadResult.h" + +#include <boost/bind.hpp> +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::CopyupRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +namespace { + +template <typename I> +class C_UpdateObjectMap : public C_AsyncObjectThrottle<I> { +public: + C_UpdateObjectMap(AsyncObjectThrottle<I> &throttle, I *image_ctx, + uint64_t object_no, uint8_t head_object_map_state, + const std::vector<uint64_t> *snap_ids, + bool first_snap_is_clean, const ZTracer::Trace &trace, + size_t snap_id_idx) + : 
C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no), + m_head_object_map_state(head_object_map_state), m_snap_ids(*snap_ids), + m_first_snap_is_clean(first_snap_is_clean), m_trace(trace), + m_snap_id_idx(snap_id_idx) + { + } + + int send() override { + auto& image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + if (image_ctx.exclusive_lock == nullptr) { + return 1; + } + ceph_assert(image_ctx.exclusive_lock->is_lock_owner()); + + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.object_map == nullptr) { + return 1; + } + + uint64_t snap_id = m_snap_ids[m_snap_id_idx]; + if (snap_id == CEPH_NOSNAP) { + return update_head(); + } else { + return update_snapshot(snap_id); + } + } + + int update_head() { + auto& image_ctx = this->m_image_ctx; + RWLock::WLocker object_map_locker(image_ctx.object_map_lock); + bool sent = image_ctx.object_map->template aio_update<Context>( + CEPH_NOSNAP, m_object_no, m_head_object_map_state, {}, m_trace, false, + this); + return (sent ? 0 : 1); + } + + int update_snapshot(uint64_t snap_id) { + auto& image_ctx = this->m_image_ctx; + uint8_t state = OBJECT_EXISTS; + if (image_ctx.test_features(RBD_FEATURE_FAST_DIFF, image_ctx.snap_lock) && + (m_snap_id_idx > 0 || m_first_snap_is_clean)) { + // first snapshot should be exists+dirty since it contains + // the copyup data -- later snapshots inherit the data. 
+ state = OBJECT_EXISTS_CLEAN; + } + + RWLock::RLocker object_map_locker(image_ctx.object_map_lock); + bool sent = image_ctx.object_map->template aio_update<Context>( + snap_id, m_object_no, state, {}, m_trace, true, this); + ceph_assert(sent); + return 0; + } + +private: + uint64_t m_object_no; + uint8_t m_head_object_map_state; + const std::vector<uint64_t> &m_snap_ids; + bool m_first_snap_is_clean; + const ZTracer::Trace &m_trace; + size_t m_snap_id_idx; +}; + +} // anonymous namespace + +template <typename I> +CopyupRequest<I>::CopyupRequest(I *ictx, const std::string &oid, + uint64_t objectno, Extents &&image_extents, + const ZTracer::Trace &parent_trace) + : m_image_ctx(ictx), m_oid(oid), m_object_no(objectno), + // take over the caller's extents: a named rvalue-reference parameter + // is an lvalue, so without std::move the vector would be copied + m_image_extents(std::move(image_extents)), + m_trace(util::create_trace(*m_image_ctx, "copy-up", parent_trace)), + m_lock("CopyupRequest", false, false) +{ + ceph_assert(m_image_ctx->data_ctx.is_valid()); + m_async_op.start_op(*util::get_image_ctx(m_image_ctx)); +} + +template <typename I> +CopyupRequest<I>::~CopyupRequest() { + ceph_assert(m_pending_requests.empty()); + m_async_op.finish_op(); +} + +template <typename I> +void CopyupRequest<I>::append_request(AbstractObjectWriteRequest<I> *req) { + Mutex::Locker locker(m_lock); + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "oid=" << m_oid << ", " + << "object_request=" << req << ", " + << "append=" << m_append_request_permitted << dendl; + if (m_append_request_permitted) { + m_pending_requests.push_back(req); + } else { + m_restart_requests.push_back(req); + } +} + +template <typename I> +void CopyupRequest<I>::send() { + read_from_parent(); +} + +template <typename I> +void CopyupRequest<I>::read_from_parent() { + auto cct = m_image_ctx->cct; + RWLock::RLocker snap_locker(m_image_ctx->snap_lock); + RWLock::RLocker parent_locker(m_image_ctx->parent_lock); + + if (m_image_ctx->parent == nullptr) { + ldout(cct, 5) << "parent detached" << dendl; + + m_image_ctx->op_work_queue->queue( + 
util::create_context_callback< + CopyupRequest<I>, &CopyupRequest<I>::handle_read_from_parent>(this), + -ENOENT); + return; + } else if (is_deep_copy()) { + deep_copy(); + return; + } + + auto comp = AioCompletion::create_and_start< + CopyupRequest<I>, + &CopyupRequest<I>::handle_read_from_parent>( + this, util::get_image_ctx(m_image_ctx->parent), AIO_TYPE_READ); + + ldout(cct, 20) << "oid=" << m_oid << ", " + << "completion=" << comp << ", " + << "extents=" << m_image_extents + << dendl; + ImageRequest<I>::aio_read(m_image_ctx->parent, comp, + std::move(m_image_extents), + ReadResult{&m_copyup_data}, 0, m_trace); +} + +template <typename I> +void CopyupRequest<I>::handle_read_from_parent(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "oid=" << m_oid << ", r=" << r << dendl; + + m_image_ctx->snap_lock.get_read(); + m_lock.Lock(); + m_copyup_is_zero = m_copyup_data.is_zero(); + m_copyup_required = is_copyup_required(); + disable_append_requests(); + + if (r < 0 && r != -ENOENT) { + m_lock.Unlock(); + m_image_ctx->snap_lock.put_read(); + + lderr(cct) << "error reading from parent: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (!m_copyup_required) { + m_lock.Unlock(); + m_image_ctx->snap_lock.put_read(); + + ldout(cct, 20) << "no-op, skipping" << dendl; + finish(0); + return; + } + + // copyup() will affect snapshots only if parent data is not all + // zeros. + if (!m_copyup_is_zero) { + m_snap_ids.insert(m_snap_ids.end(), m_image_ctx->snaps.rbegin(), + m_image_ctx->snaps.rend()); + } + + m_lock.Unlock(); + m_image_ctx->snap_lock.put_read(); + + update_object_maps(); +} + +template <typename I> +void CopyupRequest<I>::deep_copy() { + auto cct = m_image_ctx->cct; + ceph_assert(m_image_ctx->snap_lock.is_locked()); + ceph_assert(m_image_ctx->parent_lock.is_locked()); + ceph_assert(m_image_ctx->parent != nullptr); + + m_lock.Lock(); + m_flatten = is_copyup_required() ? 
true : m_image_ctx->migration_info.flatten; + m_lock.Unlock(); + + ldout(cct, 20) << "oid=" << m_oid << ", flatten=" << m_flatten << dendl; + + auto ctx = util::create_context_callback< + CopyupRequest<I>, &CopyupRequest<I>::handle_deep_copy>(this); + auto req = deep_copy::ObjectCopyRequest<I>::create( + m_image_ctx->parent, m_image_ctx, 0, 0, + m_image_ctx->migration_info.snap_map, m_object_no, m_flatten, ctx); + + req->send(); +} + +template <typename I> +void CopyupRequest<I>::handle_deep_copy(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "oid=" << m_oid << ", r=" << r << dendl; + + m_image_ctx->snap_lock.get_read(); + m_lock.Lock(); + m_copyup_required = is_copyup_required(); + if (r == -ENOENT && !m_flatten && m_copyup_required) { + m_lock.Unlock(); + m_image_ctx->snap_lock.put_read(); + + ldout(cct, 10) << "restart deep-copy with flatten" << dendl; + send(); + return; + } + + disable_append_requests(); + + if (r < 0 && r != -ENOENT) { + m_lock.Unlock(); + m_image_ctx->snap_lock.put_read(); + + lderr(cct) << "error encountered during deep-copy: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (!m_copyup_required && !is_update_object_map_required(r)) { + m_lock.Unlock(); + m_image_ctx->snap_lock.put_read(); + + if (r == -ENOENT) { + r = 0; + } + + ldout(cct, 20) << "skipping" << dendl; + finish(r); + return; + } + + // For deep-copy, copyup() will never affect snapshots. However, + // this state machine is responsible for updating object maps for + // snapshots that have been created on destination image after + // migration started. 
+ if (r != -ENOENT) { + compute_deep_copy_snap_ids(); + } + + m_lock.Unlock(); + m_image_ctx->snap_lock.put_read(); + + update_object_maps(); +} + +template <typename I> +void CopyupRequest<I>::update_object_maps() { + RWLock::RLocker owner_locker(m_image_ctx->owner_lock); + RWLock::RLocker snap_locker(m_image_ctx->snap_lock); + if (m_image_ctx->object_map == nullptr) { + snap_locker.unlock(); + owner_locker.unlock(); + + copyup(); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "oid=" << m_oid << dendl; + + bool copy_on_read = m_pending_requests.empty(); + uint8_t head_object_map_state = OBJECT_EXISTS; + if (copy_on_read && !m_snap_ids.empty() && + m_image_ctx->test_features(RBD_FEATURE_FAST_DIFF, + m_image_ctx->snap_lock)) { + // HEAD is non-dirty since data is tied to first snapshot + head_object_map_state = OBJECT_EXISTS_CLEAN; + } + + auto r_it = m_pending_requests.rbegin(); + if (r_it != m_pending_requests.rend()) { + // last write-op determines the final object map state + head_object_map_state = (*r_it)->get_pre_write_object_map_state(); + } + + RWLock::WLocker object_map_locker(m_image_ctx->object_map_lock); + if ((*m_image_ctx->object_map)[m_object_no] != head_object_map_state) { + // (maybe) need to update the HEAD object map state + m_snap_ids.push_back(CEPH_NOSNAP); + } + object_map_locker.unlock(); + snap_locker.unlock(); + + ceph_assert(m_image_ctx->exclusive_lock->is_lock_owner()); + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_UpdateObjectMap<I>>(), + boost::lambda::_1, m_image_ctx, m_object_no, head_object_map_state, + &m_snap_ids, m_first_snap_is_clean, m_trace, boost::lambda::_2)); + auto ctx = util::create_context_callback< + CopyupRequest<I>, &CopyupRequest<I>::handle_update_object_maps>(this); + auto throttle = new AsyncObjectThrottle<I>( + nullptr, *m_image_ctx, context_factory, ctx, nullptr, 0, m_snap_ids.size()); + throttle->start_ops( + 
m_image_ctx->config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +void CopyupRequest<I>::handle_update_object_maps(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "oid=" << m_oid << ", r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to update object map: " + << cpp_strerror(r) << dendl; + + finish(r); + return; + } + + copyup(); +} + +// Builds and issues the RADOS op(s) that perform the copyup; finishes with 0 +// right away when no copyup is required. Up to two ops are sent: copyup_op +// (copy-on-read or deep copyup) with an empty snapshot context, and write_op +// (the pending write ops, plus the copyup itself when it was not sent +// separately) with the image's current snapshot context. m_pending_copyups +// is incremented once per op; each op completes through handle_copyup(). +template <typename I> +void CopyupRequest<I>::copyup() { + auto cct = m_image_ctx->cct; + m_image_ctx->snap_lock.get_read(); + auto snapc = m_image_ctx->snapc; + m_image_ctx->snap_lock.put_read(); + + m_lock.Lock(); + if (!m_copyup_required) { + m_lock.Unlock(); + + ldout(cct, 20) << "skipping copyup" << dendl; + finish(0); + return; + } + + ldout(cct, 20) << "oid=" << m_oid << dendl; + + bool copy_on_read = m_pending_requests.empty(); + bool deep_copyup = !snapc.snaps.empty() && !m_copyup_is_zero; + if (m_copyup_is_zero) { + m_copyup_data.clear(); + } + + int r; + librados::ObjectWriteOperation copyup_op; + if (copy_on_read || deep_copyup) { + copyup_op.exec("rbd", "copyup", m_copyup_data); + ObjectRequest<I>::add_write_hint(*m_image_ctx, &copyup_op); + ++m_pending_copyups; + } + + librados::ObjectWriteOperation write_op; + if (!copy_on_read) { + if (!deep_copyup) { + write_op.exec("rbd", "copyup", m_copyup_data); + ObjectRequest<I>::add_write_hint(*m_image_ctx, &write_op); + } + + // merge all pending write ops into this single RADOS op + for (auto req : m_pending_requests) { + ldout(cct, 20) << "add_copyup_ops " << req << dendl; + req->add_copyup_ops(&write_op); + } + + if (write_op.size() > 0) { + ++m_pending_copyups; + } + } + m_lock.Unlock(); + + // issue librados ops at the end to simplify test cases + std::vector<librados::snap_t> snaps; + if (copyup_op.size() > 0) { + // send only the copyup request with a blank snapshot context so that + // all snapshots are detected from the parent for this object. 
If + // this is a CoW request, a second request will be created for the + // actual modification. + ldout(cct, 20) << "copyup with empty snapshot context" << dendl; + + auto comp = util::create_rados_callback< + CopyupRequest<I>, &CopyupRequest<I>::handle_copyup>(this); + r = m_image_ctx->data_ctx.aio_operate( + m_oid, comp, &copyup_op, 0, snaps, + (m_trace.valid() ? m_trace.get_info() : nullptr)); + ceph_assert(r == 0); + comp->release(); + } + + if (write_op.size() > 0) { + // compare-and-write doesn't add any write ops (copyup+cmpext+write + // can't be executed in the same RADOS op because, unless the object + // was already present in the clone, cmpext wouldn't see it) + ldout(cct, 20) << (!deep_copyup && write_op.size() > 2 ? + "copyup + ops" : !deep_copyup ? "copyup" : "ops") + << " with current snapshot context" << dendl; + + snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end()); + auto comp = util::create_rados_callback< + CopyupRequest<I>, &CopyupRequest<I>::handle_copyup>(this); + r = m_image_ctx->data_ctx.aio_operate( + m_oid, comp, &write_op, snapc.seq, snaps, + (m_trace.valid() ? 
m_trace.get_info() : nullptr)); + ceph_assert(r == 0); + comp->release(); + } +} + +template <typename I> +void CopyupRequest<I>::handle_copyup(int r) { + auto cct = m_image_ctx->cct; + unsigned pending_copyups; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_pending_copyups > 0); + pending_copyups = --m_pending_copyups; + } + + ldout(cct, 20) << "oid=" << m_oid << ", " << "r=" << r << ", " + << "pending=" << pending_copyups << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to copyup object: " << cpp_strerror(r) << dendl; + complete_requests(false, r); + } + + if (pending_copyups == 0) { + finish(0); + } +} + +template <typename I> +void CopyupRequest<I>::finish(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "oid=" << m_oid << ", r=" << r << dendl; + + complete_requests(true, r); + delete this; +} + +template <typename I> +void CopyupRequest<I>::complete_requests(bool override_restart_retval, int r) { + auto cct = m_image_ctx->cct; + remove_from_list(); + + while (!m_pending_requests.empty()) { + auto it = m_pending_requests.begin(); + auto req = *it; + ldout(cct, 20) << "completing request " << req << dendl; + req->handle_copyup(r); + m_pending_requests.erase(it); + } + + if (override_restart_retval) { + r = -ERESTART; + } + + while (!m_restart_requests.empty()) { + auto it = m_restart_requests.begin(); + auto req = *it; + ldout(cct, 20) << "restarting request " << req << dendl; + req->handle_copyup(r); + m_restart_requests.erase(it); + } +} + +template <typename I> +void CopyupRequest<I>::disable_append_requests() { + ceph_assert(m_lock.is_locked()); + m_append_request_permitted = false; +} + +template <typename I> +void CopyupRequest<I>::remove_from_list() { + Mutex::Locker copyup_list_locker(m_image_ctx->copyup_list_lock); + + auto it = m_image_ctx->copyup_list.find(m_object_no); + if (it != m_image_ctx->copyup_list.end()) { + m_image_ctx->copyup_list.erase(it); + } +} + +template <typename I> +bool 
CopyupRequest<I>::is_copyup_required() { + ceph_assert(m_lock.is_locked()); + + bool copy_on_read = m_pending_requests.empty(); + if (copy_on_read) { + // always force a copyup if CoR enabled + return true; + } + + if (!m_copyup_is_zero) { + return true; + } + + for (auto req : m_pending_requests) { + if (!req->is_empty_write_op()) { + return true; + } + } + return false; +} + +template <typename I> +bool CopyupRequest<I>::is_deep_copy() const { + ceph_assert(m_image_ctx->snap_lock.is_locked()); + return !m_image_ctx->migration_info.empty(); +} + +template <typename I> +bool CopyupRequest<I>::is_update_object_map_required(int r) { + ceph_assert(m_image_ctx->snap_lock.is_locked()); + + if (r < 0) { + return false; + } + + if (m_image_ctx->object_map == nullptr) { + return false; + } + + if (m_image_ctx->migration_info.empty()) { + // migration might have completed while IO was in-flight, + // assume worst-case and perform an object map update + return true; + } + + auto it = m_image_ctx->migration_info.snap_map.find(CEPH_NOSNAP); + ceph_assert(it != m_image_ctx->migration_info.snap_map.end()); + return it->second[0] != CEPH_NOSNAP; +} + +template <typename I> +void CopyupRequest<I>::compute_deep_copy_snap_ids() { + ceph_assert(m_image_ctx->snap_lock.is_locked()); + + // don't copy ids for the snaps updated by object deep copy or + // that don't overlap + std::set<uint64_t> deep_copied; + for (auto &it : m_image_ctx->migration_info.snap_map) { + if (it.first != CEPH_NOSNAP) { + deep_copied.insert(it.second.front()); + } + } + + RWLock::RLocker parent_locker(m_image_ctx->parent_lock); + std::copy_if(m_image_ctx->snaps.rbegin(), m_image_ctx->snaps.rend(), + std::back_inserter(m_snap_ids), + [this, cct=m_image_ctx->cct, &deep_copied](uint64_t snap_id) { + if (deep_copied.count(snap_id)) { + m_first_snap_is_clean = true; + return false; + } + + uint64_t parent_overlap = 0; + int r = m_image_ctx->get_parent_overlap(snap_id, &parent_overlap); + if (r < 0) { + ldout(cct, 5) 
<< "failed getting parent overlap for snap_id: " + << snap_id << ": " << cpp_strerror(r) << dendl; + } + if (parent_overlap == 0) { + return false; + } + std::vector<std::pair<uint64_t, uint64_t>> extents; + Striper::extent_to_file(cct, &m_image_ctx->layout, + m_object_no, 0, + m_image_ctx->layout.object_size, + extents); + auto overlap = m_image_ctx->prune_parent_extents( + extents, parent_overlap); + return overlap > 0; + }); +} + +} // namespace io +} // namespace librbd + +template class librbd::io::CopyupRequest<librbd::ImageCtx>; diff --git a/src/librbd/io/CopyupRequest.h b/src/librbd/io/CopyupRequest.h new file mode 100644 index 00000000..e4b3a2e7 --- /dev/null +++ b/src/librbd/io/CopyupRequest.h @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_COPYUP_REQUEST_H +#define CEPH_LIBRBD_IO_COPYUP_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "include/buffer.h" +#include "common/Mutex.h" +#include "common/zipkin_trace.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/Types.h" + +#include <string> +#include <vector> + +namespace ZTracer { struct Trace; } + +namespace librbd { + +struct ImageCtx; + +namespace io { + +template <typename I> class AbstractObjectWriteRequest; + +template <typename ImageCtxT = librbd::ImageCtx> +class CopyupRequest { +public: + static CopyupRequest* create(ImageCtxT *ictx, const std::string &oid, + uint64_t objectno, Extents &&image_extents, + const ZTracer::Trace &parent_trace) { + return new CopyupRequest(ictx, oid, objectno, std::move(image_extents), + parent_trace); + } + + CopyupRequest(ImageCtxT *ictx, const std::string &oid, uint64_t objectno, + Extents &&image_extents, const ZTracer::Trace &parent_trace); + ~CopyupRequest(); + + void append_request(AbstractObjectWriteRequest<ImageCtxT> *req); + + void send(); + +private: + /** + * Copyup requests go through the following state 
machine to read from the + * parent image, update the object map, and copyup the object: + * + * + * @verbatim + * + * <start> + * | + * /---------/ \---------\ + * | | + * v v + * READ_FROM_PARENT DEEP_COPY + * | | + * \---------\ /---------/ + * | + * v (skip if not needed) + * UPDATE_OBJECT_MAPS + * | + * v (skip if not needed) + * COPYUP + * | + * v + * <finish> + * + * @endverbatim + * + * The UPDATE_OBJECT_MAPS state is skipped if the object map isn't enabled or if + * an object map update isn't required. The COPYUP state is skipped if + * no data was read from the parent *and* there are no additional ops. + */ + + typedef std::vector<AbstractObjectWriteRequest<ImageCtxT> *> WriteRequests; + + ImageCtxT *m_image_ctx; + std::string m_oid; + uint64_t m_object_no; + Extents m_image_extents; + ZTracer::Trace m_trace; + + bool m_flatten = false; + bool m_copyup_required = true; + bool m_copyup_is_zero = true; + + ceph::bufferlist m_copyup_data; + + AsyncOperation m_async_op; + + std::vector<uint64_t> m_snap_ids; + bool m_first_snap_is_clean = false; + + Mutex m_lock; + WriteRequests m_pending_requests; + unsigned m_pending_copyups = 0; + + WriteRequests m_restart_requests; + bool m_append_request_permitted = true; + + void read_from_parent(); + void handle_read_from_parent(int r); + + void deep_copy(); + void handle_deep_copy(int r); + + void update_object_maps(); + void handle_update_object_maps(int r); + + void copyup(); + void handle_copyup(int r); + + void finish(int r); + void complete_requests(bool override_restart_retval, int r); + + void disable_append_requests(); + void remove_from_list(); + + bool is_copyup_required(); + bool is_update_object_map_required(int r); + bool is_deep_copy() const; + + void compute_deep_copy_snap_ids(); +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::CopyupRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_COPYUP_REQUEST_H diff --git a/src/librbd/io/ImageDispatchSpec.cc 
b/src/librbd/io/ImageDispatchSpec.cc new file mode 100644 index 00000000..2c405d74 --- /dev/null +++ b/src/librbd/io/ImageDispatchSpec.cc @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" +#include <boost/variant.hpp> + +namespace librbd { +namespace io { + +template <typename I> +struct ImageDispatchSpec<I>::SendVisitor + : public boost::static_visitor<void> { + ImageDispatchSpec* spec; + + explicit SendVisitor(ImageDispatchSpec* spec) + : spec(spec) { + } + + void operator()(Read& read) const { + ImageRequest<I>::aio_read( + &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents), + std::move(read.read_result), spec->m_op_flags, spec->m_parent_trace); + } + + void operator()(Discard& discard) const { + ImageRequest<I>::aio_discard( + &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents), + discard.discard_granularity_bytes, spec->m_parent_trace); + } + + void operator()(Write& write) const { + ImageRequest<I>::aio_write( + &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents), + std::move(write.bl), spec->m_op_flags, spec->m_parent_trace); + } + + void operator()(WriteSame& write_same) const { + ImageRequest<I>::aio_writesame( + &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents), + std::move(write_same.bl), spec->m_op_flags, spec->m_parent_trace); + } + + void operator()(CompareAndWrite& compare_and_write) const { + ImageRequest<I>::aio_compare_and_write( + &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents), + std::move(compare_and_write.cmp_bl), std::move(compare_and_write.bl), + compare_and_write.mismatch_offset, spec->m_op_flags, + spec->m_parent_trace); + } + + void operator()(Flush& flush) const { + ImageRequest<I>::aio_flush( + &spec->m_image_ctx, 
spec->m_aio_comp, flush.flush_source, + spec->m_parent_trace); + } +}; + +template <typename I> +struct ImageDispatchSpec<I>::IsWriteOpVisitor + : public boost::static_visitor<bool> { + bool operator()(const Read&) const { + return false; + } + + template <typename T> + bool operator()(const T&) const { + return true; + } +}; + +// Visitor behind tokens_requested(): stores in *tokens how many throttle +// tokens the request needs for the given QoS flag and returns false when the +// flag does not apply to the request type (a write-only flag on a read, or a +// read-only flag on any other request). Flush requests always need 0 tokens. +// The result type is bool because every overload returns true/false and +// tokens_requested() itself returns bool. +template <typename I> +struct ImageDispatchSpec<I>::TokenRequestedVisitor + : public boost::static_visitor<bool> { + ImageDispatchSpec* spec; + uint64_t flag; + uint64_t *tokens; + + TokenRequestedVisitor(ImageDispatchSpec* spec, uint64_t _flag, + uint64_t *tokens) + : spec(spec), flag(_flag), tokens(tokens) { + } + + bool operator()(const Read&) const { + if (flag & RBD_QOS_WRITE_MASK) { + *tokens = 0; + return false; + } + + *tokens = (flag & RBD_QOS_BPS_MASK) ? spec->extents_length() : 1; + return true; + } + + bool operator()(const Flush&) const { + *tokens = 0; + return true; + } + + template <typename T> + bool operator()(const T&) const { + if (flag & RBD_QOS_READ_MASK) { + *tokens = 0; + return false; + } + + *tokens = (flag & RBD_QOS_BPS_MASK) ? 
spec->extents_length() : 1; + return true; + } +}; + +template <typename I> +void ImageDispatchSpec<I>::send() { + boost::apply_visitor(SendVisitor{this}, m_request); +} + +template <typename I> +void ImageDispatchSpec<I>::fail(int r) { + m_aio_comp->get(); + m_aio_comp->fail(r); +} + +template <typename I> +uint64_t ImageDispatchSpec<I>::extents_length() { + uint64_t length = 0; + auto &extents = this->m_image_extents; + + for (auto &extent : extents) { + length += extent.second; + } + return length; +} + +template <typename I> +bool ImageDispatchSpec<I>::is_write_op() const { + return boost::apply_visitor(IsWriteOpVisitor(), m_request); +} + +template <typename I> +bool ImageDispatchSpec<I>::tokens_requested(uint64_t flag, uint64_t *tokens) { + return boost::apply_visitor(TokenRequestedVisitor{this, flag, tokens}, + m_request); +} + +template <typename I> +void ImageDispatchSpec<I>::start_op() { + m_aio_comp->start_op(); +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ImageDispatchSpec<librbd::ImageCtx>; diff --git a/src/librbd/io/ImageDispatchSpec.h b/src/librbd/io/ImageDispatchSpec.h new file mode 100644 index 00000000..93c53a0f --- /dev/null +++ b/src/librbd/io/ImageDispatchSpec.h @@ -0,0 +1,182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "librbd/io/Types.h" +#include "librbd/io/ReadResult.h" +#include <boost/variant/variant.hpp> + +namespace librbd { + +class ImageCtx; + +namespace io { + +class AioCompletion; + +template <typename ImageCtxT = ImageCtx> +class ImageDispatchSpec { +public: + struct Read { + ReadResult read_result; + + Read(ReadResult &&read_result) : read_result(std::move(read_result)) { + } + }; + + struct Discard { + uint32_t discard_granularity_bytes; + + 
Discard(uint32_t discard_granularity_bytes) + : discard_granularity_bytes(discard_granularity_bytes) { + } + }; + + struct Write { + bufferlist bl; + + Write(bufferlist&& bl) : bl(std::move(bl)) { + } + }; + + struct WriteSame { + bufferlist bl; + + WriteSame(bufferlist&& bl) : bl(std::move(bl)) { + } + }; + + struct CompareAndWrite { + bufferlist cmp_bl; + bufferlist bl; + uint64_t *mismatch_offset; + + CompareAndWrite(bufferlist&& cmp_bl, bufferlist&& bl, + uint64_t *mismatch_offset) + : cmp_bl(std::move(cmp_bl)), bl(std::move(bl)), + mismatch_offset(mismatch_offset) { + } + }; + + struct Flush { + FlushSource flush_source; + + Flush(FlushSource flush_source) : flush_source(flush_source) { + } + }; + + static ImageDispatchSpec* create_read_request( + ImageCtxT &image_ctx, AioCompletion *aio_comp, Extents &&image_extents, + ReadResult &&read_result, int op_flags, + const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx, aio_comp, + std::move(image_extents), + Read{std::move(read_result)}, + op_flags, parent_trace); + } + + static ImageDispatchSpec* create_discard_request( + ImageCtxT &image_ctx, AioCompletion *aio_comp, uint64_t off, uint64_t len, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx, aio_comp, {{off, len}}, + Discard{discard_granularity_bytes}, + 0, parent_trace); + } + + static ImageDispatchSpec* create_write_request( + ImageCtxT &image_ctx, AioCompletion *aio_comp, Extents &&image_extents, + bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx, aio_comp, std::move(image_extents), + Write{std::move(bl)}, op_flags, parent_trace); + } + + static ImageDispatchSpec* create_write_same_request( + ImageCtxT &image_ctx, AioCompletion *aio_comp, uint64_t off, uint64_t len, + bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx, aio_comp, {{off, len}}, + 
WriteSame{std::move(bl)}, op_flags, + parent_trace); + } + + static ImageDispatchSpec* create_compare_and_write_request( + ImageCtxT &image_ctx, AioCompletion *aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx, aio_comp, + std::move(image_extents), + CompareAndWrite{std::move(cmp_bl), + std::move(bl), + mismatch_offset}, + op_flags, parent_trace); + } + + static ImageDispatchSpec* create_flush_request( + ImageCtxT &image_ctx, AioCompletion *aio_comp, + FlushSource flush_source, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx, aio_comp, {}, Flush{flush_source}, + 0, parent_trace); + } + + void send(); + void fail(int r); + + bool is_write_op() const; + + void start_op(); + + bool tokens_requested(uint64_t flag, uint64_t *tokens); + + bool was_throttled(uint64_t flag) { + return m_throttled_flag & flag; + } + + void set_throttled(uint64_t flag) { + m_throttled_flag |= flag; + } + + bool were_all_throttled() { + return (m_throttled_flag & RBD_QOS_MASK) == RBD_QOS_MASK; + } + +private: + typedef boost::variant<Read, + Discard, + Write, + WriteSame, + CompareAndWrite, + Flush> Request; + + struct SendVisitor; + struct IsWriteOpVisitor; + struct TokenRequestedVisitor; + + ImageDispatchSpec(ImageCtxT& image_ctx, AioCompletion* aio_comp, + Extents&& image_extents, Request&& request, + int op_flags, const ZTracer::Trace& parent_trace) + : m_image_ctx(image_ctx), m_aio_comp(aio_comp), + m_image_extents(std::move(image_extents)), m_request(std::move(request)), + m_op_flags(op_flags), m_parent_trace(parent_trace) { + } + + ImageCtxT& m_image_ctx; + AioCompletion* m_aio_comp; + Extents m_image_extents; + Request m_request; + int m_op_flags; + ZTracer::Trace m_parent_trace; + std::atomic<uint64_t> m_throttled_flag = 0; + + uint64_t extents_length(); +}; + +} // namespace io +} // namespace librbd + 
+extern template class librbd::io::ImageDispatchSpec<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc new file mode 100644 index 00000000..d6eb29fe --- /dev/null +++ b/src/librbd/io/ImageRequest.cc @@ -0,0 +1,824 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/cache/ImageCache.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ObjectDispatchInterface.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/io/Utils.h" +#include "librbd/journal/Types.h" +#include "include/rados/librados.hpp" +#include "common/perf_counters.h" +#include "common/WorkQueue.h" +#include "osdc/Striper.h" +#include <algorithm> +#include <functional> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +using librbd::util::get_image_ctx; + +namespace { + +template <typename I> +struct C_UpdateTimestamp : public Context { +public: + I& m_image_ctx; + bool m_modify; // if modify set to 'true', modify timestamp is updated, + // access timestamp otherwise + AsyncOperation m_async_op; + + C_UpdateTimestamp(I& ictx, bool m) : m_image_ctx(ictx), m_modify(m) { + m_async_op.start_op(*get_image_ctx(&m_image_ctx)); + } + ~C_UpdateTimestamp() override { + m_async_op.finish_op(); + } + + void send() { + librados::ObjectWriteOperation op; + if (m_modify) { + cls_client::set_modify_timestamp(&op); + } else { + cls_client::set_access_timestamp(&op); + } + + auto comp = librbd::util::create_rados_callback(this); + int r = 
m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + + void finish(int r) override { + // ignore errors updating timestamp + } +}; + +bool should_update_timestamp(const utime_t& now, const utime_t& timestamp, + uint64_t interval) { + return (interval && + (static_cast<uint64_t>(now.sec()) >= interval + timestamp)); +} + +} // anonymous namespace + +template <typename I> +void ImageRequest<I>::aio_read(I *ictx, AioCompletion *c, + Extents &&image_extents, + ReadResult &&read_result, int op_flags, + const ZTracer::Trace &parent_trace) { + ImageReadRequest<I> req(*ictx, c, std::move(image_extents), + std::move(read_result), op_flags, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_write(I *ictx, AioCompletion *c, + Extents &&image_extents, bufferlist &&bl, + int op_flags, + const ZTracer::Trace &parent_trace) { + ImageWriteRequest<I> req(*ictx, c, std::move(image_extents), std::move(bl), + op_flags, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_discard(I *ictx, AioCompletion *c, + Extents &&image_extents, + uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace) { + ImageDiscardRequest<I> req(*ictx, c, std::move(image_extents), + discard_granularity_bytes, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_flush(I *ictx, AioCompletion *c, + FlushSource flush_source, + const ZTracer::Trace &parent_trace) { + ImageFlushRequest<I> req(*ictx, c, flush_source, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_writesame(I *ictx, AioCompletion *c, + Extents &&image_extents, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace) { + ImageWriteSameRequest<I> req(*ictx, c, std::move(image_extents), + std::move(bl), op_flags, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_compare_and_write(I *ictx, 
AioCompletion *c, + Extents &&image_extents, + bufferlist &&cmp_bl, + bufferlist &&bl, + uint64_t *mismatch_offset, + int op_flags, + const ZTracer::Trace &parent_trace) { + ImageCompareAndWriteRequest<I> req(*ictx, c, std::move(image_extents), + std::move(cmp_bl), std::move(bl), + mismatch_offset, op_flags, parent_trace); + req.send(); +} + + +template <typename I> +void ImageRequest<I>::send() { + I &image_ctx = this->m_image_ctx; + ceph_assert(m_aio_comp->is_initialized(get_aio_type())); + ceph_assert(m_aio_comp->is_started()); + + CephContext *cct = image_ctx.cct; + AioCompletion *aio_comp = this->m_aio_comp; + ldout(cct, 20) << get_request_type() << ": ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << dendl; + + aio_comp->get(); + int r = clip_request(); + if (r < 0) { + m_aio_comp->fail(r); + return; + } + + if (m_bypass_image_cache || m_image_ctx.image_cache == nullptr) { + update_timestamp(); + send_request(); + } else { + send_image_cache_request(); + } +} + +template <typename I> +int ImageRequest<I>::clip_request() { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + for (auto &image_extent : m_image_extents) { + auto clip_len = image_extent.second; + int r = clip_io(get_image_ctx(&m_image_ctx), image_extent.first, &clip_len); + if (r < 0) { + return r; + } + + image_extent.second = clip_len; + } + return 0; +} + +template <typename I> +void ImageRequest<I>::update_timestamp() { + bool modify = (get_aio_type() != AIO_TYPE_READ); + uint64_t update_interval; + if (modify) { + update_interval = m_image_ctx.mtime_update_interval; + } else { + update_interval = m_image_ctx.atime_update_interval; + } + + if (update_interval == 0) { + return; + } + + utime_t (I::*get_timestamp_fn)() const; + void (I::*set_timestamp_fn)(utime_t); + if (modify) { + get_timestamp_fn = &I::get_modify_timestamp; + set_timestamp_fn = &I::set_modify_timestamp; + } else { + get_timestamp_fn = &I::get_access_timestamp; + set_timestamp_fn = &I::set_access_timestamp; + } 
+ + utime_t ts = ceph_clock_now(); + { + RWLock::RLocker timestamp_locker(m_image_ctx.timestamp_lock); + if(!should_update_timestamp(ts, std::invoke(get_timestamp_fn, m_image_ctx), + update_interval)) { + return; + } + } + + { + RWLock::WLocker timestamp_locker(m_image_ctx.timestamp_lock); + bool update = should_update_timestamp( + ts, std::invoke(get_timestamp_fn, m_image_ctx), update_interval); + if (!update) { + return; + } + + std::invoke(set_timestamp_fn, m_image_ctx, ts); + } + + // TODO we fire and forget this outside the IO path to prevent + // potential race conditions with librbd client IO callbacks + // between different threads (e.g. librados and object cacher) + ldout(m_image_ctx.cct, 10) << get_request_type() << dendl; + auto req = new C_UpdateTimestamp<I>(m_image_ctx, modify); + req->send(); +} + +template <typename I> +ImageReadRequest<I>::ImageReadRequest(I &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, + ReadResult &&read_result, int op_flags, + const ZTracer::Trace &parent_trace) + : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents), "read", + parent_trace), + m_op_flags(op_flags) { + aio_comp->read_result = std::move(read_result); +} + +template <typename I> +int ImageReadRequest<I>::clip_request() { + int r = ImageRequest<I>::clip_request(); + if (r < 0) { + return r; + } + + uint64_t buffer_length = 0; + auto &image_extents = this->m_image_extents; + for (auto &image_extent : image_extents) { + buffer_length += image_extent.second; + } + this->m_aio_comp->read_result.set_clip_length(buffer_length); + return 0; +} + +template <typename I> +void ImageReadRequest<I>::send_request() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + auto &image_extents = this->m_image_extents; + if (image_ctx.cache && image_ctx.readahead_max_bytes > 0 && + !(m_op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) { + readahead(get_image_ctx(&image_ctx), image_extents); + } + + AioCompletion *aio_comp = 
this->m_aio_comp; + librados::snap_t snap_id; + map<object_t,vector<ObjectExtent> > object_extents; + uint64_t buffer_ofs = 0; + { + // prevent image size from changing between computing clip and recording + // pending async operation + RWLock::RLocker snap_locker(image_ctx.snap_lock); + snap_id = image_ctx.snap_id; + + // map image extents to object extents + for (auto &extent : image_extents) { + if (extent.second == 0) { + continue; + } + + Striper::file_to_extents(cct, image_ctx.format_string, &image_ctx.layout, + extent.first, extent.second, 0, object_extents, + buffer_ofs); + buffer_ofs += extent.second; + } + } + + // pre-calculate the expected number of read requests + uint32_t request_count = 0; + for (auto &object_extent : object_extents) { + request_count += object_extent.second.size(); + } + aio_comp->set_request_count(request_count); + + // issue the requests + for (auto &object_extent : object_extents) { + for (auto &extent : object_extent.second) { + ldout(cct, 20) << "oid " << extent.oid << " " << extent.offset << "~" + << extent.length << " from " << extent.buffer_extents + << dendl; + + auto req_comp = new io::ReadResult::C_ObjectReadRequest( + aio_comp, extent.offset, extent.length, + std::move(extent.buffer_extents)); + auto req = ObjectDispatchSpec::create_read( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, extent.oid.name, + extent.objectno, extent.offset, extent.length, snap_id, m_op_flags, + this->m_trace, &req_comp->bl, &req_comp->extent_map, req_comp); + req->send(); + } + } + + aio_comp->put(); + + image_ctx.perfcounter->inc(l_librbd_rd); + image_ctx.perfcounter->inc(l_librbd_rd_bytes, buffer_ofs); +} + +template <typename I> +void ImageReadRequest<I>::send_image_cache_request() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.image_cache != nullptr); + + AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->set_request_count(1); + + auto *req_comp = new io::ReadResult::C_ImageReadRequest( + aio_comp, this->m_image_extents); + 
image_ctx.image_cache->aio_read(std::move(this->m_image_extents), + &req_comp->bl, m_op_flags, + req_comp); +} + +template <typename I> +void AbstractImageWriteRequest<I>::send_request() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + RWLock::RLocker md_locker(image_ctx.md_lock); + + bool journaling = false; + + AioCompletion *aio_comp = this->m_aio_comp; + uint64_t clip_len = 0; + ObjectExtents object_extents; + ::SnapContext snapc; + { + // prevent image size from changing between computing clip and recording + // pending async operation + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.snap_id != CEPH_NOSNAP || image_ctx.read_only) { + aio_comp->fail(-EROFS); + return; + } + + for (auto &extent : this->m_image_extents) { + if (extent.second == 0) { + continue; + } + + // map to object extents + Striper::file_to_extents(cct, image_ctx.format_string, &image_ctx.layout, + extent.first, extent.second, 0, object_extents); + clip_len += extent.second; + } + + snapc = image_ctx.snapc; + journaling = (image_ctx.journal != nullptr && + image_ctx.journal->is_journal_appending()); + } + + int ret = prune_object_extents(&object_extents); + if (ret < 0) { + aio_comp->fail(ret); + return; + } + + if (!object_extents.empty()) { + uint64_t journal_tid = 0; + if (journaling) { + // in-flight ops are flushed prior to closing the journal + ceph_assert(image_ctx.journal != NULL); + journal_tid = append_journal_event(m_synchronous); + } + + aio_comp->set_request_count(object_extents.size()); + send_object_requests(object_extents, snapc, journal_tid); + } else { + // no IO to perform -- fire completion + aio_comp->set_request_count(0); + } + + update_stats(clip_len); + aio_comp->put(); +} + +template <typename I> +void AbstractImageWriteRequest<I>::send_object_requests( + const ObjectExtents &object_extents, const ::SnapContext &snapc, + uint64_t journal_tid) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + 
AioCompletion *aio_comp = this->m_aio_comp; + for (ObjectExtents::const_iterator p = object_extents.begin(); + p != object_extents.end(); ++p) { + ldout(cct, 20) << "oid " << p->oid << " " << p->offset << "~" << p->length + << " from " << p->buffer_extents << dendl; + C_AioRequest *req_comp = new C_AioRequest(aio_comp); + auto request = create_object_request(*p, snapc, journal_tid, req_comp); + + // the request is not stashed in this function: a non-null one is sent + // right away + if (request != NULL) { + request->send(); + } + } +} + +// Collect the slices of m_bl listed in the object extent's buffer_extents +// and append them to *bl. +template <typename I> +void ImageWriteRequest<I>::assemble_extent(const ObjectExtent &object_extent, + bufferlist *bl) { + for (auto q = object_extent.buffer_extents.begin(); + q != object_extent.buffer_extents.end(); ++q) { + bufferlist sub_bl; + sub_bl.substr_of(m_bl, q->first, q->second); + bl->claim_append(sub_bl); + } +} + +// Append one journal write event per image extent, each carrying the next +// consecutive slice of m_bl; returns the tid of the last event appended. +template <typename I> +uint64_t ImageWriteRequest<I>::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + uint64_t buffer_offset = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + bufferlist sub_bl; + sub_bl.substr_of(m_bl, buffer_offset, extent.second); + buffer_offset += extent.second; + + tid = image_ctx.journal->append_write_event(extent.first, extent.second, + sub_bl, synchronous); + } + + return tid; +} + +// Hand all image extents and the data buffer to the image cache as a +// single request. +template <typename I> +void ImageWriteRequest<I>::send_image_cache_request() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.image_cache != nullptr); + + AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->set_request_count(1); + C_AioRequest *req_comp = new C_AioRequest(aio_comp); + image_ctx.image_cache->aio_write(std::move(this->m_image_extents), + std::move(m_bl), m_op_flags, req_comp); +} + +template <typename I> +ObjectDispatchSpec *ImageWriteRequest<I>::create_object_request( + const ObjectExtent &object_extent, const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) { + I &image_ctx = 
this->m_image_ctx; + + bufferlist bl; + assemble_extent(object_extent, &bl); + auto req = ObjectDispatchSpec::create_write( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name, + object_extent.objectno, object_extent.offset, std::move(bl), snapc, + m_op_flags, journal_tid, this->m_trace, on_finish); + return req; +} + +template <typename I> +void ImageWriteRequest<I>::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_wr); + image_ctx.perfcounter->inc(l_librbd_wr_bytes, length); +} + +template <typename I> +uint64_t ImageDiscardRequest<I>::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + journal::EventEntry event_entry( + journal::AioDiscardEvent(extent.first, + extent.second, + this->m_discard_granularity_bytes)); + tid = image_ctx.journal->append_io_event(std::move(event_entry), + extent.first, extent.second, + synchronous, 0); + } + + return tid; +} + +template <typename I> +void ImageDiscardRequest<I>::send_image_cache_request() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.image_cache != nullptr); + + AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->set_request_count(this->m_image_extents.size()); + for (auto &extent : this->m_image_extents) { + C_AioRequest *req_comp = new C_AioRequest(aio_comp); + image_ctx.image_cache->aio_discard(extent.first, extent.second, + this->m_discard_granularity_bytes, + req_comp); + } +} + +template <typename I> +ObjectDispatchSpec *ImageDiscardRequest<I>::create_object_request( + const ObjectExtent &object_extent, const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) { + I &image_ctx = this->m_image_ctx; + auto req = ObjectDispatchSpec::create_discard( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name, + object_extent.objectno, object_extent.offset, 
object_extent.length, snapc, + OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE, journal_tid, this->m_trace, + on_finish); + return req; +} + +template <typename I> +void ImageDiscardRequest<I>::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_discard); + image_ctx.perfcounter->inc(l_librbd_discard_bytes, length); +} + +template <typename I> +int ImageDiscardRequest<I>::prune_object_extents( + ObjectExtents* object_extents) const { + if (m_discard_granularity_bytes == 0) { + return 0; + } + + // Align the range to discard_granularity_bytes boundary and skip + // any discards that are too small to free up any space. + // + // discard_granularity_bytes >= object_size && tail truncation + // is a special case for filestore + bool prune_required = false; + auto object_size = this->m_image_ctx.layout.object_size; + auto discard_granularity_bytes = std::min(m_discard_granularity_bytes, + object_size); + auto xform_lambda = + [discard_granularity_bytes, object_size, &prune_required] + (ObjectExtent& object_extent) { + auto& offset = object_extent.offset; + auto& length = object_extent.length; + auto next_offset = offset + length; + + if ((discard_granularity_bytes < object_size) || + (next_offset < object_size)) { + // shrink inward: round the start up and the end down + offset = p2roundup<uint64_t>(offset, discard_granularity_bytes); + next_offset = p2align<uint64_t>(next_offset, discard_granularity_bytes); + if (offset >= next_offset) { + // nothing aligned is left: flag the extent for removal below + prune_required = true; + length = 0; + } else { + length = next_offset - offset; + } + } + }; + std::for_each(object_extents->begin(), object_extents->end(), + xform_lambda); + + if (prune_required) { + // one or more object extents were reduced to zero length: drop them + auto remove_lambda = + [](const ObjectExtent& object_extent) { + return (object_extent.length == 0); + }; + object_extents->erase( + std::remove_if(object_extents->begin(), object_extents->end(), + remove_lambda), + object_extents->end()); + } + return 0; +} + +template <typename I> +void 
ImageFlushRequest<I>::send_request() { + I &image_ctx = this->m_image_ctx; + + bool journaling = false; + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + journaling = (m_flush_source == FLUSH_SOURCE_USER && + image_ctx.journal != nullptr && + image_ctx.journal->is_journal_appending()); + } + + AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->set_request_count(1); + + Context *ctx = new C_AioRequest(aio_comp); + + // ensure no locks are held when flush is complete + ctx = librbd::util::create_async_context_callback(image_ctx, ctx); + + if (journaling) { + // in-flight ops are flushed prior to closing the journal + uint64_t journal_tid = image_ctx.journal->append_io_event( + journal::EventEntry(journal::AioFlushEvent()), 0, 0, false, 0); + image_ctx.journal->user_flushed(); + + ctx = new FunctionContext( + [&image_ctx, journal_tid, ctx](int r) { + image_ctx.journal->commit_io_event(journal_tid, r); + ctx->complete(r); + }); + ctx = new FunctionContext( + [&image_ctx, journal_tid, ctx](int r) { + image_ctx.journal->flush_event(journal_tid, ctx); + }); + } else { + // flush rbd cache only when journaling is not enabled + auto object_dispatch_spec = ObjectDispatchSpec::create_flush( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, m_flush_source, this->m_trace, + ctx); + ctx = new FunctionContext([object_dispatch_spec](int r) { + object_dispatch_spec->send(); + }); + } + + // ensure all in-flight IOs are settled if non-user flush request + aio_comp->async_op.flush(ctx); + aio_comp->put(); + + // might be flushing during image shutdown + if (image_ctx.perfcounter != nullptr) { + image_ctx.perfcounter->inc(l_librbd_flush); + } +} + +template <typename I> +void ImageFlushRequest<I>::send_image_cache_request() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.image_cache != nullptr); + + AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->set_request_count(1); + C_AioRequest *req_comp = new C_AioRequest(aio_comp); + 
image_ctx.image_cache->aio_flush(req_comp); +} + +template <typename I> +uint64_t ImageWriteSameRequest<I>::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first, + extent.second, + m_data_bl)); + tid = image_ctx.journal->append_io_event(std::move(event_entry), + extent.first, extent.second, + synchronous, 0); + } + + return tid; +} + +// Issue one image-cache writesame per image extent; the completion is told +// to expect as many sub-requests as there are extents. +template <typename I> +void ImageWriteSameRequest<I>::send_image_cache_request() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.image_cache != nullptr); + + AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->set_request_count(this->m_image_extents.size()); + for (auto &extent : this->m_image_extents) { + C_AioRequest *req_comp = new C_AioRequest(aio_comp); + // every extent needs the full pattern: hand over a copy so m_data_bl is + // not left moved-from for the next iteration + bufferlist data_bl(m_data_bl); + image_ctx.image_cache->aio_writesame(extent.first, extent.second, + std::move(data_bl), m_op_flags, + req_comp); + } +} + +template <typename I> +ObjectDispatchSpec *ImageWriteSameRequest<I>::create_object_request( + const ObjectExtent &object_extent, const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) { + I &image_ctx = this->m_image_ctx; + + bufferlist bl; + ObjectDispatchSpec *req; + + if (util::assemble_write_same_extent(object_extent, m_data_bl, &bl, false)) { + Extents buffer_extents{object_extent.buffer_extents}; + + req = ObjectDispatchSpec::create_write_same( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name, + object_extent.objectno, object_extent.offset, object_extent.length, + std::move(buffer_extents), std::move(bl), snapc, m_op_flags, journal_tid, + this->m_trace, on_finish); + return req; + } + req = ObjectDispatchSpec::create_write( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name, + object_extent.objectno, object_extent.offset, std::move(bl), snapc, + m_op_flags, journal_tid, 
this->m_trace, on_finish); + return req; +} + +template <typename I> +void ImageWriteSameRequest<I>::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_ws); + image_ctx.perfcounter->inc(l_librbd_ws_bytes, length); +} + +template <typename I> +uint64_t ImageCompareAndWriteRequest<I>::append_journal_event( + bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(this->m_image_extents.size() == 1); + auto &extent = this->m_image_extents.front(); + journal::EventEntry event_entry( + journal::AioCompareAndWriteEvent(extent.first, extent.second, m_cmp_bl, + m_bl)); + tid = image_ctx.journal->append_io_event(std::move(event_entry), + extent.first, extent.second, + synchronous, -EILSEQ); + + return tid; +} + +template <typename I> +void ImageCompareAndWriteRequest<I>::assemble_extent( + const ObjectExtent &object_extent, bufferlist *bl) { + for (auto q = object_extent.buffer_extents.begin(); + q != object_extent.buffer_extents.end(); ++q) { + bufferlist sub_bl; + sub_bl.substr_of(m_bl, q->first, q->second); + bl->claim_append(sub_bl); + } +} + +template <typename I> +void ImageCompareAndWriteRequest<I>::send_image_cache_request() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.image_cache != nullptr); + + AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->set_request_count(1); + C_AioRequest *req_comp = new C_AioRequest(aio_comp); + image_ctx.image_cache->aio_compare_and_write( + std::move(this->m_image_extents), std::move(m_cmp_bl), std::move(m_bl), + m_mismatch_offset, m_op_flags, req_comp); +} + +template <typename I> +ObjectDispatchSpec *ImageCompareAndWriteRequest<I>::create_object_request( + const ObjectExtent &object_extent, + const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) { + I &image_ctx = this->m_image_ctx; + + // NOTE: safe to move m_cmp_bl since we only support this op against + // a single object + bufferlist bl; + 
assemble_extent(object_extent, &bl); + auto req = ObjectDispatchSpec::create_compare_and_write( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name, + object_extent.objectno, object_extent.offset, std::move(m_cmp_bl), + std::move(bl), snapc, m_mismatch_offset, m_op_flags, journal_tid, + this->m_trace, on_finish); + return req; +} + +template <typename I> +void ImageCompareAndWriteRequest<I>::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_cmp); + image_ctx.perfcounter->inc(l_librbd_cmp_bytes, length); +} + +// Validate the object extents of a compare-and-write: at most one extent, +// and it must not cross a 512-byte sector or (when non-zero) a stripe unit. +// Returns 0 when acceptable, -EINVAL otherwise; the list is never modified. +template <typename I> +int ImageCompareAndWriteRequest<I>::prune_object_extents( + ObjectExtents* object_extents) const { + if (object_extents->empty()) { + // zero-length request: nothing to validate, and front() must not be + // called on an empty container + return 0; + } + if (object_extents->size() > 1) + return -EINVAL; + + I &image_ctx = this->m_image_ctx; + uint64_t sector_size = 512ULL; + uint64_t su = image_ctx.layout.stripe_unit; + const ObjectExtent &object_extent = object_extents->front(); + if (object_extent.offset % sector_size + object_extent.length > sector_size || + (su != 0 && (object_extent.offset % su + object_extent.length > su))) + return -EINVAL; + + return 0; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ImageRequest<librbd::ImageCtx>; +template class librbd::io::ImageReadRequest<librbd::ImageCtx>; +template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>; +template class librbd::io::ImageWriteRequest<librbd::ImageCtx>; +template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>; +template class librbd::io::ImageFlushRequest<librbd::ImageCtx>; +template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>; +template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>; diff --git a/src/librbd/io/ImageRequest.h b/src/librbd/io/ImageRequest.h new file mode 100644 index 00000000..d7d10019 --- /dev/null +++ b/src/librbd/io/ImageRequest.h @@ -0,0 +1,365 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef 
CEPH_LIBRBD_IO_IMAGE_REQUEST_H +#define CEPH_LIBRBD_IO_IMAGE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "common/snap_types.h" +#include "common/zipkin_trace.h" +#include "osd/osd_types.h" +#include "librbd/Utils.h" +#include "librbd/io/Types.h" +#include <list> +#include <utility> +#include <vector> + +namespace librbd { +class ImageCtx; + +namespace io { + +class AioCompletion; +class ObjectDispatchSpec; +class ReadResult; + +template <typename ImageCtxT = ImageCtx> +class ImageRequest { +public: + typedef std::vector<std::pair<uint64_t,uint64_t> > Extents; + + virtual ~ImageRequest() { + m_trace.event("finish"); + } + + static void aio_read(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ReadResult &&read_result, + int op_flags, const ZTracer::Trace &parent_trace); + static void aio_write(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace); + static void aio_discard(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, + uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace); + static void aio_flush(ImageCtxT *ictx, AioCompletion *c, + FlushSource flush_source, + const ZTracer::Trace &parent_trace); + static void aio_writesame(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace); + + static void aio_compare_and_write(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, bufferlist &&cmp_bl, + bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace); + + void send(); + + void set_bypass_image_cache() { + m_bypass_image_cache = true; + } + + inline const ZTracer::Trace &get_trace() const { + return m_trace; + } + +protected: + typedef std::list<ObjectDispatchSpec*> ObjectRequests; + + ImageCtxT &m_image_ctx; + AioCompletion *m_aio_comp; + Extents m_image_extents; + ZTracer::Trace 
m_trace; + bool m_bypass_image_cache = false; + + ImageRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, const char *trace_name, + const ZTracer::Trace &parent_trace) + : m_image_ctx(image_ctx), m_aio_comp(aio_comp), + m_image_extents(std::move(image_extents)), + m_trace(util::create_trace(image_ctx, trace_name, parent_trace)) { + m_trace.event("start"); + } + + + virtual int clip_request(); + virtual void update_timestamp(); + virtual void send_request() = 0; + virtual void send_image_cache_request() = 0; + + virtual aio_type_t get_aio_type() const = 0; + virtual const char *get_request_type() const = 0; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageReadRequest : public ImageRequest<ImageCtxT> { +public: + using typename ImageRequest<ImageCtxT>::Extents; + + ImageReadRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ReadResult &&read_result, + int op_flags, const ZTracer::Trace &parent_trace); + +protected: + int clip_request() override; + + void send_request() override; + void send_image_cache_request() override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_READ; + } + const char *get_request_type() const override { + return "aio_read"; + } +private: + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class AbstractImageWriteRequest : public ImageRequest<ImageCtxT> { +public: + inline void flag_synchronous() { + m_synchronous = true; + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + using typename ImageRequest<ImageCtxT>::Extents; + + typedef std::vector<ObjectExtent> ObjectExtents; + + AbstractImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, const char *trace_name, + const ZTracer::Trace &parent_trace) + : ImageRequest<ImageCtxT>(image_ctx, aio_comp, std::move(image_extents), + trace_name, parent_trace), + m_synchronous(false) { + } + + void send_request() override; + + virtual int 
prune_object_extents(ObjectExtents* object_extents) const { + return 0; + } + + void send_object_requests(const ObjectExtents &object_extents, + const ::SnapContext &snapc, uint64_t journal_tid); + virtual ObjectDispatchSpec *create_object_request( + const ObjectExtent &object_extent, const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) = 0; + + virtual uint64_t append_journal_event(bool synchronous) = 0; + virtual void update_stats(size_t length) = 0; + +private: + bool m_synchronous; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageWriteRequest : public AbstractImageWriteRequest<ImageCtxT> { +public: + using typename ImageRequest<ImageCtxT>::Extents; + + ImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest<ImageCtxT>( + image_ctx, aio_comp, std::move(image_extents), "write", parent_trace), + m_bl(std::move(bl)), m_op_flags(op_flags) { + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_WRITE; + } + const char *get_request_type() const override { + return "aio_write"; + } + + void assemble_extent(const ObjectExtent &object_extent, bufferlist *bl); + + void send_image_cache_request() override; + + + ObjectDispatchSpec *create_object_request( + const ObjectExtent &object_extent, const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + +private: + bufferlist m_bl; + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageDiscardRequest : public AbstractImageWriteRequest<ImageCtxT> { +public: + ImageDiscardRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents&& image_extents, + 
uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest<ImageCtxT>( + image_ctx, aio_comp, std::move(image_extents), "discard", parent_trace), + m_discard_granularity_bytes(discard_granularity_bytes) { + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_DISCARD; + } + const char *get_request_type() const override { + return "aio_discard"; + } + + void send_image_cache_request() override; + + ObjectDispatchSpec *create_object_request( + const ObjectExtent &object_extent, const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + + int prune_object_extents(ObjectExtents* object_extents) const override; + +private: + uint32_t m_discard_granularity_bytes; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageFlushRequest : public ImageRequest<ImageCtxT> { +public: + ImageFlushRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + FlushSource flush_source, + const ZTracer::Trace &parent_trace) + : ImageRequest<ImageCtxT>(image_ctx, aio_comp, {}, "flush", parent_trace), + m_flush_source(flush_source) { + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + + int clip_request() override { + return 0; + } + void update_timestamp() override { + } + void send_request() override; + void send_image_cache_request() override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_FLUSH; + } + const char *get_request_type() const override { + return "aio_flush"; + } + +private: + FlushSource m_flush_source; + +}; + +template <typename ImageCtxT = ImageCtx> +class ImageWriteSameRequest : public AbstractImageWriteRequest<ImageCtxT> { +public: + ImageWriteSameRequest(ImageCtxT &image_ctx, AioCompletion 
*aio_comp, + Extents&& image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest<ImageCtxT>( + image_ctx, aio_comp, std::move(image_extents), "writesame", + parent_trace), + m_data_bl(std::move(bl)), m_op_flags(op_flags) { + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_WRITESAME; + } + const char *get_request_type() const override { + return "aio_writesame"; + } + + void send_image_cache_request() override; + + ObjectDispatchSpec *create_object_request( + const ObjectExtent &object_extent, const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; +private: + bufferlist m_data_bl; + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageCompareAndWriteRequest : public AbstractImageWriteRequest<ImageCtxT> { +public: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents; + + ImageCompareAndWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, bufferlist &&cmp_bl, + bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest<ImageCtxT>( + image_ctx, aio_comp, std::move(image_extents), "compare_and_write", parent_trace), + m_cmp_bl(std::move(cmp_bl)), m_bl(std::move(bl)), + m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) { + } + +protected: + void send_image_cache_request() override; + + void assemble_extent(const ObjectExtent &object_extent, bufferlist *bl); + + ObjectDispatchSpec *create_object_request( + const ObjectExtent &object_extent, const ::SnapContext &snapc, + uint64_t journal_tid, Context *on_finish) override; + + 
uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_COMPARE_AND_WRITE; + } + const char *get_request_type() const override { + return "aio_compare_and_write"; + } + + int prune_object_extents(ObjectExtents* object_extents) const override; + +private: + bufferlist m_cmp_bl; + bufferlist m_bl; + uint64_t *m_mismatch_offset; + int m_op_flags; +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ImageRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageReadRequest<librbd::ImageCtx>; +extern template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageFlushRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_H diff --git a/src/librbd/io/ImageRequestWQ.cc b/src/librbd/io/ImageRequestWQ.cc new file mode 100644 index 00000000..7bdaf2f2 --- /dev/null +++ b/src/librbd/io/ImageRequestWQ.cc @@ -0,0 +1,1043 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageRequestWQ.h" +#include "common/errno.h" +#include "common/zipkin_trace.h" +#include "common/Cond.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/internal.h" +#include "librbd/Utils.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "common/EventTrace.h" + +#define dout_subsys ceph_subsys_rbd +#undef 
dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageRequestWQ: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +namespace { + +template <typename I> +void flush_image(I& image_ctx, Context* on_finish) { + auto aio_comp = librbd::io::AioCompletion::create_and_start( + on_finish, util::get_image_ctx(&image_ctx), librbd::io::AIO_TYPE_FLUSH); + auto req = librbd::io::ImageDispatchSpec<I>::create_flush_request( + image_ctx, aio_comp, librbd::io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); + delete req; +} + +} // anonymous namespace + +template <typename I> +struct ImageRequestWQ<I>::C_AcquireLock : public Context { + ImageRequestWQ *work_queue; + ImageDispatchSpec<I> *image_request; + + C_AcquireLock(ImageRequestWQ *work_queue, ImageDispatchSpec<I> *image_request) + : work_queue(work_queue), image_request(image_request) { + } + + void finish(int r) override { + work_queue->handle_acquire_lock(r, image_request); + } +}; + +template <typename I> +struct ImageRequestWQ<I>::C_BlockedWrites : public Context { + ImageRequestWQ *work_queue; + explicit C_BlockedWrites(ImageRequestWQ *_work_queue) + : work_queue(_work_queue) { + } + + void finish(int r) override { + work_queue->handle_blocked_writes(r); + } +}; + +template <typename I> +struct ImageRequestWQ<I>::C_RefreshFinish : public Context { + ImageRequestWQ *work_queue; + ImageDispatchSpec<I> *image_request; + + C_RefreshFinish(ImageRequestWQ *work_queue, + ImageDispatchSpec<I> *image_request) + : work_queue(work_queue), image_request(image_request) { + } + void finish(int r) override { + work_queue->handle_refreshed(r, image_request); + } +}; + +static std::map<uint64_t, std::string> throttle_flags = { + { RBD_QOS_IOPS_THROTTLE, "rbd_qos_iops_throttle" }, + { RBD_QOS_BPS_THROTTLE, "rbd_qos_bps_throttle" }, + { RBD_QOS_READ_IOPS_THROTTLE, "rbd_qos_read_iops_throttle" }, + { RBD_QOS_WRITE_IOPS_THROTTLE, "rbd_qos_write_iops_throttle" }, + { RBD_QOS_READ_BPS_THROTTLE, 
"rbd_qos_read_bps_throttle" }, + { RBD_QOS_WRITE_BPS_THROTTLE, "rbd_qos_write_bps_throttle" } +}; + +template <typename I> +ImageRequestWQ<I>::ImageRequestWQ(I *image_ctx, const string &name, + time_t ti, ThreadPool *tp) + : ThreadPool::PointerWQ<ImageDispatchSpec<I> >(name, ti, 0, tp), + m_image_ctx(*image_ctx), + m_lock(util::unique_lock_name("ImageRequestWQ<I>::m_lock", this)) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; + + SafeTimer *timer; + Mutex *timer_lock; + ImageCtx::get_timer_instance(cct, &timer, &timer_lock); + + for (auto flag : throttle_flags) { + m_throttles.push_back(make_pair( + flag.first, + new TokenBucketThrottle(cct, flag.second, 0, 0, timer, timer_lock))); + } + + this->register_work_queue(); +} + +template <typename I> +ImageRequestWQ<I>::~ImageRequestWQ() { + for (auto t : m_throttles) { + delete t.second; + } +} + +template <typename I> +ssize_t ImageRequestWQ<I>::read(uint64_t off, uint64_t len, + ReadResult &&read_result, int op_flags) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + C_SaferCond cond; + AioCompletion *c = AioCompletion::create(&cond); + aio_read(c, off, len, std::move(read_result), op_flags, false); + return cond.wait(); +} + +template <typename I> +ssize_t ImageRequestWQ<I>::write(uint64_t off, uint64_t len, + bufferlist &&bl, int op_flags) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + m_image_ctx.snap_lock.get_read(); + int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len); + m_image_ctx.snap_lock.put_read(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond cond; + AioCompletion *c = AioCompletion::create(&cond); + aio_write(c, off, len, std::move(bl), op_flags, false); + + r = cond.wait(); + if (r < 0) { + 
return r; + } + return len; +} + +template <typename I> +ssize_t ImageRequestWQ<I>::discard(uint64_t off, uint64_t len, + uint32_t discard_granularity_bytes) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + m_image_ctx.snap_lock.get_read(); + int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len); + m_image_ctx.snap_lock.put_read(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond cond; + AioCompletion *c = AioCompletion::create(&cond); + aio_discard(c, off, len, discard_granularity_bytes, false); + + r = cond.wait(); + if (r < 0) { + return r; + } + return len; +} + +template <typename I> +ssize_t ImageRequestWQ<I>::writesame(uint64_t off, uint64_t len, + bufferlist &&bl, int op_flags) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", " + << "len = " << len << ", data_len " << bl.length() << dendl; + + m_image_ctx.snap_lock.get_read(); + int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len); + m_image_ctx.snap_lock.put_read(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond cond; + AioCompletion *c = AioCompletion::create(&cond); + aio_writesame(c, off, len, std::move(bl), op_flags, false); + + r = cond.wait(); + if (r < 0) { + return r; + } + return len; +} + +template <typename I> +ssize_t ImageRequestWQ<I>::write_zeroes(uint64_t off, uint64_t len, + int zero_flags, int op_flags) { + auto cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + m_image_ctx.snap_lock.get_read(); + int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len); + m_image_ctx.snap_lock.put_read(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond 
ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_write_zeroes(aio_comp, off, len, zero_flags, op_flags, false); + + r = ctx.wait(); + if (r < 0) { + return r; + } + return len; +} + +template <typename I> +ssize_t ImageRequestWQ<I>::compare_and_write(uint64_t off, uint64_t len, + bufferlist &&cmp_bl, + bufferlist &&bl, + uint64_t *mismatch_off, + int op_flags){ + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "compare_and_write ictx=" << &m_image_ctx << ", off=" + << off << ", " << "len = " << len << dendl; + + m_image_ctx.snap_lock.get_read(); + int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len); + m_image_ctx.snap_lock.put_read(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond cond; + AioCompletion *c = AioCompletion::create(&cond); + aio_compare_and_write(c, off, len, std::move(cmp_bl), std::move(bl), + mismatch_off, op_flags, false); + + r = cond.wait(); + if (r < 0) { + return r; + } + + return len; +} + +template <typename I> +int ImageRequestWQ<I>::flush() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << dendl; + + C_SaferCond cond; + AioCompletion *c = AioCompletion::create(&cond); + aio_flush(c, false); + + int r = cond.wait(); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +void ImageRequestWQ<I>::aio_read(AioCompletion *c, uint64_t off, uint64_t len, + ReadResult &&read_result, int op_flags, + bool native_async) { + CephContext *cct = m_image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (m_image_ctx.blkin_trace_all) { + trace.init("wq: read", &m_image_ctx.trace_endpoint); + trace.event("start"); + } + + c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_READ); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "completion=" << c << ", off=" << off << ", " + << "len=" << len << ", " << "flags=" << op_flags << dendl; + + if (native_async && 
m_image_ctx.event_socket.is_valid()) { + c->set_event_notify(true); + } + + if (!start_in_flight_io(c)) { + return; + } + + // if journaling is enabled -- we need to replay the journal because + // it might contain an uncommitted write + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty() || + require_lock_on_read()) { + queue(ImageDispatchSpec<I>::create_read_request( + m_image_ctx, c, {{off, len}}, std::move(read_result), op_flags, + trace)); + } else { + c->start_op(); + ImageRequest<I>::aio_read(&m_image_ctx, c, {{off, len}}, + std::move(read_result), op_flags, trace); + finish_in_flight_io(); + } + trace.event("finish"); +} + +template <typename I> +void ImageRequestWQ<I>::aio_write(AioCompletion *c, uint64_t off, uint64_t len, + bufferlist &&bl, int op_flags, + bool native_async) { + CephContext *cct = m_image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (m_image_ctx.blkin_trace_all) { + trace.init("wq: write", &m_image_ctx.trace_endpoint); + trace.event("init"); + } + + c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_WRITE); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "completion=" << c << ", off=" << off << ", " + << "len=" << len << ", flags=" << op_flags << dendl; + + if (native_async && m_image_ctx.event_socket.is_valid()) { + c->set_event_notify(true); + } + + if (!start_in_flight_io(c)) { + return; + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.non_blocking_aio || writes_blocked()) { + queue(ImageDispatchSpec<I>::create_write_request( + m_image_ctx, c, {{off, len}}, std::move(bl), op_flags, trace)); + } else { + c->start_op(); + ImageRequest<I>::aio_write(&m_image_ctx, c, {{off, len}}, + std::move(bl), op_flags, trace); + finish_in_flight_io(); + } + trace.event("finish"); +} + +template <typename I> +void ImageRequestWQ<I>::aio_discard(AioCompletion *c, uint64_t off, + uint64_t len, + uint32_t 
discard_granularity_bytes, + bool native_async) { + CephContext *cct = m_image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (m_image_ctx.blkin_trace_all) { + trace.init("wq: discard", &m_image_ctx.trace_endpoint); + trace.event("init"); + } + + c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_DISCARD); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "completion=" << c << ", off=" << off << ", len=" << len + << dendl; + + if (native_async && m_image_ctx.event_socket.is_valid()) { + c->set_event_notify(true); + } + + if (!start_in_flight_io(c)) { + return; + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.non_blocking_aio || writes_blocked()) { + queue(ImageDispatchSpec<I>::create_discard_request( + m_image_ctx, c, off, len, discard_granularity_bytes, trace)); + } else { + c->start_op(); + ImageRequest<I>::aio_discard(&m_image_ctx, c, {{off, len}}, + discard_granularity_bytes, trace); + finish_in_flight_io(); + } + trace.event("finish"); +} + +template <typename I> +void ImageRequestWQ<I>::aio_flush(AioCompletion *c, bool native_async) { + CephContext *cct = m_image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (m_image_ctx.blkin_trace_all) { + trace.init("wq: flush", &m_image_ctx.trace_endpoint); + trace.event("init"); + } + + c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_FLUSH); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "completion=" << c << dendl; + + if (native_async && m_image_ctx.event_socket.is_valid()) { + c->set_event_notify(true); + } + + if (!start_in_flight_io(c)) { + return; + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty()) { + queue(ImageDispatchSpec<I>::create_flush_request( + m_image_ctx, c, FLUSH_SOURCE_USER, trace)); + } else { + c->start_op(); + ImageRequest<I>::aio_flush(&m_image_ctx, c, FLUSH_SOURCE_USER, trace); + finish_in_flight_io(); + } + trace.event("finish"); +} 
+ +template <typename I> +void ImageRequestWQ<I>::aio_writesame(AioCompletion *c, uint64_t off, + uint64_t len, bufferlist &&bl, + int op_flags, bool native_async) { + CephContext *cct = m_image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (m_image_ctx.blkin_trace_all) { + trace.init("wq: writesame", &m_image_ctx.trace_endpoint); + trace.event("init"); + } + + c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_WRITESAME); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "completion=" << c << ", off=" << off << ", " + << "len=" << len << ", data_len = " << bl.length() << ", " + << "flags=" << op_flags << dendl; + + if (native_async && m_image_ctx.event_socket.is_valid()) { + c->set_event_notify(true); + } + + if (!start_in_flight_io(c)) { + return; + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.non_blocking_aio || writes_blocked()) { + queue(ImageDispatchSpec<I>::create_write_same_request( + m_image_ctx, c, off, len, std::move(bl), op_flags, trace)); + } else { + c->start_op(); + ImageRequest<I>::aio_writesame(&m_image_ctx, c, {{off, len}}, std::move(bl), + op_flags, trace); + finish_in_flight_io(); + } + trace.event("finish"); +} + + +template <typename I> +void ImageRequestWQ<I>::aio_write_zeroes(io::AioCompletion *aio_comp, + uint64_t off, uint64_t len, + int zero_flags, int op_flags, + bool native_async) { + auto cct = m_image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (m_image_ctx.blkin_trace_all) { + trace.init("io: write_zeroes", &m_image_ctx.trace_endpoint); + trace.event("init"); + } + + aio_comp->init_time(util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_DISCARD); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "completion=" << aio_comp << ", off=" << off << ", " + << "len=" << len << dendl; + + if (native_async && m_image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + // validate the supported flags + if (zero_flags != 0U) { + 
aio_comp->fail(-EINVAL); + return; + } + + if (!start_in_flight_io(aio_comp)) { + return; + } + + // enable partial discard (zeroing) of objects + uint32_t discard_granularity_bytes = 0; + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.non_blocking_aio || writes_blocked()) { + queue(ImageDispatchSpec<I>::create_discard_request( + m_image_ctx, aio_comp, off, len, discard_granularity_bytes, trace)); + } else { + aio_comp->start_op(); + ImageRequest<I>::aio_discard(&m_image_ctx, aio_comp, {{off, len}}, + discard_granularity_bytes, trace); + finish_in_flight_io(); + } + trace.event("finish"); +} + +template <typename I> +void ImageRequestWQ<I>::aio_compare_and_write(AioCompletion *c, + uint64_t off, uint64_t len, + bufferlist &&cmp_bl, + bufferlist &&bl, + uint64_t *mismatch_off, + int op_flags, bool native_async) { + CephContext *cct = m_image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (m_image_ctx.blkin_trace_all) { + trace.init("wq: compare_and_write", &m_image_ctx.trace_endpoint); + trace.event("init"); + } + + c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_COMPARE_AND_WRITE); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "completion=" << c << ", off=" << off << ", " + << "len=" << len << dendl; + + if (native_async && m_image_ctx.event_socket.is_valid()) { + c->set_event_notify(true); + } + + if (!start_in_flight_io(c)) { + return; + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_image_ctx.non_blocking_aio || writes_blocked()) { + queue(ImageDispatchSpec<I>::create_compare_and_write_request( + m_image_ctx, c, {{off, len}}, std::move(cmp_bl), std::move(bl), + mismatch_off, op_flags, trace)); + } else { + c->start_op(); + ImageRequest<I>::aio_compare_and_write(&m_image_ctx, c, {{off, len}}, + std::move(cmp_bl), std::move(bl), + mismatch_off, op_flags, trace); + finish_in_flight_io(); + } + trace.event("finish"); +} + +template <typename I> +void ImageRequestWQ<I>::shut_down(Context 
*on_shutdown) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + { + RWLock::WLocker locker(m_lock); + ceph_assert(!m_shutdown); + m_shutdown = true; + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << __func__ << ": in_flight=" << m_in_flight_ios.load() + << dendl; + if (m_in_flight_ios > 0) { + m_on_shutdown = on_shutdown; + return; + } + } + + // ensure that all in-flight IO is flushed + flush_image(m_image_ctx, on_shutdown); +} + +template <typename I> +int ImageRequestWQ<I>::block_writes() { + C_SaferCond cond_ctx; + block_writes(&cond_ctx); + return cond_ctx.wait(); +} + +template <typename I> +void ImageRequestWQ<I>::block_writes(Context *on_blocked) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + + { + RWLock::WLocker locker(m_lock); + ++m_write_blockers; + ldout(cct, 5) << &m_image_ctx << ", " << "num=" + << m_write_blockers << dendl; + if (!m_write_blocker_contexts.empty() || m_in_flight_writes > 0) { + m_write_blocker_contexts.push_back(on_blocked); + return; + } + } + + // ensure that all in-flight IO is flushed + flush_image(m_image_ctx, on_blocked); +} + +template <typename I> +void ImageRequestWQ<I>::unblock_writes() { + CephContext *cct = m_image_ctx.cct; + + bool wake_up = false; + Contexts waiter_contexts; + { + RWLock::WLocker locker(m_lock); + ceph_assert(m_write_blockers > 0); + --m_write_blockers; + + ldout(cct, 5) << &m_image_ctx << ", " << "num=" + << m_write_blockers << dendl; + if (m_write_blockers == 0) { + wake_up = true; + std::swap(waiter_contexts, m_unblocked_write_waiter_contexts); + } + } + + if (wake_up) { + for (auto ctx : waiter_contexts) { + ctx->complete(0); + } + this->signal(); + } +} + +template <typename I> +void ImageRequestWQ<I>::wait_on_writes_unblocked(Context *on_unblocked) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + + { + RWLock::WLocker locker(m_lock); + ldout(cct, 20) << &m_image_ctx << ", " << 
"write_blockers=" + << m_write_blockers << dendl; + if (!m_unblocked_write_waiter_contexts.empty() || m_write_blockers > 0) { + m_unblocked_write_waiter_contexts.push_back(on_unblocked); + return; + } + } + + on_unblocked->complete(0); +} + +template <typename I> +void ImageRequestWQ<I>::set_require_lock(Direction direction, bool enabled) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + bool wake_up = false; + { + RWLock::WLocker locker(m_lock); + switch (direction) { + case DIRECTION_READ: + wake_up = (enabled != m_require_lock_on_read); + m_require_lock_on_read = enabled; + break; + case DIRECTION_WRITE: + wake_up = (enabled != m_require_lock_on_write); + m_require_lock_on_write = enabled; + break; + case DIRECTION_BOTH: + wake_up = (enabled != m_require_lock_on_read || + enabled != m_require_lock_on_write); + m_require_lock_on_read = enabled; + m_require_lock_on_write = enabled; + break; + } + } + + // wake up the thread pool whenever the state changes so that + // we can re-request the lock if required + if (wake_up) { + this->signal(); + } +} + +template <typename I> +void ImageRequestWQ<I>::apply_qos_schedule_tick_min(uint64_t tick){ + for (auto pair : m_throttles) { + pair.second->set_schedule_tick_min(tick); + } +} + +template <typename I> +void ImageRequestWQ<I>::apply_qos_limit(const uint64_t flag, + uint64_t limit, + uint64_t burst) { + CephContext *cct = m_image_ctx.cct; + TokenBucketThrottle *throttle = nullptr; + for (auto pair : m_throttles) { + if (flag == pair.first) { + throttle = pair.second; + break; + } + } + ceph_assert(throttle != nullptr); + + int r = throttle->set_limit(limit, burst); + if (r < 0) { + lderr(cct) << throttle->get_name() << ": invalid qos parameter: " + << "burst(" << burst << ") is less than " + << "limit(" << limit << ")" << dendl; + // if apply failed, we should at least make sure the limit works. 
+ throttle->set_limit(limit, 0); + } + + if (limit) + m_qos_enabled_flag |= flag; + else + m_qos_enabled_flag &= ~flag; +} + +template <typename I> +void ImageRequestWQ<I>::handle_throttle_ready(int r, ImageDispatchSpec<I> *item, + uint64_t flag) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 15) << "r=" << r << ", " << "req=" << item << dendl; + + std::lock_guard pool_locker{this->get_pool_lock()}; + ceph_assert(m_io_throttled.load() > 0); + item->set_throttled(flag); + if (item->were_all_throttled()) { + this->requeue_back(pool_locker, item); + --m_io_throttled; + this->signal(pool_locker); + } +} + +template <typename I> +bool ImageRequestWQ<I>::needs_throttle(ImageDispatchSpec<I> *item) { + uint64_t tokens = 0; + uint64_t flag = 0; + bool blocked = false; + TokenBucketThrottle* throttle = nullptr; + + for (auto t : m_throttles) { + flag = t.first; + if (item->was_throttled(flag)) + continue; + + if (!(m_qos_enabled_flag & flag)) { + item->set_throttled(flag); + continue; + } + + throttle = t.second; + if (item->tokens_requested(flag, &tokens) && + throttle->get<ImageRequestWQ<I>, ImageDispatchSpec<I>, + &ImageRequestWQ<I>::handle_throttle_ready>( + tokens, this, item, flag)) { + blocked = true; + } else { + item->set_throttled(flag); + } + } + return blocked; +} + +template <typename I> +void *ImageRequestWQ<I>::_void_dequeue() { + CephContext *cct = m_image_ctx.cct; + ImageDispatchSpec<I> *peek_item = this->front(); + + // no queued IO requests or all IO is blocked/stalled + if (peek_item == nullptr || m_io_blockers.load() > 0) { + return nullptr; + } + + if (needs_throttle(peek_item)) { + ldout(cct, 15) << "throttling IO " << peek_item << dendl; + + ++m_io_throttled; + // dequeue the throttled item + ThreadPool::PointerWQ<ImageDispatchSpec<I> >::_void_dequeue(); + return nullptr; + } + + bool lock_required; + bool refresh_required = m_image_ctx.state->is_refresh_required(); + { + RWLock::RLocker locker(m_lock); + bool write_op = 
peek_item->is_write_op(); + lock_required = is_lock_required(write_op); + if (write_op) { + if (!lock_required && m_write_blockers > 0) { + // missing lock is not the write blocker + return nullptr; + } + + if (!lock_required && !refresh_required) { + // completed ops will requeue the IO -- don't count it as in-progress + m_in_flight_writes++; + } + } + } + + auto item = reinterpret_cast<ImageDispatchSpec<I> *>( + ThreadPool::PointerWQ<ImageDispatchSpec<I> >::_void_dequeue()); + ceph_assert(peek_item == item); + + if (lock_required) { + this->get_pool_lock().unlock(); + m_image_ctx.owner_lock.get_read(); + if (m_image_ctx.exclusive_lock != nullptr) { + ldout(cct, 5) << "exclusive lock required: delaying IO " << item << dendl; + if (!m_image_ctx.get_exclusive_lock_policy()->may_auto_request_lock()) { + lderr(cct) << "op requires exclusive lock" << dendl; + fail_in_flight_io(m_image_ctx.exclusive_lock->get_unlocked_op_error(), + item); + + // wake up the IO since we won't be returning a request to process + this->signal(); + } else { + // stall IO until the acquire completes + ++m_io_blockers; + m_image_ctx.exclusive_lock->acquire_lock(new C_AcquireLock(this, item)); + } + } else { + // raced with the exclusive lock being disabled + lock_required = false; + } + m_image_ctx.owner_lock.put_read(); + this->get_pool_lock().lock(); + + if (lock_required) { + return nullptr; + } + } + + if (refresh_required) { + ldout(cct, 5) << "image refresh required: delaying IO " << item << dendl; + + // stall IO until the refresh completes + ++m_io_blockers; + + this->get_pool_lock().unlock(); + m_image_ctx.state->refresh(new C_RefreshFinish(this, item)); + this->get_pool_lock().lock(); + return nullptr; + } + + item->start_op(); + return item; +} + +template <typename I> +void ImageRequestWQ<I>::process(ImageDispatchSpec<I> *req) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "req=" << req << dendl; + + req->send(); + + 
finish_queued_io(req); + if (req->is_write_op()) { + finish_in_flight_write(); + } + delete req; + + finish_in_flight_io(); +} + +template <typename I> +void ImageRequestWQ<I>::finish_queued_io(ImageDispatchSpec<I> *req) { + RWLock::RLocker locker(m_lock); + if (req->is_write_op()) { + ceph_assert(m_queued_writes > 0); + m_queued_writes--; + } else { + ceph_assert(m_queued_reads > 0); + m_queued_reads--; + } +} + +template <typename I> +void ImageRequestWQ<I>::finish_in_flight_write() { + bool writes_blocked = false; + { + RWLock::RLocker locker(m_lock); + ceph_assert(m_in_flight_writes > 0); + if (--m_in_flight_writes == 0 && + !m_write_blocker_contexts.empty()) { + writes_blocked = true; + } + } + + if (writes_blocked) { + flush_image(m_image_ctx, new C_BlockedWrites(this)); + } +} + +template <typename I> +int ImageRequestWQ<I>::start_in_flight_io(AioCompletion *c) { + RWLock::RLocker locker(m_lock); + + if (m_shutdown) { + CephContext *cct = m_image_ctx.cct; + lderr(cct) << "IO received on closed image" << dendl; + + c->get(); + c->fail(-ESHUTDOWN); + return false; + } + + if (!m_image_ctx.data_ctx.is_valid()) { + CephContext *cct = m_image_ctx.cct; + lderr(cct) << "missing data pool" << dendl; + + c->get(); + c->fail(-ENODEV); + return false; + } + + m_in_flight_ios++; + return true; +} + +template <typename I> +void ImageRequestWQ<I>::finish_in_flight_io() { + Context *on_shutdown; + { + RWLock::RLocker locker(m_lock); + if (--m_in_flight_ios > 0 || !m_shutdown) { + return; + } + on_shutdown = m_on_shutdown; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "completing shut down" << dendl; + + ceph_assert(on_shutdown != nullptr); + flush_image(m_image_ctx, on_shutdown); +} + +template <typename I> +void ImageRequestWQ<I>::fail_in_flight_io( + int r, ImageDispatchSpec<I> *req) { + this->process_finish(); + req->fail(r); + finish_queued_io(req); + delete req; + finish_in_flight_io(); +} + +template <typename I> +bool 
ImageRequestWQ<I>::is_lock_required(bool write_op) const { + ceph_assert(m_lock.is_locked()); + return ((write_op && m_require_lock_on_write) || + (!write_op && m_require_lock_on_read)); +} + +template <typename I> +void ImageRequestWQ<I>::queue(ImageDispatchSpec<I> *req) { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "req=" << req << dendl; + + if (req->is_write_op()) { + m_queued_writes++; + } else { + m_queued_reads++; + } + + ThreadPool::PointerWQ<ImageDispatchSpec<I> >::queue(req); +} + +template <typename I> +void ImageRequestWQ<I>::handle_acquire_lock( + int r, ImageDispatchSpec<I> *req) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << ", " << "req=" << req << dendl; + + if (r < 0) { + fail_in_flight_io(r, req); + } else { + // since IO was stalled for acquire -- original IO order is preserved + // if we requeue this op for work queue processing + this->requeue_front(req); + } + + ceph_assert(m_io_blockers.load() > 0); + --m_io_blockers; + this->signal(); +} + +template <typename I> +void ImageRequestWQ<I>::handle_refreshed( + int r, ImageDispatchSpec<I> *req) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "resuming IO after image refresh: r=" << r << ", " + << "req=" << req << dendl; + if (r < 0) { + fail_in_flight_io(r, req); + } else { + // since IO was stalled for refresh -- original IO order is preserved + // if we requeue this op for work queue processing + this->requeue_front(req); + } + + ceph_assert(m_io_blockers.load() > 0); + --m_io_blockers; + this->signal(); +} + +template <typename I> +void ImageRequestWQ<I>::handle_blocked_writes(int r) { + Contexts contexts; + { + RWLock::WLocker locker(m_lock); + contexts.swap(m_write_blocker_contexts); + } + + for (auto ctx : contexts) { + ctx->complete(0); + } +} + +template class librbd::io::ImageRequestWQ<librbd::ImageCtx>; + +} // namespace io +} // namespace librbd 
diff --git a/src/librbd/io/ImageRequestWQ.h b/src/librbd/io/ImageRequestWQ.h new file mode 100644 index 00000000..23dcd48d --- /dev/null +++ b/src/librbd/io/ImageRequestWQ.h @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_REQUEST_WQ_H +#define CEPH_LIBRBD_IO_IMAGE_REQUEST_WQ_H + +#include "include/Context.h" +#include "common/RWLock.h" +#include "common/Throttle.h" +#include "common/WorkQueue.h" +#include "librbd/io/Types.h" + +#include <list> +#include <atomic> + +namespace librbd { + +class ImageCtx; + +namespace io { + +class AioCompletion; +template <typename> class ImageDispatchSpec; +class ReadResult; + +template <typename ImageCtxT = librbd::ImageCtx> +class ImageRequestWQ + : public ThreadPool::PointerWQ<ImageDispatchSpec<ImageCtxT> > { +public: + ImageRequestWQ(ImageCtxT *image_ctx, const string &name, time_t ti, + ThreadPool *tp); + ~ImageRequestWQ(); + + ssize_t read(uint64_t off, uint64_t len, ReadResult &&read_result, + int op_flags); + ssize_t write(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags); + ssize_t discard(uint64_t off, uint64_t len, + uint32_t discard_granularity_bytes); + ssize_t writesame(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags); + ssize_t write_zeroes(uint64_t off, uint64_t len, int zero_flags, + int op_flags); + ssize_t compare_and_write(uint64_t off, uint64_t len, + bufferlist &&cmp_bl, bufferlist &&bl, + uint64_t *mismatch_off, int op_flags); + int flush(); + + void aio_read(AioCompletion *c, uint64_t off, uint64_t len, + ReadResult &&read_result, int op_flags, bool native_async=true); + void aio_write(AioCompletion *c, uint64_t off, uint64_t len, + bufferlist &&bl, int op_flags, bool native_async=true); + void aio_discard(AioCompletion *c, uint64_t off, uint64_t len, + uint32_t discard_granularity_bytes, bool native_async=true); + void aio_flush(AioCompletion *c, bool native_async=true); + void 
aio_writesame(AioCompletion *c, uint64_t off, uint64_t len, + bufferlist &&bl, int op_flags, bool native_async=true); + void aio_write_zeroes(AioCompletion *c, uint64_t off, uint64_t len, + int zero_flags, int op_flags, bool native_async); + void aio_compare_and_write(AioCompletion *c, uint64_t off, + uint64_t len, bufferlist &&cmp_bl, + bufferlist &&bl, uint64_t *mismatch_off, + int op_flags, bool native_async=true); + + using ThreadPool::PointerWQ<ImageDispatchSpec<ImageCtxT> >::drain; + using ThreadPool::PointerWQ<ImageDispatchSpec<ImageCtxT> >::empty; + + void shut_down(Context *on_shutdown); + + inline bool writes_blocked() const { + RWLock::RLocker locker(m_lock); + return (m_write_blockers > 0); + } + + int block_writes(); + void block_writes(Context *on_blocked); + void unblock_writes(); + + void wait_on_writes_unblocked(Context *on_unblocked); + + void set_require_lock(Direction direction, bool enabled); + + void apply_qos_schedule_tick_min(uint64_t tick); + + void apply_qos_limit(const uint64_t flag, uint64_t limit, uint64_t burst); +protected: + void *_void_dequeue() override; + void process(ImageDispatchSpec<ImageCtxT> *req) override; + bool _empty() override { + return (ThreadPool::PointerWQ<ImageDispatchSpec<ImageCtxT>>::_empty() && + m_io_throttled.load() == 0); + } + + +private: + typedef std::list<Context *> Contexts; + + struct C_AcquireLock; + struct C_BlockedWrites; + struct C_RefreshFinish; + + ImageCtxT &m_image_ctx; + mutable RWLock m_lock; + Contexts m_write_blocker_contexts; + uint32_t m_write_blockers = 0; + Contexts m_unblocked_write_waiter_contexts; + bool m_require_lock_on_read = false; + bool m_require_lock_on_write = false; + std::atomic<unsigned> m_queued_reads { 0 }; + std::atomic<unsigned> m_queued_writes { 0 }; + std::atomic<unsigned> m_in_flight_ios { 0 }; + std::atomic<unsigned> m_in_flight_writes { 0 }; + std::atomic<unsigned> m_io_blockers { 0 }; + std::atomic<unsigned> m_io_throttled { 0 }; + + std::list<std::pair<uint64_t, 
TokenBucketThrottle*> > m_throttles; + uint64_t m_qos_enabled_flag = 0; + + bool m_shutdown = false; + Context *m_on_shutdown = nullptr; + + bool is_lock_required(bool write_op) const; + + inline bool require_lock_on_read() const { + RWLock::RLocker locker(m_lock); + return m_require_lock_on_read; + } + inline bool writes_empty() const { + RWLock::RLocker locker(m_lock); + return (m_queued_writes == 0); + } + + bool needs_throttle(ImageDispatchSpec<ImageCtxT> *item); + + void finish_queued_io(ImageDispatchSpec<ImageCtxT> *req); + void finish_in_flight_write(); + + int start_in_flight_io(AioCompletion *c); + void finish_in_flight_io(); + void fail_in_flight_io(int r, ImageDispatchSpec<ImageCtxT> *req); + + void queue(ImageDispatchSpec<ImageCtxT> *req); + + void handle_acquire_lock(int r, ImageDispatchSpec<ImageCtxT> *req); + void handle_refreshed(int r, ImageDispatchSpec<ImageCtxT> *req); + void handle_blocked_writes(int r); + + void handle_throttle_ready(int r, ImageDispatchSpec<ImageCtxT> *item, uint64_t flag); +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ImageRequestWQ<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_WQ_H diff --git a/src/librbd/io/ObjectDispatch.cc b/src/librbd/io/ObjectDispatch.cc new file mode 100644 index 00000000..85c8b034 --- /dev/null +++ b/src/librbd/io/ObjectDispatch.cc @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ObjectDispatch.h" +#include "common/dout.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/ObjectRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ObjectDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template <typename I> +ObjectDispatch<I>::ObjectDispatch(I* image_ctx) + : m_image_ctx(image_ctx) { +} + +template <typename I> +void 
ObjectDispatch<I>::shut_down(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + m_image_ctx->op_work_queue->queue(on_finish, 0); +} + +template <typename I> +bool ObjectDispatch<I>::read( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, librados::snap_t snap_id, int op_flags, + const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data, + ExtentMap* extent_map, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = new ObjectReadRequest<I>(m_image_ctx, oid, object_no, object_off, + object_len, snap_id, op_flags, + parent_trace, read_data, extent_map, + on_dispatched); + req->send(); + return true; +} + +template <typename I> +bool ObjectDispatch<I>::discard( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, const ::SnapContext &snapc, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = new ObjectDiscardRequest<I>(m_image_ctx, oid, object_no, + object_off, object_len, snapc, + discard_flags, parent_trace, + on_dispatched); + req->send(); + return true; +} + +template <typename I> +bool ObjectDispatch<I>::write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct 
= m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << data.length() << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = new ObjectWriteRequest<I>(m_image_ctx, oid, object_no, object_off, + std::move(data), snapc, op_flags, + parent_trace, on_dispatched); + req->send(); + return true; +} + +template <typename I> +bool ObjectDispatch<I>::write_same( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, Extents&& buffer_extents, ceph::bufferlist&& data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = new ObjectWriteSameRequest<I>(m_image_ctx, oid, object_no, + object_off, object_len, + std::move(data), snapc, op_flags, + parent_trace, on_dispatched); + req->send(); + return true; +} + +template <typename I> +bool ObjectDispatch<I>::compare_and_write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << write_data.length() + << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = new ObjectCompareAndWriteRequest<I>(m_image_ctx, oid, object_no, + object_off, + std::move(cmp_data), + std::move(write_data), snapc, + mismatch_offset, op_flags, + parent_trace, on_dispatched); + req->send(); + return true; +} + +} // namespace io +} // namespace librbd 
+ +template class librbd::io::ObjectDispatch<librbd::ImageCtx>; diff --git a/src/librbd/io/ObjectDispatch.h b/src/librbd/io/ObjectDispatch.h new file mode 100644 index 00000000..32e8272d --- /dev/null +++ b/src/librbd/io/ObjectDispatch.h @@ -0,0 +1,104 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "common/snap_types.h" +#include "common/zipkin_trace.h" +#include "librbd/io/Types.h" +#include "librbd/io/ObjectDispatchInterface.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template <typename ImageCtxT = librbd::ImageCtx> +class ObjectDispatch : public ObjectDispatchInterface { +public: + ObjectDispatch(ImageCtxT* image_ctx); + + ObjectDispatchLayer get_object_dispatch_layer() const override { + return OBJECT_DISPATCH_LAYER_CORE; + } + + void shut_down(Context* on_finish) override; + + bool read( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, librados::snap_t snap_id, int op_flags, + const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data, + ExtentMap* extent_map, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, const ::SnapContext &snapc, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* 
object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, Extents&& buffer_extents, ceph::bufferlist&& data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + bool reset_existence_cache(Context* on_finish) override { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override { + } + +private: + ImageCtxT* m_image_ctx; + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ObjectDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_H diff --git a/src/librbd/io/ObjectDispatchInterface.h b/src/librbd/io/ObjectDispatchInterface.h new file mode 100644 index 00000000..fddf0db1 --- /dev/null +++ b/src/librbd/io/ObjectDispatchInterface.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef 
CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H

#include "include/int_types.h"
#include "include/buffer.h"
#include "include/rados/librados.hpp"
#include "common/snap_types.h"
#include "common/zipkin_trace.h"
#include "librbd/io/Types.h"

struct Context;
struct RWLock;

namespace librbd {
namespace io {

struct AioCompletion;

// Abstract interface implemented by every object dispatch layer.
// ObjectDispatcher stores layers keyed by get_object_dispatch_layer() and
// offers each request to them in ascending key order.
//
// For the bool-returning entry points, the dispatcher interprets the result
// as "handled": when true it stops walking layers and waits for the
// on_dispatched context to be completed; when false it proceeds to the next
// layer.
struct ObjectDispatchInterface {
  virtual ~ObjectDispatchInterface() {
  }

  virtual ObjectDispatchLayer get_object_dispatch_layer() const = 0;

  // Invoked by ObjectDispatcher once in-flight ops tracked for this layer
  // have drained; the layer object is deleted after on_finish fires.
  virtual void shut_down(Context* on_finish) = 0;

  virtual bool read(
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      uint64_t object_len, librados::snap_t snap_id, int op_flags,
      const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
      ExtentMap* extent_map, int* object_dispatch_flags,
      DispatchResult* dispatch_result, Context** on_finish,
      Context* on_dispatched) = 0;

  virtual bool discard(
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
      uint64_t* journal_tid, DispatchResult* dispatch_result,
      Context**on_finish, Context* on_dispatched) = 0;

  virtual bool write(
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
      uint64_t* journal_tid, DispatchResult* dispatch_result,
      Context**on_finish, Context* on_dispatched) = 0;

  virtual bool write_same(
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      uint64_t object_len, Extents&& buffer_extents, ceph::bufferlist&& data,
      const ::SnapContext &snapc, int op_flags,
      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
      uint64_t* journal_tid, DispatchResult* dispatch_result,
      Context**on_finish, Context*
on_dispatched) = 0;

  virtual bool compare_and_write(
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
      const ::SnapContext &snapc, int op_flags,
      const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
      int* object_dispatch_flags, uint64_t* journal_tid,
      DispatchResult* dispatch_result, Context** on_finish,
      Context* on_dispatched) = 0;

  virtual bool flush(
      FlushSource flush_source, const ZTracer::Trace &parent_trace,
      DispatchResult* dispatch_result, Context** on_finish,
      Context* on_dispatched) = 0;

  virtual bool invalidate_cache(Context* on_finish) = 0;
  virtual bool reset_existence_cache(Context* on_finish) = 0;

  virtual void extent_overwritten(
      uint64_t object_no, uint64_t object_off, uint64_t object_len,
      uint64_t journal_tid, uint64_t new_journal_tid) = 0;

};

} // namespace io
} // namespace librbd

#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
diff --git a/src/librbd/io/ObjectDispatchSpec.cc b/src/librbd/io/ObjectDispatchSpec.cc new file mode 100644 index 00000000..2b6dccc5 --- /dev/null +++ b/src/librbd/io/ObjectDispatchSpec.cc @@ -0,0 +1,46 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/io/ObjectDispatchSpec.h"
#include "include/Context.h"
#include "librbd/io/ObjectDispatcher.h"
#include <boost/variant.hpp>

namespace librbd {
namespace io {

// Invoked when a layer (or the dispatcher itself) signals that the current
// dispatch step is done.  A negative r short-circuits straight to finish();
// otherwise the layer-provided dispatch_result decides whether to keep
// walking layers or to complete the request.
void ObjectDispatchSpec::C_Dispatcher::complete(int r) {
  if (r < 0) {
    finish(r);
    return;
  }

  switch (object_dispatch_spec->dispatch_result) {
  case DISPATCH_RESULT_CONTINUE:
    // re-enter the dispatcher; it resumes after the last visited layer
    object_dispatch_spec->send();
    break;
  case DISPATCH_RESULT_COMPLETE:
    finish(r);
    break;
  case DISPATCH_RESULT_INVALID:
    // a layer completed the context without setting a result
    ceph_abort();
    break;
  }
}

// Completes the user's callback, then destroys the owning spec.  This
// C_Dispatcher is a by-value member of that spec (dispatcher_ctx), so the
// delete also ends the lifetime of *this -- nothing may touch members after
// it.
void ObjectDispatchSpec::C_Dispatcher::finish(int r) {
  on_finish->complete(r);
  delete object_dispatch_spec;
}

// Hands this spec to the dispatcher it was created against.
void ObjectDispatchSpec::send() {
object_dispatcher->send(this);
}

// Fails the request with a negative error code; routed through the
// dispatcher context, whose complete() forwards r < 0 directly to finish().
void ObjectDispatchSpec::fail(int r) {
  ceph_assert(r < 0);
  dispatcher_ctx.complete(r);
}

} // namespace io
} // namespace librbd
diff --git a/src/librbd/io/ObjectDispatchSpec.h b/src/librbd/io/ObjectDispatchSpec.h new file mode 100644 index 00000000..a26d89fe --- /dev/null +++ b/src/librbd/io/ObjectDispatchSpec.h @@ -0,0 +1,266 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H

#include "include/int_types.h"
#include "include/buffer.h"
#include "include/Context.h"
#include "include/rados/librados.hpp"
#include "common/snap_types.h"
#include "common/zipkin_trace.h"
#include "librbd/io/Types.h"
#include <boost/variant/variant.hpp>

namespace librbd {
namespace io {

struct ObjectDispatcherInterface;

// Describes one object-level I/O request (payload, flags, trace, completion)
// together with its progress through the dispatch layers.  Instances are
// heap-allocated by the create_*() factories and delete themselves from
// C_Dispatcher::finish().
struct ObjectDispatchSpec {
private:
  // helper to avoid extra heap allocation per object IO
  struct C_Dispatcher : public Context {
    ObjectDispatchSpec* object_dispatch_spec;
    // layers receive a pointer to this member and may replace the callback
    Context* on_finish;

    C_Dispatcher(ObjectDispatchSpec* object_dispatch_spec, Context* on_finish)
      : object_dispatch_spec(object_dispatch_spec), on_finish(on_finish) {
    }

    void complete(int r) override;
    void finish(int r) override;
  };

public:
  // Fields shared by every object-addressed request.
  struct RequestBase {
    std::string oid;
    uint64_t object_no;
    uint64_t object_off;

    RequestBase(const std::string& oid, uint64_t object_no, uint64_t object_off)
      : oid(oid), object_no(object_no), object_off(object_off) {
    }
  };

  // Read payload; read_data / extent_map are caller-provided output pointers
  // that are stored here without taking ownership.
  struct ReadRequest : public RequestBase {
    uint64_t object_len;
    librados::snap_t snap_id;
    ceph::bufferlist* read_data;
    ExtentMap* extent_map;

    ReadRequest(const std::string& oid, uint64_t object_no, uint64_t object_off,
                uint64_t object_len, librados::snap_t snap_id,
                ceph::bufferlist* read_data, ExtentMap* extent_map)
      : RequestBase(oid, object_no, object_off),
object_len(object_len), snap_id(snap_id), read_data(read_data),
        extent_map(extent_map) {
    }
  };

  // Fields shared by every mutating request.  journal_tid is passed to the
  // layers by address, so a layer may update it in place.
  struct WriteRequestBase : public RequestBase {
    ::SnapContext snapc;
    uint64_t journal_tid;

    WriteRequestBase(const std::string& oid, uint64_t object_no,
                     uint64_t object_off, const ::SnapContext& snapc,
                     uint64_t journal_tid)
      : RequestBase(oid, object_no, object_off), snapc(snapc),
        journal_tid(journal_tid) {
    }
  };

  struct DiscardRequest : public WriteRequestBase {
    uint64_t object_len;
    int discard_flags;

    DiscardRequest(const std::string& oid, uint64_t object_no,
                   uint64_t object_off, uint64_t object_len,
                   int discard_flags, const ::SnapContext& snapc,
                   uint64_t journal_tid)
      : WriteRequestBase(oid, object_no, object_off, snapc, journal_tid),
        object_len(object_len), discard_flags(discard_flags) {
    }
  };

  // Owns the payload: the bufferlist is moved in on construction.
  struct WriteRequest : public WriteRequestBase {
    ceph::bufferlist data;

    WriteRequest(const std::string& oid, uint64_t object_no,
                 uint64_t object_off, ceph::bufferlist&& data,
                 const ::SnapContext& snapc, uint64_t journal_tid)
      : WriteRequestBase(oid, object_no, object_off, snapc, journal_tid),
        data(std::move(data)) {
    }
  };

  struct WriteSameRequest : public WriteRequestBase {
    uint64_t object_len;
    Extents buffer_extents;
    ceph::bufferlist data;

    WriteSameRequest(const std::string& oid, uint64_t object_no,
                     uint64_t object_off, uint64_t object_len,
                     Extents&& buffer_extents, ceph::bufferlist&& data,
                     const ::SnapContext& snapc, uint64_t journal_tid)
      : WriteRequestBase(oid, object_no, object_off, snapc, journal_tid),
        object_len(object_len), buffer_extents(std::move(buffer_extents)),
        data(std::move(data)) {
    }
  };

  // mismatch_offset is a caller-provided output pointer (not owned).
  struct CompareAndWriteRequest : public WriteRequestBase {
    ceph::bufferlist cmp_data;
    ceph::bufferlist data;
    uint64_t* mismatch_offset;

    CompareAndWriteRequest(const std::string& oid, uint64_t object_no,
                           uint64_t object_off, ceph::bufferlist&& cmp_data,
                           ceph::bufferlist&&
data, uint64_t* mismatch_offset,
                           const ::SnapContext& snapc, uint64_t journal_tid)
      : WriteRequestBase(oid, object_no, object_off, snapc, journal_tid),
        cmp_data(std::move(cmp_data)), data(std::move(data)),
        mismatch_offset(mismatch_offset) {
    }
  };

  // Flush carries no object address, so it does not derive from RequestBase.
  struct FlushRequest {
    FlushSource flush_source;

    FlushRequest(FlushSource flush_source) : flush_source(flush_source) {
    }
  };

  // Tagged union of all request payloads; ObjectDispatcher's SendVisitor
  // dispatches on the active alternative.
  typedef boost::variant<ReadRequest,
                         DiscardRequest,
                         WriteRequest,
                         WriteSameRequest,
                         CompareAndWriteRequest,
                         FlushRequest> Request;

  // embedded completion context handed to each layer as on_dispatched
  C_Dispatcher dispatcher_ctx;

  ObjectDispatcherInterface* object_dispatcher;
  // last layer visited; the dispatcher resumes strictly after this layer
  ObjectDispatchLayer object_dispatch_layer;
  int object_dispatch_flags = 0;
  // reset to INVALID by the dispatcher before each layer is invoked
  DispatchResult dispatch_result = DISPATCH_RESULT_INVALID;

  Request request;
  int op_flags;
  ZTracer::Trace parent_trace;

  // Factory helpers: each allocates a spec bound to
  // image_ctx->io_object_dispatcher.  The returned object deletes itself when
  // the request finishes, so callers must not delete it after send()/fail().
  template <typename ImageCtxT>
  static ObjectDispatchSpec* create_read(
      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      uint64_t object_len, librados::snap_t snap_id, int op_flags,
      const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
      ExtentMap* extent_map, Context* on_finish) {
    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
                                  object_dispatch_layer,
                                  ReadRequest{oid, object_no, object_off,
                                              object_len, snap_id, read_data,
                                              extent_map},
                                  op_flags, parent_trace, on_finish);
  }

  // Discards take discard_flags instead of op_flags; op_flags is fixed at 0.
  template <typename ImageCtxT>
  static ObjectDispatchSpec* create_discard(
      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
      uint64_t journal_tid, const ZTracer::Trace &parent_trace,
      Context *on_finish) {
    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
                                  object_dispatch_layer,
                                  DiscardRequest{oid, object_no, object_off,
                                                 object_len, discard_flags,
                                                 snapc, journal_tid},
                                  0,
parent_trace, on_finish);
  }

  // The payload bufferlist is moved into the request.
  template <typename ImageCtxT>
  static ObjectDispatchSpec* create_write(
      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
      uint64_t journal_tid, const ZTracer::Trace &parent_trace,
      Context *on_finish) {
    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
                                  object_dispatch_layer,
                                  WriteRequest{oid, object_no, object_off,
                                               std::move(data), snapc,
                                               journal_tid},
                                  op_flags, parent_trace, on_finish);
  }

  // Both buffer_extents and data are moved into the request.
  template <typename ImageCtxT>
  static ObjectDispatchSpec* create_write_same(
      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      uint64_t object_len, Extents&& buffer_extents,
      ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
      uint64_t journal_tid, const ZTracer::Trace &parent_trace,
      Context *on_finish) {
    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
                                  object_dispatch_layer,
                                  WriteSameRequest{oid, object_no, object_off,
                                                   object_len,
                                                   std::move(buffer_extents),
                                                   std::move(data), snapc,
                                                   journal_tid},
                                  op_flags, parent_trace, on_finish);
  }

  // NOTE: the parameter order here (snapc, mismatch_offset, op_flags) differs
  // from CompareAndWriteRequest's constructor (mismatch_offset, snapc); the
  // braced initializer below performs the reordering.
  template <typename ImageCtxT>
  static ObjectDispatchSpec* create_compare_and_write(
      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
      const std::string &oid, uint64_t object_no, uint64_t object_off,
      ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
      const ::SnapContext &snapc, uint64_t *mismatch_offset, int op_flags,
      uint64_t journal_tid, const ZTracer::Trace &parent_trace,
      Context *on_finish) {
    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
                                  object_dispatch_layer,
                                  CompareAndWriteRequest{oid, object_no,
                                                         object_off,
                                                         std::move(cmp_data),
                                                         std::move(write_data),
                                                         mismatch_offset,
                                                         snapc, journal_tid},
                                  op_flags, parent_trace,
on_finish);
  }

  // Flush requests carry no op_flags; 0 is passed.
  template <typename ImageCtxT>
  static ObjectDispatchSpec* create_flush(
      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
      FlushSource flush_source, const ZTracer::Trace &parent_trace,
      Context *on_finish) {
    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
                                  object_dispatch_layer,
                                  FlushRequest{flush_source}, 0,
                                  parent_trace, on_finish);
  }

  // Starts (or resumes) dispatching through the layers.
  void send();
  // Completes the request with a negative error code.
  void fail(int r);

private:
  template <typename> friend class ObjectDispatcher;

  // Private: instances must come from the create_*() factories so that they
  // are always heap-allocated (finish() deletes the spec).
  ObjectDispatchSpec(ObjectDispatcherInterface* object_dispatcher,
                     ObjectDispatchLayer object_dispatch_layer,
                     Request&& request, int op_flags,
                     const ZTracer::Trace& parent_trace, Context* on_finish)
    : dispatcher_ctx(this, on_finish), object_dispatcher(object_dispatcher),
      object_dispatch_layer(object_dispatch_layer), request(std::move(request)),
      op_flags(op_flags), parent_trace(parent_trace) {
  }

};

} // namespace io
} // namespace librbd

#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
diff --git a/src/librbd/io/ObjectDispatcher.cc b/src/librbd/io/ObjectDispatcher.cc new file mode 100644 index 00000000..befc0750 --- /dev/null +++ b/src/librbd/io/ObjectDispatcher.cc @@ -0,0 +1,354 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/io/ObjectDispatcher.h"
#include "include/Context.h"
#include "common/AsyncOpTracker.h"
#include "common/dout.h"
#include "common/WorkQueue.h"
#include "librbd/ImageCtx.h"
#include "librbd/Utils.h"
#include "librbd/io/ObjectDispatch.h"
#include "librbd/io/ObjectDispatchSpec.h"
#include <boost/variant.hpp>

#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
#define dout_prefix *_dout << "librbd::io::ObjectDispatcher: " << this \
                           << " " << __func__ << ": "

namespace librbd {
namespace io {

template <typename I>
struct ObjectDispatcher<I>::C_LayerIterator : public Context {
  ObjectDispatcher* object_dispatcher;
  Context*
on_finish;

  // last layer visited; NONE means the walk has not started yet
  ObjectDispatchLayer object_dispatch_layer = OBJECT_DISPATCH_LAYER_NONE;

  C_LayerIterator(ObjectDispatcher* object_dispatcher,
                  Context* on_finish)
    : object_dispatcher(object_dispatcher), on_finish(on_finish) {
  }

  // Offers the operation to each registered layer in ascending layer order,
  // starting after the last layer visited.  The walk stops as soon as a
  // layer's execute() returns true; that layer was given `this` as its
  // completion, so presumably completing it re-enters this method and the
  // walk resumes with the next layer.  When no layers remain,
  // Context::complete(r) is invoked to finish.
  void complete(int r) override {
    while (true) {
      object_dispatcher->m_lock.get_read();
      auto it = object_dispatcher->m_object_dispatches.upper_bound(
        object_dispatch_layer);
      if (it == object_dispatcher->m_object_dispatches.end()) {
        object_dispatcher->m_lock.put_read();
        Context::complete(r);
        return;
      }

      auto& object_dispatch_meta = it->second;
      auto object_dispatch = object_dispatch_meta.object_dispatch;

      // prevent recursive locking back into the dispatcher while handling IO
      object_dispatch_meta.async_op_tracker->start_op();
      object_dispatcher->m_lock.put_read();

      // next loop should start after current layer
      object_dispatch_layer = object_dispatch->get_object_dispatch_layer();

      auto handled = execute(object_dispatch, this);
      object_dispatch_meta.async_op_tracker->finish_op();

      if (handled) {
        break;
      }
    }
  }

  // NOTE(review): the incoming r is discarded and the caller's context is
  // always completed with 0 -- confirm that dropping errors is intended.
  void finish(int r) override {
    on_finish->complete(0);
  }

  // Per-operation hook: invoke the operation on one layer.  Returns the
  // layer's "handled" result.
  virtual bool execute(ObjectDispatchInterface* object_dispatch,
                       Context* on_finish) = 0;
};

// Layer iterator that calls invalidate_cache() on each layer.
template <typename I>
struct ObjectDispatcher<I>::C_InvalidateCache : public C_LayerIterator {
  C_InvalidateCache(ObjectDispatcher* object_dispatcher, Context* on_finish)
    : C_LayerIterator(object_dispatcher, on_finish) {
  }

  bool execute(ObjectDispatchInterface* object_dispatch,
               Context* on_finish) override {
    return object_dispatch->invalidate_cache(on_finish);
  }
};

// Layer iterator that calls reset_existence_cache() on each layer.
template <typename I>
struct ObjectDispatcher<I>::C_ResetExistenceCache : public C_LayerIterator {
  C_ResetExistenceCache(ObjectDispatcher* object_dispatcher, Context* on_finish)
    : C_LayerIterator(object_dispatcher, on_finish) {
  }

  bool execute(ObjectDispatchInterface* object_dispatch,
               Context* on_finish) override {
    return
object_dispatch->reset_existence_cache(on_finish);
  }
};

// Variant visitor that forwards the active request payload of a spec to the
// matching entry point of one dispatch layer.  Every overload passes:
//   - the spec-wide op_flags / parent_trace (where the entry point takes them),
//   - pointers to the spec's object_dispatch_flags and dispatch_result so the
//     layer can report back,
//   - a pointer to dispatcher_ctx.on_finish so the layer can replace the
//     completion callback, and
//   - dispatcher_ctx itself as the "on_dispatched" context.
// The return value is the layer's "handled" flag.
template <typename I>
struct ObjectDispatcher<I>::SendVisitor : public boost::static_visitor<bool> {
  ObjectDispatchInterface* object_dispatch;
  ObjectDispatchSpec* object_dispatch_spec;

  SendVisitor(ObjectDispatchInterface* object_dispatch,
              ObjectDispatchSpec* object_dispatch_spec)
    : object_dispatch(object_dispatch),
      object_dispatch_spec(object_dispatch_spec) {
  }

  bool operator()(ObjectDispatchSpec::ReadRequest& read) const {
    return object_dispatch->read(
      read.oid, read.object_no, read.object_off, read.object_len, read.snap_id,
      object_dispatch_spec->op_flags, object_dispatch_spec->parent_trace,
      read.read_data, read.extent_map,
      &object_dispatch_spec->object_dispatch_flags,
      &object_dispatch_spec->dispatch_result,
      &object_dispatch_spec->dispatcher_ctx.on_finish,
      &object_dispatch_spec->dispatcher_ctx);
  }

  bool operator()(ObjectDispatchSpec::DiscardRequest& discard) const {
    return object_dispatch->discard(
      discard.oid, discard.object_no, discard.object_off, discard.object_len,
      discard.snapc, discard.discard_flags, object_dispatch_spec->parent_trace,
      &object_dispatch_spec->object_dispatch_flags, &discard.journal_tid,
      &object_dispatch_spec->dispatch_result,
      &object_dispatch_spec->dispatcher_ctx.on_finish,
      &object_dispatch_spec->dispatcher_ctx);
  }

  // NOTE: the payload is passed as an rvalue, so a layer is allowed to move
  // from the request's bufferlist.
  bool operator()(ObjectDispatchSpec::WriteRequest& write) const {
    return object_dispatch->write(
      write.oid, write.object_no, write.object_off, std::move(write.data),
      write.snapc, object_dispatch_spec->op_flags,
      object_dispatch_spec->parent_trace,
      &object_dispatch_spec->object_dispatch_flags, &write.journal_tid,
      &object_dispatch_spec->dispatch_result,
      &object_dispatch_spec->dispatcher_ctx.on_finish,
      &object_dispatch_spec->dispatcher_ctx);
  }

  bool operator()(ObjectDispatchSpec::WriteSameRequest& write_same) const {
    return object_dispatch->write_same(
      write_same.oid, write_same.object_no,
write_same.object_off,
      write_same.object_len, std::move(write_same.buffer_extents),
      std::move(write_same.data), write_same.snapc,
      object_dispatch_spec->op_flags, object_dispatch_spec->parent_trace,
      &object_dispatch_spec->object_dispatch_flags, &write_same.journal_tid,
      &object_dispatch_spec->dispatch_result,
      &object_dispatch_spec->dispatcher_ctx.on_finish,
      &object_dispatch_spec->dispatcher_ctx);
  }

  bool operator()(
      ObjectDispatchSpec::CompareAndWriteRequest& compare_and_write) const {
    return object_dispatch->compare_and_write(
      compare_and_write.oid, compare_and_write.object_no,
      compare_and_write.object_off, std::move(compare_and_write.cmp_data),
      std::move(compare_and_write.data), compare_and_write.snapc,
      object_dispatch_spec->op_flags, object_dispatch_spec->parent_trace,
      compare_and_write.mismatch_offset,
      &object_dispatch_spec->object_dispatch_flags,
      &compare_and_write.journal_tid,
      &object_dispatch_spec->dispatch_result,
      &object_dispatch_spec->dispatcher_ctx.on_finish,
      &object_dispatch_spec->dispatcher_ctx);
  }

  // flush() takes neither op_flags nor object_dispatch_flags
  bool operator()(ObjectDispatchSpec::FlushRequest& flush) const {
    return object_dispatch->flush(
      flush.flush_source, object_dispatch_spec->parent_trace,
      &object_dispatch_spec->dispatch_result,
      &object_dispatch_spec->dispatcher_ctx.on_finish,
      &object_dispatch_spec->dispatcher_ctx);
  }
};

// Creates the dispatcher and registers the core ObjectDispatch layer,
// paired with a fresh AsyncOpTracker.  Both raw pointers are owned by the
// layer map and released through shut_down_object_dispatch().
template <typename I>
ObjectDispatcher<I>::ObjectDispatcher(I* image_ctx)
  : m_image_ctx(image_ctx),
    m_lock(librbd::util::unique_lock_name("librbd::io::ObjectDispatcher::lock",
                                          this)) {
  // configure the core object dispatch handler on startup
  auto object_dispatch = new ObjectDispatch(image_ctx);
  m_object_dispatches[object_dispatch->get_object_dispatch_layer()] =
    {object_dispatch, new AsyncOpTracker()};
}

// All layers must already have been removed (via shut_down()); the
// destructor itself frees nothing.
template <typename I>
ObjectDispatcher<I>::~ObjectDispatcher() {
  ceph_assert(m_object_dispatches.empty());
}

// Shuts down every registered layer and then completes on_finish.
template <typename I>
void ObjectDispatcher<I>::shut_down(Context* on_finish) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 5) << dendl;

  // detach all layers under the write lock so that no new request can find
  // them; the actual teardown happens outside the lock
  std::map<ObjectDispatchLayer, ObjectDispatchMeta> object_dispatches;
  {
    RWLock::WLocker locker(m_lock);
    std::swap(object_dispatches, m_object_dispatches);
  }

  // each call wraps on_finish in another teardown stage, producing one chain
  // that shuts the layers down one after another
  for (auto it : object_dispatches) {
    shut_down_object_dispatch(it.second, &on_finish);
  }
  // kick off the chain
  on_finish->complete(0);
}

// Adds a layer to the dispatcher.  The layer key must be below
// OBJECT_DISPATCH_LAYER_LAST and must not already be registered.
template <typename I>
void ObjectDispatcher<I>::register_object_dispatch(
    ObjectDispatchInterface* object_dispatch) {
  auto cct = m_image_ctx->cct;
  auto type = object_dispatch->get_object_dispatch_layer();
  ldout(cct, 5) << "object_dispatch_layer=" << type << dendl;

  RWLock::WLocker locker(m_lock);
  ceph_assert(type < OBJECT_DISPATCH_LAYER_LAST);

  auto result = m_object_dispatches.insert(
    {type, {object_dispatch, new AsyncOpTracker()}});
  ceph_assert(result.second);
}

// Removes a single layer (which must be registered) and shuts it down
// asynchronously; on_finish fires once the layer has been deleted.
template <typename I>
void ObjectDispatcher<I>::shut_down_object_dispatch(
    ObjectDispatchLayer object_dispatch_layer, Context* on_finish) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 5) << "object_dispatch_layer=" << object_dispatch_layer << dendl;
  ceph_assert(object_dispatch_layer + 1 < OBJECT_DISPATCH_LAYER_LAST);

  ObjectDispatchMeta object_dispatch_meta;
  {
    RWLock::WLocker locker(m_lock);
    auto it = m_object_dispatches.find(object_dispatch_layer);
    ceph_assert(it != m_object_dispatches.end());

    object_dispatch_meta = it->second;
    m_object_dispatches.erase(it);
  }

  shut_down_object_dispatch(object_dispatch_meta, &on_finish);
  on_finish->complete(0);
}

// Internal helper: rewrites *on_finish into a three-stage chain for one
// layer.  The stages are constructed in reverse order of execution:
//   1. wait for the layer's in-flight ops to drain,
//   2. call the layer's shut_down(),
//   3. delete the layer and its op tracker, then run the original callback.
template <typename I>
void ObjectDispatcher<I>::shut_down_object_dispatch(
    ObjectDispatchMeta& object_dispatch_meta, Context** on_finish) {
  auto object_dispatch = object_dispatch_meta.object_dispatch;
  auto async_op_tracker = object_dispatch_meta.async_op_tracker;

  Context* ctx = *on_finish;
  // stage 3: release the layer, then continue with the caller's context
  ctx = new FunctionContext(
    [object_dispatch, async_op_tracker, ctx](int r) {
      delete object_dispatch;
      delete async_op_tracker;

      ctx->complete(r);
    });
+ ctx = new FunctionContext([object_dispatch, ctx](int r) { + object_dispatch->shut_down(ctx); + }); + *on_finish = new FunctionContext([async_op_tracker, ctx](int r) { + async_op_tracker->wait_for_ops(ctx); + }); +} + +template <typename I> +void ObjectDispatcher<I>::invalidate_cache(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + on_finish = util::create_async_context_callback(*m_image_ctx, on_finish); + auto ctx = new C_InvalidateCache(this, on_finish); + ctx->complete(0); +} + +template <typename I> +void ObjectDispatcher<I>::reset_existence_cache(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + on_finish = util::create_async_context_callback(*m_image_ctx, on_finish); + auto ctx = new C_ResetExistenceCache(this, on_finish); + ctx->complete(0); +} + +template <typename I> +void ObjectDispatcher<I>::extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << object_no << " " << object_off << "~" << object_len + << dendl; + + for (auto it : m_object_dispatches) { + auto& object_dispatch_meta = it.second; + auto object_dispatch = object_dispatch_meta.object_dispatch; + object_dispatch->extent_overwritten(object_no, object_off, object_len, + journal_tid, new_journal_tid); + } +} + +template <typename I> +void ObjectDispatcher<I>::send(ObjectDispatchSpec* object_dispatch_spec) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_dispatch_spec=" << object_dispatch_spec << dendl; + + auto object_dispatch_layer = object_dispatch_spec->object_dispatch_layer; + ceph_assert(object_dispatch_layer + 1 < OBJECT_DISPATCH_LAYER_LAST); + + // apply the IO request to all layers -- this method will be re-invoked + // by the dispatch layer if continuing / restarting the IO + while (true) { + m_lock.get_read(); + object_dispatch_layer = object_dispatch_spec->object_dispatch_layer; + 
auto it = m_object_dispatches.upper_bound(object_dispatch_layer);
    if (it == m_object_dispatches.end()) {
      // the request is complete if handled by all layers
      object_dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE;
      m_lock.put_read();
      break;
    }

    auto& object_dispatch_meta = it->second;
    auto object_dispatch = object_dispatch_meta.object_dispatch;
    // a layer that handles the request must overwrite this before completing
    // the dispatcher context (INVALID aborts in C_Dispatcher::complete)
    object_dispatch_spec->dispatch_result = DISPATCH_RESULT_INVALID;

    // prevent recursive locking back into the dispatcher while handling IO
    object_dispatch_meta.async_op_tracker->start_op();
    m_lock.put_read();

    // advance to next layer in case we skip or continue
    object_dispatch_spec->object_dispatch_layer =
      object_dispatch->get_object_dispatch_layer();

    bool handled = boost::apply_visitor(
      SendVisitor{object_dispatch, object_dispatch_spec},
      object_dispatch_spec->request);
    object_dispatch_meta.async_op_tracker->finish_op();

    // handled ops will resume when the dispatch ctx is invoked
    if (handled) {
      return;
    }
  }

  // skipped through to the last layer
  object_dispatch_spec->dispatcher_ctx.complete(0);
}

} // namespace io
} // namespace librbd

template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatcher.h b/src/librbd/io/ObjectDispatcher.h new file mode 100644 index 00000000..0370d268 --- /dev/null +++ b/src/librbd/io/ObjectDispatcher.h @@ -0,0 +1,89 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H

#include "include/int_types.h"
#include "common/RWLock.h"
#include "librbd/io/Types.h"
#include <map>

struct AsyncOpTracker;
struct Context;

namespace librbd {

struct ImageCtx;

namespace io {

struct ObjectDispatchInterface;
struct ObjectDispatchSpec;

// Minimal interface through which an ObjectDispatchSpec submits itself;
// send() is private and reachable only via the friend declaration.
struct ObjectDispatcherInterface {
public:
  virtual ~ObjectDispatcherInterface() {
  }

private:
friend class ObjectDispatchSpec;

  virtual void send(ObjectDispatchSpec* object_dispatch_spec) = 0;
};

// Owns the ordered set of object dispatch layers for one image and routes
// ObjectDispatchSpec requests through them.
template <typename ImageCtxT = ImageCtx>
class ObjectDispatcher : public ObjectDispatcherInterface {
public:
  ObjectDispatcher(ImageCtxT* image_ctx);
  ~ObjectDispatcher();

  // Shuts down and deletes all layers; must complete before destruction
  // (the destructor asserts the layer map is empty).
  void shut_down(Context* on_finish);

  void register_object_dispatch(ObjectDispatchInterface* object_dispatch);
  void shut_down_object_dispatch(ObjectDispatchLayer object_dispatch_layer,
                                 Context* on_finish);

  void invalidate_cache(Context* on_finish);
  void reset_existence_cache(Context* on_finish);

  void extent_overwritten(
      uint64_t object_no, uint64_t object_off, uint64_t object_len,
      uint64_t journal_tid, uint64_t new_journal_tid);

private:
  // A registered layer plus the tracker counting requests currently inside
  // it.  Raw pointers: both are deleted by shut_down_object_dispatch().
  struct ObjectDispatchMeta {
    ObjectDispatchInterface* object_dispatch = nullptr;
    AsyncOpTracker* async_op_tracker = nullptr;

    ObjectDispatchMeta() {
    }
    ObjectDispatchMeta(ObjectDispatchInterface* object_dispatch,
                       AsyncOpTracker* async_op_tracker)
      : object_dispatch(object_dispatch), async_op_tracker(async_op_tracker) {
    }
  };

  struct C_LayerIterator;
  struct C_InvalidateCache;
  struct C_ResetExistenceCache;
  struct SendVisitor;

  ImageCtxT* m_image_ctx;

  // guards m_object_dispatches
  RWLock m_lock;
  std::map<ObjectDispatchLayer, ObjectDispatchMeta> m_object_dispatches;

  void send(ObjectDispatchSpec* object_dispatch_spec);

  void shut_down_object_dispatch(ObjectDispatchMeta& object_dispatch_meta,
                                 Context** on_finish);

};

} // namespace io
} // namespace librbd

extern template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;

#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
diff --git a/src/librbd/io/ObjectRequest.cc b/src/librbd/io/ObjectRequest.cc new file mode 100644 index 00000000..60f53df1 --- /dev/null +++ b/src/librbd/io/ObjectRequest.cc @@ -0,0 +1,729 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/io/ObjectRequest.h"
#include "common/ceph_context.h"
#include "common/dout.h"
#include "common/errno.h"
#include "common/Mutex.h"
#include "common/RWLock.h"
#include "common/WorkQueue.h"
#include "include/Context.h"
#include "include/err.h"
#include "osd/osd_types.h"

#include "librbd/ExclusiveLock.h"
#include "librbd/ImageCtx.h"
#include "librbd/ObjectMap.h"
#include "librbd/Utils.h"
#include "librbd/io/AioCompletion.h"
#include "librbd/io/CopyupRequest.h"
#include "librbd/io/ImageRequest.h"
#include "librbd/io/ReadResult.h"

#include <boost/bind.hpp>
#include <boost/optional.hpp>

#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
#define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \
                           << " " << __func__ << ": "

namespace librbd {
namespace io {

namespace {

// True when all of the following hold (evaluated under snap_lock): the
// clone_copy_on_read option is set, the image is not read-only, the read
// targets the image head (CEPH_NOSNAP), and either no exclusive lock is in
// use or this client currently owns it.
template <typename I>
inline bool is_copy_on_read(I *ictx, librados::snap_t snap_id) {
  RWLock::RLocker snap_locker(ictx->snap_lock);
  return (ictx->clone_copy_on_read &&
          !ictx->read_only && snap_id == CEPH_NOSNAP &&
          (ictx->exclusive_lock == nullptr ||
           ictx->exclusive_lock->is_lock_owner()));
}

} // anonymous namespace

// Factory helpers: each heap-allocates the concrete request type.  The
// request deletes itself in ObjectRequest<I>::finish().
template <typename I>
ObjectRequest<I>*
ObjectRequest<I>::create_write(I *ictx, const std::string &oid,
                               uint64_t object_no, uint64_t object_off,
                               ceph::bufferlist&& data,
                               const ::SnapContext &snapc, int op_flags,
                               const ZTracer::Trace &parent_trace,
                               Context *completion) {
  return new ObjectWriteRequest<I>(ictx, oid, object_no, object_off,
                                   std::move(data), snapc, op_flags,
                                   parent_trace, completion);
}

template <typename I>
ObjectRequest<I>*
ObjectRequest<I>::create_discard(I *ictx, const std::string &oid,
                                 uint64_t object_no, uint64_t object_off,
                                 uint64_t object_len,
                                 const ::SnapContext &snapc,
                                 int discard_flags,
                                 const ZTracer::Trace &parent_trace,
                                 Context *completion) {
  return new ObjectDiscardRequest<I>(ictx, oid, object_no, object_off,
                                     object_len, snapc, discard_flags,
                                     parent_trace, completion);
}

template <typename I>
ObjectRequest<I>*
ObjectRequest<I>::create_write_same(I *ictx, const std::string &oid,
                                    uint64_t object_no, uint64_t object_off,
                                    uint64_t object_len,
                                    ceph::bufferlist&& data,
                                    const ::SnapContext &snapc, int op_flags,
                                    const ZTracer::Trace &parent_trace,
                                    Context *completion) {
  return new ObjectWriteSameRequest<I>(ictx, oid, object_no, object_off,
                                       object_len, std::move(data), snapc,
                                       op_flags, parent_trace, completion);
}

template <typename I>
ObjectRequest<I>*
ObjectRequest<I>::create_compare_and_write(I *ictx, const std::string &oid,
                                           uint64_t object_no,
                                           uint64_t object_off,
                                           ceph::bufferlist&& cmp_data,
                                           ceph::bufferlist&& write_data,
                                           const ::SnapContext &snapc,
                                           uint64_t *mismatch_offset,
                                           int op_flags,
                                           const ZTracer::Trace &parent_trace,
                                           Context *completion) {
  return new ObjectCompareAndWriteRequest<I>(ictx, oid, object_no, object_off,
                                             std::move(cmp_data),
                                             std::move(write_data), snapc,
                                             mismatch_offset, op_flags,
                                             parent_trace, completion);
}

// Common state for all object requests.  Requires a valid data IoCtx.  When
// tracing is active the span is named "<trace_name> <oid>" and a "start"
// event is recorded.
template <typename I>
ObjectRequest<I>::ObjectRequest(I *ictx, const std::string &oid,
                                uint64_t objectno, uint64_t off,
                                uint64_t len, librados::snap_t snap_id,
                                const char *trace_name,
                                const ZTracer::Trace &trace,
                                Context *completion)
  : m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
    m_object_len(len), m_snap_id(snap_id), m_completion(completion),
    m_trace(util::create_trace(*ictx, "", trace)) {
  ceph_assert(m_ictx->data_ctx.is_valid());
  if (m_trace.valid()) {
    m_trace.copy_name(trace_name + std::string(" ") + oid);
    m_trace.event("start");
  }
}

// Attaches an allocation hint to a write op.  With enable_alloc_hint set,
// the object size is passed for both size arguments together with the
// configured flags; otherwise a hint is only added when non-zero flags are
// configured, in which case both sizes are 0.
template <typename I>
void ObjectRequest<I>::add_write_hint(I& image_ctx,
                                      librados::ObjectWriteOperation *wr) {
  if (image_ctx.enable_alloc_hint) {
    wr->set_alloc_hint2(image_ctx.get_object_size(),
                        image_ctx.get_object_size(),
                        image_ctx.alloc_hint_flags);
  } else if (image_ctx.alloc_hint_flags != 0U) {
    wr->set_alloc_hint2(0, 0, image_ctx.alloc_hint_flags);
  }
}


// Computes the parent-image extents backing this whole object and records
// in m_has_parent whether any remain after pruning to the parent overlap.
//
// Caller must hold snap_lock and parent_lock.  parent_extents is always
// cleared first.  Returns true only when the pruned object overlap is
// non-zero; returns false when the overlap lookup fails, the overlap is 0,
// or nothing of this object falls inside the overlap.  For non-read requests
// on an image with migration info, the migration overlap replaces the parent
// overlap.
template <typename I>
bool ObjectRequest<I>::compute_parent_extents(Extents *parent_extents,
                                              bool read_request) {
  ceph_assert(m_ictx->snap_lock.is_locked());
  ceph_assert(m_ictx->parent_lock.is_locked());

  m_has_parent = false;
  parent_extents->clear();

  uint64_t parent_overlap;
  int r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
  if (r < 0) {
    // NOTE: it's possible for a snapshot to be deleted while we are
    // still reading from it
    lderr(m_ictx->cct) << "failed to retrieve parent overlap: "
                       << cpp_strerror(r) << dendl;
    return false;
  }

  if (!read_request && !m_ictx->migration_info.empty()) {
    parent_overlap = m_ictx->migration_info.overlap;
  }

  if (parent_overlap == 0) {
    return false;
  }

  // map the full object (offset 0, object_size bytes) back onto image extents
  Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no, 0,
                          m_ictx->layout.object_size, *parent_extents);
  uint64_t object_overlap = m_ictx->prune_parent_extents(*parent_extents,
                                                         parent_overlap);
  if (object_overlap > 0) {
    ldout(m_ictx->cct, 20) << "overlap " << parent_overlap << " "
                           << "extents " << *parent_extents << dendl;
    m_has_parent = !parent_extents->empty();
    return true;
  }
  return false;
}

// Queues finish(r) on the image's op work queue instead of calling it
// inline.
template <typename I>
void ObjectRequest<I>::async_finish(int r) {
  ldout(m_ictx->cct, 20) << "r=" << r << dendl;
  m_ictx->op_work_queue->queue(util::create_context_callback<
    ObjectRequest<I>, &ObjectRequest<I>::finish>(this), r);
}

// Completes the user's callback and destroys this request; the object must
// not be touched after this returns.
template <typename I>
void ObjectRequest<I>::finish(int r) {
  ldout(m_ictx->cct, 20) << "r=" << r << dendl;
  m_completion->complete(r);
  delete this;
}

/** read **/

template <typename I>
ObjectReadRequest<I>::ObjectReadRequest(I *ictx, const std::string &oid,
                                        uint64_t objectno, uint64_t offset,
                                        uint64_t len, librados::snap_t snap_id,
                                        int op_flags,
                                        const ZTracer::Trace &parent_trace,
                                        bufferlist* read_data,
                                        ExtentMap* extent_map,
                                        Context *completion)
  : ObjectRequest<I>(ictx, oid, objectno, offset, len, snap_id, "read",
                     parent_trace,
completion), + m_op_flags(op_flags), m_read_data(read_data), m_extent_map(extent_map) { +} + +template <typename I> +void ObjectReadRequest<I>::send() { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << dendl; + + read_object(); +} + +template <typename I> +void ObjectReadRequest<I>::read_object() { + I *image_ctx = this->m_ictx; + { + RWLock::RLocker snap_locker(image_ctx->snap_lock); + if (image_ctx->object_map != nullptr && + !image_ctx->object_map->object_may_exist(this->m_object_no)) { + image_ctx->op_work_queue->queue(new FunctionContext([this](int r) { + read_parent(); + }), 0); + return; + } + } + + ldout(image_ctx->cct, 20) << dendl; + + librados::ObjectReadOperation op; + if (this->m_object_len >= image_ctx->sparse_read_threshold_bytes) { + op.sparse_read(this->m_object_off, this->m_object_len, m_extent_map, + m_read_data, nullptr); + } else { + op.read(this->m_object_off, this->m_object_len, m_read_data, nullptr); + } + op.set_op_flags2(m_op_flags); + + librados::AioCompletion *rados_completion = util::create_rados_callback< + ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_object>(this); + int flags = image_ctx->get_read_flags(this->m_snap_id); + int r = image_ctx->data_ctx.aio_operate( + this->m_oid, rados_completion, &op, flags, nullptr, + (this->m_trace.valid() ? 
this->m_trace.get_info() : nullptr)); + ceph_assert(r == 0); + + rados_completion->release(); +} + +template <typename I> +void ObjectReadRequest<I>::handle_read_object(int r) { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + + if (r == -ENOENT) { + read_parent(); + return; + } else if (r < 0) { + lderr(image_ctx->cct) << "failed to read from object: " + << cpp_strerror(r) << dendl; + this->finish(r); + return; + } + + this->finish(0); +} + +template <typename I> +void ObjectReadRequest<I>::read_parent() { + I *image_ctx = this->m_ictx; + + RWLock::RLocker snap_locker(image_ctx->snap_lock); + RWLock::RLocker parent_locker(image_ctx->parent_lock); + + // calculate reverse mapping onto the image + Extents parent_extents; + Striper::extent_to_file(image_ctx->cct, &image_ctx->layout, + this->m_object_no, this->m_object_off, + this->m_object_len, parent_extents); + + uint64_t parent_overlap = 0; + uint64_t object_overlap = 0; + int r = image_ctx->get_parent_overlap(this->m_snap_id, &parent_overlap); + if (r == 0) { + object_overlap = image_ctx->prune_parent_extents(parent_extents, + parent_overlap); + } + + if (object_overlap == 0) { + parent_locker.unlock(); + snap_locker.unlock(); + + this->finish(-ENOENT); + return; + } + + ldout(image_ctx->cct, 20) << dendl; + + auto parent_completion = AioCompletion::create_and_start< + ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_parent>( + this, util::get_image_ctx(image_ctx->parent), AIO_TYPE_READ); + ImageRequest<I>::aio_read(image_ctx->parent, parent_completion, + std::move(parent_extents), ReadResult{m_read_data}, + 0, this->m_trace); +} + +template <typename I> +void ObjectReadRequest<I>::handle_read_parent(int r) { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + + if (r == -ENOENT) { + this->finish(r); + return; + } else if (r < 0) { + lderr(image_ctx->cct) << "failed to read parent extents: " + << cpp_strerror(r) << dendl; + 
/**
 * Final step of the read state machine, reached after a successful parent
 * read: optionally starts a CopyupRequest for this object and then completes
 * the read with 0. The read result never depends on whether a copyup was
 * started.
 */
template <typename I>
void ObjectReadRequest<I>::copyup() {
  I *image_ctx = this->m_ictx;
  // presumably checks whether copy-on-read applies to this image/snapshot;
  // the helper is defined outside this view
  if (!is_copy_on_read(image_ctx, this->m_snap_id)) {
    this->finish(0);
    return;
  }

  // locks are acquired manually in the order owner -> snap -> parent and
  // released in the reverse order on every exit path below
  image_ctx->owner_lock.get_read();
  image_ctx->snap_lock.get_read();
  image_ctx->parent_lock.get_read();
  Extents parent_extents;
  // skip the copyup when the object has no parent overlap, or when an
  // exclusive lock exists and this client is not its owner
  if (!this->compute_parent_extents(&parent_extents, true) ||
      (image_ctx->exclusive_lock != nullptr &&
       !image_ctx->exclusive_lock->is_lock_owner())) {
    image_ctx->parent_lock.put_read();
    image_ctx->snap_lock.put_read();
    image_ctx->owner_lock.put_read();
    this->finish(0);
    return;
  }

  ldout(image_ctx->cct, 20) << dendl;

  // copyup_list is keyed by object number; an existing entry means a copyup
  // for this object is already registered and no new one is created
  image_ctx->copyup_list_lock.Lock();
  auto it = image_ctx->copyup_list.find(this->m_object_no);
  if (it == image_ctx->copyup_list.end()) {
    // create and kick off a CopyupRequest
    auto new_req = CopyupRequest<I>::create(
      image_ctx, this->m_oid, this->m_object_no, std::move(parent_extents),
      this->m_trace);

    image_ctx->copyup_list[this->m_object_no] = new_req;
    image_ctx->copyup_list_lock.Unlock();
    image_ctx->parent_lock.put_read();
    image_ctx->snap_lock.put_read();
    // owner_lock is still held while the copyup request is sent
    new_req->send();
  } else {
    image_ctx->copyup_list_lock.Unlock();
    image_ctx->parent_lock.put_read();
    image_ctx->snap_lock.put_read();
  }

  image_ctx->owner_lock.put_read();
  this->finish(0);
}
ictx->get_object_size()) { + m_full_object = true; + } + + compute_parent_info(); + + ictx->snap_lock.get_read(); + if (!ictx->migration_info.empty()) { + m_guarding_migration_write = true; + } + ictx->snap_lock.put_read(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::compute_parent_info() { + I *image_ctx = this->m_ictx; + RWLock::RLocker snap_locker(image_ctx->snap_lock); + RWLock::RLocker parent_locker(image_ctx->parent_lock); + + this->compute_parent_extents(&m_parent_extents, false); + + if (!this->has_parent() || + (m_full_object && m_snaps.empty() && !is_post_copyup_write_required())) { + m_copyup_enabled = false; + } +} + +template <typename I> +void AbstractObjectWriteRequest<I>::add_write_hint( + librados::ObjectWriteOperation *wr) { + I *image_ctx = this->m_ictx; + RWLock::RLocker snap_locker(image_ctx->snap_lock); + if (image_ctx->object_map == nullptr || !this->m_object_may_exist) { + ObjectRequest<I>::add_write_hint(*image_ctx, wr); + } +} + +template <typename I> +void AbstractObjectWriteRequest<I>::send() { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << this->get_op_type() << " " << this->m_oid << " " + << this->m_object_off << "~" << this->m_object_len + << dendl; + { + RWLock::RLocker snap_lock(image_ctx->snap_lock); + if (image_ctx->object_map == nullptr) { + m_object_may_exist = true; + } else { + // should have been flushed prior to releasing lock + ceph_assert(image_ctx->exclusive_lock->is_lock_owner()); + m_object_may_exist = image_ctx->object_map->object_may_exist( + this->m_object_no); + } + } + + if (!m_object_may_exist && is_no_op_for_nonexistent_object()) { + ldout(image_ctx->cct, 20) << "skipping no-op on nonexistent object" + << dendl; + this->async_finish(0); + return; + } + + pre_write_object_map_update(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::pre_write_object_map_update() { + I *image_ctx = this->m_ictx; + + image_ctx->snap_lock.get_read(); + if (image_ctx->object_map == 
nullptr || !is_object_map_update_enabled()) { + image_ctx->snap_lock.put_read(); + write_object(); + return; + } + + if (!m_object_may_exist && m_copyup_enabled) { + // optimization: copyup required + image_ctx->snap_lock.put_read(); + copyup(); + return; + } + + uint8_t new_state = this->get_pre_write_object_map_state(); + ldout(image_ctx->cct, 20) << this->m_oid << " " << this->m_object_off + << "~" << this->m_object_len << dendl; + + image_ctx->object_map_lock.get_write(); + if (image_ctx->object_map->template aio_update< + AbstractObjectWriteRequest<I>, + &AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update>( + CEPH_NOSNAP, this->m_object_no, new_state, {}, this->m_trace, false, + this)) { + image_ctx->object_map_lock.put_write(); + image_ctx->snap_lock.put_read(); + return; + } + + image_ctx->object_map_lock.put_write(); + image_ctx->snap_lock.put_read(); + write_object(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update(int r) { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + if (r < 0) { + lderr(image_ctx->cct) << "failed to update object map: " + << cpp_strerror(r) << dendl; + this->finish(r); + return; + } + + write_object(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::write_object() { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << dendl; + + librados::ObjectWriteOperation write; + if (m_copyup_enabled) { + ldout(image_ctx->cct, 20) << "guarding write" << dendl; + if (m_guarding_migration_write) { + cls_client::assert_snapc_seq( + &write, m_snap_seq, cls::rbd::ASSERT_SNAPC_SEQ_LE_SNAPSET_SEQ); + } else { + write.assert_exists(); + } + } + + add_write_hint(&write); + add_write_ops(&write); + ceph_assert(write.size() != 0); + + librados::AioCompletion *rados_completion = util::create_rados_callback< + AbstractObjectWriteRequest<I>, + &AbstractObjectWriteRequest<I>::handle_write_object>(this); + int r = 
image_ctx->data_ctx.aio_operate( + this->m_oid, rados_completion, &write, m_snap_seq, m_snaps, + (this->m_trace.valid() ? this->m_trace.get_info() : nullptr)); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::handle_write_object(int r) { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + + r = filter_write_result(r); + if (r == -ENOENT) { + if (m_copyup_enabled) { + copyup(); + return; + } + } else if (r == -ERANGE && m_guarding_migration_write) { + image_ctx->snap_lock.get_read(); + m_guarding_migration_write = !image_ctx->migration_info.empty(); + image_ctx->snap_lock.put_read(); + + if (m_guarding_migration_write) { + copyup(); + } else { + ldout(image_ctx->cct, 10) << "migration parent gone, restart io" << dendl; + compute_parent_info(); + write_object(); + } + return; + } else if (r == -EILSEQ) { + ldout(image_ctx->cct, 10) << "failed to write object" << dendl; + this->finish(r); + return; + } else if (r < 0) { + lderr(image_ctx->cct) << "failed to write object: " << cpp_strerror(r) + << dendl; + this->finish(r); + return; + } + + post_write_object_map_update(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::copyup() { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << dendl; + + ceph_assert(!m_copyup_in_progress); + m_copyup_in_progress = true; + + image_ctx->copyup_list_lock.Lock(); + auto it = image_ctx->copyup_list.find(this->m_object_no); + if (it == image_ctx->copyup_list.end()) { + auto new_req = CopyupRequest<I>::create( + image_ctx, this->m_oid, this->m_object_no, + std::move(this->m_parent_extents), this->m_trace); + this->m_parent_extents.clear(); + + // make sure to wait on this CopyupRequest + new_req->append_request(this); + image_ctx->copyup_list[this->m_object_no] = new_req; + + image_ctx->copyup_list_lock.Unlock(); + new_req->send(); + } else { + it->second->append_request(this); + 
image_ctx->copyup_list_lock.Unlock(); + } +} + +template <typename I> +void AbstractObjectWriteRequest<I>::handle_copyup(int r) { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + + ceph_assert(m_copyup_in_progress); + m_copyup_in_progress = false; + + if (r < 0 && r != -ERESTART) { + lderr(image_ctx->cct) << "failed to copyup object: " << cpp_strerror(r) + << dendl; + this->finish(r); + return; + } + + if (r == -ERESTART || is_post_copyup_write_required()) { + write_object(); + return; + } + + post_write_object_map_update(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::post_write_object_map_update() { + I *image_ctx = this->m_ictx; + + image_ctx->snap_lock.get_read(); + if (image_ctx->object_map == nullptr || !is_object_map_update_enabled() || + !is_non_existent_post_write_object_map_state()) { + image_ctx->snap_lock.put_read(); + this->finish(0); + return; + } + + ldout(image_ctx->cct, 20) << dendl; + + // should have been flushed prior to releasing lock + ceph_assert(image_ctx->exclusive_lock->is_lock_owner()); + image_ctx->object_map_lock.get_write(); + if (image_ctx->object_map->template aio_update< + AbstractObjectWriteRequest<I>, + &AbstractObjectWriteRequest<I>::handle_post_write_object_map_update>( + CEPH_NOSNAP, this->m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING, + this->m_trace, false, this)) { + image_ctx->object_map_lock.put_write(); + image_ctx->snap_lock.put_read(); + return; + } + + image_ctx->object_map_lock.put_write(); + image_ctx->snap_lock.put_read(); + this->finish(0); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::handle_post_write_object_map_update(int r) { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + if (r < 0) { + lderr(image_ctx->cct) << "failed to update object map: " + << cpp_strerror(r) << dendl; + this->finish(r); + return; + } + + this->finish(0); +} + +template <typename I> +void 
ObjectWriteRequest<I>::add_write_ops(librados::ObjectWriteOperation *wr) { + if (this->m_full_object) { + wr->write_full(m_write_data); + } else { + wr->write(this->m_object_off, m_write_data); + } + wr->set_op_flags2(m_op_flags); +} + +template <typename I> +void ObjectWriteSameRequest<I>::add_write_ops( + librados::ObjectWriteOperation *wr) { + wr->writesame(this->m_object_off, this->m_object_len, m_write_data); + wr->set_op_flags2(m_op_flags); +} + +template <typename I> +void ObjectCompareAndWriteRequest<I>::add_write_ops( + librados::ObjectWriteOperation *wr) { + wr->cmpext(this->m_object_off, m_cmp_bl, nullptr); + + if (this->m_full_object) { + wr->write_full(m_write_bl); + } else { + wr->write(this->m_object_off, m_write_bl); + } + wr->set_op_flags2(m_op_flags); +} + +template <typename I> +int ObjectCompareAndWriteRequest<I>::filter_write_result(int r) const { + if (r <= -MAX_ERRNO) { + I *image_ctx = this->m_ictx; + Extents image_extents; + + // object extent compare mismatch + uint64_t offset = -MAX_ERRNO - r; + Striper::extent_to_file(image_ctx->cct, &image_ctx->layout, + this->m_object_no, offset, this->m_object_len, + image_extents); + ceph_assert(image_extents.size() == 1); + + if (m_mismatch_offset) { + *m_mismatch_offset = image_extents[0].first; + } + r = -EILSEQ; + } + return r; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ObjectRequest<librbd::ImageCtx>; +template class librbd::io::ObjectReadRequest<librbd::ImageCtx>; +template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>; +template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>; +template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>; +template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>; +template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>; diff --git a/src/librbd/io/ObjectRequest.h b/src/librbd/io/ObjectRequest.h new file mode 100644 index 00000000..9452ec43 --- /dev/null +++ 
/**
 * This class represents an I/O operation to a single RBD data object.
 * Its subclasses encapsulate logic for dealing with special cases
 * for I/O due to layering.
 *
 * Requests are heap-allocated and self-deleting: finish() completes the
 * user-supplied Context and then deletes the request.
 */
template <typename ImageCtxT = ImageCtx>
class ObjectRequest {
public:
  // Factory helpers returning a newly allocated write-type request for one
  // object. The returned request is started with send().
  static ObjectRequest* create_write(
    ImageCtxT *ictx, const std::string &oid, uint64_t object_no,
    uint64_t object_off, ceph::bufferlist&& data, const ::SnapContext &snapc,
    int op_flags, const ZTracer::Trace &parent_trace, Context *completion);
  static ObjectRequest* create_discard(
    ImageCtxT *ictx, const std::string &oid, uint64_t object_no,
    uint64_t object_off, uint64_t object_len, const ::SnapContext &snapc,
    int discard_flags, const ZTracer::Trace &parent_trace,
    Context *completion);
  static ObjectRequest* create_write_same(
    ImageCtxT *ictx, const std::string &oid, uint64_t object_no,
    uint64_t object_off, uint64_t object_len, ceph::bufferlist&& data,
    const ::SnapContext &snapc, int op_flags,
    const ZTracer::Trace &parent_trace, Context *completion);
  static ObjectRequest* create_compare_and_write(
    ImageCtxT *ictx, const std::string &oid, uint64_t object_no,
    uint64_t object_off, ceph::bufferlist&& cmp_data,
    ceph::bufferlist&& write_data, const ::SnapContext &snapc,
    uint64_t *mismatch_offset, int op_flags,
    const ZTracer::Trace &parent_trace, Context *completion);

  // trace_name is combined with oid to name the request's trace span when
  // tracing is active
  ObjectRequest(ImageCtxT *ictx, const std::string &oid,
                uint64_t objectno, uint64_t off, uint64_t len,
                librados::snap_t snap_id, const char *trace_name,
                const ZTracer::Trace &parent_trace, Context *completion);
  virtual ~ObjectRequest() {
    // marks the end of the request in its trace
    m_trace.event("finish");
  }

  // Adds an allocation hint to wr according to the image's hint settings.
  static void add_write_hint(ImageCtxT& image_ctx,
                             librados::ObjectWriteOperation *wr);

  // Starts the request's state machine.
  virtual void send() = 0;

  // True once compute_parent_extents() found parent extents for the object.
  bool has_parent() const {
    return m_has_parent;
  }

  // Short operation name used in log output.
  virtual const char *get_op_type() const = 0;

protected:
  // Fills parent_extents with the image extents of this object that overlap
  // the parent and updates m_has_parent; requires snap_lock and parent_lock.
  bool compute_parent_extents(Extents *parent_extents, bool read_request);

  ImageCtxT *m_ictx;
  std::string m_oid;
  // object number plus the offset/length of the request within the object
  uint64_t m_object_no, m_object_off, m_object_len;
  librados::snap_t m_snap_id;
  // completed by finish() with the request's result
  Context *m_completion;
  ZTracer::Trace m_trace;

  // Queues finish(r) on the image's op work queue instead of calling it
  // inline.
  void async_finish(int r);
  // Completes m_completion with r and deletes this request.
  void finish(int r);

private:
  bool m_has_parent = false;
};
(skip if not needed) + * COPYUP + * | + * v + * <finish> + * + * @endverbatim + */ + + int m_op_flags; + + ceph::bufferlist* m_read_data; + ExtentMap* m_extent_map; + + void read_object(); + void handle_read_object(int r); + + void read_parent(); + void handle_read_parent(int r); + + void copyup(); +}; + +template <typename ImageCtxT = ImageCtx> +class AbstractObjectWriteRequest : public ObjectRequest<ImageCtxT> { +public: + AbstractObjectWriteRequest(ImageCtxT *ictx, const std::string &oid, + uint64_t object_no, uint64_t object_off, + uint64_t len, const ::SnapContext &snapc, + const char *trace_name, + const ZTracer::Trace &parent_trace, + Context *completion); + + virtual bool is_empty_write_op() const { + return false; + } + + virtual uint8_t get_pre_write_object_map_state() const { + return OBJECT_EXISTS; + } + + virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) { + add_write_ops(wr); + } + + void handle_copyup(int r); + + void send() override; + +protected: + bool m_full_object = false; + bool m_copyup_enabled = true; + + virtual bool is_no_op_for_nonexistent_object() const { + return false; + } + virtual bool is_object_map_update_enabled() const { + return true; + } + virtual bool is_post_copyup_write_required() const { + return false; + } + virtual bool is_non_existent_post_write_object_map_state() const { + return false; + } + + virtual void add_write_hint(librados::ObjectWriteOperation *wr); + virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0; + + virtual int filter_write_result(int r) const { + return r; + } + +private: + /** + * @verbatim + * + * <start> + * | + * v (no-op write request) + * DETECT_NO_OP . . . . . . . . . . . . . . . . . . . + * | . + * v (skip if not required/disabled) . + * PRE_UPDATE_OBJECT_MAP . + * | . . + * | . (child dne) . + * | . . . . . . . . . . + * | . . + * | (post-copyup write) . . + * | . . . . . . . . . . . . . . + * | . . . . + * v v . v . + * WRITE . . . . . . . . > COPYUP (if required) . 
/**
 * Plain data write to a single object. The request length is taken from the
 * length of the supplied buffer.
 */
template <typename ImageCtxT = ImageCtx>
class ObjectWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
public:
  // NOTE: data.length() is read for the base-class initializer before data
  // is moved into m_write_data; base initialization runs first, so the
  // length is taken from the still-intact buffer.
  ObjectWriteRequest(ImageCtxT *ictx, const std::string &oid,
                     uint64_t object_no, uint64_t object_off,
                     ceph::bufferlist&& data, const ::SnapContext &snapc,
                     int op_flags, const ZTracer::Trace &parent_trace,
                     Context *completion)
    : AbstractObjectWriteRequest<ImageCtxT>(ictx, oid, object_no, object_off,
                                            data.length(), snapc, "write",
                                            parent_trace, completion),
      m_write_data(std::move(data)), m_op_flags(op_flags) {
  }

  // A write carrying zero bytes is considered empty.
  bool is_empty_write_op() const override {
    return (m_write_data.length() == 0);
  }

  const char *get_op_type() const override {
    return "write";
  }

protected:
  // Emits write_full for whole-object writes, otherwise a ranged write, and
  // applies the op flags.
  void add_write_ops(librados::ObjectWriteOperation *wr) override;

private:
  ceph::bufferlist m_write_data;  // payload to write
  int m_op_flags;                 // flags applied via set_op_flags2()
};
AbstractObjectWriteRequest<ImageCtxT>(ictx, oid, object_no, object_off, + object_len, snapc, "discard", + parent_trace, completion), + m_discard_flags(discard_flags) { + if (this->m_full_object) { + if ((m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE) != 0 && + this->has_parent()) { + if (!this->m_copyup_enabled) { + // need to hide the parent object instead of child object + m_discard_action = DISCARD_ACTION_REMOVE_TRUNCATE; + } else { + m_discard_action = DISCARD_ACTION_TRUNCATE; + } + this->m_object_len = 0; + } else { + m_discard_action = DISCARD_ACTION_REMOVE; + } + } else if (object_off + object_len == ictx->layout.object_size) { + m_discard_action = DISCARD_ACTION_TRUNCATE; + } else { + m_discard_action = DISCARD_ACTION_ZERO; + } + } + + const char* get_op_type() const override { + switch (m_discard_action) { + case DISCARD_ACTION_REMOVE: + return "remove"; + case DISCARD_ACTION_REMOVE_TRUNCATE: + return "remove (create+truncate)"; + case DISCARD_ACTION_TRUNCATE: + return "truncate"; + case DISCARD_ACTION_ZERO: + return "zero"; + } + ceph_abort(); + return nullptr; + } + + uint8_t get_pre_write_object_map_state() const override { + if (m_discard_action == DISCARD_ACTION_REMOVE) { + return OBJECT_PENDING; + } + return OBJECT_EXISTS; + } + +protected: + bool is_no_op_for_nonexistent_object() const override { + return (!this->has_parent()); + } + bool is_object_map_update_enabled() const override { + return ( + (m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE) == 0); + } + bool is_non_existent_post_write_object_map_state() const override { + return (m_discard_action == DISCARD_ACTION_REMOVE); + } + + void add_write_hint(librados::ObjectWriteOperation *wr) override { + // no hint for discard + } + + void add_write_ops(librados::ObjectWriteOperation *wr) override { + switch (m_discard_action) { + case DISCARD_ACTION_REMOVE: + wr->remove(); + break; + case DISCARD_ACTION_REMOVE_TRUNCATE: + wr->create(false); + // fall through + case 
DISCARD_ACTION_TRUNCATE: + wr->truncate(this->m_object_off); + break; + case DISCARD_ACTION_ZERO: + wr->zero(this->m_object_off, this->m_object_len); + break; + default: + ceph_abort(); + break; + } + } + +private: + enum DiscardAction { + DISCARD_ACTION_REMOVE, + DISCARD_ACTION_REMOVE_TRUNCATE, + DISCARD_ACTION_TRUNCATE, + DISCARD_ACTION_ZERO + }; + + DiscardAction m_discard_action; + int m_discard_flags; + +}; + +template <typename ImageCtxT = ImageCtx> +class ObjectWriteSameRequest : public AbstractObjectWriteRequest<ImageCtxT> { +public: + ObjectWriteSameRequest(ImageCtxT *ictx, const std::string &oid, + uint64_t object_no, uint64_t object_off, + uint64_t object_len, ceph::bufferlist&& data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, + Context *completion) + : AbstractObjectWriteRequest<ImageCtxT>(ictx, oid, object_no, object_off, + object_len, snapc, "writesame", + parent_trace, completion), + m_write_data(std::move(data)), m_op_flags(op_flags) { + } + + const char *get_op_type() const override { + return "writesame"; + } + +protected: + void add_write_ops(librados::ObjectWriteOperation *wr) override; + +private: + ceph::bufferlist m_write_data; + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class ObjectCompareAndWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> { +public: + ObjectCompareAndWriteRequest(ImageCtxT *ictx, const std::string &oid, + uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_bl, + ceph::bufferlist&& write_bl, + const ::SnapContext &snapc, + uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace, + Context *completion) + : AbstractObjectWriteRequest<ImageCtxT>(ictx, oid, object_no, object_off, + cmp_bl.length(), snapc, + "compare_and_write", parent_trace, + completion), + m_cmp_bl(std::move(cmp_bl)), m_write_bl(std::move(write_bl)), + m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) { + } + + const char *get_op_type() const 
override { + return "compare_and_write"; + } + + void add_copyup_ops(librados::ObjectWriteOperation *wr) override { + // no-op on copyup + } + +protected: + virtual bool is_post_copyup_write_required() const { + return true; + } + + void add_write_ops(librados::ObjectWriteOperation *wr) override; + + int filter_write_result(int r) const override; + +private: + ceph::bufferlist m_cmp_bl; + ceph::bufferlist m_write_bl; + uint64_t *m_mismatch_offset; + int m_op_flags; +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ObjectRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectReadRequest<librbd::ImageCtx>; +extern template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_OBJECT_REQUEST_H diff --git a/src/librbd/io/ReadResult.cc b/src/librbd/io/ReadResult.cc new file mode 100644 index 00000000..c24d8b4a --- /dev/null +++ b/src/librbd/io/ReadResult.cc @@ -0,0 +1,170 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ReadResult.h" +#include "include/buffer.h" +#include "common/dout.h" +#include "librbd/io/AioCompletion.h" +#include <boost/variant/apply_visitor.hpp> +#include <boost/variant/static_visitor.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ReadResult: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +struct ReadResult::SetClipLengthVisitor : public boost::static_visitor<void> { + size_t length; + + explicit SetClipLengthVisitor(size_t length) : length(length) { + } + + void operator()(Linear &linear) 
const { + ceph_assert(length <= linear.buf_len); + linear.buf_len = length; + } + + template <typename T> + void operator()(T &t) const { + } +}; + +struct ReadResult::AssembleResultVisitor : public boost::static_visitor<void> { + CephContext *cct; + Striper::StripedReadResult &destriper; + + AssembleResultVisitor(CephContext *cct, Striper::StripedReadResult &destriper) + : cct(cct), destriper(destriper) { + } + + void operator()(Empty &empty) const { + ldout(cct, 20) << "dropping read result" << dendl; + } + + void operator()(Linear &linear) const { + ldout(cct, 20) << "copying resulting bytes to " + << reinterpret_cast<void*>(linear.buf) << dendl; + destriper.assemble_result(cct, linear.buf, linear.buf_len); + } + + void operator()(Vector &vector) const { + bufferlist bl; + destriper.assemble_result(cct, bl, true); + + ldout(cct, 20) << "copying resulting " << bl.length() << " bytes to iovec " + << reinterpret_cast<const void*>(vector.iov) << dendl; + + bufferlist::iterator it = bl.begin(); + size_t length = bl.length(); + size_t offset = 0; + int idx = 0; + for (; offset < length && idx < vector.iov_count; idx++) { + size_t len = std::min(vector.iov[idx].iov_len, length - offset); + it.copy(len, static_cast<char *>(vector.iov[idx].iov_base)); + offset += len; + } + ceph_assert(offset == bl.length()); + } + + void operator()(Bufferlist &bufferlist) const { + bufferlist.bl->clear(); + destriper.assemble_result(cct, *bufferlist.bl, true); + + ldout(cct, 20) << "moved resulting " << bufferlist.bl->length() << " " + << "bytes to bl " << reinterpret_cast<void*>(bufferlist.bl) + << dendl; + } +}; + +ReadResult::C_ImageReadRequest::C_ImageReadRequest( + AioCompletion *aio_completion, const Extents image_extents) + : aio_completion(aio_completion), image_extents(image_extents) { + aio_completion->add_request(); +} + +void ReadResult::C_ImageReadRequest::finish(int r) { + CephContext *cct = aio_completion->ictx->cct; + ldout(cct, 10) << "C_ImageReadRequest: r=" << r + << 
dendl; + if (r >= 0) { + size_t length = 0; + for (auto &image_extent : image_extents) { + length += image_extent.second; + } + ceph_assert(length == bl.length()); + + aio_completion->lock.Lock(); + aio_completion->read_result.m_destriper.add_partial_result( + cct, bl, image_extents); + aio_completion->lock.Unlock(); + r = length; + } + + aio_completion->complete_request(r); +} + +ReadResult::C_ObjectReadRequest::C_ObjectReadRequest( + AioCompletion *aio_completion, uint64_t object_off, uint64_t object_len, + Extents&& buffer_extents) + : aio_completion(aio_completion), object_off(object_off), + object_len(object_len), buffer_extents(std::move(buffer_extents)) { + aio_completion->add_request(); +} + +void ReadResult::C_ObjectReadRequest::finish(int r) { + CephContext *cct = aio_completion->ictx->cct; + ldout(cct, 10) << "C_ObjectReadRequest: r=" << r + << dendl; + + if (r == -ENOENT) { + r = 0; + } + if (r >= 0) { + ldout(cct, 10) << " got " << extent_map + << " for " << buffer_extents + << " bl " << bl.length() << dendl; + // handle the case where a sparse-read wasn't issued + if (extent_map.empty()) { + extent_map[object_off] = bl.length(); + } + + aio_completion->lock.Lock(); + aio_completion->read_result.m_destriper.add_partial_sparse_result( + cct, bl, extent_map, object_off, buffer_extents); + aio_completion->lock.Unlock(); + + r = object_len; + } + + aio_completion->complete_request(r); +} + +ReadResult::ReadResult() : m_buffer(Empty()) { +} + +ReadResult::ReadResult(char *buf, size_t buf_len) + : m_buffer(Linear(buf, buf_len)) { +} + +ReadResult::ReadResult(const struct iovec *iov, int iov_count) + : m_buffer(Vector(iov, iov_count)) { +} + +ReadResult::ReadResult(ceph::bufferlist *bl) + : m_buffer(Bufferlist(bl)) { +} + +void ReadResult::set_clip_length(size_t length) { + boost::apply_visitor(SetClipLengthVisitor(length), m_buffer); +} + +void ReadResult::assemble_result(CephContext *cct) { + boost::apply_visitor(AssembleResultVisitor(cct, m_destriper), 
m_buffer); +} + +} // namespace io +} // namespace librbd + diff --git a/src/librbd/io/ReadResult.h b/src/librbd/io/ReadResult.h new file mode 100644 index 00000000..6fb5e4a4 --- /dev/null +++ b/src/librbd/io/ReadResult.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_READ_RESULT_H +#define CEPH_LIBRBD_IO_READ_RESULT_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "librbd/io/Types.h" +#include "osdc/Striper.h" +#include <sys/uio.h> +#include <boost/variant/variant.hpp> + +struct CephContext; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; +template <typename> struct ObjectReadRequest; + +class ReadResult { +public: + struct C_ImageReadRequest : public Context { + AioCompletion *aio_completion; + Extents image_extents; + bufferlist bl; + + C_ImageReadRequest(AioCompletion *aio_completion, + const Extents image_extents); + + void finish(int r) override; + }; + + struct C_ObjectReadRequest : public Context { + AioCompletion *aio_completion; + uint64_t object_off; + uint64_t object_len; + Extents buffer_extents; + + bufferlist bl; + ExtentMap extent_map; + + C_ObjectReadRequest(AioCompletion *aio_completion, uint64_t object_off, + uint64_t object_len, Extents&& buffer_extents); + + void finish(int r) override; + }; + + ReadResult(); + ReadResult(char *buf, size_t buf_len); + ReadResult(const struct iovec *iov, int iov_count); + ReadResult(ceph::bufferlist *bl); + + void set_clip_length(size_t length); + void assemble_result(CephContext *cct); + +private: + struct Empty { + }; + + struct Linear { + char *buf; + size_t buf_len; + + Linear(char *buf, size_t buf_len) : buf(buf), buf_len(buf_len) { + } + }; + + struct Vector { + const struct iovec *iov; + int iov_count; + + Vector(const struct iovec *iov, int iov_count) + : iov(iov), iov_count(iov_count) { + } + }; + + struct 
// QoS throttle selector bits. Each flag is a distinct single bit so that
// flags can be OR-ed together into the masks below.
//
// Every replacement list is fully parenthesized so the macros expand as a
// single value regardless of the surrounding expression (an unparenthesized
// `1 << n` binds incorrectly next to operators such as `+`, `*` or unary
// `~`).
#define RBD_QOS_IOPS_THROTTLE       (1 << 0)
#define RBD_QOS_BPS_THROTTLE        (1 << 1)
#define RBD_QOS_READ_IOPS_THROTTLE  (1 << 2)
#define RBD_QOS_WRITE_IOPS_THROTTLE (1 << 3)
#define RBD_QOS_READ_BPS_THROTTLE   (1 << 4)
#define RBD_QOS_WRITE_BPS_THROTTLE  (1 << 5)

// Convenience masks grouping the flags by unit (bytes vs. ops) and by
// direction (read vs. write).
#define RBD_QOS_BPS_MASK   (RBD_QOS_BPS_THROTTLE | RBD_QOS_READ_BPS_THROTTLE | RBD_QOS_WRITE_BPS_THROTTLE)
#define RBD_QOS_IOPS_MASK  (RBD_QOS_IOPS_THROTTLE | RBD_QOS_READ_IOPS_THROTTLE | RBD_QOS_WRITE_IOPS_THROTTLE)
#define RBD_QOS_READ_MASK  (RBD_QOS_READ_BPS_THROTTLE | RBD_QOS_READ_IOPS_THROTTLE)
#define RBD_QOS_WRITE_MASK (RBD_QOS_WRITE_BPS_THROTTLE | RBD_QOS_WRITE_IOPS_THROTTLE)

// Union of every QoS flag.
#define RBD_QOS_MASK (RBD_QOS_BPS_MASK | RBD_QOS_IOPS_MASK)
DISPATCH_RESULT_CONTINUE, + DISPATCH_RESULT_COMPLETE +}; + +enum ObjectDispatchLayer { + OBJECT_DISPATCH_LAYER_NONE = 0, + OBJECT_DISPATCH_LAYER_CACHE, + OBJECT_DISPATCH_LAYER_JOURNAL, + OBJECT_DISPATCH_LAYER_CORE, + OBJECT_DISPATCH_LAYER_LAST +}; + +enum { + OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE = 1UL << 0, + OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE = 1UL << 1 +}; + +enum { + OBJECT_DISPATCH_FLAG_FLUSH = 1UL << 0, + OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR = 1UL << 1 +}; + +typedef std::vector<std::pair<uint64_t, uint64_t> > Extents; +typedef std::map<uint64_t, uint64_t> ExtentMap; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_TYPES_H diff --git a/src/librbd/io/Utils.cc b/src/librbd/io/Utils.cc new file mode 100644 index 00000000..1b50561a --- /dev/null +++ b/src/librbd/io/Utils.cc @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/Utils.h" +#include "include/buffer.h" +#include "osd/osd_types.h" + +namespace librbd { +namespace io { +namespace util { + +bool assemble_write_same_extent( + const ObjectExtent &object_extent, const ceph::bufferlist& data, + ceph::bufferlist *ws_data, bool force_write) { + size_t data_len = data.length(); + + if (!force_write) { + bool may_writesame = true; + for (auto& q : object_extent.buffer_extents) { + if (!(q.first % data_len == 0 && q.second % data_len == 0)) { + may_writesame = false; + break; + } + } + + if (may_writesame) { + ws_data->append(data); + return true; + } + } + + for (auto& q : object_extent.buffer_extents) { + bufferlist sub_bl; + uint64_t sub_off = q.first % data_len; + uint64_t sub_len = data_len - sub_off; + uint64_t extent_left = q.second; + while (extent_left >= sub_len) { + sub_bl.substr_of(data, sub_off, sub_len); + ws_data->claim_append(sub_bl); + extent_left -= sub_len; + if (sub_off) { + sub_off = 0; + sub_len = data_len; + } + } + if (extent_left) { + sub_bl.substr_of(data, 
sub_off, extent_left); + ws_data->claim_append(sub_bl); + } + } + return false; +} + +} // namespace util +} // namespace io +} // namespace librbd + diff --git a/src/librbd/io/Utils.h b/src/librbd/io/Utils.h new file mode 100644 index 00000000..c1f373d4 --- /dev/null +++ b/src/librbd/io/Utils.h @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_UTILS_H +#define CEPH_LIBRBD_IO_UTILS_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include <map> + +class ObjectExtent; + +namespace librbd { +namespace io { +namespace util { + +bool assemble_write_same_extent(const ObjectExtent &object_extent, + const ceph::bufferlist& data, + ceph::bufferlist *ws_data, + bool force_write); + +} // namespace util +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_UTILS_H diff --git a/src/librbd/journal/CreateRequest.cc b/src/librbd/journal/CreateRequest.cc new file mode 100644 index 00000000..4f10ec65 --- /dev/null +++ b/src/librbd/journal/CreateRequest.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "librbd/Utils.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "journal/Settings.h" +#include "librbd/journal/CreateRequest.h" +#include "librbd/journal/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Journal::CreateRequest: " + +namespace librbd { + +using util::create_context_callback; + +namespace journal { + +template<typename I> +CreateRequest<I>::CreateRequest(IoCtx &ioctx, const std::string &imageid, + uint8_t order, uint8_t splay_width, + const std::string &object_pool, + uint64_t tag_class, TagData &tag_data, + const std::string &client_id, + ContextWQ *op_work_queue, + Context *on_finish) + 
: m_ioctx(ioctx), m_image_id(imageid), m_order(order), + m_splay_width(splay_width), m_object_pool(object_pool), + m_tag_class(tag_class), m_tag_data(tag_data), m_image_client_id(client_id), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); +} + +template<typename I> +void CreateRequest<I>::send() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + if (m_order > 64 || m_order < 12) { + lderr(m_cct) << "order must be in the range [12, 64]" << dendl; + complete(-EDOM); + return; + } + if (m_splay_width == 0) { + complete(-EINVAL); + return; + } + + get_pool_id(); +} + +template<typename I> +void CreateRequest<I>::get_pool_id() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + if (m_object_pool.empty()) { + create_journal(); + return; + } + + librados::Rados rados(m_ioctx); + IoCtx data_ioctx; + int r = rados.ioctx_create(m_object_pool.c_str(), data_ioctx); + if (r != 0) { + lderr(m_cct) << "failed to create journal: " + << "error opening journal object pool '" << m_object_pool + << "': " << cpp_strerror(r) << dendl; + complete(r); + return; + } + data_ioctx.set_namespace(m_ioctx.get_namespace()); + + m_pool_id = data_ioctx.get_id(); + create_journal(); +} + +template<typename I> +void CreateRequest<I>::create_journal() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + ImageCtx::get_timer_instance(m_cct, &m_timer, &m_timer_lock); + m_journaler = new Journaler(m_op_work_queue, m_timer, m_timer_lock, + m_ioctx, m_image_id, m_image_client_id, {}); + + using klass = CreateRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_create_journal>(this); + + m_journaler->create(m_order, m_splay_width, m_pool_id, ctx); +} + +template<typename I> +Context *CreateRequest<I>::handle_create_journal(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to create journal: " << 
cpp_strerror(*result) << dendl; + shut_down_journaler(*result); + return nullptr; + } + + allocate_journal_tag(); + return nullptr; +} + +template<typename I> +void CreateRequest<I>::allocate_journal_tag() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + using klass = CreateRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_journal_tag>(this); + + encode(m_tag_data, m_bl); + m_journaler->allocate_tag(m_tag_class, m_bl, &m_tag, ctx); +} + +template<typename I> +Context *CreateRequest<I>::handle_journal_tag(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to allocate tag: " << cpp_strerror(*result) << dendl; + shut_down_journaler(*result); + return nullptr; + } + + register_client(); + return nullptr; +} + +template<typename I> +void CreateRequest<I>::register_client() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + m_bl.clear(); + encode(ClientData{ImageClientMeta{m_tag.tag_class}}, m_bl); + + using klass = CreateRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_register_client>(this); + + m_journaler->register_client(m_bl, ctx); +} + +template<typename I> +Context *CreateRequest<I>::handle_register_client(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to register client: " << cpp_strerror(*result) << dendl; + } + + shut_down_journaler(*result); + return nullptr; +} + +template<typename I> +void CreateRequest<I>::shut_down_journaler(int r) { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + m_r_saved = r; + + using klass = CreateRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_journaler_shutdown>(this); + + m_journaler->shut_down(ctx); +} + +template<typename I> +Context *CreateRequest<I>::handle_journaler_shutdown(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if 
(*result < 0) { + lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(*result) << dendl; + } + + delete m_journaler; + + if (!m_r_saved) { + complete(0); + return nullptr; + } + + // there was an error during journal creation, so we rollback + // what ever was done. the easiest way to do this is to invoke + // journal remove state machine, although it's not the most + // cleanest approach when it comes to redundancy, but that's + // ok during the failure path. + remove_journal(); + return nullptr; +} + +template<typename I> +void CreateRequest<I>::remove_journal() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + using klass = CreateRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_remove_journal>(this); + + RemoveRequest<I> *req = RemoveRequest<I>::create( + m_ioctx, m_image_id, m_image_client_id, m_op_work_queue, ctx); + req->send(); +} + +template<typename I> +Context *CreateRequest<I>::handle_remove_journal(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "error cleaning up journal after creation failed: " + << cpp_strerror(*result) << dendl; + } + + complete(m_r_saved); + return nullptr; +} + +template<typename I> +void CreateRequest<I>::complete(int r) { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + if (r == 0) { + ldout(m_cct, 20) << "done." 
<< dendl; + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::CreateRequest<librbd::ImageCtx>; diff --git a/src/librbd/journal/CreateRequest.h b/src/librbd/journal/CreateRequest.h new file mode 100644 index 00000000..c2b8cd9c --- /dev/null +++ b/src/librbd/journal/CreateRequest.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "common/Mutex.h" +#include "librbd/ImageCtx.h" +#include "journal/Journaler.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" +#include "cls/journal/cls_journal_types.h" + +using librados::IoCtx; +using journal::Journaler; + +class Context; +class ContextWQ; +class SafeTimer; + +namespace journal { + class Journaler; +} + +namespace librbd { + +class ImageCtx; + +namespace journal { + +template<typename ImageCtxT = ImageCtx> +class CreateRequest { +public: + static CreateRequest *create(IoCtx &ioctx, const std::string &imageid, + uint8_t order, uint8_t splay_width, + const std::string &object_pool, + uint64_t tag_class, TagData &tag_data, + const std::string &client_id, + ContextWQ *op_work_queue, Context *on_finish) { + return new CreateRequest(ioctx, imageid, order, splay_width, object_pool, + tag_class, tag_data, client_id, op_work_queue, + on_finish); + } + + void send(); + +private: + typedef typename TypeTraits<ImageCtxT>::Journaler Journaler; + + CreateRequest(IoCtx &ioctx, const std::string &imageid, uint8_t order, + uint8_t splay_width, const std::string &object_pool, + uint64_t tag_class, TagData &tag_data, + const std::string &client_id, ContextWQ *op_work_queue, + Context *on_finish); + + IoCtx &m_ioctx; + 
std::string m_image_id; + uint8_t m_order; + uint8_t m_splay_width; + std::string m_object_pool; + uint64_t m_tag_class; + TagData m_tag_data; + std::string m_image_client_id; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + cls::journal::Tag m_tag; + bufferlist m_bl; + Journaler *m_journaler; + SafeTimer *m_timer; + Mutex *m_timer_lock; + int m_r_saved; + + int64_t m_pool_id = -1; + + void get_pool_id(); + + void create_journal(); + Context *handle_create_journal(int *result); + + void allocate_journal_tag(); + Context *handle_journal_tag(int *result); + + void register_client(); + Context *handle_register_client(int *result); + + void shut_down_journaler(int r); + Context *handle_journaler_shutdown(int *result); + + void remove_journal(); + Context *handle_remove_journal(int *result); + + void complete(int r); +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::CreateRequest<librbd::ImageCtx>; + +#endif /* CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H */ diff --git a/src/librbd/journal/DemoteRequest.cc b/src/librbd/journal/DemoteRequest.cc new file mode 100644 index 00000000..66a1d19e --- /dev/null +++ b/src/librbd/journal/DemoteRequest.cc @@ -0,0 +1,255 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/DemoteRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "journal/Journaler.h" +#include "journal/Settings.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/journal/OpenRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::DemoteRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace journal { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template <typename I> 
+DemoteRequest<I>::DemoteRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish), + m_lock("DemoteRequest::m_lock") { +} + +template <typename I> +DemoteRequest<I>::~DemoteRequest() { + ceph_assert(m_journaler == nullptr); +} + +template <typename I> +void DemoteRequest<I>::send() { + open_journaler(); +} + +template <typename I> +void DemoteRequest<I>::open_journaler() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + m_journaler = new Journaler(m_image_ctx.md_ctx, m_image_ctx.id, + Journal<>::IMAGE_CLIENT_ID, {}); + auto ctx = create_async_context_callback( + m_image_ctx, create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_open_journaler>(this)); + auto req = OpenRequest<I>::create(&m_image_ctx, m_journaler, &m_lock, + &m_client_meta, &m_tag_tid, &m_tag_data, + ctx); + req->send(); +} + +template <typename I> +void DemoteRequest<I>::handle_open_journaler(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl; + shut_down_journaler(); + return; + } else if (m_tag_data.mirror_uuid != Journal<>::LOCAL_MIRROR_UUID) { + m_ret_val = -EINVAL; + lderr(cct) << "image is not currently the primary" << dendl; + shut_down_journaler(); + return; + } + + allocate_tag(); +} + +template <typename I> +void DemoteRequest<I>::allocate_tag() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + cls::journal::Client client; + int r = m_journaler->get_cached_client(Journal<>::IMAGE_CLIENT_ID, &client); + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to retrieve client: " << cpp_strerror(r) << dendl; + shut_down_journaler(); + return; + } + + TagPredecessor predecessor; + predecessor.mirror_uuid = Journal<>::LOCAL_MIRROR_UUID; + if (!client.commit_position.object_positions.empty()) { + auto position = client.commit_position.object_positions.front(); + 
predecessor.commit_valid = true; + predecessor.tag_tid = position.tag_tid; + predecessor.entry_tid = position.entry_tid; + } + + TagData tag_data; + tag_data.mirror_uuid = Journal<>::ORPHAN_MIRROR_UUID; + tag_data.predecessor = std::move(predecessor); + + bufferlist tag_bl; + encode(tag_data, tag_bl); + + auto ctx = create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_allocate_tag>(this); + m_journaler->allocate_tag(m_client_meta.tag_class, tag_bl, &m_tag, ctx); +} + +template <typename I> +void DemoteRequest<I>::handle_allocate_tag(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl; + shut_down_journaler(); + return; + } + + m_tag_tid = m_tag.tid; + append_event(); +} + +template <typename I> +void DemoteRequest<I>::append_event() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + EventEntry event_entry{DemotePromoteEvent{}, {}}; + bufferlist event_entry_bl; + encode(event_entry, event_entry_bl); + + m_journaler->start_append(0); + m_future = m_journaler->append(m_tag_tid, event_entry_bl); + + auto ctx = create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_append_event>(this); + m_future.flush(ctx); + +} + +template <typename I> +void DemoteRequest<I>::handle_append_event(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to append demotion journal event: " << cpp_strerror(r) + << dendl; + stop_append(); + return; + } + + commit_event(); +} + +template <typename I> +void DemoteRequest<I>::commit_event() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + m_journaler->committed(m_future); + + auto ctx = create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_commit_event>(this); + m_journaler->flush_commit_position(ctx); +} + +template <typename 
I> +void DemoteRequest<I>::handle_commit_event(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to flush demotion commit position: " + << cpp_strerror(r) << dendl; + } + + stop_append(); +} + +template <typename I> +void DemoteRequest<I>::stop_append() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_stop_append>(this); + m_journaler->stop_append(ctx); +} + +template <typename I> +void DemoteRequest<I>::handle_stop_append(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (m_ret_val == 0) { + m_ret_val = r; + } + lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl; + } + + shut_down_journaler(); +} + +template <typename I> +void DemoteRequest<I>::shut_down_journaler() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + Context *ctx = create_async_context_callback( + m_image_ctx, create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_shut_down_journaler>(this)); + m_journaler->shut_down(ctx); +} + +template <typename I> +void DemoteRequest<I>::handle_shut_down_journaler(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to shut down journal: " << cpp_strerror(r) << dendl; + } + + delete m_journaler; + m_journaler = nullptr; + finish(r); +} + +template <typename I> +void DemoteRequest<I>::finish(int r) { + if (m_ret_val < 0) { + r = m_ret_val; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::DemoteRequest<librbd::ImageCtx>; diff --git a/src/librbd/journal/DemoteRequest.h b/src/librbd/journal/DemoteRequest.h new file 
mode 100644 index 00000000..5fea7f47 --- /dev/null +++ b/src/librbd/journal/DemoteRequest.h @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H + +#include "common/Mutex.h" +#include "cls/journal/cls_journal_types.h" +#include "journal/Future.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +template <typename ImageCtxT = librbd::ImageCtx> +class DemoteRequest { +public: + static DemoteRequest *create(ImageCtxT &image_ctx, Context *on_finish) { + return new DemoteRequest(image_ctx, on_finish); + } + + DemoteRequest(ImageCtxT &image_ctx, Context *on_finish); + ~DemoteRequest(); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * OPEN_JOURNALER * * * * * + * | * + * v * + * ALLOCATE_TAG * * * * * * + * | * + * v * + * APPEND_EVENT * * * * + * | * * + * v * * + * COMMIT_EVENT * * + * | * * + * v * * + * STOP_APPEND <* * * * + * | * + * v * + * SHUT_DOWN_JOURNALER <* * + * | + * v + * <finish> + * + * @endverbatim + */ + + typedef typename TypeTraits<ImageCtxT>::Journaler Journaler; + typedef typename TypeTraits<ImageCtxT>::Future Future; + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + Journaler *m_journaler = nullptr; + int m_ret_val = 0; + + Mutex m_lock; + ImageClientMeta m_client_meta; + uint64_t m_tag_tid = 0; + TagData m_tag_data; + + cls::journal::Tag m_tag; + Future m_future; + + void open_journaler(); + void handle_open_journaler(int r); + + void allocate_tag(); + void handle_allocate_tag(int r); + + void append_event(); + void handle_append_event(int r); + + void commit_event(); + void handle_commit_event(int r); + + void stop_append(); + void handle_stop_append(int r); + + void shut_down_journaler(); + void handle_shut_down_journaler(int r); + + void 
finish(int r); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::DemoteRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H diff --git a/src/librbd/journal/DisabledPolicy.h b/src/librbd/journal/DisabledPolicy.h new file mode 100644 index 00000000..27d69a50 --- /dev/null +++ b/src/librbd/journal/DisabledPolicy.h @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H +#define CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H + +#include "librbd/journal/Policy.h" + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +class DisabledPolicy : public Policy { +public: + bool append_disabled() const override { + return true; + } + bool journal_disabled() const override { + return true; + } + void allocate_tag_on_lock(Context *on_finish) override { + ceph_abort(); + } +}; + +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H diff --git a/src/librbd/journal/ObjectDispatch.cc b/src/librbd/journal/ObjectDispatch.cc new file mode 100644 index 00000000..737f5efe --- /dev/null +++ b/src/librbd/journal/ObjectDispatch.cc @@ -0,0 +1,213 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/ObjectDispatch.h" +#include "common/dout.h" +#include "common/WorkQueue.h" +#include "osdc/Striper.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcher.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::ObjectDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace journal { + +namespace { + +template <typename I> +struct C_CommitIOEvent : public Context { + I* image_ctx; + Journal<I>* journal; + uint64_t object_no; + 
uint64_t object_off; + uint64_t object_len; + uint64_t journal_tid; + int object_dispatch_flags; + Context* on_finish; + + C_CommitIOEvent(I* image_ctx, Journal<I>* journal, uint64_t object_no, + uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, int object_dispatch_flags, + Context* on_finish) + : image_ctx(image_ctx), journal(journal), object_no(object_no), + object_off(object_off), object_len(object_len), journal_tid(journal_tid), + object_dispatch_flags(object_dispatch_flags), on_finish(on_finish) { + } + + void finish(int r) override { + // don't commit the IO extent if a previous dispatch handler will just + // retry the failed IO + if (r >= 0 || + (object_dispatch_flags & + io::OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR) == 0) { + io::Extents file_extents; + Striper::extent_to_file(image_ctx->cct, &image_ctx->layout, object_no, + object_off, object_len, file_extents); + for (auto& extent : file_extents) { + journal->commit_io_event_extent(journal_tid, extent.first, + extent.second, r); + } + } + + if (on_finish != nullptr) { + on_finish->complete(r); + } + } +}; + +} // anonymous namespace + +template <typename I> +ObjectDispatch<I>::ObjectDispatch(I* image_ctx, Journal<I>* journal) + : m_image_ctx(image_ctx), m_journal(journal) { +} + +template <typename I> +void ObjectDispatch<I>::shut_down(Context* on_finish) { + m_image_ctx->op_work_queue->queue(on_finish, 0); +} + +template <typename I> +bool ObjectDispatch<I>::discard( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, const ::SnapContext &snapc, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl; + + *on_finish = new 
C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no, + object_off, object_len, *journal_tid, + *object_dispatch_flags, *on_finish); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched); + return true; +} + +template <typename I> +bool ObjectDispatch<I>::write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << data.length() << dendl; + + *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no, + object_off, data.length(), *journal_tid, + *object_dispatch_flags, *on_finish); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched); + return true; +} + +template <typename I> +bool ObjectDispatch<I>::write_same( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, io::Extents&& buffer_extents, ceph::bufferlist&& data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl; + + *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no, + object_off, object_len, *journal_tid, + *object_dispatch_flags, *on_finish); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, 
*object_dispatch_flags, on_dispatched); + return true; +} + +template <typename I> +bool ObjectDispatch<I>::compare_and_write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << oid << " " << object_off << "~" << write_data.length() + << dendl; + + *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no, + object_off, write_data.length(), + *journal_tid, *object_dispatch_flags, + *on_finish); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched); + return true; +} + +template <typename I> +void ObjectDispatch<I>::extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << object_no << " " << object_off << "~" << object_len + << dendl; + + auto ctx = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no, + object_off, object_len, journal_tid, false, + nullptr); + if (new_journal_tid != 0) { + // ensure new journal event is safely committed to disk before + // committing old event + m_journal->flush_event(new_journal_tid, ctx); + } else { + ctx->complete(0); + } +} + +template <typename I> +void ObjectDispatch<I>::wait_or_flush_event( + uint64_t journal_tid, int object_dispatch_flags, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "journal_tid=" << journal_tid << dendl; + + if ((object_dispatch_flags & io::OBJECT_DISPATCH_FLAG_FLUSH) != 0) { + m_journal->flush_event(journal_tid, 
on_dispatched); + } else { + m_journal->wait_event(journal_tid, on_dispatched); + } +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::ObjectDispatch<librbd::ImageCtx>; diff --git a/src/librbd/journal/ObjectDispatch.h b/src/librbd/journal/ObjectDispatch.h new file mode 100644 index 00000000..b5ae747e --- /dev/null +++ b/src/librbd/journal/ObjectDispatch.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "common/snap_types.h" +#include "common/zipkin_trace.h" +#include "librbd/io/Types.h" +#include "librbd/io/ObjectDispatchInterface.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; +template <typename> class Journal; + +namespace journal { + +template <typename ImageCtxT = librbd::ImageCtx> +class ObjectDispatch : public io::ObjectDispatchInterface { +public: + static ObjectDispatch* create(ImageCtxT* image_ctx, + Journal<ImageCtxT>* journal) { + return new ObjectDispatch(image_ctx, journal); + } + + ObjectDispatch(ImageCtxT* image_ctx, Journal<ImageCtxT>* journal); + + io::ObjectDispatchLayer get_object_dispatch_layer() const override { + return io::OBJECT_DISPATCH_LAYER_JOURNAL; + } + + void shut_down(Context* on_finish) override; + + bool read( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, librados::snap_t snap_id, int op_flags, + const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data, + io::ExtentMap* extent_map, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + return false; + } + + bool discard( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, const ::SnapContext &snapc, int 
discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + const std::string &oid, uint64_t object_no, uint64_t object_off, + uint64_t object_len, io::Extents&& buffer_extents, + ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + const std::string &oid, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data, + const ::SnapContext &snapc, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + bool reset_existence_cache(Context* on_finish) override { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override; + +private: + ImageCtxT* m_image_ctx; + Journal<ImageCtxT>* m_journal; + + void wait_or_flush_event(uint64_t journal_tid, int 
object_dispatch_flags, + Context* on_dispatched); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::ObjectDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H diff --git a/src/librbd/journal/OpenRequest.cc b/src/librbd/journal/OpenRequest.cc new file mode 100644 index 00000000..a5178de8 --- /dev/null +++ b/src/librbd/journal/OpenRequest.cc @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/OpenRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::OpenRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace journal { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using util::C_DecodeTags; + +template <typename I> +OpenRequest<I>::OpenRequest(I *image_ctx, Journaler *journaler, Mutex *lock, + journal::ImageClientMeta *client_meta, + uint64_t *tag_tid, journal::TagData *tag_data, + Context *on_finish) + : m_image_ctx(image_ctx), m_journaler(journaler), m_lock(lock), + m_client_meta(client_meta), m_tag_tid(tag_tid), m_tag_data(tag_data), + m_on_finish(on_finish) { +} + +template <typename I> +void OpenRequest<I>::send() { + send_init(); +} + +template <typename I> +void OpenRequest<I>::send_init() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + m_journaler->init(create_async_context_callback( + *m_image_ctx, create_context_callback< + OpenRequest<I>, &OpenRequest<I>::handle_init>(this))); +} + +template <typename I> +void OpenRequest<I>::handle_init(int r) { + CephContext 
*cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to initialize journal: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + // locate the master image client record + cls::journal::Client client; + r = m_journaler->get_cached_client(Journal<ImageCtx>::IMAGE_CLIENT_ID, + &client); + if (r < 0) { + lderr(cct) << "failed to locate master image client" << dendl; + finish(r); + return; + } + + librbd::journal::ClientData client_data; + auto bl = client.data.cbegin(); + try { + decode(client_data, bl); + } catch (const buffer::error &err) { + lderr(cct) << "failed to decode client meta data: " << err.what() + << dendl; + finish(-EINVAL); + return; + } + + journal::ImageClientMeta *image_client_meta = + boost::get<journal::ImageClientMeta>(&client_data.client_meta); + if (image_client_meta == nullptr) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to extract client meta data" << dendl; + finish(-EINVAL); + return; + } + + ldout(cct, 20) << this << " " << __func__ << ": " + << "client: " << client << ", " + << "image meta: " << *image_client_meta << dendl; + + m_tag_class = image_client_meta->tag_class; + { + Mutex::Locker locker(*m_lock); + *m_client_meta = *image_client_meta; + } + + send_get_tags(); +} + +template <typename I> +void OpenRequest<I>::send_get_tags() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + C_DecodeTags *tags_ctx = new C_DecodeTags( + cct, m_lock, m_tag_tid, m_tag_data, create_async_context_callback( + *m_image_ctx, create_context_callback< + OpenRequest<I>, &OpenRequest<I>::handle_get_tags>(this))); + m_journaler->get_tags(m_tag_class, &tags_ctx->tags, tags_ctx); +} + +template <typename I> +void OpenRequest<I>::handle_get_tags(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to decode journal tags: " << cpp_strerror(r) << 
dendl; + } + + finish(r); +} + +template <typename I> +void OpenRequest<I>::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::OpenRequest<librbd::ImageCtx>; diff --git a/src/librbd/journal/OpenRequest.h b/src/librbd/journal/OpenRequest.h new file mode 100644 index 00000000..f71d8c53 --- /dev/null +++ b/src/librbd/journal/OpenRequest.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H + +#include "include/int_types.h" +#include "librbd/journal/TypeTraits.h" + +struct Context; +struct Mutex; + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +struct ImageClientMeta; +struct TagData; + +template <typename ImageCtxT = ImageCtx> +class OpenRequest { +public: + typedef typename TypeTraits<ImageCtxT>::Journaler Journaler; + + static OpenRequest* create(ImageCtxT *image_ctx, Journaler *journaler, + Mutex *lock, journal::ImageClientMeta *client_meta, + uint64_t *tag_tid, journal::TagData *tag_data, + Context *on_finish) { + return new OpenRequest(image_ctx, journaler, lock, client_meta, tag_tid, + tag_data, on_finish); + } + + OpenRequest(ImageCtxT *image_ctx, Journaler *journaler, Mutex *lock, + journal::ImageClientMeta *client_meta, uint64_t *tag_tid, + journal::TagData *tag_data, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * INIT + * | + * v + * GET_TAGS + * | + * v + * <finish> + * + * @endverbatim + */ + + + ImageCtxT *m_image_ctx; + Journaler *m_journaler; + Mutex *m_lock; + journal::ImageClientMeta *m_client_meta; + uint64_t *m_tag_tid; + journal::TagData *m_tag_data; + Context *m_on_finish; + + uint64_t m_tag_class = 0; + + void send_init(); + void handle_init(int r); 
+ + void send_get_tags(); + void handle_get_tags(int r); + + void finish(int r); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::OpenRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H diff --git a/src/librbd/journal/Policy.h b/src/librbd/journal/Policy.h new file mode 100644 index 00000000..1ced3c53 --- /dev/null +++ b/src/librbd/journal/Policy.h @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_POLICY_H +#define CEPH_LIBRBD_JOURNAL_POLICY_H + +class Context; + +namespace librbd { + +namespace journal { + +struct Policy { + virtual ~Policy() { + } + + virtual bool append_disabled() const = 0; + virtual bool journal_disabled() const = 0; + virtual void allocate_tag_on_lock(Context *on_finish) = 0; +}; + +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_POLICY_H diff --git a/src/librbd/journal/PromoteRequest.cc b/src/librbd/journal/PromoteRequest.cc new file mode 100644 index 00000000..17f2957e --- /dev/null +++ b/src/librbd/journal/PromoteRequest.cc @@ -0,0 +1,237 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/PromoteRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "journal/Journaler.h" +#include "journal/Settings.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/journal/OpenRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::PromoteRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace journal { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template <typename I> +PromoteRequest<I>::PromoteRequest(I *image_ctx, bool force, 
Context *on_finish) + : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish), + m_lock("PromoteRequest::m_lock") { +} + +template <typename I> +void PromoteRequest<I>::send() { + send_open(); +} + +template <typename I> +void PromoteRequest<I>::send_open() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + m_journaler = new Journaler(m_image_ctx->md_ctx, m_image_ctx->id, + Journal<>::IMAGE_CLIENT_ID, {}); + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + PromoteRequest<I>, &PromoteRequest<I>::handle_open>(this)); + auto open_req = OpenRequest<I>::create(m_image_ctx, m_journaler, + &m_lock, &m_client_meta, + &m_tag_tid, &m_tag_data, ctx); + open_req->send(); +} + +template <typename I> +void PromoteRequest<I>::handle_open(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl; + shut_down(); + return; + } + + allocate_tag(); +} + +template <typename I> +void PromoteRequest<I>::allocate_tag() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + journal::TagPredecessor predecessor; + if (!m_force && m_tag_data.mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID) { + // orderly promotion -- demotion epoch will have a single entry + // so link to our predecessor (demotion) epoch + predecessor = TagPredecessor{Journal<>::ORPHAN_MIRROR_UUID, true, m_tag_tid, + 1}; + } else { + // forced promotion -- create an epoch no peers can link against + predecessor = TagPredecessor{Journal<>::LOCAL_MIRROR_UUID, true, m_tag_tid, + 0}; + } + + TagData tag_data; + tag_data.mirror_uuid = Journal<>::LOCAL_MIRROR_UUID; + tag_data.predecessor = predecessor; + + bufferlist tag_bl; + encode(tag_data, tag_bl); + + Context *ctx = create_context_callback< + PromoteRequest<I>, &PromoteRequest<I>::handle_allocate_tag>(this); + m_journaler->allocate_tag(m_client_meta.tag_class, 
tag_bl, &m_tag, ctx); +} + +template <typename I> +void PromoteRequest<I>::handle_allocate_tag(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl; + shut_down(); + return; + } + + m_tag_tid = m_tag.tid; + append_event(); +} + +template <typename I> +void PromoteRequest<I>::append_event() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + EventEntry event_entry{DemotePromoteEvent{}, {}}; + bufferlist event_entry_bl; + encode(event_entry, event_entry_bl); + + m_journaler->start_append(0); + m_future = m_journaler->append(m_tag_tid, event_entry_bl); + + auto ctx = create_context_callback< + PromoteRequest<I>, &PromoteRequest<I>::handle_append_event>(this); + m_future.flush(ctx); +} + +template <typename I> +void PromoteRequest<I>::handle_append_event(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to append promotion journal event: " + << cpp_strerror(r) << dendl; + stop_append(); + return; + } + + commit_event(); +} + +template <typename I> +void PromoteRequest<I>::commit_event() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + m_journaler->committed(m_future); + + auto ctx = create_context_callback< + PromoteRequest<I>, &PromoteRequest<I>::handle_commit_event>(this); + m_journaler->flush_commit_position(ctx); +} + +template <typename I> +void PromoteRequest<I>::handle_commit_event(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to flush promote commit position: " + << cpp_strerror(r) << dendl; + } + + stop_append(); +} + +template <typename I> +void PromoteRequest<I>::stop_append() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + 
PromoteRequest<I>, &PromoteRequest<I>::handle_stop_append>(this); + m_journaler->stop_append(ctx); +} + +template <typename I> +void PromoteRequest<I>::handle_stop_append(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (m_ret_val == 0) { + m_ret_val = r; + } + lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl; + } + + shut_down(); +} + +template <typename I> +void PromoteRequest<I>::shut_down() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + PromoteRequest<I>, &PromoteRequest<I>::handle_shut_down>(this)); + m_journaler->shut_down(ctx); +} + +template <typename I> +void PromoteRequest<I>::handle_shut_down(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to shut down journal: " << cpp_strerror(r) << dendl; + } + + delete m_journaler; + finish(r); +} + +template <typename I> +void PromoteRequest<I>::finish(int r) { + if (m_ret_val < 0) { + r = m_ret_val; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::PromoteRequest<librbd::ImageCtx>; diff --git a/src/librbd/journal/PromoteRequest.h b/src/librbd/journal/PromoteRequest.h new file mode 100644 index 00000000..0d01f596 --- /dev/null +++ b/src/librbd/journal/PromoteRequest.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H + +#include "include/int_types.h" +#include "common/Mutex.h" +#include "cls/journal/cls_journal_types.h" +#include "journal/Future.h" +#include "librbd/journal/Types.h" +#include 
"librbd/journal/TypeTraits.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +template <typename ImageCtxT = ImageCtx> +class PromoteRequest { +public: + static PromoteRequest* create(ImageCtxT *image_ctx, bool force, + Context *on_finish) { + return new PromoteRequest(image_ctx, force, on_finish); + } + + PromoteRequest(ImageCtxT *image_ctx, bool force, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * OPEN * * * * * * * * * * + * | * + * v * + * ALLOCATE_TAG * * * * * * + * | * + * v * + * APPEND_EVENT * * * * + * | * * + * v * * + * COMMIT_EVENT * * + * | * * + * v * * + * STOP_APPEND <* * * * + * | * + * v * + * SHUT_DOWN <* * * * * * * + * | + * v + * <finish> + * + * @endverbatim + */ + + typedef typename TypeTraits<ImageCtxT>::Journaler Journaler; + typedef typename TypeTraits<ImageCtxT>::Future Future; + + ImageCtxT *m_image_ctx; + bool m_force; + Context *m_on_finish; + + Journaler *m_journaler = nullptr; + int m_ret_val = 0; + + Mutex m_lock; + ImageClientMeta m_client_meta; + uint64_t m_tag_tid = 0; + TagData m_tag_data; + + cls::journal::Tag m_tag; + Future m_future; + + void send_open(); + void handle_open(int r); + + void allocate_tag(); + void handle_allocate_tag(int r); + + void append_event(); + void handle_append_event(int r); + + void commit_event(); + void handle_commit_event(int r); + + void stop_append(); + void handle_stop_append(int r); + + void shut_down(); + void handle_shut_down(int r); + + void finish(int r); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::PromoteRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H diff --git a/src/librbd/journal/RemoveRequest.cc b/src/librbd/journal/RemoveRequest.cc new file mode 100644 index 00000000..6ed5f54c --- /dev/null +++ b/src/librbd/journal/RemoveRequest.cc @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/dout.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "journal/Settings.h" +#include "include/ceph_assert.h" +#include "librbd/Utils.h" +#include "librbd/journal/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Journal::RemoveRequest: " + +namespace librbd { + +using util::create_context_callback; + +namespace journal { + +template<typename I> +RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, const std::string &image_id, + const std::string &client_id, + ContextWQ *op_work_queue, + Context *on_finish) + : m_ioctx(ioctx), m_image_id(image_id), m_image_client_id(client_id), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); +} + +template<typename I> +void RemoveRequest<I>::send() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + stat_journal(); +} + +template<typename I> +void RemoveRequest<I>::stat_journal() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + ImageCtx::get_timer_instance(m_cct, &m_timer, &m_timer_lock); + m_journaler = new Journaler(m_op_work_queue, m_timer, m_timer_lock, + m_ioctx, m_image_id, m_image_client_id, {}); + + using klass = RemoveRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_stat_journal>(this); + + m_journaler->exists(ctx); +} + +template<typename I> +Context *RemoveRequest<I>::handle_stat_journal(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if ((*result < 0) && (*result != -ENOENT)) { + lderr(m_cct) << "failed to stat journal header: " << cpp_strerror(*result) << dendl; + shut_down_journaler(*result); + return nullptr; + } + + if (*result == -ENOENT) { + shut_down_journaler(0); + return nullptr; + } + + init_journaler(); + return nullptr; +} + +template<typename I> +void 
RemoveRequest<I>::init_journaler() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + using klass = RemoveRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_init_journaler>(this); + + m_journaler->init(ctx); +} + +template<typename I> +Context *RemoveRequest<I>::handle_init_journaler(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if ((*result < 0) && (*result != -ENOENT)) { + lderr(m_cct) << "failed to init journaler: " << cpp_strerror(*result) << dendl; + shut_down_journaler(*result); + return nullptr; + } + + remove_journal(); + return nullptr; +} + +template<typename I> +void RemoveRequest<I>::remove_journal() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + using klass = RemoveRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_remove_journal>(this); + + m_journaler->remove(true, ctx); +} + +template<typename I> +Context *RemoveRequest<I>::handle_remove_journal(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to remove journal: " << cpp_strerror(*result) << dendl; + } + + shut_down_journaler(*result); + return nullptr; +} + +template<typename I> +void RemoveRequest<I>::shut_down_journaler(int r) { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + m_r_saved = r; + + using klass = RemoveRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_journaler_shutdown>(this); + + m_journaler->shut_down(ctx); +} + +template<typename I> +Context *RemoveRequest<I>::handle_journaler_shutdown(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(*result) << dendl; + } + + delete m_journaler; + + if (m_r_saved == 0) { + ldout(m_cct, 20) << "done." 
<< dendl; + } + + m_on_finish->complete(m_r_saved); + delete this; + + return nullptr; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::RemoveRequest<librbd::ImageCtx>; diff --git a/src/librbd/journal/RemoveRequest.h b/src/librbd/journal/RemoveRequest.h new file mode 100644 index 00000000..594e62d5 --- /dev/null +++ b/src/librbd/journal/RemoveRequest.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "common/Mutex.h" +#include "librbd/ImageCtx.h" +#include "journal/Journaler.h" +#include "librbd/journal/TypeTraits.h" + +using librados::IoCtx; +using journal::Journaler; + +class Context; +class ContextWQ; +class SafeTimer; + +namespace journal { + class Journaler; +} + +namespace librbd { + +class ImageCtx; + +namespace journal { + +template<typename ImageCtxT = ImageCtx> +class RemoveRequest { +public: + static RemoveRequest *create(IoCtx &ioctx, const std::string &image_id, + const std::string &client_id, + ContextWQ *op_work_queue, Context *on_finish) { + return new RemoveRequest(ioctx, image_id, client_id, + op_work_queue, on_finish); + } + + void send(); + +private: + typedef typename TypeTraits<ImageCtxT>::Journaler Journaler; + + RemoveRequest(IoCtx &ioctx, const std::string &image_id, + const std::string &client_id, + ContextWQ *op_work_queue, Context *on_finish); + + IoCtx &m_ioctx; + std::string m_image_id; + std::string m_image_client_id; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + Journaler *m_journaler; + SafeTimer *m_timer; + Mutex *m_timer_lock; + int m_r_saved; + + void stat_journal(); + Context *handle_stat_journal(int *result); + + void init_journaler(); + Context 
*handle_init_journaler(int *result); + + void remove_journal(); + Context *handle_remove_journal(int *result); + + void shut_down_journaler(int r); + Context *handle_journaler_shutdown(int *result); +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::RemoveRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H diff --git a/src/librbd/journal/Replay.cc b/src/librbd/journal/Replay.cc new file mode 100644 index 00000000..f96153e7 --- /dev/null +++ b/src/librbd/journal/Replay.cc @@ -0,0 +1,1190 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/Replay.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " " + +namespace librbd { +namespace journal { + +namespace { + +static const uint64_t IN_FLIGHT_IO_LOW_WATER_MARK(32); +static const uint64_t IN_FLIGHT_IO_HIGH_WATER_MARK(64); + +static NoOpProgressContext no_op_progress_callback; + +template <typename I, typename E> +struct ExecuteOp : public Context { + I &image_ctx; + E event; + Context *on_op_complete; + + ExecuteOp(I &image_ctx, const E &event, Context *on_op_complete) + : image_ctx(image_ctx), event(event), on_op_complete(on_op_complete) { + } + + void execute(const journal::SnapCreateEvent &_) { + image_ctx.operations->execute_snap_create(event.snap_namespace, + event.snap_name, + on_op_complete, + event.op_tid, false); + } + + void execute(const journal::SnapRemoveEvent &_) { + image_ctx.operations->execute_snap_remove(event.snap_namespace, + 
event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapRenameEvent &_) { + image_ctx.operations->execute_snap_rename(event.snap_id, + event.dst_snap_name, + on_op_complete); + } + + void execute(const journal::SnapProtectEvent &_) { + image_ctx.operations->execute_snap_protect(event.snap_namespace, + event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapUnprotectEvent &_) { + image_ctx.operations->execute_snap_unprotect(event.snap_namespace, + event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapRollbackEvent &_) { + image_ctx.operations->execute_snap_rollback(event.snap_namespace, + event.snap_name, + no_op_progress_callback, + on_op_complete); + } + + void execute(const journal::RenameEvent &_) { + image_ctx.operations->execute_rename(event.image_name, + on_op_complete); + } + + void execute(const journal::ResizeEvent &_) { + image_ctx.operations->execute_resize(event.size, true, no_op_progress_callback, + on_op_complete, event.op_tid); + } + + void execute(const journal::FlattenEvent &_) { + image_ctx.operations->execute_flatten(no_op_progress_callback, + on_op_complete); + } + + void execute(const journal::SnapLimitEvent &_) { + image_ctx.operations->execute_snap_set_limit(event.limit, on_op_complete); + } + + void execute(const journal::UpdateFeaturesEvent &_) { + image_ctx.operations->execute_update_features(event.features, event.enabled, + on_op_complete, event.op_tid); + } + + void execute(const journal::MetadataSetEvent &_) { + image_ctx.operations->execute_metadata_set(event.key, event.value, + on_op_complete); + } + + void execute(const journal::MetadataRemoveEvent &_) { + image_ctx.operations->execute_metadata_remove(event.key, on_op_complete); + } + + void finish(int r) override { + CephContext *cct = image_ctx.cct; + if (r < 0) { + lderr(cct) << ": ExecuteOp::" << __func__ << ": r=" << r << dendl; + on_op_complete->complete(r); + return; + } + + ldout(cct, 20) << ": ExecuteOp::" << 
__func__ << dendl; + RWLock::RLocker owner_locker(image_ctx.owner_lock); + + if (image_ctx.exclusive_lock == nullptr || + !image_ctx.exclusive_lock->accept_ops()) { + ldout(cct, 5) << ": lost exclusive lock -- skipping op" << dendl; + on_op_complete->complete(-ECANCELED); + return; + } + + execute(event); + } +}; + +template <typename I> +struct C_RefreshIfRequired : public Context { + I &image_ctx; + Context *on_finish; + + C_RefreshIfRequired(I &image_ctx, Context *on_finish) + : image_ctx(image_ctx), on_finish(on_finish) { + } + ~C_RefreshIfRequired() override { + delete on_finish; + } + + void finish(int r) override { + CephContext *cct = image_ctx.cct; + Context *ctx = on_finish; + on_finish = nullptr; + + if (r < 0) { + lderr(cct) << ": C_RefreshIfRequired::" << __func__ << ": r=" << r << dendl; + image_ctx.op_work_queue->queue(ctx, r); + return; + } + + if (image_ctx.state->is_refresh_required()) { + ldout(cct, 20) << ": C_RefreshIfRequired::" << __func__ << ": " + << "refresh required" << dendl; + image_ctx.state->refresh(ctx); + return; + } + + image_ctx.op_work_queue->queue(ctx, 0); + } +}; + +} // anonymous namespace + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " " \ + << __func__ + +template <typename I> +Replay<I>::Replay(I &image_ctx) + : m_image_ctx(image_ctx), m_lock("Replay<I>::m_lock") { +} + +template <typename I> +Replay<I>::~Replay() { + ceph_assert(m_in_flight_aio_flush == 0); + ceph_assert(m_in_flight_aio_modify == 0); + ceph_assert(m_aio_modify_unsafe_contexts.empty()); + ceph_assert(m_aio_modify_safe_contexts.empty()); + ceph_assert(m_op_events.empty()); + ceph_assert(m_in_flight_op_events == 0); +} + +template <typename I> +int Replay<I>::decode(bufferlist::const_iterator *it, EventEntry *event_entry) { + try { + using ceph::decode; + decode(*event_entry, *it); + } catch (const buffer::error &err) { + return -EBADMSG; + } + return 0; +} + +template <typename I> +void Replay<I>::process(const 
EventEntry &event_entry, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": on_ready=" << on_ready << ", on_safe=" << on_safe + << dendl; + + on_ready = util::create_async_context_callback(m_image_ctx, on_ready); + + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->accept_ops()) { + ldout(cct, 5) << ": lost exclusive lock -- skipping event" << dendl; + m_image_ctx.op_work_queue->queue(on_safe, -ECANCELED); + on_ready->complete(0); + return; + } + + boost::apply_visitor(EventVisitor(this, on_ready, on_safe), + event_entry.event); +} + +template <typename I> +void Replay<I>::shut_down(bool cancel_ops, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + io::AioCompletion *flush_comp = nullptr; + on_finish = util::create_async_context_callback( + m_image_ctx, on_finish); + + { + Mutex::Locker locker(m_lock); + + // safely commit any remaining AIO modify operations + if ((m_in_flight_aio_flush + m_in_flight_aio_modify) != 0) { + flush_comp = create_aio_flush_completion(nullptr); + ceph_assert(flush_comp != nullptr); + } + + for (auto &op_event_pair : m_op_events) { + OpEvent &op_event = op_event_pair.second; + if (cancel_ops) { + // cancel ops that are waiting to start (waiting for + // OpFinishEvent or waiting for ready) + if (op_event.on_start_ready == nullptr && + op_event.on_op_finish_event != nullptr) { + Context *on_op_finish_event = nullptr; + std::swap(on_op_finish_event, op_event.on_op_finish_event); + m_image_ctx.op_work_queue->queue(on_op_finish_event, -ERESTART); + } + } else if (op_event.on_op_finish_event != nullptr) { + // start ops waiting for OpFinishEvent + Context *on_op_finish_event = nullptr; + std::swap(on_op_finish_event, op_event.on_op_finish_event); + m_image_ctx.op_work_queue->queue(on_op_finish_event, 0); + } else if (op_event.on_start_ready != nullptr) { + // waiting for op 
ready + op_event_pair.second.finish_on_ready = true; + } + } + + ceph_assert(!m_shut_down); + m_shut_down = true; + + ceph_assert(m_flush_ctx == nullptr); + if (m_in_flight_op_events > 0 || flush_comp != nullptr) { + std::swap(m_flush_ctx, on_finish); + } + } + + // execute the following outside of lock scope + if (flush_comp != nullptr) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + if (on_finish != nullptr) { + on_finish->complete(0); + } +} + +template <typename I> +void Replay<I>::flush(Context *on_finish) { + io::AioCompletion *aio_comp; + { + Mutex::Locker locker(m_lock); + aio_comp = create_aio_flush_completion( + util::create_async_context_callback(m_image_ctx, on_finish)); + if (aio_comp == nullptr) { + return; + } + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); +} + +template <typename I> +void Replay<I>::replay_op_ready(uint64_t op_tid, Context *on_resume) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": op_tid=" << op_tid << dendl; + + Mutex::Locker locker(m_lock); + auto op_it = m_op_events.find(op_tid); + ceph_assert(op_it != m_op_events.end()); + + OpEvent &op_event = op_it->second; + ceph_assert(op_event.op_in_progress && + op_event.on_op_finish_event == nullptr && + op_event.on_finish_ready == nullptr && + op_event.on_finish_safe == nullptr); + + // resume processing replay events + Context *on_start_ready = nullptr; + std::swap(on_start_ready, op_event.on_start_ready); + on_start_ready->complete(0); + + // cancel has been requested -- send error to paused state machine + if (!op_event.finish_on_ready && m_flush_ctx != nullptr) { + m_image_ctx.op_work_queue->queue(on_resume, -ERESTART); + return; + } + + // resume the op state machine once the associated OpFinishEvent + // is processed + op_event.on_op_finish_event = new 
FunctionContext( + [on_resume](int r) { + on_resume->complete(r); + }); + + // shut down request -- don't expect OpFinishEvent + if (op_event.finish_on_ready) { + m_image_ctx.op_work_queue->queue(on_resume, 0); + } +} + +template <typename I> +void Replay<I>::handle_event(const journal::AioDiscardEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO discard event" << dendl; + + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_DISCARD, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest<I>::aio_discard(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + event.discard_granularity_bytes, {}); + } + + if (flush_required) { + m_lock.Lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.Unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + +template <typename I> +void Replay<I>::handle_event(const journal::AioWriteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO write event" << dendl; + + bufferlist data = event.data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_WRITE, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest<I>::aio_write(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + std::move(data), 0, {}); + } + + if (flush_required) { + m_lock.Lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.Unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + +template <typename I> +void Replay<I>::handle_event(const 
journal::AioFlushEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO flush event" << dendl; + + io::AioCompletion *aio_comp; + { + Mutex::Locker locker(m_lock); + aio_comp = create_aio_flush_completion(on_safe); + } + + if (aio_comp != nullptr) { + io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::AioWriteSameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO writesame event" << dendl; + + bufferlist data = event.data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_WRITESAME, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest<I>::aio_writesame(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + std::move(data), 0, {}); + } + + if (flush_required) { + m_lock.Lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.Unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + + template <typename I> + void Replay<I>::handle_event(const journal::AioCompareAndWriteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO CompareAndWrite event" << dendl; + + bufferlist cmp_data = event.cmp_data; + bufferlist write_data = event.write_data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_COMPARE_AND_WRITE, + &flush_required, + {-EILSEQ}); + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest<I>::aio_compare_and_write(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + std::move(cmp_data), + std::move(write_data), + 
nullptr, 0, {}); + } + + if (flush_required) { + m_lock.Lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.Unlock(); + + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } +} + +template <typename I> +void Replay<I>::handle_event(const journal::OpFinishEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Op finish event: " + << "op_tid=" << event.op_tid << dendl; + + bool op_in_progress; + bool filter_ret_val; + Context *on_op_complete = nullptr; + Context *on_op_finish_event = nullptr; + { + Mutex::Locker locker(m_lock); + auto op_it = m_op_events.find(event.op_tid); + if (op_it == m_op_events.end()) { + ldout(cct, 10) << ": unable to locate associated op: assuming previously " + << "committed." << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, 0); + return; + } + + OpEvent &op_event = op_it->second; + ceph_assert(op_event.on_finish_safe == nullptr); + op_event.on_finish_ready = on_ready; + op_event.on_finish_safe = on_safe; + op_in_progress = op_event.op_in_progress; + std::swap(on_op_complete, op_event.on_op_complete); + std::swap(on_op_finish_event, op_event.on_op_finish_event); + + // special errors which indicate op never started but was recorded + // as failed in the journal + filter_ret_val = (op_event.op_finish_error_codes.count(event.r) != 0); + } + + if (event.r < 0) { + if (op_in_progress) { + // bubble the error up to the in-progress op to cancel it + on_op_finish_event->complete(event.r); + } else { + // op hasn't been started -- bubble the error up since + // our image is now potentially in an inconsistent state + // since simple errors should have been caught before + // creating the op event + delete on_op_complete; + delete on_op_finish_event; + handle_op_complete(event.op_tid, filter_ret_val ? 
0 : event.r); + } + return; + } + + // journal recorded success -- apply the op now + on_op_finish_event->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapCreateEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap create event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapCreateEvent>(m_image_ctx, event, + on_op_complete)), + 0); + + // do not process more events until the state machine is ready + // since it will affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapRemoveEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap remove event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapRemoveEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-ENOENT}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapRenameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap rename event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + 
Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapRenameEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapProtectEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap protect event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapProtectEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EBUSY}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapUnprotectEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap unprotect event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapUnprotectEvent>(m_image_ctx, + event, + on_op_complete)); + + // ignore errors recorded in the journal + op_event->op_finish_error_codes = {-EBUSY}; + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EINVAL}; + + on_ready->complete(0); +} + +template <typename I> +void 
Replay<I>::handle_event(const journal::SnapRollbackEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap rollback start event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapRollbackEvent>(m_image_ctx, + event, + on_op_complete)); + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::RenameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Rename event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::RenameEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::ResizeEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Resize start event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::ResizeEvent>(m_image_ctx, event, + on_op_complete)), 0); + + // do not process more events until the state machine is ready + // since it will 
affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template <typename I> +void Replay<I>::handle_event(const journal::FlattenEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Flatten start event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::FlattenEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EINVAL}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::DemotePromoteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Demote/Promote event" << dendl; + on_ready->complete(0); + on_safe->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapLimitEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap limit event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapLimitEvent>(m_image_ctx, + event, + on_op_complete)); + + op_event->ignore_error_codes = {-ERANGE}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::UpdateFeaturesEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Update features event" << dendl; + + Mutex::Locker 
locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::UpdateFeaturesEvent>( + m_image_ctx, event, on_op_complete)), 0); + + // do not process more events until the state machine is ready + // since it will affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template <typename I> +void Replay<I>::handle_event(const journal::MetadataSetEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Metadata set event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + on_op_complete = new C_RefreshIfRequired<I>(m_image_ctx, on_op_complete); + op_event->on_op_finish_event = util::create_async_context_callback( + m_image_ctx, new ExecuteOp<I, journal::MetadataSetEvent>( + m_image_ctx, event, on_op_complete)); + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::MetadataRemoveEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Metadata remove event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + on_op_complete = new C_RefreshIfRequired<I>(m_image_ctx, on_op_complete); + op_event->on_op_finish_event = util::create_async_context_callback( + m_image_ctx, new ExecuteOp<I, journal::MetadataRemoveEvent>( + m_image_ctx, event, on_op_complete)); + + // ignore errors caused due to replay 
+ op_event->ignore_error_codes = {-ENOENT}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::UnknownEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": unknown event" << dendl; + on_ready->complete(0); + on_safe->complete(0); +} + +template <typename I> +void Replay<I>::handle_aio_modify_complete(Context *on_ready, Context *on_safe, + int r, std::set<int> &filters, + bool writeback_cache_enabled) { + Mutex::Locker locker(m_lock); + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": on_ready=" << on_ready << ", " + << "on_safe=" << on_safe << ", r=" << r << dendl; + + if (on_ready != nullptr) { + on_ready->complete(0); + } + + if (filters.find(r) != filters.end()) + r = 0; + + if (r < 0) { + lderr(cct) << ": AIO modify op failed: " << cpp_strerror(r) << dendl; + m_image_ctx.op_work_queue->queue(on_safe, r); + return; + } + + if (writeback_cache_enabled) { + // will be completed after next flush operation completes + m_aio_modify_safe_contexts.insert(on_safe); + } else { + // IO is safely stored on disk + ceph_assert(m_in_flight_aio_modify > 0); + --m_in_flight_aio_modify; + + if (m_on_aio_ready != nullptr) { + ldout(cct, 10) << ": resuming paused AIO" << dendl; + m_on_aio_ready->complete(0); + m_on_aio_ready = nullptr; + } + + ldout(cct, 20) << ": completing safe context: " << on_safe << dendl; + m_image_ctx.op_work_queue->queue(on_safe, 0); + } +} + +template <typename I> +void Replay<I>::handle_aio_flush_complete(Context *on_flush_safe, + Contexts &on_safe_ctxs, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << ": AIO flush failed: " << cpp_strerror(r) << dendl; + } + + Context *on_aio_ready = nullptr; + Context *on_flush = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_in_flight_aio_flush > 0); + ceph_assert(m_in_flight_aio_modify >= on_safe_ctxs.size()); + 
--m_in_flight_aio_flush; + m_in_flight_aio_modify -= on_safe_ctxs.size(); + + std::swap(on_aio_ready, m_on_aio_ready); + if (m_in_flight_op_events == 0 && + (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) { + on_flush = m_flush_ctx; + } + + // strip out previously failed on_safe contexts + for (auto it = on_safe_ctxs.begin(); it != on_safe_ctxs.end(); ) { + if (m_aio_modify_safe_contexts.erase(*it)) { + ++it; + } else { + it = on_safe_ctxs.erase(it); + } + } + } + + if (on_aio_ready != nullptr) { + ldout(cct, 10) << ": resuming paused AIO" << dendl; + on_aio_ready->complete(0); + } + + if (on_flush_safe != nullptr) { + on_safe_ctxs.push_back(on_flush_safe); + } + for (auto ctx : on_safe_ctxs) { + ldout(cct, 20) << ": completing safe context: " << ctx << dendl; + ctx->complete(r); + } + + if (on_flush != nullptr) { + ldout(cct, 20) << ": completing flush context: " << on_flush << dendl; + on_flush->complete(r); + } +} + +template <typename I> +Context *Replay<I>::create_op_context_callback(uint64_t op_tid, + Context *on_ready, + Context *on_safe, + OpEvent **op_event) { + CephContext *cct = m_image_ctx.cct; + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + return nullptr; + } + + ceph_assert(m_lock.is_locked()); + if (m_op_events.count(op_tid) != 0) { + lderr(cct) << ": duplicate op tid detected: " << op_tid << dendl; + + // on_ready is already async but on failure invoke on_safe async + // as well + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -EINVAL); + return nullptr; + } + + ++m_in_flight_op_events; + *op_event = &m_op_events[op_tid]; + (*op_event)->on_start_safe = on_safe; + + Context *on_op_complete = new C_OpOnComplete(this, op_tid); + (*op_event)->on_op_complete = on_op_complete; + return on_op_complete; +} + +template <typename I> +void Replay<I>::handle_op_complete(uint64_t op_tid, int r) { + CephContext *cct 
= m_image_ctx.cct; + ldout(cct, 20) << ": op_tid=" << op_tid << ", " + << "r=" << r << dendl; + + OpEvent op_event; + bool shutting_down = false; + { + Mutex::Locker locker(m_lock); + auto op_it = m_op_events.find(op_tid); + ceph_assert(op_it != m_op_events.end()); + + op_event = std::move(op_it->second); + m_op_events.erase(op_it); + + if (m_shut_down) { + ceph_assert(m_flush_ctx != nullptr); + shutting_down = true; + } + } + + ceph_assert(op_event.on_start_ready == nullptr || (r < 0 && r != -ERESTART)); + if (op_event.on_start_ready != nullptr) { + // blocking op event failed before it became ready + ceph_assert(op_event.on_finish_ready == nullptr && + op_event.on_finish_safe == nullptr); + + op_event.on_start_ready->complete(0); + } else { + // event kicked off by OpFinishEvent + ceph_assert((op_event.on_finish_ready != nullptr && + op_event.on_finish_safe != nullptr) || shutting_down); + } + + if (op_event.on_op_finish_event != nullptr) { + op_event.on_op_finish_event->complete(r); + } + + if (op_event.on_finish_ready != nullptr) { + op_event.on_finish_ready->complete(0); + } + + // filter out errors caused by replay of the same op + if (r < 0 && op_event.ignore_error_codes.count(r) != 0) { + r = 0; + } + + op_event.on_start_safe->complete(r); + if (op_event.on_finish_safe != nullptr) { + op_event.on_finish_safe->complete(r); + } + + // shut down request might have occurred while lock was + // dropped -- handle if pending + Context *on_flush = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_in_flight_op_events > 0); + --m_in_flight_op_events; + if (m_in_flight_op_events == 0 && + (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) { + on_flush = m_flush_ctx; + } + } + if (on_flush != nullptr) { + m_image_ctx.op_work_queue->queue(on_flush, 0); + } +} + +template <typename I> +io::AioCompletion * +Replay<I>::create_aio_modify_completion(Context *on_ready, + Context *on_safe, + io::aio_type_t aio_type, + bool *flush_required, + std::set<int> 
&&filters) { + Mutex::Locker locker(m_lock); + CephContext *cct = m_image_ctx.cct; + ceph_assert(m_on_aio_ready == nullptr); + + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + return nullptr; + } + + ++m_in_flight_aio_modify; + + bool writeback_cache_enabled = m_image_ctx.is_writeback_cache_enabled(); + if (writeback_cache_enabled) { + m_aio_modify_unsafe_contexts.push_back(on_safe); + } + + // FLUSH if we hit the low-water mark -- on_safe contexts are + // completed by flushes-only so that we don't move the journal + // commit position until safely on-disk + + *flush_required = (writeback_cache_enabled && + m_aio_modify_unsafe_contexts.size() == + IN_FLIGHT_IO_LOW_WATER_MARK); + if (*flush_required) { + ldout(cct, 10) << ": hit AIO replay low-water mark: scheduling flush" + << dendl; + } + + // READY for more events if: + // * not at high-water mark for IO + // * in-flight ops are at a consistent point (snap create has IO flushed, + // shrink has adjusted clip boundary, etc) -- should have already been + // flagged not-ready + if (m_in_flight_aio_modify == IN_FLIGHT_IO_HIGH_WATER_MARK) { + ldout(cct, 10) << ": hit AIO replay high-water mark: pausing replay" + << dendl; + ceph_assert(m_on_aio_ready == nullptr); + std::swap(m_on_aio_ready, on_ready); + } + + // when the modification is ACKed by librbd, we can process the next + // event. 
when flushed, the completion of the next flush will fire the + // on_safe callback + auto aio_comp = io::AioCompletion::create_and_start<Context>( + new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters), + writeback_cache_enabled), + util::get_image_ctx(&m_image_ctx), aio_type); + return aio_comp; +} + +template <typename I> +io::AioCompletion *Replay<I>::create_aio_flush_completion(Context *on_safe) { + ceph_assert(m_lock.is_locked()); + + CephContext *cct = m_image_ctx.cct; + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + if (on_safe != nullptr) { + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + } + return nullptr; + } + + ++m_in_flight_aio_flush; + + // associate all prior write/discard ops to this flush request + auto aio_comp = io::AioCompletion::create_and_start<Context>( + new C_AioFlushComplete(this, on_safe, + std::move(m_aio_modify_unsafe_contexts)), + util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH); + m_aio_modify_unsafe_contexts.clear(); + return aio_comp; +} + +template <typename I> +bool Replay<I>::clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp) { + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.snap_lock.get_read(); + size_t image_size = m_image_ctx.size; + m_image_ctx.snap_lock.put_read(); + + if (image_offset >= image_size) { + // rbd-mirror image sync might race an IO event w/ associated resize between + // the point the peer is registered and the sync point is created, so no-op + // IO events beyond the current image extents since under normal conditions + // it wouldn't have been recorded in the journal + ldout(cct, 5) << ": no-op IO event beyond image size" << dendl; + aio_comp->get(); + aio_comp->set_request_count(0); + aio_comp->put(); + return true; + } + + return false; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::Replay<librbd::ImageCtx>; diff --git a/src/librbd/journal/Replay.h b/src/librbd/journal/Replay.h 
new file mode 100644 index 00000000..206d7db4 --- /dev/null +++ b/src/librbd/journal/Replay.h @@ -0,0 +1,210 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_REPLAY_H +#define CEPH_LIBRBD_JOURNAL_REPLAY_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "common/Mutex.h" +#include "librbd/io/Types.h" +#include "librbd/journal/Types.h" +#include <boost/variant.hpp> +#include <list> +#include <unordered_set> +#include <unordered_map> + +namespace librbd { + +class ImageCtx; +namespace io { struct AioCompletion; } + +namespace journal { + +template <typename ImageCtxT = ImageCtx> +class Replay { +public: + static Replay *create(ImageCtxT &image_ctx) { + return new Replay(image_ctx); + } + + Replay(ImageCtxT &image_ctx); + ~Replay(); + + int decode(bufferlist::const_iterator *it, EventEntry *event_entry); + void process(const EventEntry &event_entry, + Context *on_ready, Context *on_safe); + + void shut_down(bool cancel_ops, Context *on_finish); + void flush(Context *on_finish); + + void replay_op_ready(uint64_t op_tid, Context *on_resume); + +private: + typedef std::unordered_set<int> ReturnValues; + + struct OpEvent { + bool op_in_progress = false; + bool finish_on_ready = false; + Context *on_op_finish_event = nullptr; + Context *on_start_ready = nullptr; + Context *on_start_safe = nullptr; + Context *on_finish_ready = nullptr; + Context *on_finish_safe = nullptr; + Context *on_op_complete = nullptr; + ReturnValues op_finish_error_codes; + ReturnValues ignore_error_codes; + }; + + typedef std::list<uint64_t> OpTids; + typedef std::list<Context *> Contexts; + typedef std::unordered_set<Context *> ContextSet; + typedef std::unordered_map<uint64_t, OpEvent> OpEvents; + + struct C_OpOnComplete : public Context { + Replay *replay; + uint64_t op_tid; + C_OpOnComplete(Replay *replay, uint64_t op_tid) + : replay(replay), 
op_tid(op_tid) { + } + void finish(int r) override { + replay->handle_op_complete(op_tid, r); + } + }; + + struct C_AioModifyComplete : public Context { + Replay *replay; + Context *on_ready; + Context *on_safe; + std::set<int> filters; + bool writeback_cache_enabled; + C_AioModifyComplete(Replay *replay, Context *on_ready, + Context *on_safe, std::set<int> &&filters, + bool writeback_cache_enabled) + : replay(replay), on_ready(on_ready), on_safe(on_safe), + filters(std::move(filters)), + writeback_cache_enabled(writeback_cache_enabled) { + } + void finish(int r) override { + replay->handle_aio_modify_complete(on_ready, on_safe, r, filters, + writeback_cache_enabled); + } + }; + + struct C_AioFlushComplete : public Context { + Replay *replay; + Context *on_flush_safe; + Contexts on_safe_ctxs; + C_AioFlushComplete(Replay *replay, Context *on_flush_safe, + Contexts &&on_safe_ctxs) + : replay(replay), on_flush_safe(on_flush_safe), + on_safe_ctxs(std::move(on_safe_ctxs)) { // move the rvalue list, don't copy + } + void finish(int r) override { + replay->handle_aio_flush_complete(on_flush_safe, on_safe_ctxs, r); + } + }; + + struct EventVisitor : public boost::static_visitor<void> { + Replay *replay; + Context *on_ready; + Context *on_safe; + + EventVisitor(Replay *_replay, Context *_on_ready, Context *_on_safe) + : replay(_replay), on_ready(_on_ready), on_safe(_on_safe) { + } + + template <typename Event> + inline void operator()(const Event &event) const { + replay->handle_event(event, on_ready, on_safe); + } + }; + + ImageCtxT &m_image_ctx; + + Mutex m_lock; + + uint64_t m_in_flight_aio_flush = 0; + uint64_t m_in_flight_aio_modify = 0; + Contexts m_aio_modify_unsafe_contexts; + ContextSet m_aio_modify_safe_contexts; + + OpEvents m_op_events; + uint64_t m_in_flight_op_events = 0; + + bool m_shut_down = false; + Context *m_flush_ctx = nullptr; + Context *m_on_aio_ready = nullptr; + + void handle_event(const AioDiscardEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const AioWriteEvent 
&event, Context *on_ready, + Context *on_safe); + void handle_event(const AioWriteSameEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const AioCompareAndWriteEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const AioFlushEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const OpFinishEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapCreateEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapRemoveEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapRenameEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapProtectEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapUnprotectEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapRollbackEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const RenameEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const ResizeEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const FlattenEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const DemotePromoteEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapLimitEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const UpdateFeaturesEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const MetadataSetEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const MetadataRemoveEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const UnknownEvent &event, Context *on_ready, + Context *on_safe); + + void handle_aio_modify_complete(Context *on_ready, Context *on_safe, + int r, std::set<int> &filters, + bool writeback_cache_enabled); + void handle_aio_flush_complete(Context *on_flush_safe, Contexts &on_safe_ctxs, + int r); + 
+ Context *create_op_context_callback(uint64_t op_tid, Context *on_ready, + Context *on_safe, OpEvent **op_event); + void handle_op_complete(uint64_t op_tid, int r); + + io::AioCompletion *create_aio_modify_completion(Context *on_ready, + Context *on_safe, + io::aio_type_t aio_type, + bool *flush_required, + std::set<int> &&filters); + io::AioCompletion *create_aio_flush_completion(Context *on_safe); + void handle_aio_completion(io::AioCompletion *aio_comp); + + bool clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::Replay<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_REPLAY_H diff --git a/src/librbd/journal/ResetRequest.cc b/src/librbd/journal/ResetRequest.cc new file mode 100644 index 00000000..9b67f3e4 --- /dev/null +++ b/src/librbd/journal/ResetRequest.cc @@ -0,0 +1,162 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/ResetRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "journal/Journaler.h" +#include "journal/Settings.h" +#include "include/ceph_assert.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/journal/CreateRequest.h" +#include "librbd/journal/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::ResetRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace journal { + +using util::create_async_context_callback; +using util::create_context_callback; + +template<typename I> +void ResetRequest<I>::send() { + init_journaler(); +} + +template<typename I> +void ResetRequest<I>::init_journaler() { + ldout(m_cct, 10) << dendl; + + m_journaler = new Journaler(m_io_ctx, m_image_id, m_client_id, {}); + Context *ctx = create_context_callback< + ResetRequest<I>, 
&ResetRequest<I>::handle_init_journaler>(this); + m_journaler->init(ctx); +} + +template<typename I> +void ResetRequest<I>::handle_init_journaler(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -ENOENT) { + ldout(m_cct, 5) << "journal does not exist" << dendl; + m_ret_val = r; + } else if (r < 0) { + lderr(m_cct) << "failed to init journaler: " << cpp_strerror(r) << dendl; + m_ret_val = r; + } else { + int64_t pool_id; + m_journaler->get_metadata(&m_order, &m_splay_width, &pool_id); + + if (pool_id != -1) { + librados::Rados rados(m_io_ctx); + r = rados.pool_reverse_lookup(pool_id, &m_object_pool_name); + if (r < 0) { + lderr(m_cct) << "failed to lookup data pool: " << cpp_strerror(r) + << dendl; + m_ret_val = r; + } + } + } + + shut_down_journaler(); +} + +template<typename I> +void ResetRequest<I>::shut_down_journaler() { + ldout(m_cct, 10) << dendl; + + Context *ctx = create_async_context_callback( + m_op_work_queue, create_context_callback< + ResetRequest<I>, &ResetRequest<I>::handle_journaler_shutdown>(this)); + m_journaler->shut_down(ctx); +} + +template<typename I> +void ResetRequest<I>::handle_journaler_shutdown(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + delete m_journaler; + if (r < 0) { + lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(r) + << dendl; + if (m_ret_val == 0) { + m_ret_val = r; + } + } + + if (m_ret_val < 0) { + finish(m_ret_val); + return; + } + + remove_journal(); +} + +template<typename I> +void ResetRequest<I>::remove_journal() { + ldout(m_cct, 10) << dendl; + + Context *ctx = create_context_callback< + ResetRequest<I>, &ResetRequest<I>::handle_remove_journal>(this); + auto req = RemoveRequest<I>::create(m_io_ctx, m_image_id, m_client_id, + m_op_work_queue, ctx); + req->send(); +} + +template<typename I> +void ResetRequest<I>::handle_remove_journal(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to remove journal: " << cpp_strerror(r) << dendl; + 
finish(r); + return; + } + + create_journal(); +} + +template<typename I> +void ResetRequest<I>::create_journal() { + ldout(m_cct, 10) << dendl; + + Context *ctx = create_context_callback< + ResetRequest<I>, &ResetRequest<I>::handle_create_journal>(this); + journal::TagData tag_data(m_mirror_uuid); + auto req = CreateRequest<I>::create(m_io_ctx, m_image_id, m_order, + m_splay_width, m_object_pool_name, + cls::journal::Tag::TAG_CLASS_NEW, + tag_data, m_client_id, m_op_work_queue, + ctx); + req->send(); +} + +template<typename I> +void ResetRequest<I>::handle_create_journal(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create journal: " << cpp_strerror(r) << dendl; + } + finish(r); +} + +template<typename I> +void ResetRequest<I>::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::ResetRequest<librbd::ImageCtx>; diff --git a/src/librbd/journal/ResetRequest.h b/src/librbd/journal/ResetRequest.h new file mode 100644 index 00000000..9cc8e2e4 --- /dev/null +++ b/src/librbd/journal/ResetRequest.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "common/Mutex.h" +#include "librbd/journal/TypeTraits.h" +#include <string> + +class Context; +class ContextWQ; +class SafeTimer; + +namespace journal { class Journaler; } + +namespace librbd { + +class ImageCtx; + +namespace journal { + +template<typename ImageCtxT = ImageCtx> +class ResetRequest { +public: + static ResetRequest *create(librados::IoCtx &io_ctx, + const std::string &image_id, + const std::string &client_id, + const std::string 
&mirror_uuid, + ContextWQ *op_work_queue, Context *on_finish) { + return new ResetRequest(io_ctx, image_id, client_id, mirror_uuid, + op_work_queue, on_finish); + } + + ResetRequest(librados::IoCtx &io_ctx, const std::string &image_id, + const std::string &client_id, const std::string &mirror_uuid, + ContextWQ *op_work_queue, Context *on_finish) + : m_io_ctx(io_ctx), m_image_id(image_id), m_client_id(client_id), + m_mirror_uuid(mirror_uuid), m_op_work_queue(op_work_queue), + m_on_finish(on_finish), + m_cct(reinterpret_cast<CephContext *>(m_io_ctx.cct())) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * INIT_JOURNALER + * | + * v + * SHUT_DOWN_JOURNALER + * | + * v + * REMOVE_JOURNAL + * | + * v + * CREATE_JOURNAL + * | + * v + * <finish> + * + * @endverbatim + */ + typedef typename TypeTraits<ImageCtxT>::Journaler Journaler; + + librados::IoCtx &m_io_ctx; + std::string m_image_id; + std::string m_client_id; + std::string m_mirror_uuid; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + Journaler *m_journaler = nullptr; + int m_ret_val = 0; + + uint8_t m_order = 0; + uint8_t m_splay_width = 0; + std::string m_object_pool_name; + + void init_journaler(); + void handle_init_journaler(int r); + + void shut_down_journaler(); + void handle_journaler_shutdown(int r); + + void remove_journal(); + void handle_remove_journal(int r); + + void create_journal(); + void handle_create_journal(int r); + + void finish(int r); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::ResetRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H diff --git a/src/librbd/journal/StandardPolicy.cc b/src/librbd/journal/StandardPolicy.cc new file mode 100644 index 00000000..58631801 --- /dev/null +++ b/src/librbd/journal/StandardPolicy.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"librbd/journal/StandardPolicy.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::StandardPolicy: " + +namespace librbd { +namespace journal { + +template<typename I> +void StandardPolicy<I>::allocate_tag_on_lock(Context *on_finish) { + ceph_assert(m_image_ctx->journal != nullptr); + + if (!m_image_ctx->journal->is_tag_owner()) { + lderr(m_image_ctx->cct) << "local image not promoted" << dendl; + m_image_ctx->op_work_queue->queue(on_finish, -EPERM); + return; + } + + m_image_ctx->journal->allocate_local_tag(on_finish); +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::StandardPolicy<librbd::ImageCtx>; diff --git a/src/librbd/journal/StandardPolicy.h b/src/librbd/journal/StandardPolicy.h new file mode 100644 index 00000000..ec8d0148 --- /dev/null +++ b/src/librbd/journal/StandardPolicy.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H +#define CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H + +#include "librbd/journal/Policy.h" + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +template<typename ImageCtxT = ImageCtx> +class StandardPolicy : public Policy { +public: + StandardPolicy(ImageCtxT *image_ctx) : m_image_ctx(image_ctx) { + } + + bool append_disabled() const override { + return false; + } + bool journal_disabled() const override { + return false; + } + void allocate_tag_on_lock(Context *on_finish) override; + +private: + ImageCtxT *m_image_ctx; +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::StandardPolicy<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H diff --git a/src/librbd/journal/TypeTraits.h b/src/librbd/journal/TypeTraits.h new file mode 100644 index 00000000..d6dde690 --- 
/dev/null +++ b/src/librbd/journal/TypeTraits.h @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H +#define CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H + +namespace journal { +class Future; +class Journaler; +class ReplayEntry; +} + +namespace librbd { +namespace journal { + +template <typename ImageCtxT> +struct TypeTraits { + typedef ::journal::Journaler Journaler; + typedef ::journal::Future Future; + typedef ::journal::ReplayEntry ReplayEntry; +}; + +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H diff --git a/src/librbd/journal/Types.cc b/src/librbd/journal/Types.cc new file mode 100644 index 00000000..ee919d35 --- /dev/null +++ b/src/librbd/journal/Types.cc @@ -0,0 +1,950 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/Types.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "include/types.h" +#include "common/Formatter.h" + +namespace librbd { +namespace journal { + +using ceph::encode; +using ceph::decode; + +namespace { + +template <typename E> +class GetTypeVisitor : public boost::static_visitor<E> { +public: + template <typename T> + inline E operator()(const T&) const { + return T::TYPE; + } +}; + +class EncodeVisitor : public boost::static_visitor<void> { +public: + explicit EncodeVisitor(bufferlist &bl) : m_bl(bl) { + } + + template <typename T> + inline void operator()(const T& t) const { + encode(static_cast<uint32_t>(T::TYPE), m_bl); + t.encode(m_bl); + } +private: + bufferlist &m_bl; +}; + +class DecodeVisitor : public boost::static_visitor<void> { +public: + DecodeVisitor(__u8 version, bufferlist::const_iterator &iter) + : m_version(version), m_iter(iter) { + } + + template <typename T> + inline void operator()(T& t) const { + t.decode(m_version, m_iter); + } +private: + __u8 m_version; + 
bufferlist::const_iterator &m_iter; +}; + +class DumpVisitor : public boost::static_visitor<void> { +public: + explicit DumpVisitor(Formatter *formatter, const std::string &key) + : m_formatter(formatter), m_key(key) {} + + template <typename T> + inline void operator()(const T& t) const { + auto type = T::TYPE; + m_formatter->dump_string(m_key.c_str(), stringify(type)); + t.dump(m_formatter); + } +private: + ceph::Formatter *m_formatter; + std::string m_key; +}; + +} // anonymous namespace + +void AioDiscardEvent::encode(bufferlist& bl) const { + using ceph::encode; + encode(offset, bl); + encode(length, bl); + bool skip_partial_discard = (discard_granularity_bytes > 0); + encode(skip_partial_discard, bl); + encode(discard_granularity_bytes, bl); +} + +void AioDiscardEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(offset, it); + decode(length, it); + + bool skip_partial_discard = false; + if (version >= 4) { + decode(skip_partial_discard, it); + } + + if (version >= 5) { + decode(discard_granularity_bytes, it); + } else { + if (skip_partial_discard) { + // use a size larger than the maximum object size which will + // be truncated down to object size during IO processing + discard_granularity_bytes = std::numeric_limits<uint32_t>::max(); + } else { + discard_granularity_bytes = 0; + } + } +} + +void AioDiscardEvent::dump(Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); + f->dump_unsigned("discard_granularity_bytes", discard_granularity_bytes); +} + +uint32_t AioWriteEvent::get_fixed_size() { + return EventEntry::get_fixed_size() + 16 /* offset, length */; +} + +void AioWriteEvent::encode(bufferlist& bl) const { + using ceph::encode; + encode(offset, bl); + encode(length, bl); + encode(data, bl); +} + +void AioWriteEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(offset, it); + decode(length, it); + decode(data, it); +} + +void 
AioWriteEvent::dump(Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); +} + +void AioWriteSameEvent::encode(bufferlist& bl) const { + using ceph::encode; + encode(offset, bl); + encode(length, bl); + encode(data, bl); +} + +void AioWriteSameEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(offset, it); + decode(length, it); + decode(data, it); +} + +void AioWriteSameEvent::dump(Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); +} + +uint32_t AioCompareAndWriteEvent::get_fixed_size() { + return EventEntry::get_fixed_size() + 32 /* offset, length */; +} + +void AioCompareAndWriteEvent::encode(bufferlist& bl) const { + using ceph::encode; + encode(offset, bl); + encode(length, bl); + encode(cmp_data, bl); + encode(write_data, bl); +} + +void AioCompareAndWriteEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(offset, it); + decode(length, it); + decode(cmp_data, it); + decode(write_data, it); +} + +void AioCompareAndWriteEvent::dump(Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); +} + +void AioFlushEvent::encode(bufferlist& bl) const { +} + +void AioFlushEvent::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void AioFlushEvent::dump(Formatter *f) const { +} + +void OpEventBase::encode(bufferlist& bl) const { + using ceph::encode; + encode(op_tid, bl); +} + +void OpEventBase::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(op_tid, it); +} + +void OpEventBase::dump(Formatter *f) const { + f->dump_unsigned("op_tid", op_tid); +} + +void OpFinishEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(op_tid, bl); + encode(r, bl); +} + +void OpFinishEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using 
ceph::decode; + decode(op_tid, it); + decode(r, it); +} + +void OpFinishEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("op_tid", op_tid); + f->dump_int("result", r); +} + +void SnapEventBase::encode(bufferlist& bl) const { + using ceph::encode; + OpEventBase::encode(bl); + encode(snap_name, bl); + encode(snap_namespace, bl); +} + +void SnapEventBase::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + OpEventBase::decode(version, it); + using ceph::decode; + decode(snap_name, it); + if (version >= 4) { + decode(snap_namespace, it); + } +} + +void SnapEventBase::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_string("snap_name", snap_name); + snap_namespace.dump(f); +} + +void SnapCreateEvent::encode(bufferlist &bl) const { + SnapEventBase::encode(bl); +} + +void SnapCreateEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + SnapEventBase::decode(version, it); + if (version == 3) { + decode(snap_namespace, it); + } +} + +void SnapCreateEvent::dump(Formatter *f) const { + SnapEventBase::dump(f); +} + +void SnapLimitEvent::encode(bufferlist &bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(limit, bl); +} + +void SnapLimitEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(limit, it); +} + +void SnapLimitEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("limit", limit); +} + +void SnapRenameEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(dst_snap_name, bl); + encode(snap_id, bl); + encode(src_snap_name, bl); +} + +void SnapRenameEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + OpEventBase::decode(version, it); + decode(dst_snap_name, it); + decode(snap_id, it); + if (version >= 2) { + decode(src_snap_name, it); + } +} + +void SnapRenameEvent::dump(Formatter *f) 
const { + OpEventBase::dump(f); + f->dump_unsigned("src_snap_id", snap_id); + f->dump_string("src_snap_name", src_snap_name); + f->dump_string("dest_snap_name", dst_snap_name); +} + +void RenameEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(image_name, bl); +} + +void RenameEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(image_name, it); +} + +void RenameEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_string("image_name", image_name); +} + +void ResizeEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(size, bl); +} + +void ResizeEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(size, it); +} + +void ResizeEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("size", size); +} + +void DemotePromoteEvent::encode(bufferlist& bl) const { +} + +void DemotePromoteEvent::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void DemotePromoteEvent::dump(Formatter *f) const { +} + +void UpdateFeaturesEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(features, bl); + encode(enabled, bl); +} + +void UpdateFeaturesEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(features, it); + decode(enabled, it); +} + +void UpdateFeaturesEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("features", features); + f->dump_bool("enabled", enabled); +} + +void MetadataSetEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(key, bl); + encode(value, bl); +} + +void MetadataSetEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + 
decode(key, it); + decode(value, it); +} + +void MetadataSetEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_string("key", key); + f->dump_string("value", value); +} + +void MetadataRemoveEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(key, bl); +} + +void MetadataRemoveEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(key, it); +} + +void MetadataRemoveEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_string("key", key); +} + +void UnknownEvent::encode(bufferlist& bl) const { + ceph_abort(); +} + +void UnknownEvent::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void UnknownEvent::dump(Formatter *f) const { +} + +EventType EventEntry::get_event_type() const { + return boost::apply_visitor(GetTypeVisitor<EventType>(), event); +} + +void EventEntry::encode(bufferlist& bl) const { + ENCODE_START(5, 1, bl); + boost::apply_visitor(EncodeVisitor(bl), event); + ENCODE_FINISH(bl); + encode_metadata(bl); +} + +void EventEntry::decode(bufferlist::const_iterator& it) { + DECODE_START(1, it); + + uint32_t event_type; + decode(event_type, it); + + // select the correct payload variant based upon the encoded op + switch (event_type) { + case EVENT_TYPE_AIO_DISCARD: + event = AioDiscardEvent(); + break; + case EVENT_TYPE_AIO_WRITE: + event = AioWriteEvent(); + break; + case EVENT_TYPE_AIO_FLUSH: + event = AioFlushEvent(); + break; + case EVENT_TYPE_OP_FINISH: + event = OpFinishEvent(); + break; + case EVENT_TYPE_SNAP_CREATE: + event = SnapCreateEvent(); + break; + case EVENT_TYPE_SNAP_REMOVE: + event = SnapRemoveEvent(); + break; + case EVENT_TYPE_SNAP_RENAME: + event = SnapRenameEvent(); + break; + case EVENT_TYPE_SNAP_PROTECT: + event = SnapProtectEvent(); + break; + case EVENT_TYPE_SNAP_UNPROTECT: + event = SnapUnprotectEvent(); + break; + case EVENT_TYPE_SNAP_ROLLBACK: + event = SnapRollbackEvent(); 
+ break; + case EVENT_TYPE_RENAME: + event = RenameEvent(); + break; + case EVENT_TYPE_RESIZE: + event = ResizeEvent(); + break; + case EVENT_TYPE_FLATTEN: + event = FlattenEvent(); + break; + case EVENT_TYPE_DEMOTE_PROMOTE: + event = DemotePromoteEvent(); + break; + case EVENT_TYPE_UPDATE_FEATURES: + event = UpdateFeaturesEvent(); + break; + case EVENT_TYPE_METADATA_SET: + event = MetadataSetEvent(); + break; + case EVENT_TYPE_METADATA_REMOVE: + event = MetadataRemoveEvent(); + break; + case EVENT_TYPE_AIO_WRITESAME: + event = AioWriteSameEvent(); + break; + case EVENT_TYPE_AIO_COMPARE_AND_WRITE: + event = AioCompareAndWriteEvent(); + break; + default: + event = UnknownEvent(); + break; + } + + boost::apply_visitor(DecodeVisitor(struct_v, it), event); + DECODE_FINISH(it); + if (struct_v >= 4) { + decode_metadata(it); + } +} + +void EventEntry::dump(Formatter *f) const { + boost::apply_visitor(DumpVisitor(f, "event_type"), event); + f->dump_stream("timestamp") << timestamp; +} + +void EventEntry::encode_metadata(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(timestamp, bl); + ENCODE_FINISH(bl); +} + +void EventEntry::decode_metadata(bufferlist::const_iterator& it) { + DECODE_START(1, it); + decode(timestamp, it); + DECODE_FINISH(it); +} + +void EventEntry::generate_test_instances(std::list<EventEntry *> &o) { + o.push_back(new EventEntry(AioDiscardEvent())); + o.push_back(new EventEntry(AioDiscardEvent(123, 345, 4096), utime_t(1, 1))); + + bufferlist bl; + bl.append(std::string(32, '1')); + o.push_back(new EventEntry(AioWriteEvent())); + o.push_back(new EventEntry(AioWriteEvent(123, 456, bl), utime_t(1, 1))); + + o.push_back(new EventEntry(AioFlushEvent())); + + o.push_back(new EventEntry(OpFinishEvent(123, -1), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapCreateEvent(), utime_t(1, 1))); + o.push_back(new EventEntry(SnapCreateEvent(234, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new 
EventEntry(SnapRemoveEvent())); + o.push_back(new EventEntry(SnapRemoveEvent(345, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapRenameEvent())); + o.push_back(new EventEntry(SnapRenameEvent(456, 1, "src snap", "dest snap"), + utime_t(1, 1))); + + o.push_back(new EventEntry(SnapProtectEvent())); + o.push_back(new EventEntry(SnapProtectEvent(567, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapUnprotectEvent())); + o.push_back(new EventEntry(SnapUnprotectEvent(678, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapRollbackEvent())); + o.push_back(new EventEntry(SnapRollbackEvent(789, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(RenameEvent())); + o.push_back(new EventEntry(RenameEvent(890, "image name"), utime_t(1, 1))); + + o.push_back(new EventEntry(ResizeEvent())); + o.push_back(new EventEntry(ResizeEvent(901, 1234), utime_t(1, 1))); + + o.push_back(new EventEntry(FlattenEvent(123), utime_t(1, 1))); + + o.push_back(new EventEntry(DemotePromoteEvent())); + + o.push_back(new EventEntry(UpdateFeaturesEvent())); + o.push_back(new EventEntry(UpdateFeaturesEvent(123, 127, true), utime_t(1, 1))); + + o.push_back(new EventEntry(MetadataSetEvent())); + o.push_back(new EventEntry(MetadataSetEvent(123, "key", "value"), utime_t(1, 1))); + + o.push_back(new EventEntry(MetadataRemoveEvent())); + o.push_back(new EventEntry(MetadataRemoveEvent(123, "key"), utime_t(1, 1))); +} + +// Journal Client + +void ImageClientMeta::encode(bufferlist& bl) const { + using ceph::encode; + encode(tag_class, bl); + encode(resync_requested, bl); +} + +void ImageClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(tag_class, it); + decode(resync_requested, it); +} + +void ImageClientMeta::dump(Formatter *f) const { + f->dump_unsigned("tag_class", tag_class); + 
f->dump_bool("resync_requested", resync_requested); +} + +void MirrorPeerSyncPoint::encode(bufferlist& bl) const { + using ceph::encode; + encode(snap_name, bl); + encode(from_snap_name, bl); + encode(object_number, bl); + encode(snap_namespace, bl); +} + +void MirrorPeerSyncPoint::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(snap_name, it); + decode(from_snap_name, it); + decode(object_number, it); + if (version >= 2) { + decode(snap_namespace, it); + } +} + +void MirrorPeerSyncPoint::dump(Formatter *f) const { + f->dump_string("snap_name", snap_name); + f->dump_string("from_snap_name", from_snap_name); + if (object_number) { + f->dump_unsigned("object_number", *object_number); + } + snap_namespace.dump(f); +} + +void MirrorPeerClientMeta::encode(bufferlist& bl) const { + using ceph::encode; + encode(image_id, bl); + encode(static_cast<uint32_t>(state), bl); + encode(sync_object_count, bl); + encode(static_cast<uint32_t>(sync_points.size()), bl); + for (auto &sync_point : sync_points) { + sync_point.encode(bl); + } + encode(snap_seqs, bl); +} + +void MirrorPeerClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(image_id, it); + + uint32_t decode_state; + decode(decode_state, it); + state = static_cast<MirrorPeerState>(decode_state); + + decode(sync_object_count, it); + + uint32_t sync_point_count; + decode(sync_point_count, it); + sync_points.resize(sync_point_count); + for (auto &sync_point : sync_points) { + sync_point.decode(version, it); + } + + decode(snap_seqs, it); +} + +void MirrorPeerClientMeta::dump(Formatter *f) const { + f->dump_string("image_id", image_id); + f->dump_stream("state") << state; + f->dump_unsigned("sync_object_count", sync_object_count); + f->open_array_section("sync_points"); + for (auto &sync_point : sync_points) { + f->open_object_section("sync_point"); + sync_point.dump(f); + f->close_section(); + } + f->close_section(); + 
f->open_array_section("snap_seqs"); + for (auto &pair : snap_seqs) { + f->open_object_section("snap_seq"); + f->dump_unsigned("local_snap_seq", pair.first); + f->dump_unsigned("peer_snap_seq", pair.second); + f->close_section(); + } + f->close_section(); +} + +void CliClientMeta::encode(bufferlist& bl) const { +} + +void CliClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void CliClientMeta::dump(Formatter *f) const { +} + +void UnknownClientMeta::encode(bufferlist& bl) const { + ceph_abort(); +} + +void UnknownClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void UnknownClientMeta::dump(Formatter *f) const { +} + +ClientMetaType ClientData::get_client_meta_type() const { + return boost::apply_visitor(GetTypeVisitor<ClientMetaType>(), client_meta); +} + +void ClientData::encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + boost::apply_visitor(EncodeVisitor(bl), client_meta); + ENCODE_FINISH(bl); +} + +void ClientData::decode(bufferlist::const_iterator& it) { + DECODE_START(1, it); + + uint32_t client_meta_type; + decode(client_meta_type, it); + + // select the correct payload variant based upon the encoded op + switch (client_meta_type) { + case IMAGE_CLIENT_META_TYPE: + client_meta = ImageClientMeta(); + break; + case MIRROR_PEER_CLIENT_META_TYPE: + client_meta = MirrorPeerClientMeta(); + break; + case CLI_CLIENT_META_TYPE: + client_meta = CliClientMeta(); + break; + default: + client_meta = UnknownClientMeta(); + break; + } + + boost::apply_visitor(DecodeVisitor(struct_v, it), client_meta); + DECODE_FINISH(it); +} + +void ClientData::dump(Formatter *f) const { + boost::apply_visitor(DumpVisitor(f, "client_meta_type"), client_meta); +} + +void ClientData::generate_test_instances(std::list<ClientData *> &o) { + o.push_back(new ClientData(ImageClientMeta())); + o.push_back(new ClientData(ImageClientMeta(123))); + o.push_back(new ClientData(MirrorPeerClientMeta())); + o.push_back(new 
ClientData(MirrorPeerClientMeta("image_id", + {{{}, "snap 2", "snap 1", 123}}, + {{1, 2}, {3, 4}}))); + o.push_back(new ClientData(CliClientMeta())); +} + +// Journal Tag + +void TagPredecessor::encode(bufferlist& bl) const { + using ceph::encode; + encode(mirror_uuid, bl); + encode(commit_valid, bl); + encode(tag_tid, bl); + encode(entry_tid, bl); +} + +void TagPredecessor::decode(bufferlist::const_iterator& it) { + using ceph::decode; + decode(mirror_uuid, it); + decode(commit_valid, it); + decode(tag_tid, it); + decode(entry_tid, it); +} + +void TagPredecessor::dump(Formatter *f) const { + f->dump_string("mirror_uuid", mirror_uuid); + f->dump_string("commit_valid", commit_valid ? "true" : "false"); + f->dump_unsigned("tag_tid", tag_tid); + f->dump_unsigned("entry_tid", entry_tid); +} + +void TagData::encode(bufferlist& bl) const { + using ceph::encode; + encode(mirror_uuid, bl); + predecessor.encode(bl); +} + +void TagData::decode(bufferlist::const_iterator& it) { + using ceph::decode; + decode(mirror_uuid, it); + predecessor.decode(it); +} + +void TagData::dump(Formatter *f) const { + f->dump_string("mirror_uuid", mirror_uuid); + f->open_object_section("predecessor"); + predecessor.dump(f); + f->close_section(); +} + +void TagData::generate_test_instances(std::list<TagData *> &o) { + o.push_back(new TagData()); + o.push_back(new TagData("mirror-uuid")); + o.push_back(new TagData("mirror-uuid", "remote-mirror-uuid", true, 123, 234)); +} + +// Streams a readable name for a journal event type; any value without a +// case below is printed as "Unknown (<numeric value>)". +std::ostream &operator<<(std::ostream &out, const EventType &type) { + using namespace librbd::journal; + + switch (type) { + case EVENT_TYPE_AIO_DISCARD: + out << "AioDiscard"; + break; + case EVENT_TYPE_AIO_WRITE: + out << "AioWrite"; + break; + case EVENT_TYPE_AIO_FLUSH: + out << "AioFlush"; + break; + case EVENT_TYPE_OP_FINISH: + out << "OpFinish"; + break; + case EVENT_TYPE_SNAP_CREATE: + out << "SnapCreate"; + break; + case EVENT_TYPE_SNAP_REMOVE: + out << "SnapRemove"; + break; + case EVENT_TYPE_SNAP_RENAME: + out << 
"SnapRename"; + break; + case EVENT_TYPE_SNAP_PROTECT: + out << "SnapProtect"; + break; + case EVENT_TYPE_SNAP_UNPROTECT: + out << "SnapUnprotect"; + break; + case EVENT_TYPE_SNAP_ROLLBACK: + out << "SnapRollback"; + break; + case EVENT_TYPE_RENAME: + out << "Rename"; + break; + case EVENT_TYPE_RESIZE: + out << "Resize"; + break; + case EVENT_TYPE_FLATTEN: + out << "Flatten"; + break; + case EVENT_TYPE_DEMOTE_PROMOTE: + out << "Demote/Promote"; + break; + case EVENT_TYPE_SNAP_LIMIT: + // EVENT_TYPE_SNAP_LIMIT is a declared enumerator with its own event struct; + // name it here instead of falling through to "Unknown". + out << "SnapLimit"; + break; + case EVENT_TYPE_UPDATE_FEATURES: + out << "UpdateFeatures"; + break; + case EVENT_TYPE_METADATA_SET: + out << "MetadataSet"; + break; + case EVENT_TYPE_METADATA_REMOVE: + out << "MetadataRemove"; + break; + case EVENT_TYPE_AIO_WRITESAME: + out << "AioWriteSame"; + break; + case EVENT_TYPE_AIO_COMPARE_AND_WRITE: + out << "AioCompareAndWrite"; + break; + default: + out << "Unknown (" << static_cast<uint32_t>(type) << ")"; + break; + } + return out; +} + +std::ostream &operator<<(std::ostream &out, const ClientMetaType &type) { + using namespace librbd::journal; + + switch (type) { + case IMAGE_CLIENT_META_TYPE: + out << "Master Image"; + break; + case MIRROR_PEER_CLIENT_META_TYPE: + out << "Mirror Peer"; + break; + case CLI_CLIENT_META_TYPE: + out << "CLI Tool"; + break; + default: + out << "Unknown (" << static_cast<uint32_t>(type) << ")"; + break; + } + return out; +} + +std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta) { + out << "[tag_class=" << meta.tag_class << "]"; + return out; +} + +// Streams a sync point as a bracketed key=value list; object_number is only +// printed when the optional holds a value. +std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync) { + out << "[snap_name=" << sync.snap_name << ", " + << "from_snap_name=" << sync.from_snap_name; + if (sync.object_number) { + out << ", object_number=" << *sync.object_number; + } + out << "]"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const MirrorPeerState &state) { + switch (state) { + case MIRROR_PEER_STATE_SYNCING: + out << "Syncing"; + break; + case MIRROR_PEER_STATE_REPLAYING: + out << "Replaying"; + break; + 
default: + out << "Unknown (" << static_cast<uint32_t>(state) << ")"; + break; + } + return out; +} + +// Streams the peer client metadata as a bracketed key=value list, followed by +// one bracketed entry per sync point and per local/peer snap sequence pair. +std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta) { + out << "[image_id=" << meta.image_id << ", " + << "state=" << meta.state << ", " + << "sync_object_count=" << meta.sync_object_count << ", " + << "sync_points=["; + std::string delimiter; + for (auto &sync_point : meta.sync_points) { + out << delimiter << "[" << sync_point << "]"; + delimiter = ", "; + } + out << "], snap_seqs=["; + delimiter = ""; + for (auto &pair : meta.snap_seqs) { + // both halves of the mapping use key=value form so the label and the + // number cannot run together in the output + out << delimiter << "[" + << "local_snap_seq=" << pair.first << ", " + << "peer_snap_seq=" << pair.second << "]"; + delimiter = ", "; + } + out << "]"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor) { + out << "[" + << "mirror_uuid=" << predecessor.mirror_uuid; + if (predecessor.commit_valid) { + out << ", " + << "tag_tid=" << predecessor.tag_tid << ", " + << "entry_tid=" << predecessor.entry_tid; + } + out << "]"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const TagData &tag_data) { + out << "[" + << "mirror_uuid=" << tag_data.mirror_uuid << ", " + << "predecessor=" << tag_data.predecessor + << "]"; + return out; +} + +} // namespace journal +} // namespace librbd + diff --git a/src/librbd/journal/Types.h b/src/librbd/journal/Types.h new file mode 100644 index 00000000..ae5681ad --- /dev/null +++ b/src/librbd/journal/Types.h @@ -0,0 +1,685 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_TYPES_H +#define CEPH_LIBRBD_JOURNAL_TYPES_H + +#include "cls/rbd/cls_rbd_types.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/types.h" +#include "include/utime.h" +#include "librbd/Types.h" +#include <iosfwd> +#include <list> +#include <boost/none.hpp> +#include <boost/optional.hpp> +#include 
<boost/variant.hpp> +#include <boost/mpl/vector.hpp> + +namespace ceph { +class Formatter; +} + +namespace librbd { +namespace journal { + +enum EventType { + EVENT_TYPE_AIO_DISCARD = 0, + EVENT_TYPE_AIO_WRITE = 1, + EVENT_TYPE_AIO_FLUSH = 2, + EVENT_TYPE_OP_FINISH = 3, + EVENT_TYPE_SNAP_CREATE = 4, + EVENT_TYPE_SNAP_REMOVE = 5, + EVENT_TYPE_SNAP_RENAME = 6, + EVENT_TYPE_SNAP_PROTECT = 7, + EVENT_TYPE_SNAP_UNPROTECT = 8, + EVENT_TYPE_SNAP_ROLLBACK = 9, + EVENT_TYPE_RENAME = 10, + EVENT_TYPE_RESIZE = 11, + EVENT_TYPE_FLATTEN = 12, + EVENT_TYPE_DEMOTE_PROMOTE = 13, + EVENT_TYPE_SNAP_LIMIT = 14, + EVENT_TYPE_UPDATE_FEATURES = 15, + EVENT_TYPE_METADATA_SET = 16, + EVENT_TYPE_METADATA_REMOVE = 17, + EVENT_TYPE_AIO_WRITESAME = 18, + EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19, +}; + +struct AioDiscardEvent { + static const EventType TYPE = EVENT_TYPE_AIO_DISCARD; + + uint64_t offset = 0; + uint64_t length = 0; + uint32_t discard_granularity_bytes = 0; + + AioDiscardEvent() { + } + AioDiscardEvent(uint64_t _offset, uint64_t _length, + uint32_t discard_granularity_bytes) + : offset(_offset), length(_length), + discard_granularity_bytes(discard_granularity_bytes) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct AioWriteEvent { + static const EventType TYPE = EVENT_TYPE_AIO_WRITE; + + uint64_t offset; + uint64_t length; + bufferlist data; + + static uint32_t get_fixed_size(); + + AioWriteEvent() : offset(0), length(0) { + } + AioWriteEvent(uint64_t _offset, uint64_t _length, const bufferlist &_data) + : offset(_offset), length(_length), data(_data) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct AioWriteSameEvent { + static const EventType TYPE = EVENT_TYPE_AIO_WRITESAME; + + uint64_t offset; + uint64_t length; + bufferlist data; + + AioWriteSameEvent() : offset(0), length(0) 
{ + } + AioWriteSameEvent(uint64_t _offset, uint64_t _length, + const bufferlist &_data) + : offset(_offset), length(_length), data(_data) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct AioCompareAndWriteEvent { + static const EventType TYPE = EVENT_TYPE_AIO_COMPARE_AND_WRITE; + + uint64_t offset; + uint64_t length; + bufferlist cmp_data; + bufferlist write_data; + + static uint32_t get_fixed_size(); + + AioCompareAndWriteEvent() : offset(0), length(0) { + } + AioCompareAndWriteEvent(uint64_t _offset, uint64_t _length, + const bufferlist &_cmp_data, const bufferlist &_write_data) + : offset(_offset), length(_length), cmp_data(_cmp_data), write_data(_write_data) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct AioFlushEvent { + static const EventType TYPE = EVENT_TYPE_AIO_FLUSH; + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct OpEventBase { + uint64_t op_tid; + +protected: + OpEventBase() : op_tid(0) { + } + OpEventBase(uint64_t op_tid) : op_tid(op_tid) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct OpFinishEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_OP_FINISH; + + int r; + + OpFinishEvent() : r(0) { + } + OpFinishEvent(uint64_t op_tid, int r) : OpEventBase(op_tid), r(r) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapEventBase : public OpEventBase { + cls::rbd::SnapshotNamespace snap_namespace; + std::string snap_name; + +protected: + SnapEventBase() { + } + SnapEventBase(uint64_t op_tid, const cls::rbd::SnapshotNamespace& 
_snap_namespace, + const std::string &_snap_name) + : OpEventBase(op_tid), + snap_namespace(_snap_namespace), + snap_name(_snap_name) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapCreateEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_CREATE; + + SnapCreateEvent() { + } + SnapCreateEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapRemoveEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_REMOVE; + + SnapRemoveEvent() { + } + SnapRemoveEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + using SnapEventBase::encode; + using SnapEventBase::decode; + using SnapEventBase::dump; +}; + +struct SnapRenameEvent : public OpEventBase{ + static const EventType TYPE = EVENT_TYPE_SNAP_RENAME; + + uint64_t snap_id; + std::string src_snap_name; + std::string dst_snap_name; + + SnapRenameEvent() : snap_id(CEPH_NOSNAP) { + } + SnapRenameEvent(uint64_t op_tid, uint64_t src_snap_id, + const std::string &src_snap_name, + const std::string &dest_snap_name) + : OpEventBase(op_tid), + snap_id(src_snap_id), + src_snap_name(src_snap_name), + dst_snap_name(dest_snap_name) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapProtectEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_PROTECT; + + SnapProtectEvent() { + } + SnapProtectEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace, + const 
std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + using SnapEventBase::encode; + using SnapEventBase::decode; + using SnapEventBase::dump; +}; + +struct SnapUnprotectEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_UNPROTECT; + + SnapUnprotectEvent() { + } + SnapUnprotectEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + using SnapEventBase::encode; + using SnapEventBase::decode; + using SnapEventBase::dump; +}; + +// Op event of type EVENT_TYPE_SNAP_LIMIT carrying a single 64-bit limit value. +struct SnapLimitEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_LIMIT; + uint64_t limit; + + // zero-initialize so a default-constructed event never holds an + // indeterminate limit (same convention as ResizeEvent's size) + SnapLimitEvent() : limit(0) { + } + SnapLimitEvent(uint64_t op_tid, const uint64_t _limit) + : OpEventBase(op_tid), limit(_limit) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapRollbackEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_ROLLBACK; + + SnapRollbackEvent() { + } + SnapRollbackEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + using SnapEventBase::encode; + using SnapEventBase::decode; + using SnapEventBase::dump; +}; + +struct RenameEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_RENAME; + + std::string image_name; + + RenameEvent() { + } + RenameEvent(uint64_t op_tid, const std::string &_image_name) + : OpEventBase(op_tid), image_name(_image_name) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct ResizeEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_RESIZE; + + uint64_t size; + + ResizeEvent() : size(0) { + } + ResizeEvent(uint64_t op_tid, uint64_t 
_size) + : OpEventBase(op_tid), size(_size) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct FlattenEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_FLATTEN; + + FlattenEvent() { + } + FlattenEvent(uint64_t op_tid) : OpEventBase(op_tid) { + } + + using OpEventBase::encode; + using OpEventBase::decode; + using OpEventBase::dump; +}; + +struct DemotePromoteEvent { + static const EventType TYPE = static_cast<EventType>( + EVENT_TYPE_DEMOTE_PROMOTE); + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct UpdateFeaturesEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_UPDATE_FEATURES; + + uint64_t features; + bool enabled; + + UpdateFeaturesEvent() : features(0), enabled(false) { + } + UpdateFeaturesEvent(uint64_t op_tid, uint64_t _features, bool _enabled) + : OpEventBase(op_tid), features(_features), enabled(_enabled) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct MetadataSetEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_METADATA_SET; + + string key; + string value; + + MetadataSetEvent() { + } + MetadataSetEvent(uint64_t op_tid, const string &_key, const string &_value) + : OpEventBase(op_tid), key(_key), value(_value) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct MetadataRemoveEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_METADATA_REMOVE; + + string key; + + MetadataRemoveEvent() { + } + MetadataRemoveEvent(uint64_t op_tid, const string &_key) + : OpEventBase(op_tid), key(_key) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, 
bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct UnknownEvent { + static const EventType TYPE = static_cast<EventType>(-1); + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +typedef boost::mpl::vector<AioDiscardEvent, + AioWriteEvent, + AioFlushEvent, + OpFinishEvent, + SnapCreateEvent, + SnapRemoveEvent, + SnapRenameEvent, + SnapProtectEvent, + SnapUnprotectEvent, + SnapRollbackEvent, + RenameEvent, + ResizeEvent, + FlattenEvent, + DemotePromoteEvent, + SnapLimitEvent, + UpdateFeaturesEvent, + MetadataSetEvent, + MetadataRemoveEvent, + AioWriteSameEvent, + AioCompareAndWriteEvent, + UnknownEvent> EventVector; +typedef boost::make_variant_over<EventVector>::type Event; + +struct EventEntry { + static uint32_t get_fixed_size() { + return EVENT_FIXED_SIZE + METADATA_FIXED_SIZE; + } + + EventEntry() : event(UnknownEvent()) { + } + EventEntry(const Event &_event, const utime_t &_timestamp = utime_t()) + : event(_event), timestamp(_timestamp) { + } + + Event event; + utime_t timestamp; + + EventType get_event_type() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list<EventEntry *> &o); + +private: + static const uint32_t EVENT_FIXED_SIZE = 14; /// version encoding, type + static const uint32_t METADATA_FIXED_SIZE = 14; /// version encoding, timestamp + + void encode_metadata(bufferlist& bl) const; + void decode_metadata(bufferlist::const_iterator& it); +}; + +// Journal Client data structures + +enum ClientMetaType { + IMAGE_CLIENT_META_TYPE = 0, + MIRROR_PEER_CLIENT_META_TYPE = 1, + CLI_CLIENT_META_TYPE = 2 +}; + +struct ImageClientMeta { + static const ClientMetaType TYPE = IMAGE_CLIENT_META_TYPE; + + uint64_t tag_class = 0; + bool resync_requested = false; + + ImageClientMeta() { + } + ImageClientMeta(uint64_t tag_class) : 
tag_class(tag_class) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct MirrorPeerSyncPoint { + typedef boost::optional<uint64_t> ObjectNumber; + + cls::rbd::SnapshotNamespace snap_namespace; + std::string snap_name; + std::string from_snap_name; + ObjectNumber object_number; + + MirrorPeerSyncPoint() : MirrorPeerSyncPoint({}, "", "", boost::none) { + } + MirrorPeerSyncPoint(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + const ObjectNumber &object_number) + : MirrorPeerSyncPoint(snap_namespace, snap_name, "", object_number) { + } + MirrorPeerSyncPoint(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + const std::string &from_snap_name, + const ObjectNumber &object_number) + : snap_namespace(snap_namespace), snap_name(snap_name), + from_snap_name(from_snap_name), object_number(object_number) { + } + + inline bool operator==(const MirrorPeerSyncPoint &sync) const { + return (snap_name == sync.snap_name && + from_snap_name == sync.from_snap_name && + object_number == sync.object_number && + snap_namespace == sync.snap_namespace); + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +enum MirrorPeerState { + MIRROR_PEER_STATE_SYNCING, + MIRROR_PEER_STATE_REPLAYING +}; + +struct MirrorPeerClientMeta { + typedef std::list<MirrorPeerSyncPoint> SyncPoints; + + static const ClientMetaType TYPE = MIRROR_PEER_CLIENT_META_TYPE; + + std::string image_id; + MirrorPeerState state = MIRROR_PEER_STATE_SYNCING; ///< replay state + uint64_t sync_object_count = 0; ///< maximum number of objects ever sync'ed + SyncPoints sync_points; ///< max two in-use snapshots for sync + SnapSeqs snap_seqs; ///< local to peer snap seq mapping + + MirrorPeerClientMeta() { + } + MirrorPeerClientMeta(const std::string &image_id, + const 
SyncPoints &sync_points = SyncPoints(), + const SnapSeqs &snap_seqs = SnapSeqs()) + : image_id(image_id), sync_points(sync_points), snap_seqs(snap_seqs) { + } + + inline bool operator==(const MirrorPeerClientMeta &meta) const { + return (image_id == meta.image_id && + state == meta.state && + sync_object_count == meta.sync_object_count && + sync_points == meta.sync_points && + snap_seqs == meta.snap_seqs); + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct CliClientMeta { + static const ClientMetaType TYPE = CLI_CLIENT_META_TYPE; + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct UnknownClientMeta { + static const ClientMetaType TYPE = static_cast<ClientMetaType>(-1); + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +typedef boost::variant<ImageClientMeta, + MirrorPeerClientMeta, + CliClientMeta, + UnknownClientMeta> ClientMeta; + +struct ClientData { + ClientData() { + } + ClientData(const ClientMeta &client_meta) : client_meta(client_meta) { + } + + ClientMeta client_meta; + + ClientMetaType get_client_meta_type() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list<ClientData *> &o); +}; + +// Journal Tag data structures + +struct TagPredecessor { + std::string mirror_uuid; // empty if local + bool commit_valid = false; + uint64_t tag_tid = 0; + uint64_t entry_tid = 0; + + TagPredecessor() { + } + TagPredecessor(const std::string &mirror_uuid, bool commit_valid, + uint64_t tag_tid, uint64_t entry_tid) + : mirror_uuid(mirror_uuid), commit_valid(commit_valid), tag_tid(tag_tid), + entry_tid(entry_tid) { + } + + inline bool operator==(const TagPredecessor &rhs) 
const { + return (mirror_uuid == rhs.mirror_uuid && + commit_valid == rhs.commit_valid && + tag_tid == rhs.tag_tid && + entry_tid == rhs.entry_tid); + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct TagData { + // owner of the tag (exclusive lock epoch) + std::string mirror_uuid; // empty if local + + // mapping to last committed record of previous tag + TagPredecessor predecessor; + + TagData() { + } + TagData(const std::string &mirror_uuid) : mirror_uuid(mirror_uuid) { + } + TagData(const std::string &mirror_uuid, + const std::string &predecessor_mirror_uuid, + bool predecessor_commit_valid, + uint64_t predecessor_tag_tid, uint64_t predecessor_entry_tid) + : mirror_uuid(mirror_uuid), + predecessor(predecessor_mirror_uuid, predecessor_commit_valid, + predecessor_tag_tid, predecessor_entry_tid) { + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list<TagData *> &o); +}; + +std::ostream &operator<<(std::ostream &out, const EventType &type); +std::ostream &operator<<(std::ostream &out, const ClientMetaType &type); +std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta); +std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync); +std::ostream &operator<<(std::ostream &out, const MirrorPeerState &meta); +std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta); +std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor); +std::ostream &operator<<(std::ostream &out, const TagData &tag_data); + +struct Listener { + virtual ~Listener() { + } + + /// invoked when journal close is requested + virtual void handle_close() = 0; + + /// invoked when journal is promoted to primary + virtual void handle_promoted() = 0; + + /// invoked when journal resync is requested + virtual void handle_resync() = 
0; +}; + +WRITE_CLASS_ENCODER(EventEntry); +WRITE_CLASS_ENCODER(ClientData); +WRITE_CLASS_ENCODER(TagData); + +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_TYPES_H diff --git a/src/librbd/journal/Utils.cc b/src/librbd/journal/Utils.cc new file mode 100644 index 00000000..1721a9b2 --- /dev/null +++ b/src/librbd/journal/Utils.cc @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/journal/Types.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::" + +namespace librbd { +namespace journal { +namespace util { + +int C_DecodeTag::decode(bufferlist::const_iterator *it, TagData *tag_data) { + try { + using ceph::decode; + decode(*tag_data, *it); + } catch (const buffer::error &err) { + return -EBADMSG; + } + return 0; +} + +int C_DecodeTag::process(int r) { + if (r < 0) { + lderr(cct) << "C_DecodeTag: " << this << " " << __func__ << ": " + << "failed to allocate tag: " << cpp_strerror(r) + << dendl; + return r; + } + + Mutex::Locker locker(*lock); + *tag_tid = tag.tid; + + auto data_it = tag.data.cbegin(); + r = decode(&data_it, tag_data); + if (r < 0) { + lderr(cct) << "C_DecodeTag: " << this << " " << __func__ << ": " + << "failed to decode allocated tag" << dendl; + return r; + } + + ldout(cct, 20) << "C_DecodeTag: " << this << " " << __func__ << ": " + << "allocated journal tag: " + << "tid=" << tag.tid << ", " + << "data=" << *tag_data << dendl; + return 0; +} + +int C_DecodeTags::process(int r) { + if (r < 0) { + lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": " + << "failed to retrieve journal tags: " << cpp_strerror(r) + << dendl; + return r; + } + + if (tags.empty()) { + lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": " + << "no journal tags retrieved" << dendl; + return 
-ENOENT; + } + + Mutex::Locker locker(*lock); + *tag_tid = tags.back().tid; + auto data_it = tags.back().data.cbegin(); + r = C_DecodeTag::decode(&data_it, tag_data); + if (r < 0) { + lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": " + << "failed to decode journal tag" << dendl; + return r; + } + + ldout(cct, 20) << "C_DecodeTags: " << this << " " << __func__ << ": " + << "most recent journal tag: " + << "tid=" << *tag_tid << ", " + << "data=" << *tag_data << dendl; + return 0; +} + +} // namespace util +} // namespace journal +} // namespace librbd diff --git a/src/librbd/journal/Utils.h b/src/librbd/journal/Utils.h new file mode 100644 index 00000000..63d37c03 --- /dev/null +++ b/src/librbd/journal/Utils.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_UTILS_H +#define CEPH_LIBRBD_JOURNAL_UTILS_H + +#include "include/int_types.h" +#include "include/Context.h" +#include "cls/journal/cls_journal_types.h" +#include <list> + +struct CephContext; +struct Mutex; + +namespace librbd { +namespace journal { + +struct TagData; + +namespace util { + +struct C_DecodeTag : public Context { + CephContext *cct; + Mutex *lock; + uint64_t *tag_tid; + TagData *tag_data; + Context *on_finish; + + cls::journal::Tag tag; + + C_DecodeTag(CephContext *cct, Mutex *lock, uint64_t *tag_tid, + TagData *tag_data, Context *on_finish) + : cct(cct), lock(lock), tag_tid(tag_tid), tag_data(tag_data), + on_finish(on_finish) { + } + + void complete(int r) override { + on_finish->complete(process(r)); + Context::complete(0); + } + void finish(int r) override { + } + + int process(int r); + + static int decode(bufferlist::const_iterator *it, TagData *tag_data); + +}; + +struct C_DecodeTags : public Context { + typedef std::list<cls::journal::Tag> Tags; + + CephContext *cct; + Mutex *lock; + uint64_t *tag_tid; + TagData *tag_data; + Context *on_finish; + + Tags tags; + + 
C_DecodeTags(CephContext *cct, Mutex *lock, uint64_t *tag_tid, + TagData *tag_data, Context *on_finish) + : cct(cct), lock(lock), tag_tid(tag_tid), tag_data(tag_data), + on_finish(on_finish) { + } + + void complete(int r) override { + on_finish->complete(process(r)); + Context::complete(0); + } + void finish(int r) override { + } + + int process(int r); +}; + +} // namespace util +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_UTILS_H diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc new file mode 100644 index 00000000..ccab8893 --- /dev/null +++ b/src/librbd/librbd.cc @@ -0,0 +1,6356 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include "include/int_types.h" + +#include <errno.h> + +#include "common/dout.h" +#include "common/errno.h" +#include "common/TracepointProvider.h" +#include "include/Context.h" + +#include "cls/rbd/cls_rbd_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Operations.h" +#include "librbd/api/Config.h" +#include "librbd/api/DiffIterate.h" +#include "librbd/api/Group.h" +#include "librbd/api/Image.h" +#include "librbd/api/Migration.h" +#include "librbd/api/Mirror.h" +#include "librbd/api/Namespace.h" +#include "librbd/api/Pool.h" +#include "librbd/api/PoolMetadata.h" +#include "librbd/api/Snapshot.h" +#include "librbd/api/Trash.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/io/ReadResult.h" +#include <algorithm> +#include <string> +#include <vector> + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/librbd.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) 
+#endif + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd: " + +using std::string; +using std::vector; + +using ceph::bufferlist; +using librados::snap_t; +using librados::IoCtx; + +namespace { + +TracepointProvider::Traits tracepoint_traits("librbd_tp.so", "rbd_tracing"); + +static auto create_write_raw(librbd::ImageCtx *ictx, const char *buf, + size_t len) { + // TODO: until librados can guarantee memory won't be referenced after + // it ACKs a request, always make a copy of the user-provided memory + return buffer::copy(buf, len); +} + +CephContext* get_cct(IoCtx &io_ctx) { + return reinterpret_cast<CephContext*>(io_ctx.cct()); +} + +librbd::io::AioCompletion* get_aio_completion(librbd::RBD::AioCompletion *comp) { + return reinterpret_cast<librbd::io::AioCompletion *>(comp->pc); +} + +struct C_AioCompletion : public Context { + CephContext *cct; + librbd::io::aio_type_t aio_type; + librbd::io::AioCompletion* aio_comp; + + C_AioCompletion(librbd::ImageCtx *ictx, librbd::io::aio_type_t aio_type, + librbd::io::AioCompletion* aio_comp) + : cct(ictx->cct), aio_type(aio_type), aio_comp(aio_comp) { + aio_comp->init_time(ictx, aio_type); + aio_comp->get(); + } + + void finish(int r) override { + ldout(cct, 20) << "C_AioComplete::finish: r=" << r << dendl; + if (r < 0) { + aio_comp->fail(r); + } else { + aio_comp->lock.Lock(); + aio_comp->complete(); + aio_comp->put_unlock(); + } + } +}; + +struct C_OpenComplete : public C_AioCompletion { + librbd::ImageCtx *ictx; + void **ictxp; + C_OpenComplete(librbd::ImageCtx *ictx, librbd::io::AioCompletion* comp, + void **ictxp) + : C_AioCompletion(ictx, librbd::io::AIO_TYPE_OPEN, comp), + ictx(ictx), ictxp(ictxp) { + } + void finish(int r) override { + ldout(ictx->cct, 20) << "C_OpenComplete::finish: r=" << r << dendl; + if (r < 0) { + *ictxp = nullptr; + } else { + *ictxp = ictx; + } + + C_AioCompletion::finish(r); + } +}; + +struct C_OpenAfterCloseComplete : public Context { + 
librbd::ImageCtx *ictx; + librbd::io::AioCompletion* comp; + void **ictxp; + C_OpenAfterCloseComplete(librbd::ImageCtx *ictx, + librbd::io::AioCompletion* comp, + void **ictxp) + : ictx(ictx), comp(comp), ictxp(ictxp) { + } + void finish(int r) override { + ldout(ictx->cct, 20) << "C_OpenAfterCloseComplete::finish: r=" << r + << dendl; + delete reinterpret_cast<librbd::ImageCtx*>(*ictxp); + *ictxp = nullptr; + + ictx->state->open(0, new C_OpenComplete(ictx, comp, ictxp)); + } +}; + +struct C_UpdateWatchCB : public librbd::UpdateWatchCtx { + rbd_update_callback_t watch_cb; + void *arg; + uint64_t handle = 0; + + C_UpdateWatchCB(rbd_update_callback_t watch_cb, void *arg) : + watch_cb(watch_cb), arg(arg) { + } + void handle_notify() override { + watch_cb(arg); + } +}; + +void group_image_status_cpp_to_c(const librbd::group_image_info_t &cpp_info, + rbd_group_image_info_t *c_info) { + c_info->name = strdup(cpp_info.name.c_str()); + c_info->pool = cpp_info.pool; + c_info->state = cpp_info.state; +} + +void group_info_cpp_to_c(const librbd::group_info_t &cpp_info, + rbd_group_info_t *c_info) { + c_info->name = strdup(cpp_info.name.c_str()); + c_info->pool = cpp_info.pool; +} + +void group_snap_info_cpp_to_c(const librbd::group_snap_info_t &cpp_info, + rbd_group_snap_info_t *c_info) { + c_info->name = strdup(cpp_info.name.c_str()); + c_info->state = cpp_info.state; +} + +void mirror_image_info_cpp_to_c(const librbd::mirror_image_info_t &cpp_info, + rbd_mirror_image_info_t *c_info) { + c_info->global_id = strdup(cpp_info.global_id.c_str()); + c_info->state = cpp_info.state; + c_info->primary = cpp_info.primary; +} + +void mirror_image_status_cpp_to_c(const librbd::mirror_image_status_t &cpp_status, + rbd_mirror_image_status_t *c_status) { + c_status->name = strdup(cpp_status.name.c_str()); + mirror_image_info_cpp_to_c(cpp_status.info, &c_status->info); + c_status->state = cpp_status.state; + c_status->description = strdup(cpp_status.description.c_str()); + 
c_status->last_update = cpp_status.last_update; + c_status->up = cpp_status.up; +} + +void trash_image_info_cpp_to_c(const librbd::trash_image_info_t &cpp_info, + rbd_trash_image_info_t *c_info) { + c_info->id = strdup(cpp_info.id.c_str()); + c_info->name = strdup(cpp_info.name.c_str()); + c_info->source = cpp_info.source; + c_info->deletion_time = cpp_info.deletion_time; + c_info->deferment_end_time = cpp_info.deferment_end_time; +} + +void config_option_cpp_to_c(const librbd::config_option_t &cpp_option, + rbd_config_option_t *c_option) { + c_option->name = strdup(cpp_option.name.c_str()); + c_option->value = strdup(cpp_option.value.c_str()); + c_option->source = cpp_option.source; +} + +void config_option_cleanup(rbd_config_option_t &option) { + free(option.name); + free(option.value); +} + +struct C_MirrorImageGetInfo : public Context { + rbd_mirror_image_info_t *mirror_image_info; + Context *on_finish; + + librbd::mirror_image_info_t cpp_mirror_image_info; + + C_MirrorImageGetInfo(rbd_mirror_image_info_t *mirror_image_info, + Context *on_finish) + : mirror_image_info(mirror_image_info), on_finish(on_finish) { + } + + void finish(int r) override { + if (r < 0) { + on_finish->complete(r); + return; + } + + mirror_image_info_cpp_to_c(cpp_mirror_image_info, mirror_image_info); + on_finish->complete(0); + } +}; + +struct C_MirrorImageGetStatus : public Context { + rbd_mirror_image_status_t *mirror_image_status; + Context *on_finish; + + librbd::mirror_image_status_t cpp_mirror_image_status; + + C_MirrorImageGetStatus(rbd_mirror_image_status_t *mirror_image_status, + Context *on_finish) + : mirror_image_status(mirror_image_status), on_finish(on_finish) { + } + + void finish(int r) override { + if (r < 0) { + on_finish->complete(r); + return; + } + + mirror_image_status_cpp_to_c(cpp_mirror_image_status, mirror_image_status); + on_finish->complete(0); + } +}; + +} // anonymous namespace + +namespace librbd { + ProgressContext::~ProgressContext() + { + } + + class 
CProgressContext : public ProgressContext + { + public: + CProgressContext(librbd_progress_fn_t fn, void *data) + : m_fn(fn), m_data(data) + { + } + int update_progress(uint64_t offset, uint64_t src_size) override + { + return m_fn(offset, src_size, m_data); + } + private: + librbd_progress_fn_t m_fn; + void *m_data; + }; + + /* + * Pool stats + */ + PoolStats::PoolStats() { + rbd_pool_stats_create(&pool_stats); + } + + PoolStats::~PoolStats() { + rbd_pool_stats_destroy(pool_stats); + } + + int PoolStats::add(rbd_pool_stat_option_t option, uint64_t* opt_val) { + return rbd_pool_stats_option_add_uint64(pool_stats, option, opt_val); + } + + /* + * RBD + */ + RBD::RBD() + { + } + + RBD::~RBD() + { + } + + void RBD::version(int *major, int *minor, int *extra) + { + rbd_version(major, minor, extra); + } + + int RBD::open(IoCtx& io_ctx, Image& image, const char *name) + { + return open(io_ctx, image, name, NULL); + } + + int RBD::open_by_id(IoCtx& io_ctx, Image& image, const char *id) + { + return open_by_id(io_ctx, image, id, nullptr); + } + + int RBD::open(IoCtx& io_ctx, Image& image, const char *name, + const char *snap_name) + { + ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + if (image.ctx != NULL) { + reinterpret_cast<ImageCtx*>(image.ctx)->state->close(); + image.ctx = NULL; + } + + int r = ictx->state->open(0); + if (r < 0) { + tracepoint(librbd, open_image_exit, r); + return r; + } + + image.ctx = (image_ctx_t) ictx; + tracepoint(librbd, open_image_exit, 0); + return 0; + } + + int RBD::open_by_id(IoCtx& io_ctx, Image& image, const char *id, + const char *snap_name) + { + ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, false); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, 
open_image_by_id_enter, ictx, ictx->id.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + if (image.ctx != nullptr) { + reinterpret_cast<ImageCtx*>(image.ctx)->state->close(); + image.ctx = nullptr; + } + + int r = ictx->state->open(0); + if (r < 0) { + tracepoint(librbd, open_image_by_id_exit, r); + return r; + } + + image.ctx = (image_ctx_t) ictx; + tracepoint(librbd, open_image_by_id_exit, 0); + return 0; + } + + int RBD::aio_open(IoCtx& io_ctx, Image& image, const char *name, + const char *snap_name, RBD::AioCompletion *c) + { + ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc); + + if (image.ctx != NULL) { + reinterpret_cast<ImageCtx*>(image.ctx)->state->close( + new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx)); + } else { + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c), + &image.ctx)); + } + tracepoint(librbd, aio_open_image_exit, 0); + return 0; + } + + int RBD::aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id, + const char *snap_name, RBD::AioCompletion *c) + { + ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, false); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, aio_open_image_by_id_enter, ictx, ictx->id.c_str(), + ictx->snap_name.c_str(), ictx->read_only, c->pc); + + if (image.ctx != nullptr) { + reinterpret_cast<ImageCtx*>(image.ctx)->state->close( + new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx)); + } else { + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c), + &image.ctx)); + } + tracepoint(librbd, aio_open_image_by_id_exit, 0); + return 0; + } + + int RBD::open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snap_name) + { + ImageCtx *ictx = new 
ImageCtx(name, "", snap_name, io_ctx, true); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + if (image.ctx != NULL) { + reinterpret_cast<ImageCtx*>(image.ctx)->state->close(); + image.ctx = NULL; + } + + int r = ictx->state->open(0); + if (r < 0) { + tracepoint(librbd, open_image_exit, r); + return r; + } + + image.ctx = (image_ctx_t) ictx; + tracepoint(librbd, open_image_exit, 0); + return 0; + } + + int RBD::open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snap_name) + { + ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, true); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, open_image_by_id_enter, ictx, ictx->id.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + if (image.ctx != nullptr) { + reinterpret_cast<ImageCtx*>(image.ctx)->state->close(); + image.ctx = nullptr; + } + + int r = ictx->state->open(0); + if (r < 0) { + tracepoint(librbd, open_image_by_id_exit, r); + return r; + } + + image.ctx = (image_ctx_t) ictx; + tracepoint(librbd, open_image_by_id_exit, 0); + return 0; + } + + int RBD::aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snap_name, RBD::AioCompletion *c) + { + ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, true); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc); + + if (image.ctx != NULL) { + reinterpret_cast<ImageCtx*>(image.ctx)->state->close( + new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx)); + } else { + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c), + &image.ctx)); + } + tracepoint(librbd, aio_open_image_exit, 0); + return 0; + } + + int 
RBD::aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snap_name, RBD::AioCompletion *c) + { + ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, true); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, aio_open_image_by_id_enter, ictx, ictx->id.c_str(), + ictx->snap_name.c_str(), ictx->read_only, c->pc); + + if (image.ctx != nullptr) { + reinterpret_cast<ImageCtx*>(image.ctx)->state->close( + new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx)); + } else { + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c), + &image.ctx)); + } + tracepoint(librbd, aio_open_image_by_id_exit, 0); + return 0; + } + + int RBD::create(IoCtx& io_ctx, const char *name, uint64_t size, int *order) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order); + int r = librbd::create(io_ctx, name, size, order); + tracepoint(librbd, create_exit, r, *order); + return r; + } + + int RBD::create2(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order); + int r = librbd::create(io_ctx, name, size, false, features, order, 0, 0); + tracepoint(librbd, create2_exit, r, *order); + return r; + } + + int RBD::create3(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order, uint64_t stripe_unit, + uint64_t stripe_count) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create3_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count); + int r = librbd::create(io_ctx, name, size, false, features, order, + stripe_unit, stripe_count); + 
tracepoint(librbd, create3_exit, r, *order); + return r; + } + + int RBD::create4(IoCtx& io_ctx, const char *name, uint64_t size, + ImageOptions& opts) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts.opts); + int r = librbd::create(io_ctx, name, "", size, opts, "", "", false); + tracepoint(librbd, create4_exit, r); + return r; + } + + int RBD::clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx)); + tracepoint(librbd, clone_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features); + int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name, + features, c_order, 0, 0); + tracepoint(librbd, clone_exit, r, *c_order); + return r; + } + + int RBD::clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order, uint64_t stripe_unit, int stripe_count) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx)); + tracepoint(librbd, clone2_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features, stripe_unit, stripe_count); + int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name, + features, c_order, stripe_unit, stripe_count); + tracepoint(librbd, clone2_exit, r, *c_order); + return r; + } + + int RBD::clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx)); + tracepoint(librbd, clone3_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, 
p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, c_opts.opts); + int r = librbd::clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, + nullptr, c_name, c_opts, "", ""); + tracepoint(librbd, clone3_exit, r); + return r; + } + + int RBD::remove(IoCtx& io_ctx, const char *name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Image<>::remove(io_ctx, name, prog_ctx); + tracepoint(librbd, remove_exit, r); + return r; + } + + int RBD::remove_with_progress(IoCtx& io_ctx, const char *name, + ProgressContext& pctx) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name); + int r = librbd::api::Image<>::remove(io_ctx, name, pctx); + tracepoint(librbd, remove_exit, r); + return r; + } + + int RBD::trash_move(IoCtx &io_ctx, const char *name, uint64_t delay) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_move_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + int r = librbd::api::Trash<>::move(io_ctx, RBD_TRASH_IMAGE_SOURCE_USER, + name, delay); + tracepoint(librbd, trash_move_exit, r); + return r; + } + + int RBD::trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info) { + return librbd::api::Trash<>::get(io_ctx, id, info); + } + + int RBD::trash_list(IoCtx &io_ctx, vector<trash_image_info_t> &entries) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_list_enter, + io_ctx.get_pool_name().c_str(), io_ctx.get_id()); + int r = librbd::api::Trash<>::list(io_ctx, entries, true); +#ifdef WITH_LTTNG + if (r >= 0) { + for (const auto& entry : entries) { + tracepoint(librbd, trash_list_entry, entry.id.c_str()); + } + } +#endif + tracepoint(librbd, 
trash_list_exit, r, r); + return r; + } + + int RBD::trash_remove(IoCtx &io_ctx, const char *image_id, bool force) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_id, force); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx); + tracepoint(librbd, trash_remove_exit, r); + return r; + } + + int RBD::trash_remove_with_progress(IoCtx &io_ctx, const char *image_id, + bool force, ProgressContext &pctx) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_id, force); + int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, pctx); + tracepoint(librbd, trash_remove_exit, r); + return r; + } + + int RBD::trash_restore(IoCtx &io_ctx, const char *id, const char *name) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_undelete_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), id, name); + int r = librbd::api::Trash<>::restore( + io_ctx, librbd::api::Trash<>::RESTORE_SOURCE_WHITELIST, id, name); + tracepoint(librbd, trash_undelete_exit, r); + return r; + } + + int RBD::trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), expire_ts, threshold); + NoOpProgressContext nop_pctx; + int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, nop_pctx); + tracepoint(librbd, trash_purge_exit, r); + return r; + } + + int RBD::trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, + float threshold, ProgressContext &pctx) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_purge_enter, 
io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), expire_ts, threshold); + int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, pctx); + tracepoint(librbd, trash_purge_exit, r); + return r; + } + + int RBD::namespace_create(IoCtx& io_ctx, const char *namespace_name) { + return librbd::api::Namespace<>::create(io_ctx, namespace_name); + } + + int RBD::namespace_remove(IoCtx& io_ctx, const char *namespace_name) { + return librbd::api::Namespace<>::remove(io_ctx, namespace_name); + } + + int RBD::namespace_list(IoCtx& io_ctx, + std::vector<std::string>* namespace_names) { + return librbd::api::Namespace<>::list(io_ctx, namespace_names); + } + + int RBD::namespace_exists(IoCtx& io_ctx, const char *namespace_name, + bool *exists) { + return librbd::api::Namespace<>::exists(io_ctx, namespace_name, exists); + } + + int RBD::pool_init(IoCtx& io_ctx, bool force) { + return librbd::api::Pool<>::init(io_ctx, force); + } + + int RBD::pool_stats_get(IoCtx& io_ctx, PoolStats* stats) { + auto pool_stat_options = + reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats->pool_stats); + return librbd::api::Pool<>::get_stats(io_ctx, pool_stat_options); + } + + int RBD::list(IoCtx& io_ctx, vector<string>& names) + { + std::vector<image_spec_t> image_specs; + int r = list2(io_ctx, &image_specs); + if (r < 0) { + return r; + } + + names.clear(); + for (auto& it : image_specs) { + names.push_back(it.name); + } + return 0; + } + + int RBD::list2(IoCtx& io_ctx, std::vector<image_spec_t> *images) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id()); + + int r = librbd::api::Image<>::list_images(io_ctx, images); + if (r >= 0) { + for (auto& it : *images) { + tracepoint(librbd, list_entry, it.name.c_str()); + } + } + tracepoint(librbd, list_exit, r, r); + return r; + } + + int RBD::rename(IoCtx& src_io_ctx, const char *srcname, const char *destname) + { + 
TracepointProvider::initialize<tracepoint_traits>(get_cct(src_io_ctx)); + tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname); + int r = librbd::rename(src_io_ctx, srcname, destname); + tracepoint(librbd, rename_exit, r); + return r; + } + + int RBD::migration_prepare(IoCtx& io_ctx, const char *image_name, + IoCtx& dest_io_ctx, const char *dest_image_name, + ImageOptions& opts) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_prepare_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name, dest_io_ctx.get_pool_name().c_str(), + dest_io_ctx.get_id(), dest_image_name, opts.opts); + int r = librbd::api::Migration<>::prepare(io_ctx, image_name, dest_io_ctx, + dest_image_name, opts); + tracepoint(librbd, migration_prepare_exit, r); + return r; + } + + int RBD::migration_execute(IoCtx& io_ctx, const char *image_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_execute_exit, r); + return r; + } + + int RBD::migration_execute_with_progress(IoCtx& io_ctx, + const char *image_name, + librbd::ProgressContext &prog_ctx) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_execute_exit, r); + return r; + } + + int RBD::migration_abort(IoCtx& io_ctx, const char *image_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), 
image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_abort_exit, r); + return r; + } + + int RBD::migration_abort_with_progress(IoCtx& io_ctx, const char *image_name, + librbd::ProgressContext &prog_ctx) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_abort_exit, r); + return r; + } + + int RBD::migration_commit(IoCtx& io_ctx, const char *image_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_commit_exit, r); + return r; + } + + int RBD::migration_commit_with_progress(IoCtx& io_ctx, const char *image_name, + librbd::ProgressContext &prog_ctx) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_commit_exit, r); + return r; + } + + int RBD::migration_status(IoCtx& io_ctx, const char *image_name, + image_migration_status_t *status, + size_t status_size) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_status_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + + if (status_size != sizeof(image_migration_status_t)) { + tracepoint(librbd, migration_status_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Migration<>::status(io_ctx, 
image_name, status); + tracepoint(librbd, migration_status_exit, r); + return r; + } + + int RBD::mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode) { + return librbd::api::Mirror<>::mode_get(io_ctx, mirror_mode); + } + + int RBD::mirror_site_name_get(librados::Rados& rados, + std::string* site_name) { + return librbd::api::Mirror<>::site_name_get(rados, site_name); + } + + int RBD::mirror_site_name_set(librados::Rados& rados, + const std::string& site_name) { + return librbd::api::Mirror<>::site_name_set(rados, site_name); + } + + int RBD::mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode) { + return librbd::api::Mirror<>::mode_set(io_ctx, mirror_mode); + } + + int RBD::mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token) { + return librbd::api::Mirror<>::peer_bootstrap_create(io_ctx, token); + } + + int RBD::mirror_peer_bootstrap_import(IoCtx& io_ctx, + rbd_mirror_peer_direction_t direction, + const std::string& token) { + return librbd::api::Mirror<>::peer_bootstrap_import(io_ctx, direction, + token); + } + + int RBD::mirror_peer_add(IoCtx& io_ctx, std::string *uuid, + const std::string &cluster_name, + const std::string &client_name) { + return librbd::api::Mirror<>::peer_add(io_ctx, uuid, cluster_name, + client_name); + } + + int RBD::mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid) { + return librbd::api::Mirror<>::peer_remove(io_ctx, uuid); + } + + int RBD::mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers) { + return librbd::api::Mirror<>::peer_list(io_ctx, peers); + } + + int RBD::mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid, + const std::string &client_name) { + return librbd::api::Mirror<>::peer_set_client(io_ctx, uuid, client_name); + } + + int RBD::mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid, + const std::string &cluster_name) { + return librbd::api::Mirror<>::peer_set_cluster(io_ctx, uuid, cluster_name); + } + + int RBD::mirror_peer_get_attributes( + IoCtx& 
io_ctx, const std::string &uuid, + std::map<std::string, std::string> *key_vals) { + return librbd::api::Mirror<>::peer_get_attributes(io_ctx, uuid, key_vals); + } + + int RBD::mirror_peer_set_attributes( + IoCtx& io_ctx, const std::string &uuid, + const std::map<std::string, std::string>& key_vals) { + return librbd::api::Mirror<>::peer_set_attributes(io_ctx, uuid, key_vals); + } + + int RBD::mirror_image_status_list(IoCtx& io_ctx, const std::string &start_id, + size_t max, std::map<std::string, mirror_image_status_t> *images) { + return librbd::api::Mirror<>::image_status_list(io_ctx, start_id, max, + images); + } + + int RBD::mirror_image_status_summary(IoCtx& io_ctx, + std::map<mirror_image_status_state_t, int> *states) { + return librbd::api::Mirror<>::image_status_summary(io_ctx, states); + } + + int RBD::mirror_image_instance_id_list(IoCtx& io_ctx, + const std::string &start_id, size_t max, + std::map<std::string, std::string> *instance_ids) { + return librbd::api::Mirror<>::image_instance_id_list(io_ctx, start_id, max, + instance_ids); + } + + int RBD::group_create(IoCtx& io_ctx, const char *group_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, group_create_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), group_name); + int r = librbd::api::Group<>::create(io_ctx, group_name); + tracepoint(librbd, group_create_exit, r); + return r; + } + + int RBD::group_remove(IoCtx& io_ctx, const char *group_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, group_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), group_name); + int r = librbd::api::Group<>::remove(io_ctx, group_name); + tracepoint(librbd, group_remove_exit, r); + return r; + } + + int RBD::group_list(IoCtx& io_ctx, vector<string> *names) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, group_list_enter, io_ctx.get_pool_name().c_str(), 
+ io_ctx.get_id()); + + int r = librbd::api::Group<>::list(io_ctx, names); + if (r >= 0) { + /* bind by const reference: each name is only read for c_str(), so copying every std::string is wasted work */ + for (const auto& itr : *names) { + tracepoint(librbd, group_list_entry, itr.c_str()); + } + } + tracepoint(librbd, group_list_exit, r); + return r; + } + + /* Renames group src_name to dest_name via api::Group<>::rename, bracketed by enter/exit tracepoints; returns that call's result. */ + int RBD::group_rename(IoCtx& io_ctx, const char *src_name, + const char *dest_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, group_rename_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), src_name, dest_name); + int r = librbd::api::Group<>::rename(io_ctx, src_name, dest_name); + tracepoint(librbd, group_rename_exit, r); + return r; + } + + /* Adds image_name (looked up through image_ioctx) to group_name via api::Group<>::image_add; returns that call's result. */ + int RBD::group_image_add(IoCtx& group_ioctx, const char *group_name, + IoCtx& image_ioctx, const char *image_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_image_add_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, + image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_name); + int r = librbd::api::Group<>::image_add(group_ioctx, group_name, + image_ioctx, image_name); + tracepoint(librbd, group_image_add_exit, r); + return r; + } + + /* Removes image_name from group_name via api::Group<>::image_remove; returns that call's result. */ + int RBD::group_image_remove(IoCtx& group_ioctx, const char *group_name, + IoCtx& image_ioctx, const char *image_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_image_remove_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, + image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_name); + int r = librbd::api::Group<>::image_remove(group_ioctx, group_name, + image_ioctx, image_name); + tracepoint(librbd, group_image_remove_exit, r); + return r; + } + + /* Same as group_image_remove, but the image is identified by its id rather than its name. */ + int RBD::group_image_remove_by_id(IoCtx& group_ioctx, const char *group_name, + IoCtx& image_ioctx, const char *image_id) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, 
group_image_remove_by_id_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, + image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_id); + int r = librbd::api::Group<>::image_remove_by_id(group_ioctx, group_name, + image_ioctx, image_id); + tracepoint(librbd, group_image_remove_by_id_exit, r); + return r; + } + + int RBD::group_image_list(IoCtx& group_ioctx, const char *group_name, + std::vector<group_image_info_t> *images, + size_t group_image_info_size) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_image_list_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name); + + if (group_image_info_size != sizeof(group_image_info_t)) { + tracepoint(librbd, group_image_list_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Group<>::image_list(group_ioctx, group_name, images); + tracepoint(librbd, group_image_list_exit, r); + return r; + } + + int RBD::group_snap_create(IoCtx& group_ioctx, const char *group_name, + const char *snap_name) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_create_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + int r = librbd::api::Group<>::snap_create(group_ioctx, group_name, + snap_name); + tracepoint(librbd, group_snap_create_exit, r); + return r; + } + + int RBD::group_snap_remove(IoCtx& group_ioctx, const char *group_name, + const char *snap_name) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_remove_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + int r = librbd::api::Group<>::snap_remove(group_ioctx, group_name, + snap_name); + tracepoint(librbd, group_snap_remove_exit, r); + return r; + } + + int RBD::group_snap_list(IoCtx& group_ioctx, const char *group_name, + 
std::vector<group_snap_info_t> *snaps, + size_t group_snap_info_size) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_list_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name); + + /* reject callers whose group_snap_info_t has a different size than this library's */ + if (group_snap_info_size != sizeof(group_snap_info_t)) { + tracepoint(librbd, group_snap_list_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, snaps); + tracepoint(librbd, group_snap_list_exit, r); + return r; + } + + /* Renames group snapshot old_snap_name to new_snap_name via api::Group<>::snap_rename; returns that call's result. */ + int RBD::group_snap_rename(IoCtx& group_ioctx, const char *group_name, + const char *old_snap_name, + const char *new_snap_name) + { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rename_enter, + group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(), + group_name, old_snap_name, new_snap_name); + int r = librbd::api::Group<>::snap_rename(group_ioctx, group_name, + old_snap_name, new_snap_name); + /* pair the rename_enter event with rename_exit (was group_snap_list_exit, which mis-attributed the result in traces) */ + tracepoint(librbd, group_snap_rename_exit, r); + return r; + } + + /* Rolls the group back to snap_name via api::Group<>::snap_rollback using a no-op progress context; returns that call's result. */ + int RBD::group_snap_rollback(IoCtx& group_ioctx, const char *group_name, + const char *snap_name) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rollback_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name, + snap_name, prog_ctx); + tracepoint(librbd, group_snap_rollback_exit, r); + return r; + } + + /* Same as group_snap_rollback, but progress is reported through the caller-supplied prog_ctx. */ + int RBD::group_snap_rollback_with_progress(IoCtx& group_ioctx, + const char *group_name, + const char *snap_name, + ProgressContext& prog_ctx) { + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rollback_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + int r = 
librbd::api::Group<>::snap_rollback(group_ioctx, group_name, + snap_name, prog_ctx); + tracepoint(librbd, group_snap_rollback_exit, r); + return r; + } + + int RBD::pool_metadata_get(IoCtx& ioctx, const std::string &key, + std::string *value) + { + int r = librbd::api::PoolMetadata<>::get(ioctx, key, value); + return r; + } + + int RBD::pool_metadata_set(IoCtx& ioctx, const std::string &key, + const std::string &value) + { + int r = librbd::api::PoolMetadata<>::set(ioctx, key, value); + return r; + } + + int RBD::pool_metadata_remove(IoCtx& ioctx, const std::string &key) + { + int r = librbd::api::PoolMetadata<>::remove(ioctx, key); + return r; + } + + int RBD::pool_metadata_list(IoCtx& ioctx, const std::string &start, + uint64_t max, map<string, bufferlist> *pairs) + { + int r = librbd::api::PoolMetadata<>::list(ioctx, start, max, pairs); + return r; + } + + int RBD::config_list(IoCtx& io_ctx, std::vector<config_option_t> *options) { + return librbd::api::Config<>::list(io_ctx, options); + } + + RBD::AioCompletion::AioCompletion(void *cb_arg, callback_t complete_cb) + { + pc = reinterpret_cast<void*>(librbd::io::AioCompletion::create( + cb_arg, complete_cb, this)); + } + + bool RBD::AioCompletion::is_complete() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + return c->is_complete(); + } + + int RBD::AioCompletion::wait_for_complete() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + return c->wait_for_complete(); + } + + ssize_t RBD::AioCompletion::get_return_value() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + return c->get_return_value(); + } + + void *RBD::AioCompletion::get_arg() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + return c->get_arg(); + } + + void RBD::AioCompletion::release() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + c->release(); + delete this; + } + + /* + ImageOptions + */ + + ImageOptions::ImageOptions() + { + 
librbd::image_options_create(&opts); + } + + ImageOptions::ImageOptions(rbd_image_options_t opts_) + { + librbd::image_options_create_ref(&opts, opts_); + } + + ImageOptions::ImageOptions(const ImageOptions &imgopts) + { + librbd::image_options_copy(&opts, imgopts); + } + + ImageOptions::~ImageOptions() + { + librbd::image_options_destroy(opts); + } + + int ImageOptions::set(int optname, const std::string& optval) + { + return librbd::image_options_set(opts, optname, optval); + } + + int ImageOptions::set(int optname, uint64_t optval) + { + return librbd::image_options_set(opts, optname, optval); + } + + int ImageOptions::get(int optname, std::string* optval) const + { + return librbd::image_options_get(opts, optname, optval); + } + + int ImageOptions::get(int optname, uint64_t* optval) const + { + return librbd::image_options_get(opts, optname, optval); + } + + int ImageOptions::is_set(int optname, bool* is_set) + { + return librbd::image_options_is_set(opts, optname, is_set); + } + + int ImageOptions::unset(int optname) + { + return librbd::image_options_unset(opts, optname); + } + + void ImageOptions::clear() + { + librbd::image_options_clear(opts); + } + + bool ImageOptions::empty() const + { + return librbd::image_options_is_empty(opts); + } + + /* + Image + */ + + Image::Image() : ctx(NULL) + { + } + + Image::~Image() + { + close(); + } + + int Image::close() + { + int r = 0; + if (ctx) { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + + r = ictx->state->close(); + ctx = NULL; + + tracepoint(librbd, close_image_exit, r); + } + return r; + } + + int Image::aio_close(RBD::AioCompletion *c) + { + if (!ctx) { + return -EINVAL; + } + + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), c->pc); + + ictx->state->close(new C_AioCompletion(ictx, librbd::io::AIO_TYPE_CLOSE, + get_aio_completion(c))); + ctx = NULL; + + 
tracepoint(librbd, aio_close_image_exit, 0); + return 0; + } + + int Image::resize(uint64_t size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->resize(size, true, prog_ctx); + tracepoint(librbd, resize_exit, r); + return r; + } + + int Image::resize2(uint64_t size, bool allow_shrink, librbd::ProgressContext& pctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + int r = ictx->operations->resize(size, allow_shrink, pctx); + tracepoint(librbd, resize_exit, r); + return r; + } + + int Image::resize_with_progress(uint64_t size, librbd::ProgressContext& pctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + int r = ictx->operations->resize(size, true, pctx); + tracepoint(librbd, resize_exit, r); + return r; + } + + int Image::stat(image_info_t& info, size_t infosize) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, stat_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::info(ictx, info, infosize); + tracepoint(librbd, stat_exit, r, &info); + return r; + } + + int Image::old_format(uint8_t *old) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_old_format_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_old_format(ictx, old); + tracepoint(librbd, get_old_format_exit, r, *old); + return r; + } + + int Image::size(uint64_t *size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_size_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_size(ictx, size); + tracepoint(librbd, get_size_exit, r, *size); + return r; + } + + int 
Image::get_group(group_info_t *group_info, size_t group_info_size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, image_get_group_enter, ictx->name.c_str()); + + if (group_info_size != sizeof(group_info_t)) { + tracepoint(librbd, image_get_group_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Group<>::image_get_group(ictx, group_info); + tracepoint(librbd, image_get_group_exit, r); + return r; + } + + int Image::features(uint64_t *features) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_features_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_features(ictx, features); + tracepoint(librbd, get_features_exit, r, *features); + return r; + } + + int Image::update_features(uint64_t features, bool enabled) + { + ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx); + tracepoint(librbd, update_features_enter, ictx, features, enabled); + int r = ictx->operations->update_features(features, enabled); + tracepoint(librbd, update_features_exit, r); + return r; + } + + int Image::get_op_features(uint64_t *op_features) + { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Image<>::get_op_features(ictx, op_features); + } + + uint64_t Image::get_stripe_unit() const + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_stripe_unit_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + uint64_t stripe_unit = ictx->get_stripe_unit(); + tracepoint(librbd, get_stripe_unit_exit, 0, stripe_unit); + return stripe_unit; + } + + uint64_t Image::get_stripe_count() const + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_stripe_count_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + uint64_t stripe_count = ictx->get_stripe_count(); + tracepoint(librbd, get_stripe_count_exit, 0, stripe_count); + return stripe_count; + } + + int Image::get_create_timestamp(struct timespec *timestamp) + { + ImageCtx *ictx = (ImageCtx 
*)ctx; + tracepoint(librbd, get_create_timestamp_enter, ictx, ictx->name.c_str(), + ictx->read_only); + utime_t time = ictx->get_create_timestamp(); + time.to_timespec(timestamp); + tracepoint(librbd, get_create_timestamp_exit, 0, timestamp); + return 0; + } + + int Image::get_access_timestamp(struct timespec *timestamp) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_access_timestamp_enter, ictx, ictx->name.c_str(), + ictx->read_only); + { + RWLock::RLocker timestamp_locker(ictx->timestamp_lock); + utime_t time = ictx->get_access_timestamp(); + time.to_timespec(timestamp); + } + tracepoint(librbd, get_access_timestamp_exit, 0, timestamp); + return 0; + } + + int Image::get_modify_timestamp(struct timespec *timestamp) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_modify_timestamp_enter, ictx, ictx->name.c_str(), + ictx->read_only); + { + RWLock::RLocker timestamp_locker(ictx->timestamp_lock); + utime_t time = ictx->get_modify_timestamp(); + time.to_timespec(timestamp); + } + tracepoint(librbd, get_modify_timestamp_exit, 0, timestamp); + return 0; + } + + int Image::overlap(uint64_t *overlap) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_overlap_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_overlap(ictx, overlap); + tracepoint(librbd, get_overlap_exit, r, *overlap); + return r; + } + + int Image::get_name(std::string *name) + { + ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx); + *name = ictx->name; + return 0; + } + + int Image::get_id(std::string *id) + { + ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx); + if (ictx->old_format) { + return -EINVAL; + } + *id = ictx->id; + return 0; + } + + std::string Image::get_block_name_prefix() + { + ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx); + return ictx->object_prefix; + } + + int64_t Image::get_data_pool_id() + { + ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx); + return 
librbd::api::Image<>::get_data_pool_id(ictx); + } + + int Image::parent_info(string *parent_pool_name, string *parent_name, + string *parent_snap_name) + { + librbd::linked_image_spec_t parent_image; + librbd::snap_spec_t parent_snap; + int r = get_parent(&parent_image, &parent_snap); + if (r >= 0) { + if (parent_pool_name != nullptr) { + *parent_pool_name = parent_image.pool_name; + } + if (parent_name != nullptr) { + *parent_name = parent_image.image_name; + } + if (parent_snap_name != nullptr) { + *parent_snap_name = parent_snap.name; + } + } + return r; + } + + int Image::parent_info2(string *parent_pool_name, string *parent_name, + string *parent_id, string *parent_snap_name) + { + librbd::linked_image_spec_t parent_image; + librbd::snap_spec_t parent_snap; + int r = get_parent(&parent_image, &parent_snap); + if (r >= 0) { + if (parent_pool_name != nullptr) { + *parent_pool_name = parent_image.pool_name; + } + if (parent_name != nullptr) { + *parent_name = parent_image.image_name; + } + if (parent_id != nullptr) { + *parent_id = parent_image.image_id; + } + if (parent_snap_name != nullptr) { + *parent_snap_name = parent_snap.name; + } + } + return r; + } + + int Image::get_parent(linked_image_spec_t *parent_image, + snap_spec_t *parent_snap) + { + auto ictx = reinterpret_cast<ImageCtx*>(ctx); + tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + int r = librbd::api::Image<>::get_parent(ictx, parent_image, parent_snap); + + tracepoint(librbd, get_parent_info_exit, r, + parent_image->pool_name.c_str(), + parent_image->image_name.c_str(), + parent_image->image_id.c_str(), + parent_snap->name.c_str()); + return r; + } + + int Image::get_flags(uint64_t *flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_flags_enter, ictx); + int r = librbd::get_flags(ictx, flags); + tracepoint(librbd, get_flags_exit, ictx, r, *flags); + return r; + } + + int Image::set_image_notification(int fd, 
int type) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, set_image_notification_enter, ictx, fd, type); + int r = librbd::set_image_notification(ictx, fd, type); + tracepoint(librbd, set_image_notification_exit, ictx, r); + return r; + } + + int Image::is_exclusive_lock_owner(bool *is_owner) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, is_exclusive_lock_owner_enter, ictx); + int r = librbd::is_exclusive_lock_owner(ictx, is_owner); + tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner); + return r; + } + + int Image::lock_acquire(rbd_lock_mode_t lock_mode) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, lock_acquire_enter, ictx, lock_mode); + int r = librbd::lock_acquire(ictx, lock_mode); + tracepoint(librbd, lock_acquire_exit, ictx, r); + return r; + } + + int Image::lock_release() + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, lock_release_enter, ictx); + int r = librbd::lock_release(ictx); + tracepoint(librbd, lock_release_exit, ictx, r); + return r; + } + + int Image::lock_get_owners(rbd_lock_mode_t *lock_mode, + std::list<std::string> *lock_owners) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, lock_get_owners_enter, ictx); + int r = librbd::lock_get_owners(ictx, lock_mode, lock_owners); + tracepoint(librbd, lock_get_owners_exit, ictx, r); + return r; + } + + int Image::lock_break(rbd_lock_mode_t lock_mode, + const std::string &lock_owner) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner.c_str()); + int r = librbd::lock_break(ictx, lock_mode, lock_owner); + tracepoint(librbd, lock_break_exit, ictx, r); + return r; + } + + int Image::rebuild_object_map(ProgressContext &prog_ctx) + { + ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx); + return ictx->operations->rebuild_object_map(prog_ctx); + } + + int Image::check_object_map(ProgressContext &prog_ctx) + { + ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx); + return 
ictx->operations->check_object_map(prog_ctx); + } + + int Image::copy(IoCtx& dest_io_ctx, const char *destname) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname); + ImageOptions opts; + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0); + tracepoint(librbd, copy_exit, r); + return r; + } + + int Image::copy2(Image& dest) + { + ImageCtx *srcctx = (ImageCtx *)ctx; + ImageCtx *destctx = (ImageCtx *)dest.ctx; + tracepoint(librbd, copy2_enter, srcctx, srcctx->name.c_str(), srcctx->snap_name.c_str(), srcctx->read_only, destctx, destctx->name.c_str(), destctx->snap_name.c_str(), destctx->read_only); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(srcctx, destctx, prog_ctx, 0); + tracepoint(librbd, copy2_exit, r); + return r; + } + + int Image::copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0); + tracepoint(librbd, copy3_exit, r); + return r; + } + + int Image::copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts, size_t sparse_size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts, sparse_size); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, sparse_size); + tracepoint(librbd, copy4_exit, r); + return r; + } + + int Image::copy_with_progress(IoCtx& 
dest_io_ctx, const char *destname, + librbd::ProgressContext &pctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname); + ImageOptions opts; + int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, 0); + tracepoint(librbd, copy_exit, r); + return r; + } + + int Image::copy_with_progress2(Image& dest, librbd::ProgressContext &pctx) + { + ImageCtx *srcctx = (ImageCtx *)ctx; + ImageCtx *destctx = (ImageCtx *)dest.ctx; + tracepoint(librbd, copy2_enter, srcctx, srcctx->name.c_str(), srcctx->snap_name.c_str(), srcctx->read_only, destctx, destctx->name.c_str(), destctx->snap_name.c_str(), destctx->read_only); + int r = librbd::copy(srcctx, destctx, pctx, 0); + tracepoint(librbd, copy2_exit, r); + return r; + } + + int Image::copy_with_progress3(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, + librbd::ProgressContext &pctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts); + int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, 0); + tracepoint(librbd, copy3_exit, r); + return r; + } + + int Image::copy_with_progress4(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, + librbd::ProgressContext &pctx, + size_t sparse_size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts, sparse_size); + int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, sparse_size); + tracepoint(librbd, copy4_exit, r); + return r; + } + + int Image::deep_copy(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts) + { + ImageCtx *ictx = (ImageCtx 
*)ctx; + tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, + dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), + destname, opts.opts); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts, + prog_ctx); + tracepoint(librbd, deep_copy_exit, r); + return r; + } + + int Image::deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, + librbd::ProgressContext &prog_ctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, + dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), + destname, opts.opts); + int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts, + prog_ctx); + tracepoint(librbd, deep_copy_exit, r); + return r; + } + + int Image::flatten() + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->flatten(prog_ctx); + tracepoint(librbd, flatten_exit, r); + return r; + } + + int Image::flatten_with_progress(librbd::ProgressContext& prog_ctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + int r = ictx->operations->flatten(prog_ctx); + tracepoint(librbd, flatten_exit, r); + return r; + } + + int Image::sparsify(size_t sparse_size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size, + ictx->id.c_str()); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->sparsify(sparse_size, prog_ctx); + tracepoint(librbd, sparsify_exit, r); + return r; + } + + int Image::sparsify_with_progress(size_t sparse_size, + librbd::ProgressContext& prog_ctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, sparsify_enter, 
ictx, ictx->name.c_str(), sparse_size, + ictx->id.c_str()); + int r = ictx->operations->sparsify(sparse_size, prog_ctx); + tracepoint(librbd, sparsify_exit, r); + return r; + } + + int Image::list_children(set<pair<string, string> > *children) + { + std::vector<linked_image_spec_t> images; + int r = list_children3(&images); + if (r < 0) { + return r; + } + + for (auto& image : images) { + if (!image.trash) { + children->insert({image.pool_name, image.image_name}); + } + } + return 0; + } + + int Image::list_children2(vector<librbd::child_info_t> *children) + { + std::vector<linked_image_spec_t> images; + int r = list_children3(&images); + if (r < 0) { + return r; + } + + for (auto& image : images) { + children->push_back({ + .pool_name = image.pool_name, + .image_name = image.image_name, + .image_id = image.image_id, + .trash = image.trash}); + } + + return 0; + } + + int Image::list_children3(std::vector<linked_image_spec_t> *images) + { + auto ictx = reinterpret_cast<ImageCtx*>(ctx); + tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + int r = librbd::api::Image<>::list_children(ictx, images); + if (r >= 0) { + for (auto& it : *images) { + tracepoint(librbd, list_children_entry, it.pool_name.c_str(), + it.image_name.c_str()); + } + } + + tracepoint(librbd, list_children_exit, r); + return r; + } + + int Image::list_descendants(std::vector<linked_image_spec_t> *images) + { + auto ictx = reinterpret_cast<ImageCtx*>(ctx); + + images->clear(); + int r = librbd::api::Image<>::list_descendants(ictx, {}, images); + return r; + } + + int Image::list_lockers(std::list<librbd::locker_t> *lockers, + bool *exclusive, string *tag) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, list_lockers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::list_lockers(ictx, lockers, exclusive, tag); + if (r >= 0) { + for (std::list<librbd::locker_t>::const_iterator it = 
lockers->begin(); + it != lockers->end(); ++it) { + tracepoint(librbd, list_lockers_entry, it->client.c_str(), it->cookie.c_str(), it->address.c_str()); + } + } + tracepoint(librbd, list_lockers_exit, r); + return r; + } + + int Image::lock_exclusive(const string& cookie) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, lock_exclusive_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str()); + int r = librbd::lock(ictx, true, cookie, ""); + tracepoint(librbd, lock_exclusive_exit, r); + return r; + } + + int Image::lock_shared(const string& cookie, const std::string& tag) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, lock_shared_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str(), tag.c_str()); + int r = librbd::lock(ictx, false, cookie, tag); + tracepoint(librbd, lock_shared_exit, r); + return r; + } + + int Image::unlock(const string& cookie) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, unlock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str()); + int r = librbd::unlock(ictx, cookie); + tracepoint(librbd, unlock_exit, r); + return r; + } + + int Image::break_lock(const string& client, const string& cookie) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, break_lock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, client.c_str(), cookie.c_str()); + int r = librbd::break_lock(ictx, client, cookie); + tracepoint(librbd, break_lock_exit, r); + return r; + } + + int Image::snap_create(const char *snap_name) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = ictx->operations->snap_create(cls::rbd::UserSnapshotNamespace(), + snap_name); + tracepoint(librbd, snap_create_exit, r); + return r; + } + + int Image::snap_remove(const char *snap_name) + { + ImageCtx *ictx = 
(ImageCtx *)ctx; + tracepoint(librbd, snap_remove_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::snap_remove(ictx, snap_name, 0, prog_ctx); + tracepoint(librbd, snap_remove_exit, r); + return r; + } + + int Image::snap_remove2(const char *snap_name, uint32_t flags, ProgressContext& pctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_remove2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name, flags); + int r = librbd::snap_remove(ictx, snap_name, flags, pctx); + tracepoint(librbd, snap_remove_exit, r); + return r; + } + + int Image::snap_remove_by_id(uint64_t snap_id) + { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Snapshot<>::remove(ictx, snap_id); + } + + int Image::snap_rollback(const char *snap_name) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx); + tracepoint(librbd, snap_rollback_exit, r); + return r; + } + + int Image::snap_rename(const char *srcname, const char *dstname) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, srcname, dstname); + int r = ictx->operations->snap_rename(srcname, dstname); + tracepoint(librbd, snap_rename_exit, r); + return r; + } + + int Image::snap_rollback_with_progress(const char *snap_name, + ProgressContext& prog_ctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx); + tracepoint(librbd, snap_rollback_exit, r); + 
return r; + } + + int Image::snap_protect(const char *snap_name) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_protect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = ictx->operations->snap_protect(cls::rbd::UserSnapshotNamespace(), snap_name); + tracepoint(librbd, snap_protect_exit, r); + return r; + } + + int Image::snap_unprotect(const char *snap_name) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_unprotect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name); + tracepoint(librbd, snap_unprotect_exit, r); + return r; + } + + int Image::snap_is_protected(const char *snap_name, bool *is_protected) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_is_protected_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = librbd::snap_is_protected(ictx, snap_name, is_protected); + tracepoint(librbd, snap_is_protected_exit, r, *is_protected ? 
1 : 0); + return r; + } + + int Image::snap_list(vector<librbd::snap_info_t>& snaps) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_list_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, &snaps); + int r = librbd::snap_list(ictx, snaps); + if (r >= 0) { + for (int i = 0, n = snaps.size(); i < n; i++) { + tracepoint(librbd, snap_list_entry, snaps[i].id, snaps[i].size, snaps[i].name.c_str()); + } + } + tracepoint(librbd, snap_list_exit, r, snaps.size()); + if (r >= 0) { + // A little ugly, but the C++ API doesn't need a Image::snap_list_end, + // and we want the tracepoints to mirror the C API + tracepoint(librbd, snap_list_end_enter, &snaps); + tracepoint(librbd, snap_list_end_exit); + } + return r; + } + + bool Image::snap_exists(const char *snap_name) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, snap_name); + bool exists; + int r = librbd::snap_exists(ictx, cls::rbd::UserSnapshotNamespace(), snap_name, &exists); + tracepoint(librbd, snap_exists_exit, r, exists); + if (r < 0) { + // lie to caller since we don't know the real answer yet. + return false; + } + return exists; + } + + // A safer verion of snap_exists. 
+ int Image::snap_exists2(const char *snap_name, bool *exists) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = librbd::snap_exists(ictx, cls::rbd::UserSnapshotNamespace(), snap_name, exists); + tracepoint(librbd, snap_exists_exit, r, *exists); + return r; + } + + int Image::snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_get_timestamp_enter, ictx, ictx->name.c_str()); + int r = librbd::snap_get_timestamp(ictx, snap_id, timestamp); + tracepoint(librbd, snap_get_timestamp_exit, r); + return r; + } + + int Image::snap_get_limit(uint64_t *limit) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_get_limit_enter, ictx, ictx->name.c_str()); + int r = librbd::snap_get_limit(ictx, limit); + tracepoint(librbd, snap_get_limit_exit, r, *limit); + return r; + } + + int Image::snap_get_namespace_type(uint64_t snap_id, + snap_namespace_type_t *namespace_type) { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_get_namespace_type_enter, ictx, ictx->name.c_str()); + int r = librbd::api::Snapshot<>::get_namespace_type(ictx, snap_id, namespace_type); + tracepoint(librbd, snap_get_namespace_type_exit, r); + return r; + } + + int Image::snap_get_group_namespace(uint64_t snap_id, + snap_group_namespace_t *group_snap, + size_t group_snap_size) { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_get_group_namespace_enter, ictx, + ictx->name.c_str()); + + if (group_snap_size != sizeof(snap_group_namespace_t)) { + tracepoint(librbd, snap_get_group_namespace_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Snapshot<>::get_group_namespace(ictx, snap_id, + group_snap); + tracepoint(librbd, snap_get_group_namespace_exit, r); + return r; + } + + int Image::snap_get_trash_namespace(uint64_t snap_id, + std::string* original_name) { + 
ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Snapshot<>::get_trash_namespace(ictx, snap_id, + original_name); + } + + int Image::snap_set_limit(uint64_t limit) + { + ImageCtx *ictx = (ImageCtx *)ctx; + + tracepoint(librbd, snap_set_limit_enter, ictx, ictx->name.c_str(), limit); + int r = ictx->operations->snap_set_limit(limit); + tracepoint(librbd, snap_set_limit_exit, r); + return r; + } + + int Image::snap_set(const char *snap_name) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, snap_set_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = librbd::api::Image<>::snap_set( + ictx, cls::rbd::UserSnapshotNamespace(), snap_name); + tracepoint(librbd, snap_set_exit, r); + return r; + } + + int Image::snap_set_by_id(uint64_t snap_id) + { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Image<>::snap_set(ictx, snap_id); + } + + ssize_t Image::read(uint64_t ofs, size_t len, bufferlist& bl) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len); + bufferptr ptr(len); + bl.push_back(std::move(ptr)); + + int r = ictx->io_work_queue->read(ofs, len, io::ReadResult{&bl}, 0); + tracepoint(librbd, read_exit, r); + return r; + } + + ssize_t Image::read2(uint64_t ofs, size_t len, bufferlist& bl, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, ofs, len, op_flags); + bufferptr ptr(len); + bl.push_back(std::move(ptr)); + + int r = ictx->io_work_queue->read(ofs, len, io::ReadResult{&bl}, op_flags); + tracepoint(librbd, read_exit, r); + return r; + } + + int64_t Image::read_iterate(uint64_t ofs, size_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, read_iterate_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), 
ictx->read_only, ofs, len); + + int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg); + tracepoint(librbd, read_iterate_exit, r); + return r; + } + + int Image::read_iterate2(uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, read_iterate2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len); + + int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg); + if (r > 0) + r = 0; + tracepoint(librbd, read_iterate2_exit, r); + return (int)r; + } + + int Image::diff_iterate(const char *fromsnapname, + uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, int, void *), + void *arg) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len, + true, false); + int r = librbd::api::DiffIterate<>::diff_iterate(ictx, + cls::rbd::UserSnapshotNamespace(), + fromsnapname, ofs, + len, true, false, cb, arg); + tracepoint(librbd, diff_iterate_exit, r); + return r; + } + + int Image::diff_iterate2(const char *fromsnapname, uint64_t ofs, uint64_t len, + bool include_parent, bool whole_object, + int (*cb)(uint64_t, size_t, int, void *), void *arg) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len, + include_parent, whole_object); + int r = librbd::api::DiffIterate<>::diff_iterate(ictx, + cls::rbd::UserSnapshotNamespace(), + fromsnapname, ofs, + len, include_parent, + whole_object, cb, arg); + tracepoint(librbd, diff_iterate_exit, r); + return r; + } + + ssize_t Image::write(uint64_t ofs, size_t len, bufferlist& bl) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len, bl.length() < len ? 
NULL : bl.c_str()); + if (bl.length() < len) { + tracepoint(librbd, write_exit, -EINVAL); + return -EINVAL; + } + + int r = ictx->io_work_queue->write(ofs, len, bufferlist{bl}, 0); + tracepoint(librbd, write_exit, r); + return r; + } + + ssize_t Image::write2(uint64_t ofs, size_t len, bufferlist& bl, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, + ofs, len, bl.length() < len ? NULL : bl.c_str(), op_flags); + if (bl.length() < len) { + tracepoint(librbd, write_exit, -EINVAL); + return -EINVAL; + } + + int r = ictx->io_work_queue->write(ofs, len, bufferlist{bl}, op_flags); + tracepoint(librbd, write_exit, r); + return r; + } + + int Image::discard(uint64_t ofs, uint64_t len) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len); + if (len > std::numeric_limits<int32_t>::max()) { + tracepoint(librbd, discard_exit, -EINVAL); + return -EINVAL; + } + int r = ictx->io_work_queue->discard( + ofs, len, ictx->discard_granularity_bytes); + tracepoint(librbd, discard_exit, r); + return r; + } + + ssize_t Image::writesame(uint64_t ofs, size_t len, bufferlist& bl, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, ofs, len, bl.length() <= 0 ? 
NULL : bl.c_str(), bl.length(), + op_flags); + if (bl.length() <= 0 || len % bl.length() || + len > std::numeric_limits<int>::max()) { + tracepoint(librbd, writesame_exit, -EINVAL); + return -EINVAL; + } + + bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same"); + if (discard_zero && bl.is_zero()) { + int r = ictx->io_work_queue->write_zeroes(ofs, len, 0U, op_flags); + tracepoint(librbd, writesame_exit, r); + return r; + } + + int r = ictx->io_work_queue->writesame(ofs, len, bufferlist{bl}, op_flags); + tracepoint(librbd, writesame_exit, r); + return r; + } + + ssize_t Image::write_zeroes(uint64_t ofs, size_t len, int zero_flags, + int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + return ictx->io_work_queue->write_zeroes(ofs, len, zero_flags, op_flags); + } + + ssize_t Image::compare_and_write(uint64_t ofs, size_t len, + ceph::bufferlist &cmp_bl, ceph::bufferlist& bl, + uint64_t *mismatch_off, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), + ictx->read_only, ofs, len, cmp_bl.length() < len ? NULL : cmp_bl.c_str(), + bl.length() < len ? NULL : bl.c_str(), op_flags); + + if (bl.length() < len) { + tracepoint(librbd, write_exit, -EINVAL); + return -EINVAL; + } + + int r = ictx->io_work_queue->compare_and_write(ofs, len, bufferlist{cmp_bl}, + bufferlist{bl}, mismatch_off, + op_flags); + + tracepoint(librbd, compare_and_write_exit, r); + + return r; + } + + int Image::aio_write(uint64_t off, size_t len, bufferlist& bl, + RBD::AioCompletion *c) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, bl.length() < len ? 
NULL : bl.c_str(), c->pc); + if (bl.length() < len) { + tracepoint(librbd, aio_write_exit, -EINVAL); + return -EINVAL; + } + ictx->io_work_queue->aio_write(get_aio_completion(c), off, len, + bufferlist{bl}, 0); + + tracepoint(librbd, aio_write_exit, 0); + return 0; + } + + int Image::aio_write2(uint64_t off, size_t len, bufferlist& bl, + RBD::AioCompletion *c, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, off, len, bl.length() < len ? NULL : bl.c_str(), c->pc, op_flags); + if (bl.length() < len) { + tracepoint(librbd, aio_write_exit, -EINVAL); + return -EINVAL; + } + ictx->io_work_queue->aio_write(get_aio_completion(c), off, len, + bufferlist{bl}, op_flags); + + tracepoint(librbd, aio_write_exit, 0); + return 0; + } + + int Image::aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, c->pc); + ictx->io_work_queue->aio_discard( + get_aio_completion(c), off, len, ictx->discard_granularity_bytes); + tracepoint(librbd, aio_discard_exit, 0); + return 0; + } + + int Image::aio_read(uint64_t off, size_t len, bufferlist& bl, + RBD::AioCompletion *c) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, bl.c_str(), c->pc); + ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~" + << (void *)(bl.c_str() + len - 1) << dendl; + + ictx->io_work_queue->aio_read(get_aio_completion(c), off, len, + io::ReadResult{&bl}, 0); + tracepoint(librbd, aio_read_exit, 0); + return 0; + } + + int Image::aio_read2(uint64_t off, size_t len, bufferlist& bl, + RBD::AioCompletion *c, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_read2_enter, ictx, 
ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, off, len, bl.c_str(), c->pc, op_flags); + ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~" + << (void *)(bl.c_str() + len - 1) << dendl; + + ictx->io_work_queue->aio_read(get_aio_completion(c), off, len, + io::ReadResult{&bl}, op_flags); + tracepoint(librbd, aio_read_exit, 0); + return 0; + } + + int Image::flush() + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = ictx->io_work_queue->flush(); + tracepoint(librbd, flush_exit, r); + return r; + } + + int Image::aio_flush(RBD::AioCompletion *c) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc); + ictx->io_work_queue->aio_flush(get_aio_completion(c)); + tracepoint(librbd, aio_flush_exit, 0); + return 0; + } + + int Image::aio_writesame(uint64_t off, size_t len, bufferlist& bl, + RBD::AioCompletion *c, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, off, len, bl.length() <= len ? 
NULL : bl.c_str(), bl.length(), + c->pc, op_flags); + if (bl.length() <= 0 || len % bl.length()) { + tracepoint(librbd, aio_writesame_exit, -EINVAL); + return -EINVAL; + } + + bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same"); + if (discard_zero && bl.is_zero()) { + ictx->io_work_queue->aio_write_zeroes(get_aio_completion(c), off, len, 0U, + op_flags, true); + tracepoint(librbd, aio_writesame_exit, 0); + return 0; + } + + ictx->io_work_queue->aio_writesame(get_aio_completion(c), off, len, + bufferlist{bl}, op_flags); + tracepoint(librbd, aio_writesame_exit, 0); + return 0; + } + + int Image::aio_write_zeroes(uint64_t off, size_t len, RBD::AioCompletion *c, + int zero_flags, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + ictx->io_work_queue->aio_write_zeroes( + get_aio_completion(c), off, len, zero_flags, op_flags, true); + return 0; + } + + int Image::aio_compare_and_write(uint64_t off, size_t len, + ceph::bufferlist& cmp_bl, ceph::bufferlist& bl, + RBD::AioCompletion *c, uint64_t *mismatch_off, + int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), + ictx->read_only, off, len, cmp_bl.length() < len ? NULL : cmp_bl.c_str(), + bl.length() < len ? 
NULL : bl.c_str(), c->pc, op_flags); + + if (bl.length() < len) { + tracepoint(librbd, compare_and_write_exit, -EINVAL); + return -EINVAL; + } + + ictx->io_work_queue->aio_compare_and_write(get_aio_completion(c), off, len, + bufferlist{cmp_bl}, bufferlist{bl}, + mismatch_off, op_flags, false); + + tracepoint(librbd, aio_compare_and_write_exit, 0); + + return 0; + } + + int Image::invalidate_cache() + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, invalidate_cache_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::invalidate_cache(ictx); + tracepoint(librbd, invalidate_cache_exit, r); + return r; + } + + int Image::poll_io_events(RBD::AioCompletion **comps, int numcomp) + { + io::AioCompletion *cs[numcomp]; + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, poll_io_events_enter, ictx, numcomp); + int r = librbd::poll_io_events(ictx, cs, numcomp); + tracepoint(librbd, poll_io_events_exit, r); + if (r > 0) { + for (int i = 0; i < r; ++i) + comps[i] = (RBD::AioCompletion *)cs[i]->rbd_comp; + } + return r; + } + + int Image::metadata_get(const std::string &key, std::string *value) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, metadata_get_enter, ictx, key.c_str()); + int r = librbd::metadata_get(ictx, key, value); + if (r < 0) { + tracepoint(librbd, metadata_get_exit, r, key.c_str(), NULL); + } else { + tracepoint(librbd, metadata_get_exit, r, key.c_str(), value->c_str()); + } + return r; + } + + int Image::metadata_set(const std::string &key, const std::string &value) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, metadata_set_enter, ictx, key.c_str(), value.c_str()); + int r = ictx->operations->metadata_set(key, value); + tracepoint(librbd, metadata_set_exit, r); + return r; + } + + int Image::metadata_remove(const std::string &key) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, metadata_remove_enter, ictx, key.c_str()); + int r = 
ictx->operations->metadata_remove(key); + tracepoint(librbd, metadata_remove_exit, r); + return r; + } + + int Image::metadata_list(const std::string &start, uint64_t max, map<string, bufferlist> *pairs) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, metadata_list_enter, ictx); + int r = librbd::metadata_list(ictx, start, max, pairs); + if (r >= 0) { + for (map<string, bufferlist>::iterator it = pairs->begin(); + it != pairs->end(); ++it) { + tracepoint(librbd, metadata_list_entry, it->first.c_str(), it->second.c_str()); + } + } + tracepoint(librbd, metadata_list_exit, r); + return r; + } + + int Image::mirror_image_enable() { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Mirror<>::image_enable(ictx, false); + } + + int Image::mirror_image_disable(bool force) { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Mirror<>::image_disable(ictx, force); + } + + int Image::mirror_image_promote(bool force) { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Mirror<>::image_promote(ictx, force); + } + + int Image::mirror_image_demote() { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Mirror<>::image_demote(ictx); + } + + int Image::mirror_image_resync() + { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Mirror<>::image_resync(ictx); + } + + int Image::mirror_image_get_info(mirror_image_info_t *mirror_image_info, + size_t info_size) { + ImageCtx *ictx = (ImageCtx *)ctx; + + if (sizeof(mirror_image_info_t) != info_size) { + return -ERANGE; + } + + return librbd::api::Mirror<>::image_get_info(ictx, mirror_image_info); + } + + int Image::mirror_image_get_status(mirror_image_status_t *mirror_image_status, + size_t status_size) { + ImageCtx *ictx = (ImageCtx *)ctx; + + if (sizeof(mirror_image_status_t) != status_size) { + return -ERANGE; + } + + return librbd::api::Mirror<>::image_get_status(ictx, mirror_image_status); + } + + int Image::mirror_image_get_instance_id(std::string *instance_id) { + ImageCtx *ictx = 
(ImageCtx *)ctx; + + return librbd::api::Mirror<>::image_get_instance_id(ictx, instance_id); + } + + int Image::aio_mirror_image_promote(bool force, RBD::AioCompletion *c) { + ImageCtx *ictx = (ImageCtx *)ctx; + librbd::api::Mirror<>::image_promote( + ictx, force, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC, + get_aio_completion(c))); + return 0; + } + + int Image::aio_mirror_image_demote(RBD::AioCompletion *c) { + ImageCtx *ictx = (ImageCtx *)ctx; + librbd::api::Mirror<>::image_demote( + ictx, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC, + get_aio_completion(c))); + return 0; + } + + int Image::aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info, + size_t info_size, + RBD::AioCompletion *c) { + ImageCtx *ictx = (ImageCtx *)ctx; + + if (sizeof(mirror_image_info_t) != info_size) { + return -ERANGE; + } + + librbd::api::Mirror<>::image_get_info( + ictx, mirror_image_info, + new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC, + get_aio_completion(c))); + return 0; + } + + int Image::aio_mirror_image_get_status(mirror_image_status_t *status, + size_t status_size, + RBD::AioCompletion *c) { + ImageCtx *ictx = (ImageCtx *)ctx; + + if (sizeof(mirror_image_status_t) != status_size) { + return -ERANGE; + } + + librbd::api::Mirror<>::image_get_status( + ictx, status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC, + get_aio_completion(c))); + return 0; + } + + int Image::update_watch(UpdateWatchCtx *wctx, uint64_t *handle) { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, update_watch_enter, ictx, wctx); + int r = ictx->state->register_update_watcher(wctx, handle); + tracepoint(librbd, update_watch_exit, r, *handle); + return r; + } + + int Image::update_unwatch(uint64_t handle) { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, update_unwatch_enter, ictx, handle); + int r = ictx->state->unregister_update_watcher(handle); + tracepoint(librbd, update_unwatch_exit, r); + return r; + } + + int 
Image::list_watchers(std::list<librbd::image_watcher_t> &watchers) { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, list_watchers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::list_watchers(ictx, watchers); + if (r >= 0) { + for (auto &watcher : watchers) { + tracepoint(librbd, list_watchers_entry, watcher.addr.c_str(), watcher.id, watcher.cookie); + } + } + tracepoint(librbd, list_watchers_exit, r, watchers.size()); + return r; + } + + int Image::config_list(std::vector<config_option_t> *options) { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Config<>::list(ictx, options); + } + +} // namespace librbd + +extern "C" void rbd_version(int *major, int *minor, int *extra) +{ + if (major) + *major = LIBRBD_VER_MAJOR; + if (minor) + *minor = LIBRBD_VER_MINOR; + if (extra) + *extra = LIBRBD_VER_EXTRA; +} + +extern "C" void rbd_image_options_create(rbd_image_options_t* opts) +{ + librbd::image_options_create(opts); +} + +extern "C" void rbd_image_options_destroy(rbd_image_options_t opts) +{ + librbd::image_options_destroy(opts); +} + +extern "C" int rbd_image_options_set_string(rbd_image_options_t opts, int optname, + const char* optval) +{ + return librbd::image_options_set(opts, optname, optval); +} + +extern "C" int rbd_image_options_set_uint64(rbd_image_options_t opts, int optname, + uint64_t optval) +{ + return librbd::image_options_set(opts, optname, optval); +} + +extern "C" int rbd_image_options_get_string(rbd_image_options_t opts, int optname, + char* optval, size_t maxlen) +{ + std::string optval_; + + int r = librbd::image_options_get(opts, optname, &optval_); + + if (r < 0) { + return r; + } + + if (optval_.size() >= maxlen) { + return -E2BIG; + } + + strncpy(optval, optval_.c_str(), maxlen); + + return 0; +} + +extern "C" int rbd_image_options_get_uint64(rbd_image_options_t opts, int optname, + uint64_t* optval) +{ + return librbd::image_options_get(opts, optname, optval); +} + +extern "C" 
int rbd_image_options_is_set(rbd_image_options_t opts, int optname, + bool* is_set) +{ + return librbd::image_options_is_set(opts, optname, is_set); +} + +extern "C" int rbd_image_options_unset(rbd_image_options_t opts, int optname) +{ + return librbd::image_options_unset(opts, optname); +} + +extern "C" void rbd_image_options_clear(rbd_image_options_t opts) +{ + librbd::image_options_clear(opts); +} + +extern "C" int rbd_image_options_is_empty(rbd_image_options_t opts) +{ + return librbd::image_options_is_empty(opts); +} + +/* pool mirroring */ +extern "C" int rbd_mirror_site_name_get(rados_t cluster, char *name, + size_t *max_len) { + librados::Rados rados; + librados::Rados::from_rados_t(cluster, rados); + + std::string site_name; + int r = librbd::api::Mirror<>::site_name_get(rados, &site_name); + if (r < 0) { + return r; + } + + auto total_len = site_name.size() + 1; + if (*max_len < total_len) { + *max_len = total_len; + return -ERANGE; + } + *max_len = total_len; + + strcpy(name, site_name.c_str()); + return 0; +} + +extern "C" int rbd_mirror_site_name_set(rados_t cluster, const char *name) { + librados::Rados rados; + librados::Rados::from_rados_t(cluster, rados); + return librbd::api::Mirror<>::site_name_set(rados, name); +} + +extern "C" int rbd_mirror_mode_get(rados_ioctx_t p, + rbd_mirror_mode_t *mirror_mode) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + return librbd::api::Mirror<>::mode_get(io_ctx, mirror_mode); +} + +extern "C" int rbd_mirror_mode_set(rados_ioctx_t p, + rbd_mirror_mode_t mirror_mode) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + return librbd::api::Mirror<>::mode_set(io_ctx, mirror_mode); +} + +extern "C" int rbd_mirror_peer_bootstrap_create(rados_ioctx_t p, char *token, + size_t *max_len) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + std::string token_str; + int r = librbd::api::Mirror<>::peer_bootstrap_create(io_ctx, &token_str); 
+ if (r < 0) { + return r; + } + + auto total_len = token_str.size() + 1; + if (*max_len < total_len) { + *max_len = total_len; + return -ERANGE; + } + *max_len = total_len; + + strcpy(token, token_str.c_str()); + return 0; +} + +extern "C" int rbd_mirror_peer_bootstrap_import( + rados_ioctx_t p, rbd_mirror_peer_direction_t direction, + const char *token) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + return librbd::api::Mirror<>::peer_bootstrap_import(io_ctx, direction, token); +} + +extern "C" int rbd_mirror_peer_add(rados_ioctx_t p, char *uuid, + size_t uuid_max_length, + const char *cluster_name, + const char *client_name) { + static const std::size_t UUID_LENGTH = 36; + + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + if (uuid_max_length < UUID_LENGTH + 1) { + return -E2BIG; + } + + std::string uuid_str; + int r = librbd::api::Mirror<>::peer_add(io_ctx, &uuid_str, cluster_name, + client_name); + if (r >= 0) { + strncpy(uuid, uuid_str.c_str(), uuid_max_length); + uuid[uuid_max_length - 1] = '\0'; + } + return r; +} + +extern "C" int rbd_mirror_peer_remove(rados_ioctx_t p, const char *uuid) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + int r = librbd::api::Mirror<>::peer_remove(io_ctx, uuid); + return r; +} + +extern "C" int rbd_mirror_peer_list(rados_ioctx_t p, + rbd_mirror_peer_t *peers, int *max_peers) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + std::vector<librbd::mirror_peer_t> peer_vector; + int r = librbd::api::Mirror<>::peer_list(io_ctx, &peer_vector); + if (r < 0) { + return r; + } + + if (*max_peers < static_cast<int>(peer_vector.size())) { + *max_peers = static_cast<int>(peer_vector.size()); + return -ERANGE; + } + + for (int i = 0; i < static_cast<int>(peer_vector.size()); ++i) { + peers[i].uuid = strdup(peer_vector[i].uuid.c_str()); + peers[i].cluster_name = strdup(peer_vector[i].cluster_name.c_str()); + 
peers[i].client_name = strdup(peer_vector[i].client_name.c_str()); + } + *max_peers = static_cast<int>(peer_vector.size()); + return 0; +} + +extern "C" void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers, + int max_peers) { + for (int i = 0; i < max_peers; ++i) { + free(peers[i].uuid); + free(peers[i].cluster_name); + free(peers[i].client_name); + } +} + +extern "C" int rbd_mirror_peer_set_client(rados_ioctx_t p, const char *uuid, + const char *client_name) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + return librbd::api::Mirror<>::peer_set_client(io_ctx, uuid, client_name); +} + +extern "C" int rbd_mirror_peer_set_cluster(rados_ioctx_t p, const char *uuid, + const char *cluster_name) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + return librbd::api::Mirror<>::peer_set_cluster(io_ctx, uuid, cluster_name); +} + +extern "C" int rbd_mirror_peer_get_attributes( + rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len, + char *values, size_t *max_val_len, size_t *key_value_count) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + std::map<std::string, std::string> attributes; + int r = librbd::api::Mirror<>::peer_get_attributes(io_ctx, uuid, &attributes); + if (r < 0) { + return r; + } + + size_t key_total_len = 0, val_total_len = 0; + for (auto& it : attributes) { + key_total_len += it.first.size() + 1; + val_total_len += it.second.length() + 1; + } + + bool too_short = ((*max_key_len < key_total_len) || + (*max_val_len < val_total_len)); + + *max_key_len = key_total_len; + *max_val_len = val_total_len; + *key_value_count = attributes.size(); + if (too_short) { + return -ERANGE; + } + + char *keys_p = keys; + char *values_p = values; + for (auto& it : attributes) { + strncpy(keys_p, it.first.c_str(), it.first.size() + 1); + keys_p += it.first.size() + 1; + + strncpy(values_p, it.second.c_str(), it.second.length() + 1); + values_p += it.second.length() + 
1; + } + + return 0; +} + +extern "C" int rbd_mirror_peer_set_attributes( + rados_ioctx_t p, const char *uuid, const char *keys, const char *values, + size_t count) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + std::map<std::string, std::string> attributes; + + for (size_t i = 0; i < count; ++i) { + const char* key = keys; + keys += strlen(key) + 1; + const char* value = values; + values += strlen(value) + 1; + attributes[key] = value; + } + + return librbd::api::Mirror<>::peer_set_attributes(io_ctx, uuid, attributes); +} + +extern "C" int rbd_mirror_image_status_list(rados_ioctx_t p, + const char *start_id, size_t max, char **image_ids, + rbd_mirror_image_status_t *images, size_t *len) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + std::map<std::string, librbd::mirror_image_status_t> cpp_images; + + int r = librbd::api::Mirror<>::image_status_list(io_ctx, start_id, max, + &cpp_images); + if (r < 0) { + return r; + } + + size_t i = 0; + for (auto &it : cpp_images) { + ceph_assert(i < max); + const std::string &image_id = it.first; + image_ids[i] = strdup(image_id.c_str()); + mirror_image_status_cpp_to_c(it.second, &images[i]); + i++; + } + *len = i; + return 0; +} + +extern "C" void rbd_mirror_image_status_list_cleanup(char **image_ids, + rbd_mirror_image_status_t *images, size_t len) { + for (size_t i = 0; i < len; i++) { + free(image_ids[i]); + free(images[i].name); + free(images[i].info.global_id); + free(images[i].description); + } +} + +extern "C" int rbd_mirror_image_status_summary(rados_ioctx_t p, + rbd_mirror_image_status_state_t *states, int *counts, size_t *maxlen) { + + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + std::map<librbd::mirror_image_status_state_t, int> states_; + int r = librbd::api::Mirror<>::image_status_summary(io_ctx, &states_); + if (r < 0) { + return r; + } + + size_t i = 0; + for (auto &it : states_) { + if (i == *maxlen) { + return 
-ERANGE; + } + states[i] = it.first; + counts[i] = it.second; + i++; + } + *maxlen = i; + return 0; +} + +extern "C" int rbd_mirror_image_instance_id_list( + rados_ioctx_t p, const char *start_id, size_t max, char **image_ids, + char **instance_ids, size_t *len) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + std::map<std::string, std::string> cpp_instance_ids; + + int r = librbd::api::Mirror<>::image_instance_id_list(io_ctx, start_id, max, + &cpp_instance_ids); + if (r < 0) { + return r; + } + + size_t i = 0; + for (auto &it : cpp_instance_ids) { + ceph_assert(i < max); + image_ids[i] = strdup(it.first.c_str()); + instance_ids[i] = strdup(it.second.c_str()); + i++; + } + *len = i; + return 0; +} + +extern "C" void rbd_mirror_image_instance_id_list_cleanup( + char **image_ids, char **instance_ids, size_t len) { + for (size_t i = 0; i < len; i++) { + free(image_ids[i]); + free(instance_ids[i]); + } +} + +/* helpers */ + +extern "C" void rbd_image_spec_cleanup(rbd_image_spec_t *image) +{ + free(image->id); + free(image->name); +} + +extern "C" void rbd_image_spec_list_cleanup(rbd_image_spec_t *images, + size_t num_images) +{ + for (size_t idx = 0; idx < num_images; ++idx) { + rbd_image_spec_cleanup(&images[idx]); + } +} + +extern "C" void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image) +{ + free(image->pool_name); + free(image->pool_namespace); + free(image->image_id); + free(image->image_name); +} + +extern "C" void rbd_linked_image_spec_list_cleanup( + rbd_linked_image_spec_t *images, size_t num_images) +{ + for (size_t idx = 0; idx < num_images; ++idx) { + rbd_linked_image_spec_cleanup(&images[idx]); + } +} + +extern "C" void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap) +{ + free(snap->name); +} + +/* images */ +extern "C" int rbd_list(rados_ioctx_t p, char *names, size_t *size) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + 
TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id()); + std::vector<librbd::image_spec_t> cpp_image_specs; + int r = librbd::api::Image<>::list_images(io_ctx, &cpp_image_specs); + if (r < 0) { + tracepoint(librbd, list_exit, r, *size); + return r; + } + + size_t expected_size = 0; + + for (auto& it : cpp_image_specs) { + expected_size += it.name.size() + 1; + } + if (*size < expected_size) { + *size = expected_size; + tracepoint(librbd, list_exit, -ERANGE, *size); + return -ERANGE; + } + + if (names == NULL) { + tracepoint(librbd, list_exit, -EINVAL, *size); + return -EINVAL; + } + + for (auto& it : cpp_image_specs) { + const char* name = it.name.c_str(); + tracepoint(librbd, list_entry, name); + strcpy(names, name); + names += strlen(names) + 1; + } + tracepoint(librbd, list_exit, (int)expected_size, *size); + return (int)expected_size; +} + +extern "C" int rbd_list2(rados_ioctx_t p, rbd_image_spec_t *images, + size_t *size) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id()); + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(images, 0, sizeof(*images) * *size); + std::vector<librbd::image_spec_t> cpp_image_specs; + int r = librbd::api::Image<>::list_images(io_ctx, &cpp_image_specs); + if (r < 0) { + tracepoint(librbd, list_exit, r, *size); + return r; + } + + size_t expected_size = cpp_image_specs.size(); + if (*size < expected_size) { + *size = expected_size; + tracepoint(librbd, list_exit, -ERANGE, *size); + return -ERANGE; + } + + *size = expected_size; + for (size_t idx = 0; idx < expected_size; ++idx) { + images[idx].id = strdup(cpp_image_specs[idx].id.c_str()); + images[idx].name = strdup(cpp_image_specs[idx].name.c_str()); + } + tracepoint(librbd, list_exit, 0, *size); + return 0; +} + +extern "C" int rbd_create(rados_ioctx_t p, const char *name, uint64_t size, int *order) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order); + int r = librbd::create(io_ctx, name, size, order); + tracepoint(librbd, create_exit, r, *order); + return r; +} + +extern "C" int rbd_create2(rados_ioctx_t p, const char *name, + uint64_t size, uint64_t features, + int *order) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order); + int r = librbd::create(io_ctx, name, size, false, features, order, 0, 0); + tracepoint(librbd, create2_exit, r, *order); + return r; +} + +extern "C" int rbd_create3(rados_ioctx_t p, const char *name, + uint64_t size, uint64_t features, + int *order, + uint64_t stripe_unit, uint64_t stripe_count) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, 
create3_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count); + int r = librbd::create(io_ctx, name, size, false, features, order, + stripe_unit, stripe_count); + tracepoint(librbd, create3_exit, r, *order); + return r; +} + +extern "C" int rbd_create4(rados_ioctx_t p, const char *name, + uint64_t size, rbd_image_options_t opts) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts); + librbd::ImageOptions opts_(opts); + int r = librbd::create(io_ctx, name, "", size, opts_, "", "", false); + tracepoint(librbd, create4_exit, r); + return r; +} + +extern "C" int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snap_name, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order) +{ + librados::IoCtx p_ioc, c_ioc; + librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc); + librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc); + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc)); + tracepoint(librbd, clone_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, features); + int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name, + features, c_order, 0, 0); + tracepoint(librbd, clone_exit, r, *c_order); + return r; +} + +extern "C" int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snap_name, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order, + uint64_t stripe_unit, int stripe_count) +{ + librados::IoCtx p_ioc, c_ioc; + librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc); + librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc); + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc)); + tracepoint(librbd, clone2_enter, 
p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, features, stripe_unit, stripe_count); + int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name, + features, c_order, stripe_unit, stripe_count); + tracepoint(librbd, clone2_exit, r, *c_order); + return r; +} + /* Options-based clone: wraps c_opts in a librbd::ImageOptions and calls the librbd::clone overload that takes it, passing nullptr for the two id arguments and empty strings for the trailing arguments. */ +extern "C" int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snap_name, rados_ioctx_t c_ioctx, + const char *c_name, rbd_image_options_t c_opts) +{ + librados::IoCtx p_ioc, c_ioc; + librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc); + librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc); + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc)); + tracepoint(librbd, clone3_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, c_opts); + librbd::ImageOptions c_opts_(c_opts); + int r = librbd::clone(p_ioc, nullptr, p_name, p_snap_name, c_ioc, nullptr, + c_name, c_opts_, "", ""); + tracepoint(librbd, clone3_exit, r); + return r; +} + /* Removes image `name` through librbd::api::Image<>::remove with a no-op progress context. Returns that call's result. */ +extern "C" int rbd_remove(rados_ioctx_t p, const char *name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Image<>::remove(io_ctx, name, prog_ctx); + tracepoint(librbd, remove_exit, r); + return r; +} + /* Same as rbd_remove (and reuses its remove_enter/remove_exit tracepoints), but wraps the caller's callback cb/cbdata in a CProgressContext. */ +extern "C" int rbd_remove_with_progress(rados_ioctx_t p, const char *name, + librbd_progress_fn_t cb, void *cbdata) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = 
librbd::api::Image<>::remove(io_ctx, name, prog_ctx); + tracepoint(librbd, remove_exit, r); + return r; +} + /* Moves image `name` to the trash via librbd::api::Trash<>::move, tagging it RBD_TRASH_IMAGE_SOURCE_USER and forwarding `delay` unchanged. */ +extern "C" int rbd_trash_move(rados_ioctx_t p, const char *name, + uint64_t delay) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_move_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + int r = librbd::api::Trash<>::move(io_ctx, RBD_TRASH_IMAGE_SOURCE_USER, name, + delay); + tracepoint(librbd, trash_move_exit, r); + return r; +} + /* Fetches one trash entry by id and, on success, converts it into *info with trash_image_info_cpp_to_c. *info is not written on failure. Emits no tracepoints. */ +extern "C" int rbd_trash_get(rados_ioctx_t io, const char *id, + rbd_trash_image_info_t *info) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + librbd::trash_image_info_t cpp_info; + int r = librbd::api::Trash<>::get(io_ctx, id, &cpp_info); + if (r < 0) { + return r; + } + + trash_image_info_cpp_to_c(cpp_info, info); + return 0; +} + /* Releases the id and name strings of a trash entry with free(). The struct itself is not freed. */ +extern "C" void rbd_trash_get_cleanup(rbd_trash_image_info_t *info) { + free(info->id); + free(info->name); +} + /* Lists trash entries into the caller's array. *num_entries is the array capacity on entry; the array is zeroed up front. */ +extern "C" int rbd_trash_list(rados_ioctx_t p, rbd_trash_image_info_t *entries, + size_t *num_entries) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_list_enter, + io_ctx.get_pool_name().c_str(), io_ctx.get_id()); + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(entries, 0, sizeof(*entries) * *num_entries); + + vector<librbd::trash_image_info_t> cpp_entries; + int r = librbd::api::Trash<>::list(io_ctx, cpp_entries, true); + if (r < 0) { + tracepoint(librbd, trash_list_exit, r, *num_entries); + return r; + } + + if (*num_entries < cpp_entries.size()) { + *num_entries = cpp_entries.size(); + tracepoint(librbd, trash_list_exit, -ERANGE, *num_entries); + return -ERANGE; + } + + int i=0; + for (const auto &entry : cpp_entries) { + trash_image_info_cpp_to_c(entry, &entries[i++]); + } + *num_entries = cpp_entries.size(); + + return *num_entries; +} + +extern "C" void rbd_trash_list_cleanup(rbd_trash_image_info_t *entries, + size_t num_entries) { + for (size_t i=0; i < num_entries; i++) { + rbd_trash_get_cleanup(&entries[i]); + } +} + +extern "C" int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, + float threshold) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), expire_ts, threshold); + librbd::NoOpProgressContext nop_pctx; + int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, nop_pctx); + tracepoint(librbd, trash_purge_exit, r); + return r; +} + +extern "C" int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts, + float threshold, librbd_progress_fn_t cb, void* cbdata) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), expire_ts, threshold); + librbd::CProgressContext pctx(cb, cbdata); + int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, pctx); + tracepoint(librbd, trash_purge_exit, r); + return r; +} + +extern "C" int rbd_trash_remove(rados_ioctx_t p, const char *image_id, + bool force) { + 
librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_id, force); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx); + tracepoint(librbd, trash_remove_exit, r); + return r; +} + /* Same as rbd_trash_remove (shared tracepoints), but reports progress through the caller's cb/cbdata. */ +extern "C" int rbd_trash_remove_with_progress(rados_ioctx_t p, + const char *image_id, + bool force, + librbd_progress_fn_t cb, + void *cbdata) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_id, force); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx); + tracepoint(librbd, trash_remove_exit, r); + return r; +} + /* Restores trash entry `id` under `name` via librbd::api::Trash<>::restore with RESTORE_SOURCE_WHITELIST. The tracepoints are named trash_undelete_enter/exit. */ +extern "C" int rbd_trash_restore(rados_ioctx_t p, const char *id, + const char *name) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_undelete_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), id, name); + int r = librbd::api::Trash<>::restore( + io_ctx, librbd::api::Trash<>::RESTORE_SOURCE_WHITELIST, id, name); + tracepoint(librbd, trash_undelete_exit, r); + return r; +} + /* Thin untraced wrapper around librbd::api::Namespace<>::create. */ +extern "C" int rbd_namespace_create(rados_ioctx_t io, + const char *namespace_name) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + return librbd::api::Namespace<>::create(io_ctx, namespace_name); +} + /* Thin untraced wrapper around librbd::api::Namespace<>::remove. */ +extern "C" int rbd_namespace_remove(rados_ioctx_t io, + const char *namespace_name) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + return librbd::api::Namespace<>::remove(io_ctx, 
namespace_name); +} + /* Writes all namespace names into `names` as consecutive NUL-terminated strings. *size is the buffer capacity in bytes on entry and the required byte count on return. Returns -EINVAL for a null names/size, -ERANGE (with *size updated) when the buffer is too small, otherwise the number of bytes written. */ +extern "C" int rbd_namespace_list(rados_ioctx_t io, char *names, size_t *size) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + if (names == nullptr || size == nullptr) { + return -EINVAL; + } + + std::vector<std::string> cpp_names; + int r = librbd::api::Namespace<>::list(io_ctx, &cpp_names); + if (r < 0) { + return r; + } + + size_t expected_size = 0; + for (size_t i = 0; i < cpp_names.size(); i++) { + expected_size += cpp_names[i].size() + 1; /* +1 for each terminating NUL */ + } + if (*size < expected_size) { + *size = expected_size; + return -ERANGE; + } + + *size = expected_size; + for (int i = 0; i < (int)cpp_names.size(); i++) { + const char* name = cpp_names[i].c_str(); + strcpy(names, name); + names += strlen(names) + 1; /* advance past the copied string and its NUL */ + } + + return (int)expected_size; +} + /* Thin untraced wrapper around librbd::api::Namespace<>::exists; the answer is stored through `exists`. */ +extern "C" int rbd_namespace_exists(rados_ioctx_t io, + const char *namespace_name, + bool *exists) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + return librbd::api::Namespace<>::exists(io_ctx, namespace_name, exists); +} + /* Thin untraced wrapper around librbd::api::Pool<>::init. */ +extern "C" int rbd_pool_init(rados_ioctx_t io, bool force) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + return librbd::api::Pool<>::init(io_ctx, force); +} + /* Allocates a value-initialized StatOptions with new and hands it back as an opaque rbd_pool_stats_t. Pair with rbd_pool_stats_destroy. */ +extern "C" void rbd_pool_stats_create(rbd_pool_stats_t *stats) { + *stats = reinterpret_cast<rbd_pool_stats_t>( + new librbd::api::Pool<>::StatOptions{}); +} + /* Deletes the StatOptions behind an opaque rbd_pool_stats_t handle. */ +extern "C" void rbd_pool_stats_destroy(rbd_pool_stats_t stats) { + auto pool_stat_options = + reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats); + delete pool_stat_options; +} + /* Registers stat_val as the destination for stat_option by forwarding to Pool<>::add_stat_option. stat_option is cast to rbd_pool_stat_option_t without range checking here. */ +extern "C" int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats, + int stat_option, + uint64_t* stat_val) { + auto pool_stat_options = + reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats); + return librbd::api::Pool<>::add_stat_option( + pool_stat_options, static_cast<rbd_pool_stat_option_t>(stat_option), + stat_val); +} + /* Runs Pool<>::get_stats for the pool behind `io` using the options stored in the opaque pool_stats handle. */ +extern "C" int rbd_pool_stats_get( + 
rados_ioctx_t io, rbd_pool_stats_t pool_stats) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + auto pool_stat_options = + reinterpret_cast<librbd::api::Pool<>::StatOptions*>(pool_stats); + return librbd::api::Pool<>::get_stats(io_ctx, pool_stat_options); +} + /* Copies the open image to a new image `destname` in the destination pool, using default-constructed ImageOptions, a no-op progress context and a final argument of 0 (the slot rbd_copy4 fills with sparse_size). */ +extern "C" int rbd_copy(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname); + librbd::ImageOptions opts; + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0); + tracepoint(librbd, copy_exit, r); + return r; +} + /* Copies one open image into another already-open image, with a no-op progress context and a final argument of 0. */ +extern "C" int rbd_copy2(rbd_image_t srcp, rbd_image_t destp) +{ + librbd::ImageCtx *src = (librbd::ImageCtx *)srcp; + librbd::ImageCtx *dest = (librbd::ImageCtx *)destp; + tracepoint(librbd, copy2_enter, src, src->name.c_str(), src->snap_name.c_str(), src->read_only, dest, dest->name.c_str(), dest->snap_name.c_str(), dest->read_only); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(src, dest, prog_ctx, 0); + tracepoint(librbd, copy2_exit, r); + return r; +} + /* Like rbd_copy, but the destination image options come from the caller's c_opts. */ +extern "C" int rbd_copy3(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, rbd_image_options_t c_opts) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, c_opts); + librbd::ImageOptions c_opts_(c_opts); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, c_opts_, prog_ctx, 0); + 
tracepoint(librbd, copy3_exit, r); + return r; +} + /* Like rbd_copy3, but forwards the caller's sparse_size as the final librbd::copy argument. */ +extern "C" int rbd_copy4(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, rbd_image_options_t c_opts, size_t sparse_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, c_opts, sparse_size); + librbd::ImageOptions c_opts_(c_opts); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, c_opts_, prog_ctx, sparse_size); + tracepoint(librbd, copy4_exit, r); + return r; +} + /* rbd_copy with progress reporting through fn/data. Reuses the copy_enter/copy_exit tracepoints. */ +extern "C" int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + librbd_progress_fn_t fn, void *data) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname); + librbd::ImageOptions opts; + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0); + tracepoint(librbd, copy_exit, ret); + return ret; +} + /* rbd_copy2 with progress reporting through fn/data. Reuses the copy2_enter/copy2_exit tracepoints. */ +extern "C" int rbd_copy_with_progress2(rbd_image_t srcp, rbd_image_t destp, + librbd_progress_fn_t fn, void *data) +{ + librbd::ImageCtx *src = (librbd::ImageCtx *)srcp; + librbd::ImageCtx *dest = (librbd::ImageCtx *)destp; + tracepoint(librbd, copy2_enter, src, src->name.c_str(), src->snap_name.c_str(), src->read_only, dest, dest->name.c_str(), dest->snap_name.c_str(), dest->read_only); + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::copy(src, dest, prog_ctx, 0); + tracepoint(librbd, copy2_exit, ret); + return ret; +} + /* rbd_copy3 with progress reporting through fn/data. Reuses the copy3_enter/copy3_exit tracepoints. */ +extern "C" int 
rbd_copy_with_progress3(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t fn, void *data) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, dest_opts); + librbd::ImageOptions dest_opts_(dest_opts); + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::copy(ictx, dest_io_ctx, destname, dest_opts_, prog_ctx, 0); + tracepoint(librbd, copy3_exit, ret); + return ret; +} + /* rbd_copy4 with progress reporting through fn/data. Reuses the copy4_enter/copy4_exit tracepoints and forwards sparse_size. */ +extern "C" int rbd_copy_with_progress4(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t fn, void *data, size_t sparse_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, dest_opts, sparse_size); + librbd::ImageOptions dest_opts_(dest_opts); + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::copy(ictx, dest_io_ctx, destname, dest_opts_, prog_ctx, sparse_size); + tracepoint(librbd, copy4_exit, ret); + return ret; +} + /* Deep-copies the open image to `destname` in the destination pool via librbd::api::Image<>::deep_copy, using the caller's options and a no-op progress context. */ +extern "C" int rbd_deep_copy(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, rbd_image_options_t c_opts) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, + dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), + destname, c_opts); + librbd::ImageOptions 
opts(c_opts); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts, + prog_ctx); + tracepoint(librbd, deep_copy_exit, r); + return r; +} + /* rbd_deep_copy with progress reporting through fn/data. Reuses the deep_copy_enter/deep_copy_exit tracepoints. */ +extern "C" int rbd_deep_copy_with_progress(rbd_image_t image, + rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t fn, void *data) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, + dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), + destname, dest_opts); + librbd::ImageOptions opts(dest_opts); + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts, + prog_ctx); + tracepoint(librbd, deep_copy_exit, ret); + return ret; +} + /* Runs ictx->operations->flatten on the open image with a no-op progress context. */ +extern "C" int rbd_flatten(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->flatten(prog_ctx); + tracepoint(librbd, flatten_exit, r); + return r; +} + /* rbd_flatten with progress reporting through cb/cbdata. Reuses the flatten_enter/flatten_exit tracepoints. */ +extern "C" int rbd_flatten_with_progress(rbd_image_t image, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->flatten(prog_ctx); + tracepoint(librbd, flatten_exit, r); + return r; +} + /* Runs ictx->operations->sparsify(sparse_size, ...) on the open image with a no-op progress context. */ +extern "C" int rbd_sparsify(rbd_image_t image, size_t sparse_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size, + ictx->id.c_str()); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->sparsify(sparse_size, 
prog_ctx); + tracepoint(librbd, sparsify_exit, r); + return r; +} + /* rbd_sparsify with progress reporting through cb/cbdata. Reuses the sparsify_enter/sparsify_exit tracepoints. */ +extern "C" int rbd_sparsify_with_progress(rbd_image_t image, size_t sparse_size, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size, + ictx->id.c_str()); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->sparsify(sparse_size, prog_ctx); + tracepoint(librbd, sparsify_exit, r); + return r; +} + /* Renames image srcname to destname within the pool behind src_p via librbd::rename. */ +extern "C" int rbd_rename(rados_ioctx_t src_p, const char *srcname, + const char *destname) +{ + librados::IoCtx src_io_ctx; + librados::IoCtx::from_rados_ioctx_t(src_p, src_io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(src_io_ctx)); + tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname); + int r = librbd::rename(src_io_ctx, srcname, destname); + tracepoint(librbd, rename_exit, r); + return r; +} + /* Prepares migration of image_name (source pool p) to dest_image_name (pool dest_p) via librbd::api::Migration<>::prepare, using the caller's image options. Returns that call's result. */ +extern "C" int rbd_migration_prepare(rados_ioctx_t p, const char *image_name, + rados_ioctx_t dest_p, + const char *dest_image_name, + rbd_image_options_t opts_) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + /* Initialize the tracepoint provider before the first tracepoint, as the other pool-level traced entry points (rbd_rename, rbd_migration_execute, ...) do; this call was missing here. */ + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_prepare_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name, dest_io_ctx.get_pool_name().c_str(), + dest_io_ctx.get_id(), dest_image_name, opts_); + librbd::ImageOptions opts(opts_); + int r = librbd::api::Migration<>::prepare(io_ctx, image_name, dest_io_ctx, + dest_image_name, opts); + tracepoint(librbd, migration_prepare_exit, r); + return r; +} + /* Executes a prepared migration of image_name via librbd::api::Migration<>::execute with a no-op progress context. */ +extern "C" int rbd_migration_execute(rados_ioctx_t p, const char *image_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, 
migration_execute_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_execute_exit, r); + return r; +} + /* rbd_migration_execute with progress reporting through fn/data. Reuses the migration_execute tracepoints. */ +extern "C" int rbd_migration_execute_with_progress(rados_ioctx_t p, + const char *name, + librbd_progress_fn_t fn, + void *data) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + librbd::CProgressContext prog_ctx(fn, data); + int r = librbd::api::Migration<>::execute(io_ctx, name, prog_ctx); + tracepoint(librbd, migration_execute_exit, r); + return r; +} + /* Aborts the migration of image_name via librbd::api::Migration<>::abort with a no-op progress context. */ +extern "C" int rbd_migration_abort(rados_ioctx_t p, const char *image_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_abort_exit, r); + return r; +} + /* rbd_migration_abort with progress reporting through fn/data. Reuses the migration_abort tracepoints. */ +extern "C" int rbd_migration_abort_with_progress(rados_ioctx_t p, + const char *name, + librbd_progress_fn_t fn, + void *data) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + librbd::CProgressContext prog_ctx(fn, data); + int r = librbd::api::Migration<>::abort(io_ctx, name, prog_ctx); + tracepoint(librbd, migration_abort_exit, r); + return r; +} + /* Commits the migration of image_name via librbd::api::Migration<>::commit with a no-op progress context. */ +extern "C" int rbd_migration_commit(rados_ioctx_t p, const char *image_name) +{ + 
librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_commit_exit, r); + return r; +} + /* rbd_migration_commit with progress reporting through fn/data. Reuses the migration_commit tracepoints. */ +extern "C" int rbd_migration_commit_with_progress(rados_ioctx_t p, + const char *name, + librbd_progress_fn_t fn, + void *data) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + librbd::CProgressContext prog_ctx(fn, data); + int r = librbd::api::Migration<>::commit(io_ctx, name, prog_ctx); + tracepoint(librbd, migration_commit_exit, r); + return r; +} + /* Fills *status with the migration state of image_name. status_size must equal sizeof(rbd_image_migration_status_t) or -ERANGE is returned. On success every string field is a strdup() copy; rbd_migration_status_cleanup frees them. *status is left untouched on failure. */ +extern "C" int rbd_migration_status(rados_ioctx_t p, const char *image_name, + rbd_image_migration_status_t *status, + size_t status_size) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_status_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + + if (status_size != sizeof(rbd_image_migration_status_t)) { + tracepoint(librbd, migration_status_exit, -ERANGE); + return -ERANGE; + } + + librbd::image_migration_status_t cpp_status; + int r = librbd::api::Migration<>::status(io_ctx, image_name, &cpp_status); + if (r >= 0) { + status->source_pool_id = cpp_status.source_pool_id; + status->source_pool_namespace = + strdup(cpp_status.source_pool_namespace.c_str()); + status->source_image_name = strdup(cpp_status.source_image_name.c_str()); + status->source_image_id = strdup(cpp_status.source_image_id.c_str()); + status->dest_pool_id = 
cpp_status.dest_pool_id; + status->dest_pool_namespace = + strdup(cpp_status.dest_pool_namespace.c_str()); + status->dest_image_name = strdup(cpp_status.dest_image_name.c_str()); + status->dest_image_id = strdup(cpp_status.dest_image_id.c_str()); + status->state = cpp_status.state; + status->state_description = strdup(cpp_status.state_description.c_str()); + } + + tracepoint(librbd, migration_status_exit, r); + return r; +} + +extern "C" void rbd_migration_status_cleanup(rbd_image_migration_status_t *s) +{ + free(s->source_pool_namespace); + free(s->source_image_name); + free(s->source_image_id); + free(s->dest_pool_namespace); + free(s->dest_image_name); + free(s->dest_image_id); + free(s->state_description); +} + +extern "C" int rbd_pool_metadata_get(rados_ioctx_t p, const char *key, + char *value, size_t *vallen) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + string val_s; + int r = librbd::api::PoolMetadata<>::get(io_ctx, key, &val_s); + if (*vallen < val_s.size() + 1) { + r = -ERANGE; + *vallen = val_s.size() + 1; + } else { + strncpy(value, val_s.c_str(), val_s.size() + 1); + } + + return r; +} + +extern "C" int rbd_pool_metadata_set(rados_ioctx_t p, const char *key, + const char *value) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + int r = librbd::api::PoolMetadata<>::set(io_ctx, key, value); + return r; +} + +extern "C" int rbd_pool_metadata_remove(rados_ioctx_t p, const char *key) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + int r = librbd::api::PoolMetadata<>::remove(io_ctx, key); + return r; +} + +extern "C" int rbd_pool_metadata_list(rados_ioctx_t p, const char *start, + uint64_t max, char *key, size_t *key_len, + char *value, size_t *val_len) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + map<string, bufferlist> pairs; + int r = librbd::api::PoolMetadata<>::list(io_ctx, start, max, &pairs); + if (r < 0) { + return 
r; + } + size_t key_total_len = 0, val_total_len = 0; + for (auto &it : pairs) { + key_total_len += it.first.size() + 1; + val_total_len += it.second.length() + 1; + } + if (*key_len < key_total_len || *val_len < val_total_len) { + *key_len = key_total_len; + *val_len = val_total_len; + return -ERANGE; + } + *key_len = key_total_len; + *val_len = val_total_len; + + char *key_p = key, *value_p = value; + for (auto &it : pairs) { + strncpy(key_p, it.first.c_str(), it.first.size() + 1); + key_p += it.first.size() + 1; + strncpy(value_p, it.second.c_str(), it.second.length()); + value_p += it.second.length(); + *value_p = '\0'; /* the value bytes are copied without a terminator, so add one explicitly */ + value_p++; + } + return 0; +} + /* Lists pool-level config options into the caller's array. *max_options is the array capacity on entry and the option count on return. Returns -ERANGE (with *max_options updated) when the array is too small, 0 on success. */ +extern "C" int rbd_config_pool_list(rados_ioctx_t p, + rbd_config_option_t *options, + int *max_options) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + std::vector<librbd::config_option_t> option_vector; + int r = librbd::api::Config<>::list(io_ctx, &option_vector); + if (r < 0) { + return r; + } + + if (*max_options < static_cast<int>(option_vector.size())) { + *max_options = static_cast<int>(option_vector.size()); + return -ERANGE; + } + + for (int i = 0; i < static_cast<int>(option_vector.size()); ++i) { + config_option_cpp_to_c(option_vector[i], &options[i]); + } + *max_options = static_cast<int>(option_vector.size()); + return 0; +} + /* Calls config_option_cleanup on each of the first max_options elements. The array itself is not freed. */ +extern "C" void rbd_config_pool_list_cleanup(rbd_config_option_t *options, + int max_options) { + for (int i = 0; i < max_options; ++i) { + config_option_cleanup(options[i]); + } +} + /* Opens image `name` (optionally at snap_name) read-write: allocates an ImageCtx and calls state->open(0). *image is written only when open returns >= 0. NOTE(review): what happens to ictx when open fails is not visible here; confirm that state->open releases it. */ +extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image, + const char *snap_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, + false); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), 
ictx->read_only); + + int r = ictx->state->open(0); + if (r >= 0) { + *image = (rbd_image_t)ictx; + } + tracepoint(librbd, open_image_exit, r); + return r; +} + /* Like rbd_open, but identifies the image by id: the ImageCtx is built with an empty name and the given id. */ +extern "C" int rbd_open_by_id(rados_ioctx_t p, const char *id, + rbd_image_t *image, const char *snap_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, + false); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), + ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + int r = ictx->state->open(0); + if (r >= 0) { + *image = (rbd_image_t)ictx; + } + tracepoint(librbd, open_image_exit, r); + return r; +} + /* Asynchronous rbd_open: hands ictx, the caller's completion and the `image` out-pointer to a C_OpenComplete passed to state->open, then returns 0 immediately. This function does not write *image itself. */ +extern "C" int rbd_aio_open(rados_ioctx_t p, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, + false); + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc); + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp), + image)); + tracepoint(librbd, aio_open_image_exit, 0); + return 0; +} + /* Asynchronous rbd_open_by_id: same flow as rbd_aio_open, with the image identified by id. */ +extern "C" int rbd_aio_open_by_id(rados_ioctx_t p, const char *id, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, + false); + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_open_image_enter, ictx, 
ictx->name.c_str(), + ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, + comp->pc); + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp), + image)); + tracepoint(librbd, aio_open_image_exit, 0); + return 0; +} + /* Like rbd_open, but the ImageCtx is constructed with its final argument true instead of false (the read-only variant). */ +extern "C" int rbd_open_read_only(rados_ioctx_t p, const char *name, + rbd_image_t *image, const char *snap_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, + true); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + int r = ictx->state->open(0); + if (r >= 0) { + *image = (rbd_image_t)ictx; + } + tracepoint(librbd, open_image_exit, r); + return r; +} + /* Read-only variant of rbd_open_by_id. */ +extern "C" int rbd_open_by_id_read_only(rados_ioctx_t p, const char *id, + rbd_image_t *image, const char *snap_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, + true); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), + ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + int r = ictx->state->open(0); + if (r >= 0) { + *image = (rbd_image_t)ictx; + } + tracepoint(librbd, open_image_exit, r); + return r; +} + /* Read-only variant of rbd_aio_open: queues the open and returns 0 immediately. */ +extern "C" int rbd_aio_open_read_only(rados_ioctx_t p, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, + true); + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), 
ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc); + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp), + image)); + tracepoint(librbd, aio_open_image_exit, 0); + return 0; +} + /* Read-only variant of rbd_aio_open_by_id: queues the open and returns 0 immediately. */ +extern "C" int rbd_aio_open_by_id_read_only(rados_ioctx_t p, const char *id, + rbd_image_t *image, + const char *snap_name, + rbd_completion_t c) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, + true); + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), + ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc); + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp), + image)); + tracepoint(librbd, aio_open_image_exit, 0); + return 0; +} + /* Closes an open image via ictx->state->close() and returns its result. NOTE(review): ictx is not deleted here; presumably close() releases it, confirm. */ +extern "C" int rbd_close(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + + int r = ictx->state->close(); + + tracepoint(librbd, close_image_exit, r); + return r; +} + /* Asynchronous close: wraps the caller's completion in a C_AioCompletion of type AIO_TYPE_CLOSE, passes it to state->close and returns 0 immediately. */ +extern "C" int rbd_aio_close(rbd_image_t image, rbd_completion_t c) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), comp->pc); + ictx->state->close(new C_AioCompletion(ictx, librbd::io::AIO_TYPE_CLOSE, + get_aio_completion(comp))); + tracepoint(librbd, aio_close_image_exit, 0); + return 0; +} + /* Resizes the image to `size` with a no-op progress context. The second resize argument (exposed as allow_shrink by rbd_resize2) is fixed to true. */ +extern "C" int rbd_resize(rbd_image_t image, uint64_t size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + librbd::NoOpProgressContext prog_ctx; + int r = 
ictx->operations->resize(size, true, prog_ctx); + tracepoint(librbd, resize_exit, r); + return r; +} + +extern "C" int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->resize(size, allow_shrink, prog_ctx); + tracepoint(librbd, resize_exit, r); + return r; +} + +extern "C" int rbd_resize_with_progress(rbd_image_t image, uint64_t size, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->resize(size, true, prog_ctx); + tracepoint(librbd, resize_exit, r); + return r; +} + +extern "C" int rbd_stat(rbd_image_t image, rbd_image_info_t *info, + size_t infosize) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, stat_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::info(ictx, *info, infosize); + tracepoint(librbd, stat_exit, r, info); + return r; +} + +extern "C" int rbd_get_old_format(rbd_image_t image, uint8_t *old) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_old_format_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_old_format(ictx, old); + tracepoint(librbd, get_old_format_exit, r, *old); + return r; +} + +extern "C" int rbd_get_size(rbd_image_t image, uint64_t *size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_size_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_size(ictx, size); + tracepoint(librbd, 
get_size_exit, r, *size); + return r; +} + +extern "C" int rbd_get_features(rbd_image_t image, uint64_t *features) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_features_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_features(ictx, features); + tracepoint(librbd, get_features_exit, r, *features); + return r; +} + +extern "C" int rbd_update_features(rbd_image_t image, uint64_t features, + uint8_t enabled) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image); + bool features_enabled = enabled != 0; + tracepoint(librbd, update_features_enter, ictx, features, features_enabled); + int r = ictx->operations->update_features(features, features_enabled); + tracepoint(librbd, update_features_exit, r); + return r; +} + +extern "C" int rbd_get_op_features(rbd_image_t image, uint64_t *op_features) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Image<>::get_op_features(ictx, op_features); +} + +extern "C" int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_stripe_unit_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + *stripe_unit = ictx->get_stripe_unit(); + tracepoint(librbd, get_stripe_unit_exit, 0, *stripe_unit); + return 0; +} + +extern "C" int rbd_get_stripe_count(rbd_image_t image, uint64_t *stripe_count) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_stripe_count_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + *stripe_count = ictx->get_stripe_count(); + tracepoint(librbd, get_stripe_count_exit, 0, *stripe_count); + return 0; +} + +extern "C" int rbd_get_create_timestamp(rbd_image_t image, + struct timespec *timestamp) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_create_timestamp_enter, ictx, ictx->name.c_str(), + 
ictx->read_only); + utime_t time = ictx->get_create_timestamp(); + time.to_timespec(timestamp); + tracepoint(librbd, get_create_timestamp_exit, 0, timestamp); + return 0; +} + +extern "C" int rbd_get_access_timestamp(rbd_image_t image, + struct timespec *timestamp) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_access_timestamp_enter, ictx, ictx->name.c_str(), + ictx->read_only); + utime_t time = ictx->get_access_timestamp(); + time.to_timespec(timestamp); + tracepoint(librbd, get_access_timestamp_exit, 0, timestamp); + return 0; +} + +extern "C" int rbd_get_modify_timestamp(rbd_image_t image, + struct timespec *timestamp) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_modify_timestamp_enter, ictx, ictx->name.c_str(), + ictx->read_only); + utime_t time = ictx->get_modify_timestamp(); + time.to_timespec(timestamp); + tracepoint(librbd, get_modify_timestamp_exit, 0, timestamp); + return 0; +} + + +extern "C" int rbd_get_overlap(rbd_image_t image, uint64_t *overlap) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_overlap_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_overlap(ictx, overlap); + tracepoint(librbd, get_overlap_exit, r, *overlap); + return r; +} + +extern "C" int rbd_get_name(rbd_image_t image, char *name, size_t *name_len) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image); + if (*name_len <= ictx->name.size()) { + *name_len = ictx->name.size() + 1; + return -ERANGE; + } + + strncpy(name, ictx->name.c_str(), ictx->name.size()); + name[ictx->name.size()] = '\0'; + *name_len = ictx->name.size() + 1; + return 0; +} + +extern "C" int rbd_get_id(rbd_image_t image, char *id, size_t id_len) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image); + if (ictx->old_format) { + return -EINVAL; + } + if (ictx->id.size() >= id_len) { + return -ERANGE; + } + + strncpy(id, 
ictx->id.c_str(), id_len - 1); + id[id_len - 1] = '\0'; + return 0; +} + +extern "C" int rbd_get_block_name_prefix(rbd_image_t image, char *prefix, + size_t prefix_len) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image); + if (ictx->object_prefix.size() >= prefix_len) { + return -ERANGE; + } + + strncpy(prefix, ictx->object_prefix.c_str(), prefix_len - 1); + prefix[prefix_len - 1] = '\0'; + return 0; +} + +extern "C" int64_t rbd_get_data_pool_id(rbd_image_t image) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image); + return librbd::api::Image<>::get_data_pool_id(ictx); +} + +extern "C" int rbd_get_parent_info(rbd_image_t image, + char *parent_pool_name, size_t ppool_namelen, + char *parent_name, size_t pnamelen, + char *parent_snap_name, size_t psnap_namelen) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + librbd::linked_image_spec_t parent_image; + librbd::snap_spec_t parent_snap; + int r = librbd::api::Image<>::get_parent(ictx, &parent_image, &parent_snap); + if (r >= 0) { + if (parent_pool_name) { + if (parent_image.pool_name.length() + 1 > ppool_namelen) { + r = -ERANGE; + } else { + strcpy(parent_pool_name, parent_image.pool_name.c_str()); + } + } + if (parent_name) { + if (parent_image.image_name.length() + 1 > pnamelen) { + r = -ERANGE; + } else { + strcpy(parent_name, parent_image.image_name.c_str()); + } + } + if (parent_snap_name) { + if (parent_snap.name.length() + 1 > psnap_namelen) { + r = -ERANGE; + } else { + strcpy(parent_snap_name, parent_snap.name.c_str()); + } + } + } + + if (r < 0) { + tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL); + return r; + } + + tracepoint(librbd, get_parent_info_exit, r, + parent_image.pool_name.c_str(), + parent_image.image_name.c_str(), + parent_image.image_id.c_str(), + parent_snap.name.c_str()); + return 0; +} + +extern 
"C" int rbd_get_parent_info2(rbd_image_t image, + char *parent_pool_name, + size_t ppool_namelen, + char *parent_name, size_t pnamelen, + char *parent_id, size_t pidlen, + char *parent_snap_name, + size_t psnap_namelen) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + librbd::linked_image_spec_t parent_image; + librbd::snap_spec_t parent_snap; + int r = librbd::api::Image<>::get_parent(ictx, &parent_image, &parent_snap); + if (r >= 0) { + if (parent_pool_name) { + if (parent_image.pool_name.length() + 1 > ppool_namelen) { + r = -ERANGE; + } else { + strcpy(parent_pool_name, parent_image.pool_name.c_str()); + } + } + if (parent_name) { + if (parent_image.image_name.length() + 1 > pnamelen) { + r = -ERANGE; + } else { + strcpy(parent_name, parent_image.image_name.c_str()); + } + } + if (parent_id) { + if (parent_image.image_id.length() + 1 > pidlen) { + r = -ERANGE; + } else { + strcpy(parent_id, parent_image.image_id.c_str()); + } + } + if (parent_snap_name) { + if (parent_snap.name.length() + 1 > psnap_namelen) { + r = -ERANGE; + } else { + strcpy(parent_snap_name, parent_snap.name.c_str()); + } + } + } + + if (r < 0) { + tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL); + return r; + } + + tracepoint(librbd, get_parent_info_exit, r, + parent_image.pool_name.c_str(), + parent_image.image_name.c_str(), + parent_image.image_id.c_str(), + parent_snap.name.c_str()); + return 0; +} + +extern "C" int rbd_get_parent(rbd_image_t image, + rbd_linked_image_spec_t *parent_image, + rbd_snap_spec_t *parent_snap) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + librbd::linked_image_spec_t cpp_parent_image; + librbd::snap_spec_t cpp_parent_snap; + int r = librbd::api::Image<>::get_parent(ictx, 
&cpp_parent_image, + &cpp_parent_snap); + if (r < 0) { + // FIPS zeroization audit 20191117: these memsets are not security related. + memset(parent_image, 0, sizeof(rbd_linked_image_spec_t)); + memset(parent_snap, 0, sizeof(rbd_snap_spec_t)); + } else { + *parent_image = { + .pool_id = cpp_parent_image.pool_id, + .pool_name = strdup(cpp_parent_image.pool_name.c_str()), + .pool_namespace = strdup(cpp_parent_image.pool_namespace.c_str()), + .image_id = strdup(cpp_parent_image.image_id.c_str()), + .image_name = strdup(cpp_parent_image.image_name.c_str()), + .trash = cpp_parent_image.trash}; + *parent_snap = { + .id = cpp_parent_snap.id, + .namespace_type = cpp_parent_snap.namespace_type, + .name = strdup(cpp_parent_snap.name.c_str())}; + } + + tracepoint(librbd, get_parent_info_exit, r, + parent_image->pool_name, + parent_image->image_name, + parent_image->image_id, + parent_snap->name); + return r; +} + +extern "C" int rbd_get_flags(rbd_image_t image, uint64_t *flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_flags_enter, ictx); + int r = librbd::get_flags(ictx, flags); + tracepoint(librbd, get_flags_exit, ictx, r, *flags); + return r; +} + +extern "C" int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info, + size_t group_info_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, image_get_group_enter, ictx->name.c_str()); + + if (group_info_size != sizeof(rbd_group_info_t)) { + tracepoint(librbd, image_get_group_exit, -ERANGE); + return -ERANGE; + } + + librbd::group_info_t cpp_group_info; + int r = librbd::api::Group<>::image_get_group(ictx, &cpp_group_info); + if (r >= 0) { + group_info_cpp_to_c(cpp_group_info, group_info); + } else { + group_info->name = NULL; + } + + tracepoint(librbd, image_get_group_exit, r); + return r; +} + +extern "C" int rbd_set_image_notification(rbd_image_t image, int fd, int type) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + 
tracepoint(librbd, set_image_notification_enter, ictx, fd, type); + int r = librbd::set_image_notification(ictx, fd, type); + tracepoint(librbd, set_image_notification_exit, ictx, r); + return r; +} + +extern "C" int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, is_exclusive_lock_owner_enter, ictx); + bool owner; + int r = librbd::is_exclusive_lock_owner(ictx, &owner); + *is_owner = owner ? 1 : 0; + tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner); + return r; +} + +extern "C" int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, lock_acquire_enter, ictx, lock_mode); + int r = librbd::lock_acquire(ictx, lock_mode); + tracepoint(librbd, lock_acquire_exit, ictx, r); + return r; +} + +extern "C" int rbd_lock_release(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, lock_release_enter, ictx); + int r = librbd::lock_release(ictx); + tracepoint(librbd, lock_release_exit, ictx, r); + return r; +} + +extern "C" int rbd_lock_get_owners(rbd_image_t image, + rbd_lock_mode_t *lock_mode, + char **lock_owners, + size_t *max_lock_owners) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, lock_get_owners_enter, ictx); + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(lock_owners, 0, sizeof(*lock_owners) * *max_lock_owners); + std::list<std::string> lock_owner_list; + int r = librbd::lock_get_owners(ictx, lock_mode, &lock_owner_list); + if (r >= 0) { + if (*max_lock_owners >= lock_owner_list.size()) { + *max_lock_owners = 0; + for (auto &lock_owner : lock_owner_list) { + lock_owners[(*max_lock_owners)++] = strdup(lock_owner.c_str()); + } + } else { + *max_lock_owners = lock_owner_list.size(); + r = -ERANGE; + } + } + tracepoint(librbd, lock_get_owners_exit, ictx, r); + return r; +} + +extern "C" void rbd_lock_get_owners_cleanup(char **lock_owners, + size_t lock_owner_count) +{ + for (size_t i = 0; i < lock_owner_count; ++i) { + free(lock_owners[i]); + } +} + +extern "C" int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode, + const char *lock_owner) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner); + int r = librbd::lock_break(ictx, lock_mode, lock_owner); + tracepoint(librbd, lock_break_exit, ictx, r); + return r; +} + +extern "C" int rbd_rebuild_object_map(rbd_image_t image, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + librbd::CProgressContext prog_ctx(cb, cbdata); + return ictx->operations->rebuild_object_map(prog_ctx); +} + +/* snapshots */ +extern "C" int rbd_snap_create(rbd_image_t image, const char *snap_name) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = ictx->operations->snap_create(cls::rbd::UserSnapshotNamespace(), + snap_name); + tracepoint(librbd, snap_create_exit, r); + return r; +} + +extern "C" int rbd_snap_rename(rbd_image_t image, const char *srcname, const char *dstname) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(), 
ictx->snap_name.c_str(), ictx->read_only, srcname, dstname); + int r = ictx->operations->snap_rename(srcname, dstname); + tracepoint(librbd, snap_rename_exit, r); + return r; +} + +extern "C" int rbd_snap_remove(rbd_image_t image, const char *snap_name) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_remove_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::snap_remove(ictx, snap_name, 0, prog_ctx); + tracepoint(librbd, snap_remove_exit, r); + return r; +} + +extern "C" int rbd_snap_remove2(rbd_image_t image, const char *snap_name, uint32_t flags, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_remove2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name, flags); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = librbd::snap_remove(ictx, snap_name, flags, prog_ctx); + tracepoint(librbd, snap_remove_exit, r); + return r; +} + +extern "C" int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Snapshot<>::remove(ictx, snap_id); +} + +extern "C" int rbd_snap_rollback(rbd_image_t image, const char *snap_name) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx); + tracepoint(librbd, snap_rollback_exit, r); + return r; +} + +extern "C" int rbd_snap_rollback_with_progress(rbd_image_t image, + const char *snap_name, + librbd_progress_fn_t cb, + void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), 
ictx->snap_name.c_str(), ictx->read_only, snap_name); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx); + tracepoint(librbd, snap_rollback_exit, r); + return r; +} + +extern "C" int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps, + int *max_snaps) +{ + vector<librbd::snap_info_t> cpp_snaps; + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_list_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snaps); + + if (!max_snaps) { + tracepoint(librbd, snap_list_exit, -EINVAL, 0); + return -EINVAL; + } + // FIPS zeroization audit 20191117: this memset is not security related. + memset(snaps, 0, sizeof(*snaps) * *max_snaps); + + int r = librbd::snap_list(ictx, cpp_snaps); + if (r == -ENOENT) { + tracepoint(librbd, snap_list_exit, 0, *max_snaps); + return 0; + } + if (r < 0) { + tracepoint(librbd, snap_list_exit, r, *max_snaps); + return r; + } + if (*max_snaps < (int)cpp_snaps.size() + 1) { + *max_snaps = (int)cpp_snaps.size() + 1; + tracepoint(librbd, snap_list_exit, -ERANGE, *max_snaps); + return -ERANGE; + } + + int i; + + for (i = 0; i < (int)cpp_snaps.size(); i++) { + snaps[i].id = cpp_snaps[i].id; + snaps[i].size = cpp_snaps[i].size; + snaps[i].name = strdup(cpp_snaps[i].name.c_str()); + if (!snaps[i].name) { + for (int j = 0; j < i; j++) + free((void *)snaps[j].name); + tracepoint(librbd, snap_list_exit, -ENOMEM, *max_snaps); + return -ENOMEM; + } + tracepoint(librbd, snap_list_entry, snaps[i].id, snaps[i].size, snaps[i].name); + } + snaps[i].id = 0; + snaps[i].size = 0; + snaps[i].name = NULL; + + r = (int)cpp_snaps.size(); + tracepoint(librbd, snap_list_exit, r, *max_snaps); + return r; +} + +extern "C" void rbd_snap_list_end(rbd_snap_info_t *snaps) +{ + tracepoint(librbd, snap_list_end_enter, snaps); + while (snaps->name) { + free((void *)snaps->name); + snaps++; + } + tracepoint(librbd, 
snap_list_end_exit); +} + +extern "C" int rbd_snap_protect(rbd_image_t image, const char *snap_name) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_protect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = ictx->operations->snap_protect(cls::rbd::UserSnapshotNamespace(), snap_name); + tracepoint(librbd, snap_protect_exit, r); + return r; +} + +extern "C" int rbd_snap_unprotect(rbd_image_t image, const char *snap_name) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_unprotect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name); + tracepoint(librbd, snap_unprotect_exit, r); + return r; +} + +extern "C" int rbd_snap_is_protected(rbd_image_t image, const char *snap_name, + int *is_protected) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_is_protected_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + bool protected_snap; + int r = librbd::snap_is_protected(ictx, snap_name, &protected_snap); + if (r < 0) { + tracepoint(librbd, snap_is_protected_exit, r, *is_protected ? 1 : 0); + return r; + } + *is_protected = protected_snap ? 1 : 0; + tracepoint(librbd, snap_is_protected_exit, 0, *is_protected ? 
1 : 0); + return 0; +} + +extern "C" int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_get_limit_enter, ictx, ictx->name.c_str()); + int r = librbd::snap_get_limit(ictx, limit); + tracepoint(librbd, snap_get_limit_exit, r, *limit); + return r; +} + +extern "C" int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_get_timestamp_enter, ictx, ictx->name.c_str()); + int r = librbd::snap_get_timestamp(ictx, snap_id, timestamp); + tracepoint(librbd, snap_get_timestamp_exit, r); + return r; +} + +extern "C" int rbd_snap_set_limit(rbd_image_t image, uint64_t limit) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_set_limit_enter, ictx, ictx->name.c_str(), limit); + int r = librbd::snap_set_limit(ictx, limit); + tracepoint(librbd, snap_set_limit_exit, r); + return r; +} + +extern "C" int rbd_snap_set(rbd_image_t image, const char *snap_name) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_set_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + int r = librbd::api::Image<>::snap_set( + ictx, cls::rbd::UserSnapshotNamespace(), snap_name); + tracepoint(librbd, snap_set_exit, r); + return r; +} + +extern "C" int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Image<>::snap_set(ictx, snap_id); +} + +extern "C" ssize_t rbd_list_children(rbd_image_t image, char *pools, + size_t *pools_len, char *images, + size_t *images_len) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + std::vector<librbd::linked_image_spec_t> cpp_images; + int r = 
librbd::api::Image<>::list_children(ictx, &cpp_images); + if (r < 0) { + tracepoint(librbd, list_children_exit, r); + return r; + } + + std::set<std::pair<std::string, std::string>> image_set; + for (auto& image : cpp_images) { + if (!image.trash) { + image_set.insert({image.pool_name, image.image_name}); + } + } + + size_t pools_total = 0; + size_t images_total = 0; + for (auto it : image_set) { + pools_total += it.first.length() + 1; + images_total += it.second.length() + 1; + } + + bool too_short = false; + if (pools_total > *pools_len) + too_short = true; + if (images_total > *images_len) + too_short = true; + *pools_len = pools_total; + *images_len = images_total; + if (too_short) { + tracepoint(librbd, list_children_exit, -ERANGE); + return -ERANGE; + } + + char *pools_p = pools; + char *images_p = images; + for (auto it : image_set) { + const char* pool = it.first.c_str(); + strcpy(pools_p, pool); + pools_p += it.first.length() + 1; + const char* image = it.second.c_str(); + strcpy(images_p, image); + images_p += it.second.length() + 1; + tracepoint(librbd, list_children_entry, pool, image); + } + + ssize_t ret = image_set.size(); + tracepoint(librbd, list_children_exit, ret); + return ret; +} + +extern "C" int rbd_list_children2(rbd_image_t image, + rbd_child_info_t *children, + int *max_children) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(children, 0, sizeof(*children) * *max_children); + + if (!max_children) { + tracepoint(librbd, list_children_exit, -EINVAL); + return -EINVAL; + } + + std::vector<librbd::linked_image_spec_t> cpp_children; + int r = librbd::api::Image<>::list_children(ictx, &cpp_children); + if (r < 0) { + tracepoint(librbd, list_children_exit, r); + return r; + } + + if (*max_children < (int)cpp_children.size() + 1) { + *max_children = (int)cpp_children.size() + 1; + tracepoint(librbd, list_children_exit, *max_children); + return -ERANGE; + } + + int i; + for (i = 0; i < (int)cpp_children.size(); i++) { + children[i].pool_name = strdup(cpp_children[i].pool_name.c_str()); + children[i].image_name = strdup(cpp_children[i].image_name.c_str()); + children[i].image_id = strdup(cpp_children[i].image_id.c_str()); + children[i].trash = cpp_children[i].trash; + tracepoint(librbd, list_children_entry, children[i].pool_name, + children[i].image_name); + } + children[i].pool_name = NULL; + children[i].image_name = NULL; + children[i].image_id = NULL; + + r = (int)cpp_children.size(); + tracepoint(librbd, list_children_exit, *max_children); + return r; +} + +extern "C" void rbd_list_child_cleanup(rbd_child_info_t *child) +{ + free((void *)child->pool_name); + free((void *)child->image_name); + free((void *)child->image_id); +} + +extern "C" void rbd_list_children_cleanup(rbd_child_info_t *children, + size_t num_children) +{ + for (size_t i=0; i < num_children; i++) { + free((void *)children[i].pool_name); + free((void *)children[i].image_name); + free((void *)children[i].image_id); + } +} + +extern "C" int rbd_list_children3(rbd_image_t image, + rbd_linked_image_spec_t *images, + size_t *max_images) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(images, 0, sizeof(*images) * *max_images); + + std::vector<librbd::linked_image_spec_t> cpp_children; + int r = librbd::api::Image<>::list_children(ictx, &cpp_children); + if (r < 0) { + tracepoint(librbd, list_children_exit, r); + return r; + } + + if (*max_images < cpp_children.size()) { + *max_images = cpp_children.size(); + return -ERANGE; + } + + *max_images = cpp_children.size(); + for (size_t idx = 0; idx < cpp_children.size(); ++idx) { + images[idx] = { + .pool_id = cpp_children[idx].pool_id, + .pool_name = strdup(cpp_children[idx].pool_name.c_str()), + .pool_namespace = strdup(cpp_children[idx].pool_namespace.c_str()), + .image_id = strdup(cpp_children[idx].image_id.c_str()), + .image_name = strdup(cpp_children[idx].image_name.c_str()), + .trash = cpp_children[idx].trash}; + tracepoint(librbd, list_children_entry, images[idx].pool_name, + images[idx].image_name); + } + return 0; +} + +extern "C" int rbd_list_descendants(rbd_image_t image, + rbd_linked_image_spec_t *images, + size_t *max_images) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(images, 0, sizeof(*images) * *max_images); + + std::vector<librbd::linked_image_spec_t> cpp_children; + int r = librbd::api::Image<>::list_descendants(ictx, {}, &cpp_children); + if (r < 0) { + return r; + } + + if (*max_images < cpp_children.size()) { + *max_images = cpp_children.size(); + return -ERANGE; + } + + *max_images = cpp_children.size(); + for (size_t idx = 0; idx < cpp_children.size(); ++idx) { + images[idx] = { + .pool_id = cpp_children[idx].pool_id, + .pool_name = strdup(cpp_children[idx].pool_name.c_str()), + .pool_namespace = strdup(cpp_children[idx].pool_namespace.c_str()), + .image_id = strdup(cpp_children[idx].image_id.c_str()), + .image_name = strdup(cpp_children[idx].image_name.c_str()), + .trash = cpp_children[idx].trash}; + } + return 0; +} + +extern "C" ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive, + char *tag, size_t *tag_len, + char *clients, size_t *clients_len, + char *cookies, size_t *cookies_len, + char *addrs, size_t *addrs_len) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, list_lockers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + std::list<librbd::locker_t> lockers; + bool exclusive_bool; + string tag_str; + + int r = list_lockers(ictx, &lockers, &exclusive_bool, &tag_str); + if (r < 0) { + tracepoint(librbd, list_lockers_exit, r); + return r; + } + + ldout(ictx->cct, 20) << "list_lockers r = " << r << " lockers.size() = " << lockers.size() << dendl; + + *exclusive = (int)exclusive_bool; + size_t clients_total = 0; + size_t cookies_total = 0; + size_t addrs_total = 0; + for (list<librbd::locker_t>::const_iterator it = lockers.begin(); + it != lockers.end(); ++it) { + clients_total += it->client.length() + 1; + cookies_total += it->cookie.length() + 1; + addrs_total += it->address.length() + 1; + } + + bool too_short = ((clients_total > *clients_len) || + (cookies_total > *cookies_len) || + (addrs_total > *addrs_len) || + (tag_str.length() + 1 > 
*tag_len)); + *clients_len = clients_total; + *cookies_len = cookies_total; + *addrs_len = addrs_total; + *tag_len = tag_str.length() + 1; + if (too_short) { + tracepoint(librbd, list_lockers_exit, -ERANGE); + return -ERANGE; + } + + strcpy(tag, tag_str.c_str()); + char *clients_p = clients; + char *cookies_p = cookies; + char *addrs_p = addrs; + for (list<librbd::locker_t>::const_iterator it = lockers.begin(); + it != lockers.end(); ++it) { + const char* client = it->client.c_str(); + strcpy(clients_p, client); + clients_p += it->client.length() + 1; + const char* cookie = it->cookie.c_str(); + strcpy(cookies_p, cookie); + cookies_p += it->cookie.length() + 1; + const char* address = it->address.c_str(); + strcpy(addrs_p, address); + addrs_p += it->address.length() + 1; + tracepoint(librbd, list_lockers_entry, client, cookie, address); + } + + ssize_t ret = lockers.size(); + tracepoint(librbd, list_lockers_exit, ret); + return ret; +} + +extern "C" int rbd_lock_exclusive(rbd_image_t image, const char *cookie) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, lock_exclusive_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie); + int r = librbd::lock(ictx, true, cookie ? cookie : "", ""); + tracepoint(librbd, lock_exclusive_exit, r); + return r; +} + +extern "C" int rbd_lock_shared(rbd_image_t image, const char *cookie, + const char *tag) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, lock_shared_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie, tag); + int r = librbd::lock(ictx, false, cookie ? cookie : "", tag ? 
tag : ""); + tracepoint(librbd, lock_shared_exit, r); + return r; +} + +extern "C" int rbd_unlock(rbd_image_t image, const char *cookie) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, unlock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie); + int r = librbd::unlock(ictx, cookie ? cookie : ""); + tracepoint(librbd, unlock_exit, r); + return r; +} + +extern "C" int rbd_break_lock(rbd_image_t image, const char *client, + const char *cookie) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, break_lock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, client, cookie); + int r = librbd::break_lock(ictx, client, cookie ? cookie : ""); + tracepoint(librbd, break_lock_exit, r); + return r; +} + +/* I/O */ +extern "C" ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len, + char *buf) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len); + int r = ictx->io_work_queue->read(ofs, len, librbd::io::ReadResult{buf, len}, + 0); + tracepoint(librbd, read_exit, r); + return r; +} + +extern "C" ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len, + char *buf, int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, read2_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, ofs, len, op_flags); + int r = ictx->io_work_queue->read(ofs, len, librbd::io::ReadResult{buf, len}, + op_flags); + tracepoint(librbd, read_exit, r); + return r; +} + + +extern "C" int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, read_iterate_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len); + int64_t r = 
librbd::read_iterate(ictx, ofs, len, cb, arg); + tracepoint(librbd, read_iterate_exit, r); + return r; +} + +extern "C" int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, read_iterate2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len); + int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg); + if (r > 0) + r = 0; + tracepoint(librbd, read_iterate2_exit, r); + return (int)r; +} + +extern "C" int rbd_diff_iterate(rbd_image_t image, + const char *fromsnapname, + uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, int, void *), + void *arg) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len, + true, false); + int r = librbd::api::DiffIterate<>::diff_iterate(ictx, + cls::rbd::UserSnapshotNamespace(), + fromsnapname, ofs, len, + true, false, cb, arg); + tracepoint(librbd, diff_iterate_exit, r); + return r; +} + +extern "C" int rbd_diff_iterate2(rbd_image_t image, const char *fromsnapname, + uint64_t ofs, uint64_t len, + uint8_t include_parent, uint8_t whole_object, + int (*cb)(uint64_t, size_t, int, void *), + void *arg) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len, + include_parent != 0, whole_object != 0); + int r = librbd::api::DiffIterate<>::diff_iterate(ictx, + cls::rbd::UserSnapshotNamespace(), + fromsnapname, ofs, len, + include_parent, whole_object, + cb, arg); + tracepoint(librbd, diff_iterate_exit, r); + return r; +} + +extern "C" ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + 
tracepoint(librbd, write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len, buf); + + bufferlist bl; + bl.push_back(create_write_raw(ictx, buf, len)); + int r = ictx->io_work_queue->write(ofs, len, std::move(bl), 0); + tracepoint(librbd, write_exit, r); + return r; +} + +extern "C" ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf, int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, write2_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, ofs, len, buf, op_flags); + + bufferlist bl; + bl.push_back(create_write_raw(ictx, buf, len)); + int r = ictx->io_work_queue->write(ofs, len, std::move(bl), op_flags); + tracepoint(librbd, write_exit, r); + return r; +} + + +extern "C" int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, discard_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, ofs, len); + if (len > std::numeric_limits<int>::max()) { + tracepoint(librbd, discard_exit, -EINVAL); + return -EINVAL; + } + + int r = ictx->io_work_queue->discard( + ofs, len, ictx->discard_granularity_bytes); + tracepoint(librbd, discard_exit, r); + return r; +} + +extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf, size_t data_len, int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, ofs, len, data_len == 0 ? 
NULL : buf, data_len, op_flags); + + if (data_len == 0 || len % data_len || + len > std::numeric_limits<int>::max()) { + tracepoint(librbd, writesame_exit, -EINVAL); + return -EINVAL; + } + + bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same"); + if (discard_zero && mem_is_zero(buf, data_len)) { + int r = ictx->io_work_queue->write_zeroes(ofs, len, 0, op_flags); + tracepoint(librbd, writesame_exit, r); + return r; + } + + bufferlist bl; + bl.push_back(create_write_raw(ictx, buf, data_len)); + int r = ictx->io_work_queue->writesame(ofs, len, std::move(bl), op_flags); + tracepoint(librbd, writesame_exit, r); + return r; +} + +extern "C" ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, size_t len, + int zero_flags, int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return ictx->io_work_queue->write_zeroes(ofs, len, zero_flags, op_flags); +} + +extern "C" ssize_t rbd_compare_and_write(rbd_image_t image, + uint64_t ofs, size_t len, + const char *cmp_buf, + const char *buf, + uint64_t *mismatch_off, + int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, ofs, + len, cmp_buf, buf, op_flags); + + bufferlist cmp_bl; + cmp_bl.push_back(create_write_raw(ictx, cmp_buf, len)); + bufferlist bl; + bl.push_back(create_write_raw(ictx, buf, len)); + + int r = ictx->io_work_queue->compare_and_write(ofs, len, std::move(cmp_bl), + std::move(bl), mismatch_off, + op_flags); + tracepoint(librbd, compare_and_write_exit, r); + return r; +} + +extern "C" int rbd_aio_create_completion(void *cb_arg, + rbd_callback_t complete_cb, + rbd_completion_t *c) +{ + librbd::RBD::AioCompletion *rbd_comp = + new librbd::RBD::AioCompletion(cb_arg, complete_cb); + *c = (rbd_completion_t) rbd_comp; + return 0; +} + +extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len, + const char *buf, 
rbd_completion_t c) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, buf, comp->pc); + + bufferlist bl; + bl.push_back(create_write_raw(ictx, buf, len)); + ictx->io_work_queue->aio_write(get_aio_completion(comp), off, len, + std::move(bl), 0); + tracepoint(librbd, aio_write_exit, 0); + return 0; +} + +extern "C" int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len, + const char *buf, rbd_completion_t c, int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, off, len, buf, comp->pc, op_flags); + + bufferlist bl; + bl.push_back(create_write_raw(ictx, buf, len)); + ictx->io_work_queue->aio_write(get_aio_completion(comp), off, len, + std::move(bl), op_flags); + tracepoint(librbd, aio_write_exit, 0); + return 0; +} + +extern "C" int rbd_aio_writev(rbd_image_t image, const struct iovec *iov, + int iovcnt, uint64_t off, rbd_completion_t c) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + + // convert the scatter list into a bufferlist + ssize_t len = 0; + bufferlist bl; + for (int i = 0; i < iovcnt; ++i) { + const struct iovec &io = iov[i]; + len += io.iov_len; + if (len < 0) { + break; + } + + bl.push_back(create_write_raw(ictx, static_cast<char*>(io.iov_base), + io.iov_len)); + } + + int r = 0; + if (iovcnt <= 0 || len < 0) { + r = -EINVAL; + } + + tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, off, len, NULL, + comp->pc); + if (r == 0) { + ictx->io_work_queue->aio_write(get_aio_completion(comp), off, len, + std::move(bl), 0); + } + 
tracepoint(librbd, aio_write_exit, r); + return r; +} + +extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, + rbd_completion_t c) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, comp->pc); + ictx->io_work_queue->aio_discard( + get_aio_completion(comp), off, len, ictx->discard_granularity_bytes); + tracepoint(librbd, aio_discard_exit, 0); + return 0; +} + +extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len, + char *buf, rbd_completion_t c) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, buf, comp->pc); + ictx->io_work_queue->aio_read(get_aio_completion(comp), off, len, + librbd::io::ReadResult{buf, len}, 0); + tracepoint(librbd, aio_read_exit, 0); + return 0; +} + +extern "C" int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len, + char *buf, rbd_completion_t c, int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, off, len, buf, comp->pc, op_flags); + ictx->io_work_queue->aio_read(get_aio_completion(comp), off, len, + librbd::io::ReadResult{buf, len},op_flags); + tracepoint(librbd, aio_read_exit, 0); + return 0; +} + +extern "C" int rbd_aio_readv(rbd_image_t image, const struct iovec *iov, + int iovcnt, uint64_t off, rbd_completion_t c) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + + ssize_t len = 0; + for (int i = 0; i < iovcnt; ++i) { + len += 
iov[i].iov_len; + if (len < 0) { + break; + } + } + + int r = 0; + if (iovcnt == 0 || len < 0) { + r = -EINVAL; + } + + tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, off, len, NULL, + comp->pc); + if (r == 0) { + librbd::io::ReadResult read_result; + if (iovcnt == 1) { + read_result = librbd::io::ReadResult( + static_cast<char *>(iov[0].iov_base), iov[0].iov_len); + } else { + read_result = librbd::io::ReadResult(iov, iovcnt); + } + ictx->io_work_queue->aio_read(get_aio_completion(comp), off, len, + std::move(read_result), 0); + } + tracepoint(librbd, aio_read_exit, r); + return r; +} + +extern "C" int rbd_flush(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = ictx->io_work_queue->flush(); + tracepoint(librbd, flush_exit, r); + return r; +} + +extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc); + ictx->io_work_queue->aio_flush(get_aio_completion(comp)); + tracepoint(librbd, aio_flush_exit, 0); + return 0; +} + +extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len, + const char *buf, size_t data_len, rbd_completion_t c, + int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, off, len, data_len == 0 ? 
NULL : buf, data_len, comp->pc, + op_flags); + + if (data_len == 0 || len % data_len) { + tracepoint(librbd, aio_writesame_exit, -EINVAL); + return -EINVAL; + } + + bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same"); + if (discard_zero && mem_is_zero(buf, data_len)) { + ictx->io_work_queue->aio_write_zeroes(get_aio_completion(comp), off, len, 0, + op_flags, true); + tracepoint(librbd, aio_writesame_exit, 0); + return 0; + } + + bufferlist bl; + bl.push_back(create_write_raw(ictx, buf, data_len)); + ictx->io_work_queue->aio_writesame(get_aio_completion(comp), off, len, + std::move(bl), op_flags); + tracepoint(librbd, aio_writesame_exit, 0); + return 0; +} + +extern "C" int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, size_t len, + rbd_completion_t c, int zero_flags, + int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + + ictx->io_work_queue->aio_write_zeroes( + get_aio_completion(comp), off, len, zero_flags, op_flags, true); + return 0; +} + +extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off, + size_t len, const char *cmp_buf, + const char *buf, rbd_completion_t c, + uint64_t *mismatch_off, + int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), + ictx->read_only, off, len, cmp_buf, buf, comp->pc, op_flags); + + bufferlist cmp_bl; + cmp_bl.push_back(create_write_raw(ictx, cmp_buf, len)); + bufferlist bl; + bl.push_back(create_write_raw(ictx, buf, len)); + ictx->io_work_queue->aio_compare_and_write(get_aio_completion(comp), off, len, + std::move(cmp_bl), std::move(bl), + mismatch_off, op_flags, false); + + tracepoint(librbd, aio_compare_and_write_exit, 0); + return 0; +} + +extern "C" int rbd_invalidate_cache(rbd_image_t 
image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, invalidate_cache_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::invalidate_cache(ictx); + tracepoint(librbd, invalidate_cache_exit, r); + return r; +} + +extern "C" int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::io::AioCompletion *cs[numcomp]; + tracepoint(librbd, poll_io_events_enter, ictx, numcomp); + int r = librbd::poll_io_events(ictx, cs, numcomp); + tracepoint(librbd, poll_io_events_exit, r); + if (r > 0) { + for (int i = 0; i < r; ++i) + comps[i] = cs[i]->rbd_comp; + } + return r; +} + +extern "C" int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *vallen) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + string val_s; + tracepoint(librbd, metadata_get_enter, ictx, key); + int r = librbd::metadata_get(ictx, key, &val_s); + if (r < 0) { + tracepoint(librbd, metadata_get_exit, r, key, NULL); + return r; + } + if (*vallen < val_s.size() + 1) { + r = -ERANGE; + *vallen = val_s.size() + 1; + tracepoint(librbd, metadata_get_exit, r, key, NULL); + } else { + strncpy(value, val_s.c_str(), val_s.size() + 1); + tracepoint(librbd, metadata_get_exit, r, key, value); + } + return r; +} + +extern "C" int rbd_metadata_set(rbd_image_t image, const char *key, const char *value) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, metadata_set_enter, ictx, key, value); + int r = ictx->operations->metadata_set(key, value); + tracepoint(librbd, metadata_set_exit, r); + return r; +} + +extern "C" int rbd_metadata_remove(rbd_image_t image, const char *key) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, metadata_remove_enter, ictx, key); + int r = ictx->operations->metadata_remove(key); + tracepoint(librbd, metadata_remove_exit, r); + return r; +} + +extern "C" 
int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max, + char *key, size_t *key_len, char *value, size_t *val_len) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, metadata_list_enter, ictx); + map<string, bufferlist> pairs; + int r = librbd::metadata_list(ictx, start, max, &pairs); + size_t key_total_len = 0, val_total_len = 0; + bool too_short = false; + for (map<string, bufferlist>::iterator it = pairs.begin(); + it != pairs.end(); ++it) { + key_total_len += it->first.size() + 1; + val_total_len += it->second.length() + 1; + } + if (*key_len < key_total_len || *val_len < val_total_len) + too_short = true; + *key_len = key_total_len; + *val_len = val_total_len; + if (too_short) { + tracepoint(librbd, metadata_list_exit, -ERANGE); + return -ERANGE; + } + + char *key_p = key, *value_p = value; + + for (map<string, bufferlist>::iterator it = pairs.begin(); + it != pairs.end(); ++it) { + strncpy(key_p, it->first.c_str(), it->first.size() + 1); + key_p += it->first.size() + 1; + strncpy(value_p, it->second.c_str(), it->second.length()); + value_p += it->second.length(); + *value_p = '\0'; + value_p++; + tracepoint(librbd, metadata_list_entry, it->first.c_str(), it->second.c_str()); + } + tracepoint(librbd, metadata_list_exit, r); + return r; +} + +extern "C" int rbd_mirror_image_enable(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Mirror<>::image_enable(ictx, false); +} + +extern "C" int rbd_mirror_image_disable(rbd_image_t image, bool force) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Mirror<>::image_disable(ictx, force); +} + +extern "C" int rbd_mirror_image_promote(rbd_image_t image, bool force) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Mirror<>::image_promote(ictx, force); +} + +extern "C" int rbd_mirror_image_demote(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + 
return librbd::api::Mirror<>::image_demote(ictx); +} + +extern "C" int rbd_mirror_image_resync(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Mirror<>::image_resync(ictx); +} + +extern "C" int rbd_mirror_image_get_info(rbd_image_t image, + rbd_mirror_image_info_t *mirror_image_info, + size_t info_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + + if (sizeof(rbd_mirror_image_info_t) != info_size) { + return -ERANGE; + } + + librbd::mirror_image_info_t cpp_mirror_image; + int r = librbd::api::Mirror<>::image_get_info(ictx, &cpp_mirror_image); + if (r < 0) { + return r; + } + + mirror_image_info_cpp_to_c(cpp_mirror_image, mirror_image_info); + return 0; +} + +extern "C" int rbd_mirror_image_get_status(rbd_image_t image, + rbd_mirror_image_status_t *status, + size_t status_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + + if (sizeof(rbd_mirror_image_status_t) != status_size) { + return -ERANGE; + } + + librbd::mirror_image_status_t cpp_status; + int r = librbd::api::Mirror<>::image_get_status(ictx, &cpp_status); + if (r < 0) { + return r; + } + + mirror_image_status_cpp_to_c(cpp_status, status); + return 0; +} + +extern "C" int rbd_mirror_image_get_instance_id(rbd_image_t image, + char *instance_id, + size_t *instance_id_max_length) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + + std::string cpp_instance_id; + int r = librbd::api::Mirror<>::image_get_instance_id(ictx, &cpp_instance_id); + if (r < 0) { + return r; + } + + if (cpp_instance_id.size() >= *instance_id_max_length) { + *instance_id_max_length = cpp_instance_id.size() + 1; + return -ERANGE; + } + + strcpy(instance_id, cpp_instance_id.c_str()); + *instance_id_max_length = cpp_instance_id.size() + 1; + return 0; +} + +extern "C" int rbd_aio_mirror_image_promote(rbd_image_t image, bool force, + rbd_completion_t c) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = 
(librbd::RBD::AioCompletion *)c; + librbd::api::Mirror<>::image_promote( + ictx, force, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC, + get_aio_completion(comp))); + return 0; +} + +extern "C" int rbd_aio_mirror_image_demote(rbd_image_t image, + rbd_completion_t c) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + librbd::api::Mirror<>::image_demote( + ictx, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC, + get_aio_completion(comp))); + return 0; +} + +extern "C" int rbd_aio_mirror_image_get_info(rbd_image_t image, + rbd_mirror_image_info_t *info, + size_t info_size, + rbd_completion_t c) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + + if (sizeof(rbd_mirror_image_info_t) != info_size) { + return -ERANGE; + } + + auto ctx = new C_MirrorImageGetInfo( + info, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC, + get_aio_completion(comp))); + librbd::api::Mirror<>::image_get_info( + ictx, &ctx->cpp_mirror_image_info, ctx); + return 0; +} + +extern "C" int rbd_aio_mirror_image_get_status(rbd_image_t image, + rbd_mirror_image_status_t *status, + size_t status_size, + rbd_completion_t c) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + + if (sizeof(rbd_mirror_image_status_t) != status_size) { + return -ERANGE; + } + + auto ctx = new C_MirrorImageGetStatus( + status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC, + get_aio_completion(comp))); + librbd::api::Mirror<>::image_get_status(ictx, &ctx->cpp_mirror_image_status, + ctx); + return 0; +} + +extern "C" int rbd_update_watch(rbd_image_t image, uint64_t *handle, + rbd_update_callback_t watch_cb, void *arg) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + C_UpdateWatchCB *wctx = new C_UpdateWatchCB(watch_cb, arg); + tracepoint(librbd, 
update_watch_enter, ictx, wctx); + int r = ictx->state->register_update_watcher(wctx, &wctx->handle); + tracepoint(librbd, update_watch_exit, r, wctx->handle); + *handle = reinterpret_cast<uint64_t>(wctx); + return r; +} + +extern "C" int rbd_update_unwatch(rbd_image_t image, uint64_t handle) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + C_UpdateWatchCB *wctx = reinterpret_cast<C_UpdateWatchCB *>(handle); + tracepoint(librbd, update_unwatch_enter, ictx, wctx->handle); + int r = ictx->state->unregister_update_watcher(wctx->handle); + delete wctx; + tracepoint(librbd, update_unwatch_exit, r); + return r; +} + +extern "C" int rbd_aio_is_complete(rbd_completion_t c) +{ + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + return comp->is_complete(); +} + +extern "C" int rbd_aio_wait_for_complete(rbd_completion_t c) +{ + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + return comp->wait_for_complete(); +} + +extern "C" ssize_t rbd_aio_get_return_value(rbd_completion_t c) +{ + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + return comp->get_return_value(); +} + +extern "C" void *rbd_aio_get_arg(rbd_completion_t c) +{ + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + return comp->get_arg(); +} + +extern "C" void rbd_aio_release(rbd_completion_t c) +{ + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + comp->release(); +} + +extern "C" int rbd_group_create(rados_ioctx_t p, const char *name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, group_create_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + int r = librbd::api::Group<>::create(io_ctx, name); + tracepoint(librbd, group_create_exit, r); + return r; +} + +extern "C" int rbd_group_remove(rados_ioctx_t p, const char *name) +{ + librados::IoCtx io_ctx; + 
librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, group_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + int r = librbd::api::Group<>::remove(io_ctx, name); + tracepoint(librbd, group_remove_exit, r); + return r; +} + +extern "C" int rbd_group_list(rados_ioctx_t p, char *names, size_t *size) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, group_list_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id()); + + vector<string> cpp_names; + int r = librbd::api::Group<>::list(io_ctx, &cpp_names); + + if (r < 0) { + tracepoint(librbd, group_list_exit, r); + return r; + } + + size_t expected_size = 0; + + for (size_t i = 0; i < cpp_names.size(); i++) { + expected_size += cpp_names[i].size() + 1; + } + if (*size < expected_size) { + *size = expected_size; + tracepoint(librbd, group_list_exit, -ERANGE); + return -ERANGE; + } + + if (names == NULL) { + tracepoint(librbd, group_list_exit, -EINVAL); + return -EINVAL; + } + + for (int i = 0; i < (int)cpp_names.size(); i++) { + const char* name = cpp_names[i].c_str(); + tracepoint(librbd, group_list_entry, name); + strcpy(names, name); + names += strlen(names) + 1; + } + tracepoint(librbd, group_list_exit, (int)expected_size); + return (int)expected_size; +} + +extern "C" int rbd_group_rename(rados_ioctx_t p, const char *src_name, + const char *dest_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, group_rename_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), src_name, dest_name); + int r = librbd::api::Group<>::rename(io_ctx, src_name, dest_name); + tracepoint(librbd, group_rename_exit, r); + return r; +} + +extern "C" int rbd_group_image_add(rados_ioctx_t 
group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx image_ioctx; + + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_image_add_enter, group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_name); + + int r = librbd::api::Group<>::image_add(group_ioctx, group_name, image_ioctx, + image_name); + + tracepoint(librbd, group_image_add_exit, r); + return r; +} + +extern "C" int rbd_group_image_remove(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx image_ioctx; + + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_image_remove_enter, group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_name); + + int r = librbd::api::Group<>::image_remove(group_ioctx, group_name, + image_ioctx, image_name); + + tracepoint(librbd, group_image_remove_exit, r); + return r; +} + +extern "C" int rbd_group_image_remove_by_id(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_id) +{ + librados::IoCtx group_ioctx; + librados::IoCtx image_ioctx; + + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_image_remove_by_id_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), 
group_name, + image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_id); + + int r = librbd::api::Group<>::image_remove_by_id(group_ioctx, group_name, + image_ioctx, image_id); + + tracepoint(librbd, group_image_remove_by_id_exit, r); + return r; +} + +extern "C" int rbd_group_image_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t *image_size) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_image_list_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(images, 0, sizeof(*images) * *image_size); + + if (group_image_info_size != sizeof(rbd_group_image_info_t)) { + *image_size = 0; + tracepoint(librbd, group_image_list_exit, -ERANGE); + return -ERANGE; + } + + std::vector<librbd::group_image_info_t> cpp_images; + int r = librbd::api::Group<>::image_list(group_ioctx, group_name, + &cpp_images); + + if (r == -ENOENT) { + tracepoint(librbd, group_image_list_exit, 0); + *image_size = 0; + return 0; + } + + if (r < 0) { + tracepoint(librbd, group_image_list_exit, r); + return r; + } + + if (*image_size < cpp_images.size()) { + *image_size = cpp_images.size(); + tracepoint(librbd, group_image_list_exit, -ERANGE); + return -ERANGE; + } + + for (size_t i = 0; i < cpp_images.size(); ++i) { + group_image_status_cpp_to_c(cpp_images[i], &images[i]); + } + + r = *image_size = cpp_images.size(); + tracepoint(librbd, group_image_list_exit, r); + return r; +} + +extern "C" int rbd_group_info_cleanup(rbd_group_info_t *group_info, + size_t group_info_size) { + if (group_info_size != sizeof(rbd_group_info_t)) { + return -ERANGE; + } + + free(group_info->name); + return 0; +} + +extern "C" int 
rbd_group_image_list_cleanup(rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t len) { + if (group_image_info_size != sizeof(rbd_group_image_info_t)) { + return -ERANGE; + } + + for (size_t i = 0; i < len; ++i) { + free(images[i].name); + } + return 0; +} + +extern "C" int rbd_group_snap_create(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_create_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + int r = librbd::api::Group<>::snap_create(group_ioctx, group_name, snap_name); + + tracepoint(librbd, group_snap_create_exit, r); + + return r; +} + +extern "C" int rbd_group_snap_remove(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_remove_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + int r = librbd::api::Group<>::snap_remove(group_ioctx, group_name, snap_name); + + tracepoint(librbd, group_snap_remove_exit, r); + + return r; +} + +extern "C" int rbd_group_snap_rename(rados_ioctx_t group_p, + const char *group_name, + const char *old_snap_name, + const char *new_snap_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rename_enter, + group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(), + group_name, old_snap_name, new_snap_name); + + int r = librbd::api::Group<>::snap_rename(group_ioctx, group_name, + old_snap_name, new_snap_name); + + 
tracepoint(librbd, group_snap_list_exit, r); + return r; +} + +extern "C" int rbd_group_snap_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t *snaps_size) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_list_enter, group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(snaps, 0, sizeof(*snaps) * *snaps_size); + + if (group_snap_info_size != sizeof(rbd_group_snap_info_t)) { + *snaps_size = 0; + tracepoint(librbd, group_snap_list_exit, -ERANGE); + return -ERANGE; + } + + std::vector<librbd::group_snap_info_t> cpp_snaps; + int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &cpp_snaps); + + if (r == -ENOENT) { + *snaps_size = 0; + tracepoint(librbd, group_snap_list_exit, 0); + return 0; + } + + if (r < 0) { + tracepoint(librbd, group_snap_list_exit, r); + return r; + } + + if (*snaps_size < cpp_snaps.size()) { + *snaps_size = cpp_snaps.size(); + tracepoint(librbd, group_snap_list_exit, -ERANGE); + return -ERANGE; + } + + for (size_t i = 0; i < cpp_snaps.size(); ++i) { + group_snap_info_cpp_to_c(cpp_snaps[i], &snaps[i]); + } + + r = *snaps_size = cpp_snaps.size(); + tracepoint(librbd, group_snap_list_exit, r); + return r; +} + +extern "C" int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t len) { + if (group_snap_info_size != sizeof(rbd_group_snap_info_t)) { + return -ERANGE; + } + + for (size_t i = 0; i < len; ++i) { + free(snaps[i].name); + } + return 0; +} + +extern "C" int rbd_group_snap_rollback(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + 
TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rollback_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name, + snap_name, prog_ctx); + + tracepoint(librbd, group_snap_rollback_exit, r); + + return r; +} + +extern "C" int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name, + librbd_progress_fn_t cb, + void *cbdata) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rollback_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name, + snap_name, prog_ctx); + + tracepoint(librbd, group_snap_rollback_exit, r); + + return r; +} + +extern "C" int rbd_snap_get_namespace_type(rbd_image_t image, + uint64_t snap_id, + rbd_snap_namespace_type_t *namespace_type) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_get_namespace_type_enter, ictx, ictx->name.c_str()); + int r = librbd::api::Snapshot<>::get_namespace_type(ictx, snap_id, + namespace_type); + tracepoint(librbd, snap_get_namespace_type_exit, r); + return r; +} + +extern "C" int rbd_snap_get_group_namespace(rbd_image_t image, uint64_t snap_id, + rbd_snap_group_namespace_t *group_snap, + size_t snap_group_namespace_size) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_get_group_namespace_enter, ictx, + ictx->name.c_str()); + + if (snap_group_namespace_size != sizeof(rbd_snap_group_namespace_t)) { + tracepoint(librbd, snap_get_group_namespace_exit, -ERANGE); + return -ERANGE; + } + + 
librbd::snap_group_namespace_t group_namespace; + int r = librbd::api::Snapshot<>::get_group_namespace(ictx, snap_id, + &group_namespace); + if (r >= 0) { + group_snap->group_pool = group_namespace.group_pool; + group_snap->group_name = strdup(group_namespace.group_name.c_str()); + group_snap->group_snap_name = + strdup(group_namespace.group_snap_name.c_str()); + } + + tracepoint(librbd, snap_get_group_namespace_exit, r); + return r; +} + +extern "C" int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap, + size_t snap_group_namespace_size) { + if (snap_group_namespace_size != sizeof(rbd_snap_group_namespace_t)) { + tracepoint(librbd, snap_get_group_namespace_exit, -ERANGE); + return -ERANGE; + } + + free(group_snap->group_name); + free(group_snap->group_snap_name); + return 0; +} + +extern "C" int rbd_snap_get_trash_namespace(rbd_image_t image, uint64_t snap_id, + char *original_name, + size_t max_length) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + + std::string cpp_original_name; + int r = librbd::api::Snapshot<>::get_trash_namespace(ictx, snap_id, + &cpp_original_name); + if (r < 0) { + return r; + } + + if (cpp_original_name.length() >= max_length) { + return -ERANGE; + } + + strcpy(original_name, cpp_original_name.c_str()); + return 0; +} +extern "C" int rbd_watchers_list(rbd_image_t image, + rbd_image_watcher_t *watchers, + size_t *max_watchers) { + std::list<librbd::image_watcher_t> watcher_list; + librbd::ImageCtx *ictx = (librbd::ImageCtx*)image; + + tracepoint(librbd, list_watchers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(watchers, 0, sizeof(*watchers) * *max_watchers); + int r = librbd::list_watchers(ictx, watcher_list); + if (r < 0) { + tracepoint(librbd, list_watchers_exit, r, 0); + return r; + } + + if (watcher_list.size() > *max_watchers) { + *max_watchers = watcher_list.size(); + tracepoint(librbd, list_watchers_exit, -ERANGE, watcher_list.size()); + return -ERANGE; + } + + *max_watchers = 0; + for (auto &watcher : watcher_list) { + tracepoint(librbd, list_watchers_entry, watcher.addr.c_str(), watcher.id, watcher.cookie); + watchers[*max_watchers].addr = strdup(watcher.addr.c_str()); + watchers[*max_watchers].id = watcher.id; + watchers[*max_watchers].cookie = watcher.cookie; + *max_watchers += 1; + } + + tracepoint(librbd, list_watchers_exit, r, watcher_list.size()); + return 0; +} + +extern "C" void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers, + size_t num_watchers) { + for (size_t i = 0; i < num_watchers; ++i) { + free(watchers[i].addr); + } +} + +extern "C" int rbd_config_image_list(rbd_image_t image, + rbd_config_option_t *options, + int *max_options) { + librbd::ImageCtx *ictx = (librbd::ImageCtx*)image; + + std::vector<librbd::config_option_t> option_vector; + int r = librbd::api::Config<>::list(ictx, &option_vector); + if (r < 0) { + return r; + } + + if (*max_options < static_cast<int>(option_vector.size())) { + *max_options = static_cast<int>(option_vector.size()); + return -ERANGE; + } + + for (int i = 0; i < static_cast<int>(option_vector.size()); ++i) { + config_option_cpp_to_c(option_vector[i], &options[i]); + } + *max_options = static_cast<int>(option_vector.size()); + return 0; +} + +extern "C" void rbd_config_image_list_cleanup(rbd_config_option_t *options, + int max_options) { + for (int i = 0; i < max_options; ++i) { + config_option_cleanup(options[i]); + } +} diff --git a/src/librbd/managed_lock/AcquireRequest.cc b/src/librbd/managed_lock/AcquireRequest.cc new file mode 100644 index 00000000..9b3e5e87 --- /dev/null +++ 
b/src/librbd/managed_lock/AcquireRequest.cc @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/managed_lock/AcquireRequest.h" +#include "librbd/Watcher.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "include/stringify.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/managed_lock/BreakRequest.h" +#include "librbd/managed_lock/GetLockerRequest.h" +#include "librbd/managed_lock/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::AcquireRequest: " << this \ + << " " << __func__ << ": " + +using std::string; + +namespace librbd { + +using librbd::util::detail::C_AsyncCallback; +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +namespace managed_lock { + +template <typename I> +AcquireRequest<I>* AcquireRequest<I>::create(librados::IoCtx& ioctx, + Watcher *watcher, + ContextWQ *work_queue, + const string& oid, + const string& cookie, + bool exclusive, + bool blacklist_on_break_lock, + uint32_t blacklist_expire_seconds, + Context *on_finish) { + return new AcquireRequest(ioctx, watcher, work_queue, oid, cookie, + exclusive, blacklist_on_break_lock, + blacklist_expire_seconds, on_finish); +} + +template <typename I> +AcquireRequest<I>::AcquireRequest(librados::IoCtx& ioctx, Watcher *watcher, + ContextWQ *work_queue, const string& oid, + const string& cookie, bool exclusive, + bool blacklist_on_break_lock, + uint32_t blacklist_expire_seconds, + Context *on_finish) + : m_ioctx(ioctx), m_watcher(watcher), + m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())), + m_work_queue(work_queue), m_oid(oid), m_cookie(cookie), + m_exclusive(exclusive), + m_blacklist_on_break_lock(blacklist_on_break_lock), + 
m_blacklist_expire_seconds(blacklist_expire_seconds), + m_on_finish(new C_AsyncCallback<ContextWQ>(work_queue, on_finish)) { +} + +template <typename I> +AcquireRequest<I>::~AcquireRequest() { +} + +template <typename I> +void AcquireRequest<I>::send() { + send_get_locker(); +} + +template <typename I> +void AcquireRequest<I>::send_get_locker() { + ldout(m_cct, 10) << dendl; + + Context *ctx = create_context_callback< + AcquireRequest<I>, &AcquireRequest<I>::handle_get_locker>(this); + auto req = GetLockerRequest<I>::create(m_ioctx, m_oid, m_exclusive, + &m_locker, ctx); + req->send(); +} + +template <typename I> +void AcquireRequest<I>::handle_get_locker(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -ENOENT) { + ldout(m_cct, 20) << "no lockers detected" << dendl; + m_locker = {}; + } else if (r == -EBUSY) { + ldout(m_cct, 5) << "incompatible lock detected" << dendl; + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_lock(); +} + +template <typename I> +void AcquireRequest<I>::send_lock() { + ldout(m_cct, 10) << "entity=client." << m_ioctx.get_instance_id() << ", " + << "cookie=" << m_cookie << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::lock(&op, RBD_LOCK_NAME, + m_exclusive ? 
LOCK_EXCLUSIVE : LOCK_SHARED, m_cookie, + util::get_watcher_lock_tag(), "", utime_t(), 0); + + using klass = AcquireRequest; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_lock>(this); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void AcquireRequest<I>::handle_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == 0) { + finish(0); + return; + } else if (r == -EBUSY && m_locker.cookie.empty()) { + ldout(m_cct, 5) << "already locked, refreshing locker" << dendl; + send_get_locker(); + return; + } else if (r != -EBUSY) { + lderr(m_cct) << "failed to lock: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_break_lock(); +} + +template <typename I> +void AcquireRequest<I>::send_break_lock() { + ldout(m_cct, 10) << dendl; + + Context *ctx = create_context_callback< + AcquireRequest<I>, &AcquireRequest<I>::handle_break_lock>(this); + auto req = BreakRequest<I>::create( + m_ioctx, m_work_queue, m_oid, m_locker, m_exclusive, + m_blacklist_on_break_lock, m_blacklist_expire_seconds, false, ctx); + req->send(); +} + +template <typename I> +void AcquireRequest<I>::handle_break_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -EAGAIN) { + ldout(m_cct, 5) << "lock owner is still alive" << dendl; + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to break lock : " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + m_locker = {}; + send_lock(); +} + +template <typename I> +void AcquireRequest<I>::finish(int r) { + m_on_finish->complete(r); + delete this; +} + +} // namespace managed_lock +} // namespace librbd + +template class librbd::managed_lock::AcquireRequest<librbd::ImageCtx>; diff --git a/src/librbd/managed_lock/AcquireRequest.h b/src/librbd/managed_lock/AcquireRequest.h new file mode 100644 index 00000000..20af0693 --- /dev/null +++ 
b/src/librbd/managed_lock/AcquireRequest.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H +#define CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "include/int_types.h" +#include "include/buffer.h" +#include "msg/msg_types.h" +#include "librbd/managed_lock/Types.h" +#include "librbd/watcher/Types.h" +#include <string> + +class Context; +class ContextWQ; + +namespace librbd { + +class Watcher; + +namespace managed_lock { + +template <typename ImageCtxT> +class AcquireRequest { +private: + typedef watcher::Traits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Watcher Watcher; + +public: + static AcquireRequest* create(librados::IoCtx& ioctx, Watcher *watcher, + ContextWQ *work_queue, const std::string& oid, + const std::string& cookie, + bool exclusive, + bool blacklist_on_break_lock, + uint32_t blacklist_expire_seconds, + Context *on_finish); + + ~AcquireRequest(); + void send(); + +private: + + /** + * @verbatim + * + * <start> + * | + * v + * GET_LOCKER + * | ^ + * | . (EBUSY && no cached locker) + * | . + * | . (EBUSY && cached locker) + * \--> LOCK_IMAGE * * * * * * * * > BREAK_LOCK . . . . . + * | ^ | . + * | | | (success) . + * | \-------------------------/ . + * v . + * <finish> < . . . . . . . . . . . . . . . . . . . 
+ * + * @endverbatim + */ + + AcquireRequest(librados::IoCtx& ioctx, Watcher *watcher, + ContextWQ *work_queue, const std::string& oid, + const std::string& cookie, bool exclusive, + bool blacklist_on_break_lock, + uint32_t blacklist_expire_seconds, Context *on_finish); + + librados::IoCtx& m_ioctx; + Watcher *m_watcher; + CephContext *m_cct; + ContextWQ *m_work_queue; + std::string m_oid; + std::string m_cookie; + bool m_exclusive; + bool m_blacklist_on_break_lock; + uint32_t m_blacklist_expire_seconds; + Context *m_on_finish; + + bufferlist m_out_bl; + + Locker m_locker; + + void send_get_locker(); + void handle_get_locker(int r); + + void send_lock(); + void handle_lock(int r); + + void send_break_lock(); + void handle_break_lock(int r); + + void finish(int r); +}; + +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H diff --git a/src/librbd/managed_lock/BreakRequest.cc b/src/librbd/managed_lock/BreakRequest.cc new file mode 100644 index 00000000..8caaea6f --- /dev/null +++ b/src/librbd/managed_lock/BreakRequest.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/managed_lock/BreakRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "include/stringify.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/managed_lock/GetLockerRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::BreakRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace managed_lock { + +using util::create_context_callback; +using util::create_rados_callback; + +namespace { + +struct C_BlacklistClient : public Context { + librados::IoCtx &ioctx; + std::string locker_address; + uint32_t expire_seconds; + 
Context *on_finish; + + C_BlacklistClient(librados::IoCtx &ioctx, const std::string &locker_address, + uint32_t expire_seconds, Context *on_finish) + : ioctx(ioctx), locker_address(locker_address), + expire_seconds(expire_seconds), on_finish(on_finish) { + } + + void finish(int r) override { + librados::Rados rados(ioctx); + r = rados.blacklist_add(locker_address, expire_seconds); + on_finish->complete(r); + } +}; + +} // anonymous namespace + +template <typename I> +BreakRequest<I>::BreakRequest(librados::IoCtx& ioctx, ContextWQ *work_queue, + const std::string& oid, const Locker &locker, + bool exclusive, bool blacklist_locker, + uint32_t blacklist_expire_seconds, + bool force_break_lock, Context *on_finish) + : m_ioctx(ioctx), m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())), + m_work_queue(work_queue), m_oid(oid), m_locker(locker), + m_exclusive(exclusive), m_blacklist_locker(blacklist_locker), + m_blacklist_expire_seconds(blacklist_expire_seconds), + m_force_break_lock(force_break_lock), m_on_finish(on_finish) { +} + +template <typename I> +void BreakRequest<I>::send() { + send_get_watchers(); +} + +template <typename I> +void BreakRequest<I>::send_get_watchers() { + ldout(m_cct, 10) << dendl; + + librados::ObjectReadOperation op; + op.list_watchers(&m_watchers, &m_watchers_ret_val); + + using klass = BreakRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_get_watchers>(this); + m_out_bl.clear(); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void BreakRequest<I>::handle_get_watchers(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == 0) { + r = m_watchers_ret_val; + } + if (r < 0) { + lderr(m_cct) << "failed to retrieve watchers: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + bool found_alive_locker = false; + for (auto &watcher : m_watchers) { + ldout(m_cct, 20) << 
"watcher=[" + << "addr=" << watcher.addr << ", " + << "entity=client." << watcher.watcher_id << "]" << dendl; + + if ((strncmp(m_locker.address.c_str(), + watcher.addr, sizeof(watcher.addr)) == 0) && + (m_locker.handle == watcher.cookie)) { + ldout(m_cct, 10) << "lock owner is still alive" << dendl; + found_alive_locker = true; + } + } + + if (!m_force_break_lock && found_alive_locker) { + finish(-EAGAIN); + return; + } + + send_get_locker(); +} + +template <typename I> +void BreakRequest<I>::send_get_locker() { + ldout(m_cct, 10) << dendl; + + using klass = BreakRequest<I>; + Context *ctx = create_context_callback<klass, &klass::handle_get_locker>( + this); + auto req = GetLockerRequest<I>::create(m_ioctx, m_oid, m_exclusive, + &m_refreshed_locker, ctx); + req->send(); +} + +template <typename I> +void BreakRequest<I>::handle_get_locker(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -ENOENT) { + ldout(m_cct, 5) << "no lock owner" << dendl; + finish(0); + return; + } else if (r < 0 && r != -EBUSY) { + lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl; + finish(r); + return; + } else if (r < 0) { + m_refreshed_locker = {}; + } + + if (m_refreshed_locker != m_locker || m_refreshed_locker == Locker{}) { + ldout(m_cct, 5) << "no longer lock owner" << dendl; + finish(-EAGAIN); + return; + } + + send_blacklist(); +} + +template <typename I> +void BreakRequest<I>::send_blacklist() { + if (!m_blacklist_locker) { + send_break_lock(); + return; + } + + entity_name_t entity_name = entity_name_t::CLIENT(m_ioctx.get_instance_id()); + ldout(m_cct, 10) << "local entity=" << entity_name << ", " + << "locker entity=" << m_locker.entity << dendl; + + if (m_locker.entity == entity_name) { + lderr(m_cct) << "attempting to self-blacklist" << dendl; + finish(-EINVAL); + return; + } + + // TODO: need async version of RadosClient::blacklist_add + using klass = BreakRequest<I>; + Context *ctx = create_context_callback<klass, 
&klass::handle_blacklist>( + this); + m_work_queue->queue(new C_BlacklistClient(m_ioctx, m_locker.address, + m_blacklist_expire_seconds, ctx), + 0); +} + +template <typename I> +void BreakRequest<I>::handle_blacklist(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to blacklist lock owner: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + send_break_lock(); +} + +template <typename I> +void BreakRequest<I>::send_break_lock() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::break_lock(&op, RBD_LOCK_NAME, m_locker.cookie, + m_locker.entity); + + using klass = BreakRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_break_lock>(this); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void BreakRequest<I>::handle_break_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to break lock: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void BreakRequest<I>::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace managed_lock +} // namespace librbd + +template class librbd::managed_lock::BreakRequest<librbd::ImageCtx>; diff --git a/src/librbd/managed_lock/BreakRequest.h b/src/librbd/managed_lock/BreakRequest.h new file mode 100644 index 00000000..4531a3c5 --- /dev/null +++ b/src/librbd/managed_lock/BreakRequest.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H +#define CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/rados/librados_fwd.hpp" +#include 
"msg/msg_types.h" +#include <list> +#include <string> +#include <boost/optional.hpp> +#include "librbd/managed_lock/Types.h" + +class Context; +class ContextWQ; +class obj_watch_t; + +namespace librbd { + +class ImageCtx; +template <typename> class Journal; + +namespace managed_lock { + +template <typename ImageCtxT = ImageCtx> +class BreakRequest { +public: + static BreakRequest* create(librados::IoCtx& ioctx, ContextWQ *work_queue, + const std::string& oid, const Locker &locker, + bool exclusive, bool blacklist_locker, + uint32_t blacklist_expire_seconds, + bool force_break_lock, Context *on_finish) { + return new BreakRequest(ioctx, work_queue, oid, locker, exclusive, + blacklist_locker, blacklist_expire_seconds, + force_break_lock, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_WATCHERS + * | + * v + * GET_LOCKER + * | + * v + * BLACKLIST (skip if disabled) + * | + * v + * BREAK_LOCK + * | + * v + * <finish> + * + * @endvertbatim + */ + + librados::IoCtx &m_ioctx; + CephContext *m_cct; + ContextWQ *m_work_queue; + std::string m_oid; + Locker m_locker; + bool m_exclusive; + bool m_blacklist_locker; + uint32_t m_blacklist_expire_seconds; + bool m_force_break_lock; + Context *m_on_finish; + + bufferlist m_out_bl; + + std::list<obj_watch_t> m_watchers; + int m_watchers_ret_val; + + Locker m_refreshed_locker; + + BreakRequest(librados::IoCtx& ioctx, ContextWQ *work_queue, + const std::string& oid, const Locker &locker, + bool exclusive, bool blacklist_locker, + uint32_t blacklist_expire_seconds, bool force_break_lock, + Context *on_finish); + + void send_get_watchers(); + void handle_get_watchers(int r); + + void send_get_locker(); + void handle_get_locker(int r); + + void send_blacklist(); + void handle_blacklist(int r); + + void send_break_lock(); + void handle_break_lock(int r); + + void finish(int r); + +}; + +} // namespace managed_lock +} // namespace librbd + +extern template class 
librbd::managed_lock::BreakRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H diff --git a/src/librbd/managed_lock/GetLockerRequest.cc b/src/librbd/managed_lock/GetLockerRequest.cc new file mode 100644 index 00000000..fb5bb322 --- /dev/null +++ b/src/librbd/managed_lock/GetLockerRequest.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/managed_lock/GetLockerRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/managed_lock/Types.h" +#include "librbd/managed_lock/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::GetLockerRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace managed_lock { + +using librbd::util::create_rados_callback; + +template <typename I> +GetLockerRequest<I>::GetLockerRequest(librados::IoCtx& ioctx, + const std::string& oid, bool exclusive, + Locker *locker, Context *on_finish) + : m_ioctx(ioctx), m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())), + m_oid(oid), m_exclusive(exclusive), m_locker(locker), + m_on_finish(on_finish) { +} + +template <typename I> +void GetLockerRequest<I>::send() { + send_get_lockers(); +} + +template <typename I> +void GetLockerRequest<I>::send_get_lockers() { + ldout(m_cct, 10) << dendl; + + librados::ObjectReadOperation op; + rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); + + using klass = GetLockerRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_get_lockers>(this); + m_out_bl.clear(); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> 
+void GetLockerRequest<I>::handle_get_lockers(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + std::map<rados::cls::lock::locker_id_t, + rados::cls::lock::locker_info_t> lockers; + ClsLockType lock_type = LOCK_NONE; + std::string lock_tag; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = rados::cls::lock::get_lock_info_finish(&it, &lockers, &lock_type, + &lock_tag); + } + + if (r < 0) { + lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (lockers.empty()) { + ldout(m_cct, 20) << "no lockers detected" << dendl; + finish(-ENOENT); + return; + } + + if (lock_tag != util::get_watcher_lock_tag()) { + ldout(m_cct, 5) <<"locked by external mechanism: tag=" << lock_tag << dendl; + finish(-EBUSY); + return; + } + + if (m_exclusive && lock_type == LOCK_SHARED) { + ldout(m_cct, 5) << "incompatible shared lock type detected" << dendl; + finish(-EBUSY); + return; + } else if (!m_exclusive && lock_type == LOCK_EXCLUSIVE) { + ldout(m_cct, 5) << "incompatible exclusive lock type detected" << dendl; + finish(-EBUSY); + return; + } + + std::map<rados::cls::lock::locker_id_t, + rados::cls::lock::locker_info_t>::iterator iter = lockers.begin(); + if (!util::decode_lock_cookie(iter->first.cookie, &m_locker->handle)) { + ldout(m_cct, 5) << "locked by external mechanism: " + << "cookie=" << iter->first.cookie << dendl; + finish(-EBUSY); + return; + } + + m_locker->entity = iter->first.locker; + m_locker->cookie = iter->first.cookie; + m_locker->address = iter->second.addr.get_legacy_str(); + if (m_locker->cookie.empty() || m_locker->address.empty()) { + ldout(m_cct, 20) << "no valid lockers detected" << dendl; + finish(-ENOENT); + return; + } + + ldout(m_cct, 10) << "retrieved exclusive locker: " + << m_locker->entity << "@" << m_locker->address << dendl; + finish(0); +} + +template <typename I> +void GetLockerRequest<I>::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete 
this; +} + +} // namespace managed_lock +} // namespace librbd + +template class librbd::managed_lock::GetLockerRequest<librbd::ImageCtx>; diff --git a/src/librbd/managed_lock/GetLockerRequest.h b/src/librbd/managed_lock/GetLockerRequest.h new file mode 100644 index 00000000..b8fd08f6 --- /dev/null +++ b/src/librbd/managed_lock/GetLockerRequest.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H +#define CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados_fwd.hpp" +#include <string> + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace managed_lock { + +struct Locker; + +template <typename ImageCtxT = ImageCtx> +class GetLockerRequest { +public: + static GetLockerRequest* create(librados::IoCtx& ioctx, + const std::string& oid, bool exclusive, + Locker *locker, Context *on_finish) { + return new GetLockerRequest(ioctx, oid, exclusive, locker, on_finish); + } + + void send(); + +private: + librados::IoCtx &m_ioctx; + CephContext *m_cct; + std::string m_oid; + bool m_exclusive; + Locker *m_locker; + Context *m_on_finish; + + bufferlist m_out_bl; + + GetLockerRequest(librados::IoCtx& ioctx, const std::string& oid, + bool exclusive, Locker *locker, Context *on_finish); + + void send_get_lockers(); + void handle_get_lockers(int r); + + void finish(int r); + +}; + +} // namespace managed_lock +} // namespace librbd + +extern template class librbd::managed_lock::GetLockerRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H diff --git a/src/librbd/managed_lock/ReacquireRequest.cc b/src/librbd/managed_lock/ReacquireRequest.cc new file mode 100644 index 00000000..dc5624d4 --- /dev/null +++ b/src/librbd/managed_lock/ReacquireRequest.cc @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/managed_lock/ReacquireRequest.h" +#include "librbd/Watcher.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/managed_lock/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::ReacquireRequest: " \ + << this << ": " << __func__ + +using std::string; + +namespace librbd { +namespace managed_lock { + +using librbd::util::create_rados_callback; + +template <typename I> +ReacquireRequest<I>::ReacquireRequest(librados::IoCtx& ioctx, + const string& oid, + const string& old_cookie, + const string &new_cookie, + bool exclusive, + Context *on_finish) + : m_ioctx(ioctx), m_oid(oid), m_old_cookie(old_cookie), + m_new_cookie(new_cookie), m_exclusive(exclusive), m_on_finish(on_finish) { +} + + +template <typename I> +void ReacquireRequest<I>::send() { + set_cookie(); +} + +template <typename I> +void ReacquireRequest<I>::set_cookie() { + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + ldout(cct, 10) << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::set_cookie(&op, RBD_LOCK_NAME, + m_exclusive ? 
LOCK_EXCLUSIVE : LOCK_SHARED, + m_old_cookie, util::get_watcher_lock_tag(), + m_new_cookie); + + librados::AioCompletion *rados_completion = create_rados_callback< + ReacquireRequest, &ReacquireRequest::handle_set_cookie>(this); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void ReacquireRequest<I>::handle_set_cookie(int r) { + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + ldout(cct, 10) << ": r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + ldout(cct, 10) << ": OSD doesn't support updating lock" << dendl; + } else if (r < 0) { + lderr(cct) << ": failed to update lock: " << cpp_strerror(r) << dendl; + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace managed_lock +} // namespace librbd + +template class librbd::managed_lock::ReacquireRequest<librbd::ImageCtx>; diff --git a/src/librbd/managed_lock/ReacquireRequest.h b/src/librbd/managed_lock/ReacquireRequest.h new file mode 100644 index 00000000..3f2b7d7e --- /dev/null +++ b/src/librbd/managed_lock/ReacquireRequest.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H +#define CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "include/int_types.h" +#include <string> + +class Context; + +namespace librbd { + +class Watcher; + +namespace managed_lock { + +template <typename ImageCtxT> +class ReacquireRequest { +public: + + static ReacquireRequest *create(librados::IoCtx& ioctx, + const std::string& oid, + const std::string& old_cookie, + const std::string &new_cookie, + bool exclusive, + Context *on_finish) { + return new ReacquireRequest(ioctx, oid, old_cookie, new_cookie, exclusive, + on_finish); + } + + ReacquireRequest(librados::IoCtx& ioctx, const std::string& oid, + const std::string& old_cookie, + const 
std::string &new_cookie, bool exclusive, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * SET_COOKIE + * | + * v + * <finish> + * + * @endverbatim + */ + librados::IoCtx& m_ioctx; + std::string m_oid; + std::string m_old_cookie; + std::string m_new_cookie; + bool m_exclusive; + Context *m_on_finish; + + void set_cookie(); + void handle_set_cookie(int r); + +}; + +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H diff --git a/src/librbd/managed_lock/ReleaseRequest.cc b/src/librbd/managed_lock/ReleaseRequest.cc new file mode 100644 index 00000000..d74e1462 --- /dev/null +++ b/src/librbd/managed_lock/ReleaseRequest.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/managed_lock/ReleaseRequest.h" +#include "librbd/Watcher.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/Utils.h" + +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::ReleaseRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace managed_lock { + +using util::detail::C_AsyncCallback; +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +ReleaseRequest<I>* ReleaseRequest<I>::create(librados::IoCtx& ioctx, + Watcher *watcher, + ContextWQ *work_queue, + const string& oid, + const string& cookie, + Context *on_finish) { + return new ReleaseRequest(ioctx, watcher, work_queue, oid, cookie, + on_finish); +} + +template <typename I> +ReleaseRequest<I>::ReleaseRequest(librados::IoCtx& ioctx, Watcher *watcher, + ContextWQ *work_queue, const string& oid, + const string& cookie, Context *on_finish) + : m_ioctx(ioctx), m_watcher(watcher), m_oid(oid), 
m_cookie(cookie), + m_on_finish(new C_AsyncCallback<ContextWQ>(work_queue, on_finish)) { +} + +template <typename I> +ReleaseRequest<I>::~ReleaseRequest() { +} + + +template <typename I> +void ReleaseRequest<I>::send() { + send_unlock(); +} + +template <typename I> +void ReleaseRequest<I>::send_unlock() { + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + ldout(cct, 10) << "entity=client." << m_ioctx.get_instance_id() << ", " + << "cookie=" << m_cookie << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::unlock(&op, RBD_LOCK_NAME, m_cookie); + + using klass = ReleaseRequest; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_unlock>(this); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void ReleaseRequest<I>::handle_unlock(int r) { + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to unlock: " << cpp_strerror(r) << dendl; + } + + finish(); +} + +template <typename I> +void ReleaseRequest<I>::finish() { + m_on_finish->complete(0); + delete this; +} + +} // namespace managed_lock +} // namespace librbd + +template class librbd::managed_lock::ReleaseRequest<librbd::ImageCtx>; + diff --git a/src/librbd/managed_lock/ReleaseRequest.h b/src/librbd/managed_lock/ReleaseRequest.h new file mode 100644 index 00000000..89205136 --- /dev/null +++ b/src/librbd/managed_lock/ReleaseRequest.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H +#define CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "librbd/watcher/Types.h" +#include <string> + +class Context; +class ContextWQ; + +namespace librbd { + +class Watcher; + +namespace 
managed_lock { + +template <typename ImageCtxT> +class ReleaseRequest { +private: + typedef watcher::Traits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Watcher Watcher; + +public: + static ReleaseRequest* create(librados::IoCtx& ioctx, Watcher *watcher, + ContextWQ *work_queue, + const std::string& oid, + const std::string& cookie, + Context *on_finish); + + ~ReleaseRequest(); + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * UNLOCK + * | + * v + * <finish> + * + * @endverbatim + */ + + ReleaseRequest(librados::IoCtx& ioctx, Watcher *watcher, + ContextWQ *work_queue, const std::string& oid, + const std::string& cookie, Context *on_finish); + + librados::IoCtx& m_ioctx; + Watcher *m_watcher; + std::string m_oid; + std::string m_cookie; + Context *m_on_finish; + + void send_unlock(); + void handle_unlock(int r); + + void finish(); + +}; + +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H diff --git a/src/librbd/managed_lock/Types.h b/src/librbd/managed_lock/Types.h new file mode 100644 index 00000000..319789c8 --- /dev/null +++ b/src/librbd/managed_lock/Types.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_TYPES_H +#define CEPH_LIBRBD_MANAGED_LOCK_TYPES_H + +#include "msg/msg_types.h" +#include <string> + +namespace librbd { +namespace managed_lock { + +struct Locker { + entity_name_t entity; + std::string cookie; + std::string address; + uint64_t handle = 0; + + Locker() { + } + Locker(const entity_name_t& entity, const std::string &cookie, + const std::string &address, uint64_t handle) + : entity(entity), cookie(cookie), address(address), handle(handle) { + } + + inline bool operator==(const Locker &rhs) const { + return (entity == rhs.entity && + cookie == rhs.cookie && + address == rhs.address && + handle == rhs.handle); + } + inline bool operator!=(const Locker 
&rhs) const { + return !(*this == rhs); + } +}; + +enum Mode { + EXCLUSIVE, + SHARED +}; + + +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_TYPES_H diff --git a/src/librbd/managed_lock/Utils.cc b/src/librbd/managed_lock/Utils.cc new file mode 100644 index 00000000..0b4f908d --- /dev/null +++ b/src/librbd/managed_lock/Utils.cc @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/ceph_assert.h" +#include "librbd/managed_lock/Utils.h" +#include <sstream> + +namespace librbd { +namespace managed_lock { +namespace util { + +namespace { + +const std::string WATCHER_LOCK_COOKIE_PREFIX = "auto"; +const std::string WATCHER_LOCK_TAG("internal"); + +} // anonymous namespace + +const std::string &get_watcher_lock_tag() { + return WATCHER_LOCK_TAG; +} + +bool decode_lock_cookie(const std::string &tag, uint64_t *handle) { + std::string prefix; + std::istringstream ss(tag); + if (!(ss >> prefix >> *handle) || prefix != WATCHER_LOCK_COOKIE_PREFIX) { + return false; + } + return true; +} + +std::string encode_lock_cookie(uint64_t watch_handle) { + ceph_assert(watch_handle != 0); + std::ostringstream ss; + ss << WATCHER_LOCK_COOKIE_PREFIX << " " << watch_handle; + return ss.str(); +} + +} // namespace util +} // namespace managed_lock +} // namespace librbd + + diff --git a/src/librbd/managed_lock/Utils.h b/src/librbd/managed_lock/Utils.h new file mode 100644 index 00000000..679cbfe8 --- /dev/null +++ b/src/librbd/managed_lock/Utils.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_UTILS_H +#define CEPH_LIBRBD_MANAGED_LOCK_UTILS_H + +#include "include/int_types.h" +#include <string> + +namespace librbd { +namespace managed_lock { +namespace util { + +const std::string &get_watcher_lock_tag(); + +bool decode_lock_cookie(const std::string &tag, uint64_t 
*handle); +std::string encode_lock_cookie(uint64_t watch_handle); + +} // namespace util +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_UTILS_H diff --git a/src/librbd/mirror/DemoteRequest.cc b/src/librbd/mirror/DemoteRequest.cc new file mode 100644 index 00000000..c5d38752 --- /dev/null +++ b/src/librbd/mirror/DemoteRequest.cc @@ -0,0 +1,198 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/DemoteRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/mirror/GetInfoRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::DemoteRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using librbd::util::create_context_callback; + +template <typename I> +void DemoteRequest<I>::send() { + get_info(); +} + +template <typename I> +void DemoteRequest<I>::get_info() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_get_info>(this); + auto req = GetInfoRequest<I>::create(m_image_ctx, &m_mirror_image, + &m_promotion_state, ctx); + req->send(); +} + +template <typename I> +void DemoteRequest<I>::handle_get_info(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(cct) << "mirroring is not currently enabled" << dendl; + finish(-EINVAL); + return; + } else if (m_promotion_state != PROMOTION_STATE_PRIMARY) 
{ + lderr(cct) << "image is not primary" << dendl; + finish(-EINVAL); + return; + } + + acquire_lock(); +} + +template <typename I> +void DemoteRequest<I>::acquire_lock() { + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.owner_lock.get_read(); + if (m_image_ctx.exclusive_lock == nullptr) { + m_image_ctx.owner_lock.put_read(); + lderr(cct) << "exclusive lock is not active" << dendl; + finish(-EINVAL); + return; + } + + // avoid accepting new requests from peers while we demote + // the image + m_image_ctx.exclusive_lock->block_requests(0); + m_blocked_requests = true; + + if (m_image_ctx.exclusive_lock->is_lock_owner()) { + m_image_ctx.owner_lock.put_read(); + demote(); + return; + } + + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_acquire_lock>(this); + m_image_ctx.exclusive_lock->acquire_lock(ctx); + m_image_ctx.owner_lock.put_read(); +} + +template <typename I> +void DemoteRequest<I>::handle_acquire_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to lock image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + m_image_ctx.owner_lock.get_read(); + if (m_image_ctx.exclusive_lock != nullptr && + !m_image_ctx.exclusive_lock->is_lock_owner()) { + r = m_image_ctx.exclusive_lock->get_unlocked_op_error(); + m_image_ctx.owner_lock.put_read(); + lderr(cct) << "failed to acquire exclusive lock" << dendl; + finish(r); + return; + } + m_image_ctx.owner_lock.put_read(); + + demote(); +} + +template <typename I> +void DemoteRequest<I>::demote() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_demote>(this); + Journal<I>::demote(&m_image_ctx, ctx); +} + +template <typename I> +void DemoteRequest<I>::handle_demote(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 
0) { + m_ret_val = r; + lderr(cct) << "failed to demote image: " << cpp_strerror(r) << dendl; + } + + release_lock(); +} + +template <typename I> +void DemoteRequest<I>::release_lock() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + m_image_ctx.owner_lock.get_read(); + if (m_image_ctx.exclusive_lock == nullptr) { + m_image_ctx.owner_lock.put_read(); + finish(0); + return; + } + + auto ctx = create_context_callback< + DemoteRequest<I>, &DemoteRequest<I>::handle_release_lock>(this); + m_image_ctx.exclusive_lock->release_lock(ctx); + m_image_ctx.owner_lock.put_read(); +} + +template <typename I> +void DemoteRequest<I>::handle_release_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template <typename I> +void DemoteRequest<I>::finish(int r) { + if (m_ret_val < 0) { + r = m_ret_val; + } + + { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + if (m_blocked_requests && m_image_ctx.exclusive_lock != nullptr) { + m_image_ctx.exclusive_lock->unblock_requests(); + } + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::DemoteRequest<librbd::ImageCtx>; diff --git a/src/librbd/mirror/DemoteRequest.h b/src/librbd/mirror/DemoteRequest.h new file mode 100644 index 00000000..7dc0585e --- /dev/null +++ b/src/librbd/mirror/DemoteRequest.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H + +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/mirror/Types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { + +template <typename ImageCtxT = 
librbd::ImageCtx> +class DemoteRequest { +public: + static DemoteRequest *create(ImageCtxT &image_ctx, Context *on_finish) { + return new DemoteRequest(image_ctx, on_finish); + } + + DemoteRequest(ImageCtxT &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_INFO + * | + * v + * ACQUIRE_LOCK * * * * + * | * + * v * + * DEMOTE * + * | * + * v * + * RELEASE_LOCK * + * | * + * v * + * <finish> < * * * * * + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + int m_ret_val = 0; + bool m_blocked_requests = false; + + cls::rbd::MirrorImage m_mirror_image; + PromotionState m_promotion_state = PROMOTION_STATE_PRIMARY; + + void get_info(); + void handle_get_info(int r); + + void acquire_lock(); + void handle_acquire_lock(int r); + + void demote(); + void handle_demote(int r); + + void release_lock(); + void handle_release_lock(int r); + + void finish(int r); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::DemoteRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H diff --git a/src/librbd/mirror/DisableRequest.cc b/src/librbd/mirror/DisableRequest.cc new file mode 100644 index 00000000..6a21c560 --- /dev/null +++ b/src/librbd/mirror/DisableRequest.cc @@ -0,0 +1,492 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/DisableRequest.h" +#include "common/WorkQueue.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/journal/cls_journal_client.h" +#include "cls/rbd/cls_rbd_client.h" +#include "journal/Journaler.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/MirroringWatcher.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/journal/PromoteRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix 
+#define dout_prefix *_dout << "librbd::mirror::DisableRequest: " + +namespace librbd { +namespace mirror { + +using util::create_rados_callback; + +template <typename I> +DisableRequest<I>::DisableRequest(I *image_ctx, bool force, bool remove, + Context *on_finish) + : m_image_ctx(image_ctx), m_force(force), m_remove(remove), + m_on_finish(on_finish), m_lock("mirror::DisableRequest::m_lock") { +} + +template <typename I> +void DisableRequest<I>::send() { + send_get_mirror_image(); +} + +template <typename I> +void DisableRequest<I>::send_get_mirror_image() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_get_start(&op, m_image_ctx->id); + + using klass = DisableRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_get_mirror_image>(this); + m_out_bl.clear(); + int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *DisableRequest<I>::handle_get_mirror_image(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto iter = m_out_bl.cbegin(); + *result = cls_client::mirror_image_get_finish(&iter, &m_mirror_image); + } + + if (*result < 0) { + if (*result == -ENOENT) { + ldout(cct, 20) << this << " " << __func__ + << ": mirroring is not enabled for this image" << dendl; + *result = 0; + } else if (*result == -EOPNOTSUPP) { + ldout(cct, 5) << this << " " << __func__ + << ": mirroring is not supported by OSD" << dendl; + } else { + lderr(cct) << "failed to retrieve mirror image: " << cpp_strerror(*result) + << dendl; + } + return m_on_finish; + } + + send_get_tag_owner(); + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_get_tag_owner() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << 
this << " " << __func__ << dendl; + + using klass = DisableRequest<I>; + Context *ctx = util::create_context_callback< + klass, &klass::handle_get_tag_owner>(this); + + Journal<I>::is_tag_owner(m_image_ctx, &m_is_primary, ctx); +} + +template <typename I> +Context *DisableRequest<I>::handle_get_tag_owner(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to check tag ownership: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + if (!m_is_primary && !m_force) { + lderr(cct) << "mirrored image is not primary, " + << "add force option to disable mirroring" << dendl; + *result = -EINVAL; + return m_on_finish; + } + + send_set_mirror_image(); + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_set_mirror_image() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING; + + librados::ObjectWriteOperation op; + cls_client::mirror_image_set(&op, m_image_ctx->id, m_mirror_image); + + using klass = DisableRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_set_mirror_image>(this); + m_out_bl.clear(); + int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *DisableRequest<I>::handle_set_mirror_image(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to disable mirroring: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_notify_mirroring_watcher(); + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_notify_mirroring_watcher() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << 
dendl; + + using klass = DisableRequest<I>; + Context *ctx = util::create_context_callback< + klass, &klass::handle_notify_mirroring_watcher>(this); + + MirroringWatcher<I>::notify_image_updated( + m_image_ctx->md_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLING, + m_image_ctx->id, m_mirror_image.global_image_id, ctx); +} + +template <typename I> +Context *DisableRequest<I>::handle_notify_mirroring_watcher(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to send update notification: " + << cpp_strerror(*result) << dendl; + *result = 0; + } + + send_promote_image(); + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_promote_image() { + if (m_is_primary) { + send_get_clients(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // Not primary -- shouldn't have the journal open + ceph_assert(m_image_ctx->journal == nullptr); + + using klass = DisableRequest<I>; + Context *ctx = util::create_context_callback< + klass, &klass::handle_promote_image>(this); + auto req = journal::PromoteRequest<I>::create(m_image_ctx, true, ctx); + req->send(); +} + +template <typename I> +Context *DisableRequest<I>::handle_promote_image(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to promote image: " << cpp_strerror(*result) << dendl; + return m_on_finish; + } + + send_get_clients(); + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_get_clients() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = DisableRequest<I>; + Context *ctx = util::create_context_callback< + klass, &klass::handle_get_clients>(this); + + std::string header_oid = 
::journal::Journaler::header_oid(m_image_ctx->id); + m_clients.clear(); + cls::journal::client::client_list(m_image_ctx->md_ctx, header_oid, &m_clients, + ctx); +} + +template <typename I> +Context *DisableRequest<I>::handle_get_clients(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to get registered clients: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + Mutex::Locker locker(m_lock); + + ceph_assert(m_current_ops.empty()); + + for (auto client : m_clients) { + journal::ClientData client_data; + auto bl_it = client.data.cbegin(); + try { + using ceph::decode; + decode(client_data, bl_it); + } catch (const buffer::error &err) { + lderr(cct) << "failed to decode client data" << dendl; + m_error_result = -EBADMSG; + continue; + } + + journal::ClientMetaType type = client_data.get_client_meta_type(); + if (type != journal::ClientMetaType::MIRROR_PEER_CLIENT_META_TYPE) { + continue; + } + + if (m_current_ops.find(client.id) != m_current_ops.end()) { + // Should not happen. 
+ lderr(cct) << this << " " << __func__ << ": clients with the same id " + << client.id << dendl; + continue; + } + + m_current_ops[client.id] = 0; + m_ret[client.id] = 0; + + journal::MirrorPeerClientMeta client_meta = + boost::get<journal::MirrorPeerClientMeta>(client_data.client_meta); + + for (const auto& sync : client_meta.sync_points) { + send_remove_snap(client.id, sync.snap_namespace, sync.snap_name); + } + + if (m_current_ops[client.id] == 0) { + // no snaps to remove + send_unregister_client(client.id); + } + } + + if (m_current_ops.empty()) { + if (m_error_result < 0) { + *result = m_error_result; + return m_on_finish; + } else if (!m_remove) { + return m_on_finish; + } + + // no mirror clients to unregister + send_remove_mirror_image(); + } + + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_remove_snap(const std::string &client_id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": client_id=" << client_id + << ", snap_name=" << snap_name << dendl; + + ceph_assert(m_lock.is_locked()); + + m_current_ops[client_id]++; + + Context *ctx = create_context_callback( + &DisableRequest<I>::handle_remove_snap, client_id); + + ctx = new FunctionContext([this, snap_namespace, snap_name, ctx](int r) { + m_image_ctx->operations->snap_remove(snap_namespace, + snap_name.c_str(), + ctx); + }); + + m_image_ctx->op_work_queue->queue(ctx, 0); +} + +template <typename I> +Context *DisableRequest<I>::handle_remove_snap(int *result, + const std::string &client_id) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_current_ops[client_id] > 0); + m_current_ops[client_id]--; + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << + "failed to remove temporary snapshot created by remote peer: " + << 
cpp_strerror(*result) << dendl; + m_ret[client_id] = *result; + } + + if (m_current_ops[client_id] == 0) { + send_unregister_client(client_id); + } + + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_unregister_client( + const std::string &client_id) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + ceph_assert(m_lock.is_locked()); + ceph_assert(m_current_ops[client_id] == 0); + + Context *ctx = create_context_callback( + &DisableRequest<I>::handle_unregister_client, client_id); + + if (m_ret[client_id] < 0) { + m_image_ctx->op_work_queue->queue(ctx, m_ret[client_id]); + return; + } + + librados::ObjectWriteOperation op; + cls::journal::client::client_unregister(&op, client_id); + std::string header_oid = ::journal::Journaler::header_oid(m_image_ctx->id); + librados::AioCompletion *comp = create_rados_callback(ctx); + + int r = m_image_ctx->md_ctx.aio_operate(header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *DisableRequest<I>::handle_unregister_client( + int *result, const std::string &client_id) { + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_current_ops[client_id] == 0); + m_current_ops.erase(client_id); + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to unregister remote journal client: " + << cpp_strerror(*result) << dendl; + m_error_result = *result; + } + + if (!m_current_ops.empty()) { + return nullptr; + } + + if (m_error_result < 0) { + *result = m_error_result; + return m_on_finish; + } + + send_get_clients(); + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_remove_mirror_image() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectWriteOperation op; + cls_client::mirror_image_remove(&op, 
m_image_ctx->id); + + using klass = DisableRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_remove_mirror_image>(this); + m_out_bl.clear(); + int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *DisableRequest<I>::handle_remove_mirror_image(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -ENOENT) { + *result = 0; + } + + if (*result < 0) { + lderr(cct) << "failed to remove mirror image: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + ldout(cct, 20) << this << " " << __func__ + << ": removed image state from rbd_mirroring object" << dendl; + + send_notify_mirroring_watcher_removed(); + return nullptr; +} + +template <typename I> +void DisableRequest<I>::send_notify_mirroring_watcher_removed() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = DisableRequest<I>; + Context *ctx = util::create_context_callback< + klass, &klass::handle_notify_mirroring_watcher_removed>(this); + + MirroringWatcher<I>::notify_image_updated( + m_image_ctx->md_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLED, m_image_ctx->id, + m_mirror_image.global_image_id, ctx); +} + +template <typename I> +Context *DisableRequest<I>::handle_notify_mirroring_watcher_removed( + int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to send update notification: " + << cpp_strerror(*result) << dendl; + *result = 0; + } + + return m_on_finish; +} + +template <typename I> +Context *DisableRequest<I>::create_context_callback( + Context*(DisableRequest<I>::*handle)(int*, const std::string &client_id), + const std::string &client_id) { + + return new FunctionContext([this, handle, 
client_id](int r) { + Context *on_finish = (this->*handle)(&r, client_id); + if (on_finish != nullptr) { + on_finish->complete(r); + delete this; + } + }); +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::DisableRequest<librbd::ImageCtx>; diff --git a/src/librbd/mirror/DisableRequest.h b/src/librbd/mirror/DisableRequest.h new file mode 100644 index 00000000..1a3b1223 --- /dev/null +++ b/src/librbd/mirror/DisableRequest.h @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H + +#include "include/buffer.h" +#include "common/Mutex.h" +#include "cls/journal/cls_journal_types.h" +#include "cls/rbd/cls_rbd_types.h" +#include <map> +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace mirror { + +template <typename ImageCtxT = ImageCtx> +class DisableRequest { +public: + static DisableRequest *create(ImageCtxT *image_ctx, bool force, + bool remove, Context *on_finish) { + return new DisableRequest(image_ctx, force, remove, on_finish); + } + + DisableRequest(ImageCtxT *image_ctx, bool force, bool remove, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_MIRROR_IMAGE * * * * * * * * * * * * * * * * * * * * * * * + * | * + * v * + * GET_TAG_OWNER * * * * * * * * * * * * * * * * * * * * * * * * + * | * + * v * + * SET_MIRROR_IMAGE * * * * * * * * * * * * * * * * * * * * * * * + * | * + * v * + * NOTIFY_MIRRORING_WATCHER * + * | * + * v * + * PROMOTE_IMAGE (skip if primary) * + * | * + * v * + * GET_CLIENTS <----------------------------------------\ * * * * + * | | (unregister clients) | * (on error) + * | |/----------------------------\ | * + * | | | | * + * | | /-----------\ (repeat | (repeat | (repeat + * | | | | as needed) | as needed) | as needed) + * | v v | | | * + * | REMOVE_SYNC_SNAP 
--/ * * * * * * | * * * * * * | * * * * + * | | | | * + * | v | | * + * | UNREGISTER_CLIENT ---------------/-------------/ * * * * + * | * + * | (no more clients * + * | to unregister) * + * v * + * REMOVE_MIRROR_IMAGE * * * * * * * * * * * * * * * * * * * * * + * | (skip if no remove) * + * v * + * NOTIFY_MIRRORING_WATCHER_REMOVED * + * | (skip if not primary or no remove) * + * v * + * <finish> < * * * * * * * * * * * * * * * * * * * * * * * * * * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + bool m_force; + bool m_remove; + Context *m_on_finish; + + bool m_is_primary = false; + bufferlist m_out_bl; + cls::rbd::MirrorImage m_mirror_image; + std::set<cls::journal::Client> m_clients; + std::map<std::string, int> m_ret; + std::map<std::string, int> m_current_ops; + int m_error_result = 0; + mutable Mutex m_lock; + + void send_get_mirror_image(); + Context *handle_get_mirror_image(int *result); + + void send_get_tag_owner(); + Context *handle_get_tag_owner(int *result); + + void send_set_mirror_image(); + Context *handle_set_mirror_image(int *result); + + void send_notify_mirroring_watcher(); + Context *handle_notify_mirroring_watcher(int *result); + + void send_promote_image(); + Context *handle_promote_image(int *result); + + void send_get_clients(); + Context *handle_get_clients(int *result); + + void send_remove_snap(const std::string &client_id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name); + Context *handle_remove_snap(int *result, const std::string &client_id); + + void send_unregister_client(const std::string &client_id); + Context *handle_unregister_client(int *result, const std::string &client_id); + + void send_remove_mirror_image(); + Context *handle_remove_mirror_image(int *result); + + void send_notify_mirroring_watcher_removed(); + Context *handle_notify_mirroring_watcher_removed(int *result); + + Context *create_context_callback( + Context*(DisableRequest<ImageCtxT>::*handle)( + int*, const std::string 
&client_id), + const std::string &client_id); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::DisableRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H diff --git a/src/librbd/mirror/EnableRequest.cc b/src/librbd/mirror/EnableRequest.cc new file mode 100644 index 00000000..a5c5b125 --- /dev/null +++ b/src/librbd/mirror/EnableRequest.cc @@ -0,0 +1,190 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/EnableRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/MirroringWatcher.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::EnableRequest: " + +namespace librbd { +namespace mirror { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +EnableRequest<I>::EnableRequest(librados::IoCtx &io_ctx, + const std::string &image_id, + const std::string &non_primary_global_image_id, + ContextWQ *op_work_queue, Context *on_finish) + : m_io_ctx(io_ctx), m_image_id(image_id), + m_non_primary_global_image_id(non_primary_global_image_id), + m_op_work_queue(op_work_queue), m_on_finish(on_finish), + m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())) { +} + +template <typename I> +void EnableRequest<I>::send() { + send_get_mirror_image(); +} + +template <typename I> +void EnableRequest<I>::send_get_mirror_image() { + ldout(m_cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_get_start(&op, m_image_id); + + using klass = EnableRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_get_mirror_image>(this); + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, 
&m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *EnableRequest<I>::handle_get_mirror_image(int *result) { + ldout(m_cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto iter = m_out_bl.cbegin(); + *result = cls_client::mirror_image_get_finish(&iter, &m_mirror_image); + } + + if (*result == 0) { + if (m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + ldout(m_cct, 10) << this << " " << __func__ + << ": mirroring is already enabled" << dendl; + } else { + lderr(m_cct) << "currently disabling" << dendl; + *result = -EINVAL; + } + return m_on_finish; + } + + if (*result != -ENOENT) { + lderr(m_cct) << "failed to retrieve mirror image: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + *result = 0; + m_mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_ENABLED; + if (m_non_primary_global_image_id.empty()) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + m_mirror_image.global_image_id = uuid_gen.to_string(); + } else { + m_mirror_image.global_image_id = m_non_primary_global_image_id; + } + + send_get_tag_owner(); + return nullptr; +} + +template <typename I> +void EnableRequest<I>::send_get_tag_owner() { + if (!m_non_primary_global_image_id.empty()) { + send_set_mirror_image(); + return; + } + ldout(m_cct, 10) << this << " " << __func__ << dendl; + + using klass = EnableRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_get_tag_owner>(this); + librbd::Journal<>::is_tag_owner(m_io_ctx, m_image_id, &m_is_primary, + m_op_work_queue, ctx); +} + +template <typename I> +Context *EnableRequest<I>::handle_get_tag_owner(int *result) { + ldout(m_cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to check tag ownership: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + if (!m_is_primary) { + lderr(m_cct) << "last journal tag not owned by local 
cluster" << dendl; + *result = -EINVAL; + return m_on_finish; + } + + send_set_mirror_image(); + return nullptr; +} + +template <typename I> +void EnableRequest<I>::send_set_mirror_image() { + ldout(m_cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectWriteOperation op; + cls_client::mirror_image_set(&op, m_image_id, m_mirror_image); + + using klass = EnableRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_set_mirror_image>(this); + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *EnableRequest<I>::handle_set_mirror_image(int *result) { + ldout(m_cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_notify_mirroring_watcher(); + return nullptr; +} + +template <typename I> +void EnableRequest<I>::send_notify_mirroring_watcher() { + ldout(m_cct, 10) << this << " " << __func__ << dendl; + + using klass = EnableRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_notify_mirroring_watcher>(this); + + MirroringWatcher<>::notify_image_updated(m_io_ctx, + cls::rbd::MIRROR_IMAGE_STATE_ENABLED, + m_image_id, + m_mirror_image.global_image_id, ctx); +} + +template <typename I> +Context *EnableRequest<I>::handle_notify_mirroring_watcher(int *result) { + ldout(m_cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to send update notification: " + << cpp_strerror(*result) << dendl; + *result = 0; + } + + return m_on_finish; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::EnableRequest<librbd::ImageCtx>; diff --git a/src/librbd/mirror/EnableRequest.h b/src/librbd/mirror/EnableRequest.h new file mode 100644 index 00000000..965c2a36 --- 
/dev/null +++ b/src/librbd/mirror/EnableRequest.h @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H + +#include "include/buffer_fwd.h" +#include "include/rados/librados_fwd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include <map> +#include <string> + +class Context; +class ContextWQ; + +namespace librbd { + +class ImageCtx; + +namespace mirror { + +template <typename ImageCtxT = ImageCtx> +class EnableRequest { +public: + static EnableRequest *create(ImageCtxT *image_ctx, Context *on_finish) { + return create(image_ctx->md_ctx, image_ctx->id, "", + image_ctx->op_work_queue, on_finish); + } + static EnableRequest *create(librados::IoCtx &io_ctx, + const std::string &image_id, + const std::string &non_primary_global_image_id, + ContextWQ *op_work_queue, Context *on_finish) { + return new EnableRequest(io_ctx, image_id, non_primary_global_image_id, + op_work_queue, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_MIRROR_IMAGE * * * * * * * + * | * (on error) + * v * + * GET_TAG_OWNER * * * * * * * * + * | * + * v * + * SET_MIRROR_IMAGE * * * * * * * + * | * + * v * + * NOTIFY_MIRRORING_WATCHER * * * + * | * + * v * + * <finish> < * * * * * * * * * + * + * @endverbatim + */ + + EnableRequest(librados::IoCtx &io_ctx, const std::string &image_id, + const std::string &non_primary_global_image_id, + ContextWQ *op_work_queue, Context *on_finish); + + librados::IoCtx &m_io_ctx; + std::string m_image_id; + std::string m_non_primary_global_image_id; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct = nullptr; + bool m_is_primary = false; + bufferlist m_out_bl; + cls::rbd::MirrorImage m_mirror_image; + + void send_get_mirror_image(); + Context *handle_get_mirror_image(int *result); + + void send_get_tag_owner(); + Context 
*handle_get_tag_owner(int *result); + + void send_set_mirror_image(); + Context *handle_set_mirror_image(int *result); + + void send_notify_mirroring_watcher(); + Context *handle_notify_mirroring_watcher(int *result); +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::EnableRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H diff --git a/src/librbd/mirror/GetInfoRequest.cc b/src/librbd/mirror/GetInfoRequest.cc new file mode 100644 index 00000000..460c8cb1 --- /dev/null +++ b/src/librbd/mirror/GetInfoRequest.cc @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/GetInfoRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::GetInfoRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +void GetInfoRequest<I>::send() { + get_mirror_image(); +} + +template <typename I> +void GetInfoRequest<I>::get_mirror_image() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_get_start(&op, m_image_ctx.id); + + librados::AioCompletion *comp = create_rados_callback< + GetInfoRequest<I>, &GetInfoRequest<I>::handle_get_mirror_image>(this); + int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void GetInfoRequest<I>::handle_get_mirror_image(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + 
m_mirror_image->state = cls::rbd::MIRROR_IMAGE_STATE_DISABLED; + *m_promotion_state = PROMOTION_STATE_NON_PRIMARY; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::mirror_image_get_finish(&iter, m_mirror_image); + } + + if (r == -ENOENT || + m_mirror_image->state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + ldout(cct, 20) << "mirroring is disabled" << dendl; + finish(0); + return; + } else if (r < 0) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + get_tag_owner(); +} + +template <typename I> +void GetInfoRequest<I>::get_tag_owner() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + GetInfoRequest<I>, &GetInfoRequest<I>::handle_get_tag_owner>(this); + Journal<I>::get_tag_owner(m_image_ctx.md_ctx, m_image_ctx.id, + &m_mirror_uuid, m_image_ctx.op_work_queue, ctx); +} + +template <typename I> +void GetInfoRequest<I>::handle_get_tag_owner(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to determine tag ownership: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (m_mirror_uuid == Journal<>::LOCAL_MIRROR_UUID) { + *m_promotion_state = PROMOTION_STATE_PRIMARY; + } else if (m_mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID) { + *m_promotion_state = PROMOTION_STATE_ORPHAN; + } + + finish(0); +} + +template <typename I> +void GetInfoRequest<I>::finish(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::GetInfoRequest<librbd::ImageCtx>; diff --git a/src/librbd/mirror/GetInfoRequest.h b/src/librbd/mirror/GetInfoRequest.h new file mode 100644 index 00000000..db8073c6 --- /dev/null +++ b/src/librbd/mirror/GetInfoRequest.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H +#define CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H + +#include "include/buffer.h" +#include "librbd/mirror/Types.h" +#include <string> + +struct Context; +namespace cls { namespace rbd { struct MirrorImage; } } + +namespace librbd { + +struct ImageCtx; + +namespace mirror { + +template <typename ImageCtxT = librbd::ImageCtx> +class GetInfoRequest { +public: + static GetInfoRequest *create(ImageCtxT &image_ctx, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + Context *on_finish) { + return new GetInfoRequest(image_ctx, mirror_image, promotion_state, + on_finish); + } + + GetInfoRequest(ImageCtxT &image_ctx, cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, Context *on_finish) + : m_image_ctx(image_ctx), m_mirror_image(mirror_image), + m_promotion_state(promotion_state), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_MIRROR_IMAGE + * | + * v + * GET_TAG_OWNER + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + cls::rbd::MirrorImage *m_mirror_image; + PromotionState *m_promotion_state; + Context *m_on_finish; + + bufferlist m_out_bl; + std::string m_mirror_uuid; + + void get_mirror_image(); + void handle_get_mirror_image(int r); + + void get_tag_owner(); + void handle_get_tag_owner(int r); + + void finish(int r); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::GetInfoRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H + diff --git a/src/librbd/mirror/GetStatusRequest.cc b/src/librbd/mirror/GetStatusRequest.cc new file mode 100644 index 00000000..57096883 --- /dev/null +++ b/src/librbd/mirror/GetStatusRequest.cc @@ -0,0 +1,112 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"librbd/mirror/GetStatusRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/mirror/GetInfoRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::GetStatusRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +void GetStatusRequest<I>::send() { + *m_mirror_image_status = cls::rbd::MirrorImageStatus( + cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN, "status not found"); + + get_info(); +} + +template <typename I> +void GetStatusRequest<I>::get_info() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + GetStatusRequest<I>, &GetStatusRequest<I>::handle_get_info>(this); + auto req = GetInfoRequest<I>::create(m_image_ctx, m_mirror_image, + m_promotion_state, ctx); + req->send(); +} + +template <typename I> +void GetStatusRequest<I>::handle_get_info(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else if (m_mirror_image->state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + finish(0); + return; + } + + get_status(); +} + +template <typename I> +void GetStatusRequest<I>::get_status() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_status_get_start( + &op, m_mirror_image->global_image_id); + + librados::AioCompletion *comp = create_rados_callback< + GetStatusRequest<I>, &GetStatusRequest<I>::handle_get_status>(this); + int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, 
&op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void GetStatusRequest<I>::handle_get_status(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::mirror_image_status_get_finish(&iter, + m_mirror_image_status); + } + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirror image status: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void GetStatusRequest<I>::finish(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::GetStatusRequest<librbd::ImageCtx>; diff --git a/src/librbd/mirror/GetStatusRequest.h b/src/librbd/mirror/GetStatusRequest.h new file mode 100644 index 00000000..4c1a81f0 --- /dev/null +++ b/src/librbd/mirror/GetStatusRequest.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H +#define CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H + +#include "include/buffer.h" +#include "librbd/mirror/Types.h" +#include <string> + +struct Context; +namespace cls { namespace rbd { struct MirrorImage; } } +namespace cls { namespace rbd { struct MirrorImageStatus; } } + +namespace librbd { + +struct ImageCtx; + +namespace mirror { + +template <typename ImageCtxT = librbd::ImageCtx> +class GetStatusRequest { +public: + static GetStatusRequest *create(ImageCtxT &image_ctx, + cls::rbd::MirrorImageStatus *status, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + Context *on_finish) { + return new GetStatusRequest(image_ctx, status, mirror_image, + promotion_state, on_finish); + } + + GetStatusRequest(ImageCtxT &image_ctx, cls::rbd::MirrorImageStatus *status, + 
cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, Context *on_finish) + : m_image_ctx(image_ctx), m_mirror_image_status(status), + m_mirror_image(mirror_image), m_promotion_state(promotion_state), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_INFO + * | + * v + * GET_STATUS + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + cls::rbd::MirrorImageStatus *m_mirror_image_status; + cls::rbd::MirrorImage *m_mirror_image; + PromotionState *m_promotion_state; + Context *m_on_finish; + + bufferlist m_out_bl; + + void get_info(); + void handle_get_info(int r); + + void get_status(); + void handle_get_status(int r); + + void finish(int r); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::GetStatusRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H + diff --git a/src/librbd/mirror/PromoteRequest.cc b/src/librbd/mirror/PromoteRequest.cc new file mode 100644 index 00000000..5603cb13 --- /dev/null +++ b/src/librbd/mirror/PromoteRequest.cc @@ -0,0 +1,103 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/PromoteRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/mirror/GetInfoRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::PromoteRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using librbd::util::create_context_callback; + +template <typename I> +void PromoteRequest<I>::send() { + get_info(); +} + +template <typename I> +void PromoteRequest<I>::get_info() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << 
dendl; + + auto ctx = create_context_callback< + PromoteRequest<I>, &PromoteRequest<I>::handle_get_info>(this); + auto req = GetInfoRequest<I>::create(m_image_ctx, &m_mirror_image, + &m_promotion_state, ctx); + req->send(); +} + +template <typename I> +void PromoteRequest<I>::handle_get_info(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(cct) << "mirroring is not currently enabled" << dendl; + finish(-EINVAL); + return; + } else if (m_promotion_state == PROMOTION_STATE_PRIMARY) { + lderr(cct) << "image is already primary" << dendl; + finish(-EINVAL); + return; + } else if (m_promotion_state == PROMOTION_STATE_NON_PRIMARY && !m_force) { + lderr(cct) << "image is still primary within a remote cluster" << dendl; + finish(-EBUSY); + return; + } + + promote(); +} + +template <typename I> +void PromoteRequest<I>::promote() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + PromoteRequest<I>, &PromoteRequest<I>::handle_promote>(this); + Journal<I>::promote(&m_image_ctx, ctx); +} + +template <typename I> +void PromoteRequest<I>::handle_promote(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to promote image: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template <typename I> +void PromoteRequest<I>::finish(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::PromoteRequest<librbd::ImageCtx>; diff --git a/src/librbd/mirror/PromoteRequest.h b/src/librbd/mirror/PromoteRequest.h new file mode 100644 index 
00000000..17609c5f
--- /dev/null
+++ b/src/librbd/mirror/PromoteRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+/**
+ * Asynchronous request that promotes a mirrored image to primary.
+ *
+ * The request looks up the image's mirroring info and then asks the journal
+ * to promote the image.  It completes on_finish with the result and deletes
+ * itself, so it must be heap-allocated (see create()).
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class PromoteRequest {
+public:
+  /// Allocate a request; ownership passes to the request itself once send()
+  /// is called.
+  static PromoteRequest *create(ImageCtxT &image_ctx, bool force,
+                                Context *on_finish) {
+    return new PromoteRequest(image_ctx, force, on_finish);
+  }
+
+  PromoteRequest(ImageCtxT &image_ctx, bool force, Context *on_finish)
+    : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish) {
+  }
+
+  /// Start the state machine.
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_INFO
+   *    |
+   *    v
+   * PROMOTE
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   *
+   * Tag ownership is resolved as part of GET_INFO (GetInfoRequest); this
+   * request has no separate tag-owner state.
+   */
+
+  ImageCtxT &m_image_ctx;
+  // when false, an image whose promotion state is NON_PRIMARY is refused
+  // with -EBUSY instead of being promoted
+  bool m_force;
+  Context *m_on_finish;
+
+  // filled in by the GET_INFO step
+  cls::rbd::MirrorImage m_mirror_image;
+  PromotionState m_promotion_state = PROMOTION_STATE_PRIMARY;
+
+  void get_info();
+  void handle_get_info(int r);
+
+  void promote();
+  void handle_promote(int r);
+
+  void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::PromoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
diff --git a/src/librbd/mirror/Types.h b/src/librbd/mirror/Types.h
new file mode 100644
index 00000000..38511bdb
--- /dev/null
+++ b/src/librbd/mirror/Types.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_TYPES_H
+#define CEPH_LIBRBD_MIRROR_TYPES_H
+
+namespace librbd {
+namespace mirror {
+
+// Promotion state of a mirrored image, derived from the owner of its journal
+// tag.
+enum PromotionState {
+  PROMOTION_STATE_PRIMARY,
+  PROMOTION_STATE_NON_PRIMARY,
+  PROMOTION_STATE_ORPHAN
+};
+ +} // namespace mirror +} // namespace librbd + +#endif // CEPH_LIBRBD_MIRROR_TYPES_H + diff --git a/src/librbd/mirroring_watcher/Types.cc b/src/librbd/mirroring_watcher/Types.cc new file mode 100644 index 00000000..3226b635 --- /dev/null +++ b/src/librbd/mirroring_watcher/Types.cc @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/Formatter.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "librbd/mirroring_watcher/Types.h" +#include "librbd/watcher/Utils.h" + +namespace librbd { +namespace mirroring_watcher { + +namespace { + +class DumpPayloadVisitor : public boost::static_visitor<void> { +public: + explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {} + + template <typename Payload> + inline void operator()(const Payload &payload) const { + NotifyOp notify_op = Payload::NOTIFY_OP; + m_formatter->dump_string("notify_op", stringify(notify_op)); + payload.dump(m_formatter); + } + +private: + ceph::Formatter *m_formatter; +}; + +} // anonymous namespace + +void ModeUpdatedPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(static_cast<uint32_t>(mirror_mode), bl); +} + +void ModeUpdatedPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + uint32_t mirror_mode_decode; + decode(mirror_mode_decode, iter); + mirror_mode = static_cast<cls::rbd::MirrorMode>(mirror_mode_decode); +} + +void ModeUpdatedPayload::dump(Formatter *f) const { + f->dump_stream("mirror_mode") << mirror_mode; +} + +void ImageUpdatedPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(static_cast<uint32_t>(mirror_image_state), bl); + encode(image_id, bl); + encode(global_image_id, bl); +} + +void ImageUpdatedPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + uint32_t mirror_image_state_decode; + decode(mirror_image_state_decode, iter); + 
mirror_image_state = static_cast<cls::rbd::MirrorImageState>( + mirror_image_state_decode); + decode(image_id, iter); + decode(global_image_id, iter); +} + +void ImageUpdatedPayload::dump(Formatter *f) const { + f->dump_stream("mirror_image_state") << mirror_image_state; + f->dump_string("image_id", image_id); + f->dump_string("global_image_id", global_image_id); +} + +void UnknownPayload::encode(bufferlist &bl) const { + ceph_abort(); +} + +void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + +void UnknownPayload::dump(Formatter *f) const { +} + +void NotifyMessage::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload); + ENCODE_FINISH(bl); +} + +void NotifyMessage::decode(bufferlist::const_iterator& iter) { + DECODE_START(1, iter); + + uint32_t notify_op; + decode(notify_op, iter); + + // select the correct payload variant based upon the encoded op + switch (notify_op) { + case NOTIFY_OP_MODE_UPDATED: + payload = ModeUpdatedPayload(); + break; + case NOTIFY_OP_IMAGE_UPDATED: + payload = ImageUpdatedPayload(); + break; + default: + payload = UnknownPayload(); + break; + } + + apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload); + DECODE_FINISH(iter); +} + +void NotifyMessage::dump(Formatter *f) const { + apply_visitor(DumpPayloadVisitor(f), payload); +} + +void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) { + o.push_back(new NotifyMessage(ModeUpdatedPayload(cls::rbd::MIRROR_MODE_DISABLED))); + o.push_back(new NotifyMessage(ImageUpdatedPayload(cls::rbd::MIRROR_IMAGE_STATE_DISABLING, + "image id", "global image id"))); +} + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op) { + switch (op) { + case NOTIFY_OP_MODE_UPDATED: + out << "ModeUpdated"; + break; + case NOTIFY_OP_IMAGE_UPDATED: + out << "ImageUpdated"; + break; + default: + out << "Unknown (" << static_cast<uint32_t>(op) << ")"; + break; + } 
+ return out; +} + +} // namespace mirroring_watcher +} // namespace librbd diff --git a/src/librbd/mirroring_watcher/Types.h b/src/librbd/mirroring_watcher/Types.h new file mode 100644 index 00000000..1e096a9d --- /dev/null +++ b/src/librbd/mirroring_watcher/Types.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H +#define CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "cls/rbd/cls_rbd_types.h" +#include <iosfwd> +#include <list> +#include <string> +#include <boost/variant.hpp> + +namespace ceph { class Formatter; } + +namespace librbd { +namespace mirroring_watcher { + +enum NotifyOp { + NOTIFY_OP_MODE_UPDATED = 0, + NOTIFY_OP_IMAGE_UPDATED = 1 +}; + +struct ModeUpdatedPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_MODE_UPDATED; + + cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + + ModeUpdatedPayload() { + } + ModeUpdatedPayload(cls::rbd::MirrorMode mirror_mode) + : mirror_mode(mirror_mode) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct ImageUpdatedPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_UPDATED; + + cls::rbd::MirrorImageState mirror_image_state = + cls::rbd::MIRROR_IMAGE_STATE_ENABLED; + std::string image_id; + std::string global_image_id; + + ImageUpdatedPayload() { + } + ImageUpdatedPayload(cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, + const std::string &global_image_id) + : mirror_image_state(mirror_image_state), image_id(image_id), + global_image_id(global_image_id) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct UnknownPayload { + static const 
NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1); + + UnknownPayload() { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +typedef boost::variant<ModeUpdatedPayload, + ImageUpdatedPayload, + UnknownPayload> Payload; + +struct NotifyMessage { + NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) { + } + + Payload payload; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list<NotifyMessage *> &o); +}; + +WRITE_CLASS_ENCODER(NotifyMessage); + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op); + +} // namespace mirroring_watcher +} // namespace librbd + +using librbd::mirroring_watcher::encode; +using librbd::mirroring_watcher::decode; + +#endif // CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H diff --git a/src/librbd/object_map/CreateRequest.cc b/src/librbd/object_map/CreateRequest.cc new file mode 100644 index 00000000..a2b9d8f6 --- /dev/null +++ b/src/librbd/object_map/CreateRequest.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/CreateRequest.h" +#include "include/ceph_assert.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "osdc/Striper.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::CreateRequest: " + +namespace librbd { +namespace object_map { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +CreateRequest<I>::CreateRequest(I *image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { +} + +template <typename I> +void CreateRequest<I>::send() { + 
CephContext *cct = m_image_ctx->cct; + + uint64_t max_size = m_image_ctx->size; + + { + RWLock::WLocker snap_locker(m_image_ctx->snap_lock); + m_snap_ids.push_back(CEPH_NOSNAP); + for (auto it : m_image_ctx->snap_info) { + max_size = std::max(max_size, it.second.size); + m_snap_ids.push_back(it.first); + } + + if (ObjectMap<>::is_compatible(m_image_ctx->layout, max_size)) { + send_object_map_resize(); + return; + } + } + + lderr(cct) << "image size not compatible with object map" << dendl; + m_on_finish->complete(-EINVAL); +} + +template <typename I> +void CreateRequest<I>::send_object_map_resize() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = create_context_callback< + CreateRequest<I>, &CreateRequest<I>::handle_object_map_resize>(this); + C_Gather *gather_ctx = new C_Gather(cct, ctx); + + for (auto snap_id : m_snap_ids) { + librados::ObjectWriteOperation op; + uint64_t snap_size = m_image_ctx->get_image_size(snap_id); + + cls_client::object_map_resize(&op, Striper::get_num_objects( + m_image_ctx->layout, snap_size), + OBJECT_NONEXISTENT); + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, snap_id)); + librados::AioCompletion *comp = create_rados_callback(gather_ctx->new_sub()); + int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + gather_ctx->activate(); +} + +template <typename I> +Context *CreateRequest<I>::handle_object_map_resize(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "object map resize failed: " << cpp_strerror(*result) + << dendl; + } + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::CreateRequest<librbd::ImageCtx>; diff --git a/src/librbd/object_map/CreateRequest.h b/src/librbd/object_map/CreateRequest.h new file mode 100644 index 00000000..6929abe7 --- 
/dev/null +++ b/src/librbd/object_map/CreateRequest.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H + +#include "include/buffer.h" +#include "common/Mutex.h" +#include <map> +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template <typename ImageCtxT = ImageCtx> +class CreateRequest { +public: + static CreateRequest *create(ImageCtxT *image_ctx, Context *on_finish) { + return new CreateRequest(image_ctx, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | . . . + * v v . + * OBJECT_MAP_RESIZE . (for every snapshot) + * | . . + * v . . . + * <finis> + * + * @endverbatim + */ + + CreateRequest(ImageCtxT *image_ctx, Context *on_finish); + + ImageCtxT *m_image_ctx; + Context *m_on_finish; + + std::vector<uint64_t> m_snap_ids; + + void send_object_map_resize(); + Context *handle_object_map_resize(int *result); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::CreateRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H diff --git a/src/librbd/object_map/InvalidateRequest.cc b/src/librbd/object_map/InvalidateRequest.cc new file mode 100644 index 00000000..9f7a2fa1 --- /dev/null +++ b/src/librbd/object_map/InvalidateRequest.cc @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/InvalidateRequest.h" +#include "common/dout.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::InvalidateRequest: " + +namespace librbd { +namespace object_map { + +template <typename I> +InvalidateRequest<I>* InvalidateRequest<I>::create(I 
&image_ctx, + uint64_t snap_id, bool force, + Context *on_finish) { + return new InvalidateRequest<I>(image_ctx, snap_id, force, on_finish); +} +/* Flags the object map of m_snap_id as invalid. Completes early when get_flags() fails or the invalid flag is already set. Otherwise sets RBD_FLAG_OBJECT_MAP_INVALID (plus RBD_FLAG_FAST_DIFF_INVALID when the fast-diff feature is enabled) in memory and then writes the same flags to the header object, unless there is no image watcher or the unforced CEPH_NOSNAP case finds an exclusive lock not owned by this client (completes with -EROFS). Asserts that owner_lock is held and snap_lock is write-held. */ +template <typename I> +void InvalidateRequest<I>::send() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + ceph_assert(image_ctx.snap_lock.is_wlocked()); + + uint64_t snap_flags; + int r = image_ctx.get_flags(m_snap_id, &snap_flags); + if (r < 0 || ((snap_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0)) { + this->async_complete(r); + return; + } + + CephContext *cct = image_ctx.cct; + lderr(cct) << this << " invalidating object map in-memory" << dendl; + + // update in-memory flags + uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID; + if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) { + flags |= RBD_FLAG_FAST_DIFF_INVALID; + } + + r = image_ctx.update_flags(m_snap_id, flags, true); + if (r < 0) { + this->async_complete(r); + return; + } + + // do not update on-disk flags if not image owner + if (image_ctx.image_watcher == nullptr || + (!m_force && m_snap_id == CEPH_NOSNAP && + image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner())) { + this->async_complete(-EROFS); + return; + } + + lderr(cct) << this << " invalidating object map on-disk" << dendl; + librados::ObjectWriteOperation op; + cls_client::set_flags(&op, m_snap_id, flags, flags); + + librados::AioCompletion *rados_completion = + this->create_callback_completion(); + r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion, + &op); + ceph_assert(r == 0); + rados_completion->release(); +} +/* Logs the result code and always reports the request as complete. */ +template <typename I> +bool InvalidateRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + lderr(cct) << this << " " << __func__ << ": r=" << r << dendl; + return true; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::InvalidateRequest<librbd::ImageCtx>; diff --git 
a/src/librbd/object_map/InvalidateRequest.h b/src/librbd/object_map/InvalidateRequest.h new file mode 100644 index 00000000..11c42037 --- /dev/null +++ b/src/librbd/object_map/InvalidateRequest.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/AsyncRequest.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { +/* AsyncRequest that flags the object map of one snapshot id as invalid. filter_return_code() always yields 0, i.e. errors are deliberately not propagated to the caller. */ +template <typename ImageCtxT = ImageCtx> +class InvalidateRequest : public AsyncRequest<ImageCtxT> { +public: + static InvalidateRequest* create(ImageCtxT &image_ctx, uint64_t snap_id, + bool force, Context *on_finish); + + InvalidateRequest(ImageCtxT &image_ctx, uint64_t snap_id, bool force, + Context *on_finish) + : AsyncRequest<ImageCtxT>(image_ctx, on_finish), + m_snap_id(snap_id), m_force(force) { + } + + void send() override; + +protected: + bool should_complete(int r) override; + int filter_return_code(int r) const override{ + // never propagate an error back to the caller + return 0; + } + +private: + uint64_t m_snap_id; + bool m_force; +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::InvalidateRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H diff --git a/src/librbd/object_map/LockRequest.cc b/src/librbd/object_map/LockRequest.cc new file mode 100644 index 00000000..4ed4fa1b --- /dev/null +++ b/src/librbd/object_map/LockRequest.cc @@ -0,0 +1,157 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/LockRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd 
+#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::LockRequest: " + +namespace librbd { +namespace object_map { + +using util::create_rados_callback; + +template <typename I> +LockRequest<I>::LockRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish), m_broke_lock(false) { +} + +template <typename I> +void LockRequest<I>::send() { + send_lock(); +} +/* Issues an exclusive RBD_LOCK_NAME lock operation on the CEPH_NOSNAP object map object; the completion is routed to handle_lock. */ +template <typename I> +void LockRequest<I>::send_lock() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::lock(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "", "", + utime_t(), 0); + + using klass = LockRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_lock>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} +/* Finishes (returns m_on_finish) on success, on -EEXIST, and on any error that is not a first -EBUSY; in the error cases the failure is logged and *ret_val is reset to 0. A -EBUSY seen before any lock was broken starts send_get_lock_info() and returns nullptr. */ +template <typename I> +Context *LockRequest<I>::handle_lock(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val == 0) { + return m_on_finish; + } else if (*ret_val == -EEXIST) { + // already locked by myself + *ret_val = 0; + return m_on_finish; + } else if (m_broke_lock || *ret_val != -EBUSY) { + lderr(cct) << "failed to lock object map: " << cpp_strerror(*ret_val) + << dendl; + *ret_val = 0; + return m_on_finish; + } + + send_get_lock_info(); + return nullptr; +} +/* Requests the lock info of RBD_LOCK_NAME on the CEPH_NOSNAP object map object; the reply is collected in m_out_bl. */ +template <typename I> +void LockRequest<I>::send_get_lock_info() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectReadOperation op; + rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); + + using klass = 
LockRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_get_lock_info>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} +/* -ENOENT retries the lock. A read or decode failure is logged, *ret_val is reset to 0 and the request finishes. Otherwise the decoded lockers (m_lockers) are broken via send_break_locks(). */ +template <typename I> +Context *LockRequest<I>::handle_get_lock_info(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val == -ENOENT) { + send_lock(); + return nullptr; + } + + ClsLockType lock_type; + std::string lock_tag; + if (*ret_val == 0) { + auto it = m_out_bl.cbegin(); + *ret_val = rados::cls::lock::get_lock_info_finish(&it, &m_lockers, + &lock_type, &lock_tag); + } + if (*ret_val < 0) { + lderr(cct) << "failed to list object map locks: " << cpp_strerror(*ret_val) + << dendl; + *ret_val = 0; + return m_on_finish; + } + + send_break_locks(); + return nullptr; +} +/* Builds a single write operation containing one break_lock per entry of m_lockers and submits it. */ +template <typename I> +void LockRequest<I>::send_break_locks() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << ", " + << "num_lockers=" << m_lockers.size() << dendl; + + librados::ObjectWriteOperation op; + for (auto &locker : m_lockers) { + rados::cls::lock::break_lock(&op, RBD_LOCK_NAME, locker.first.cookie, + locker.first.locker); + } + + using klass = LockRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_break_locks>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} +/* Sets m_broke_lock so a later -EBUSY is not retried, then re-sends the lock on success or -ENOENT; any other error is logged, reset to 0 and finishes the request. */ +template <typename I> +Context *LockRequest<I>::handle_break_locks(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + m_broke_lock = true; + if (*ret_val == 0 || *ret_val == -ENOENT) { + send_lock(); + return nullptr; + 
} + + lderr(cct) << "failed to break object map lock: " << cpp_strerror(*ret_val) + << dendl; + *ret_val = 0; + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::LockRequest<librbd::ImageCtx>; diff --git a/src/librbd/object_map/LockRequest.h b/src/librbd/object_map/LockRequest.h new file mode 100644 index 00000000..0333548e --- /dev/null +++ b/src/librbd/object_map/LockRequest.h @@ -0,0 +1,75 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H + +#include "include/buffer.h" +#include "cls/lock/cls_lock_types.h" +#include <map> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { +/* State machine that takes the exclusive lock on the CEPH_NOSNAP object map object, breaking the existing lockers once when the first attempt returns -EBUSY. Every error path in the implementation resets *ret_val to 0 before finishing. */ +template <typename ImageCtxT = ImageCtx> +class LockRequest { +public: + static LockRequest* create(ImageCtxT &image_ctx, Context *on_finish) { + return new LockRequest(image_ctx, on_finish); + } + LockRequest(ImageCtxT &image_ctx, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> /------------------------------------- BREAK_LOCKS * * * + * | | ^ * + * | | | * + * | | | * + * | v (EBUSY && !broke_lock) | * + * \---------> LOCK_OBJECT_MAP * * * * * * * * * * * > GET_LOCK_INFO * * + * | * ^ * * + * | * * * * + * | * * (ENOENT) * * + * | * * * * * * * * * * * * * * * * * * + * | * * + * | * (other errors) * + * | * * + * v v (other errors) * + * <finish> < * * * * * * * * * * * * * * * * * * * * * * * * + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + bool m_broke_lock; + std::map<rados::cls::lock::locker_id_t, + rados::cls::lock::locker_info_t> m_lockers; + bufferlist m_out_bl; + + void send_lock(); + Context *handle_lock(int *ret_val); + + void send_get_lock_info(); + Context *handle_get_lock_info(int *ret_val); + + void send_break_locks(); + Context *handle_break_locks(int *ret_val); 
+}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::LockRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H diff --git a/src/librbd/object_map/RefreshRequest.cc b/src/librbd/object_map/RefreshRequest.cc new file mode 100644 index 00000000..c4fc2539 --- /dev/null +++ b/src/librbd/object_map/RefreshRequest.cc @@ -0,0 +1,303 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/RefreshRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/object_map/InvalidateRequest.h" +#include "librbd/object_map/LockRequest.h" +#include "librbd/object_map/ResizeRequest.h" +#include "librbd/Utils.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::RefreshRequest: " + +namespace librbd { + +using util::create_context_callback; +using util::create_rados_callback; + +namespace object_map { + +template <typename I> +RefreshRequest<I>::RefreshRequest(I &image_ctx, ceph::BitVector<2> *object_map, + uint64_t snap_id, Context *on_finish) + : m_image_ctx(image_ctx), m_object_map(object_map), m_snap_id(snap_id), + m_on_finish(on_finish), m_object_count(0), + m_truncate_on_disk_object_map(false) { +} +/* Computes m_object_count for m_snap_id while holding snap_lock for read, then enters the state machine at send_lock(). */ +template <typename I> +void RefreshRequest<I>::send() { + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + m_object_count = Striper::get_num_objects( + m_image_ctx.layout, m_image_ctx.get_image_size(m_snap_id)); + } + + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": " + << "object_count=" << m_object_count << dendl; + send_lock(); +} +/* Copies m_on_disk_object_map into *m_object_map after asserting that it holds at least as many entries as the image currently has objects. */ +template <typename I> +void RefreshRequest<I>::apply() { + uint64_t num_objs; + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + num_objs = 
Striper::get_num_objects( + m_image_ctx.layout, m_image_ctx.get_image_size(m_snap_id)); + } + ceph_assert(m_on_disk_object_map.size() >= num_objs); + + *m_object_map = m_on_disk_object_map; +} +/* Takes the invalidate-and-close path when m_object_count exceeds cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT, skips locking and loads directly for snapshots, and otherwise delegates to a LockRequest that completes in handle_lock. */ +template <typename I> +void RefreshRequest<I>::send_lock() { + CephContext *cct = m_image_ctx.cct; + if (m_object_count > cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT) { + send_invalidate_and_close(); + return; + } else if (m_snap_id != CEPH_NOSNAP) { + send_load(); + return; + } + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_lock>(this); + + LockRequest<I> *req = LockRequest<I>::create(m_image_ctx, ctx); + req->send(); +} +/* Asserts that the lock step reported 0 and continues with send_load(). */ +template <typename I> +Context *RefreshRequest<I>::handle_lock(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + ceph_assert(*ret_val == 0); + send_load(); + return nullptr; +} +/* Issues object_map_load on the object map object of m_snap_id; the reply is collected in m_out_bl (cleared first). */ +template <typename I> +void RefreshRequest<I>::send_load() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectReadOperation op; + cls_client::object_map_load_start(&op); + + using klass = RefreshRequest<I>; + m_out_bl.clear(); + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_load>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} +/* Decodes the loaded map into m_on_disk_object_map and routes: -EINVAL (corrupt) or a map smaller than m_object_count goes to send_resize_invalidate(), any other error to send_invalidate(); otherwise the map is applied and the request finishes. */ +template <typename I> +Context *RefreshRequest<I>::handle_load(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val == 0) { + auto bl_it = m_out_bl.cbegin(); + *ret_val = 
cls_client::object_map_load_finish(&bl_it, + &m_on_disk_object_map); + } + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + if (*ret_val == -EINVAL) { + // object map is corrupt on-disk -- clear it and properly size it + // so future IO can keep the object map in sync + lderr(cct) << "object map corrupt on-disk: " << oid << dendl; + m_truncate_on_disk_object_map = true; + send_resize_invalidate(); + return nullptr; + } else if (*ret_val < 0) { + lderr(cct) << "failed to load object map: " << oid << dendl; + send_invalidate(); + return nullptr; + } + + if (m_on_disk_object_map.size() < m_object_count) { + lderr(cct) << "object map smaller than current object count: " + << m_on_disk_object_map.size() << " != " + << m_object_count << dendl; + send_resize_invalidate(); + return nullptr; + } + + ldout(cct, 20) << "refreshed object map: num_objs=" + << m_on_disk_object_map.size() << dendl; + if (m_on_disk_object_map.size() > m_object_count) { + // resize op might have been interrupted + ldout(cct, 1) << "object map larger than current object count: " + << m_on_disk_object_map.size() << " != " + << m_object_count << dendl; + } + + apply(); + return m_on_finish; +} +/* Replaces m_on_disk_object_map with m_object_count entries set to OBJECT_EXISTS and sends a forced InvalidateRequest while holding owner_lock (read) and snap_lock (write). */ +template <typename I> +void RefreshRequest<I>::send_invalidate() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_on_disk_object_map.clear(); + object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count, + OBJECT_EXISTS); + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_invalidate>(this); + InvalidateRequest<I> *req = InvalidateRequest<I>::create( + m_image_ctx, m_snap_id, true, ctx); + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + req->send(); +} +/* Logs an invalidation failure, applies the substitute map and finishes. */ +template <typename I> +Context *RefreshRequest<I>::handle_invalidate(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " 
" << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val) + << dendl; + } + + apply(); + return m_on_finish; +} +/* Same preparation as send_invalidate() (all-OBJECT_EXISTS substitute map, forced InvalidateRequest), but the completion goes to handle_resize_invalidate so that a successful invalidation is followed by an on-disk resize. */ +template <typename I> +void RefreshRequest<I>::send_resize_invalidate() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_on_disk_object_map.clear(); + object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count, + OBJECT_EXISTS); + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_resize_invalidate>(this); + InvalidateRequest<I> *req = InvalidateRequest<I>::create( + m_image_ctx, m_snap_id, true, ctx); + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + req->send(); +} +/* On failure: logs, applies the substitute map and finishes. On success: proceeds to send_resize(). */ +template <typename I> +Context *RefreshRequest<I>::handle_resize_invalidate(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val) + << dendl; + apply(); + return m_on_finish; + } + + send_resize(); + return nullptr; +} +/* Resizes the on-disk object map to m_object_count entries, truncating the object first when m_truncate_on_disk_object_map is set; for CEPH_NOSNAP the write also asserts that the exclusive RBD_LOCK_NAME lock is held. */ +template <typename I> +void RefreshRequest<I>::send_resize() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + if (m_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + } + if (m_truncate_on_disk_object_map) { + op.truncate(0); + } + cls_client::object_map_resize(&op, m_object_count, OBJECT_NONEXISTENT); + + using klass = RefreshRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_resize>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, 
rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} +/* A failed resize is logged and *ret_val reset to 0; the in-memory map is applied and the request finishes either way. */ +template <typename I> +Context *RefreshRequest<I>::handle_resize(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0) { + lderr(cct) << "failed to adjust object map size: " << cpp_strerror(*ret_val) + << dendl; + *ret_val = 0; + } + + apply(); + return m_on_finish; +} +/* Path taken when the object count is too large for an object map: sends an unforced InvalidateRequest without loading anything. */ +template <typename I> +void RefreshRequest<I>::send_invalidate_and_close() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshRequest<I>; + Context *ctx = create_context_callback< + klass, &klass::handle_invalidate_and_close>(this); + InvalidateRequest<I> *req = InvalidateRequest<I>::create( + m_image_ctx, m_snap_id, false, ctx); + + lderr(cct) << "object map too large: " << m_object_count << dendl; + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + req->send(); +} +/* Clears *m_object_map and finishes; *ret_val keeps the invalidation error, or becomes -EFBIG when the invalidation succeeded. */ +template <typename I> +Context *RefreshRequest<I>::handle_invalidate_and_close(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val) + << dendl; + } else { + *ret_val = -EFBIG; + } + + m_object_map->clear(); + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::RefreshRequest<librbd::ImageCtx>; diff --git a/src/librbd/object_map/RefreshRequest.h b/src/librbd/object_map/RefreshRequest.h new file mode 100644 index 00000000..24a7998a --- /dev/null +++ b/src/librbd/object_map/RefreshRequest.h @@ -0,0 +1,96 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H +#define 
CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/bit_vector.hpp" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { +/* State machine that loads the on-disk object map of one snapshot id into the caller-supplied BitVector, invalidating it when it cannot be loaded and additionally resizing it on disk when it is corrupt or too small. */ +template <typename ImageCtxT = ImageCtx> +class RefreshRequest { +public: + static RefreshRequest *create(ImageCtxT &image_ctx, + ceph::BitVector<2> *object_map, + uint64_t snap_id, Context *on_finish) { + return new RefreshRequest(image_ctx, object_map, snap_id, on_finish); + } + + RefreshRequest(ImageCtxT &image_ctx, ceph::BitVector<2> *object_map, + uint64_t snap_id, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> -----> LOCK (skip if snapshot) + * * | + * * v (other errors) + * * LOAD * * * * * * * > INVALIDATE ------------\ + * * | * | + * * | * (-EINVAL or too small) | + * * | * * * * * * > INVALIDATE_AND_RESIZE | + * * | | * | + * * | | * | + * * | v * | + * * | RESIZE * | + * * | | * | + * * | | * * * * * * * | + * * | | * | + * * | v v | + * * \--------------------> LOCK <-------------/ + * * | + * v v + * INVALIDATE_AND_CLOSE ---------------> <finish> + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + ceph::BitVector<2> *m_object_map; + uint64_t m_snap_id; + Context *m_on_finish; + + uint64_t m_object_count; + ceph::BitVector<2> m_on_disk_object_map; + bool m_truncate_on_disk_object_map; + bufferlist m_out_bl; + + void send_lock(); + Context *handle_lock(int *ret_val); + + void send_load(); + Context *handle_load(int *ret_val); + + void send_invalidate(); + Context *handle_invalidate(int *ret_val); + + void send_resize_invalidate(); + Context *handle_resize_invalidate(int *ret_val); + + void send_resize(); + Context *handle_resize(int *ret_val); + + void send_invalidate_and_close(); + Context *handle_invalidate_and_close(int *ret_val); + + void apply(); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::RefreshRequest<librbd::ImageCtx>; + 
+#endif // CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H diff --git a/src/librbd/object_map/RemoveRequest.cc b/src/librbd/object_map/RemoveRequest.cc new file mode 100644 index 00000000..a7db0c03 --- /dev/null +++ b/src/librbd/object_map/RemoveRequest.cc @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/RemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::RemoveRequest: " + +namespace librbd { +namespace object_map { + +using util::create_rados_callback; + +template <typename I> +RemoveRequest<I>::RemoveRequest(I *image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish), + m_lock("object_map::RemoveRequest::m_lock") { +} + +template <typename I> +void RemoveRequest<I>::send() { + send_remove_object_map(); +} +/* Collects CEPH_NOSNAP plus every snapshot id under snap_lock, then, holding m_lock, issues one aio_remove per object map object; m_ref_counter counts the removals that are still outstanding. */ +template <typename I> +void RemoveRequest<I>::send_remove_object_map() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + RWLock::WLocker snap_locker(m_image_ctx->snap_lock); + std::vector<uint64_t> snap_ids; + snap_ids.push_back(CEPH_NOSNAP); + for (auto it : m_image_ctx->snap_info) { + snap_ids.push_back(it.first); + } + + Mutex::Locker locker(m_lock); + ceph_assert(m_ref_counter == 0); + + for (auto snap_id : snap_ids) { + m_ref_counter++; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, snap_id)); + using klass = RemoveRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_remove_object_map>(this); + + int r = m_image_ctx->md_ctx.aio_remove(oid, comp); + ceph_assert(r == 0); + comp->release(); + } +} +/* Completion of a single removal: records the most recent error other than -ENOENT in m_error_result and returns m_on_finish only once no removals remain outstanding (nullptr before that). */ +template <typename I> +Context 
*RemoveRequest<I>::handle_remove_object_map(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": r=" << *result << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_ref_counter > 0); + m_ref_counter--; + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to remove object map: " << cpp_strerror(*result) + << dendl; + m_error_result = *result; + } + if (m_ref_counter > 0) { + return nullptr; + } + } + if (m_error_result < 0) { + *result = m_error_result; + } + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::RemoveRequest<librbd::ImageCtx>; diff --git a/src/librbd/object_map/RemoveRequest.h b/src/librbd/object_map/RemoveRequest.h new file mode 100644 index 00000000..1353ef9b --- /dev/null +++ b/src/librbd/object_map/RemoveRequest.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H + +#include "include/buffer.h" +#include "common/Mutex.h" +#include <map> +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { +/* Request that removes the object map objects for CEPH_NOSNAP and for every snapshot id of the image; instances are built with create() and started with send(). */ +template <typename ImageCtxT = ImageCtx> +class RemoveRequest { +public: + static RemoveRequest *create(ImageCtxT *image_ctx, Context *on_finish) { + return new RemoveRequest(image_ctx, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | . . . + * v v . + * REMOVE_OBJECT_MAP . (for every snapshot) + * | . . + * v . . . 
+ * <finish> + * + * @endverbatim + */ + + RemoveRequest(ImageCtxT *image_ctx, Context *on_finish); + + ImageCtxT *m_image_ctx; + Context *m_on_finish; + + int m_error_result = 0; + int m_ref_counter = 0; + mutable Mutex m_lock; + + void send_remove_object_map(); + Context *handle_remove_object_map(int *result); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::RemoveRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H diff --git a/src/librbd/object_map/Request.cc b/src/librbd/object_map/Request.cc new file mode 100644 index 00000000..a5e3a8bd --- /dev/null +++ b/src/librbd/object_map/Request.cc @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/Request.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/RWLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/object_map/InvalidateRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::Request: " + +namespace librbd { +namespace object_map { +/* STATE_REQUEST: a failed update is logged and handed to invalidate(); a successful one runs finish_request() and completes. STATE_INVALIDATE: always completes, logging any invalidation error. Any other state aborts. */ +bool Request::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " should_complete: r=" << r << dendl; + + switch (m_state) + { + case STATE_REQUEST: + if (r < 0) { + lderr(cct) << "failed to update object map: " << cpp_strerror(r) + << dendl; + return invalidate(); + } + + finish_request(); + return true; + + case STATE_INVALIDATE: + ldout(cct, 20) << "INVALIDATE" << dendl; + if (r < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r) + << dendl; + } + return true; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + ceph_abort(); + break; + } + return false; +} +/* Returns true (complete now) when the object map is already flagged invalid. Otherwise switches to STATE_INVALIDATE, sends a forced InvalidateRequest under owner_lock (read) and snap_lock (write), and returns false (not complete yet). */ +bool Request::invalidate() { + bool flags_set; + int r = m_image_ctx.test_flags(m_snap_id, RBD_FLAG_OBJECT_MAP_INVALID, + &flags_set); + if (r == 0 && flags_set) { + return 
true; + } + + m_state = STATE_INVALIDATE; + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id, + true, + create_callback_context()); + req->send(); + return false; +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/Request.h b/src/librbd/object_map/Request.h new file mode 100644 index 00000000..aef8800d --- /dev/null +++ b/src/librbd/object_map/Request.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_REQUEST_H + +#include "include/int_types.h" +#include "librbd/AsyncRequest.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { +/* Base class for object map update requests: a failed update triggers invalidation of the object map, and filter_return_code() reports 0 while the request is still in STATE_REQUEST. Subclasses implement send() and may override finish_request(), which runs after a successful update. */ +class Request : public AsyncRequest<> { +public: + Request(ImageCtx &image_ctx, uint64_t snap_id, Context *on_finish) + : AsyncRequest(image_ctx, on_finish), m_snap_id(snap_id), + m_state(STATE_REQUEST) + { + } + + void send() override = 0; + +protected: + const uint64_t m_snap_id; + + bool should_complete(int r) override; + int filter_return_code(int r) const override { + if (m_state == STATE_REQUEST) { + // never propagate an error back to the caller + return 0; + } + return r; + } + virtual void finish_request() { + } + +private: + /** + * <start> ---> STATE_REQUEST ---> <finish> + * | ^ + * v | + * STATE_INVALIDATE -------/ + */ + enum State { + STATE_REQUEST, + STATE_INVALIDATE + }; + + State m_state; + + bool invalidate(); +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_REQUEST_H diff --git a/src/librbd/object_map/ResizeRequest.cc b/src/librbd/object_map/ResizeRequest.cc new file mode 100644 index 00000000..1ec0d99a --- /dev/null +++ b/src/librbd/object_map/ResizeRequest.cc @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/ResizeRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "cls/lock/cls_lock_client.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::ResizeRequest: " + +namespace librbd { +namespace object_map { +/* Resizes *object_map to num_objs entries; when the map grows, every newly added entry is set to default_state. */ +void ResizeRequest::resize(ceph::BitVector<2> *object_map, uint64_t num_objs, + uint8_t default_state) { + size_t orig_object_map_size = object_map->size(); + object_map->resize(num_objs); + if (num_objs > orig_object_map_size) { + auto it = object_map->begin() + orig_object_map_size; + auto end_it = object_map->begin() + num_objs; + for (;it != end_it; ++it) { + *it = default_state; + } + } +} +/* Derives m_num_objs from m_new_size and submits an object_map_resize write for the object map object of m_snap_id; for CEPH_NOSNAP the write also asserts that the exclusive RBD_LOCK_NAME lock is held. */ +void ResizeRequest::send() { + CephContext *cct = m_image_ctx.cct; + + RWLock::WLocker l(m_image_ctx.object_map_lock); + m_num_objs = Striper::get_num_objects(m_image_ctx.layout, m_new_size); + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 5) << this << " resizing on-disk object map: " + << "ictx=" << &m_image_ctx << ", " + << "oid=" << oid << ", num_objs=" << m_num_objs << dendl; + + librados::ObjectWriteOperation op; + if (m_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + } + cls_client::object_map_resize(&op, m_num_objs, m_default_object_state); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} +/* Applies the same resize to the in-memory map while holding object_map_lock for write. */ +void ResizeRequest::finish_request() { + RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " resizing in-memory object map: " + << m_num_objs << dendl; + + resize(m_object_map, m_num_objs, 
m_default_object_state); +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/ResizeRequest.h b/src/librbd/object_map/ResizeRequest.h new file mode 100644 index 00000000..51839400 --- /dev/null +++ b/src/librbd/object_map/ResizeRequest.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/object_map/Request.h" +#include "common/bit_vector.hpp" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { +/* Object map request that resizes the on-disk map to the object count implied by new_size and, through finish_request(), the in-memory map as well; new entries take default_object_state. The static resize() helper performs the in-memory part on any BitVector. */ +class ResizeRequest : public Request { +public: + ResizeRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map, + uint64_t snap_id, uint64_t new_size, + uint8_t default_object_state, Context *on_finish) + : Request(image_ctx, snap_id, on_finish), m_object_map(object_map), + m_num_objs(0), m_new_size(new_size), + m_default_object_state(default_object_state) + { + } + + static void resize(ceph::BitVector<2> *object_map, uint64_t num_objs, + uint8_t default_state); + + void send() override; + +protected: + void finish_request() override; + +private: + ceph::BitVector<2> *m_object_map; + uint64_t m_num_objs; + uint64_t m_new_size; + uint8_t m_default_object_state; +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H diff --git a/src/librbd/object_map/SnapshotCreateRequest.cc b/src/librbd/object_map/SnapshotCreateRequest.cc new file mode 100644 index 00000000..2949dec8 --- /dev/null +++ b/src/librbd/object_map/SnapshotCreateRequest.cc @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/SnapshotCreateRequest.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "cls/lock/cls_lock_client.h" 
+#include <iostream> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::SnapshotCreateRequest: " + +namespace librbd { +namespace object_map { + +namespace { + +std::ostream& operator<<(std::ostream& os, + const SnapshotCreateRequest::State& state) { + switch(state) { + case SnapshotCreateRequest::STATE_READ_MAP: + os << "READ_MAP"; + break; + case SnapshotCreateRequest::STATE_WRITE_MAP: + os << "WRITE_MAP"; + break; + case SnapshotCreateRequest::STATE_ADD_SNAPSHOT: + os << "ADD_SNAPSHOT"; + break; + default: + os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")"; + break; + } + return os; +} + +} // anonymous namespace + +void SnapshotCreateRequest::send() { + send_read_map(); +} + +bool SnapshotCreateRequest::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0 && m_ret_val == 0) { + m_ret_val = r; + } + if (m_ret_val < 0) { + // pass errors down to base class to invalidate the object map + return Request::should_complete(r); + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + bool finished = false; + switch (m_state) { + case STATE_READ_MAP: + send_write_map(); + break; + case STATE_WRITE_MAP: + finished = send_add_snapshot(); + break; + case STATE_ADD_SNAPSHOT: + update_object_map(); + finished = true; + break; + default: + ceph_abort(); + break; + } + return finished; +} + +void SnapshotCreateRequest::send_read_map() { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl; + m_state = STATE_READ_MAP; + + // IO is blocked due to the snapshot creation -- consistent to read from disk + librados::ObjectReadOperation op; + op.read(0, 0, NULL, NULL); + + librados::AioCompletion 
*rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, + &m_read_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotCreateRequest::send_write_map() { + CephContext *cct = m_image_ctx.cct; + std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid + << dendl; + m_state = STATE_WRITE_MAP; + + librados::ObjectWriteOperation op; + op.write_full(m_read_bl); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +bool SnapshotCreateRequest::send_add_snapshot() { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) == 0) { + return true; + } + + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl; + m_state = STATE_ADD_SNAPSHOT; + + librados::ObjectWriteOperation op; + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + cls_client::object_map_snap_add(&op); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); + return false; +} + +void SnapshotCreateRequest::update_object_map() { + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock); + + auto it = m_object_map.begin(); + auto end_it = m_object_map.end(); + for (; it != end_it; ++it) { + if (*it == OBJECT_EXISTS) { + *it = OBJECT_EXISTS_CLEAN; + } + } +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/SnapshotCreateRequest.h 
b/src/librbd/object_map/SnapshotCreateRequest.h new file mode 100644 index 00000000..dd15abef --- /dev/null +++ b/src/librbd/object_map/SnapshotCreateRequest.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H + +#include "include/int_types.h" +#include "common/bit_vector.hpp" +#include "librbd/object_map/Request.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +class SnapshotCreateRequest : public Request { +public: + /** + * Snapshot create goes through the following state machine: + * + * @verbatim + * + * <start> + * | + * v + * STATE_READ_MAP + * | + * v (skip) + * STATE_WRITE_MAP . . . . . . . + * | . + * v v + * STATE_ADD_SNAPSHOT ---> <finish> + * + * @endverbatim + * + * The _ADD_SNAPSHOT state is skipped if the FAST_DIFF feature isn't enabled. + */ + enum State { + STATE_READ_MAP, + STATE_WRITE_MAP, + STATE_ADD_SNAPSHOT + }; + + SnapshotCreateRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map, + uint64_t snap_id, Context *on_finish) + : Request(image_ctx, snap_id, on_finish), + m_object_map(*object_map), m_ret_val(0) { + } + + void send() override; + +protected: + bool should_complete(int r) override; + +private: + State m_state = STATE_READ_MAP; + ceph::BitVector<2> &m_object_map; + + bufferlist m_read_bl; + int m_ret_val; + + void send_read_map(); + void send_write_map(); + bool send_add_snapshot(); + + void update_object_map(); + +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H diff --git a/src/librbd/object_map/SnapshotRemoveRequest.cc b/src/librbd/object_map/SnapshotRemoveRequest.cc new file mode 100644 index 00000000..54f67c8d --- /dev/null +++ b/src/librbd/object_map/SnapshotRemoveRequest.cc @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/SnapshotRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/object_map/InvalidateRequest.h" +#include "cls/lock/cls_lock_client.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::SnapshotRemoveRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace object_map { + +void SnapshotRemoveRequest::send() { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.snap_lock.is_wlocked()); + + if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) { + int r = m_image_ctx.get_flags(m_snap_id, &m_flags); + ceph_assert(r == 0); + + compute_next_snap_id(); + load_map(); + } else { + remove_map(); + } +} + +void SnapshotRemoveRequest::load_map() { + CephContext *cct = m_image_ctx.cct; + std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 5) << "snap_oid=" << snap_oid << dendl; + + librados::ObjectReadOperation op; + cls_client::object_map_load_start(&op); + + auto rados_completion = librbd::util::create_rados_callback< + SnapshotRemoveRequest, &SnapshotRemoveRequest::handle_load_map>(this); + int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op, + &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRemoveRequest::handle_load_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::object_map_load_finish(&it, &m_snap_object_map); + } + if (r == -ENOENT) { + // implies we have already deleted this snapshot and handled the + // necessary fast-diff cleanup + complete(0); + return; + } else if (r < 0) { + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + lderr(cct) << 
"failed to load object map " << oid << ": " + << cpp_strerror(r) << dendl; + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + invalidate_next_map(); + return; + } + + remove_snapshot(); +} + +void SnapshotRemoveRequest::remove_snapshot() { + if ((m_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) { + // snapshot object map exists on disk but is invalid. cannot clean fast-diff + // on next snapshot if current snapshot was invalid. + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + invalidate_next_map(); + return; + } + + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_next_snap_id)); + ldout(cct, 5) << "oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + if (m_next_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + } + cls_client::object_map_snap_remove(&op, m_snap_object_map); + + auto rados_completion = librbd::util::create_rados_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_remove_snapshot>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRemoveRequest::handle_remove_snapshot(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + if (r < 0 && r != -ENOENT) { + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, + m_next_snap_id)); + lderr(cct) << "failed to remove object map snapshot " << oid << ": " + << cpp_strerror(r) << dendl; + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + invalidate_next_map(); + return; + } + + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + update_object_map(); + remove_map(); +} + +void SnapshotRemoveRequest::invalidate_next_map() { + 
ceph_assert(m_image_ctx.owner_lock.is_locked()); + ceph_assert(m_image_ctx.snap_lock.is_wlocked()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + auto ctx = librbd::util::create_context_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_invalidate_next_map>(this); + InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, + m_next_snap_id, true, ctx); + req->send(); +} + +void SnapshotRemoveRequest::handle_invalidate_next_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, + m_next_snap_id)); + lderr(cct) << "failed to invalidate object map " << oid << ": " + << cpp_strerror(r) << dendl; + complete(r); + return; + } + + remove_map(); +} + +void SnapshotRemoveRequest::remove_map() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 5) << "oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + op.remove(); + + auto rados_completion = librbd::util::create_rados_callback< + SnapshotRemoveRequest, &SnapshotRemoveRequest::handle_remove_map>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRemoveRequest::handle_remove_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + lderr(cct) << "failed to remove object map " << oid << ": " + << cpp_strerror(r) << dendl; + complete(r); + return; + } + + complete(0); +} + +void SnapshotRemoveRequest::compute_next_snap_id() { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + + m_next_snap_id = CEPH_NOSNAP; + std::map<librados::snap_t, SnapInfo>::const_iterator it = + m_image_ctx.snap_info.find(m_snap_id); + ceph_assert(it != 
m_image_ctx.snap_info.end()); + + ++it; + if (it != m_image_ctx.snap_info.end()) { + m_next_snap_id = it->first; + } +} + +void SnapshotRemoveRequest::update_object_map() { + assert(m_image_ctx.snap_lock.is_locked()); + RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock); + if (m_next_snap_id == m_image_ctx.snap_id && m_next_snap_id == CEPH_NOSNAP) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + auto it = m_object_map.begin(); + auto end_it = m_object_map.end(); + auto snap_it = m_snap_object_map.begin(); + uint64_t i = 0; + for (; it != end_it; ++it) { + if (*it == OBJECT_EXISTS_CLEAN && + (i >= m_snap_object_map.size() || + *snap_it == OBJECT_EXISTS)) { + *it = OBJECT_EXISTS; + } + if (i < m_snap_object_map.size()) { + ++snap_it; + } + ++i; + } + } +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/SnapshotRemoveRequest.h b/src/librbd/object_map/SnapshotRemoveRequest.h new file mode 100644 index 00000000..bd020d1a --- /dev/null +++ b/src/librbd/object_map/SnapshotRemoveRequest.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/bit_vector.hpp" +#include "librbd/AsyncRequest.h" + +namespace librbd { +namespace object_map { + +class SnapshotRemoveRequest : public AsyncRequest<> { +public: + /** + * Snapshot remove goes through the following state machine: + * + * @verbatim + * + * <start> -----------> STATE_LOAD_MAP ----\ + * . * | + * . * (error) | + * . (invalid object map) v | + * . . . > STATE_INVALIDATE_NEXT_MAP | + * . | | + * . | | + * . (fast diff disabled) v v + * . . . . . . . . . . > STATE_REMOVE_MAP + * | + * v + * <finish> + * + * @endverbatim + * + * The _LOAD_MAP state is skipped if the fast diff feature is disabled. 
+ * If the fast diff feature is enabled and the snapshot is flagged as + * invalid, the next snapshot / HEAD object map is flagged as invalid; + * otherwise, the state machine proceeds to remove the object map. + */ + + SnapshotRemoveRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map, + uint64_t snap_id, Context *on_finish) + : AsyncRequest(image_ctx, on_finish), m_object_map(*object_map), + m_snap_id(snap_id), m_next_snap_id(CEPH_NOSNAP) { + } + + void send() override; + +protected: + bool should_complete(int r) override { + return true; + } + +private: + ceph::BitVector<2> &m_object_map; + uint64_t m_snap_id; + uint64_t m_next_snap_id; + + uint64_t m_flags = 0; + + ceph::BitVector<2> m_snap_object_map; + bufferlist m_out_bl; + + void load_map(); + void handle_load_map(int r); + + void remove_snapshot(); + void handle_remove_snapshot(int r); + + void invalidate_next_map(); + void handle_invalidate_next_map(int r); + + void remove_map(); + void handle_remove_map(int r); + + void compute_next_snap_id(); + void update_object_map(); +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H diff --git a/src/librbd/object_map/SnapshotRollbackRequest.cc b/src/librbd/object_map/SnapshotRollbackRequest.cc new file mode 100644 index 00000000..d32123ff --- /dev/null +++ b/src/librbd/object_map/SnapshotRollbackRequest.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/SnapshotRollbackRequest.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/object_map/InvalidateRequest.h" +#include "cls/lock/cls_lock_client.h" +#include <iostream> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::SnapshotRollbackRequest: " + +namespace librbd { +namespace object_map { + +namespace { + +std::ostream& 
operator<<(std::ostream& os, + const SnapshotRollbackRequest::State& state) { + switch(state) { + case SnapshotRollbackRequest::STATE_READ_MAP: + os << "READ_MAP"; + break; + case SnapshotRollbackRequest::STATE_INVALIDATE_MAP: + os << "INVALIDATE_MAP"; + break; + case SnapshotRollbackRequest::STATE_WRITE_MAP: + os << "WRITE_MAP"; + break; + default: + os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")"; + break; + } + return os; +} + +} // anonymous namespace + +void SnapshotRollbackRequest::send() { + send_read_map(); +} + +bool SnapshotRollbackRequest::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0 && m_ret_val == 0) { + m_ret_val = r; + } + + bool finished = false; + switch (m_state) { + case STATE_READ_MAP: + if (r < 0) { + // invalidate the snapshot object map + send_invalidate_map(); + } else { + send_write_map(); + } + break; + case STATE_INVALIDATE_MAP: + // invalidate the HEAD object map as well + finished = Request::should_complete(m_ret_val); + break; + case STATE_WRITE_MAP: + finished = Request::should_complete(r); + break; + default: + ceph_abort(); + break; + } + return finished; +} + +void SnapshotRollbackRequest::send_read_map() { + std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid + << dendl; + m_state = STATE_READ_MAP; + + librados::ObjectReadOperation op; + op.read(0, 0, NULL, NULL); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op, + &m_read_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRollbackRequest::send_write_map() { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + + CephContext *cct = m_image_ctx.cct; + std::string 
snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, + CEPH_NOSNAP)); + ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid + << dendl; + m_state = STATE_WRITE_MAP; + + librados::ObjectWriteOperation op; + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + op.write_full(m_read_bl); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRollbackRequest::send_invalidate_map() { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_INVALIDATE_MAP; + + InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id, + false, + create_callback_context()); + req->send(); +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/SnapshotRollbackRequest.h b/src/librbd/object_map/SnapshotRollbackRequest.h new file mode 100644 index 00000000..e26b1e0a --- /dev/null +++ b/src/librbd/object_map/SnapshotRollbackRequest.h @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H + +#include "include/int_types.h" +#include "librbd/object_map/Request.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +class SnapshotRollbackRequest : public Request { +public: + /** + * Snapshot rollback goes through the following state machine: + * + * @verbatim + * + * <start> + * | + * v (error) + * STATE_READ_MAP * * * * > STATE_INVALIDATE_MAP + * | | + * v v + * STATE_WRITE_MAP -------> <finish> + * + * @endverbatim + * + * If an error occurs within the 
READ_MAP state, the associated snapshot's + * object map will be flagged as invalid. Otherwise, an error from any state + * will result in the HEAD object map being flagged as invalid via the base + * class. + */ + enum State { + STATE_READ_MAP, + STATE_INVALIDATE_MAP, + STATE_WRITE_MAP + }; + + SnapshotRollbackRequest(ImageCtx &image_ctx, uint64_t snap_id, + Context *on_finish) + : Request(image_ctx, CEPH_NOSNAP, on_finish), + m_snap_id(snap_id), m_ret_val(0) { + ceph_assert(snap_id != CEPH_NOSNAP); + } + + void send() override; + +protected: + bool should_complete(int r) override; + +private: + State m_state = STATE_READ_MAP; + uint64_t m_snap_id; + int m_ret_val; + + bufferlist m_read_bl; + + void send_read_map(); + void send_invalidate_map(); + void send_write_map(); + +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H diff --git a/src/librbd/object_map/UnlockRequest.cc b/src/librbd/object_map/UnlockRequest.cc new file mode 100644 index 00000000..0220ec90 --- /dev/null +++ b/src/librbd/object_map/UnlockRequest.cc @@ -0,0 +1,66 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/UnlockRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::UnlockRequest: " + +namespace librbd { +namespace object_map { + +using util::create_rados_callback; + +template <typename I> +UnlockRequest<I>::UnlockRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { +} + +template <typename I> +void UnlockRequest<I>::send() { + send_unlock(); +} + +template <typename I> +void UnlockRequest<I>::send_unlock() { + CephContext *cct = m_image_ctx.cct; + std::string 
oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::unlock(&op, RBD_LOCK_NAME, ""); + + using klass = UnlockRequest<I>; + librados::AioCompletion *rados_completion = + create_rados_callback<klass, &klass::handle_unlock>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +Context *UnlockRequest<I>::handle_unlock(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0 && *ret_val != -ENOENT) { + lderr(m_image_ctx.cct) << "failed to release object map lock: " + << cpp_strerror(*ret_val) << dendl; + + } + + *ret_val = 0; + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::UnlockRequest<librbd::ImageCtx>; diff --git a/src/librbd/object_map/UnlockRequest.h b/src/librbd/object_map/UnlockRequest.h new file mode 100644 index 00000000..ae1d9e93 --- /dev/null +++ b/src/librbd/object_map/UnlockRequest.h @@ -0,0 +1,47 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template <typename ImageCtxT = ImageCtx> +class UnlockRequest { +public: + static UnlockRequest *create(ImageCtxT &image_ctx, Context *on_finish) { + return new UnlockRequest(image_ctx, on_finish); + } + + UnlockRequest(ImageCtxT &image_ctx, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> ----> UNLOCK ----> <finish> + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + void send_unlock(); + Context* 
handle_unlock(int *ret_val); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::UnlockRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H diff --git a/src/librbd/object_map/UpdateRequest.cc b/src/librbd/object_map/UpdateRequest.cc new file mode 100644 index 00000000..4b305334 --- /dev/null +++ b/src/librbd/object_map/UpdateRequest.cc @@ -0,0 +1,129 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/UpdateRequest.h" +#include "include/rbd/object_map_types.h" +#include "include/stringify.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "cls/lock/cls_lock_client.h" +#include <string> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace object_map { + +namespace { + +// keep aligned to bit_vector 4K block sizes +const uint64_t MAX_OBJECTS_PER_UPDATE = 256 * (1 << 10); + +} + +template <typename I> +void UpdateRequest<I>::send() { + update_object_map(); +} + +template <typename I> +void UpdateRequest<I>::update_object_map() { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert(m_image_ctx.object_map_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + + // break very large requests into manageable batches + m_update_end_object_no = std::min( + m_end_object_no, m_update_start_object_no + MAX_OBJECTS_PER_UPDATE); + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", oid=" << oid << ", " + << "[" << m_update_start_object_no << "," + << m_update_end_object_no << ") = " + << (m_current_state ? 
+ stringify(static_cast<uint32_t>(*m_current_state)) : "") + << "->" << static_cast<uint32_t>(m_new_state) + << dendl; + + librados::ObjectWriteOperation op; + if (m_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + } + cls_client::object_map_update(&op, m_update_start_object_no, + m_update_end_object_no, m_new_state, + m_current_state); + + auto rados_completion = librbd::util::create_rados_callback< + UpdateRequest<I>, &UpdateRequest<I>::handle_update_object_map>(this); + std::vector<librados::snap_t> snaps; + int r = m_image_ctx.md_ctx.aio_operate( + oid, rados_completion, &op, 0, snaps, + (m_trace.valid() ? m_trace.get_info() : nullptr)); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void UpdateRequest<I>::handle_update_object_map(int r) { + ldout(m_image_ctx.cct, 20) << "r=" << r << dendl; + + if (r == -ENOENT && m_ignore_enoent) { + r = 0; + } + if (r < 0 && m_ret_val == 0) { + m_ret_val = r; + } + + { + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock); + update_in_memory_object_map(); + + if (m_update_end_object_no < m_end_object_no) { + m_update_start_object_no = m_update_end_object_no; + update_object_map(); + return; + } + } + + // no more batch updates to send + complete(m_ret_val); +} + +template <typename I> +void UpdateRequest<I>::update_in_memory_object_map() { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + ceph_assert(m_image_ctx.object_map_lock.is_locked()); + + // rebuilding the object map might update on-disk only + if (m_snap_id == m_image_ctx.snap_id) { + ldout(m_image_ctx.cct, 20) << dendl; + + auto it = m_object_map.begin() + + std::min(m_update_start_object_no, m_object_map.size()); + auto end_it = m_object_map.begin() + + std::min(m_update_end_object_no, m_object_map.size()); + for (; it != end_it; ++it) { + auto state_ref = *it; + uint8_t state = state_ref; + if 
(!m_current_state || state == *m_current_state || + (*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) { + state_ref = m_new_state; + } + } + } +} + +template <typename I> +void UpdateRequest<I>::finish_request() { +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::UpdateRequest<librbd::ImageCtx>; diff --git a/src/librbd/object_map/UpdateRequest.h b/src/librbd/object_map/UpdateRequest.h new file mode 100644 index 00000000..f5dcce15 --- /dev/null +++ b/src/librbd/object_map/UpdateRequest.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/object_map/Request.h" +#include "common/bit_vector.hpp" +#include "common/zipkin_trace.h" +#include "librbd/Utils.h" +#include <boost/optional.hpp> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template <typename ImageCtxT = librbd::ImageCtx> +class UpdateRequest : public Request { +public: + static UpdateRequest *create(ImageCtx &image_ctx, + ceph::BitVector<2> *object_map, + uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional<uint8_t> &current_state, + const ZTracer::Trace &parent_trace, + bool ignore_enoent, Context *on_finish) { + return new UpdateRequest(image_ctx, object_map, snap_id, start_object_no, + end_object_no, new_state, current_state, + parent_trace, ignore_enoent, on_finish); + } + + UpdateRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map, + uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional<uint8_t> &current_state, + const ZTracer::Trace &parent_trace, bool ignore_enoent, + Context *on_finish) + : Request(image_ctx, snap_id, on_finish), m_object_map(*object_map), + 
m_start_object_no(start_object_no), m_end_object_no(end_object_no), + m_update_start_object_no(start_object_no), m_new_state(new_state), + m_current_state(current_state), + m_trace(util::create_trace(image_ctx, "update object map", parent_trace)), + m_ignore_enoent(ignore_enoent) + { + m_trace.event("start"); + } + virtual ~UpdateRequest() { + m_trace.event("finish"); + } + + void send() override; + +protected: + void finish_request() override; + +private: + /** + * @verbatim + * + * <start> + * | + * |/------------------\ + * v | (repeat in batches) + * UPDATE_OBJECT_MAP -----/ + * | + * v + * <finish> + * + * @endverbatim + */ + + ceph::BitVector<2> &m_object_map; + uint64_t m_start_object_no; + uint64_t m_end_object_no; + uint64_t m_update_start_object_no; + uint64_t m_update_end_object_no = 0; + uint8_t m_new_state; + boost::optional<uint8_t> m_current_state; + ZTracer::Trace m_trace; + bool m_ignore_enoent; + + int m_ret_val = 0; + + void update_object_map(); + void handle_update_object_map(int r); + + void update_in_memory_object_map(); + +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::UpdateRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H diff --git a/src/librbd/operation/DisableFeaturesRequest.cc b/src/librbd/operation/DisableFeaturesRequest.cc new file mode 100644 index 00000000..27dfa458 --- /dev/null +++ b/src/librbd/operation/DisableFeaturesRequest.cc @@ -0,0 +1,646 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/DisableFeaturesRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/image/SetFlagsRequest.h" +#include "librbd/io/ImageRequestWQ.h" +#include 
"librbd/journal/RemoveRequest.h" +#include "librbd/mirror/DisableRequest.h" +#include "librbd/object_map/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::DisableFeaturesRequest: " + +namespace librbd { +namespace operation { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +DisableFeaturesRequest<I>::DisableFeaturesRequest(I &image_ctx, + Context *on_finish, + uint64_t journal_op_tid, + uint64_t features, + bool force) + : Request<I>(image_ctx, on_finish, journal_op_tid), m_features(features), + m_force(force) { +} + +template <typename I> +void DisableFeaturesRequest<I>::send_op() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ceph_assert(image_ctx.owner_lock.is_locked()); + + ldout(cct, 20) << this << " " << __func__ << ": features=" << m_features + << dendl; + + send_prepare_lock(); +} + +template <typename I> +bool DisableFeaturesRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_prepare_lock() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + image_ctx.state->prepare_lock(create_async_context_callback( + image_ctx, create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_prepare_lock>(this))); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_prepare_lock(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + 
lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_block_writes(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + RWLock::WLocker locker(image_ctx.owner_lock); + image_ctx.io_work_queue->block_writes(create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_block_writes>(this)); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + m_writes_blocked = true; + + { + RWLock::WLocker locker(image_ctx.owner_lock); + // avoid accepting new requests from peers while we manipulate + // the image features + if (image_ctx.exclusive_lock != nullptr && + (image_ctx.journal == nullptr || + !image_ctx.journal->is_journal_replaying())) { + image_ctx.exclusive_lock->block_requests(0); + m_requests_blocked = true; + } + } + + send_acquire_exclusive_lock(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_acquire_exclusive_lock() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_acquire_exclusive_lock>(this); + + { + RWLock::WLocker locker(image_ctx.owner_lock); + // if disabling features w/ exclusive lock supported, we need to + // acquire the lock to temporarily block IO against the image + if (image_ctx.exclusive_lock != nullptr && + 
!image_ctx.exclusive_lock->is_lock_owner()) { + m_acquired_lock = true; + + image_ctx.exclusive_lock->acquire_lock(ctx); + return; + } + } + + ctx->complete(0); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_acquire_exclusive_lock(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + image_ctx.owner_lock.get_read(); + if (*result < 0) { + lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl; + image_ctx.owner_lock.put_read(); + return handle_finish(*result); + } else if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + lderr(cct) << "failed to acquire exclusive lock" << dendl; + *result = image_ctx.exclusive_lock->get_unlocked_op_error(); + image_ctx.owner_lock.put_read(); + return handle_finish(*result); + } + + do { + m_features &= image_ctx.features; + + // interlock object-map and fast-diff together + if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) || + ((m_features & RBD_FEATURE_FAST_DIFF) != 0)) { + m_features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF); + } + + m_new_features = image_ctx.features & ~m_features; + m_features_mask = m_features; + + if ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) { + if ((m_new_features & RBD_FEATURE_OBJECT_MAP) != 0 || + (m_new_features & RBD_FEATURE_JOURNALING) != 0) { + lderr(cct) << "cannot disable exclusive-lock. object-map " + "or journaling must be disabled before " + "disabling exclusive-lock." 
<< dendl; + *result = -EINVAL; + break; + } + m_features_mask |= (RBD_FEATURE_OBJECT_MAP | + RBD_FEATURE_FAST_DIFF | + RBD_FEATURE_JOURNALING); + } + if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) { + m_disable_flags |= RBD_FLAG_FAST_DIFF_INVALID; + } + if ((m_features & RBD_FEATURE_OBJECT_MAP) != 0) { + m_disable_flags |= RBD_FLAG_OBJECT_MAP_INVALID; + } + } while (false); + image_ctx.owner_lock.put_read(); + + if (*result < 0) { + return handle_finish(*result); + } + + send_get_mirror_mode(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_get_mirror_mode() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if ((m_features & RBD_FEATURE_JOURNALING) == 0) { + send_append_op_event(); + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_mode_get_start(&op); + + using klass = DisableFeaturesRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_get_mirror_mode>(this); + m_out_bl.clear(); + int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_get_mirror_mode(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode); + } + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to retrieve pool mirror mode: " + << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + + ldout(cct, 20) << this << " " << __func__ << ": m_mirror_mode=" + << m_mirror_mode << dendl; + + send_get_mirror_image(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_get_mirror_image() { + I &image_ctx = 
this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) { + send_disable_mirror_image(); + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_get_start(&op, image_ctx.id); + + using klass = DisableFeaturesRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_get_mirror_image>(this); + m_out_bl.clear(); + int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_get_mirror_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + cls::rbd::MirrorImage mirror_image; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::mirror_image_get_finish(&it, &mirror_image); + } + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to retrieve pool mirror image: " + << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + + if ((mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) && !m_force) { + lderr(cct) << "cannot disable journaling: image mirroring " + << "enabled and mirror pool mode set to image" + << dendl; + *result = -EINVAL; + return handle_finish(*result); + } + + send_disable_mirror_image(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_disable_mirror_image() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_disable_mirror_image>(this); + + mirror::DisableRequest<I> *req = + mirror::DisableRequest<I>::create(&image_ctx, m_force, true, ctx); + req->send(); +} + +template <typename I> 
+Context *DisableFeaturesRequest<I>::handle_disable_mirror_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to disable image mirroring: " << cpp_strerror(*result) + << dendl; + // not fatal + } + + send_close_journal(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_close_journal() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + { + RWLock::WLocker locker(image_ctx.owner_lock); + if (image_ctx.journal != nullptr) { + ldout(cct, 20) << this << " " << __func__ << dendl; + + std::swap(m_journal, image_ctx.journal); + Context *ctx = create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_close_journal>(this); + + m_journal->close(ctx); + return; + } + } + + send_remove_journal(); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_close_journal(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to close image journal: " << cpp_strerror(*result) + << dendl; + } + + ceph_assert(m_journal != nullptr); + delete m_journal; + m_journal = nullptr; + + send_remove_journal(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_remove_journal() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_remove_journal>(this); + + journal::RemoveRequest<I> *req = journal::RemoveRequest<I>::create( + image_ctx.md_ctx, image_ctx.id, librbd::Journal<>::IMAGE_CLIENT_ID, + image_ctx.op_work_queue, ctx); + + req->send(); +} + +template 
<typename I> +Context *DisableFeaturesRequest<I>::handle_remove_journal(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to remove image journal: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_append_op_event(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_append_op_event() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (!this->template append_op_event< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_append_op_event>(this)) { + send_remove_object_map(); + } + + ldout(cct, 20) << this << " " << __func__ << dendl; +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_append_op_event(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_remove_object_map(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_remove_object_map() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) { + send_set_features(); + return; + } + + Context *ctx = create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_remove_object_map>(this); + + object_map::RemoveRequest<I> *req = + object_map::RemoveRequest<I>::create(&image_ctx, ctx); + req->send(); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_remove_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) 
<< this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to remove object map: " << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + + send_set_features(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_set_features() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": new_features=" + << m_new_features << ", features_mask=" << m_features_mask + << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::set_features(&op, m_new_features, m_features_mask); + + using klass = DisableFeaturesRequest<I>; + librados::AioCompletion *comp = + create_rados_callback<klass, &klass::handle_set_features>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_set_features(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -EINVAL && (m_features_mask & RBD_FEATURE_JOURNALING) != 0) { + // NOTE: infernalis OSDs will not accept a mask with new features, so + // re-attempt with a reduced mask. 
+ ldout(cct, 5) << this << " " << __func__ + << ": re-attempt with a reduced mask" << dendl; + m_features_mask &= ~RBD_FEATURE_JOURNALING; + send_set_features(); + } + + if (*result < 0) { + lderr(cct) << "failed to update features: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_update_flags(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_update_flags() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_disable_flags == 0) { + send_notify_update(); + return; + } + + ldout(cct, 20) << this << " " << __func__ << ": disable_flags=" + << m_disable_flags << dendl; + + Context *ctx = create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_update_flags>(this); + + image::SetFlagsRequest<I> *req = + image::SetFlagsRequest<I>::create(&image_ctx, 0, m_disable_flags, ctx); + req->send(); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_update_flags(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to update image flags: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_notify_update(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_notify_update() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_notify_update>(this); + + image_ctx.notify_update(ctx); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_notify_update(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + 
if (image_ctx.exclusive_lock == nullptr || !m_acquired_lock) { + return handle_finish(*result); + } + + send_release_exclusive_lock(); + return nullptr; +} + +template <typename I> +void DisableFeaturesRequest<I>::send_release_exclusive_lock() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + DisableFeaturesRequest<I>, + &DisableFeaturesRequest<I>::handle_release_exclusive_lock>(this); + + image_ctx.exclusive_lock->release_lock(ctx); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_release_exclusive_lock(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + return handle_finish(*result); +} + +template <typename I> +Context *DisableFeaturesRequest<I>::handle_finish(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + { + RWLock::WLocker locker(image_ctx.owner_lock); + if (image_ctx.exclusive_lock != nullptr && m_requests_blocked) { + image_ctx.exclusive_lock->unblock_requests(); + } + + image_ctx.io_work_queue->unblock_writes(); + } + image_ctx.state->handle_prepare_lock_complete(); + + return this->create_context_finisher(r); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::DisableFeaturesRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/DisableFeaturesRequest.h b/src/librbd/operation/DisableFeaturesRequest.h new file mode 100644 index 00000000..b064bc45 --- /dev/null +++ b/src/librbd/operation/DisableFeaturesRequest.h @@ -0,0 +1,171 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H +#define CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H + 
#include "librbd/ImageCtx.h"
#include "librbd/operation/Request.h"
#include "cls/rbd/cls_rbd_client.h"

class Context;

namespace librbd {

class ImageCtx;

namespace operation {

// Asynchronous request that disables a set of image features. The work is
// performed as a chain of send_*/handle_* steps (see the state diagram below).
template <typename ImageCtxT = ImageCtx>
class DisableFeaturesRequest : public Request<ImageCtxT> {
public:
  // Heap-allocates a request; arguments are forwarded to the constructor.
  static DisableFeaturesRequest *create(ImageCtxT &image_ctx, Context *on_finish,
                                        uint64_t journal_op_tid,
                                        uint64_t features, bool force) {
    return new DisableFeaturesRequest(image_ctx, on_finish, journal_op_tid,
                                      features, force);
  }

  DisableFeaturesRequest(ImageCtxT &image_ctx, Context *on_finish,
                         uint64_t journal_op_tid, uint64_t features, bool force);

protected:
  void send_op() override;
  bool should_complete(int r) override;
  bool can_affect_io() const override {
    return true;
  }
  // Journal event describing this operation; the trailing "false" is the
  // counterpart of the "true" passed by EnableFeaturesRequest.
  journal::Event create_event(uint64_t op_tid) const override {
    return journal::UpdateFeaturesEvent(op_tid, m_features, false);
  }

private:
  /**
   * DisableFeatures goes through the following state machine:
   *
   * @verbatim
   *
   * <start>
   *    |
   *    v
   * STATE_PREPARE_LOCK
   *    |
   *    v
   * STATE_BLOCK_WRITES
   *    |
   *    v
   * STATE_ACQUIRE_EXCLUSIVE_LOCK (skip if not
   *    |                          required)
   *    |  (disabling journaling)
   *    \-------------------\
   *    |                   |
   *    |                   V
   *    |               STATE_GET_MIRROR_MODE
   *    |(not               |
   *    | disabling         v
   *    | journaling)   STATE_GET_MIRROR_IMAGE
   *    |                   |
   *    |                   v
   *    |               STATE_DISABLE_MIRROR_IMAGE (skip if not
   *    |                   |                       required)
   *    |                   v
   *    |               STATE_CLOSE_JOURNAL
   *    |                   |
   *    |                   v
   *    |               STATE_REMOVE_JOURNAL
   *    |                   |
   *    |/-------------------/
   *    |
   *    v
   * STATE_APPEND_OP_EVENT (skip if journaling
   *    |                   disabled)
   *    v
   * STATE_REMOVE_OBJECT_MAP (skip if not
   *    |                     disabling object map)
   *    v
   * STATE_SET_FEATURES
   *    |
   *    v
   * STATE_UPDATE_FLAGS
   *    |
   *    v
   * STATE_NOTIFY_UPDATE
   *    |
   *    v
   * STATE_RELEASE_EXCLUSIVE_LOCK (skip if not
   *    |                          required)
   *    | (unblock writes)
   *    v
   * <finish>
   *
   * @endverbatim
   *
   */

  // feature bits requested to be disabled
  uint64_t m_features;
  // forwarded to mirror::DisableRequest; also overrides the "mirroring still
  // enabled" check
  bool m_force;

  // progress markers used to decide what has to be undone on completion
  bool m_acquired_lock = false;
  bool m_writes_blocked = false;
  // NOTE(review): not referenced by DisableFeaturesRequest.cc -- possibly unused
  bool m_snap_lock_acquired = false;
  bool m_requests_blocked = false;

  // values handed to cls_client::set_features / image::SetFlagsRequest
  uint64_t m_new_features = 0;
  uint64_t m_disable_flags = 0;
  uint64_t m_features_mask = 0;

  // journal taken over from the image context while it is being closed
  decltype(ImageCtxT::journal) m_journal = nullptr;
  cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
  // scratch buffer for the asynchronous RADOS read replies
  bufferlist m_out_bl;

  void send_prepare_lock();
  Context *handle_prepare_lock(int *result);

  void send_block_writes();
  Context *handle_block_writes(int *result);

  void send_acquire_exclusive_lock();
  Context *handle_acquire_exclusive_lock(int *result);

  void send_get_mirror_mode();
  Context *handle_get_mirror_mode(int *result);

  void send_get_mirror_image();
  Context *handle_get_mirror_image(int *result);

  void send_disable_mirror_image();
  Context *handle_disable_mirror_image(int *result);

  void send_close_journal();
  Context *handle_close_journal(int *result);

  void send_remove_journal();
  Context *handle_remove_journal(int *result);

  void send_append_op_event();
  Context *handle_append_op_event(int *result);

  void send_remove_object_map();
  Context *handle_remove_object_map(int *result);

  void send_set_features();
  Context *handle_set_features(int *result);

  void send_update_flags();
  Context *handle_update_flags(int *result);

  void send_notify_update();
  Context *handle_notify_update(int *result);

  void send_release_exclusive_lock();
  Context *handle_release_exclusive_lock(int *result);

  Context *handle_finish(int r);
};

} // namespace operation
} // namespace librbd

extern template class librbd::operation::DisableFeaturesRequest<librbd::ImageCtx>;

#endif // CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H

// diff --git a/src/librbd/operation/EnableFeaturesRequest.cc b/src/librbd/operation/EnableFeaturesRequest.cc
// new file mode 100644
// index 00000000..e2c1113d
// --- /dev/null
// +++ b/src/librbd/operation/EnableFeaturesRequest.cc
// @@
// -0,0 +1,489 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/operation/EnableFeaturesRequest.h"
#include "common/dout.h"
#include "common/errno.h"
#include "librbd/ExclusiveLock.h"
#include "librbd/ImageCtx.h"
#include "librbd/ImageState.h"
#include "librbd/Journal.h"
#include "librbd/Utils.h"
#include "librbd/image/SetFlagsRequest.h"
#include "librbd/io/ImageRequestWQ.h"
#include "librbd/journal/CreateRequest.h"
#include "librbd/mirror/EnableRequest.h"
#include "librbd/object_map/CreateRequest.h"

#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
#define dout_prefix *_dout << "librbd::EnableFeaturesRequest: "

namespace librbd {
namespace operation {

using util::create_async_context_callback;
using util::create_context_callback;
using util::create_rados_callback;

// Stores the feature bits to enable; all remaining state is computed once the
// state machine runs.
template <typename I>
EnableFeaturesRequest<I>::EnableFeaturesRequest(I &image_ctx,
                                                Context *on_finish,
                                                uint64_t journal_op_tid,
                                                uint64_t features)
  : Request<I>(image_ctx, on_finish, journal_op_tid), m_features(features) {
}

// Entry point of the state machine; the caller must hold owner_lock.
template <typename I>
void EnableFeaturesRequest<I>::send_op() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ceph_assert(image_ctx.owner_lock.is_locked());

  ldout(cct, 20) << this << " " << __func__ << ": features=" << m_features
                 << dendl;
  send_prepare_lock();
}

// Logs any error and always reports the request as complete.
template <typename I>
bool EnableFeaturesRequest<I>::should_complete(int r) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;

  if (r < 0) {
    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
  }
  return true;
}

// Asks the image state machine to prepare the lock; the completion is wrapped
// in an async context callback.
template <typename I>
void EnableFeaturesRequest<I>::send_prepare_lock() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << dendl;

  image_ctx.state->prepare_lock(create_async_context_callback(
    image_ctx, create_context_callback<
    EnableFeaturesRequest<I>,
    &EnableFeaturesRequest<I>::handle_prepare_lock>(this)));
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_prepare_lock(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  if (*result < 0) {
    lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl;
    // nothing has been blocked yet, so bypass handle_finish()
    return this->create_context_finisher(*result);
  }

  send_block_writes();
  return nullptr;
}

// Blocks writes on the IO work queue while holding the owner lock exclusively.
template <typename I>
void EnableFeaturesRequest<I>::send_block_writes() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << dendl;

  RWLock::WLocker locker(image_ctx.owner_lock);
  image_ctx.io_work_queue->block_writes(create_context_callback<
    EnableFeaturesRequest<I>,
    &EnableFeaturesRequest<I>::handle_block_writes>(this));
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_block_writes(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  if (*result < 0) {
    lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
    return handle_finish(*result);
  }
  // remembered so handle_finish() knows to unblock again
  m_writes_blocked = true;

  send_get_mirror_mode();
  return nullptr;
}

// Fetches the pool mirror mode when journaling is being enabled. Otherwise the
// handler is completed inline with -ENOENT, which it maps to "mirroring
// disabled".
template <typename I>
void EnableFeaturesRequest<I>::send_get_mirror_mode() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;

  if ((m_features & RBD_FEATURE_JOURNALING) == 0) {
    Context *ctx = create_context_callback<
      EnableFeaturesRequest<I>,
      &EnableFeaturesRequest<I>::handle_get_mirror_mode>(this);
    ctx->complete(-ENOENT);
    return;
  }

  ldout(cct, 20) << this << " " << __func__ << dendl;

  librados::ObjectReadOperation op;
  cls_client::mirror_mode_get_start(&op);

  using klass = EnableFeaturesRequest<I>;
  librados::AioCompletion *comp =
    create_rados_callback<klass, &klass::handle_get_mirror_mode>(this);
  m_out_bl.clear();
  int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
  ceph_assert(r == 0);
  comp->release();
}

// Decodes the mirror mode, blocks peer requests and derives m_features,
// m_new_features, m_features_mask and m_enable_flags from the image's current
// features, validating the feature dependencies on the way.
template <typename I>
Context *EnableFeaturesRequest<I>::handle_get_mirror_mode(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
  if (*result == 0) {
    auto it = m_out_bl.cbegin();
    *result = cls_client::mirror_mode_get_finish(&it, &mirror_mode);
  } else if (*result == -ENOENT) {
    // -ENOENT leaves mirror_mode at MIRROR_MODE_DISABLED
    *result = 0;
  }

  if (*result < 0) {
    lderr(cct) << "failed to retrieve pool mirror mode: "
               << cpp_strerror(*result) << dendl;
    return handle_finish(*result);
  }

  // mirroring is enabled for the image only when the pool is in "pool" mode
  m_enable_mirroring = (mirror_mode == cls::rbd::MIRROR_MODE_POOL);

  bool create_journal = false;
  // single-pass loop: "break" leaves the validation early with *result set;
  // the owner lock is held for the whole loop body
  do {
    RWLock::WLocker locker(image_ctx.owner_lock);

    // avoid accepting new requests from peers while we manipulate
    // the image features
    if (image_ctx.exclusive_lock != nullptr &&
        (image_ctx.journal == nullptr ||
         !image_ctx.journal->is_journal_replaying())) {
      image_ctx.exclusive_lock->block_requests(0);
      m_requests_blocked = true;
    }

    // only features that are not yet enabled need to be enabled
    m_features &= ~image_ctx.features;

    // interlock object-map and fast-diff together
    if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) ||
        ((m_features & RBD_FEATURE_FAST_DIFF) != 0)) {
      m_features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF);
    }

    m_new_features = image_ctx.features | m_features;
    m_features_mask = m_features;

    if ((m_features & RBD_FEATURE_OBJECT_MAP) != 0) {
      if ((m_new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
        lderr(cct) << "cannot enable object-map. exclusive-lock must be "
                      "enabled before enabling object-map." << dendl;
        *result = -EINVAL;
        break;
      }
      m_enable_flags |= RBD_FLAG_OBJECT_MAP_INVALID;
      m_features_mask |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_FAST_DIFF);
    }
    if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) {
      m_enable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
      m_features_mask |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_OBJECT_MAP);
    }

    if ((m_features & RBD_FEATURE_JOURNALING) != 0) {
      if ((m_new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
        lderr(cct) << "cannot enable journaling. exclusive-lock must be "
                      "enabled before enabling journaling." << dendl;
        *result = -EINVAL;
        break;
      }
      m_features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
      create_journal = true;
    }
  } while (false);

  if (*result < 0) {
    return handle_finish(*result);
  }
  if (create_journal) {
    send_create_journal();
    return nullptr;
  }
  send_append_op_event();
  return nullptr;
}

// Creates the image journal via journal::CreateRequest, using the
// rbd_journal_* configuration values of the image.
template <typename I>
void EnableFeaturesRequest<I>::send_create_journal() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;

  ldout(cct, 20) << this << " " << __func__ << dendl;

  journal::TagData tag_data(librbd::Journal<>::LOCAL_MIRROR_UUID);
  Context *ctx = create_context_callback<
    EnableFeaturesRequest<I>,
    &EnableFeaturesRequest<I>::handle_create_journal>(this);

  journal::CreateRequest<I> *req = journal::CreateRequest<I>::create(
    image_ctx.md_ctx, image_ctx.id,
    image_ctx.config.template get_val<uint64_t>("rbd_journal_order"),
    image_ctx.config.template get_val<uint64_t>("rbd_journal_splay_width"),
    image_ctx.config.template get_val<std::string>("rbd_journal_pool"),
    cls::journal::Tag::TAG_CLASS_NEW, tag_data,
    librbd::Journal<>::IMAGE_CLIENT_ID, image_ctx.op_work_queue, ctx);

  req->send();
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_create_journal(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  if (*result < 0) {
    lderr(cct) << "failed to create journal: " << cpp_strerror(*result)
               << dendl;
    return handle_finish(*result);
  }

  send_append_op_event();
  return nullptr;
}

// When append_op_event() returns false the state machine continues directly
// with the flag update.
// NOTE(review): the trace line is emitted after the next state has already
// been dispatched, so it can appear out of order in the log -- confirm whether
// that is intentional.
template <typename I>
void EnableFeaturesRequest<I>::send_append_op_event() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;

  if (!this->template append_op_event<
      EnableFeaturesRequest<I>,
      &EnableFeaturesRequest<I>::handle_append_op_event>(this)) {
    send_update_flags();
  }

  ldout(cct, 20) << this << " " << __func__ << dendl;
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_append_op_event(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  if (*result < 0) {
    lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result)
               << dendl;
    return handle_finish(*result);
  }

  send_update_flags();
  return nullptr;
}

// Sets the image flags collected in m_enable_flags via image::SetFlagsRequest
// (flags value and mask are both m_enable_flags); skipped when there are none.
template <typename I>
void EnableFeaturesRequest<I>::send_update_flags() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;

  if (m_enable_flags == 0) {
    send_set_features();
    return;
  }

  ldout(cct, 20) << this << " " << __func__ << ": enable_flags="
                 << m_enable_flags << dendl;

  Context *ctx = create_context_callback<
    EnableFeaturesRequest<I>,
    &EnableFeaturesRequest<I>::handle_update_flags>(this);

  image::SetFlagsRequest<I> *req =
    image::SetFlagsRequest<I>::create(&image_ctx, m_enable_flags,
                                      m_enable_flags, ctx);
  req->send();
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_update_flags(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  if (*result < 0) {
    lderr(cct) << "failed to update image flags: " << cpp_strerror(*result)
               << dendl;
    return handle_finish(*result);
  }

  send_set_features();
  return nullptr;
}

// Writes the new feature bits and mask to the image header object.
template <typename I>
void EnableFeaturesRequest<I>::send_set_features() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": new_features="
                 << m_new_features << ", features_mask=" << m_features_mask
                 << dendl;

  librados::ObjectWriteOperation op;
  librbd::cls_client::set_features(&op, m_new_features, m_features_mask);

  using klass = EnableFeaturesRequest<I>;
  librados::AioCompletion *comp =
    create_rados_callback<klass, &klass::handle_set_features>(this);
  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
  ceph_assert(r == 0);
  comp->release();
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_set_features(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  if (*result < 0) {
    lderr(cct) << "failed to update features: " << cpp_strerror(*result)
               << dendl;
    return handle_finish(*result);
  }

  send_create_object_map();
  return nullptr;
}

// Creates the object map via object_map::CreateRequest; skipped when the
// image context already reports the object-map feature or it is not among the
// features being enabled.
template <typename I>
void EnableFeaturesRequest<I>::send_create_object_map() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;

  if (((image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0) ||
      ((m_features & RBD_FEATURE_OBJECT_MAP) == 0)) {
    send_enable_mirror_image();
    return;
  }

  ldout(cct, 20) << this << " " << __func__ << dendl;

  Context *ctx = create_context_callback<
    EnableFeaturesRequest<I>,
    &EnableFeaturesRequest<I>::handle_create_object_map>(this);

  object_map::CreateRequest<I> *req =
    object_map::CreateRequest<I>::create(&image_ctx, ctx);
  req->send();
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_create_object_map(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  if (*result < 0) {
    lderr(cct) << "failed to create object map: " << cpp_strerror(*result)
               << dendl;
    return handle_finish(*result);
  }

  send_enable_mirror_image();
  return nullptr;
}

// Enables mirroring for the image via mirror::EnableRequest; skipped unless
// m_enable_mirroring was set by handle_get_mirror_mode().
template <typename I>
void EnableFeaturesRequest<I>::send_enable_mirror_image() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << dendl;

  if (!m_enable_mirroring) {
    send_notify_update();
    return;
  }

  Context *ctx = create_context_callback<
    EnableFeaturesRequest<I>,
    &EnableFeaturesRequest<I>::handle_enable_mirror_image>(this);

  mirror::EnableRequest<I> *req =
    mirror::EnableRequest<I>::create(&image_ctx, ctx);
  req->send();
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_enable_mirror_image(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  if (*result < 0) {
    lderr(cct) << "failed to enable mirroring: " << cpp_strerror(*result)
               << dendl;
    // not fatal
  }

  send_notify_update();
  return nullptr;
}

// Announces the header change via image_ctx.notify_update().
template <typename I>
void EnableFeaturesRequest<I>::send_notify_update() {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << dendl;

  Context *ctx = create_context_callback<
    EnableFeaturesRequest<I>,
    &EnableFeaturesRequest<I>::handle_notify_update>(this);

  image_ctx.notify_update(ctx);
}

template <typename I>
Context *EnableFeaturesRequest<I>::handle_notify_update(int *result) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;

  return handle_finish(*result);
}

// Common exit path: unblocks peer requests and writes (each only if this
// request blocked them), completes the prepare-lock phase and returns the
// finisher context carrying r.
// NOTE(review): DisableFeaturesRequest::handle_finish() calls unblock_writes()
// unconditionally, while this one skips it when block_writes reported an
// error -- confirm against ImageRequestWQ which of the two is intended.
template <typename I>
Context *EnableFeaturesRequest<I>::handle_finish(int r) {
  I &image_ctx = this->m_image_ctx;
  CephContext *cct = image_ctx.cct;
  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;

  {
    RWLock::WLocker locker(image_ctx.owner_lock);

    if (image_ctx.exclusive_lock != nullptr && m_requests_blocked) {
      image_ctx.exclusive_lock->unblock_requests();
    }
    if (m_writes_blocked) {
      image_ctx.io_work_queue->unblock_writes();
    }
  }
  image_ctx.state->handle_prepare_lock_complete();

  return this->create_context_finisher(r);
}

} // namespace operation
} // namespace librbd

template class librbd::operation::EnableFeaturesRequest<librbd::ImageCtx>;

// diff --git a/src/librbd/operation/EnableFeaturesRequest.h b/src/librbd/operation/EnableFeaturesRequest.h
// new file mode 100644
// index 00000000..1c91b4dc
// --- /dev/null
// +++ b/src/librbd/operation/EnableFeaturesRequest.h
// @@ -0,0 +1,135 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H
#define CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H

#include "librbd/operation/Request.h"

class Context;

namespace librbd {

class ImageCtx;

namespace operation {

template <typename ImageCtxT = ImageCtx>
class EnableFeaturesRequest : public Request<ImageCtxT> {
public:
  static EnableFeaturesRequest *create(ImageCtxT &image_ctx, Context *on_finish,
                                       uint64_t journal_op_tid,
                                       uint64_t features) {
    return new EnableFeaturesRequest(image_ctx, on_finish, journal_op_tid,
                                     features);
  }

  EnableFeaturesRequest(ImageCtxT &image_ctx, Context *on_finish,
                        uint64_t journal_op_tid, uint64_t features);

protected:
  void send_op() override;
  bool should_complete(int r) override;
  bool can_affect_io() const override {
    return true;
  }
  journal::Event create_event(uint64_t op_tid) const override {
    return journal::UpdateFeaturesEvent(op_tid, m_features, true);
  }

private:
  /**
   * EnableFeatures goes through the following state machine:
   *
   * @verbatim
   *
   * <start>
   *    |
   *    v
   *
STATE_PREPARE_LOCK + * | + * v + * STATE_BLOCK_WRITES + * | + * v + * STATE_GET_MIRROR_MODE + * | + * v + * STATE_CREATE_JOURNAL (skip if not + * | required) + * v + * STATE_APPEND_OP_EVENT (skip if journaling + * | disabled) + * v + * STATE_UPDATE_FLAGS + * | + * v + * STATE_SET_FEATURES + * | + * v + * STATE_CREATE_OBJECT_MAP (skip if not + * | required) + * v + * STATE_ENABLE_MIRROR_IMAGE + * | + * V + * STATE_NOTIFY_UPDATE + * | + * | (unblock writes) + * v + * <finish> + * @endverbatim + * + */ + + uint64_t m_features; + + bool m_enable_mirroring = false; + bool m_requests_blocked = false; + bool m_writes_blocked = false; + + uint64_t m_new_features = 0; + uint64_t m_enable_flags = 0; + uint64_t m_features_mask = 0; + + bufferlist m_out_bl; + + void send_prepare_lock(); + Context *handle_prepare_lock(int *result); + + void send_block_writes(); + Context *handle_block_writes(int *result); + + void send_get_mirror_mode(); + Context *handle_get_mirror_mode(int *result); + + void send_create_journal(); + Context *handle_create_journal(int *result); + + void send_append_op_event(); + Context *handle_append_op_event(int *result); + + void send_update_flags(); + Context *handle_update_flags(int *result); + + void send_set_features(); + Context *handle_set_features(int *result); + + void send_create_object_map(); + Context *handle_create_object_map(int *result); + + void send_enable_mirror_image(); + Context *handle_enable_mirror_image(int *result); + + void send_notify_update(); + Context *handle_notify_update(int *result); + + Context *handle_finish(int r); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::EnableFeaturesRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H diff --git a/src/librbd/operation/FlattenRequest.cc b/src/librbd/operation/FlattenRequest.cc new file mode 100644 index 00000000..af21bd4e --- /dev/null +++ b/src/librbd/operation/FlattenRequest.cc @@ -0,0 +1,230 @@ 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/FlattenRequest.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/image/DetachChildRequest.h" +#include "librbd/image/DetachParentRequest.h" +#include "librbd/Types.h" +#include "librbd/io/ObjectRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::FlattenRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace operation { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +class C_FlattenObject : public C_AsyncObjectThrottle<I> { +public: + C_FlattenObject(AsyncObjectThrottle<I> &throttle, I *image_ctx, + ::SnapContext snapc, uint64_t object_no) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snapc(snapc), + m_object_no(object_no) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + CephContext *cct = image_ctx.cct; + + if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + ldout(cct, 1) << "lost exclusive lock during flatten" << dendl; + return -ERESTART; + } + + { + RWLock::RLocker snap_lock(image_ctx.snap_lock); + if (image_ctx.object_map != nullptr && + !image_ctx.object_map->object_may_not_exist(m_object_no)) { + // can skip because the object already exists + return 1; + } + } + + bufferlist bl; + string oid = image_ctx.get_object_name(m_object_no); + auto req = new io::ObjectWriteRequest<I>(&image_ctx, oid, m_object_no, 0, + std::move(bl), m_snapc, 0, {}, + this); + if (!req->has_parent()) { + // stop early if the parent went away - it just means + // another flatten finished first 
or the image was resized + delete req; + return 1; + } + + req->send(); + return 0; + } + +private: + ::SnapContext m_snapc; + uint64_t m_object_no; +}; + +template <typename I> +bool FlattenRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template <typename I> +void FlattenRequest<I>::send_op() { + flatten_objects(); +} + +template <typename I> +void FlattenRequest<I>::flatten_objects() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + assert(image_ctx.owner_lock.is_locked()); + auto ctx = create_context_callback< + FlattenRequest<I>, + &FlattenRequest<I>::handle_flatten_objects>(this); + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_FlattenObject<I> >(), + boost::lambda::_1, &image_ctx, m_snapc, boost::lambda::_2)); + AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>( + this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, m_overlap_objects); + throttle->start_ops( + image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +void FlattenRequest<I>::handle_flatten_objects(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -ERESTART) { + ldout(cct, 5) << "flatten operation interrupted" << dendl; + this->complete(r); + return; + } else if (r < 0) { + lderr(cct) << "flatten encountered an error: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + detach_child(); +} + +template <typename I> +void FlattenRequest<I>::detach_child() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + // should have been canceled 
prior to releasing lock + image_ctx.owner_lock.get_read(); + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + // if there are no snaps, remove from the children object as well + // (if snapshots remain, they have their own parent info, and the child + // will be removed when the last snap goes away) + image_ctx.snap_lock.get_read(); + if ((image_ctx.features & RBD_FEATURE_DEEP_FLATTEN) == 0 && + !image_ctx.snaps.empty()) { + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + detach_parent(); + return; + } + image_ctx.snap_lock.put_read(); + + ldout(cct, 5) << dendl; + auto ctx = create_context_callback< + FlattenRequest<I>, + &FlattenRequest<I>::handle_detach_child>(this); + auto req = image::DetachChildRequest<I>::create(image_ctx, ctx); + req->send(); + image_ctx.owner_lock.put_read(); +} + +template <typename I> +void FlattenRequest<I>::handle_detach_child(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "detach encountered an error: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + detach_parent(); +} + +template <typename I> +void FlattenRequest<I>::detach_parent() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + // should have been canceled prior to releasing lock + image_ctx.owner_lock.get_read(); + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + // stop early if the parent went away - it just means + // another flatten finished first, so this one is useless. 
+ image_ctx.parent_lock.get_read(); + if (!image_ctx.parent) { + ldout(cct, 5) << "image already flattened" << dendl; + image_ctx.parent_lock.put_read(); + image_ctx.owner_lock.put_read(); + this->complete(0); + return; + } + image_ctx.parent_lock.put_read(); + + // remove parent from this (base) image + auto ctx = create_context_callback< + FlattenRequest<I>, + &FlattenRequest<I>::handle_detach_parent>(this); + auto req = image::DetachParentRequest<I>::create(image_ctx, ctx); + req->send(); + image_ctx.owner_lock.put_read(); +} + +template <typename I> +void FlattenRequest<I>::handle_detach_parent(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "remove parent encountered an error: " << cpp_strerror(r) + << dendl; + } + + this->complete(r); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::FlattenRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/FlattenRequest.h b/src/librbd/operation/FlattenRequest.h new file mode 100644 index 00000000..78730963 --- /dev/null +++ b/src/librbd/operation/FlattenRequest.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H +#define CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H + +#include "librbd/operation/Request.h" +#include "common/snap_types.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class FlattenRequest : public Request<ImageCtxT> +{ +public: + FlattenRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t overlap_objects, const ::SnapContext &snapc, + ProgressContext &prog_ctx) + : Request<ImageCtxT>(image_ctx, on_finish), + m_overlap_objects(overlap_objects), + m_snapc(snapc), m_prog_ctx(prog_ctx) { + } + +protected: + void send_op() override; + bool 
should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::FlattenEvent(op_tid); + } + +private: + /** + * @verbatim + * + * <start> + * | + * v + * FLATTEN_OBJECTS + * | + * v + * DETACH_CHILD + * | + * v + * DETACH_PARENT + * | + * v + * <finish> + * + * @endverbatim + */ + + uint64_t m_overlap_objects; + ::SnapContext m_snapc; + ProgressContext &m_prog_ctx; + + void flatten_objects(); + void handle_flatten_objects(int r); + + void detach_child(); + void handle_detach_child(int r); + + void detach_parent(); + void handle_detach_parent(int r); + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::FlattenRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H diff --git a/src/librbd/operation/MetadataRemoveRequest.cc b/src/librbd/operation/MetadataRemoveRequest.cc new file mode 100644 index 00000000..828e7a5b --- /dev/null +++ b/src/librbd/operation/MetadataRemoveRequest.cc @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/MetadataRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::MetadataRemoveRequest: " + +namespace librbd { +namespace operation { + +template <typename I> +MetadataRemoveRequest<I>::MetadataRemoveRequest(I &image_ctx, + Context *on_finish, + const std::string &key) + : Request<I>(image_ctx, on_finish), m_key(key) { +} + +template <typename I> +void MetadataRemoveRequest<I>::send_op() { + send_metadata_remove(); +} + +template <typename I> +bool MetadataRemoveRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered 
error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template <typename I> +void MetadataRemoveRequest<I>::send_metadata_remove() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + librados::ObjectWriteOperation op; + cls_client::metadata_remove(&op, m_key); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::MetadataRemoveRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/MetadataRemoveRequest.h b/src/librbd/operation/MetadataRemoveRequest.h new file mode 100644 index 00000000..1d7f2a46 --- /dev/null +++ b/src/librbd/operation/MetadataRemoveRequest.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offremove:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H +#define CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H + +#include "librbd/operation/Request.h" +#include <iosfwd> +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class MetadataRemoveRequest : public Request<ImageCtxT> { +public: + MetadataRemoveRequest(ImageCtxT &image_ctx, Context *on_finish, + const std::string &key); + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::MetadataRemoveEvent(op_tid, m_key); + } + +private: + std::string m_key; + + void send_metadata_remove(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::MetadataRemoveRequest<librbd::ImageCtx>; + +#endif // 
CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H diff --git a/src/librbd/operation/MetadataSetRequest.cc b/src/librbd/operation/MetadataSetRequest.cc new file mode 100644 index 00000000..760e9b1e --- /dev/null +++ b/src/librbd/operation/MetadataSetRequest.cc @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/MetadataSetRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::MetadataSetRequest: " + +namespace librbd { +namespace operation { + +template <typename I> +MetadataSetRequest<I>::MetadataSetRequest(I &image_ctx, + Context *on_finish, + const std::string &key, + const std::string &value) + : Request<I>(image_ctx, on_finish), m_key(key), m_value(value) { +} + +template <typename I> +void MetadataSetRequest<I>::send_op() { + send_metadata_set(); +} + +template <typename I> +bool MetadataSetRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template <typename I> +void MetadataSetRequest<I>::send_metadata_set() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + m_data[m_key].append(m_value); + librados::ObjectWriteOperation op; + cls_client::metadata_set(&op, m_data); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +} // namespace operation +} // namespace librbd + +template class 
librbd::operation::MetadataSetRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/MetadataSetRequest.h b/src/librbd/operation/MetadataSetRequest.h new file mode 100644 index 00000000..5f8daa2f --- /dev/null +++ b/src/librbd/operation/MetadataSetRequest.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H +#define CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H + +#include "librbd/operation/Request.h" +#include "include/buffer.h" +#include <string> +#include <map> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class MetadataSetRequest : public Request<ImageCtxT> { +public: + MetadataSetRequest(ImageCtxT &image_ctx, Context *on_finish, + const std::string &key, const std::string &value); + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::MetadataSetEvent(op_tid, m_key, m_value); + } + +private: + std::string m_key; + std::string m_value; + std::map<std::string, bufferlist> m_data; + + void send_metadata_set(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::MetadataSetRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H diff --git a/src/librbd/operation/MigrateRequest.cc b/src/librbd/operation/MigrateRequest.cc new file mode 100644 index 00000000..e3ddff0e --- /dev/null +++ b/src/librbd/operation/MigrateRequest.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/MigrateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" 
+#include "librbd/deep_copy/ObjectCopyRequest.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/io/ObjectRequest.h" +#include "osdc/Striper.h" +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::MigrateRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace operation { + +using util::create_context_callback; +using util::create_async_context_callback; + +namespace { + +template <typename I> +class C_MigrateObject : public C_AsyncObjectThrottle<I> { +public: + C_MigrateObject(AsyncObjectThrottle<I> &throttle, I *image_ctx, + ::SnapContext snapc, uint64_t object_no) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snapc(snapc), + m_object_no(object_no) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + CephContext *cct = image_ctx.cct; + + if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + ldout(cct, 1) << "lost exclusive lock during migrate" << dendl; + return -ERESTART; + } + + start_async_op(); + return 0; + } + +private: + uint64_t m_object_size; + ::SnapContext m_snapc; + uint64_t m_object_no; + + io::AsyncOperation *m_async_op = nullptr; + + void start_async_op() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << dendl; + + ceph_assert(m_async_op == nullptr); + m_async_op = new io::AsyncOperation(); + m_async_op->start_op(image_ctx); + + if (!image_ctx.io_work_queue->writes_blocked()) { + migrate_object(); + return; + } + + auto ctx = create_async_context_callback( + image_ctx, create_context_callback< + C_MigrateObject<I>, &C_MigrateObject<I>::handle_start_async_op>(this)); + m_async_op->finish_op(); + delete m_async_op; + m_async_op = nullptr; + 
image_ctx.io_work_queue->wait_on_writes_unblocked(ctx); + } + + void handle_start_async_op(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to start async op: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + start_async_op(); + } + + bool is_within_overlap_bounds() { + I &image_ctx = this->m_image_ctx; + RWLock::RLocker snap_locker(image_ctx.snap_lock); + + auto overlap = std::min(image_ctx.size, image_ctx.migration_info.overlap); + return overlap > 0 && + Striper::get_num_objects(image_ctx.layout, overlap) > m_object_no; + } + + void migrate_object() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + CephContext *cct = image_ctx.cct; + + auto ctx = create_context_callback< + C_MigrateObject<I>, &C_MigrateObject<I>::handle_migrate_object>(this); + + if (is_within_overlap_bounds()) { + bufferlist bl; + string oid = image_ctx.get_object_name(m_object_no); + auto req = new io::ObjectWriteRequest<I>(&image_ctx, oid, m_object_no, 0, + std::move(bl), m_snapc, 0, {}, + ctx); + + ldout(cct, 20) << "copyup object req " << req << ", object_no " + << m_object_no << dendl; + + req->send(); + } else { + ceph_assert(image_ctx.parent != nullptr); + + auto req = deep_copy::ObjectCopyRequest<I>::create( + image_ctx.parent, &image_ctx, 0, 0, image_ctx.migration_info.snap_map, + m_object_no, image_ctx.migration_info.flatten, ctx); + + ldout(cct, 20) << "deep copy object req " << req << ", object_no " + << m_object_no << dendl; + req->send(); + } + } + + void handle_migrate_object(int r) { + CephContext *cct = this->m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r == -ENOENT) { + r = 0; + } + + m_async_op->finish_op(); + delete m_async_op; + this->complete(r); + } +}; + +} // anonymous namespace + +template <typename I> +void 
MigrateRequest<I>::send_op() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << dendl; + + migrate_objects(); +} + +template <typename I> +bool MigrateRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + + return true; +} + +template <typename I> +void MigrateRequest<I>::migrate_objects() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ceph_assert(image_ctx.owner_lock.is_locked()); + + uint64_t overlap_objects = get_num_overlap_objects(); + + ldout(cct, 10) << "from 0 to " << overlap_objects << dendl; + + auto ctx = create_context_callback< + MigrateRequest<I>, &MigrateRequest<I>::handle_migrate_objects>(this); + + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_MigrateObject<I> >(), + boost::lambda::_1, &image_ctx, image_ctx.snapc, boost::lambda::_2)); + AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>( + this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, overlap_objects); + throttle->start_ops( + image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +void MigrateRequest<I>::handle_migrate_objects(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to migrate objects: " << cpp_strerror(r) << dendl; + } + + this->complete(r); +} + +template <typename I> +uint64_t MigrateRequest<I>::get_num_overlap_objects() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << dendl; + + RWLock::RLocker snap_locker(image_ctx.snap_lock); + RWLock::RLocker parent_locker(image_ctx.parent_lock); + + auto 
overlap = image_ctx.migration_info.overlap; + + return overlap > 0 ? + Striper::get_num_objects(image_ctx.layout, overlap) : 0; +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::MigrateRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/MigrateRequest.h b/src/librbd/operation/MigrateRequest.h new file mode 100644 index 00000000..99f3b012 --- /dev/null +++ b/src/librbd/operation/MigrateRequest.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H +#define CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H + +#include "librbd/operation/Request.h" +#include "common/snap_types.h" +#include "librbd/Types.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class MigrateRequest : public Request<ImageCtxT> +{ +public: + MigrateRequest(ImageCtxT &image_ctx, Context *on_finish, + ProgressContext &prog_ctx) + : Request<ImageCtxT>(image_ctx, on_finish), m_prog_ctx(prog_ctx) { + } + +protected: + void send_op() override; + bool should_complete(int r) override; + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + ceph_abort(); + return journal::UnknownEvent(); + } + +private: + /** + * Migrate goes through the following state machine to copy objects + * from the parent (migrating source) image: + * + * @verbatim + * + * <start> + * | + * v + * MIGRATE_OBJECTS + * | + * v + * <finish> + * + * @endverbatim + * + */ + + ProgressContext &m_prog_ctx; + + void migrate_objects(); + void handle_migrate_objects(int r); + + uint64_t get_num_overlap_objects(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::MigrateRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H diff --git 
a/src/librbd/operation/ObjectMapIterate.cc b/src/librbd/operation/ObjectMapIterate.cc new file mode 100644 index 00000000..66958416 --- /dev/null +++ b/src/librbd/operation/ObjectMapIterate.cc @@ -0,0 +1,310 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/ObjectMapIterate.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/operation/ResizeRequest.h" +#include "librbd/object_map/InvalidateRequest.h" +#include "librbd/Utils.h" +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ObjectMapIterateRequest: " + +namespace librbd { +namespace operation { + +namespace { + +template <typename I> +class C_VerifyObjectCallback : public C_AsyncObjectThrottle<I> { +public: + C_VerifyObjectCallback(AsyncObjectThrottle<I> &throttle, I *image_ctx, + uint64_t snap_id, uint64_t object_no, + ObjectIterateWork<I> handle_mismatch, + std::atomic_flag *invalidate) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), + m_snap_id(snap_id), m_object_no(object_no), + m_oid(image_ctx->get_object_name(m_object_no)), + m_handle_mismatch(handle_mismatch), + m_invalidate(invalidate) + { + m_io_ctx.dup(image_ctx->data_ctx); + m_io_ctx.snap_set_read(CEPH_SNAPDIR); + } + + void complete(int r) override { + I &image_ctx = this->m_image_ctx; + if (should_complete(r)) { + ldout(image_ctx.cct, 20) << m_oid << " C_VerifyObjectCallback completed " + << dendl; + m_io_ctx.close(); + + this->finish(r); + delete this; + } + } + + int send() override { + send_list_snaps(); + return 0; + } + +private: + librados::IoCtx m_io_ctx; + uint64_t m_snap_id; + uint64_t 
m_object_no; + std::string m_oid; + ObjectIterateWork<I> m_handle_mismatch; + std::atomic_flag *m_invalidate; + + librados::snap_set_t m_snap_set; + int m_snap_list_ret = 0; + + bool should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + if (r == 0) { + r = m_snap_list_ret; + } + if (r < 0 && r != -ENOENT) { + lderr(cct) << m_oid << " C_VerifyObjectCallback::should_complete: " + << "encountered an error: " << cpp_strerror(r) << dendl; + return true; + } + + ldout(cct, 20) << m_oid << " C_VerifyObjectCallback::should_complete: " + << " r=" + << r << dendl; + return object_map_action(get_object_state()); + } + + void send_list_snaps() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + ldout(image_ctx.cct, 5) << m_oid + << " C_VerifyObjectCallback::send_list_snaps" + << dendl; + + librados::ObjectReadOperation op; + op.list_snaps(&m_snap_set, &m_snap_list_ret); + + librados::AioCompletion *comp = util::create_rados_callback(this); + int r = m_io_ctx.aio_operate(m_oid, comp, &op, NULL); + ceph_assert(r == 0); + comp->release(); + } + + uint8_t get_object_state() { + I &image_ctx = this->m_image_ctx; + RWLock::RLocker snap_locker(image_ctx.snap_lock); + for (std::vector<librados::clone_info_t>::const_iterator r = + m_snap_set.clones.begin(); r != m_snap_set.clones.end(); ++r) { + librados::snap_t from_snap_id; + librados::snap_t to_snap_id; + if (r->cloneid == librados::SNAP_HEAD) { + from_snap_id = next_valid_snap_id(m_snap_set.seq + 1); + to_snap_id = librados::SNAP_HEAD; + } else { + from_snap_id = next_valid_snap_id(r->snaps[0]); + to_snap_id = r->snaps[r->snaps.size()-1]; + } + + if (to_snap_id < m_snap_id) { + continue; + } else if (m_snap_id < from_snap_id) { + break; + } + + if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 && + from_snap_id != m_snap_id) { + return OBJECT_EXISTS_CLEAN; + } + return OBJECT_EXISTS; + } + return OBJECT_NONEXISTENT; + } + + uint64_t 
next_valid_snap_id(uint64_t snap_id) { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.snap_lock.is_locked()); + + std::map<librados::snap_t, SnapInfo>::iterator it = + image_ctx.snap_info.lower_bound(snap_id); + if (it == image_ctx.snap_info.end()) { + return CEPH_NOSNAP; + } + return it->first; + } + + bool object_map_action(uint8_t new_state) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + RWLock::RLocker owner_locker(image_ctx.owner_lock); + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + RWLock::RLocker snap_locker(image_ctx.snap_lock); + ceph_assert(image_ctx.object_map != nullptr); + + RWLock::WLocker l(image_ctx.object_map_lock); + uint8_t state = (*image_ctx.object_map)[m_object_no]; + + ldout(cct, 10) << "C_VerifyObjectCallback::object_map_action" + << " object " << image_ctx.get_object_name(m_object_no) + << " state " << (int)state + << " new_state " << (int)new_state << dendl; + + if (state != new_state) { + int r = 0; + + ceph_assert(m_handle_mismatch); + r = m_handle_mismatch(image_ctx, m_object_no, state, new_state); + if (r) { + lderr(cct) << "object map error: object " + << image_ctx.get_object_name(m_object_no) + << " marked as " << (int)state << ", but should be " + << (int)new_state << dendl; + m_invalidate->test_and_set(); + } else { + ldout(cct, 1) << "object map inconsistent: object " + << image_ctx.get_object_name(m_object_no) + << " marked as " << (int)state << ", but should be " + << (int)new_state << dendl; + } + } + + return true; + } +}; + +} // anonymous namespace + +template <typename I> +void ObjectMapIterateRequest<I>::send() { + if (!m_image_ctx.data_ctx.is_valid()) { + this->async_complete(-ENODEV); + return; + } + + send_verify_objects(); +} + +template <typename I> +bool ObjectMapIterateRequest<I>::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << 
this << " should_complete: " << " r=" << r << dendl; + + if (r == -ENODEV) { + lderr(cct) << "missing data pool" << dendl; + return true; + } + + if (r < 0) { + lderr(cct) << "object map operation encountered an error: " + << cpp_strerror(r) << dendl; + } + + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + switch (m_state) { + case STATE_VERIFY_OBJECTS: + if (m_invalidate.test_and_set()) { + send_invalidate_object_map(); + return false; + } else if (r == 0) { + return true; + } + break; + + case STATE_INVALIDATE_OBJECT_MAP: + if (r == 0) { + return true; + } + break; + + default: + ceph_abort(); + break; + } + + if (r < 0) { + return true; + } + + return false; +} + +template <typename I> +void ObjectMapIterateRequest<I>::send_verify_objects() { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + + uint64_t snap_id; + uint64_t num_objects; + { + RWLock::RLocker l(m_image_ctx.snap_lock); + snap_id = m_image_ctx.snap_id; + num_objects = Striper::get_num_objects(m_image_ctx.layout, + m_image_ctx.get_image_size(snap_id)); + } + ldout(cct, 5) << this << " send_verify_objects" << dendl; + + m_state = STATE_VERIFY_OBJECTS; + + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_VerifyObjectCallback<I> >(), + boost::lambda::_1, &m_image_ctx, snap_id, + boost::lambda::_2, m_handle_mismatch, &m_invalidate)); + AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>( + this, m_image_ctx, context_factory, this->create_callback_context(), + &m_prog_ctx, 0, num_objects); + throttle->start_ops( + m_image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +uint64_t ObjectMapIterateRequest<I>::get_image_size() const { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + if (m_image_ctx.snap_id == CEPH_NOSNAP) { + if (!m_image_ctx.resize_reqs.empty()) { + return m_image_ctx.resize_reqs.front()->get_image_size(); + } else { + 
return m_image_ctx.size; + } + } + return m_image_ctx.get_image_size(m_image_ctx.snap_id); +} + +template <typename I> +void ObjectMapIterateRequest<I>::send_invalidate_object_map() { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 5) << this << " send_invalidate_object_map" << dendl; + m_state = STATE_INVALIDATE_OBJECT_MAP; + + object_map::InvalidateRequest<I>*req = + object_map::InvalidateRequest<I>::create(m_image_ctx, m_image_ctx.snap_id, + true, + this->create_callback_context()); + + ceph_assert(m_image_ctx.owner_lock.is_locked()); + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + req->send(); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::ObjectMapIterateRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/ObjectMapIterate.h b/src/librbd/operation/ObjectMapIterate.h new file mode 100644 index 00000000..14215902 --- /dev/null +++ b/src/librbd/operation/ObjectMapIterate.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_OBJECT_MAP_ITERATE_H +#define CEPH_LIBRBD_OPERATION_OBJECT_MAP_ITERATE_H + +#include <iostream> +#include <atomic> + +#include "include/int_types.h" +#include "include/rbd/object_map_types.h" +#include "librbd/AsyncRequest.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +using ObjectIterateWork = bool(*)(ImageCtxT &image_ctx, + uint64_t object_no, + uint8_t current_state, + uint8_t new_state); + +template <typename ImageCtxT = ImageCtx> +class ObjectMapIterateRequest : public AsyncRequest<ImageCtxT> { +public: + ObjectMapIterateRequest(ImageCtxT &image_ctx, Context *on_finish, + ProgressContext &prog_ctx, + ObjectIterateWork<ImageCtxT> handle_mismatch) + : AsyncRequest<ImageCtxT>(image_ctx, on_finish), m_image_ctx(image_ctx), + m_prog_ctx(prog_ctx), m_handle_mismatch(handle_mismatch) + { + } + + 
void send() override; + +protected: + bool should_complete(int r) override; + +private: + enum State { + STATE_VERIFY_OBJECTS, + STATE_INVALIDATE_OBJECT_MAP + }; + + ImageCtxT &m_image_ctx; + ProgressContext &m_prog_ctx; + ObjectIterateWork<ImageCtxT> m_handle_mismatch; + std::atomic_flag m_invalidate = ATOMIC_FLAG_INIT; + State m_state = STATE_VERIFY_OBJECTS; + + void send_verify_objects(); + void send_invalidate_object_map(); + + uint64_t get_image_size() const; +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::ObjectMapIterateRequest<librbd::ImageCtx>; + +#endif diff --git a/src/librbd/operation/RebuildObjectMapRequest.cc b/src/librbd/operation/RebuildObjectMapRequest.cc new file mode 100644 index 00000000..f923f36c --- /dev/null +++ b/src/librbd/operation/RebuildObjectMapRequest.cc @@ -0,0 +1,248 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/RebuildObjectMapRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/operation/ResizeRequest.h" +#include "librbd/operation/TrimRequest.h" +#include "librbd/operation/ObjectMapIterate.h" +#include "librbd/Utils.h" +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::RebuildObjectMapRequest: " + +namespace librbd { +namespace operation { + +template <typename I> +void RebuildObjectMapRequest<I>::send() { + send_resize_object_map(); +} + +template <typename I> +bool RebuildObjectMapRequest<I>::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl; + + RWLock::RLocker 
owner_lock(m_image_ctx.owner_lock); + switch (m_state) { + case STATE_RESIZE_OBJECT_MAP: + ldout(cct, 5) << "RESIZE_OBJECT_MAP" << dendl; + if (r == -ESTALE && !m_attempted_trim) { + // objects are still flagged as in-use -- delete them + m_attempted_trim = true; + send_trim_image(); + return false; + } else if (r == 0) { + send_verify_objects(); + } + break; + + case STATE_TRIM_IMAGE: + ldout(cct, 5) << "TRIM_IMAGE" << dendl; + if (r == 0) { + send_resize_object_map(); + } + break; + + case STATE_VERIFY_OBJECTS: + ldout(cct, 5) << "VERIFY_OBJECTS" << dendl; + if (r == 0) { + send_save_object_map(); + } + break; + + case STATE_SAVE_OBJECT_MAP: + ldout(cct, 5) << "SAVE_OBJECT_MAP" << dendl; + if (r == 0) { + send_update_header(); + } + break; + case STATE_UPDATE_HEADER: + ldout(cct, 5) << "UPDATE_HEADER" << dendl; + if (r == 0) { + return true; + } + break; + + default: + ceph_abort(); + break; + } + + if (r == -ERESTART) { + ldout(cct, 5) << "rebuild object map operation interrupted" << dendl; + return true; + } else if (r < 0) { + lderr(cct) << "rebuild object map encountered an error: " << cpp_strerror(r) + << dendl; + return true; + } + return false; +} + +template <typename I> +void RebuildObjectMapRequest<I>::send_resize_object_map() { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.snap_lock.get_read(); + ceph_assert(m_image_ctx.object_map != nullptr); + + uint64_t size = get_image_size(); + uint64_t num_objects = Striper::get_num_objects(m_image_ctx.layout, size); + + if (m_image_ctx.object_map->size() == num_objects) { + m_image_ctx.snap_lock.put_read(); + send_verify_objects(); + return; + } + + ldout(cct, 5) << this << " send_resize_object_map" << dendl; + m_state = STATE_RESIZE_OBJECT_MAP; + + // should have been canceled prior to releasing lock + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + m_image_ctx.object_map->aio_resize(size, 
OBJECT_NONEXISTENT, + this->create_callback_context()); + m_image_ctx.snap_lock.put_read(); +} + +template <typename I> +void RebuildObjectMapRequest<I>::send_trim_image() { + CephContext *cct = m_image_ctx.cct; + + RWLock::RLocker l(m_image_ctx.owner_lock); + + // should have been canceled prior to releasing lock + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + ldout(cct, 5) << this << " send_trim_image" << dendl; + m_state = STATE_TRIM_IMAGE; + + uint64_t new_size; + uint64_t orig_size; + { + RWLock::RLocker l(m_image_ctx.snap_lock); + ceph_assert(m_image_ctx.object_map != nullptr); + + new_size = get_image_size(); + orig_size = m_image_ctx.get_object_size() * + m_image_ctx.object_map->size(); + } + TrimRequest<I> *req = TrimRequest<I>::create(m_image_ctx, + this->create_callback_context(), + orig_size, new_size, m_prog_ctx); + req->send(); +} + +template <typename I> +bool update_object_map(I& image_ctx, uint64_t object_no, uint8_t current_state, + uint8_t new_state) { + CephContext *cct = image_ctx.cct; + uint64_t snap_id = image_ctx.snap_id; + + uint8_t state = (*image_ctx.object_map)[object_no]; + if (state == OBJECT_EXISTS && new_state == OBJECT_NONEXISTENT && + snap_id == CEPH_NOSNAP) { + // might be writing object to OSD concurrently + new_state = state; + } + + if (new_state != state) { + ldout(cct, 15) << image_ctx.get_object_name(object_no) + << " rebuild updating object map " + << static_cast<uint32_t>(state) << "->" + << static_cast<uint32_t>(new_state) << dendl; + (*image_ctx.object_map)[object_no] = new_state; + } + return false; +} + +template <typename I> +void RebuildObjectMapRequest<I>::send_verify_objects() { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + + m_state = STATE_VERIFY_OBJECTS; + ldout(cct, 5) << this << " send_verify_objects" << dendl; + + ObjectMapIterateRequest<I> *req = + new ObjectMapIterateRequest<I>(m_image_ctx, + 
this->create_callback_context(), + m_prog_ctx, update_object_map); + + req->send(); +} + +template <typename I> +void RebuildObjectMapRequest<I>::send_save_object_map() { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 5) << this << " send_save_object_map" << dendl; + m_state = STATE_SAVE_OBJECT_MAP; + + // should have been canceled prior to releasing lock + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + ceph_assert(m_image_ctx.object_map != nullptr); + m_image_ctx.object_map->aio_save(this->create_callback_context()); +} + +template <typename I> +void RebuildObjectMapRequest<I>::send_update_header() { + ceph_assert(m_image_ctx.owner_lock.is_locked()); + + // should have been canceled prior to releasing lock + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + ldout(m_image_ctx.cct, 5) << this << " send_update_header" << dendl; + m_state = STATE_UPDATE_HEADER; + + librados::ObjectWriteOperation op; + + uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID | RBD_FLAG_FAST_DIFF_INVALID; + cls_client::set_flags(&op, m_image_ctx.snap_id, 0, flags); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + + RWLock::WLocker snap_locker(m_image_ctx.snap_lock); + m_image_ctx.update_flags(m_image_ctx.snap_id, flags, false); +} + +template <typename I> +uint64_t RebuildObjectMapRequest<I>::get_image_size() const { + ceph_assert(m_image_ctx.snap_lock.is_locked()); + if (m_image_ctx.snap_id == CEPH_NOSNAP) { + if (!m_image_ctx.resize_reqs.empty()) { + return m_image_ctx.resize_reqs.front()->get_image_size(); + } else { + return m_image_ctx.size; + } + } + return m_image_ctx.get_image_size(m_image_ctx.snap_id); +} + +} // namespace 
operation +} // namespace librbd + +template class librbd::operation::RebuildObjectMapRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/RebuildObjectMapRequest.h b/src/librbd/operation/RebuildObjectMapRequest.h new file mode 100644 index 00000000..c7f1aa3b --- /dev/null +++ b/src/librbd/operation/RebuildObjectMapRequest.h @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H +#define CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H + +#include "include/int_types.h" +#include "librbd/AsyncRequest.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class RebuildObjectMapRequest : public AsyncRequest<ImageCtxT> { +public: + + RebuildObjectMapRequest(ImageCtxT &image_ctx, Context *on_finish, + ProgressContext &prog_ctx) + : AsyncRequest<ImageCtxT>(image_ctx, on_finish), m_image_ctx(image_ctx), + m_prog_ctx(prog_ctx), m_attempted_trim(false) + { + } + + void send() override; + +protected: + bool should_complete(int r) override; + +private: + /** + * Rebuild object map goes through the following state machine to + * verify per-object state: + * + * <start> + * . | . . . . . . . . . . + * . | . . + * . v v . + * . STATE_RESIZE_OBJECT_MAP . . . > STATE_TRIM_IMAGE + * . | + * . v + * . . . > STATE_VERIFY_OBJECTS + * | + * v + * STATE_SAVE_OBJECT_MAP + * | + * v + * STATE_UPDATE_HEADER + * + * The _RESIZE_OBJECT_MAP state will be skipped if the object map + * is appropriately sized for the image. The _TRIM_IMAGE state will + * only be hit if the resize failed due to an in-use object. 
+ */ + enum State { + STATE_RESIZE_OBJECT_MAP, + STATE_TRIM_IMAGE, + STATE_VERIFY_OBJECTS, + STATE_SAVE_OBJECT_MAP, + STATE_UPDATE_HEADER + }; + + ImageCtxT &m_image_ctx; + ProgressContext &m_prog_ctx; + State m_state = STATE_RESIZE_OBJECT_MAP; + bool m_attempted_trim; + + void send_resize_object_map(); + void send_trim_image(); + void send_verify_objects(); + void send_save_object_map(); + void send_update_header(); + + uint64_t get_image_size() const; + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::RebuildObjectMapRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H diff --git a/src/librbd/operation/RenameRequest.cc b/src/librbd/operation/RenameRequest.cc new file mode 100644 index 00000000..44c5e2cd --- /dev/null +++ b/src/librbd/operation/RenameRequest.cc @@ -0,0 +1,212 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/RenameRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/rados/librados.hpp" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::RenameRequest: " + +namespace librbd { +namespace operation { + +namespace { + +template <typename I> +std::ostream& operator<<(std::ostream& os, + const typename RenameRequest<I>::State& state) { + switch(state) { + case RenameRequest<I>::STATE_READ_SOURCE_HEADER: + os << "READ_SOURCE_HEADER"; + break; + case RenameRequest<I>::STATE_WRITE_DEST_HEADER: + os << "WRITE_DEST_HEADER"; + break; + case RenameRequest<I>::STATE_UPDATE_DIRECTORY: + os << "UPDATE_DIRECTORY"; + break; + case RenameRequest<I>::STATE_REMOVE_SOURCE_HEADER: + os << "REMOVE_SOURCE_HEADER"; + break; + default: + os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")"; + break; + } + return os; +} + +} // 
anonymous namespace + +template <typename I> +RenameRequest<I>::RenameRequest(I &image_ctx, Context *on_finish, + const std::string &dest_name) + : Request<I>(image_ctx, on_finish), m_dest_name(dest_name), + m_source_oid(image_ctx.old_format ? util::old_header_name(image_ctx.name) : + util::id_obj_name(image_ctx.name)), + m_dest_oid(image_ctx.old_format ? util::old_header_name(dest_name) : + util::id_obj_name(dest_name)) { +} + +template <typename I> +void RenameRequest<I>::send_op() { + send_read_source_header(); +} + +template <typename I> +bool RenameRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + r = filter_return_code(r); + if (r < 0) { + if (r == -EEXIST) { + ldout(cct, 1) << "image already exists" << dendl; + } else { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; + } + + if (m_state == STATE_UPDATE_DIRECTORY) { + // update in-memory name before removing source header + apply(); + } else if (m_state == STATE_REMOVE_SOURCE_HEADER) { + return true; + } + + RWLock::RLocker owner_lock(image_ctx.owner_lock); + switch (m_state) { + case STATE_READ_SOURCE_HEADER: + send_write_destination_header(); + break; + case STATE_WRITE_DEST_HEADER: + send_update_directory(); + break; + case STATE_UPDATE_DIRECTORY: + send_remove_source_header(); + break; + default: + ceph_abort(); + break; + } + return false; +} + +template <typename I> +int RenameRequest<I>::filter_return_code(int r) const { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_state == STATE_READ_SOURCE_HEADER && r == -ENOENT) { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.name == m_dest_name) { + // signal that replay raced with itself + return -EEXIST; + } + } else if (m_state == STATE_REMOVE_SOURCE_HEADER && r < 0) { + if (r != -ENOENT) { + lderr(cct) << 
"warning: couldn't remove old source object (" + << m_source_oid << ")" << dendl; + } + return 0; + } + return r; +} + +template <typename I> +void RenameRequest<I>::send_read_source_header() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_READ_SOURCE_HEADER; + + librados::ObjectReadOperation op; + op.read(0, 0, NULL, NULL); + + // TODO: old code read omap values but there are no omap values on the + // old format header nor the new format id object + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op, + &m_header_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void RenameRequest<I>::send_write_destination_header() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_WRITE_DEST_HEADER; + + librados::ObjectWriteOperation op; + op.create(true); + op.write_full(m_header_bl); + + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(m_dest_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void RenameRequest<I>::send_update_directory() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_UPDATE_DIRECTORY; + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + bufferlist cmd_bl; + bufferlist empty_bl; + encode(static_cast<__u8>(CEPH_OSD_TMAP_SET), cmd_bl); + encode(m_dest_name, cmd_bl); + encode(empty_bl, cmd_bl); + encode(static_cast<__u8>(CEPH_OSD_TMAP_RM), cmd_bl); + encode(image_ctx.name, cmd_bl); + op.tmap_update(cmd_bl); + } else { + cls_client::dir_rename_image(&op, image_ctx.name, m_dest_name, + 
image_ctx.id); + } + + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(RBD_DIRECTORY, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void RenameRequest<I>::send_remove_source_header() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_REMOVE_SOURCE_HEADER; + + librados::ObjectWriteOperation op; + op.remove(); + + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +void RenameRequest<I>::apply() { + I &image_ctx = this->m_image_ctx; + image_ctx.set_image_name(m_dest_name); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::RenameRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/RenameRequest.h b/src/librbd/operation/RenameRequest.h new file mode 100644 index 00000000..6534d36c --- /dev/null +++ b/src/librbd/operation/RenameRequest.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_RENAME_REQUEST_H +#define CEPH_LIBRBD_RENAME_REQUEST_H + +#include "librbd/operation/Request.h" +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class RenameRequest : public Request<ImageCtxT> +{ +public: + /** + * Rename goes through the following state machine: + * + * @verbatim + * + * <start> + * | + * v + * STATE_READ_SOURCE_HEADER + * | + * v + * STATE_WRITE_DEST_HEADER + * | + * v + * STATE_UPDATE_DIRECTORY + * | + * v + * STATE_REMOVE_SOURCE_HEADER + * | + * v + * <finish> + * + * @endverbatim + * + */ + enum State { + 
STATE_READ_SOURCE_HEADER, + STATE_WRITE_DEST_HEADER, + STATE_UPDATE_DIRECTORY, + STATE_REMOVE_SOURCE_HEADER + }; + + RenameRequest(ImageCtxT &image_ctx, Context *on_finish, + const std::string &dest_name); + +protected: + void send_op() override; + bool should_complete(int r) override; + int filter_return_code(int r) const override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::RenameEvent(op_tid, m_dest_name); + } + +private: + std::string m_dest_name; + + std::string m_source_oid; + std::string m_dest_oid; + + State m_state = STATE_READ_SOURCE_HEADER; + + bufferlist m_header_bl; + + void send_read_source_header(); + void send_write_destination_header(); + void send_update_directory(); + void send_remove_source_header(); + + void apply(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::RenameRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_RENAME_REQUEST_H diff --git a/src/librbd/operation/Request.cc b/src/librbd/operation/Request.cc new file mode 100644 index 00000000..429cc01e --- /dev/null +++ b/src/librbd/operation/Request.cc @@ -0,0 +1,183 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/Request.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Request: " + +namespace librbd { +namespace operation { + +template <typename I> +Request<I>::Request(I &image_ctx, Context *on_finish, uint64_t journal_op_tid) + : AsyncRequest<I>(image_ctx, on_finish), m_op_tid(journal_op_tid) { +} + +template <typename I> +void Request<I>::send() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + // automatically create the event if we don't need to worry + // about affecting concurrent IO ops + if 
(can_affect_io() || !append_op_event()) { + send_op(); + } +} + +template <typename I> +Context *Request<I>::create_context_finisher(int r) { + // automatically commit the event if required (delete after commit) + if (m_appended_op_event && !m_committed_op_event && + commit_op_event(r)) { + return nullptr; + } + + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + return util::create_context_callback<Request<I>, &Request<I>::finish>(this); +} + +template <typename I> +void Request<I>::finish_and_destroy(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + // automatically commit the event if required (delete after commit) + if (m_appended_op_event && !m_committed_op_event && + commit_op_event(r)) { + return; + } + + AsyncRequest<I>::finish_and_destroy(r); +} + +template <typename I> +void Request<I>::finish(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + ceph_assert(!m_appended_op_event || m_committed_op_event); + AsyncRequest<I>::finish(r); +} + +template <typename I> +bool Request<I>::append_op_event() { + I &image_ctx = this->m_image_ctx; + + ceph_assert(image_ctx.owner_lock.is_locked()); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.journal != nullptr && + image_ctx.journal->is_journal_appending()) { + append_op_event(util::create_context_callback< + Request<I>, &Request<I>::handle_op_event_safe>(this)); + return true; + } + return false; +} + +template <typename I> +bool Request<I>::commit_op_event(int r) { + I &image_ctx = this->m_image_ctx; + RWLock::RLocker snap_locker(image_ctx.snap_lock); + + if (!m_appended_op_event) { + return false; + } + + ceph_assert(m_op_tid != 0); + ceph_assert(!m_committed_op_event); + m_committed_op_event = true; + + if 
(image_ctx.journal != nullptr && + image_ctx.journal->is_journal_appending()) { + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + // ops will be canceled / completed before closing journal + ceph_assert(image_ctx.journal->is_journal_ready()); + image_ctx.journal->commit_op_event(m_op_tid, r, + new C_CommitOpEvent(this, r)); + return true; + } + return false; +} + +template <typename I> +void Request<I>::handle_commit_op_event(int r, int original_ret_val) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to commit op event to journal: " << cpp_strerror(r) + << dendl; + } + if (original_ret_val < 0) { + r = original_ret_val; + } + finish(r); +} + +template <typename I> +void Request<I>::replay_op_ready(Context *on_safe) { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + ceph_assert(image_ctx.snap_lock.is_locked()); + ceph_assert(m_op_tid != 0); + + m_appended_op_event = true; + image_ctx.journal->replay_op_ready( + m_op_tid, util::create_async_context_callback(image_ctx, on_safe)); +} + +template <typename I> +void Request<I>::append_op_event(Context *on_safe) { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + ceph_assert(image_ctx.snap_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_op_tid = image_ctx.journal->allocate_op_tid(); + image_ctx.journal->append_op_event( + m_op_tid, journal::EventEntry{create_event(m_op_tid)}, + new C_AppendOpEvent(this, on_safe)); +} + +template <typename I> +void Request<I>::handle_op_event_safe(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to commit op event to 
journal: " << cpp_strerror(r) + << dendl; + this->finish(r); + delete this; + } else { + ceph_assert(!can_affect_io()); + + // haven't started the request state machine yet + RWLock::RLocker owner_locker(image_ctx.owner_lock); + send_op(); + } +} + +} // namespace operation +} // namespace librbd + +#ifndef TEST_F +template class librbd::operation::Request<librbd::ImageCtx>; +#endif diff --git a/src/librbd/operation/Request.h b/src/librbd/operation/Request.h new file mode 100644 index 00000000..315a3c96 --- /dev/null +++ b/src/librbd/operation/Request.h @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_REQUEST_H +#define CEPH_LIBRBD_OPERATION_REQUEST_H + +#include "librbd/AsyncRequest.h" +#include "include/Context.h" +#include "common/RWLock.h" +#include "librbd/Utils.h" +#include "librbd/Journal.h" + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class Request : public AsyncRequest<ImageCtxT> { +public: + Request(ImageCtxT &image_ctx, Context *on_finish, + uint64_t journal_op_tid = 0); + + void send(); + +protected: + void finish(int r) override; + virtual void send_op() = 0; + + virtual bool can_affect_io() const { + return false; + } + virtual journal::Event create_event(uint64_t op_tid) const = 0; + + template <typename T, Context*(T::*MF)(int*)> + bool append_op_event(T *request) { + ImageCtxT &image_ctx = this->m_image_ctx; + + ceph_assert(can_affect_io()); + RWLock::RLocker owner_locker(image_ctx.owner_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.journal != nullptr) { + if (image_ctx.journal->is_journal_replaying()) { + Context *ctx = util::create_context_callback<T, MF>(request); + replay_op_ready(ctx); + return true; + } else if (image_ctx.journal->is_journal_appending()) { + Context *ctx = util::create_context_callback<T, MF>(request); + append_op_event(ctx); + 
return true; + } + } + return false; + } + + bool append_op_event(); + + // NOTE: temporary until converted to new state machine format + Context *create_context_finisher(int r); + void finish_and_destroy(int r) override; + +private: + struct C_AppendOpEvent : public Context { + Request *request; + Context *on_safe; + C_AppendOpEvent(Request *request, Context *on_safe) + : request(request), on_safe(on_safe) { + } + void finish(int r) override { + if (r >= 0) { + request->m_appended_op_event = true; + } + on_safe->complete(r); + } + }; + + struct C_CommitOpEvent : public Context { + Request *request; + int ret_val; + C_CommitOpEvent(Request *request, int ret_val) + : request(request), ret_val(ret_val) { + } + void finish(int r) override { + request->handle_commit_op_event(r, ret_val); + delete request; + } + }; + + uint64_t m_op_tid = 0; + bool m_appended_op_event = false; + bool m_committed_op_event = false; + + void replay_op_ready(Context *on_safe); + void append_op_event(Context *on_safe); + void handle_op_event_safe(int r); + + bool commit_op_event(int r); + void handle_commit_op_event(int r, int original_ret_val); + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::Request<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_REQUEST_H diff --git a/src/librbd/operation/ResizeRequest.cc b/src/librbd/operation/ResizeRequest.cc new file mode 100644 index 00000000..537436ce --- /dev/null +++ b/src/librbd/operation/ResizeRequest.cc @@ -0,0 +1,467 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/ResizeRequest.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/io/ObjectDispatcher.h" +#include 
"librbd/operation/TrimRequest.h" +#include "common/dout.h" +#include "common/errno.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::ResizeRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace operation { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +ResizeRequest<I>::ResizeRequest(I &image_ctx, Context *on_finish, + uint64_t new_size, bool allow_shrink, ProgressContext &prog_ctx, + uint64_t journal_op_tid, bool disable_journal) + : Request<I>(image_ctx, on_finish, journal_op_tid), + m_original_size(0), m_new_size(new_size), m_allow_shrink(allow_shrink), + m_prog_ctx(prog_ctx), m_new_parent_overlap(0), m_disable_journal(disable_journal), + m_xlist_item(this) +{ +} + +template <typename I> +ResizeRequest<I>::~ResizeRequest() { + I &image_ctx = this->m_image_ctx; + ResizeRequest *next_req = NULL; + { + RWLock::WLocker snap_locker(image_ctx.snap_lock); + ceph_assert(m_xlist_item.remove_myself()); + if (!image_ctx.resize_reqs.empty()) { + next_req = image_ctx.resize_reqs.front(); + } + } + + if (next_req != NULL) { + RWLock::RLocker owner_locker(image_ctx.owner_lock); + next_req->send(); + } +} + +template <typename I> +void ResizeRequest<I>::send() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + { + RWLock::WLocker snap_locker(image_ctx.snap_lock); + if (!m_xlist_item.is_on_list()) { + image_ctx.resize_reqs.push_back(&m_xlist_item); + if (image_ctx.resize_reqs.front() != this) { + return; + } + } + + ceph_assert(image_ctx.resize_reqs.front() == this); + m_original_size = image_ctx.size; + compute_parent_overlap(); + } + + Request<I>::send(); +} + +template <typename I> +void ResizeRequest<I>::send_op() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + if (this->is_canceled()) { + 
this->async_complete(-ERESTART); + } else { + send_pre_block_writes(); + } +} + +template <typename I> +void ResizeRequest<I>::send_pre_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + image_ctx.io_work_queue->block_writes(create_context_callback< + ResizeRequest<I>, &ResizeRequest<I>::handle_pre_block_writes>(this)); +} + +template <typename I> +Context *ResizeRequest<I>::handle_pre_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(*result); + } + + return send_append_op_event(); +} + +template <typename I> +Context *ResizeRequest<I>::send_append_op_event() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_new_size < m_original_size && !m_allow_shrink) { + ldout(cct, 1) << "shrinking the image is not permitted" << dendl; + image_ctx.io_work_queue->unblock_writes(); + this->async_complete(-EINVAL); + return nullptr; + } + + if (m_disable_journal || !this->template append_op_event< + ResizeRequest<I>, &ResizeRequest<I>::handle_append_op_event>(this)) { + return send_grow_object_map(); + } + + ldout(cct, 5) << dendl; + return nullptr; +} + +template <typename I> +Context *ResizeRequest<I>::handle_append_op_event(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result) + << dendl; + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(*result); + } + + return send_grow_object_map(); +} + +template <typename I> +void ResizeRequest<I>::send_trim_image() { + I &image_ctx = this->m_image_ctx; + 
CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + TrimRequest<I> *req = TrimRequest<I>::create( + image_ctx, create_context_callback< + ResizeRequest<I>, &ResizeRequest<I>::handle_trim_image>(this), + m_original_size, m_new_size, m_prog_ctx); + req->send(); +} + +template <typename I> +Context *ResizeRequest<I>::handle_trim_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result == -ERESTART) { + ldout(cct, 5) << "resize operation interrupted" << dendl; + return this->create_context_finisher(*result); + } else if (*result < 0) { + lderr(cct) << "failed to trim image: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_post_block_writes(); + return nullptr; +} + +template <typename I> +void ResizeRequest<I>::send_flush_cache() { + I &image_ctx = this->m_image_ctx; + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + auto ctx = create_context_callback< + ResizeRequest<I>, &ResizeRequest<I>::handle_flush_cache>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, util::get_image_ctx(&image_ctx), io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec<I>::create_flush_request( + image_ctx, aio_comp, io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); + delete req; +} + +template <typename I> +Context *ResizeRequest<I>::handle_flush_cache(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to flush cache: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_invalidate_cache(); + return nullptr; +} + +template <typename I> +void ResizeRequest<I>::send_invalidate_cache() { + I &image_ctx = this->m_image_ctx; + 
CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + // need to invalidate since we're deleting objects, and + // ObjectCacher doesn't track non-existent objects + RWLock::RLocker owner_locker(image_ctx.owner_lock); + image_ctx.io_object_dispatcher->invalidate_cache(create_context_callback< + ResizeRequest<I>, &ResizeRequest<I>::handle_invalidate_cache>(this)); +} + +template <typename I> +Context *ResizeRequest<I>::handle_invalidate_cache(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + // ignore busy error -- writeback was successfully flushed so we might be + // wasting some cache space for trimmed objects, but they will get purged + // eventually. Most likely cause of the issue was an in-flight cache read + if (*result < 0 && *result != -EBUSY) { + lderr(cct) << "failed to invalidate cache: " << cpp_strerror(*result) + << dendl; + return this->create_context_finisher(*result); + } + + send_trim_image(); + return nullptr; +} + +template <typename I> +Context *ResizeRequest<I>::send_grow_object_map() { + I &image_ctx = this->m_image_ctx; + + { + RWLock::WLocker snap_locker(image_ctx.snap_lock); + m_shrink_size_visible = true; + } + + if (m_original_size == m_new_size) { + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(0); + } else if (m_new_size < m_original_size) { + image_ctx.io_work_queue->unblock_writes(); + send_flush_cache(); + return nullptr; + } + + image_ctx.owner_lock.get_read(); + image_ctx.snap_lock.get_read(); + if (image_ctx.object_map == nullptr) { + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + + // IO is still blocked + send_update_header(); + return nullptr; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + 
image_ctx.object_map->aio_resize( + m_new_size, OBJECT_NONEXISTENT, create_context_callback< + ResizeRequest<I>, &ResizeRequest<I>::handle_grow_object_map>(this)); + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + return nullptr; +} + +template <typename I> +Context *ResizeRequest<I>::handle_grow_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to resize object map: " + << cpp_strerror(*result) << dendl; + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(*result); + } + + // IO is still blocked + send_update_header(); + return nullptr; +} + +template <typename I> +Context *ResizeRequest<I>::send_shrink_object_map() { + I &image_ctx = this->m_image_ctx; + + image_ctx.owner_lock.get_read(); + image_ctx.snap_lock.get_read(); + if (image_ctx.object_map == nullptr || m_new_size > m_original_size) { + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + + update_size_and_overlap(); + return this->create_context_finisher(0); + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "original_size=" << m_original_size << ", " + << "new_size=" << m_new_size << dendl; + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + image_ctx.object_map->aio_resize( + m_new_size, OBJECT_NONEXISTENT, create_context_callback< + ResizeRequest<I>, &ResizeRequest<I>::handle_shrink_object_map>(this)); + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + return nullptr; +} + +template <typename I> +Context *ResizeRequest<I>::handle_shrink_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to resize object map: " + << 
cpp_strerror(*result) << dendl; + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(*result); + } + + update_size_and_overlap(); + return this->create_context_finisher(0); +} + +template <typename I> +void ResizeRequest<I>::send_post_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + image_ctx.io_work_queue->block_writes(create_context_callback< + ResizeRequest<I>, &ResizeRequest<I>::handle_post_block_writes>(this)); +} + +template <typename I> +Context *ResizeRequest<I>::handle_post_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + image_ctx.io_work_queue->unblock_writes(); + lderr(cct) << "failed to block writes prior to header update: " + << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_update_header(); + return nullptr; +} + +template <typename I> +void ResizeRequest<I>::send_update_header() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "original_size=" << m_original_size << ", " + << "new_size=" << m_new_size << dendl;; + + // should have been canceled prior to releasing lock + RWLock::RLocker owner_locker(image_ctx.owner_lock); + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + // rewrite only the size field of the header + ceph_le64 new_size = init_le64(m_new_size); + bufferlist bl; + bl.append(reinterpret_cast<const char*>(&new_size), sizeof(new_size)); + op.write(offsetof(rbd_obj_header_ondisk, image_size), bl); + } else { + cls_client::set_size(&op, m_new_size); + } + + librados::AioCompletion *rados_completion = create_rados_callback< + ResizeRequest<I>, 
&ResizeRequest<I>::handle_update_header>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, + rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template <typename I> +Context *ResizeRequest<I>::handle_update_header(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to update image header: " << cpp_strerror(*result) + << dendl; + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(*result); + } + + return send_shrink_object_map(); +} + +template <typename I> +void ResizeRequest<I>::compute_parent_overlap() { + I &image_ctx = this->m_image_ctx; + RWLock::RLocker l2(image_ctx.parent_lock); + if (image_ctx.parent == NULL) { + m_new_parent_overlap = 0; + } else { + m_new_parent_overlap = std::min(m_new_size, image_ctx.parent_md.overlap); + } +} + +template <typename I> +void ResizeRequest<I>::update_size_and_overlap() { + I &image_ctx = this->m_image_ctx; + { + RWLock::WLocker snap_locker(image_ctx.snap_lock); + image_ctx.size = m_new_size; + + RWLock::WLocker parent_locker(image_ctx.parent_lock); + if (image_ctx.parent != NULL && m_new_size < m_original_size) { + image_ctx.parent_md.overlap = m_new_parent_overlap; + } + } + + // blocked by PRE_BLOCK_WRITES (grow) or POST_BLOCK_WRITES (shrink) state + image_ctx.io_work_queue->unblock_writes(); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::ResizeRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/ResizeRequest.h b/src/librbd/operation/ResizeRequest.h new file mode 100644 index 00000000..f5e2f807 --- /dev/null +++ b/src/librbd/operation/ResizeRequest.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H +#define CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H + 
+#include "librbd/operation/Request.h" +#include "include/xlist.h" + +namespace librbd +{ + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class ResizeRequest : public Request<ImageCtxT> { +public: + static ResizeRequest *create(ImageCtxT &image_ctx, Context *on_finish, + uint64_t new_size, bool allow_shrink, + ProgressContext &prog_ctx, uint64_t journal_op_tid, + bool disable_journal) { + return new ResizeRequest(image_ctx, on_finish, new_size, allow_shrink, prog_ctx, + journal_op_tid, disable_journal); + } + + ResizeRequest(ImageCtxT &image_ctx, Context *on_finish, uint64_t new_size, + bool allow_shrink, ProgressContext &prog_ctx, uint64_t journal_op_tid, + bool disable_journal); + ~ResizeRequest() override; + + inline bool shrinking() const { + return (m_shrink_size_visible && m_new_size < m_original_size); + } + + inline uint64_t get_image_size() const { + return m_new_size; + } + + void send() override; + +protected: + void send_op() override; + bool should_complete(int r) override { + return true; + } + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + return journal::ResizeEvent(op_tid, m_new_size); + } + +private: + /** + * Resize goes through the following state machine to resize the image + * and update the object map: + * + * @verbatim + * + * <start> + * | + * v + * STATE_PRE_BLOCK_WRITES + * | + * v + * STATE_APPEND_OP_EVENT (skip if journaling + * | disabled) + * | + * | (grow) + * |\--------> STATE_GROW_OBJECT_MAP (skip if object map + * | | disabled) + * | v + * | STATE_UPDATE_HEADER ----------------------------\ + * | (unblock writes) | + * | | + * | (unblock writes) | + * | | + * | (shrink) | + * |\--------> STATE_FLUSH_CACHE | + * | | | + * | v | + * | STATE_INVALIDATE_CACHE | + * | | | + * | v | + * | STATE_TRIM_IMAGE | + * | | | + * | v | + * | STATE_POST_BLOCK_WRITES | + * | | | + * | v | + * | 
STATE_UPDATE_HEADER | + * | | | + * | v | + * | STATE_SHRINK_OBJECT_MAP (skip if object map | + * | | disabled) | + * | | (unblock writes) | + * | (no change) v | + * \------------> <finish> <-----------------------------------/ + * + * @endverbatim + * + * The _OBJECT_MAP states are skipped if the object map isn't enabled. + * The state machine will immediately transition to _FINISHED if there + * are no objects to trim. + */ + + uint64_t m_original_size; + uint64_t m_new_size; + bool m_allow_shrink = true; + ProgressContext &m_prog_ctx; + uint64_t m_new_parent_overlap; + bool m_shrink_size_visible = false; + bool m_disable_journal = false; + + typename xlist<ResizeRequest<ImageCtxT>*>::item m_xlist_item; + + void send_pre_block_writes(); + Context *handle_pre_block_writes(int *result); + + Context *send_append_op_event(); + Context *handle_append_op_event(int *result); + + void send_flush_cache(); + Context *handle_flush_cache(int *result); + + void send_invalidate_cache(); + Context *handle_invalidate_cache(int *result); + + void send_trim_image(); + Context *handle_trim_image(int *result); + + Context *send_grow_object_map(); + Context *handle_grow_object_map(int *result); + + Context *send_shrink_object_map(); + Context *handle_shrink_object_map(int *result); + + void send_post_block_writes(); + Context *handle_post_block_writes(int *result); + + void send_update_header(); + Context *handle_update_header(int *result); + + void compute_parent_overlap(); + void update_size_and_overlap(); + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::ResizeRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H diff --git a/src/librbd/operation/SnapshotCreateRequest.cc b/src/librbd/operation/SnapshotCreateRequest.cc new file mode 100644 index 00000000..66293609 --- /dev/null +++ b/src/librbd/operation/SnapshotCreateRequest.cc @@ -0,0 +1,342 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/operation/SnapshotCreateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/ImageRequestWQ.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotCreateRequest: " + +namespace librbd { +namespace operation { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +SnapshotCreateRequest<I>::SnapshotCreateRequest(I &image_ctx, + Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t journal_op_tid, + bool skip_object_map) + : Request<I>(image_ctx, on_finish, journal_op_tid), + m_snap_namespace(snap_namespace), m_snap_name(snap_name), + m_skip_object_map(skip_object_map), m_ret_val(0), m_snap_id(CEPH_NOSNAP) { +} + +template <typename I> +void SnapshotCreateRequest<I>::send_op() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (!image_ctx.data_ctx.is_valid()) { + lderr(cct) << "missing data pool" << dendl; + this->async_complete(-ENODEV); + return; + } + + send_suspend_requests(); +} + +template <typename I> +void SnapshotCreateRequest<I>::send_suspend_requests() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + // TODO suspend (shrink) resize to ensure consistent RBD mirror + send_suspend_aio(); +} + +template <typename I> +Context *SnapshotCreateRequest<I>::handle_suspend_requests(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + // TODO + send_suspend_aio(); + return nullptr; +} + 
+template <typename I> +void SnapshotCreateRequest<I>::send_suspend_aio() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + image_ctx.io_work_queue->block_writes(create_context_callback< + SnapshotCreateRequest<I>, + &SnapshotCreateRequest<I>::handle_suspend_aio>(this)); +} + +template <typename I> +Context *SnapshotCreateRequest<I>::handle_suspend_aio(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(*result); + } + + send_append_op_event(); + return nullptr; +} + +template <typename I> +void SnapshotCreateRequest<I>::send_append_op_event() { + I &image_ctx = this->m_image_ctx; + if (!this->template append_op_event< + SnapshotCreateRequest<I>, + &SnapshotCreateRequest<I>::handle_append_op_event>(this)) { + send_allocate_snap_id(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; +} + +template <typename I> +Context *SnapshotCreateRequest<I>::handle_append_op_event(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + image_ctx.io_work_queue->unblock_writes(); + lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result) + << dendl; + return this->create_context_finisher(*result); + } + + send_allocate_snap_id(); + return nullptr; +} + +template <typename I> +void SnapshotCreateRequest<I>::send_allocate_snap_id() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + 
librados::AioCompletion *rados_completion = create_rados_callback< + SnapshotCreateRequest<I>, + &SnapshotCreateRequest<I>::handle_allocate_snap_id>(this); + image_ctx.data_ctx.aio_selfmanaged_snap_create(&m_snap_id, rados_completion); + rados_completion->release(); +} + +template <typename I> +Context *SnapshotCreateRequest<I>::handle_allocate_snap_id(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << ", " + << "snap_id=" << m_snap_id << dendl; + + if (*result < 0) { + save_result(result); + image_ctx.io_work_queue->unblock_writes(); + lderr(cct) << "failed to allocate snapshot id: " << cpp_strerror(*result) + << dendl; + return this->create_context_finisher(*result); + } + + send_create_snap(); + return nullptr; +} + +template <typename I> +void SnapshotCreateRequest<I>::send_create_snap() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + RWLock::RLocker parent_locker(image_ctx.parent_lock); + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + // save current size / parent info for creating snapshot record in ImageCtx + m_size = image_ctx.size; + m_parent_info = image_ctx.parent_md; + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + cls_client::old_snapshot_add(&op, m_snap_id, m_snap_name); + } else { + cls_client::snapshot_add(&op, m_snap_id, m_snap_name, m_snap_namespace); + } + + librados::AioCompletion *rados_completion = create_rados_callback< + SnapshotCreateRequest<I>, + &SnapshotCreateRequest<I>::handle_create_snap>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, + rados_completion, &op); + ceph_assert(r == 0); + 
rados_completion->release(); +} + +template <typename I> +Context *SnapshotCreateRequest<I>::handle_create_snap(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -ESTALE) { + send_allocate_snap_id(); + return nullptr; + } else if (*result < 0) { + save_result(result); + send_release_snap_id(); + return nullptr; + } + + return send_create_object_map(); +} + +template <typename I> +Context *SnapshotCreateRequest<I>::send_create_object_map() { + I &image_ctx = this->m_image_ctx; + + image_ctx.snap_lock.get_read(); + if (image_ctx.object_map == nullptr || m_skip_object_map) { + image_ctx.snap_lock.put_read(); + + update_snap_context(); + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(0); + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + { + RWLock::RLocker object_map_lock(image_ctx.object_map_lock); + image_ctx.object_map->snapshot_add( + m_snap_id, create_context_callback< + SnapshotCreateRequest<I>, + &SnapshotCreateRequest<I>::handle_create_object_map>(this)); + } + image_ctx.snap_lock.put_read(); + return nullptr; +} + +template <typename I> +Context *SnapshotCreateRequest<I>::handle_create_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + update_snap_context(); + image_ctx.io_work_queue->unblock_writes(); + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to snapshot object map: " + << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + return this->create_context_finisher(0); +} + +template <typename I> +void SnapshotCreateRequest<I>::send_release_snap_id() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << 
__func__ << dendl; + + ceph_assert(m_snap_id != CEPH_NOSNAP); + + librados::AioCompletion *rados_completion = create_rados_callback< + SnapshotCreateRequest<I>, + &SnapshotCreateRequest<I>::handle_release_snap_id>(this); + image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion); + rados_completion->release(); +} + +template <typename I> +Context *SnapshotCreateRequest<I>::handle_release_snap_id(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + ceph_assert(m_ret_val < 0); + *result = m_ret_val; + + image_ctx.io_work_queue->unblock_writes(); + return this->create_context_finisher(m_ret_val); +} + +template <typename I> +void SnapshotCreateRequest<I>::update_snap_context() { + I &image_ctx = this->m_image_ctx; + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + RWLock::WLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.old_format) { + return; + } + + if (image_ctx.get_snap_info(m_snap_id) != NULL) { + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + // immediately add a reference to the new snapshot + utime_t snap_time = ceph_clock_now(); + image_ctx.add_snap(m_snap_namespace, m_snap_name, m_snap_id, m_size, + m_parent_info, RBD_PROTECTION_STATUS_UNPROTECTED, + 0, snap_time); + + // immediately start using the new snap context if we + // own the exclusive lock + std::vector<snapid_t> snaps; + snaps.push_back(m_snap_id); + snaps.insert(snaps.end(), image_ctx.snapc.snaps.begin(), + image_ctx.snapc.snaps.end()); + + image_ctx.snapc.seq = m_snap_id; + image_ctx.snapc.snaps.swap(snaps); + image_ctx.data_ctx.selfmanaged_snap_set_write_ctx( + image_ctx.snapc.seq, image_ctx.snaps); + + if 
(!image_ctx.migration_info.empty()) { + auto it = image_ctx.migration_info.snap_map.find(CEPH_NOSNAP); + ceph_assert(it != image_ctx.migration_info.snap_map.end()); + ceph_assert(!it->second.empty()); + if (it->second[0] == CEPH_NOSNAP) { + ldout(cct, 5) << this << " " << __func__ + << ": updating migration snap_map" << dendl; + it->second[0] = m_snap_id; + } + } +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/SnapshotCreateRequest.h b/src/librbd/operation/SnapshotCreateRequest.h new file mode 100644 index 00000000..406d2f01 --- /dev/null +++ b/src/librbd/operation/SnapshotCreateRequest.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H + +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/Types.h" +#include "librbd/operation/Request.h" +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class SnapshotCreateRequest : public Request<ImageCtxT> { +public: + /** + * Snap Create goes through the following state machine: + * + * @verbatim + * + * <start> + * | + * v + * STATE_SUSPEND_REQUESTS + * | + * v + * STATE_SUSPEND_AIO * * * * * * * * * * * * * + * | * + * v * + * STATE_APPEND_OP_EVENT (skip if journal * + * | disabled) * + * (retry) v * + * . . . > STATE_ALLOCATE_SNAP_ID * + * . | * + * . v * + * . . . . 
STATE_CREATE_SNAP * * * * * * * * * * * + * | * * + * v * * + * STATE_CREATE_OBJECT_MAP (skip if * * + * | disabled) * * + * | * * + * | v * + * | STATE_RELEASE_SNAP_ID * + * | | * + * | v * + * \----------------> <finish> < * * * * * + * + * @endverbatim + * + * The _CREATE_SNAP state may repeat back to the _ALLOCATE_SNAP_ID state + * if a stale snapshot context is allocated. If the create operation needs + * to abort, the error path is followed to record the result in the journal + * (if enabled) and bubble the originating error code back to the client. + */ + SnapshotCreateRequest(ImageCtxT &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t journal_op_tid, + bool skip_object_map); + +protected: + void send_op() override; + bool should_complete(int r) override { + return true; + } + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapCreateEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + std::string m_snap_name; + bool m_skip_object_map; + + int m_ret_val; + + uint64_t m_snap_id; + uint64_t m_size; + ParentImageInfo m_parent_info; + + void send_suspend_requests(); + Context *handle_suspend_requests(int *result); + + void send_suspend_aio(); + Context *handle_suspend_aio(int *result); + + void send_append_op_event(); + Context *handle_append_op_event(int *result); + + void send_allocate_snap_id(); + Context *handle_allocate_snap_id(int *result); + + void send_create_snap(); + Context *handle_create_snap(int *result); + + Context *send_create_object_map(); + Context *handle_create_object_map(int *result); + + void send_release_snap_id(); + Context *handle_release_snap_id(int *result); + + void update_snap_context(); + + void save_result(int *result) { + if (m_ret_val == 0 && *result < 0) { + m_ret_val = *result; + } + } +}; + +} // 
namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H diff --git a/src/librbd/operation/SnapshotLimitRequest.cc b/src/librbd/operation/SnapshotLimitRequest.cc new file mode 100644 index 00000000..d47dc3a8 --- /dev/null +++ b/src/librbd/operation/SnapshotLimitRequest.cc @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotLimitRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotLimitRequest: " + +namespace librbd { +namespace operation { + +template <typename I> +SnapshotLimitRequest<I>::SnapshotLimitRequest(I &image_ctx, + Context *on_finish, + uint64_t limit) + : Request<I>(image_ctx, on_finish), m_snap_limit(limit) { +} + +template <typename I> +void SnapshotLimitRequest<I>::send_op() { + send_limit_snaps(); +} + +template <typename I> +bool SnapshotLimitRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << " r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template <typename I> +void SnapshotLimitRequest<I>::send_limit_snaps() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + { + RWLock::RLocker md_locker(image_ctx.md_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + + librados::ObjectWriteOperation op; + cls_client::snapshot_set_limit(&op, m_snap_limit); + + librados::AioCompletion *rados_completion = + this->create_callback_completion(); + int r = 
image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion, + &op); + ceph_assert(r == 0); + rados_completion->release(); + } +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotLimitRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/SnapshotLimitRequest.h b/src/librbd/operation/SnapshotLimitRequest.h new file mode 100644 index 00000000..09622a45 --- /dev/null +++ b/src/librbd/operation/SnapshotLimitRequest.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H + +#include "librbd/operation/Request.h" +#include <iosfwd> +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class SnapshotLimitRequest : public Request<ImageCtxT> { +public: + SnapshotLimitRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t limit); + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapLimitEvent(op_tid, m_snap_limit); + } + +private: + uint64_t m_snap_limit; + + void send_limit_snaps(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotLimitRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H diff --git a/src/librbd/operation/SnapshotProtectRequest.cc b/src/librbd/operation/SnapshotProtectRequest.cc new file mode 100644 index 00000000..9a0375fc --- /dev/null +++ b/src/librbd/operation/SnapshotProtectRequest.cc @@ -0,0 +1,119 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotProtectRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include 
"librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotProtectRequest: " + +namespace librbd { +namespace operation { + +namespace { + +template <typename I> +std::ostream& operator<<(std::ostream& os, + const typename SnapshotProtectRequest<I>::State& state) { + switch(state) { + case SnapshotProtectRequest<I>::STATE_PROTECT_SNAP: + os << "PROTECT_SNAP"; + break; + } + return os; +} + +} // anonymous namespace + +template <typename I> +SnapshotProtectRequest<I>::SnapshotProtectRequest(I &image_ctx, + Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name) + : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace), + m_snap_name(snap_name), m_state(STATE_PROTECT_SNAP) { +} + +template <typename I> +void SnapshotProtectRequest<I>::send_op() { + send_protect_snap(); +} + +template <typename I> +bool SnapshotProtectRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0) { + if (r == -EBUSY) { + ldout(cct, 1) << "snapshot is already protected" << dendl; + } else { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + } + return true; +} + +template <typename I> +void SnapshotProtectRequest<I>::send_protect_snap() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + int r = verify_and_send_protect_snap(); + if (r < 0) { + this->async_complete(r); + return; + } +} + +template <typename I> +int SnapshotProtectRequest<I>::verify_and_send_protect_snap() { + I &image_ctx = this->m_image_ctx; + RWLock::RLocker md_locker(image_ctx.md_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + + CephContext *cct = image_ctx.cct; + if 
((image_ctx.features & RBD_FEATURE_LAYERING) == 0) { + lderr(cct) << "image must support layering" << dendl; + return -ENOSYS; + } + + uint64_t snap_id = image_ctx.get_snap_id(m_snap_namespace, m_snap_name); + if (snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + + bool is_protected; + int r = image_ctx.is_snap_protected(snap_id, &is_protected); + if (r < 0) { + return r; + } + + if (is_protected) { + return -EBUSY; + } + + librados::ObjectWriteOperation op; + cls_client::set_protection_status(&op, snap_id, + RBD_PROTECTION_STATUS_PROTECTED); + + librados::AioCompletion *rados_completion = + this->create_callback_completion(); + r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion, + &op); + ceph_assert(r == 0); + rados_completion->release(); + return 0; +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotProtectRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/SnapshotProtectRequest.h b/src/librbd/operation/SnapshotProtectRequest.h new file mode 100644 index 00000000..bef80229 --- /dev/null +++ b/src/librbd/operation/SnapshotProtectRequest.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H + +#include "librbd/operation/Request.h" +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class SnapshotProtectRequest : public Request<ImageCtxT> { +public: + /** + * Snap Protect goes through the following state machine: + * + * @verbatim + * + * <start> + * | + * v + * STATE_PROTECT_SNAP + * | + * v + * <finish> + * + * @endverbatim + * + */ + enum State { + STATE_PROTECT_SNAP + }; + + SnapshotProtectRequest(ImageCtxT &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string 
&snap_name); + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapProtectEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + std::string m_snap_name; + State m_state; + + void send_protect_snap(); + + int verify_and_send_protect_snap(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotProtectRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H diff --git a/src/librbd/operation/SnapshotRemoveRequest.cc b/src/librbd/operation/SnapshotRemoveRequest.cc new file mode 100644 index 00000000..69fbc3e7 --- /dev/null +++ b/src/librbd/operation/SnapshotRemoveRequest.cc @@ -0,0 +1,469 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/image/DetachChildRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotRemoveRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace operation { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +SnapshotRemoveRequest<I>::SnapshotRemoveRequest( + I &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, uint64_t snap_id) + : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace), + m_snap_name(snap_name), m_snap_id(snap_id) { +} + +template <typename I> +void SnapshotRemoveRequest<I>::send_op() { + I &image_ctx = this->m_image_ctx; + 
CephContext *cct = image_ctx.cct; + + ceph_assert(image_ctx.owner_lock.is_locked()); + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + RWLock::RLocker object_map_locker(image_ctx.object_map_lock); + if (image_ctx.snap_info.find(m_snap_id) == image_ctx.snap_info.end()) { + lderr(cct) << "snapshot doesn't exist" << dendl; + this->async_complete(-ENOENT); + return; + } + } + + trash_snap(); +} + +template <typename I> +bool SnapshotRemoveRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template <typename I> +void SnapshotRemoveRequest<I>::trash_snap() { + I &image_ctx = this->m_image_ctx; + if (image_ctx.old_format) { + release_snap_id(); + return; + } else if (cls::rbd::get_snap_namespace_type(m_snap_namespace) == + cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) { + get_snap(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + cls_client::snapshot_trash_add(&op, m_snap_id); + + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest<I>, + &SnapshotRemoveRequest<I>::handle_trash_snap>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::handle_trash_snap(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + // trash / clone v2 not supported + detach_child(); + return; + } else if (r < 0 && r != -EEXIST) { + lderr(cct) << "failed to move snapshot to trash: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + m_trashed_snapshot = true; + get_snap(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::get_snap() { + I &image_ctx = 
this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + cls_client::snapshot_get_start(&op, m_snap_id); + + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest<I>, + &SnapshotRemoveRequest<I>::handle_get_snap>(this); + m_out_bl.clear(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::handle_get_snap(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == 0) { + cls::rbd::SnapshotInfo snap_info; + + auto it = m_out_bl.cbegin(); + r = cls_client::snapshot_get_finish(&it, &snap_info); + m_child_attached = (snap_info.child_count > 0); + if (r == 0 && m_child_attached) { + list_children(); + return; + } + } + + if (r < 0) { + lderr(cct) << "failed to retrieve snapshot: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + detach_child(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::list_children() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + cls_client::children_list_start(&op, m_snap_id); + + m_out_bl.clear(); + m_child_images.clear(); + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest<I>, + &SnapshotRemoveRequest<I>::handle_list_children>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::handle_list_children(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::children_list_finish(&it, &m_child_images); + } + + if (r < 0 && r != -ENOENT) { + lderr(cct) 
<< "failed to retrieve child: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + detach_stale_child(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::detach_stale_child() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + for (auto& child_image : m_child_images) { + m_child_attached = true; + IoCtx ioctx; + int r = util::create_ioctx(image_ctx.md_ctx, "child image", + child_image.pool_id, + child_image.pool_namespace, &ioctx); + if (r == -ENOENT) { + librados::ObjectWriteOperation op; + cls_client::child_detach(&op, m_snap_id, + {child_image.pool_id, + child_image.pool_namespace, + child_image.image_id}); + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest<I>, + &SnapshotRemoveRequest<I>::handle_detach_stale_child>(this); + r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); + return; + } else if (r < 0) { + this->async_complete(r); + return; + } + } + + detach_child(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::handle_detach_stale_child(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to detach stale child: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + m_child_attached = false; + list_children(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::detach_child() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + bool detach_child = false; + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + RWLock::RLocker parent_locker(image_ctx.parent_lock); + + cls::rbd::ParentImageSpec our_pspec; + int r = image_ctx.get_parent_spec(m_snap_id, &our_pspec); + if (r < 0) { + if (r == -ENOENT) { + ldout(cct, 1) << "No such snapshot" << dendl; + } else { + lderr(cct) << "failed to retrieve parent spec" << dendl; 
+ } + + this->async_complete(r); + return; + } + + if (image_ctx.parent_md.spec != our_pspec && + (scan_for_parents(our_pspec) == -ENOENT)) { + // no other references to the parent image + detach_child = true; + } + } + + if (!detach_child) { + // HEAD image or other snapshots still associated with parent + remove_object_map(); + return; + } + + ldout(cct, 5) << dendl; + auto ctx = create_context_callback< + SnapshotRemoveRequest<I>, + &SnapshotRemoveRequest<I>::handle_detach_child>(this); + auto req = image::DetachChildRequest<I>::create(image_ctx, ctx); + req->send(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::handle_detach_child(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to detach child from parent: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + remove_object_map(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::remove_object_map() { + I &image_ctx = this->m_image_ctx; + if (m_child_attached) { + // if a clone v2 child is attached to this snapshot, we cannot + // proceed. It's only an error if the snap was already in the trash + this->complete(m_trashed_snapshot ? 
0 : -EBUSY); + return; + } + + CephContext *cct = image_ctx.cct; + + { + RWLock::RLocker owner_lock(image_ctx.owner_lock); + RWLock::WLocker snap_locker(image_ctx.snap_lock); + RWLock::RLocker object_map_locker(image_ctx.object_map_lock); + if (image_ctx.object_map != nullptr) { + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + SnapshotRemoveRequest<I>, + &SnapshotRemoveRequest<I>::handle_remove_object_map>(this); + image_ctx.object_map->snapshot_remove(m_snap_id, ctx); + return; + } + } + + // object map disabled + release_snap_id(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::handle_remove_object_map(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove snapshot object map: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + release_snap_id(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::release_snap_id() { + I &image_ctx = this->m_image_ctx; + + if (!image_ctx.data_ctx.is_valid()) { + remove_snap(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_snap_id << dendl; + + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest<I>, + &SnapshotRemoveRequest<I>::handle_release_snap_id>(this); + image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, aio_comp); + aio_comp->release(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::handle_release_snap_id(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to release snap id: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + remove_snap(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::remove_snap() { + I &image_ctx = this->m_image_ctx; + + CephContext *cct = image_ctx.cct; + 
ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + cls_client::old_snapshot_remove(&op, m_snap_name); + } else { + cls_client::snapshot_remove(&op, m_snap_id); + } + + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest<I>, + &SnapshotRemoveRequest<I>::handle_remove_snap>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void SnapshotRemoveRequest<I>::handle_remove_snap(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove snapshot: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + remove_snap_context(); + this->complete(0); +} + +template <typename I> +void SnapshotRemoveRequest<I>::remove_snap_context() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + RWLock::WLocker snap_locker(image_ctx.snap_lock); + image_ctx.rm_snap(m_snap_namespace, m_snap_name, m_snap_id); +} + +template <typename I> +int SnapshotRemoveRequest<I>::scan_for_parents( + cls::rbd::ParentImageSpec &pspec) { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.snap_lock.is_locked()); + ceph_assert(image_ctx.parent_lock.is_locked()); + + if (pspec.pool_id != -1) { + map<uint64_t, SnapInfo>::iterator it; + for (it = image_ctx.snap_info.begin(); + it != image_ctx.snap_info.end(); ++it) { + // skip our snap id (if checking base image, CEPH_NOSNAP won't match) + if (it->first == m_snap_id) { + continue; + } + if (it->second.parent.spec == pspec) { + break; + } + } + if (it == image_ctx.snap_info.end()) { + return -ENOENT; + } + } + return 0; +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/SnapshotRemoveRequest.h 
b/src/librbd/operation/SnapshotRemoveRequest.h new file mode 100644 index 00000000..a47bbc0a --- /dev/null +++ b/src/librbd/operation/SnapshotRemoveRequest.h @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H + +#include "librbd/operation/Request.h" +#include "include/buffer.h" +#include "librbd/Types.h" +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class SnapshotRemoveRequest : public Request<ImageCtxT> { +public: + /** + * @verbatim + * + * <start> + * | + * v + * TRASH_SNAP + * | + * v (skip if unsupported) + * GET_SNAP + * | + * v (skip if unnecessary) + * LIST_CHILDREN <-------------\ + * | | + * v (skip if unnecessary) | (repeat as needed) + * DETACH_STALE_CHILD ---------/ + * | + * v (skip if unnecessary) + * DETACH_CHILD + * | + * v (skip if disabled/in-use) + * REMOVE_OBJECT_MAP + * | + * v (skip if in-use) + * RELEASE_SNAP_ID + * | + * v (skip if in-use) + * REMOVE_SNAP + * | + * v + * <finish> + * + * @endverbatim + */ + + static SnapshotRemoveRequest *create( + ImageCtxT &image_ctx, const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, uint64_t snap_id, Context *on_finish) { + return new SnapshotRemoveRequest(image_ctx, on_finish, snap_namespace, + snap_name, snap_id); + } + + SnapshotRemoveRequest(ImageCtxT &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t snap_id); + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapRemoveEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + 
cls::rbd::ChildImageSpecs m_child_images; + std::string m_snap_name; + uint64_t m_snap_id; + bool m_trashed_snapshot = false; + bool m_child_attached = false; + + ceph::bufferlist m_out_bl; + + void trash_snap(); + void handle_trash_snap(int r); + + void get_snap(); + void handle_get_snap(int r); + + void list_children(); + void handle_list_children(int r); + + void detach_stale_child(); + void handle_detach_stale_child(int r); + + void detach_child(); + void handle_detach_child(int r); + + void remove_object_map(); + void handle_remove_object_map(int r); + + void release_snap_id(); + void handle_release_snap_id(int r); + + void remove_snap(); + void handle_remove_snap(int r); + + void remove_snap_context(); + int scan_for_parents(cls::rbd::ParentImageSpec &pspec); + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H diff --git a/src/librbd/operation/SnapshotRenameRequest.cc b/src/librbd/operation/SnapshotRenameRequest.cc new file mode 100644 index 00000000..b25b991f --- /dev/null +++ b/src/librbd/operation/SnapshotRenameRequest.cc @@ -0,0 +1,104 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotRenameRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotRenameRequest: " + +namespace librbd { +namespace operation { + +namespace { + +template <typename I> +std::ostream& operator<<(std::ostream& os, + const typename SnapshotRenameRequest<I>::State& state) { + switch(state) { + case SnapshotRenameRequest<I>::STATE_RENAME_SNAP: + os << "RENAME_SNAP"; + break; + } + return os; +} + +} // anonymous namespace + +template <typename I> 
+SnapshotRenameRequest<I>::SnapshotRenameRequest(I &image_ctx, + Context *on_finish, + uint64_t snap_id, + const std::string &snap_name) + : Request<I>(image_ctx, on_finish), m_snap_id(snap_id), + m_snap_name(snap_name), m_state(STATE_RENAME_SNAP) { +} + +template <typename I> +journal::Event SnapshotRenameRequest<I>::create_event(uint64_t op_tid) const { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.snap_lock.is_locked()); + + std::string src_snap_name; + auto snap_info_it = image_ctx.snap_info.find(m_snap_id); + if (snap_info_it != image_ctx.snap_info.end()) { + src_snap_name = snap_info_it->second.name; + } + + return journal::SnapRenameEvent(op_tid, m_snap_id, src_snap_name, + m_snap_name); +} + +template <typename I> +void SnapshotRenameRequest<I>::send_op() { + send_rename_snap(); +} + +template <typename I> +bool SnapshotRenameRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0) { + if (r == -EEXIST) { + ldout(cct, 1) << "snapshot already exists" << dendl; + } else { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + } + return true; +} + +template <typename I> +void SnapshotRenameRequest<I>::send_rename_snap() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + RWLock::RLocker md_locker(image_ctx.md_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + cls_client::old_snapshot_rename(&op, m_snap_id, m_snap_name); + } else { + cls_client::snapshot_rename(&op, m_snap_id, m_snap_name); + } + + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, + rados_completion, &op); 
+ ceph_assert(r == 0); + rados_completion->release(); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotRenameRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/SnapshotRenameRequest.h b/src/librbd/operation/SnapshotRenameRequest.h new file mode 100644 index 00000000..697772e0 --- /dev/null +++ b/src/librbd/operation/SnapshotRenameRequest.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H + +#include "librbd/operation/Request.h" +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class SnapshotRenameRequest : public Request<ImageCtxT> { +public: + /** + * Snap Rename goes through the following state machine: + * + * @verbatim + * + * <start> + * | + * v + * STATE_RENAME_SNAP + * | + * v + * <finish> + * + * @endverbatim + * + */ + enum State { + STATE_RENAME_SNAP + }; + + SnapshotRenameRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t snap_id, const std::string &snap_name); + + journal::Event create_event(uint64_t op_tid) const override; + +protected: + void send_op() override; + bool should_complete(int r) override; + +private: + uint64_t m_snap_id; + std::string m_snap_name; + State m_state; + + void send_rename_snap(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotRenameRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H diff --git a/src/librbd/operation/SnapshotRollbackRequest.cc b/src/librbd/operation/SnapshotRollbackRequest.cc new file mode 100644 index 00000000..596570a3 --- /dev/null +++ b/src/librbd/operation/SnapshotRollbackRequest.cc @@ -0,0 +1,412 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotRollbackRequest.h" +#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/ImageRequestWQ.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/operation/ResizeRequest.h" +#include "osdc/Striper.h" +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotRollbackRequest: " + +namespace librbd { +namespace operation { + +using util::create_context_callback; +using util::create_rados_callback; + +namespace { + +template <typename I> +class C_RollbackObject : public C_AsyncObjectThrottle<I> { +public: + C_RollbackObject(AsyncObjectThrottle<I> &throttle, I *image_ctx, + uint64_t snap_id, uint64_t object_num, + uint64_t head_num_objects, + decltype(I::object_map) snap_object_map) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snap_id(snap_id), + m_object_num(object_num), m_head_num_objects(head_num_objects), + m_snap_object_map(snap_object_map) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << "C_RollbackObject: " << __func__ << ": object_num=" + << m_object_num << dendl; + + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (m_object_num < m_head_num_objects && + m_snap_object_map != nullptr && + !image_ctx.object_map->object_may_exist(m_object_num) && + !m_snap_object_map->object_may_exist(m_object_num)) { + return 1; + } + } + + std::string oid = image_ctx.get_object_name(m_object_num); + + librados::ObjectWriteOperation op; + op.selfmanaged_snap_rollback(m_snap_id); + + librados::AioCompletion *rados_completion = + util::create_rados_callback(this); + 
image_ctx.data_ctx.aio_operate(oid, rados_completion, &op); + rados_completion->release(); + return 0; + } + +private: + uint64_t m_snap_id; + uint64_t m_object_num; + uint64_t m_head_num_objects; + decltype(I::object_map) m_snap_object_map; +}; + +} // anonymous namespace + +template <typename I> +SnapshotRollbackRequest<I>::SnapshotRollbackRequest(I &image_ctx, + Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t snap_id, + uint64_t snap_size, + ProgressContext &prog_ctx) + : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace), + m_snap_name(snap_name), m_snap_id(snap_id), + m_snap_size(snap_size), m_prog_ctx(prog_ctx), + m_object_map(nullptr), m_snap_object_map(nullptr) { +} + +template <typename I> +SnapshotRollbackRequest<I>::~SnapshotRollbackRequest() { + I &image_ctx = this->m_image_ctx; + if (m_blocking_writes) { + image_ctx.io_work_queue->unblock_writes(); + } + delete m_object_map; + delete m_snap_object_map; +} + +template <typename I> +void SnapshotRollbackRequest<I>::send_op() { + send_block_writes(); +} + +template <typename I> +void SnapshotRollbackRequest<I>::send_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_blocking_writes = true; + image_ctx.io_work_queue->block_writes(create_context_callback< + SnapshotRollbackRequest<I>, + &SnapshotRollbackRequest<I>::handle_block_writes>(this)); +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::handle_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_resize_image(); + return nullptr; +} + +template <typename I> +void 
SnapshotRollbackRequest<I>::send_resize_image() { + I &image_ctx = this->m_image_ctx; + + uint64_t current_size; + { + RWLock::RLocker owner_locker(image_ctx.owner_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + current_size = image_ctx.get_image_size(CEPH_NOSNAP); + } + + m_head_num_objects = Striper::get_num_objects(image_ctx.layout, current_size); + + if (current_size == m_snap_size) { + send_get_snap_object_map(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + Context *ctx = create_context_callback< + SnapshotRollbackRequest<I>, + &SnapshotRollbackRequest<I>::handle_resize_image>(this); + ResizeRequest<I> *req = ResizeRequest<I>::create(image_ctx, ctx, m_snap_size, + true, m_no_op_prog_ctx, 0, true); + req->send(); +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::handle_resize_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to resize image for rollback: " + << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_get_snap_object_map(); + return nullptr; +} + +template <typename I> +void SnapshotRollbackRequest<I>::send_get_snap_object_map() { + I &image_ctx = this->m_image_ctx; + + uint64_t flags = 0; + bool object_map_enabled; + CephContext *cct = image_ctx.cct; + { + RWLock::RLocker owner_locker(image_ctx.owner_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + object_map_enabled = (image_ctx.object_map != nullptr); + int r = image_ctx.get_flags(m_snap_id, &flags); + if (r < 0) { + object_map_enabled = false; + } + } + if (object_map_enabled && + (flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) { + lderr(cct) << "warning: object-map is invalid for snapshot" << dendl; + object_map_enabled = false; + } + if 
(!object_map_enabled) { + send_rollback_object_map(); + return; + } + + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_snap_object_map = image_ctx.create_object_map(m_snap_id); + + Context *ctx = create_context_callback< + SnapshotRollbackRequest<I>, + &SnapshotRollbackRequest<I>::handle_get_snap_object_map>(this); + m_snap_object_map->open(ctx); + return; +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::handle_get_snap_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to open object map: " + << cpp_strerror(*result) << dendl; + delete m_snap_object_map; + m_snap_object_map = nullptr; + } + + send_rollback_object_map(); + return nullptr; +} + +template <typename I> +void SnapshotRollbackRequest<I>::send_rollback_object_map() { + I &image_ctx = this->m_image_ctx; + + { + RWLock::RLocker owner_locker(image_ctx.owner_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + RWLock::WLocker object_map_lock(image_ctx.object_map_lock); + if (image_ctx.object_map != nullptr) { + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + SnapshotRollbackRequest<I>, + &SnapshotRollbackRequest<I>::handle_rollback_object_map>(this); + image_ctx.object_map->rollback(m_snap_id, ctx); + return; + } + } + + send_rollback_objects(); +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::handle_rollback_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to roll back object " + << "map: " << cpp_strerror(*result) << dendl; + + ceph_assert(m_object_map == nullptr); + apply(); + 
return this->create_context_finisher(*result); + } + + send_rollback_objects(); + return nullptr; +} + +template <typename I> +void SnapshotRollbackRequest<I>::send_rollback_objects() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + uint64_t num_objects; + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + num_objects = Striper::get_num_objects(image_ctx.layout, + image_ctx.get_current_size()); + } + + Context *ctx = create_context_callback< + SnapshotRollbackRequest<I>, + &SnapshotRollbackRequest<I>::handle_rollback_objects>(this); + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_RollbackObject<I> >(), + boost::lambda::_1, &image_ctx, m_snap_id, boost::lambda::_2, + m_head_num_objects, m_snap_object_map)); + AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>( + this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, num_objects); + throttle->start_ops( + image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::handle_rollback_objects(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -ERESTART) { + ldout(cct, 5) << "snapshot rollback operation interrupted" << dendl; + return this->create_context_finisher(*result); + } else if (*result < 0) { + lderr(cct) << "failed to rollback objects: " << cpp_strerror(*result) + << dendl; + return this->create_context_finisher(*result); + } + + return send_refresh_object_map(); +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::send_refresh_object_map() { + I &image_ctx = this->m_image_ctx; + + bool object_map_enabled; + { + RWLock::RLocker owner_locker(image_ctx.owner_lock); + 
RWLock::RLocker snap_locker(image_ctx.snap_lock); + object_map_enabled = (image_ctx.object_map != nullptr); + } + if (!object_map_enabled) { + return send_invalidate_cache(); + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_object_map = image_ctx.create_object_map(CEPH_NOSNAP); + + Context *ctx = create_context_callback< + SnapshotRollbackRequest<I>, + &SnapshotRollbackRequest<I>::handle_refresh_object_map>(this); + m_object_map->open(ctx); + return nullptr; +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::handle_refresh_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to open object map: " + << cpp_strerror(*result) << dendl; + delete m_object_map; + m_object_map = nullptr; + apply(); + + return this->create_context_finisher(*result); + } + + return send_invalidate_cache(); +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::send_invalidate_cache() { + I &image_ctx = this->m_image_ctx; + + apply(); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + RWLock::RLocker owner_lock(image_ctx.owner_lock); + Context *ctx = create_context_callback< + SnapshotRollbackRequest<I>, + &SnapshotRollbackRequest<I>::handle_invalidate_cache>(this); + image_ctx.io_object_dispatcher->invalidate_cache(ctx); + return nullptr; +} + +template <typename I> +Context *SnapshotRollbackRequest<I>::handle_invalidate_cache(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to invalidate cache: " << cpp_strerror(*result) + << dendl; + } + return this->create_context_finisher(*result); +} + +template <typename I> +void 
SnapshotRollbackRequest<I>::apply() { + I &image_ctx = this->m_image_ctx; + + RWLock::RLocker owner_locker(image_ctx.owner_lock); + RWLock::WLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.object_map != nullptr) { + std::swap(m_object_map, image_ctx.object_map); + } +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/SnapshotRollbackRequest.h b/src/librbd/operation/SnapshotRollbackRequest.h new file mode 100644 index 00000000..e58a618f --- /dev/null +++ b/src/librbd/operation/SnapshotRollbackRequest.h @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H + +#include "librbd/operation/Request.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/journal/Types.h" +#include <string> + +class Context; + +namespace librbd { + +class ProgressContext; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class SnapshotRollbackRequest : public Request<ImageCtxT> { +public: + /** + * Snap Rollback goes through the following state machine: + * + * @verbatim + * + * <start> ---------\ + * | + * v + * STATE_BLOCK_WRITES + * | + * v + * STATE_RESIZE_IMAGE (skip if resize not + * | required) + * v + * STATE_GET_SNAP_OBJECT_MAP (skip if object) + * | map disabled) + * v + * STATE_ROLLBACK_OBJECT_MAP (skip if object + * | map disabled) + * v + * STATE_ROLLBACK_OBJECTS + * | + * v + * STATE_REFRESH_OBJECT_MAP (skip if object + * | map disabled) + * v + * STATE_INVALIDATE_CACHE (skip if cache + * | disabled) + * v + * <finish> + * + * @endverbatim + * + * The _RESIZE_IMAGE state is skipped if the image doesn't need to be resized. + * The _ROLLBACK_OBJECT_MAP state is skipped if the object map isn't enabled. 
+ * The _INVALIDATE_CACHE state is skipped if the cache isn't enabled. + */ + + SnapshotRollbackRequest(ImageCtxT &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t snap_id, + uint64_t snap_size, ProgressContext &prog_ctx); + ~SnapshotRollbackRequest() override; + +protected: + void send_op() override; + bool should_complete(int r) override { + return true; + } + + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapRollbackEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + std::string m_snap_name; + uint64_t m_snap_id; + uint64_t m_snap_size; + uint64_t m_head_num_objects; + ProgressContext &m_prog_ctx; + + NoOpProgressContext m_no_op_prog_ctx; + + bool m_blocking_writes = false; + decltype(ImageCtxT::object_map) m_object_map; + decltype(ImageCtxT::object_map) m_snap_object_map; + + void send_block_writes(); + Context *handle_block_writes(int *result); + + void send_resize_image(); + Context *handle_resize_image(int *result); + + void send_get_snap_object_map(); + Context *handle_get_snap_object_map(int *result); + + void send_rollback_object_map(); + Context *handle_rollback_object_map(int *result); + + void send_rollback_objects(); + Context *handle_rollback_objects(int *result); + + Context *send_refresh_object_map(); + Context *handle_refresh_object_map(int *result); + + Context *send_invalidate_cache(); + Context *handle_invalidate_cache(int *result); + + void apply(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H diff --git a/src/librbd/operation/SnapshotUnprotectRequest.cc b/src/librbd/operation/SnapshotUnprotectRequest.cc new file mode 100644 index 00000000..4a58dbc0 --- /dev/null +++ 
b/src/librbd/operation/SnapshotUnprotectRequest.cc @@ -0,0 +1,354 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotUnprotectRequest.h" +#include "include/rados/librados.hpp" +#include "include/stringify.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include <list> +#include <set> +#include <vector> +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotUnprotectRequest: " + +namespace librbd { +namespace operation { + +namespace { + +typedef std::pair<int64_t, std::string> Pool; +typedef std::vector<Pool> Pools; + +template <typename I> +std::ostream& operator<<(std::ostream& os, + const typename SnapshotUnprotectRequest<I>::State& state) { + switch(state) { + case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_START: + os << "UNPROTECT_SNAP_START"; + break; + case SnapshotUnprotectRequest<I>::STATE_SCAN_POOL_CHILDREN: + os << "SCAN_POOL_CHILDREN"; + break; + case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_FINISH: + os << "UNPROTECT_SNAP_FINISH"; + break; + case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_ROLLBACK: + os << "UNPROTECT_SNAP_ROLLBACK"; + break; + default: + os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")"; + break; + } + return os; +} + +template <typename I> +class C_ScanPoolChildren : public C_AsyncObjectThrottle<I> { +public: + C_ScanPoolChildren(AsyncObjectThrottle<I> &throttle, I *image_ctx, + const cls::rbd::ParentImageSpec &pspec, const Pools &pools, + size_t pool_idx) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_pspec(pspec), + m_pool(pools[pool_idx]) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + 
ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " scanning pool '" << m_pool.second << "'" + << dendl; + + librados::Rados rados(image_ctx.md_ctx); + int64_t base_tier; + int r = rados.pool_get_base_tier(m_pool.first, &base_tier); + if (r == -ENOENT) { + ldout(cct, 1) << "pool '" << m_pool.second << "' no longer exists" + << dendl; + return 1; + } else if (r < 0) { + lderr(cct) << "error retrieving base tier for pool '" + << m_pool.second << "'" << dendl; + return r; + } + if (m_pool.first != base_tier) { + // pool is a cache; skip it + return 1; + } + + r = util::create_ioctx(image_ctx.md_ctx, "child image", m_pool.first, {}, + &m_pool_ioctx); + if (r == -ENOENT) { + return 1; + } else if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + cls_client::get_children_start(&op, m_pspec); + + librados::AioCompletion *rados_completion = + util::create_rados_callback(this); + r = m_pool_ioctx.aio_operate(RBD_CHILDREN, rados_completion, &op, + &m_children_bl); + ceph_assert(r == 0); + rados_completion->release(); + return 0; + } + +protected: + void finish(int r) override { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (r == 0) { + auto it = m_children_bl.cbegin(); + r= cls_client::get_children_finish(&it, &m_children); + } + + ldout(cct, 10) << this << " retrieved children: r=" << r << dendl; + if (r == -ENOENT) { + // no children -- proceed with unprotect + r = 0; + } else if (r < 0) { + lderr(cct) << "cannot get children for pool '" << m_pool.second << "'" + << dendl; + } else { + lderr(cct) << "cannot unprotect: at least " << m_children.size() << " " + << "child(ren) [" << joinify(m_children.begin(), + m_children.end(), + std::string(",")) << "] " + << "in pool '" << m_pool.second << "'" << dendl; + r = -EBUSY; + } + C_AsyncObjectThrottle<I>::finish(r); + } + +private: + cls::rbd::ParentImageSpec m_pspec; + Pool m_pool; + + IoCtx m_pool_ioctx; + 
std::set<std::string> m_children; + bufferlist m_children_bl; +}; + +} // anonymous namespace + +template <typename I> +SnapshotUnprotectRequest<I>::SnapshotUnprotectRequest(I &image_ctx, + Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name) + : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace), + m_snap_name(snap_name), m_state(STATE_UNPROTECT_SNAP_START), + m_ret_val(0), m_snap_id(CEPH_NOSNAP) { +} + +template <typename I> +void SnapshotUnprotectRequest<I>::send_op() { + send_unprotect_snap_start(); +} + +template <typename I> +bool SnapshotUnprotectRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0) { + if (r == -EINVAL) { + ldout(cct, 1) << "snapshot is already unprotected" << dendl; + } else { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + if (m_ret_val == 0) { + m_ret_val = r; + } + } + + // use a different state machine once an error is encountered + if (m_ret_val < 0) { + return should_complete_error(); + } + + RWLock::RLocker owner_lock(image_ctx.owner_lock); + bool finished = false; + switch (m_state) { + case STATE_UNPROTECT_SNAP_START: + send_scan_pool_children(); + break; + case STATE_SCAN_POOL_CHILDREN: + send_unprotect_snap_finish(); + break; + case STATE_UNPROTECT_SNAP_FINISH: + finished = true; + break; + default: + ceph_abort(); + break; + } + return finished; +} + +template <typename I> +bool SnapshotUnprotectRequest<I>::should_complete_error() { + I &image_ctx = this->m_image_ctx; + RWLock::RLocker owner_locker(image_ctx.owner_lock); + CephContext *cct = image_ctx.cct; + lderr(cct) << this << " " << __func__ << ": " + << "ret_val=" << m_ret_val << dendl; + + bool finished = true; + if (m_state == STATE_SCAN_POOL_CHILDREN || + m_state == STATE_UNPROTECT_SNAP_FINISH) { + 
send_unprotect_snap_rollback(); + finished = false; + } + return finished; +} + +template <typename I> +void SnapshotUnprotectRequest<I>::send_unprotect_snap_start() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + int r = verify_and_send_unprotect_snap_start(); + if (r < 0) { + this->async_complete(r); + return; + } +} + +template <typename I> +void SnapshotUnprotectRequest<I>::send_scan_pool_children() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_SCAN_POOL_CHILDREN; + + // search all pools for children depending on this snapshot + // TODO add async version of wait_for_latest_osdmap + librados::Rados rados(image_ctx.md_ctx); + rados.wait_for_latest_osdmap(); + + // protect against pools being renamed/deleted + std::list<Pool> pool_list; + rados.pool_list2(pool_list); + + cls::rbd::ParentImageSpec pspec(image_ctx.md_ctx.get_id(), + image_ctx.md_ctx.get_namespace(), + image_ctx.id, m_snap_id); + Pools pools(pool_list.begin(), pool_list.end()); + + Context *ctx = this->create_callback_context(); + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_ScanPoolChildren<I> >(), + boost::lambda::_1, &image_ctx, pspec, pools, boost::lambda::_2)); + AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>( + nullptr, image_ctx, context_factory, ctx, NULL, 0, pools.size()); + throttle->start_ops( + image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +void SnapshotUnprotectRequest<I>::send_unprotect_snap_finish() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << 
dendl; + + m_state = STATE_UNPROTECT_SNAP_FINISH; + + librados::ObjectWriteOperation op; + cls_client::set_protection_status(&op, m_snap_id, + RBD_PROTECTION_STATUS_UNPROTECTED); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void SnapshotUnprotectRequest<I>::send_unprotect_snap_rollback() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_state = STATE_UNPROTECT_SNAP_ROLLBACK; + + librados::ObjectWriteOperation op; + cls_client::set_protection_status(&op, m_snap_id, + RBD_PROTECTION_STATUS_PROTECTED); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +int SnapshotUnprotectRequest<I>::verify_and_send_unprotect_snap_start() { + I &image_ctx = this->m_image_ctx; + RWLock::RLocker md_locker(image_ctx.md_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + + CephContext *cct = image_ctx.cct; + if ((image_ctx.features & RBD_FEATURE_LAYERING) == 0) { + lderr(cct) << "image must support layering" << dendl; + return -ENOSYS; + } + + m_snap_id = image_ctx.get_snap_id(m_snap_namespace, m_snap_name); + if (m_snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + + bool is_unprotected; + int r = image_ctx.is_snap_unprotected(m_snap_id, &is_unprotected); + if (r < 0) { + return r; + } + + if (is_unprotected) { + lderr(cct) << "snapshot is already unprotected" << dendl; + return -EINVAL; + } + + librados::ObjectWriteOperation op; + cls_client::set_protection_status(&op, m_snap_id, + RBD_PROTECTION_STATUS_UNPROTECTING); + + librados::AioCompletion *comp = this->create_callback_completion(); + r = 
image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + + // TODO legacy code threw a notification post UNPROTECTING update -- required? + return 0; +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotUnprotectRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/SnapshotUnprotectRequest.h b/src/librbd/operation/SnapshotUnprotectRequest.h new file mode 100644 index 00000000..19cc6d32 --- /dev/null +++ b/src/librbd/operation/SnapshotUnprotectRequest.h @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H + +#include "librbd/operation/Request.h" +#include <string> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class SnapshotUnprotectRequest : public Request<ImageCtxT> { +public: + /** + * Snap Unprotect goes through the following state machine: + * + * @verbatim + * + * <start> + * | + * v + * STATE_UNPROTECT_SNAP_START + * | + * v + * STATE_SCAN_POOL_CHILDREN * * * * > STATE_UNPROTECT_SNAP_ROLLBACK + * | | + * v | + * STATE_UNPROTECT_SNAP_FINISH | + * | | + * v | + * <finish> <----------------------------/ + * + * @endverbatim + * + * If the unprotect operation needs to abort, the error path is followed + * to rollback the unprotect in-progress status on the image. 
+ */ + enum State { + STATE_UNPROTECT_SNAP_START, + STATE_SCAN_POOL_CHILDREN, + STATE_UNPROTECT_SNAP_FINISH, + STATE_UNPROTECT_SNAP_ROLLBACK + }; + + SnapshotUnprotectRequest(ImageCtxT &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name); + +protected: + void send_op() override; + bool should_complete(int r) override; + + int filter_return_code(int r) const override { + if (m_ret_val < 0) { + return m_ret_val; + } + return 0; + } + + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapUnprotectEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + std::string m_snap_name; + State m_state; + + int m_ret_val; + uint64_t m_snap_id; + + bool should_complete_error(); + + void send_unprotect_snap_start(); + void send_scan_pool_children(); + void send_unprotect_snap_finish(); + void send_unprotect_snap_rollback(); + + int verify_and_send_unprotect_snap_start(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotUnprotectRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H diff --git a/src/librbd/operation/SparsifyRequest.cc b/src/librbd/operation/SparsifyRequest.cc new file mode 100644 index 00000000..53049f88 --- /dev/null +++ b/src/librbd/operation/SparsifyRequest.cc @@ -0,0 +1,519 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SparsifyRequest.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/err.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/Types.h" +#include "librbd/io/ObjectRequest.h" +#include "osdc/Striper.h" +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define 
dout_subsys ceph_subsys_rbd + +namespace librbd { +namespace operation { + +namespace { + +bool may_be_trimmed(const std::map<uint64_t,uint64_t> &extent_map, + const bufferlist &bl, size_t sparse_size, + uint64_t *new_end_ptr) { + if (extent_map.empty()) { + *new_end_ptr = 0; + return true; + } + + uint64_t end = extent_map.rbegin()->first + extent_map.rbegin()->second; + uint64_t new_end = end; + uint64_t bl_off = bl.length(); + + for (auto it = extent_map.rbegin(); it != extent_map.rend(); it++) { + auto off = it->first; + auto len = it->second; + + new_end = p2roundup<uint64_t>(off + len, sparse_size); + + uint64_t extent_left = len; + uint64_t sub_len = len % sparse_size; + if (sub_len == 0) { + sub_len = sparse_size; + } + while (extent_left > 0) { + ceph_assert(bl_off >= sub_len); + bl_off -= sub_len; + bufferlist sub_bl; + sub_bl.substr_of(bl, bl_off, sub_len); + if (!sub_bl.is_zero()) { + break; + } + new_end -= sparse_size; + extent_left -= sub_len; + sub_len = sparse_size; + } + if (extent_left > 0) { + break; + } + } + + if (new_end < end) { + *new_end_ptr = new_end; + return true; + } + + return false; +} + +} // anonymous namespace + +using util::create_context_callback; +using util::create_rados_callback; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::SparsifyObject: " << this \ + << " " << m_oid << " " << __func__ << ": " + +template <typename I> +class C_SparsifyObject : public C_AsyncObjectThrottle<I> { +public: + + /** + * @verbatim + * + * <start> + * | + * v (not supported) + * SPARSIFY * * * * * * * * * * * * > READ < * * * * * * * * * * (concurrent + * | | * update is + * | (object map disabled) | (can trim) * detected) + * |------------------------\ V * + * | | PRE UPDATE OBJECT MAP * + * | (object map enabled) | | (if needed) * + * v | V * + * PRE UPDATE OBJECT MAP | TRIM * * * * * * * * * * * + * | | | + * v | V + * CHECK EXISTS | POST UPDATE OBJECT MAP + * | | | (if needed) + * v | | + * POST UPDATE OBJECT MAP | | 
+ * | | | + * v | | + * <finish> <------------------/<-------/ + * + * @endverbatim + * + */ + + C_SparsifyObject(AsyncObjectThrottle<I> &throttle, I *image_ctx, + uint64_t object_no, size_t sparse_size) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_cct(image_ctx->cct), + m_object_no(object_no), m_sparse_size(sparse_size), + m_oid(image_ctx->get_object_name(object_no)) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + ldout(m_cct, 20) << dendl; + + if (!image_ctx.data_ctx.is_valid()) { + lderr(m_cct) << "missing data pool" << dendl; + return -ENODEV; + } + + if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + ldout(m_cct, 1) << "lost exclusive lock during sparsify" << dendl; + return -ERESTART; + } + + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.object_map != nullptr && + !image_ctx.object_map->object_may_exist(m_object_no)) { + // can skip because the object does not exist + return 1; + } + + RWLock::RLocker parent_locker(image_ctx.parent_lock); + uint64_t overlap_objects = 0; + uint64_t overlap; + int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &overlap); + if (r == 0 && overlap > 0) { + overlap_objects = Striper::get_num_objects(image_ctx.layout, overlap); + } + m_remove_empty = (m_object_no >= overlap_objects); + } + + send_sparsify(); + return 0; + } + + void send_sparsify() { + I &image_ctx = this->m_image_ctx; + ldout(m_cct, 20) << dendl; + + librados::ObjectWriteOperation op; + cls_client::sparsify(&op, m_sparse_size, m_remove_empty); + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_sparsify>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + + void handle_sparsify(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + m_trying_trim = true; + send_read(); + return; + } + + if (r == -ENOENT) 
{ + finish_op(0); + return; + } + + if (r < 0) { + lderr(m_cct) << "failed to sparsify: " << cpp_strerror(r) << dendl; + finish_op(r); + return; + } + + send_pre_update_object_map(); + } + + void send_pre_update_object_map() { + I &image_ctx = this->m_image_ctx; + + if (m_trying_trim) { + if (!m_remove_empty || m_new_end != 0 || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + send_trim(); + return; + } + } else if (!m_remove_empty || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + finish_op(0); + return; + } + + ldout(m_cct, 20) << dendl; + + image_ctx.owner_lock.get_read(); + image_ctx.snap_lock.get_read(); + if (image_ctx.object_map == nullptr) { + // possible that exclusive lock was lost in background + lderr(m_cct) << "object map is not initialized" << dendl; + + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + finish_op(-EINVAL); + return; + } + + int r; + m_finish_op_ctx = image_ctx.exclusive_lock->start_op(&r); + if (m_finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + finish_op(r); + return; + } + + auto ctx = create_context_callback< + C_SparsifyObject<I>, + &C_SparsifyObject<I>::handle_pre_update_object_map>(this); + + image_ctx.object_map_lock.get_write(); + bool sent = image_ctx.object_map->template aio_update< + Context, &Context::complete>(CEPH_NOSNAP, m_object_no, OBJECT_PENDING, + OBJECT_EXISTS, {}, false, ctx); + + // NOTE: state machine might complete before we reach here + image_ctx.object_map_lock.put_write(); + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + if (!sent) { + finish_op(0); + } + } + + void handle_pre_update_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to update object map: " << cpp_strerror(r) + << dendl; + finish_op(r); + return; + } + + if (m_trying_trim) { + send_trim(); + } else { + send_check_exists(); + } + } + + 
void send_check_exists() { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + op.stat(NULL, NULL, NULL); + m_bl.clear(); + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_check_exists>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl); + ceph_assert(r == 0); + comp->release(); + } + + void handle_check_exists(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "stat failed: " << cpp_strerror(r) << dendl; + finish_op(r); + return; + } + + send_post_update_object_map(r == 0); + } + + void send_post_update_object_map(bool exists) { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + auto ctx = create_context_callback< + C_SparsifyObject<I>, + &C_SparsifyObject<I>::handle_post_update_object_map>(this); + bool sent; + { + RWLock::RLocker owner_locker(image_ctx.owner_lock); + RWLock::RLocker snap_locker(image_ctx.snap_lock); + + assert(image_ctx.exclusive_lock->is_lock_owner()); + assert(image_ctx.object_map != nullptr); + + RWLock::WLocker object_map_locker(image_ctx.object_map_lock); + + sent = image_ctx.object_map->template aio_update< + Context, &Context::complete>(CEPH_NOSNAP, m_object_no, + exists ? 
OBJECT_EXISTS : OBJECT_NONEXISTENT, + OBJECT_PENDING, {}, false, ctx); + } + if (!sent) { + ctx->complete(0); + } + } + + void handle_post_update_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to update object map: " << cpp_strerror(r) + << dendl; + finish_op(r); + return; + } + + finish_op(0); + } + + void send_read() { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + m_bl.clear(); + op.sparse_read(0, image_ctx.layout.object_size, &m_extent_map, &m_bl, + nullptr); + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_read>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl); + ceph_assert(r == 0); + comp->release(); + } + + void handle_read(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (r == -ENOENT) { + r = 0; + } else { + lderr(m_cct) << "failed to read object: " << cpp_strerror(r) << dendl; + } + finish_op(r); + return; + } + + if (!may_be_trimmed(m_extent_map, m_bl, m_sparse_size, &m_new_end)) { + finish_op(0); + return; + } + + send_pre_update_object_map(); + } + + void send_trim() { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + ceph_assert(m_new_end < image_ctx.layout.object_size); + + librados::ObjectWriteOperation op; + m_bl.clear(); + m_bl.append_zero(image_ctx.layout.object_size - m_new_end); + op.cmpext(m_new_end, m_bl, nullptr); + if (m_new_end == 0 && m_remove_empty) { + op.remove(); + } else { + op.truncate(m_new_end); + } + + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_trim>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + + void handle_trim(int r) { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r <= -MAX_ERRNO) { + m_finish_op_ctx->complete(0); + m_finish_op_ctx = nullptr; + send_read(); + 
return; + } + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to trim: " << cpp_strerror(r) << dendl; + finish_op(r); + return; + } + + if (!m_remove_empty || m_new_end != 0 || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + finish_op(0); + return; + } + + send_post_update_object_map(false); + } + + void finish_op(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (m_finish_op_ctx != nullptr) { + m_finish_op_ctx->complete(0); + } + this->complete(r); + } + +private: + CephContext *m_cct; + uint64_t m_object_no; + size_t m_sparse_size; + std::string m_oid; + + bool m_remove_empty = false; + bool m_trying_trim = false; + bufferlist m_bl; + std::map<uint64_t,uint64_t> m_extent_map; + uint64_t m_new_end = 0; + Context *m_finish_op_ctx = nullptr; +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::SparsifyRequest: " << this \ + << " " << __func__ << ": " + +template <typename I> +bool SparsifyRequest<I>::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template <typename I> +void SparsifyRequest<I>::send_op() { + sparsify_objects(); +} + +template <typename I> +void SparsifyRequest<I>::sparsify_objects() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + assert(image_ctx.owner_lock.is_locked()); + + uint64_t objects = 0; + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + objects = image_ctx.get_object_count(CEPH_NOSNAP); + } + + auto ctx = create_context_callback< + SparsifyRequest<I>, + &SparsifyRequest<I>::handle_sparsify_objects>(this); + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_SparsifyObject<I> >(), + boost::lambda::_1, &image_ctx, 
boost::lambda::_2, m_sparse_size)); + AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>( + this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, objects); + throttle->start_ops( + image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +void SparsifyRequest<I>::handle_sparsify_objects(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -ERESTART) { + ldout(cct, 5) << "sparsify operation interrupted" << dendl; + this->complete(r); + return; + } else if (r < 0) { + lderr(cct) << "sparsify encountered an error: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + this->complete(0); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SparsifyRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/SparsifyRequest.h b/src/librbd/operation/SparsifyRequest.h new file mode 100644 index 00000000..74f9eb72 --- /dev/null +++ b/src/librbd/operation/SparsifyRequest.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H + +#include "librbd/operation/Request.h" +#include "common/snap_types.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class SparsifyRequest : public Request<ImageCtxT> +{ +public: + SparsifyRequest(ImageCtxT &image_ctx, size_t sparse_size, Context *on_finish, + ProgressContext &prog_ctx) + : Request<ImageCtxT>(image_ctx, on_finish), m_sparse_size(sparse_size), + m_prog_ctx(prog_ctx) { + } + +protected: + void send_op() override; + bool should_complete(int r) override; + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + ceph_abort(); + return 
journal::UnknownEvent(); + } + +private: + /** + * @verbatim + * + * <start> + * | + * v + * SPARSIFY OBJECTS + * | + * v + * <finish> + * + * @endverbatim + */ + + size_t m_sparse_size; + ProgressContext &m_prog_ctx; + + void sparsify_objects(); + void handle_sparsify_objects(int r); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SparsifyRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H diff --git a/src/librbd/operation/TrimRequest.cc b/src/librbd/operation/TrimRequest.cc new file mode 100644 index 00000000..15f6a5df --- /dev/null +++ b/src/librbd/operation/TrimRequest.cc @@ -0,0 +1,377 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/TrimRequest.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcher.h" +#include "common/ContextCompletion.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" + +#include <boost/bind.hpp> +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> +#include <boost/scope_exit.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::TrimRequest: " + +namespace librbd { +namespace operation { + +template <typename I> +class C_CopyupObject : public C_AsyncObjectThrottle<I> { +public: + C_CopyupObject(AsyncObjectThrottle<I> &throttle, I *image_ctx, + ::SnapContext snapc, uint64_t object_no) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snapc(snapc), + m_object_no(object_no) + { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + ceph_assert(image_ctx.exclusive_lock == nullptr || + 
image_ctx.exclusive_lock->is_lock_owner()); + + string oid = image_ctx.get_object_name(m_object_no); + ldout(image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl; + + auto object_dispatch_spec = io::ObjectDispatchSpec::create_discard( + &image_ctx, io::OBJECT_DISPATCH_LAYER_NONE, oid, m_object_no, 0, + image_ctx.layout.object_size, m_snapc, + io::OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE, 0, {}, this); + object_dispatch_spec->send(); + return 0; + } +private: + ::SnapContext m_snapc; + uint64_t m_object_no; +}; + +template <typename I> +class C_RemoveObject : public C_AsyncObjectThrottle<I> { +public: + C_RemoveObject(AsyncObjectThrottle<I> &throttle, I *image_ctx, + uint64_t object_no) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no) + { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.object_map != nullptr && + !image_ctx.object_map->object_may_exist(m_object_no)) { + return 1; + } + } + + string oid = image_ctx.get_object_name(m_object_no); + ldout(image_ctx.cct, 10) << "removing " << oid << dendl; + + librados::AioCompletion *rados_completion = + util::create_rados_callback(this); + int r = image_ctx.data_ctx.aio_remove(oid, rados_completion); + ceph_assert(r == 0); + rados_completion->release(); + return 0; + } + +private: + uint64_t m_object_no; +}; + +template <typename I> +TrimRequest<I>::TrimRequest(I &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx) + : AsyncRequest<I>(image_ctx, on_finish), m_new_size(new_size), + m_prog_ctx(prog_ctx) +{ + uint64_t period = image_ctx.get_stripe_period(); + uint64_t new_num_periods = ((m_new_size + period - 1) / period); + m_delete_off = std::min(new_num_periods * period, original_size); + // 
first object we can delete free and clear + m_delete_start = new_num_periods * image_ctx.get_stripe_count(); + m_delete_start_min = m_delete_start; + m_num_objects = Striper::get_num_objects(image_ctx.layout, original_size); + + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " trim image " << original_size << " -> " + << m_new_size << " periods " << new_num_periods + << " discard to offset " << m_delete_off + << " delete objects " << m_delete_start + << " to " << m_num_objects << dendl; +} + +template <typename I> +bool TrimRequest<I>::should_complete(int r) +{ + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " should_complete: r=" << r << dendl; + if (r == -ERESTART) { + ldout(cct, 5) << "trim operation interrupted" << dendl; + return true; + } else if (r < 0) { + lderr(cct) << "trim encountered an error: " << cpp_strerror(r) << dendl; + return true; + } + + RWLock::RLocker owner_lock(image_ctx.owner_lock); + switch (m_state) { + case STATE_PRE_TRIM: + ldout(cct, 5) << " PRE_TRIM" << dendl; + send_copyup_objects(); + break; + + case STATE_COPYUP_OBJECTS: + ldout(cct, 5) << " COPYUP_OBJECTS" << dendl; + send_remove_objects(); + break; + + case STATE_REMOVE_OBJECTS: + ldout(cct, 5) << " REMOVE_OBJECTS" << dendl; + send_post_trim(); + break; + + case STATE_POST_TRIM: + ldout(cct, 5) << " POST_TRIM" << dendl; + send_clean_boundary(); + break; + + case STATE_CLEAN_BOUNDARY: + ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl; + send_finish(0); + break; + + case STATE_FINISHED: + ldout(cct, 5) << "FINISHED" << dendl; + return true; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + ceph_abort(); + break; + } + return false; +} + +template <typename I> +void TrimRequest<I>::send() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (!image_ctx.data_ctx.is_valid()) { + lderr(cct) << "missing data pool" << dendl; + send_finish(-ENODEV); + return; + } + + 
send_pre_trim(); +} + +template<typename I> +void TrimRequest<I>::send_pre_trim() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + if (m_delete_start >= m_num_objects) { + send_clean_boundary(); + return; + } + + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.object_map != nullptr) { + ldout(image_ctx.cct, 5) << this << " send_pre_trim: " + << " delete_start_min=" << m_delete_start_min + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_PRE_TRIM; + + ceph_assert(image_ctx.exclusive_lock->is_lock_owner()); + + RWLock::WLocker object_map_locker(image_ctx.object_map_lock); + if (image_ctx.object_map->template aio_update<AsyncRequest<I> >( + CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_PENDING, + OBJECT_EXISTS, {}, false, this)) { + return; + } + } + } + + send_copyup_objects(); +} + +template<typename I> +void TrimRequest<I>::send_copyup_objects() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + ::SnapContext snapc; + bool has_snapshots; + uint64_t parent_overlap; + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + RWLock::RLocker parent_locker(image_ctx.parent_lock); + + snapc = image_ctx.snapc; + has_snapshots = !image_ctx.snaps.empty(); + int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap); + ceph_assert(r == 0); + } + + // copyup is only required for portion of image that overlaps parent + uint64_t copyup_end = Striper::get_num_objects(image_ctx.layout, + parent_overlap); + + // TODO: protect against concurrent shrink and snap create? + // skip to remove if no copyup is required. 
+ if (copyup_end <= m_delete_start || !has_snapshots) { + send_remove_objects(); + return; + } + + uint64_t copyup_start = m_delete_start; + m_delete_start = copyup_end; + + ldout(image_ctx.cct, 5) << this << " send_copyup_objects: " + << " start object=" << copyup_start << ", " + << " end object=" << copyup_end << dendl; + m_state = STATE_COPYUP_OBJECTS; + + Context *ctx = this->create_callback_context(); + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject<I> >(), + boost::lambda::_1, &image_ctx, snapc, boost::lambda::_2)); + AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>( + this, image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start, + copyup_end); + throttle->start_ops( + image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +void TrimRequest<I>::send_remove_objects() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + ldout(image_ctx.cct, 5) << this << " send_remove_objects: " + << " delete_start=" << m_delete_start + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_REMOVE_OBJECTS; + + Context *ctx = this->create_callback_context(); + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject<I> >(), + boost::lambda::_1, &image_ctx, boost::lambda::_2)); + AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>( + this, image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start, + m_num_objects); + throttle->start_ops( + image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template<typename I> +void TrimRequest<I>::send_post_trim() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + if (image_ctx.object_map != nullptr) { + ldout(image_ctx.cct, 5) << this << " 
send_post_trim:" + << " delete_start_min=" << m_delete_start_min + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_POST_TRIM; + + ceph_assert(image_ctx.exclusive_lock->is_lock_owner()); + + RWLock::WLocker object_map_locker(image_ctx.object_map_lock); + if (image_ctx.object_map->template aio_update<AsyncRequest<I> >( + CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_NONEXISTENT, + OBJECT_PENDING, {}, false, this)) { + return; + } + } + } + + send_clean_boundary(); +} + +template <typename I> +void TrimRequest<I>::send_clean_boundary() { + I &image_ctx = this->m_image_ctx; + ceph_assert(image_ctx.owner_lock.is_locked()); + CephContext *cct = image_ctx.cct; + if (m_delete_off <= m_new_size) { + send_finish(0); + return; + } + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + uint64_t delete_len = m_delete_off - m_new_size; + ldout(image_ctx.cct, 5) << this << " send_clean_boundary: " + << " delete_off=" << m_delete_off + << " length=" << delete_len << dendl; + m_state = STATE_CLEAN_BOUNDARY; + + ::SnapContext snapc; + { + RWLock::RLocker snap_locker(image_ctx.snap_lock); + snapc = image_ctx.snapc; + } + + // discard the weird boundary + std::vector<ObjectExtent> extents; + Striper::file_to_extents(cct, image_ctx.format_string, + &image_ctx.layout, m_new_size, delete_len, 0, + extents); + + ContextCompletion *completion = + new ContextCompletion(this->create_async_callback_context(), true); + for (vector<ObjectExtent>::iterator p = extents.begin(); + p != extents.end(); ++p) { + ldout(cct, 20) << " ex " << *p << dendl; + Context *req_comp = new C_ContextCompletion(*completion); + + if (p->offset == 0) { + // treat as a full object delete on the boundary + p->length = image_ctx.layout.object_size; + } + + auto object_dispatch_spec = io::ObjectDispatchSpec::create_discard( + &image_ctx, io::OBJECT_DISPATCH_LAYER_NONE, p->oid.name, p->objectno, 
+ p->offset, p->length, snapc, 0, 0, {}, req_comp); + object_dispatch_spec->send(); + } + completion->finish_adding_requests(); +} + +template <typename I> +void TrimRequest<I>::send_finish(int r) { + m_state = STATE_FINISHED; + this->async_complete(r); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::TrimRequest<librbd::ImageCtx>; diff --git a/src/librbd/operation/TrimRequest.h b/src/librbd/operation/TrimRequest.h new file mode 100644 index 00000000..8526046c --- /dev/null +++ b/src/librbd/operation/TrimRequest.h @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H +#define CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H + +#include "librbd/AsyncRequest.h" + +namespace librbd +{ + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template <typename ImageCtxT = ImageCtx> +class TrimRequest : public AsyncRequest<ImageCtxT> +{ +public: + static TrimRequest *create(ImageCtxT &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx) { + return new TrimRequest(image_ctx, on_finish, original_size, new_size, + prog_ctx); + } + + TrimRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx); + + void send() override; + +protected: + /** + * Trim goes through the following state machine to remove whole objects, + * clean partially trimmed objects, and update the object map: + * + * @verbatim + * + * <start> . . . . . . . . . . . . . . . . . + * | . + * v (skip if not needed) . + * STATE_PRE_TRIM . + * | . + * v (skip if not needed) . + * STATE_COPYUP_OBJECTS . + * | . + * v (skip if not needed) . + * STATE_REMOVE_OBJECTS . + * | . + * v (skip if not needed) . + * STATE_POST_TRIM . + * | . + * v (skip if not needed) . + * STATE_CLEAN_BOUNDARY . + * | . + * v . + * STATE_FINISHED < . . . . . . . . . . . . 
// Steps of the trim state machine, listed in the order in which
// should_complete() advances through them; m_state records the step whose
// completion is currently awaited.
enum State {
  STATE_PRE_TRIM,        // object-map update issued by send_pre_trim()
  STATE_COPYUP_OBJECTS,  // throttled C_CopyupObject pass
  STATE_REMOVE_OBJECTS,  // throttled C_RemoveObject pass
  STATE_POST_TRIM,       // object-map update issued by send_post_trim()
  STATE_CLEAN_BOUNDARY,  // discard of the partially trimmed boundary
  STATE_FINISHED         // terminal state set by send_finish()
};
librbd { +namespace trash { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +void MoveRequest<I>::send() { + trash_add(); +} + +template <typename I> +void MoveRequest<I>::trash_add() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::trash_add(&op, m_image_id, m_trash_image_spec); + + auto aio_comp = create_rados_callback< + MoveRequest<I>, &MoveRequest<I>::handle_trash_add>(this); + int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void MoveRequest<I>::handle_trash_add(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -EEXIST) { + ldout(m_cct, 10) << "previous unfinished deferred remove for image: " + << m_image_id << dendl; + } else if (r < 0) { + lderr(m_cct) << "failed to add image to trash: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + remove_id(); +} + +template <typename I> +void MoveRequest<I>::remove_id() { + ldout(m_cct, 10) << dendl; + + auto aio_comp = create_rados_callback< + MoveRequest<I>, &MoveRequest<I>::handle_remove_id>(this); + int r = m_io_ctx.aio_remove(util::id_obj_name(m_trash_image_spec.name), + aio_comp); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void MoveRequest<I>::handle_remove_id(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to remove image id object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + directory_remove(); +} + +template <typename I> +void MoveRequest<I>::directory_remove() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::dir_remove_image(&op, m_trash_image_spec.name, + m_image_id); + + auto aio_comp = create_rados_callback< + MoveRequest<I>, &MoveRequest<I>::handle_directory_remove>(this); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &op); + ceph_assert(r 
== 0); + aio_comp->release(); +} + +template <typename I> +void MoveRequest<I>::handle_directory_remove(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to remove image from directory: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template <typename I> +void MoveRequest<I>::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace trash +} // namespace librbd + +template class librbd::trash::MoveRequest<librbd::ImageCtx>; diff --git a/src/librbd/trash/MoveRequest.h b/src/librbd/trash/MoveRequest.h new file mode 100644 index 00000000..f0d470c1 --- /dev/null +++ b/src/librbd/trash/MoveRequest.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_TRASH_MOVE_REQUEST_H +#define CEPH_LIBRBD_TRASH_MOVE_REQUEST_H + +#include "include/utime.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include <string> + +struct CephContext; +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace trash { + +template <typename ImageCtxT = librbd::ImageCtx> +class MoveRequest { +public: + static MoveRequest* create(librados::IoCtx& io_ctx, + const std::string& image_id, + const cls::rbd::TrashImageSpec& trash_image_spec, + Context* on_finish) { + return new MoveRequest(io_ctx, image_id, trash_image_spec, on_finish); + } + + MoveRequest(librados::IoCtx& io_ctx, const std::string& image_id, + const cls::rbd::TrashImageSpec& trash_image_spec, + Context* on_finish) + : m_io_ctx(io_ctx), m_image_id(image_id), + m_trash_image_spec(trash_image_spec), m_on_finish(on_finish), + m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) { + } + + void send(); + +private: + /* + * @verbatim + * + * <start> + * | + * v + * TRASH_ADD + * | + * v + * REMOVE_ID + * | + * v + * DIRECTORY_REMOVE + * | + * v + * <finish> + * + * @endverbatim + 
*/ + + librados::IoCtx &m_io_ctx; + std::string m_image_id; + cls::rbd::TrashImageSpec m_trash_image_spec; + Context *m_on_finish; + + CephContext *m_cct; + + void trash_add(); + void handle_trash_add(int r); + + void remove_id(); + void handle_remove_id(int r); + + void directory_remove(); + void handle_directory_remove(int r); + + void finish(int r); + +}; + +} // namespace trash +} // namespace librbd + +extern template class librbd::trash::MoveRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_TRASH_MOVE_REQUEST_H diff --git a/src/librbd/trash/RemoveRequest.cc b/src/librbd/trash/RemoveRequest.cc new file mode 100644 index 00000000..77cbfd81 --- /dev/null +++ b/src/librbd/trash/RemoveRequest.cc @@ -0,0 +1,171 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/trash/RemoveRequest.h" +#include "common/WorkQueue.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/image/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::trash::RemoveRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace trash { + +using util::create_context_callback; +using util::create_rados_callback; + +template <typename I> +void RemoveRequest<I>::send() { + set_state(); +} + +template <typename I> +void RemoveRequest<I>::set_state() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + cls_client::trash_state_set(&op, m_image_id, m_trash_set_state, + m_trash_expect_state); + + auto aio_comp = create_rados_callback< + RemoveRequest<I>, &RemoveRequest<I>::handle_set_state>(this); + int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void 
RemoveRequest<I>::handle_set_state(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -EOPNOTSUPP) { + lderr(m_cct) << "error setting trash image state: " << cpp_strerror(r) + << dendl; + if (m_ret_val == 0) { + m_ret_val = r; + } + if (m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) { + close_image(); + } else { + finish(m_ret_val); + } + return; + } + + if (m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) { + remove_image(); + } else { + ceph_assert(m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_NORMAL); + finish(m_ret_val < 0 ? m_ret_val : r); + }; +} + +template <typename I> +void RemoveRequest<I>::close_image() { + if (m_image_ctx == nullptr) { + finish(m_ret_val); + return; + } + + ldout(m_cct, 10) << dendl; + + auto ctx = create_context_callback< + RemoveRequest<I>, &RemoveRequest<I>::handle_close_image>(this); + m_image_ctx->state->close(ctx); +} + +template <typename I> +void RemoveRequest<I>::handle_close_image(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + ldout(m_cct, 5) << "failed to close image:" << cpp_strerror(r) << dendl; + } + + m_image_ctx->destroy(); + m_image_ctx = nullptr; + finish(m_ret_val); +} + +template <typename I> +void RemoveRequest<I>::remove_image() { + ldout(m_cct, 10) << dendl; + + auto ctx = create_context_callback< + RemoveRequest<I>, &RemoveRequest<I>::handle_remove_image>(this); + if (m_image_ctx != nullptr) { + auto req = librbd::image::RemoveRequest<I>::create( + m_io_ctx, m_image_ctx, m_force, true, m_prog_ctx, m_op_work_queue, ctx); + req->send(); + } else { + auto req = librbd::image::RemoveRequest<I>::create( + m_io_ctx, "", m_image_id, m_force, true, m_prog_ctx, m_op_work_queue, + ctx); + req->send(); + } +} + +template <typename I> +void RemoveRequest<I>::handle_remove_image(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + ldout(m_cct, 5) << "failed to remove image:" << cpp_strerror(r) << dendl; + + m_ret_val = r; + 
m_trash_set_state = cls::rbd::TRASH_IMAGE_STATE_NORMAL; + m_trash_expect_state = cls::rbd::TRASH_IMAGE_STATE_REMOVING; + set_state(); + return; + } + + m_image_ctx = nullptr; + remove_trash_entry(); +} + +template <typename I> +void RemoveRequest<I>::remove_trash_entry() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + cls_client::trash_remove(&op, m_image_id); + + auto aio_comp = create_rados_callback< + RemoveRequest<I>, &RemoveRequest<I>::handle_remove_trash_entry>(this); + int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void RemoveRequest<I>::handle_remove_trash_entry(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing trash entry: " << cpp_strerror(r) << dendl; + } + + finish(0); +} + +template <typename I> +void RemoveRequest<I>::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace trash +} // namespace librbd + +template class librbd::trash::RemoveRequest<librbd::ImageCtx>; diff --git a/src/librbd/trash/RemoveRequest.h b/src/librbd/trash/RemoveRequest.h new file mode 100644 index 00000000..5f65a57c --- /dev/null +++ b/src/librbd/trash/RemoveRequest.h @@ -0,0 +1,119 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H +#define CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H + +#include "include/utime.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include <string> + +class CephContext; +class Context; +class ContextWQ; + +namespace librbd { + +class ProgressContext; + +struct ImageCtx; + +namespace trash { + +template <typename ImageCtxT = librbd::ImageCtx> +class RemoveRequest { +public: + static RemoveRequest* create(librados::IoCtx &io_ctx, + const std::string &image_id, + ContextWQ *op_work_queue, bool 
force, + ProgressContext &prog_ctx, Context *on_finish) { + return new RemoveRequest(io_ctx, image_id, op_work_queue, force, prog_ctx, + on_finish); + } + + static RemoveRequest* create(librados::IoCtx &io_ctx, ImageCtxT *image_ctx, + ContextWQ *op_work_queue, bool force, + ProgressContext &prog_ctx, Context *on_finish) { + return new RemoveRequest(io_ctx, image_ctx, op_work_queue, force, prog_ctx, + on_finish); + } + + + RemoveRequest(librados::IoCtx &io_ctx, const std::string &image_id, + ContextWQ *op_work_queue, bool force, ProgressContext &prog_ctx, + Context *on_finish) + : m_io_ctx(io_ctx), m_image_id(image_id), m_op_work_queue(op_work_queue), + m_force(force), m_prog_ctx(prog_ctx), m_on_finish(on_finish), + m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) { + } + + RemoveRequest(librados::IoCtx &io_ctx, ImageCtxT *image_ctx, + ContextWQ *op_work_queue, bool force, ProgressContext &prog_ctx, + Context *on_finish) + : m_io_ctx(io_ctx), m_image_ctx(image_ctx), m_image_id(m_image_ctx->id), + m_op_work_queue(op_work_queue), m_force(force), m_prog_ctx(prog_ctx), + m_on_finish(on_finish), + m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) { + } + + void send(); + +private: + /* + * @verbatim + * + * <start> + * | + * v + * SET_STATE (removing) * * * * * * *> CLOSE_IMAGE + * | | + * v | + * REMOVE_IMAGE * * *> SET_STATE (normal) | + * | | | + * v | | + * REMOVE_TRASH_ENTRY | | + * | | | + * v | | + * <finish> <-------------/<---------------/ + * + * @endverbatim + */ + + librados::IoCtx &m_io_ctx; + ImageCtxT *m_image_ctx = nullptr; + std::string m_image_id; + ContextWQ *m_op_work_queue; + bool m_force; + ProgressContext &m_prog_ctx; + Context *m_on_finish; + + CephContext *m_cct; + + cls::rbd::TrashImageState m_trash_set_state = + cls::rbd::TRASH_IMAGE_STATE_REMOVING; + cls::rbd::TrashImageState m_trash_expect_state = + cls::rbd::TRASH_IMAGE_STATE_NORMAL; + int m_ret_val = 0; + + void set_state(); + void handle_set_state(int r); + + void close_image(); + 
void handle_close_image(int r); + + void remove_image(); + void handle_remove_image(int r); + + void remove_trash_entry(); + void handle_remove_trash_entry(int r); + + void finish(int r); +}; + +} // namespace trash +} // namespace librbd + +extern template class librbd::trash::RemoveRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H diff --git a/src/librbd/trash_watcher/Types.cc b/src/librbd/trash_watcher/Types.cc new file mode 100644 index 00000000..c95ea223 --- /dev/null +++ b/src/librbd/trash_watcher/Types.cc @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/Formatter.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "librbd/trash_watcher/Types.h" +#include "librbd/watcher/Utils.h" + +namespace librbd { +namespace trash_watcher { + +namespace { + +class DumpPayloadVisitor : public boost::static_visitor<void> { +public: + explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {} + + template <typename Payload> + inline void operator()(const Payload &payload) const { + NotifyOp notify_op = Payload::NOTIFY_OP; + m_formatter->dump_string("notify_op", stringify(notify_op)); + payload.dump(m_formatter); + } + +private: + ceph::Formatter *m_formatter; +}; + +} // anonymous namespace + +void ImageAddedPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(image_id, bl); + encode(trash_image_spec, bl); +} + +void ImageAddedPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(image_id, iter); + decode(trash_image_spec, iter); +} + +void ImageAddedPayload::dump(Formatter *f) const { + f->dump_string("image_id", image_id); + f->open_object_section("trash_image_spec"); + trash_image_spec.dump(f); + f->close_section(); +} + +void ImageRemovedPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(image_id, bl); +} + +void 
ImageRemovedPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(image_id, iter); +} + +void ImageRemovedPayload::dump(Formatter *f) const { + f->dump_string("image_id", image_id); +} + +void UnknownPayload::encode(bufferlist &bl) const { + ceph_abort(); +} + +void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + +void UnknownPayload::dump(Formatter *f) const { +} + +void NotifyMessage::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload); + ENCODE_FINISH(bl); +} + +void NotifyMessage::decode(bufferlist::const_iterator& iter) { + DECODE_START(1, iter); + + uint32_t notify_op; + decode(notify_op, iter); + + // select the correct payload variant based upon the encoded op + switch (notify_op) { + case NOTIFY_OP_IMAGE_ADDED: + payload = ImageAddedPayload(); + break; + case NOTIFY_OP_IMAGE_REMOVED: + payload = ImageRemovedPayload(); + break; + default: + payload = UnknownPayload(); + break; + } + + apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload); + DECODE_FINISH(iter); +} + +void NotifyMessage::dump(Formatter *f) const { + apply_visitor(DumpPayloadVisitor(f), payload); +} + +void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) { + o.push_back(new NotifyMessage{ImageAddedPayload{ + "id", {cls::rbd::TRASH_IMAGE_SOURCE_USER, "name", {}, {}}}}); + o.push_back(new NotifyMessage{ImageRemovedPayload{"id"}}); +} + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op) { + switch (op) { + case NOTIFY_OP_IMAGE_ADDED: + out << "ImageAdded"; + break; + case NOTIFY_OP_IMAGE_REMOVED: + out << "ImageRemoved"; + break; + default: + out << "Unknown (" << static_cast<uint32_t>(op) << ")"; + break; + } + return out; +} + +} // namespace trash_watcher +} // namespace librbd diff --git a/src/librbd/trash_watcher/Types.h b/src/librbd/trash_watcher/Types.h new file mode 100644 index 
00000000..22c2b437 --- /dev/null +++ b/src/librbd/trash_watcher/Types.h @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_TRASH_WATCHER_TYPES_H +#define CEPH_LIBRBD_TRASH_WATCHER_TYPES_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "cls/rbd/cls_rbd_types.h" +#include <iosfwd> +#include <list> +#include <string> +#include <boost/variant.hpp> + + +namespace librbd { +namespace trash_watcher { + +enum NotifyOp { + NOTIFY_OP_IMAGE_ADDED = 0, + NOTIFY_OP_IMAGE_REMOVED = 1 +}; + +struct ImageAddedPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_ADDED; + + std::string image_id; + cls::rbd::TrashImageSpec trash_image_spec; + + ImageAddedPayload() { + } + ImageAddedPayload(const std::string& image_id, + const cls::rbd::TrashImageSpec& trash_image_spec) + : image_id(image_id), trash_image_spec(trash_image_spec) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct ImageRemovedPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_REMOVED; + + std::string image_id; + + ImageRemovedPayload() { + } + ImageRemovedPayload(const std::string& image_id) + : image_id(image_id) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct UnknownPayload { + static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1); + + UnknownPayload() { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +typedef boost::variant<ImageAddedPayload, + ImageRemovedPayload, + UnknownPayload> Payload; + +struct NotifyMessage { + NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) { + } + + Payload payload; + + void 
encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list<NotifyMessage *> &o); +}; + +WRITE_CLASS_ENCODER(NotifyMessage); + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op); + +} // namespace trash_watcher +} // namespace librbd + +using librbd::trash_watcher::encode; +using librbd::trash_watcher::decode; + +#endif // CEPH_LIBRBD_TRASH_WATCHER_TYPES_H diff --git a/src/librbd/watcher/Notifier.cc b/src/librbd/watcher/Notifier.cc new file mode 100644 index 00000000..dfb95aec --- /dev/null +++ b/src/librbd/watcher/Notifier.cc @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/watcher/Notifier.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/watcher/Types.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::watcher::Notifier: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace watcher { + +const uint64_t Notifier::NOTIFY_TIMEOUT = 5000; + +Notifier::C_AioNotify::C_AioNotify(Notifier *notifier, NotifyResponse *response, + Context *on_finish) + : notifier(notifier), response(response), on_finish(on_finish) { +} + +void Notifier::C_AioNotify::finish(int r) { + if (response != nullptr) { + if (r == 0 || r == -ETIMEDOUT) { + try { + auto it = out_bl.cbegin(); + decode(*response, it); + } catch (const buffer::error &err) { + r = -EBADMSG; + } + } + } + notifier->handle_notify(r, on_finish); +} + +Notifier::Notifier(ContextWQ *work_queue, IoCtx &ioctx, const std::string &oid) + : m_work_queue(work_queue), m_ioctx(ioctx), m_oid(oid), + m_aio_notify_lock(util::unique_lock_name( + "librbd::object_watcher::Notifier::m_aio_notify_lock", this)) { + m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); +} + +Notifier::~Notifier() { + Mutex::Locker 
aio_notify_locker(m_aio_notify_lock); + ceph_assert(m_pending_aio_notifies == 0); +} + +void Notifier::flush(Context *on_finish) { + Mutex::Locker aio_notify_locker(m_aio_notify_lock); + if (m_pending_aio_notifies == 0) { + m_work_queue->queue(on_finish, 0); + return; + } + + m_aio_notify_flush_ctxs.push_back(on_finish); +} + +void Notifier::notify(bufferlist &bl, NotifyResponse *response, + Context *on_finish) { + { + Mutex::Locker aio_notify_locker(m_aio_notify_lock); + ++m_pending_aio_notifies; + + ldout(m_cct, 20) << "pending=" << m_pending_aio_notifies << dendl; + } + + C_AioNotify *ctx = new C_AioNotify(this, response, on_finish); + librados::AioCompletion *comp = util::create_rados_callback(ctx); + int r = m_ioctx.aio_notify(m_oid, comp, bl, NOTIFY_TIMEOUT, &ctx->out_bl); + ceph_assert(r == 0); + comp->release(); +} + +void Notifier::handle_notify(int r, Context *on_finish) { + ldout(m_cct, 20) << "r=" << r << dendl; + + Mutex::Locker aio_notify_locker(m_aio_notify_lock); + ceph_assert(m_pending_aio_notifies > 0); + --m_pending_aio_notifies; + + ldout(m_cct, 20) << "pending=" << m_pending_aio_notifies << dendl; + if (m_pending_aio_notifies == 0) { + for (auto ctx : m_aio_notify_flush_ctxs) { + m_work_queue->queue(ctx, 0); + } + m_aio_notify_flush_ctxs.clear(); + } + + if (on_finish != nullptr) { + m_work_queue->queue(on_finish, r); + } +} + +} // namespace watcher +} // namespace librbd diff --git a/src/librbd/watcher/Notifier.h b/src/librbd/watcher/Notifier.h new file mode 100644 index 00000000..8b0ad37b --- /dev/null +++ b/src/librbd/watcher/Notifier.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_WATCHER_NOTIFIER_H +#define CEPH_LIBRBD_WATCHER_NOTIFIER_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "include/rados/librados.hpp" +#include "common/Mutex.h" +#include "common/WorkQueue.h" +#include <list> + 
namespace librbd {

namespace watcher {

struct NotifyResponse;

// Sends asynchronous notifications on a single RADOS object and tracks how
// many are still in flight so that callers can wait for them via flush().
class Notifier {
public:
  // Timeout value handed to IoCtx::aio_notify for every notification.
  static const uint64_t NOTIFY_TIMEOUT;

  Notifier(ContextWQ *work_queue, librados::IoCtx &ioctx,
           const std::string &oid);
  ~Notifier();

  // on_finish is completed once no notifications are pending.
  void flush(Context *on_finish);
  // response and on_finish may both be null.
  void notify(bufferlist &bl, NotifyResponse *response, Context *on_finish);

private:
  typedef std::list<Context*> Contexts;

  // Per-notification completion context; owns the reply buffer.
  struct C_AioNotify : public Context {
    Notifier *notifier;
    NotifyResponse *response;
    Context *on_finish;
    bufferlist out_bl;

    C_AioNotify(Notifier *notifier, NotifyResponse *response,
                Context *on_finish);

    void finish(int r) override;
  };

  ContextWQ *m_work_queue;
  librados::IoCtx &m_ioctx;
  CephContext *m_cct;
  std::string m_oid;

  // guards the pending counter and the list of waiting flush contexts
  Mutex m_aio_notify_lock;
  size_t m_pending_aio_notifies = 0;
  Contexts m_aio_notify_flush_ctxs;

  void handle_notify(int r, Context *on_finish);

};

} // namespace watcher
} // namespace librbd

#endif // CEPH_LIBRBD_WATCHER_NOTIFIER_H

// diff --git a/src/librbd/watcher/RewatchRequest.cc b/src/librbd/watcher/RewatchRequest.cc
// new file mode 100644
// index 00000000..40c3dfe7
// --- /dev/null
// +++ b/src/librbd/watcher/RewatchRequest.cc
// @@ -0,0 +1,108 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/watcher/RewatchRequest.h"
#include "common/RWLock.h"
#include "common/errno.h"
#include "librbd/Utils.h"

#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
#define dout_prefix *_dout << "librbd::watcher::RewatchRequest: " \
                           << this << " " << __func__ << " "

namespace librbd {

using util::create_context_callback;
using util::create_rados_callback;

namespace watcher {

using std::string;

// Stores the arguments only; nothing is issued until send().
RewatchRequest::RewatchRequest(librados::IoCtx& ioctx, const string& oid,
                               RWLock &watch_lock,
                               librados::WatchCtx2 *watch_ctx,
                               uint64_t *watch_handle, Context *on_finish)
  : m_ioctx(ioctx), m_oid(oid), m_watch_lock(watch_lock),
    m_watch_ctx(watch_ctx), m_watch_handle(watch_handle),
    m_on_finish(on_finish) {
}

// Entry point: starts with the unwatch step.
void RewatchRequest::send() {
  unwatch();
}

// Drop the existing watch, if any.  The caller must hold the watch lock for
// write (asserted).  A zero handle means there is nothing to unwatch, so the
// request goes straight to rewatch().
void RewatchRequest::unwatch() {
  ceph_assert(m_watch_lock.is_wlocked());
  if (*m_watch_handle == 0) {
    rewatch();
    return;
  }

  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
  ldout(cct, 10) << dendl;

  // clear the shared handle before the unwatch is issued; the old value is
  // only kept locally for the aio_unwatch call
  uint64_t watch_handle = 0;
  std::swap(*m_watch_handle, watch_handle);

  librados::AioCompletion *aio_comp = create_rados_callback<
                        RewatchRequest, &RewatchRequest::handle_unwatch>(this);
  int r = m_ioctx.aio_unwatch(watch_handle, aio_comp);
  ceph_assert(r == 0);
  aio_comp->release();
}

// A blacklisted client ends the request with that error; any other unwatch
// failure is logged and the request still proceeds to rewatch().
void RewatchRequest::handle_unwatch(int r) {
  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
  ldout(cct, 10) << "r=" << r << dendl;

  if (r == -EBLACKLISTED) {
    lderr(cct) << "client blacklisted" << dendl;
    finish(r);
    return;
  } else if (r < 0) {
    lderr(cct) << "failed to unwatch: " << cpp_strerror(r) << dendl;
  }
  rewatch();
}

// Register a new watch on m_oid; the new handle lands in m_rewatch_handle and
// is published to the caller's handle only in handle_rewatch().
void RewatchRequest::rewatch() {
  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
  ldout(cct, 10) << dendl;

  librados::AioCompletion *aio_comp = create_rados_callback<
                        RewatchRequest, &RewatchRequest::handle_rewatch>(this);
  int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_rewatch_handle, m_watch_ctx);
  ceph_assert(r == 0);
  aio_comp->release();
}

// Publish the outcome: on failure the handle is reset to zero, then the value
// is written to *m_watch_handle under the write lock and the request finishes
// with r (no retry is attempted here).
void RewatchRequest::handle_rewatch(int r) {
  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
  ldout(cct, 10) << "r=" << r << dendl;
  if (r < 0) {
    lderr(cct) << "failed to watch object: " << cpp_strerror(r)
               << dendl;
    m_rewatch_handle = 0;
  }

  {
    RWLock::WLocker watch_locker(m_watch_lock);
    *m_watch_handle = m_rewatch_handle;
  }

  finish(r);
}

// Completes the caller's context and destroys this request object.
void RewatchRequest::finish(int r) {
  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
  ldout(cct, 10) << "r=" << r << dendl;

  m_on_finish->complete(r);
  delete this;
}

}
// namespace watcher +} // namespace librbd + diff --git a/src/librbd/watcher/RewatchRequest.h b/src/librbd/watcher/RewatchRequest.h new file mode 100644 index 00000000..d4fc250a --- /dev/null +++ b/src/librbd/watcher/RewatchRequest.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H +#define CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" + +struct Context; +struct RWLock; + +namespace librbd { + +namespace watcher { + +class RewatchRequest { +public: + + static RewatchRequest *create(librados::IoCtx& ioctx, const std::string& oid, + RWLock &watch_lock, + librados::WatchCtx2 *watch_ctx, + uint64_t *watch_handle, Context *on_finish) { + return new RewatchRequest(ioctx, oid, watch_lock, watch_ctx, watch_handle, + on_finish); + } + + RewatchRequest(librados::IoCtx& ioctx, const std::string& oid, + RWLock &watch_lock, librados::WatchCtx2 *watch_ctx, + uint64_t *watch_handle, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * UNWATCH + * | + * | . . . . + * | . . (recoverable error) + * v v . + * REWATCH . . . 
+ * | + * v + * <finish> + * + * @endverbatim + */ + + librados::IoCtx& m_ioctx; + std::string m_oid; + RWLock &m_watch_lock; + librados::WatchCtx2 *m_watch_ctx; + uint64_t *m_watch_handle; + Context *m_on_finish; + + uint64_t m_rewatch_handle = 0; + + void unwatch(); + void handle_unwatch(int r); + + void rewatch(); + void handle_rewatch(int r); + + void finish(int r); +}; + +} // namespace watcher +} // namespace librbd + +#endif // CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H diff --git a/src/librbd/watcher/Types.cc b/src/librbd/watcher/Types.cc new file mode 100644 index 00000000..8f1991d7 --- /dev/null +++ b/src/librbd/watcher/Types.cc @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/watcher/Types.h" +#include "common/Formatter.h" + +namespace librbd { +namespace watcher { + +void ClientId::encode(bufferlist &bl) const { + using ceph::encode; + encode(gid, bl); + encode(handle, bl); +} + +void ClientId::decode(bufferlist::const_iterator &iter) { + using ceph::decode; + decode(gid, iter); + decode(handle, iter); +} + +void ClientId::dump(Formatter *f) const { + f->dump_unsigned("gid", gid); + f->dump_unsigned("handle", handle); +} + +void NotifyResponse::encode(bufferlist& bl) const { + using ceph::encode; + encode(acks, bl); + encode(timeouts, bl); +} + +void NotifyResponse::decode(bufferlist::const_iterator& iter) { + using ceph::decode; + decode(acks, iter); + decode(timeouts, iter); +} +std::ostream &operator<<(std::ostream &out, + const ClientId &client_id) { + out << "[" << client_id.gid << "," << client_id.handle << "]"; + return out; +} + +} // namespace watcher +} // namespace librbd diff --git a/src/librbd/watcher/Types.h b/src/librbd/watcher/Types.h new file mode 100644 index 00000000..d1517fb0 --- /dev/null +++ b/src/librbd/watcher/Types.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef 
CEPH_LIBRBD_WATCHER_TYPES_H +#define CEPH_LIBRBD_WATCHER_TYPES_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/encoding.h" + +namespace ceph { class Formatter; } + +namespace librbd { + +class Watcher; + +namespace watcher { + +struct ClientId { + uint64_t gid; + uint64_t handle; + + ClientId() : gid(0), handle(0) {} + ClientId(uint64_t gid, uint64_t handle) : gid(gid), handle(handle) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + inline bool is_valid() const { + return (*this != ClientId()); + } + + inline bool operator==(const ClientId &rhs) const { + return (gid == rhs.gid && handle == rhs.handle); + } + inline bool operator!=(const ClientId &rhs) const { + return !(*this == rhs); + } + inline bool operator<(const ClientId &rhs) const { + if (gid != rhs.gid) { + return gid < rhs.gid; + } else { + return handle < rhs.handle; + } + } +}; + +struct NotifyResponse { + std::map<ClientId, bufferlist> acks; + std::vector<ClientId> timeouts; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); +}; + +template <typename ImageCtxT> +struct Traits { + typedef librbd::Watcher Watcher; +}; + +std::ostream &operator<<(std::ostream &out, + const ClientId &client); + +WRITE_CLASS_ENCODER(ClientId); +WRITE_CLASS_ENCODER(NotifyResponse); + +} // namespace watcher +} // namespace librbd + +#endif // CEPH_LIBRBD_WATCHER_TYPES_H diff --git a/src/librbd/watcher/Utils.h b/src/librbd/watcher/Utils.h new file mode 100644 index 00000000..d2510aaf --- /dev/null +++ b/src/librbd/watcher/Utils.h @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_WATCHER_UTILS_H +#define CEPH_LIBRBD_WATCHER_UTILS_H + +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "include/Context.h" +#include "librbd/Watcher.h" + +namespace ceph { class 
Formatter; } + +namespace librbd { +namespace watcher { +namespace util { + +template <typename Watcher> +struct HandlePayloadVisitor : public boost::static_visitor<void> { + Watcher *watcher; + uint64_t notify_id; + uint64_t handle; + + HandlePayloadVisitor(Watcher *watcher_, uint64_t notify_id_, + uint64_t handle_) + : watcher(watcher_), notify_id(notify_id_), handle(handle_) + { + } + + template <typename P> + inline void operator()(const P &payload) const { + typename Watcher::C_NotifyAck *ctx = + new typename Watcher::C_NotifyAck(watcher, notify_id, handle); + if (watcher->handle_payload(payload, ctx)) { + ctx->complete(0); + } + } +}; + +class EncodePayloadVisitor : public boost::static_visitor<void> { +public: + explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {} + + template <typename P> + inline void operator()(const P &payload) const { + using ceph::encode; + encode(static_cast<uint32_t>(P::NOTIFY_OP), m_bl); + payload.encode(m_bl); + } + +private: + bufferlist &m_bl; +}; + +class DecodePayloadVisitor : public boost::static_visitor<void> { +public: + DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter) + : m_version(version), m_iter(iter) {} + + template <typename P> + inline void operator()(P &payload) const { + payload.decode(m_version, m_iter); + } + +private: + __u8 m_version; + bufferlist::const_iterator &m_iter; +}; + +} // namespace util +} // namespace watcher +} // namespace librbd + +#endif // CEPH_LIBRBD_WATCHER_UTILS_H |