summaryrefslogtreecommitdiffstats
path: root/src/librbd
diff options
context:
space:
mode:
Diffstat (limited to 'src/librbd')
-rw-r--r--src/librbd/AsyncObjectThrottle.cc108
-rw-r--r--src/librbd/AsyncObjectThrottle.h79
-rw-r--r--src/librbd/AsyncRequest.cc71
-rw-r--r--src/librbd/AsyncRequest.h76
-rw-r--r--src/librbd/BlockGuard.h174
-rw-r--r--src/librbd/CMakeLists.txt197
-rw-r--r--src/librbd/ConfigWatcher.cc116
-rw-r--r--src/librbd/ConfigWatcher.h47
-rw-r--r--src/librbd/DeepCopyRequest.cc362
-rw-r--r--src/librbd/DeepCopyRequest.h136
-rw-r--r--src/librbd/ExclusiveLock.cc374
-rw-r--r--src/librbd/ExclusiveLock.h108
-rw-r--r--src/librbd/Features.cc107
-rw-r--r--src/librbd/Features.h16
-rw-r--r--src/librbd/ImageCtx.cc927
-rw-r--r--src/librbd/ImageCtx.h331
-rw-r--r--src/librbd/ImageState.cc741
-rw-r--r--src/librbd/ImageState.h145
-rw-r--r--src/librbd/ImageWatcher.cc1126
-rw-r--r--src/librbd/ImageWatcher.h268
-rw-r--r--src/librbd/Journal.cc1780
-rw-r--r--src/librbd/Journal.h386
-rw-r--r--src/librbd/LibrbdAdminSocketHook.cc103
-rw-r--r--src/librbd/LibrbdAdminSocketHook.h32
-rw-r--r--src/librbd/LibrbdWriteback.cc270
-rw-r--r--src/librbd/LibrbdWriteback.h72
-rw-r--r--src/librbd/ManagedLock.cc852
-rw-r--r--src/librbd/ManagedLock.h270
-rw-r--r--src/librbd/MirroringWatcher.cc142
-rw-r--r--src/librbd/MirroringWatcher.h66
-rw-r--r--src/librbd/ObjectMap.cc364
-rw-r--r--src/librbd/ObjectMap.h158
-rw-r--r--src/librbd/Operations.cc1793
-rw-r--r--src/librbd/Operations.h129
-rw-r--r--src/librbd/TaskFinisher.h159
-rw-r--r--src/librbd/TrashWatcher.cc115
-rw-r--r--src/librbd/TrashWatcher.h57
-rw-r--r--src/librbd/Types.h124
-rw-r--r--src/librbd/Utils.cc145
-rw-r--r--src/librbd/Utils.h221
-rw-r--r--src/librbd/WatchNotifyTypes.cc525
-rw-r--r--src/librbd/WatchNotifyTypes.h400
-rw-r--r--src/librbd/Watcher.cc367
-rw-r--r--src/librbd/Watcher.h184
-rw-r--r--src/librbd/api/Config.cc240
-rw-r--r--src/librbd/api/Config.h37
-rw-r--r--src/librbd/api/DiffIterate.cc554
-rw-r--r--src/librbd/api/DiffIterate.h66
-rw-r--r--src/librbd/api/Group.cc1207
-rw-r--r--src/librbd/api/Group.h59
-rw-r--r--src/librbd/api/Image.cc836
-rw-r--r--src/librbd/api/Image.h78
-rw-r--r--src/librbd/api/Migration.cc1885
-rw-r--r--src/librbd/api/Migration.h105
-rw-r--r--src/librbd/api/Mirror.cc1541
-rw-r--r--src/librbd/api/Mirror.h87
-rw-r--r--src/librbd/api/Namespace.cc227
-rw-r--r--src/librbd/api/Namespace.h33
-rw-r--r--src/librbd/api/Pool.cc377
-rw-r--r--src/librbd/api/Pool.h38
-rw-r--r--src/librbd/api/PoolMetadata.cc151
-rw-r--r--src/librbd/api/PoolMetadata.h36
-rw-r--r--src/librbd/api/Snapshot.cc196
-rw-r--r--src/librbd/api/Snapshot.h37
-rw-r--r--src/librbd/api/Trash.cc681
-rw-r--r--src/librbd/api/Trash.h53
-rw-r--r--src/librbd/cache/ImageCache.h56
-rw-r--r--src/librbd/cache/ImageWriteback.cc127
-rw-r--r--src/librbd/cache/ImageWriteback.h54
-rw-r--r--src/librbd/cache/ObjectCacherObjectDispatch.cc408
-rw-r--r--src/librbd/cache/ObjectCacherObjectDispatch.h112
-rw-r--r--src/librbd/cache/PassthroughImageCache.cc135
-rw-r--r--src/librbd/cache/PassthroughImageCache.h59
-rw-r--r--src/librbd/deep_copy/ImageCopyRequest.cc187
-rw-r--r--src/librbd/deep_copy/ImageCopyRequest.h109
-rw-r--r--src/librbd/deep_copy/MetadataCopyRequest.cc123
-rw-r--r--src/librbd/deep_copy/MetadataCopyRequest.h77
-rw-r--r--src/librbd/deep_copy/ObjectCopyRequest.cc998
-rw-r--r--src/librbd/deep_copy/ObjectCopyRequest.h210
-rw-r--r--src/librbd/deep_copy/SetHeadRequest.cc224
-rw-r--r--src/librbd/deep_copy/SetHeadRequest.h87
-rw-r--r--src/librbd/deep_copy/SnapshotCopyRequest.cc730
-rw-r--r--src/librbd/deep_copy/SnapshotCopyRequest.h148
-rw-r--r--src/librbd/deep_copy/SnapshotCreateRequest.cc187
-rw-r--r--src/librbd/deep_copy/SnapshotCreateRequest.h95
-rw-r--r--src/librbd/deep_copy/Types.h22
-rw-r--r--src/librbd/deep_copy/Utils.cc61
-rw-r--r--src/librbd/deep_copy/Utils.h30
-rw-r--r--src/librbd/exclusive_lock/AutomaticPolicy.cc29
-rw-r--r--src/librbd/exclusive_lock/AutomaticPolicy.h34
-rw-r--r--src/librbd/exclusive_lock/Policy.h30
-rw-r--r--src/librbd/exclusive_lock/PostAcquireRequest.cc308
-rw-r--r--src/librbd/exclusive_lock/PostAcquireRequest.h111
-rw-r--r--src/librbd/exclusive_lock/PreAcquireRequest.cc94
-rw-r--r--src/librbd/exclusive_lock/PreAcquireRequest.h75
-rw-r--r--src/librbd/exclusive_lock/PreReleaseRequest.cc299
-rw-r--r--src/librbd/exclusive_lock/PreReleaseRequest.h117
-rw-r--r--src/librbd/exclusive_lock/StandardPolicy.cc27
-rw-r--r--src/librbd/exclusive_lock/StandardPolicy.h34
-rw-r--r--src/librbd/image/AttachChildRequest.cc261
-rw-r--r--src/librbd/image/AttachChildRequest.h105
-rw-r--r--src/librbd/image/AttachParentRequest.cc91
-rw-r--r--src/librbd/image/AttachParentRequest.h79
-rw-r--r--src/librbd/image/CloneRequest.cc647
-rw-r--r--src/librbd/image/CloneRequest.h178
-rw-r--r--src/librbd/image/CloseRequest.cc342
-rw-r--r--src/librbd/image/CloseRequest.h126
-rw-r--r--src/librbd/image/CreateRequest.cc830
-rw-r--r--src/librbd/image/CreateRequest.h186
-rw-r--r--src/librbd/image/DetachChildRequest.cc303
-rw-r--r--src/librbd/image/DetachChildRequest.h108
-rw-r--r--src/librbd/image/DetachParentRequest.cc82
-rw-r--r--src/librbd/image/DetachParentRequest.h66
-rw-r--r--src/librbd/image/ListWatchersRequest.cc166
-rw-r--r--src/librbd/image/ListWatchersRequest.h81
-rw-r--r--src/librbd/image/OpenRequest.cc662
-rw-r--r--src/librbd/image/OpenRequest.h141
-rw-r--r--src/librbd/image/PreRemoveRequest.cc321
-rw-r--r--src/librbd/image/PreRemoveRequest.h99
-rw-r--r--src/librbd/image/RefreshParentRequest.cc245
-rw-r--r--src/librbd/image/RefreshParentRequest.h109
-rw-r--r--src/librbd/image/RefreshRequest.cc1551
-rw-r--r--src/librbd/image/RefreshRequest.h270
-rw-r--r--src/librbd/image/RemoveRequest.cc605
-rw-r--r--src/librbd/image/RemoveRequest.h198
-rw-r--r--src/librbd/image/SetFlagsRequest.cc78
-rw-r--r--src/librbd/image/SetFlagsRequest.h62
-rw-r--r--src/librbd/image/SetSnapRequest.cc367
-rw-r--r--src/librbd/image/SetSnapRequest.h118
-rw-r--r--src/librbd/image/TypeTraits.h20
-rw-r--r--src/librbd/image/ValidatePoolRequest.cc235
-rw-r--r--src/librbd/image/ValidatePoolRequest.h96
-rw-r--r--src/librbd/image_watcher/NotifyLockOwner.cc95
-rw-r--r--src/librbd/image_watcher/NotifyLockOwner.h50
-rw-r--r--src/librbd/internal.cc2021
-rw-r--r--src/librbd/internal.h155
-rw-r--r--src/librbd/io/AioCompletion.cc216
-rw-r--r--src/librbd/io/AioCompletion.h203
-rw-r--r--src/librbd/io/AsyncOperation.cc94
-rw-r--r--src/librbd/io/AsyncOperation.h52
-rw-r--r--src/librbd/io/CopyupRequest.cc625
-rw-r--r--src/librbd/io/CopyupRequest.h135
-rw-r--r--src/librbd/io/ImageDispatchSpec.cc154
-rw-r--r--src/librbd/io/ImageDispatchSpec.h182
-rw-r--r--src/librbd/io/ImageRequest.cc824
-rw-r--r--src/librbd/io/ImageRequest.h365
-rw-r--r--src/librbd/io/ImageRequestWQ.cc1043
-rw-r--r--src/librbd/io/ImageRequestWQ.h154
-rw-r--r--src/librbd/io/ObjectDispatch.cc135
-rw-r--r--src/librbd/io/ObjectDispatch.h104
-rw-r--r--src/librbd/io/ObjectDispatchInterface.h86
-rw-r--r--src/librbd/io/ObjectDispatchSpec.cc46
-rw-r--r--src/librbd/io/ObjectDispatchSpec.h266
-rw-r--r--src/librbd/io/ObjectDispatcher.cc354
-rw-r--r--src/librbd/io/ObjectDispatcher.h89
-rw-r--r--src/librbd/io/ObjectRequest.cc729
-rw-r--r--src/librbd/io/ObjectRequest.h478
-rw-r--r--src/librbd/io/ReadResult.cc170
-rw-r--r--src/librbd/io/ReadResult.h106
-rw-r--r--src/librbd/io/Types.h83
-rw-r--r--src/librbd/io/Utils.cc57
-rw-r--r--src/librbd/io/Utils.h26
-rw-r--r--src/librbd/journal/CreateRequest.cc235
-rw-r--r--src/librbd/journal/CreateRequest.h106
-rw-r--r--src/librbd/journal/DemoteRequest.cc255
-rw-r--r--src/librbd/journal/DemoteRequest.h107
-rw-r--r--src/librbd/journal/DisabledPolicy.h31
-rw-r--r--src/librbd/journal/ObjectDispatch.cc213
-rw-r--r--src/librbd/journal/ObjectDispatch.h113
-rw-r--r--src/librbd/journal/OpenRequest.cc144
-rw-r--r--src/librbd/journal/OpenRequest.h85
-rw-r--r--src/librbd/journal/Policy.h25
-rw-r--r--src/librbd/journal/PromoteRequest.cc237
-rw-r--r--src/librbd/journal/PromoteRequest.h109
-rw-r--r--src/librbd/journal/RemoveRequest.cc154
-rw-r--r--src/librbd/journal/RemoveRequest.h82
-rw-r--r--src/librbd/journal/Replay.cc1190
-rw-r--r--src/librbd/journal/Replay.h210
-rw-r--r--src/librbd/journal/ResetRequest.cc162
-rw-r--r--src/librbd/journal/ResetRequest.h111
-rw-r--r--src/librbd/journal/StandardPolicy.cc32
-rw-r--r--src/librbd/journal/StandardPolicy.h38
-rw-r--r--src/librbd/journal/TypeTraits.h26
-rw-r--r--src/librbd/journal/Types.cc950
-rw-r--r--src/librbd/journal/Types.h685
-rw-r--r--src/librbd/journal/Utils.cc86
-rw-r--r--src/librbd/journal/Utils.h81
-rw-r--r--src/librbd/librbd.cc6356
-rw-r--r--src/librbd/managed_lock/AcquireRequest.cc181
-rw-r--r--src/librbd/managed_lock/AcquireRequest.h101
-rw-r--r--src/librbd/managed_lock/BreakRequest.cc235
-rw-r--r--src/librbd/managed_lock/BreakRequest.h111
-rw-r--r--src/librbd/managed_lock/GetLockerRequest.cc131
-rw-r--r--src/librbd/managed_lock/GetLockerRequest.h58
-rw-r--r--src/librbd/managed_lock/ReacquireRequest.cc79
-rw-r--r--src/librbd/managed_lock/ReacquireRequest.h69
-rw-r--r--src/librbd/managed_lock/ReleaseRequest.cc94
-rw-r--r--src/librbd/managed_lock/ReleaseRequest.h71
-rw-r--r--src/librbd/managed_lock/Types.h46
-rw-r--r--src/librbd/managed_lock/Utils.cc43
-rw-r--r--src/librbd/managed_lock/Utils.h23
-rw-r--r--src/librbd/mirror/DemoteRequest.cc198
-rw-r--r--src/librbd/mirror/DemoteRequest.h85
-rw-r--r--src/librbd/mirror/DisableRequest.cc492
-rw-r--r--src/librbd/mirror/DisableRequest.h141
-rw-r--r--src/librbd/mirror/EnableRequest.cc190
-rw-r--r--src/librbd/mirror/EnableRequest.h96
-rw-r--r--src/librbd/mirror/GetInfoRequest.cc115
-rw-r--r--src/librbd/mirror/GetInfoRequest.h81
-rw-r--r--src/librbd/mirror/GetStatusRequest.cc112
-rw-r--r--src/librbd/mirror/GetStatusRequest.h85
-rw-r--r--src/librbd/mirror/PromoteRequest.cc103
-rw-r--r--src/librbd/mirror/PromoteRequest.h75
-rw-r--r--src/librbd/mirror/Types.h20
-rw-r--r--src/librbd/mirroring_watcher/Types.cc136
-rw-r--r--src/librbd/mirroring_watcher/Types.h102
-rw-r--r--src/librbd/object_map/CreateRequest.cc94
-rw-r--r--src/librbd/object_map/CreateRequest.h60
-rw-r--r--src/librbd/object_map/InvalidateRequest.cc83
-rw-r--r--src/librbd/object_map/InvalidateRequest.h49
-rw-r--r--src/librbd/object_map/LockRequest.cc157
-rw-r--r--src/librbd/object_map/LockRequest.h75
-rw-r--r--src/librbd/object_map/RefreshRequest.cc303
-rw-r--r--src/librbd/object_map/RefreshRequest.h96
-rw-r--r--src/librbd/object_map/RemoveRequest.cc89
-rw-r--r--src/librbd/object_map/RemoveRequest.h62
-rw-r--r--src/librbd/object_map/Request.cc70
-rw-r--r--src/librbd/object_map/Request.h62
-rw-r--r--src/librbd/object_map/ResizeRequest.cc66
-rw-r--r--src/librbd/object_map/ResizeRequest.h48
-rw-r--r--src/librbd/object_map/SnapshotCreateRequest.cc148
-rw-r--r--src/librbd/object_map/SnapshotCreateRequest.h76
-rw-r--r--src/librbd/object_map/SnapshotRemoveRequest.cc227
-rw-r--r--src/librbd/object_map/SnapshotRemoveRequest.h85
-rw-r--r--src/librbd/object_map/SnapshotRollbackRequest.cc131
-rw-r--r--src/librbd/object_map/SnapshotRollbackRequest.h74
-rw-r--r--src/librbd/object_map/UnlockRequest.cc66
-rw-r--r--src/librbd/object_map/UnlockRequest.h47
-rw-r--r--src/librbd/object_map/UpdateRequest.cc129
-rw-r--r--src/librbd/object_map/UpdateRequest.h101
-rw-r--r--src/librbd/operation/DisableFeaturesRequest.cc646
-rw-r--r--src/librbd/operation/DisableFeaturesRequest.h171
-rw-r--r--src/librbd/operation/EnableFeaturesRequest.cc489
-rw-r--r--src/librbd/operation/EnableFeaturesRequest.h135
-rw-r--r--src/librbd/operation/FlattenRequest.cc230
-rw-r--r--src/librbd/operation/FlattenRequest.h77
-rw-r--r--src/librbd/operation/MetadataRemoveRequest.cc60
-rw-r--r--src/librbd/operation/MetadataRemoveRequest.h44
-rw-r--r--src/librbd/operation/MetadataSetRequest.cc62
-rw-r--r--src/librbd/operation/MetadataSetRequest.h47
-rw-r--r--src/librbd/operation/MigrateRequest.cc235
-rw-r--r--src/librbd/operation/MigrateRequest.h69
-rw-r--r--src/librbd/operation/ObjectMapIterate.cc310
-rw-r--r--src/librbd/operation/ObjectMapIterate.h65
-rw-r--r--src/librbd/operation/RebuildObjectMapRequest.cc248
-rw-r--r--src/librbd/operation/RebuildObjectMapRequest.h84
-rw-r--r--src/librbd/operation/RenameRequest.cc212
-rw-r--r--src/librbd/operation/RenameRequest.h89
-rw-r--r--src/librbd/operation/Request.cc183
-rw-r--r--src/librbd/operation/Request.h108
-rw-r--r--src/librbd/operation/ResizeRequest.cc467
-rw-r--r--src/librbd/operation/ResizeRequest.h156
-rw-r--r--src/librbd/operation/SnapshotCreateRequest.cc342
-rw-r--r--src/librbd/operation/SnapshotCreateRequest.h126
-rw-r--r--src/librbd/operation/SnapshotLimitRequest.cc67
-rw-r--r--src/librbd/operation/SnapshotLimitRequest.h44
-rw-r--r--src/librbd/operation/SnapshotProtectRequest.cc119
-rw-r--r--src/librbd/operation/SnapshotProtectRequest.h68
-rw-r--r--src/librbd/operation/SnapshotRemoveRequest.cc469
-rw-r--r--src/librbd/operation/SnapshotRemoveRequest.h122
-rw-r--r--src/librbd/operation/SnapshotRenameRequest.cc104
-rw-r--r--src/librbd/operation/SnapshotRenameRequest.h63
-rw-r--r--src/librbd/operation/SnapshotRollbackRequest.cc412
-rw-r--r--src/librbd/operation/SnapshotRollbackRequest.h122
-rw-r--r--src/librbd/operation/SnapshotUnprotectRequest.cc354
-rw-r--r--src/librbd/operation/SnapshotUnprotectRequest.h94
-rw-r--r--src/librbd/operation/SparsifyRequest.cc519
-rw-r--r--src/librbd/operation/SparsifyRequest.h64
-rw-r--r--src/librbd/operation/TrimRequest.cc377
-rw-r--r--src/librbd/operation/TrimRequest.h107
-rw-r--r--src/librbd/trash/MoveRequest.cc124
-rw-r--r--src/librbd/trash/MoveRequest.h87
-rw-r--r--src/librbd/trash/RemoveRequest.cc171
-rw-r--r--src/librbd/trash/RemoveRequest.h119
-rw-r--r--src/librbd/trash_watcher/Types.cc130
-rw-r--r--src/librbd/trash_watcher/Types.h97
-rw-r--r--src/librbd/watcher/Notifier.cc98
-rw-r--r--src/librbd/watcher/Notifier.h63
-rw-r--r--src/librbd/watcher/RewatchRequest.cc108
-rw-r--r--src/librbd/watcher/RewatchRequest.h75
-rw-r--r--src/librbd/watcher/Types.cc45
-rw-r--r--src/librbd/watcher/Types.h71
-rw-r--r--src/librbd/watcher/Utils.h74
293 files changed, 72585 insertions, 0 deletions
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
new file mode 100644
index 00000000..b6dbcb26
--- /dev/null
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "librbd/AsyncObjectThrottle.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+namespace librbd
+{
+
+// Stores the throttle parameters. No operation is started here; the object
+// range [object_no, end_object_no) is only walked once start_ops() is called.
+template <typename T>
+AsyncObjectThrottle<T>::AsyncObjectThrottle(
+    const AsyncRequest<T>* async_request,
+    T &image_ctx,
+    const ContextFactory& context_factory,
+    Context *ctx,
+    ProgressContext *prog_ctx,
+    uint64_t object_no,
+    uint64_t end_object_no)
+  : m_lock(util::unique_lock_name("librbd::AsyncThrottle::m_lock", this)),
+    m_async_request(async_request),
+    m_image_ctx(image_ctx),
+    m_context_factory(context_factory),
+    m_ctx(ctx),
+    m_prog_ctx(prog_ctx),
+    m_object_no(object_no),
+    m_end_object_no(end_object_no),
+    m_current_ops(0),
+    m_ret(0)
+{
+}
+
+// Kicks off up to max_concurrent operations. The caller must hold the image's
+// owner_lock. If nothing ends up in flight, the completion context is queued
+// on the op work queue with the accumulated result and the throttle deletes
+// itself.
+template <typename T>
+void AsyncObjectThrottle<T>::start_ops(uint64_t max_concurrent) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+  bool nothing_in_flight = false;
+  {
+    Mutex::Locker throttle_locker(m_lock);
+    uint64_t started = 0;
+    while (started < max_concurrent) {
+      start_next_op();
+      // an error occurred and no op is pending: stop issuing
+      if (m_current_ops == 0 && m_ret < 0) {
+        break;
+      }
+      ++started;
+    }
+    nothing_in_flight = (m_current_ops == 0);
+  }
+
+  if (!nothing_in_flight) {
+    return;
+  }
+
+  // avoid re-entrant callback
+  m_image_ctx.op_work_queue->queue(m_ctx, m_ret);
+  delete this;
+}
+
+// Invoked when one in-flight operation finishes with result r. Records the
+// first error (ignoring -ENOENT), tries to start a replacement operation and,
+// once no operation remains in flight, completes m_ctx with the accumulated
+// result and deletes the throttle.
+template <typename T>
+void AsyncObjectThrottle<T>::finish_op(int r) {
+  bool last_op_done;
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    Mutex::Locker locker(m_lock);
+
+    --m_current_ops;
+
+    const bool is_error = (r < 0 && r != -ENOENT);
+    if (is_error && m_ret == 0) {
+      m_ret = r;
+    }
+
+    start_next_op();
+    last_op_done = (m_current_ops == 0);
+  }
+
+  if (!last_op_done) {
+    return;
+  }
+
+  m_ctx->complete(m_ret);
+  delete this;
+}
+
+// Issues operations for successive object numbers until one is left in
+// flight, the object range is exhausted, the owning request is canceled, or
+// an error is recorded in m_ret.
+//
+// send() result handling: <0 records the error and stops, >0 means the op
+// completed immediately (its context is deleted and the next object is
+// tried), 0 means the op is now in flight.
+template <typename T>
+void AsyncObjectThrottle<T>::start_next_op() {
+  for (;;) {
+    const bool canceled =
+      (m_async_request != nullptr && m_async_request->is_canceled());
+    if (canceled && m_ret == 0) {
+      // allow in-flight ops to complete, but don't start new ops
+      m_ret = -ERESTART;
+      return;
+    }
+    if (m_ret != 0 || m_object_no >= m_end_object_no) {
+      return;
+    }
+
+    const uint64_t object_no = m_object_no++;
+    C_AsyncObjectThrottle<T> *op_ctx = m_context_factory(*this, object_no);
+
+    const int send_r = op_ctx->send();
+    if (send_r < 0) {
+      m_ret = send_r;
+      delete op_ctx;
+      return;
+    }
+
+    const bool in_flight = (send_r == 0);
+    if (in_flight) {
+      ++m_current_ops;
+    } else {
+      // op completed immediately
+      delete op_ctx;
+    }
+
+    if (m_prog_ctx != nullptr) {
+      const int progress_r =
+        m_prog_ctx->update_progress(object_no, m_end_object_no);
+      if (progress_r < 0) {
+        m_ret = progress_r;
+      }
+    }
+
+    if (in_flight) {
+      return;
+    }
+  }
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h
new file mode 100644
index 00000000..e1b08962
--- /dev/null
+++ b/src/librbd/AsyncObjectThrottle.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
+#define CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+
+#include <boost/function.hpp>
+
+namespace librbd
+{
+template <typename ImageCtxT> class AsyncRequest;
+class ProgressContext;
+struct ImageCtx;
+
+// Callback interface through which a per-object context reports that its
+// operation has finished.
+class AsyncObjectThrottleFinisher {
+public:
+  virtual ~AsyncObjectThrottleFinisher() = default;
+
+  // r is the result of the finished operation
+  virtual void finish_op(int r) = 0;
+};
+
+// Per-object context used by the throttle. Subclasses implement send(); when
+// the context is finished the result is forwarded to the finisher.
+template <typename ImageCtxT = ImageCtx>
+class C_AsyncObjectThrottle : public Context {
+public:
+  C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher,
+                        ImageCtxT &image_ctx)
+    : m_image_ctx(image_ctx),
+      m_finisher(finisher) {
+  }
+
+  // Starts the operation. The throttle treats a negative return as an error,
+  // a positive return as "completed immediately" and zero as "in flight".
+  virtual int send() = 0;
+
+protected:
+  ImageCtxT &m_image_ctx;
+
+  void finish(int r) override {
+    m_finisher.finish_op(r);
+  }
+
+private:
+  AsyncObjectThrottleFinisher &m_finisher;
+};
+
+// Runs one operation per object number in [object_no, end_object_no) while
+// limiting how many are started up front (see start_ops()). Each finished
+// operation triggers the next one through finish_op().
+template <typename ImageCtxT = ImageCtx>
+class AsyncObjectThrottle : public AsyncObjectThrottleFinisher {
+public:
+  // Builds the context for a given object number.
+  using ContextFactory = boost::function<
+    C_AsyncObjectThrottle<ImageCtxT>* (AsyncObjectThrottle&, uint64_t)>;
+
+  AsyncObjectThrottle(const AsyncRequest<ImageCtxT> *async_request,
+                      ImageCtxT &image_ctx,
+                      const ContextFactory& context_factory,
+                      Context *ctx,
+                      ProgressContext *prog_ctx,
+                      uint64_t object_no,
+                      uint64_t end_object_no);
+
+  void start_ops(uint64_t max_concurrent);
+  void finish_op(int r) override;
+
+private:
+  Mutex m_lock;
+  const AsyncRequest<ImageCtxT> *m_async_request;  // may be NULL
+  ImageCtxT &m_image_ctx;
+  ContextFactory m_context_factory;
+  Context *m_ctx;                // completed once no op is in flight
+  ProgressContext *m_prog_ctx;   // may be NULL
+  uint64_t m_object_no;          // next object number to issue
+  uint64_t m_end_object_no;      // one past the last object number
+  uint64_t m_current_ops;        // ops currently in flight
+  int m_ret;                     // first recorded error, 0 if none
+
+  void start_next_op();
+};
+
+} // namespace librbd
+
+extern template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
new file mode 100644
index 00000000..8a76a226
--- /dev/null
+++ b/src/librbd/AsyncRequest.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "common/WorkQueue.h"
+
+namespace librbd
+{
+
+// Requires a non-null completion context and registers the new request with
+// the image context via start_request().
+template <typename T>
+AsyncRequest<T>::AsyncRequest(T &image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_on_finish(on_finish),
+    m_canceled(false),
+    m_xlist_item(this)
+{
+  ceph_assert(m_on_finish != NULL);
+  start_request();
+}
+
+// Nothing to release here; deregistration happens in finish_request().
+template <typename T>
+AsyncRequest<T>::~AsyncRequest() = default;
+
+// Completes the request from the op work queue instead of the caller's stack.
+template <typename T>
+void AsyncRequest<T>::async_complete(int r) {
+  Context *callback = create_callback_context();
+  m_image_ctx.op_work_queue->queue(callback, r);
+}
+
+// Returns a rados AIO completion bound to this request.
+template <typename T>
+librados::AioCompletion *AsyncRequest<T>::create_callback_completion() {
+  librados::AioCompletion *rados_completion =
+    util::create_rados_callback(this);
+  return rados_completion;
+}
+
+// Returns a Context bound to this request.
+template <typename T>
+Context *AsyncRequest<T>::create_callback_context() {
+  Context *callback = util::create_context_callback(this);
+  return callback;
+}
+
+// Returns a Context that invokes async_complete() on this request, so the
+// actual completion is deferred to the op work queue.
+template <typename T>
+Context *AsyncRequest<T>::create_async_callback_context() {
+  using Self = AsyncRequest<T>;
+  return util::create_context_callback<Self, &Self::async_complete>(this);
+}
+
+// Adds this request to the image's list of async requests, under
+// async_ops_lock.
+template <typename T>
+void AsyncRequest<T>::start_request() {
+  T &image_ctx = m_image_ctx;
+  Mutex::Locker async_ops_locker(image_ctx.async_ops_lock);
+  image_ctx.async_requests.push_back(&m_xlist_item);
+}
+
+// Removes this request from the image's list of async requests. If that
+// leaves the list empty, every registered waiter is completed with 0. The
+// waiters are invoked after async_ops_lock has been dropped.
+template <typename T>
+void AsyncRequest<T>::finish_request() {
+  decltype(m_image_ctx.async_requests_waiters) pending_waiters;
+  {
+    Mutex::Locker async_ops_locker(m_image_ctx.async_ops_lock);
+    ceph_assert(m_xlist_item.remove_myself());
+
+    const bool was_last = m_image_ctx.async_requests.empty();
+    if (was_last) {
+      pending_waiters = std::move(m_image_ctx.async_requests_waiters);
+    }
+  }
+
+  for (auto it = pending_waiters.begin(); it != pending_waiters.end(); ++it) {
+    auto waiter = *it;
+    waiter->complete(0);
+  }
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::AsyncRequest<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h
new file mode 100644
index 00000000..f74368dc
--- /dev/null
+++ b/src/librbd/AsyncRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_ASYNC_REQUEST_H
+#define CEPH_LIBRBD_ASYNC_REQUEST_H
+
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "include/xlist.h"
+#include "include/compat.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+/**
+ * Base class for asynchronous requests issued against an image. The
+ * constructor registers the request with the image context; finish()
+ * deregisters it and completes the user-supplied context.
+ */
+template <typename ImageCtxT = ImageCtx>
+class AsyncRequest
+{
+public:
+  AsyncRequest(ImageCtxT &image_ctx, Context *on_finish);
+  virtual ~AsyncRequest();
+
+  // Callback entry point: the request is finished (and destroyed) only once
+  // should_complete() returns true for the given result.
+  void complete(int r) {
+    if (!should_complete(r)) {
+      return;
+    }
+    finish_and_destroy(filter_return_code(r));
+  }
+
+  virtual void send() = 0;
+
+  bool is_canceled() const {
+    return m_canceled;
+  }
+  void cancel() {
+    m_canceled = true;
+  }
+
+protected:
+  ImageCtxT &m_image_ctx;
+
+  librados::AioCompletion *create_callback_completion();
+  Context *create_callback_context();
+  Context *create_async_callback_context();
+
+  void async_complete(int r);
+
+  // Returns true when the request is done and r is the final result.
+  virtual bool should_complete(int r) = 0;
+
+  // Hook to translate the final result; the default passes it through.
+  virtual int filter_return_code(int r) const {
+    return r;
+  }
+
+  // NOTE: temporary until converted to new state machine format
+  virtual void finish_and_destroy(int r) {
+    finish(r);
+    delete this;
+  }
+
+  // Deregisters the request, then completes the on_finish context with r.
+  virtual void finish(int r) {
+    finish_request();
+    m_on_finish->complete(r);
+  }
+
+private:
+  Context *m_on_finish;
+  bool m_canceled;
+  typename xlist<AsyncRequest<ImageCtxT> *>::item m_xlist_item;
+
+  void start_request();
+  void finish_request();
+};
+
+} // namespace librbd
+
+extern template class librbd::AsyncRequest<librbd::ImageCtx>;
+
+#endif //CEPH_LIBRBD_ASYNC_REQUEST_H
diff --git a/src/librbd/BlockGuard.h b/src/librbd/BlockGuard.h
new file mode 100644
index 00000000..062f1901
--- /dev/null
+++ b/src/librbd/BlockGuard.h
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_BLOCK_GUARD_H
+#define CEPH_LIBRBD_IO_BLOCK_GUARD_H
+
+#include "include/int_types.h"
+#include "common/dout.h"
+#include "common/Mutex.h"
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/set.hpp>
+#include <deque>
+#include <list>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::BlockGuard: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+// Range of blocks identified by a start and an end block number. Both
+// default to zero.
+struct BlockExtent {
+  uint64_t block_start = 0;
+  uint64_t block_end = 0;
+
+  BlockExtent() = default;
+
+  BlockExtent(uint64_t block_start, uint64_t block_end)
+    : block_start(block_start),
+      block_end(block_end) {
+  }
+};
+
+// Opaque handle type returned by BlockGuard::detain() and passed back to
+// BlockGuard::release(); the guard casts it to/from its internal detained
+// extent record.
+struct BlockGuardCell {
+};
+
+/**
+ * Helper class to restrict and order concurrent IO to the same block. The
+ * definition of a block is dependent upon the user of this class. It might
+ * represent a backing object, 512 byte sectors, etc.
+ */
+template <typename BlockOperation>
+class BlockGuard {
+private:
+ struct DetainedBlockExtent;
+
+public:
+ // operations queued behind a detained extent
+ typedef std::list<BlockOperation> BlockOperations;
+
+ BlockGuard(CephContext *cct)
+ : m_cct(cct), m_lock("librbd::BlockGuard::m_lock") {
+ }
+
+ BlockGuard(const BlockGuard&) = delete;
+ BlockGuard &operator=(const BlockGuard&) = delete;
+
+ /**
+ * Detain future IO for a range of blocks. the guard will assume
+ * ownership of the provided operation if the operation is blocked.
+ *
+ * If the extent overlaps one that is already detained, the operation (when
+ * non-null) is moved onto that extent's pending list, *cell is set to
+ * nullptr and the size of the pending list is returned. Otherwise a new
+ * detained extent is recorded and *cell receives its handle.
+ *
+ * NOTE(review): with a null block_operation and an empty pending list the
+ * "blocked" path returns 0 while *cell is nullptr -- confirm that callers
+ * never pass a null operation.
+ *
+ * @return 0 upon success and IO can be issued
+ * >0 if the IO is blocked,
+ * <0 upon error
+ */
+ int detain(const BlockExtent &block_extent, BlockOperation *block_operation,
+ BlockGuardCell **cell) {
+ Mutex::Locker locker(m_lock);
+ ldout(m_cct, 20) << "block_start=" << block_extent.block_start << ", "
+ << "block_end=" << block_extent.block_end << ", "
+ << "free_slots=" << m_free_detained_block_extents.size()
+ << dendl;
+
+ DetainedBlockExtent *detained_block_extent;
+ // the comparator treats overlapping extents as equivalent, so this finds
+ // any detained extent that overlaps the requested one
+ auto it = m_detained_block_extents.find(block_extent);
+ if (it != m_detained_block_extents.end()) {
+ // request against an already detained block
+ detained_block_extent = &(*it);
+ if (block_operation != nullptr) {
+ detained_block_extent->block_operations.emplace_back(
+ std::move(*block_operation));
+ }
+
+ // alert the caller that the IO was detained
+ *cell = nullptr;
+ return detained_block_extent->block_operations.size();
+ } else {
+ if (!m_free_detained_block_extents.empty()) {
+ // recycle a previously released slot
+ detained_block_extent = &m_free_detained_block_extents.front();
+ detained_block_extent->block_operations.clear();
+ m_free_detained_block_extents.pop_front();
+ } else {
+ // grow the pool; slots are referenced by address from the intrusive
+ // containers below
+ ldout(m_cct, 20) << "no free detained block cells" << dendl;
+ m_detained_block_extent_pool.emplace_back();
+ detained_block_extent = &m_detained_block_extent_pool.back();
+ }
+
+ detained_block_extent->block_extent = block_extent;
+ m_detained_block_extents.insert(*detained_block_extent);
+ // hand the slot back to the caller as an opaque cell
+ *cell = reinterpret_cast<BlockGuardCell*>(detained_block_extent);
+ return 0;
+ }
+ }
+
+ /**
+ * Release any detained IO operations from the provided cell.
+ *
+ * The cell must be non-null. Its pending operations are moved into
+ * *block_operations, the extent is removed from the detained set and the
+ * slot is placed on the free list for reuse.
+ */
+ void release(BlockGuardCell *cell, BlockOperations *block_operations) {
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(cell != nullptr);
+ auto &detained_block_extent = reinterpret_cast<DetainedBlockExtent &>(
+ *cell);
+ ldout(m_cct, 20) << "block_start="
+ << detained_block_extent.block_extent.block_start << ", "
+ << "block_end="
+ << detained_block_extent.block_extent.block_end << ", "
+ << "pending_ops="
+ << (detained_block_extent.block_operations.empty() ?
+ 0 : detained_block_extent.block_operations.size() - 1)
+ << dendl;
+
+ *block_operations = std::move(detained_block_extent.block_operations);
+ m_detained_block_extents.erase(detained_block_extent.block_extent);
+ m_free_detained_block_extents.push_back(detained_block_extent);
+ }
+
+private:
+ // Slot describing one detained extent plus the operations queued behind
+ // it. Carries both a list hook (free list) and a set hook (detained set).
+ struct DetainedBlockExtent : public boost::intrusive::list_base_hook<>,
+ public boost::intrusive::set_base_hook<> {
+ BlockExtent block_extent;
+ BlockOperations block_operations;
+ };
+
+ // key extractor for the intrusive set: slots are keyed by their extent
+ struct DetainedBlockExtentKey {
+ typedef BlockExtent type;
+ const BlockExtent &operator()(const DetainedBlockExtent &value) {
+ return value.block_extent;
+ }
+ };
+
+ // Orders extents so that lhs < rhs only when lhs ends at or before the
+ // start of rhs; extents that overlap therefore compare as equivalent.
+ struct DetainedBlockExtentCompare {
+ bool operator()(const BlockExtent &lhs,
+ const BlockExtent &rhs) const {
+ // check for range overlap (lhs < rhs)
+ if (lhs.block_end <= rhs.block_start) {
+ return true;
+ }
+ return false;
+ }
+ };
+
+ typedef std::deque<DetainedBlockExtent> DetainedBlockExtentsPool;
+ typedef boost::intrusive::list<DetainedBlockExtent> DetainedBlockExtents;
+ typedef boost::intrusive::set<
+ DetainedBlockExtent,
+ boost::intrusive::compare<DetainedBlockExtentCompare>,
+ boost::intrusive::key_of_value<DetainedBlockExtentKey> >
+ BlockExtentToDetainedBlockExtents;
+
+ CephContext *m_cct;
+
+ // guards all of the containers below
+ Mutex m_lock;
+ // backing storage for every slot ever created
+ DetainedBlockExtentsPool m_detained_block_extent_pool;
+ // released slots available for reuse
+ DetainedBlockExtents m_free_detained_block_extents;
+ // currently detained extents, keyed by extent
+ BlockExtentToDetainedBlockExtents m_detained_block_extents;
+
+};
+
+} // namespace librbd
+
+#undef dout_subsys
+#undef dout_prefix
+#define dout_prefix *_dout
+
+#endif // CEPH_LIBRBD_IO_BLOCK_GUARD_H
diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt
new file mode 100644
index 00000000..149c19df
--- /dev/null
+++ b/src/librbd/CMakeLists.txt
@@ -0,0 +1,197 @@
+# Define BOOST_ASIO_USE_TS_EXECUTOR_AS_DEFAULT when building against a Boost
+# release newer than 1.73.
+if(Boost_VERSION VERSION_GREATER 1.73)
+ add_definitions(-DBOOST_ASIO_USE_TS_EXECUTOR_AS_DEFAULT)
+endif()
+
+# Static library holding the various Types.cc translation units.
+add_library(rbd_types STATIC
+ journal/Types.cc
+ mirroring_watcher/Types.cc
+ trash_watcher/Types.cc
+ watcher/Types.cc
+ WatchNotifyTypes.cc)
+
+set(librbd_internal_srcs
+ AsyncObjectThrottle.cc
+ AsyncRequest.cc
+ ConfigWatcher.cc
+ DeepCopyRequest.cc
+ ExclusiveLock.cc
+ ImageCtx.cc
+ ImageState.cc
+ ImageWatcher.cc
+ internal.cc
+ Journal.cc
+ LibrbdAdminSocketHook.cc
+ LibrbdWriteback.cc
+ ManagedLock.cc
+ MirroringWatcher.cc
+ ObjectMap.cc
+ Operations.cc
+ TrashWatcher.cc
+ Utils.cc
+ Watcher.cc
+ api/Config.cc
+ api/DiffIterate.cc
+ api/Group.cc
+ api/Image.cc
+ api/Migration.cc
+ api/Mirror.cc
+ api/Namespace.cc
+ api/Pool.cc
+ api/PoolMetadata.cc
+ api/Snapshot.cc
+ api/Trash.cc
+ cache/ImageWriteback.cc
+ cache/ObjectCacherObjectDispatch.cc
+ cache/PassthroughImageCache.cc
+ deep_copy/ImageCopyRequest.cc
+ deep_copy/MetadataCopyRequest.cc
+ deep_copy/ObjectCopyRequest.cc
+ deep_copy/SetHeadRequest.cc
+ deep_copy/SnapshotCopyRequest.cc
+ deep_copy/SnapshotCreateRequest.cc
+ deep_copy/Utils.cc
+ exclusive_lock/AutomaticPolicy.cc
+ exclusive_lock/PreAcquireRequest.cc
+ exclusive_lock/PostAcquireRequest.cc
+ exclusive_lock/PreReleaseRequest.cc
+ exclusive_lock/StandardPolicy.cc
+ image/AttachChildRequest.cc
+ image/AttachParentRequest.cc
+ image/CloneRequest.cc
+ image/CloseRequest.cc
+ image/CreateRequest.cc
+ image/DetachChildRequest.cc
+ image/DetachParentRequest.cc
+ image/ListWatchersRequest.cc
+ image/OpenRequest.cc
+ image/PreRemoveRequest.cc
+ image/RefreshParentRequest.cc
+ image/RefreshRequest.cc
+ image/RemoveRequest.cc
+ image/SetFlagsRequest.cc
+ image/SetSnapRequest.cc
+ image/ValidatePoolRequest.cc
+ image_watcher/NotifyLockOwner.cc
+ io/AioCompletion.cc
+ io/AsyncOperation.cc
+ io/CopyupRequest.cc
+ io/ImageDispatchSpec.cc
+ io/ImageRequest.cc
+ io/ImageRequestWQ.cc
+ io/ObjectDispatch.cc
+ io/ObjectDispatchSpec.cc
+ io/ObjectDispatcher.cc
+ io/ObjectRequest.cc
+ io/ReadResult.cc
+ io/Utils.cc
+ journal/CreateRequest.cc
+ journal/DemoteRequest.cc
+ journal/ObjectDispatch.cc
+ journal/OpenRequest.cc
+ journal/PromoteRequest.cc
+ journal/RemoveRequest.cc
+ journal/Replay.cc
+ journal/ResetRequest.cc
+ journal/StandardPolicy.cc
+ journal/Utils.cc
+ managed_lock/AcquireRequest.cc
+ managed_lock/BreakRequest.cc
+ managed_lock/GetLockerRequest.cc
+ managed_lock/ReacquireRequest.cc
+ managed_lock/ReleaseRequest.cc
+ managed_lock/Utils.cc
+ mirror/DemoteRequest.cc
+ mirror/DisableRequest.cc
+ mirror/EnableRequest.cc
+ mirror/GetInfoRequest.cc
+ mirror/GetStatusRequest.cc
+ mirror/PromoteRequest.cc
+ object_map/CreateRequest.cc
+ object_map/InvalidateRequest.cc
+ object_map/LockRequest.cc
+ object_map/RefreshRequest.cc
+ object_map/RemoveRequest.cc
+ object_map/Request.cc
+ object_map/ResizeRequest.cc
+ object_map/SnapshotCreateRequest.cc
+ object_map/SnapshotRemoveRequest.cc
+ object_map/SnapshotRollbackRequest.cc
+ object_map/UnlockRequest.cc
+ object_map/UpdateRequest.cc
+ operation/DisableFeaturesRequest.cc
+ operation/EnableFeaturesRequest.cc
+ operation/FlattenRequest.cc
+ operation/MetadataRemoveRequest.cc
+ operation/MetadataSetRequest.cc
+ operation/MigrateRequest.cc
+ operation/ObjectMapIterate.cc
+ operation/RebuildObjectMapRequest.cc
+ operation/RenameRequest.cc
+ operation/Request.cc
+ operation/ResizeRequest.cc
+ operation/SnapshotCreateRequest.cc
+ operation/SnapshotProtectRequest.cc
+ operation/SnapshotRemoveRequest.cc
+ operation/SnapshotRenameRequest.cc
+ operation/SnapshotRollbackRequest.cc
+ operation/SnapshotUnprotectRequest.cc
+ operation/SnapshotLimitRequest.cc
+ operation/SparsifyRequest.cc
+ operation/TrimRequest.cc
+ trash/MoveRequest.cc
+ trash/RemoveRequest.cc
+ watcher/Notifier.cc
+ watcher/RewatchRequest.cc
+ ${CMAKE_SOURCE_DIR}/src/common/ContextCompletion.cc)
+
+# rbd_api wraps the public API source; rbd_internal bundles the internal
+# sources listed in librbd_internal_srcs plus the rados_snap_set_diff_obj
+# objects.
+add_library(rbd_api STATIC librbd.cc)
+add_library(rbd_internal STATIC
+ ${librbd_internal_srcs}
+ $<TARGET_OBJECTS:rados_snap_set_diff_obj>)
+if(WITH_LTTNG)
+ # librbd.cc includes tracing/librbd.h
+ add_dependencies(rbd_api librbd-tp)
+ # io/AioCompletion.cc includes tracing/librbd.h
+ add_dependencies(rbd_internal librbd-tp)
+endif()
+if(WITH_LTTNG AND WITH_EVENTTRACE)
+ add_dependencies(rbd_internal eventtrace_tp)
+endif()
+target_link_libraries(rbd_internal PRIVATE
+ osdc)
+
+# The librbd library itself; its library type comes from ${CEPH_SHARED}.
+add_library(librbd ${CEPH_SHARED}
+ librbd.cc)
+if(WITH_LTTNG)
+ add_dependencies(librbd librbd-tp)
+endif()
+
+target_link_libraries(librbd PRIVATE
+ rbd_internal
+ rbd_types
+ journal
+ librados
+ cls_rbd_client
+ cls_lock_client
+ cls_journal_client
+ ceph-common
+ pthread
+ ${CMAKE_DL_LIBS}
+ ${EXTRALIBS} ${GSSAPI_LIBRARIES})
+if(HAVE_UDEV)
+ target_link_libraries(librbd PRIVATE
+ udev)
+endif()
+if(ENABLE_SHARED)
+ # Name the output "rbd", set its version/soversion and hide symbols by
+ # default.
+ set_target_properties(librbd PROPERTIES
+ OUTPUT_NAME rbd
+ VERSION 1.12.0
+ SOVERSION 1
+ CXX_VISIBILITY_PRESET hidden
+ VISIBILITY_INLINES_HIDDEN ON)
+ if(NOT APPLE)
+ # Do not re-export symbols from the static libraries linked in.
+ set_property(TARGET librbd APPEND_STRING PROPERTY
+ LINK_FLAGS " -Wl,--exclude-libs,ALL")
+ endif()
+endif(ENABLE_SHARED)
+install(TARGETS librbd DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/src/librbd/ConfigWatcher.cc b/src/librbd/ConfigWatcher.cc
new file mode 100644
index 00000000..f8f17016
--- /dev/null
+++ b/src/librbd/ConfigWatcher.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ConfigWatcher.h"
+#include "common/config_obs.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/api/Config.h"
+#include <deque>
+#include <string>
+#include <vector>
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ConfigWatcher: " \
+ << __func__ << ": "
+
+namespace librbd {
+
+/**
+ * Config observer that tracks every schema key beginning with "rbd_" and
+ * forwards change notifications to the owning ConfigWatcher.
+ */
+template <typename I>
+struct ConfigWatcher<I>::Observer : public md_config_obs_t {
+  ConfigWatcher<I>* m_config_watcher;
+
+  // owns the key strings; m_config_keys holds pointers into these entries
+  std::deque<std::string> m_config_key_strs;
+  // nullptr-terminated key array returned by get_tracked_conf_keys()
+  mutable std::vector<const char*> m_config_keys;
+
+  Observer(CephContext* cct, ConfigWatcher<I>* config_watcher)
+    : m_config_watcher(config_watcher) {
+    const std::string rbd_key_prefix("rbd_");
+    for (auto& schema_entry : cct->_conf.get_schema()) {
+      // watch all "rbd_" keys for simplicity
+      if (boost::starts_with(schema_entry.first, rbd_key_prefix)) {
+        m_config_key_strs.emplace_back(schema_entry.first);
+      }
+    }
+
+    m_config_keys.reserve(m_config_key_strs.size());
+    for (auto& key_str : m_config_key_strs) {
+      m_config_keys.push_back(key_str.c_str());
+    }
+    // terminate the array for consumers that walk it until nullptr
+    m_config_keys.push_back(nullptr);
+  }
+
+  const char** get_tracked_conf_keys() const override {
+    ceph_assert(!m_config_keys.empty());
+    return m_config_keys.data();
+  }
+
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    m_config_watcher->handle_global_config_change(changed);
+  }
+};
+
+// Binds the watcher to its image context; no observer exists until init().
+template <typename I>
+ConfigWatcher<I>::ConfigWatcher(I& image_ctx) : m_image_ctx(image_ctx) {
+}
+
+// The observer must already have been released (see shut_down()).
+template <typename I>
+ConfigWatcher<I>::~ConfigWatcher()
+{
+  ceph_assert(m_observer == nullptr);
+}
+
+// Creates the config observer and registers it with the context's config.
+template <typename I>
+void ConfigWatcher<I>::init() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  auto* observer = new Observer(cct, this);
+  m_observer = observer;
+  cct->_conf.add_observer(observer);
+}
+
+// Unregisters the observer created by init() and destroys it.
+template <typename I>
+void ConfigWatcher<I>::shut_down() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  ceph_assert(m_observer != nullptr);
+  Observer* observer = m_observer;
+  cct->_conf.remove_observer(observer);
+
+  m_observer = nullptr;
+  delete observer;
+}
+
+// Drops keys that the image overrides locally and, if any changed key
+// remains, asks the image state to handle an update notification.
+template <typename I>
+void ConfigWatcher<I>::handle_global_config_change(
+    std::set<std::string> changed_keys) {
+  {
+    // ignore any global changes that are being overridden
+    RWLock::RLocker md_locker(m_image_ctx.md_lock);
+    for (auto& overridden_key : m_image_ctx.config_overrides) {
+      changed_keys.erase(overridden_key);
+    }
+  }
+
+  if (!changed_keys.empty()) {
+    auto cct = m_image_ctx.cct;
+    ldout(cct, 10) << "changed_keys=" << changed_keys << dendl;
+
+    // refresh the image to pick up any global config overrides
+    m_image_ctx.state->handle_update_notification();
+  }
+}
+
+} // namespace librbd
+
+template class librbd::ConfigWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/ConfigWatcher.h b/src/librbd/ConfigWatcher.h
new file mode 100644
index 00000000..1f10c8cb
--- /dev/null
+++ b/src/librbd/ConfigWatcher.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CONFIG_WATCHER_H
+#define CEPH_LIBRBD_CONFIG_WATCHER_H
+
+#include <set>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+// Observes global configuration changes on behalf of one image context.
+// init() registers a config observer and shut_down() removes it again;
+// shut_down() must be called before the watcher is destroyed.
+template <typename ImageCtxT>
+class ConfigWatcher {
+public:
+ // Heap-allocates a watcher; the caller owns the returned pointer.
+ static ConfigWatcher* create(ImageCtxT& image_ctx) {
+ return new ConfigWatcher(image_ctx);
+ }
+
+ ConfigWatcher(ImageCtxT& image_ctx);
+ ~ConfigWatcher();
+
+ // non-copyable
+ ConfigWatcher(const ConfigWatcher&) = delete;
+ ConfigWatcher& operator=(const ConfigWatcher&) = delete;
+
+ void init();
+ void shut_down();
+
+private:
+ // config observer implementation, defined in ConfigWatcher.cc
+ struct Observer;
+
+ ImageCtxT& m_image_ctx;
+
+ // non-null only between init() and shut_down()
+ Observer* m_observer = nullptr;
+
+ // invoked by the observer with the set of changed config keys
+ void handle_global_config_change(std::set<std::string> changed);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ConfigWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CONFIG_WATCHER_H
diff --git a/src/librbd/DeepCopyRequest.cc b/src/librbd/DeepCopyRequest.cc
new file mode 100644
index 00000000..3350fa3b
--- /dev/null
+++ b/src/librbd/DeepCopyRequest.cc
@@ -0,0 +1,362 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "DeepCopyRequest.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/ImageCopyRequest.h"
+#include "librbd/deep_copy/MetadataCopyRequest.h"
+#include "librbd/deep_copy/SnapshotCopyRequest.h"
+#include "librbd/internal.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DeepCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+
+using namespace librbd::deep_copy;
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+using librbd::util::unique_lock_name;
+
+// Stores the copy parameters; no work starts until send() is called.
+// The request begins with a single reference and uses the destination
+// image's CephContext.
+template <typename I>
+DeepCopyRequest<I>::DeepCopyRequest(I *src_image_ctx, I *dst_image_ctx,
+                                    librados::snap_t src_snap_id_start,
+                                    librados::snap_t src_snap_id_end,
+                                    librados::snap_t dst_snap_id_start,
+                                    bool flatten,
+                                    const ObjectNumber &object_number,
+                                    ContextWQ *work_queue, SnapSeqs *snap_seqs,
+                                    ProgressContext *prog_ctx,
+                                    Context *on_finish)
+  : RefCountedObject(dst_image_ctx->cct, 1),
+    m_src_image_ctx(src_image_ctx),
+    m_dst_image_ctx(dst_image_ctx),
+    m_src_snap_id_start(src_snap_id_start),
+    m_src_snap_id_end(src_snap_id_end),
+    m_dst_snap_id_start(dst_snap_id_start),
+    m_flatten(flatten),
+    m_object_number(object_number),
+    m_work_queue(work_queue),
+    m_snap_seqs(snap_seqs),
+    m_prog_ctx(prog_ctx),
+    m_on_finish(on_finish),
+    m_cct(dst_image_ctx->cct),
+    m_lock(unique_lock_name("DeepCopyRequest::m_lock", this)) {
+}
+
+// No sub-request may still be tracked when the request is destroyed.
+template <typename I>
+DeepCopyRequest<I>::~DeepCopyRequest()
+{
+  ceph_assert(m_snapshot_copy_request == nullptr);
+  ceph_assert(m_image_copy_request == nullptr);
+}
+
+// Entry point: verifies both images have a valid data pool and that the
+// snapshot copy points exist, then starts the snapshot copy step.
+template <typename I>
+void DeepCopyRequest<I>::send() {
+  if (!m_src_image_ctx->data_ctx.is_valid()) {
+    lderr(m_cct) << "missing data pool for source image" << dendl;
+    finish(-ENODEV);
+    return;
+  } else if (!m_dst_image_ctx->data_ctx.is_valid()) {
+    lderr(m_cct) << "missing data pool for destination image" << dendl;
+    finish(-ENODEV);
+    return;
+  }
+
+  const int validate_result = validate_copy_points();
+  if (validate_result >= 0) {
+    send_copy_snapshots();
+  } else {
+    finish(validate_result);
+  }
+}
+
+// Marks the request as canceled and forwards the cancel to whichever
+// sub-request is currently in flight.
+template <typename I>
+void DeepCopyRequest<I>::cancel() {
+  Mutex::Locker locker(m_lock);
+
+  ldout(m_cct, 20) << dendl;
+
+  m_canceled = true;
+
+  if (auto *snapshot_request = m_snapshot_copy_request) {
+    snapshot_request->cancel();
+  }
+  if (auto *image_request = m_image_copy_request) {
+    image_request->cancel();
+  }
+}
+
+// Starts the snapshot copy sub-request unless the request was canceled.
+template <typename I>
+void DeepCopyRequest<I>::send_copy_snapshots() {
+  bool canceled;
+  {
+    Mutex::Locker locker(m_lock);
+    canceled = m_canceled;
+    if (!canceled) {
+      ldout(m_cct, 20) << dendl;
+
+      Context *ctx = create_context_callback<
+        DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_snapshots>(this);
+      m_snapshot_copy_request = SnapshotCopyRequest<I>::create(
+        m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start,
+        m_src_snap_id_end, m_dst_snap_id_start, m_flatten, m_work_queue,
+        m_snap_seqs, ctx);
+      // extra reference, dropped in handle_copy_snapshots()
+      m_snapshot_copy_request->get();
+    }
+  }
+
+  // m_lock is no longer held for either path below
+  if (canceled) {
+    finish(-ECANCELED);
+    return;
+  }
+
+  m_snapshot_copy_request->send();
+}
+
+// Completion of the snapshot copy step: releases the sub-request and, on
+// success, continues with the image copy.
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_snapshots(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  {
+    Mutex::Locker locker(m_lock);
+    m_snapshot_copy_request->put();
+    m_snapshot_copy_request = nullptr;
+    // a cancel that raced with a successful copy still cancels the request
+    if (m_canceled && r == 0) {
+      r = -ECANCELED;
+    }
+  }
+
+  if (r < 0) {
+    if (r == -ECANCELED) {
+      ldout(m_cct, 10) << "snapshot copy canceled" << dendl;
+    } else {
+      lderr(m_cct) << "failed to copy snapshot metadata: " << cpp_strerror(r)
+                   << dendl;
+    }
+    finish(r);
+    return;
+  }
+
+  // copying up to HEAD: map the source HEAD onto the destination HEAD
+  if (m_src_snap_id_end == CEPH_NOSNAP) {
+    (*m_snap_seqs)[CEPH_NOSNAP] = CEPH_NOSNAP;
+  }
+
+  send_copy_image();
+}
+
+// Starts the image data copy sub-request unless the request was canceled.
+template <typename I>
+void DeepCopyRequest<I>::send_copy_image() {
+  bool canceled;
+  {
+    Mutex::Locker locker(m_lock);
+    canceled = m_canceled;
+    if (!canceled) {
+      ldout(m_cct, 20) << dendl;
+
+      Context *ctx = create_context_callback<
+        DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_image>(this);
+      m_image_copy_request = ImageCopyRequest<I>::create(
+        m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start,
+        m_src_snap_id_end, m_dst_snap_id_start, m_flatten, m_object_number,
+        *m_snap_seqs, m_prog_ctx, ctx);
+      // extra reference, dropped in handle_copy_image()
+      m_image_copy_request->get();
+    }
+  }
+
+  // m_lock is no longer held for either path below
+  if (canceled) {
+    finish(-ECANCELED);
+    return;
+  }
+
+  m_image_copy_request->send();
+}
+
+// Completion of the image copy step: releases the sub-request and, on
+// success, continues with the object map copy.
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_image(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  {
+    Mutex::Locker locker(m_lock);
+    m_image_copy_request->put();
+    m_image_copy_request = nullptr;
+    // a cancel that raced with a successful copy still cancels the request
+    if (m_canceled && r == 0) {
+      r = -ECANCELED;
+    }
+  }
+
+  if (r < 0) {
+    if (r == -ECANCELED) {
+      ldout(m_cct, 10) << "image copy canceled" << dendl;
+    } else {
+      lderr(m_cct) << "failed to copy image: " << cpp_strerror(r) << dendl;
+    }
+    finish(r);
+    return;
+  }
+
+  send_copy_object_map();
+}
+
+// Rolls the destination HEAD object map back to the object map of the
+// snapshot that corresponds to the source end snapshot.
+//
+// Skipped when the destination has no object map feature (continues with
+// the metadata copy) or when copying up to HEAD (continues with the object
+// map refresh). Fails the request if no exclusive lock op can be started.
+template <typename I>
+void DeepCopyRequest<I>::send_copy_object_map() {
+  m_dst_image_ctx->owner_lock.get_read();
+  m_dst_image_ctx->snap_lock.get_read();
+
+  if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP,
+                                      m_dst_image_ctx->snap_lock)) {
+    m_dst_image_ctx->snap_lock.put_read();
+    m_dst_image_ctx->owner_lock.put_read();
+    send_copy_metadata();
+    return;
+  }
+  if (m_src_snap_id_end == CEPH_NOSNAP) {
+    m_dst_image_ctx->snap_lock.put_read();
+    m_dst_image_ctx->owner_lock.put_read();
+    send_refresh_object_map();
+    return;
+  }
+
+  ceph_assert(m_dst_image_ctx->object_map != nullptr);
+
+  ldout(m_cct, 20) << dendl;
+
+  Context *finish_op_ctx = nullptr;
+  // Default error for the case where the exclusive lock object is gone:
+  // start_op() is never called then and would otherwise leave r
+  // uninitialized when it is passed to finish() below.
+  int r = -EROFS;
+  if (m_dst_image_ctx->exclusive_lock != nullptr) {
+    finish_op_ctx = m_dst_image_ctx->exclusive_lock->start_op(&r);
+  }
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    m_dst_image_ctx->snap_lock.put_read();
+    m_dst_image_ctx->owner_lock.put_read();
+    finish(r);
+    return;
+  }
+
+  // rollback the object map (copy snapshot object map to HEAD)
+  RWLock::WLocker object_map_locker(m_dst_image_ctx->object_map_lock);
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_copy_object_map(r);
+      // release the exclusive lock op started above
+      finish_op_ctx->complete(0);
+    });
+  ceph_assert(m_snap_seqs->count(m_src_snap_id_end) > 0);
+  librados::snap_t copy_snap_id = (*m_snap_seqs)[m_src_snap_id_end];
+  m_dst_image_ctx->object_map->rollback(copy_snap_id, ctx);
+  m_dst_image_ctx->snap_lock.put_read();
+  m_dst_image_ctx->owner_lock.put_read();
+}
+
+// Completion of the object map rollback; continues with the object map
+// refresh on success.
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_object_map(int r) {
+  ldout(m_cct, 20) << dendl;
+
+  if (r >= 0) {
+    send_refresh_object_map();
+    return;
+  }
+
+  lderr(m_cct) << "failed to roll back object map: " << cpp_strerror(r)
+               << dendl;
+  finish(r);
+}
+
+// Opens a fresh HEAD object map for the destination image. Fails the
+// request if no exclusive lock op can be started.
+template <typename I>
+void DeepCopyRequest<I>::send_refresh_object_map() {
+  // Default error for the case where the exclusive lock object is gone:
+  // start_op() is never called then and would otherwise leave r
+  // uninitialized when it is passed to finish() below.
+  int r = -EROFS;
+  Context *finish_op_ctx = nullptr;
+  {
+    RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+    if (m_dst_image_ctx->exclusive_lock != nullptr) {
+      finish_op_ctx = m_dst_image_ctx->exclusive_lock->start_op(&r);
+    }
+  }
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    finish(r);
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_refresh_object_map(r);
+      // release the exclusive lock op started above
+      finish_op_ctx->complete(0);
+    });
+  m_object_map = m_dst_image_ctx->create_object_map(CEPH_NOSNAP);
+  m_object_map->open(ctx);
+}
+
+// Completion of the object map open: on success the newly opened map is
+// swapped into the destination image and the previous one is destroyed.
+template <typename I>
+void DeepCopyRequest<I>::handle_refresh_object_map(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to open object map: " << cpp_strerror(r)
+                 << dendl;
+    delete m_object_map;
+    // do not leave a dangling pointer behind
+    m_object_map = nullptr;
+
+    finish(r);
+    return;
+  }
+
+  {
+    RWLock::WLocker snap_locker(m_dst_image_ctx->snap_lock);
+    RWLock::WLocker object_map_locker(m_dst_image_ctx->object_map_lock);
+    std::swap(m_dst_image_ctx->object_map, m_object_map);
+  }
+  // m_object_map now refers to the image's previous object map
+  delete m_object_map;
+  m_object_map = nullptr;
+
+  send_copy_metadata();
+}
+
+// Starts the metadata copy from the source to the destination image.
+template <typename I>
+void DeepCopyRequest<I>::send_copy_metadata() {
+  ldout(m_cct, 20) << dendl;
+
+  Context *on_copied = create_context_callback<
+    DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_metadata>(this);
+  MetadataCopyRequest<I>::create(m_src_image_ctx, m_dst_image_ctx,
+                                 on_copied)->send();
+}
+
+// Completion of the metadata copy; this is the last step of the request.
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_metadata(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
+  }
+
+  // non-negative results are reported as plain success
+  finish(r < 0 ? r : 0);
+}
+
+// Verifies that the start/end snapshot ids exist in the source image.
+// A start id of 0 and an end id of CEPH_NOSNAP are accepted without a
+// lookup. Returns 0 on success or -EINVAL for an unknown snapshot id.
+template <typename I>
+int DeepCopyRequest<I>::validate_copy_points() {
+  RWLock::RLocker snap_locker(m_src_image_ctx->snap_lock);
+
+  auto &snap_info = m_src_image_ctx->snap_info;
+
+  const bool check_start = (m_src_snap_id_start != 0);
+  if (check_start && snap_info.find(m_src_snap_id_start) == snap_info.end()) {
+    lderr(m_cct) << "invalid start snap_id " << m_src_snap_id_start << dendl;
+    return -EINVAL;
+  }
+
+  const bool check_end = (m_src_snap_id_end != CEPH_NOSNAP);
+  if (check_end && snap_info.find(m_src_snap_id_end) == snap_info.end()) {
+    lderr(m_cct) << "invalid end snap_id " << m_src_snap_id_end << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Completes the caller's callback with r and drops a reference on this
+// request.
+template <typename I>
+void DeepCopyRequest<I>::finish(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  put();
+}
+
+} // namespace librbd
+
+template class librbd::DeepCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/DeepCopyRequest.h b/src/librbd/DeepCopyRequest.h
new file mode 100644
index 00000000..24687c96
--- /dev/null
+++ b/src/librbd/DeepCopyRequest.h
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_REQUEST_H
+
+#include "common/Mutex.h"
+#include "common/RefCountedObj.h"
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include "librbd/deep_copy/Types.h"
+
+#include <map>
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace deep_copy {
+
+template <typename> class ImageCopyRequest;
+template <typename> class SnapshotCopyRequest;
+
+}
+
+// Ref-counted request that copies an image from a source image context to
+// a destination image context in the steps shown in the state diagram
+// below: snapshots, image data, object map and metadata.
+template <typename ImageCtxT = ImageCtx>
+class DeepCopyRequest : public RefCountedObject {
+public:
+ // Heap-allocates a request; the result is reported through on_finish.
+ static DeepCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten,
+ const deep_copy::ObjectNumber &object_number,
+ ContextWQ *work_queue,
+ SnapSeqs *snap_seqs,
+ ProgressContext *prog_ctx,
+ Context *on_finish) {
+ return new DeepCopyRequest(src_image_ctx, dst_image_ctx, src_snap_id_start,
+ src_snap_id_end, dst_snap_id_start, flatten,
+ object_number, work_queue, snap_seqs, prog_ctx,
+ on_finish);
+ }
+
+ DeepCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten, const deep_copy::ObjectNumber &object_number,
+ ContextWQ *work_queue, SnapSeqs *snap_seqs,
+ ProgressContext *prog_ctx, Context *on_finish);
+ ~DeepCopyRequest();
+
+ // start the state machine
+ void send();
+ // request cancellation; forwarded to the in-flight sub-request, if any
+ void cancel();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * COPY_SNAPSHOTS
+ * |
+ * v
+ * COPY_IMAGE . . . . . . . . . . . . . .
+ * | .
+ * v .
+ * COPY_OBJECT_MAP (skip if object .
+ * | map disabled) .
+ * v .
+ * REFRESH_OBJECT_MAP (skip if object . (image copy canceled)
+ * | map disabled) .
+ * v .
+ * COPY_METADATA .
+ * | .
+ * v .
+ * <finish> < . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ typedef std::vector<librados::snap_t> SnapIds;
+ typedef std::map<librados::snap_t, SnapIds> SnapMap;
+
+ // constructor arguments, stored unchanged
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ librados::snap_t m_src_snap_id_start;
+ librados::snap_t m_src_snap_id_end;
+ librados::snap_t m_dst_snap_id_start;
+ bool m_flatten;
+ deep_copy::ObjectNumber m_object_number;
+ ContextWQ *m_work_queue;
+ SnapSeqs *m_snap_seqs;
+ ProgressContext *m_prog_ctx;
+ Context *m_on_finish;
+
+ // CephContext of the destination image
+ CephContext *m_cct;
+ // protects m_canceled and the two sub-request pointers below
+ Mutex m_lock;
+ bool m_canceled = false;
+
+ // in-flight sub-requests; nullptr when the step is not running
+ deep_copy::SnapshotCopyRequest<ImageCtxT> *m_snapshot_copy_request = nullptr;
+ deep_copy::ImageCopyRequest<ImageCtxT> *m_image_copy_request = nullptr;
+ // object map being opened during the REFRESH_OBJECT_MAP step
+ decltype(ImageCtxT::object_map) m_object_map = nullptr;
+
+ void send_copy_snapshots();
+ void handle_copy_snapshots(int r);
+
+ void send_copy_image();
+ void handle_copy_image(int r);
+
+ void send_copy_object_map();
+ void handle_copy_object_map(int r);
+
+ void send_refresh_object_map();
+ void handle_refresh_object_map(int r);
+
+ void send_copy_metadata();
+ void handle_copy_metadata(int r);
+
+ // returns 0 or -EINVAL if a requested snapshot id is unknown
+ int validate_copy_points();
+
+ // completes m_on_finish and drops a reference on this request
+ void finish(int r);
+};
+
+} // namespace librbd
+
+extern template class librbd::DeepCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_REQUEST_H
diff --git a/src/librbd/ExclusiveLock.cc b/src/librbd/ExclusiveLock.cc
new file mode 100644
index 00000000..4db3f29a
--- /dev/null
+++ b/src/librbd/ExclusiveLock.cc
@@ -0,0 +1,374 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ImageState.h"
+#include "librbd/exclusive_lock/PreAcquireRequest.h"
+#include "librbd/exclusive_lock/PostAcquireRequest.h"
+#include "librbd/exclusive_lock/PreReleaseRequest.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/Utils.h"
+#include "common/Mutex.h"
+#include "common/dout.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ExclusiveLock: " << this << " " \
+ << __func__
+
+namespace librbd {
+
+using namespace exclusive_lock;
+
+// shorthand for the ManagedLock base class
+template <typename I>
+using ML = ManagedLock<I>;
+
+// Configures the managed lock on the image header object as an exclusive
+// lock, taking the blacklist settings from the image configuration, and
+// puts the lock into the uninitialized state.
+template <typename I>
+ExclusiveLock<I>::ExclusiveLock(I &image_ctx)
+  : ML<I>(image_ctx.md_ctx,
+          image_ctx.op_work_queue,
+          image_ctx.header_oid,
+          image_ctx.image_watcher,
+          managed_lock::EXCLUSIVE,
+          image_ctx.config.template get_val<bool>(
+            "rbd_blacklist_on_break_lock"),
+          image_ctx.config.template get_val<uint64_t>(
+            "rbd_blacklist_expire_seconds")),
+    m_image_ctx(image_ctx) {
+  Mutex::Locker locker(ML<I>::m_lock);
+  ML<I>::set_state_uninitialized();
+}
+
+// Reports whether a request of the given type is currently accepted: the
+// lock must be held and not shut down, and while requests are blocked the
+// exclusive lock policy decides. If ret_val is non-null it receives 0 on
+// acceptance or the recorded blocked-request error otherwise.
+template <typename I>
+bool ExclusiveLock<I>::accept_request(OperationRequestType request_type,
+                                      int *ret_val) const {
+  Mutex::Locker locker(ML<I>::m_lock);
+
+  bool accepted = !ML<I>::is_state_shutdown() && ML<I>::is_state_locked();
+  if (accepted && m_request_blocked_count != 0) {
+    accepted = m_image_ctx.get_exclusive_lock_policy()->accept_blocked_request(
+      request_type);
+  }
+
+  if (ret_val != nullptr) {
+    *ret_val = accepted ? 0 : m_request_blocked_ret_val;
+  }
+
+  ldout(m_image_ctx.cct, 20) << "=" << accepted << " (request_type="
+                             << request_type << ")" << dendl;
+  return accepted;
+}
+
+// Locking wrapper around accept_ops(const Mutex&) that logs the result.
+template <typename I>
+bool ExclusiveLock<I>::accept_ops() const {
+  Mutex::Locker locker(ML<I>::m_lock);
+  const bool accept = accept_ops(ML<I>::m_lock);
+  ldout(m_image_ctx.cct, 20) << "=" << accept << dendl;
+  return accept;
+}
+
+// Ops are accepted while the lock is held or post-acquiring, unless the
+// lock has been shut down. The lock argument itself is not used here.
+template <typename I>
+bool ExclusiveLock<I>::accept_ops(const Mutex &lock) const {
+  if (ML<I>::is_state_shutdown()) {
+    return false;
+  }
+  return ML<I>::is_state_locked() || ML<I>::is_state_post_acquiring();
+}
+
+// Increments the blocked-request counter; the first non-zero r is kept as
+// the error reported for rejected requests.
+template <typename I>
+void ExclusiveLock<I>::block_requests(int r) {
+  Mutex::Locker locker(ML<I>::m_lock);
+
+  ++m_request_blocked_count;
+  if (m_request_blocked_ret_val == 0) {
+    m_request_blocked_ret_val = r;
+  }
+
+  ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+}
+
+// Decrements the blocked-request counter and clears the recorded error
+// once no blockers remain.
+template <typename I>
+void ExclusiveLock<I>::unblock_requests() {
+  Mutex::Locker locker(ML<I>::m_lock);
+
+  ceph_assert(m_request_blocked_count > 0);
+  if (--m_request_blocked_count == 0) {
+    m_request_blocked_ret_val = 0;
+  }
+
+  ldout(m_image_ctx.cct, 20) << dendl;
+}
+
+// Error for ops attempted without the lock: -EBLACKLISTED if the image
+// watcher reports this client as blacklisted, otherwise -EROFS.
+template <typename I>
+int ExclusiveLock<I>::get_unlocked_op_error() const {
+  return m_image_ctx.image_watcher->is_blacklisted() ? -EBLACKLISTED : -EROFS;
+}
+
+// Moves the lock to the initializing state and blocks writes on the IO
+// work queue; C_InitComplete runs when the block request completes.
+// The caller must hold owner_lock.
+template <typename I>
+void ExclusiveLock<I>::init(uint64_t features, Context *on_init) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ldout(m_image_ctx.cct, 10) << dendl;
+
+  {
+    Mutex::Locker locker(ML<I>::m_lock);
+    ML<I>::set_state_initializing();
+  }
+
+  Context *on_writes_blocked = new C_InitComplete(this, features, on_init);
+  m_image_ctx.io_work_queue->block_writes(on_writes_blocked);
+}
+
+// Shuts down the managed lock and wakes up a state machine that may be
+// waiting on a peer for the lock.
+template <typename I>
+void ExclusiveLock<I>::shut_down(Context *on_shut_down)
+{
+  ldout(m_image_ctx.cct, 10) << dendl;
+
+  ML<I>::shut_down(on_shut_down);
+
+  // if stalled in request state machine -- abort
+  handle_peer_notification(0);
+}
+
+// While waiting for the lock: records the peer's return value and resumes
+// the pending action. Has no effect in any other state.
+template <typename I>
+void ExclusiveLock<I>::handle_peer_notification(int r) {
+  Mutex::Locker locker(ML<I>::m_lock);
+  if (ML<I>::is_state_waiting_for_lock()) {
+    ldout(m_image_ctx.cct, 10) << dendl;
+    ceph_assert(ML<I>::is_action_acquire_lock());
+
+    m_acquire_lock_peer_ret_val = r;
+    ML<I>::execute_next_action();
+  }
+}
+
+// Registers an async op against the lock. Returns a context that must be
+// completed when the op finishes, or nullptr (with *ret_val set to the
+// unlocked-op error) when ops are not accepted. The caller must hold
+// owner_lock.
+template <typename I>
+Context *ExclusiveLock<I>::start_op(int* ret_val) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  Mutex::Locker locker(ML<I>::m_lock);
+
+  if (accept_ops(ML<I>::m_lock)) {
+    m_async_op_tracker.start_op();
+    return new FunctionContext([this](int r) {
+        m_async_op_tracker.finish_op();
+      });
+  }
+
+  *ret_val = get_unlocked_op_error();
+  return nullptr;
+}
+
+// Init continuation: requires the lock for both IO directions when
+// copy-on-read or journaling is enabled, otherwise for writes only, and
+// then moves the lock to the unlocked state.
+template <typename I>
+void ExclusiveLock<I>::handle_init_complete(uint64_t features) {
+  ldout(m_image_ctx.cct, 10) << ": features=" << features << dendl;
+
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    const bool lock_both_directions =
+      m_image_ctx.clone_copy_on_read ||
+      (features & RBD_FEATURE_JOURNALING) != 0;
+    m_image_ctx.io_work_queue->set_require_lock(
+      lock_both_directions ? io::DIRECTION_BOTH : io::DIRECTION_WRITE, true);
+  }
+
+  Mutex::Locker locker(ML<I>::m_lock);
+  ML<I>::set_state_unlocked();
+}
+
+// Shutdown hook: detaches the lock from the image context, lifts the IO
+// lock requirement, unblocks writes and flushes the image watcher, which
+// receives on_finish.
+template <typename I>
+void ExclusiveLock<I>::shutdown_handler(int r, Context *on_finish) {
+  ldout(m_image_ctx.cct, 10) << dendl;
+
+  auto &image_ctx = m_image_ctx;
+  {
+    RWLock::WLocker owner_locker(image_ctx.owner_lock);
+    image_ctx.io_work_queue->set_require_lock(io::DIRECTION_BOTH, false);
+    image_ctx.exclusive_lock = nullptr;
+  }
+
+  image_ctx.io_work_queue->unblock_writes();
+  image_ctx.image_watcher->flush(on_finish);
+}
+
+// Pre-acquire hook: fails with -EROFS if a peer nacked the lock request,
+// otherwise queues a PreAcquireRequest on the op work queue.
+template <typename I>
+void ExclusiveLock<I>::pre_acquire_lock_handler(Context *on_finish) {
+  ldout(m_image_ctx.cct, 10) << dendl;
+
+  // consume (and reset) the return value recorded from the peer
+  int peer_ret_val = 0;
+  {
+    Mutex::Locker locker(ML<I>::m_lock);
+    std::swap(peer_ret_val, m_acquire_lock_peer_ret_val);
+  }
+
+  if (peer_ret_val == -EROFS) {
+    ldout(m_image_ctx.cct, 10) << ": peer nacked lock request" << dendl;
+    on_finish->complete(peer_ret_val);
+    return;
+  }
+
+  auto *req = PreAcquireRequest<I>::create(m_image_ctx, on_finish);
+  m_image_ctx.op_work_queue->queue(new FunctionContext([req](int r) {
+      req->send();
+    }));
+}
+
+// Post-acquire hook, called with the result r of the acquire attempt.
+//  - r == -EROFS: passed straight to on_finish.
+//  - other r < 0: completes the prepare-lock step; if an acquire action is
+//    pending and the lock is busy (-EBUSY/-EAGAIN), moves to the
+//    waiting-for-lock state, asks the peer for the lock and reports
+//    -ECANCELED; otherwise -EAGAIN is reported as success.
+//  - r >= 0: stores on_finish and queues a PostAcquireRequest.
+template <typename I>
+void ExclusiveLock<I>::post_acquire_lock_handler(int r, Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << ": r=" << r << dendl;
+
+ if (r == -EROFS) {
+ // peer refused to release the exclusive lock
+ on_finish->complete(r);
+ return;
+ } else if (r < 0) {
+ // locked manually because each branch below unlocks at a different point
+ ML<I>::m_lock.Lock();
+ ceph_assert(ML<I>::is_state_acquiring());
+
+ // PostAcquire state machine will not run, so we need complete prepare
+ m_image_ctx.state->handle_prepare_lock_complete();
+
+ // if lock is in-use by another client, request the lock
+ if (ML<I>::is_action_acquire_lock() && (r == -EBUSY || r == -EAGAIN)) {
+ ML<I>::set_state_waiting_for_lock();
+ ML<I>::m_lock.Unlock();
+
+ // request the lock from a peer
+ m_image_ctx.image_watcher->notify_request_lock();
+
+ // inform manage lock that we have interrupted the state machine
+ r = -ECANCELED;
+ } else {
+ ML<I>::m_lock.Unlock();
+
+ // clear error if peer owns lock
+ if (r == -EAGAIN) {
+ r = 0;
+ }
+ }
+
+ on_finish->complete(r);
+ return;
+ }
+
+ Mutex::Locker locker(ML<I>::m_lock);
+ // completed later by handle_post_acquired_lock()
+ m_pre_post_callback = on_finish;
+ using EL = ExclusiveLock<I>;
+ PostAcquireRequest<I> *req = PostAcquireRequest<I>::create(m_image_ctx,
+ util::create_context_callback<EL, &EL::handle_post_acquiring_lock>(this),
+ util::create_context_callback<EL, &EL::handle_post_acquired_lock>(this));
+
+ m_image_ctx.op_work_queue->queue(new FunctionContext([req](int r) {
+ req->send();
+ }));
+}
+
+// PostAcquireRequest callback that switches the lock to the
+// post-acquiring state; r is expected to be 0.
+template <typename I>
+void ExclusiveLock<I>::handle_post_acquiring_lock(int r)
+{
+  ldout(m_image_ctx.cct, 10) << dendl;
+
+  Mutex::Locker locker(ML<I>::m_lock);
+  ceph_assert(r == 0);
+
+  // lock is owned at this point
+  ML<I>::set_state_post_acquiring();
+}
+
+// PostAcquireRequest completion callback. On success it records the
+// acquisition time, notifies peers, lifts the IO lock requirement and
+// unblocks writes. The callback stored by post_acquire_lock_handler() is
+// always completed with r.
+template <typename I>
+void ExclusiveLock<I>::handle_post_acquired_lock(int r) {
+  ldout(m_image_ctx.cct, 10) << ": r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  {
+    Mutex::Locker locker(ML<I>::m_lock);
+    ceph_assert(ML<I>::is_state_acquiring() || ML<I>::is_state_post_acquiring());
+
+    // ceph_assert (like the rest of this file) instead of plain assert(),
+    // which is compiled out under NDEBUG and would let a null callback be
+    // dereferenced below
+    ceph_assert(m_pre_post_callback != nullptr);
+    // take ownership of the stored callback and clear the member
+    std::swap(m_pre_post_callback, on_finish);
+  }
+
+  if (r >= 0) {
+    m_image_ctx.perfcounter->tset(l_librbd_lock_acquired_time,
+                                  ceph_clock_now());
+    m_image_ctx.image_watcher->notify_acquired_lock();
+    m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_BOTH, false);
+    m_image_ctx.io_work_queue->unblock_writes();
+  }
+
+  on_finish->complete(r);
+}
+
+// Pre-release hook: queues a PreReleaseRequest on the op work queue.
+template <typename I>
+void ExclusiveLock<I>::pre_release_lock_handler(bool shutting_down,
+                                                Context *on_finish) {
+  ldout(m_image_ctx.cct, 10) << dendl;
+  Mutex::Locker locker(ML<I>::m_lock);
+
+  auto *req = PreReleaseRequest<I>::create(m_image_ctx, shutting_down,
+                                           m_async_op_tracker, on_finish);
+  m_image_ctx.op_work_queue->queue(new FunctionContext([req](int r) {
+      req->send();
+    }));
+}
+
+// Post-release hook. When shutting down, the lock is detached from the
+// image context, writes are unblocked on success and peers are always
+// notified; otherwise peers are notified only if the release succeeded.
+template <typename I>
+void ExclusiveLock<I>::post_release_lock_handler(bool shutting_down, int r,
+                                                 Context *on_finish) {
+  ldout(m_image_ctx.cct, 10) << ": r=" << r << " shutting_down="
+                             << shutting_down << dendl;
+  const bool released = (r >= 0);
+  if (shutting_down) {
+    {
+      RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
+      m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_BOTH, false);
+      m_image_ctx.exclusive_lock = nullptr;
+    }
+
+    if (released) {
+      m_image_ctx.io_work_queue->unblock_writes();
+    }
+
+    m_image_ctx.image_watcher->notify_released_lock();
+  } else {
+    {
+      Mutex::Locker locker(ML<I>::m_lock);
+      ceph_assert(ML<I>::is_state_pre_releasing() || ML<I>::is_state_releasing());
+    }
+
+    if (released) {
+      m_image_ctx.image_watcher->notify_released_lock();
+    }
+  }
+
+  on_finish->complete(r);
+}
+
+// Post-reacquire hook: notifies peers on success, then forwards r.
+template <typename I>
+void ExclusiveLock<I>::post_reacquire_lock_handler(int r, Context *on_finish) {
+  ldout(m_image_ctx.cct, 10) << dendl;
+
+  const bool reacquired = (r >= 0);
+  if (reacquired) {
+    m_image_ctx.image_watcher->notify_acquired_lock();
+  }
+
+  on_finish->complete(r);
+}
+
+// Context used by init(): on success it runs handle_init_complete() with
+// the stored features, and it always completes on_init with the result.
+template <typename I>
+struct ExclusiveLock<I>::C_InitComplete : public Context {
+  ExclusiveLock *exclusive_lock;
+  uint64_t features;
+  Context *on_init;
+
+  C_InitComplete(ExclusiveLock *lock, uint64_t feature_mask,
+                 Context *init_ctx)
+    : exclusive_lock(lock), features(feature_mask), on_init(init_ctx) {
+  }
+
+  void finish(int r) override {
+    const bool succeeded = (r == 0);
+    if (succeeded) {
+      exclusive_lock->handle_init_complete(features);
+    }
+    on_init->complete(r);
+  }
+};
+
+} // namespace librbd
+
+template class librbd::ExclusiveLock<librbd::ImageCtx>;
diff --git a/src/librbd/ExclusiveLock.h b/src/librbd/ExclusiveLock.h
new file mode 100644
index 00000000..824cb00d
--- /dev/null
+++ b/src/librbd/ExclusiveLock.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_H
+
+#include "common/AsyncOpTracker.h"
+#include "librbd/ManagedLock.h"
+#include "librbd/exclusive_lock/Policy.h"
+
+namespace librbd {
+
+// Exclusive lock for an image, built on ManagedLock. It tracks blocked
+// requests and in-flight ops and customizes the managed lock's
+// acquire/release/shutdown hooks.
+template <typename ImageCtxT = ImageCtx>
+class ExclusiveLock : public ManagedLock<ImageCtxT> {
+public:
+ // Heap-allocates a lock; the caller owns the returned pointer.
+ static ExclusiveLock *create(ImageCtxT &image_ctx) {
+ return new ExclusiveLock<ImageCtxT>(image_ctx);
+ }
+
+ ExclusiveLock(ImageCtxT &image_ctx);
+
+ // whether a request of the given type is accepted right now; ret_val
+ // (optional) receives 0 or the blocked-request error
+ bool accept_request(exclusive_lock::OperationRequestType request_type,
+ int *ret_val) const;
+ // whether ops may currently be started under the lock
+ bool accept_ops() const;
+
+ // counted blocking of requests; r is the error to report while blocked
+ void block_requests(int r);
+ void unblock_requests();
+
+ void init(uint64_t features, Context *on_init);
+ void shut_down(Context *on_shutdown);
+
+ // resumes a lock request that is waiting on a peer
+ void handle_peer_notification(int r);
+
+ // -EBLACKLISTED if the client is blacklisted, otherwise -EROFS
+ int get_unlocked_op_error() const;
+ // returns a completion context for the op, or nullptr with *ret_val set
+ Context *start_op(int* ret_val);
+
+protected:
+ void shutdown_handler(int r, Context *on_finish) override;
+ void pre_acquire_lock_handler(Context *on_finish) override;
+ void post_acquire_lock_handler(int r, Context *on_finish) override;
+ void pre_release_lock_handler(bool shutting_down,
+ Context *on_finish) override;
+ void post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish) override;
+ void post_reacquire_lock_handler(int r, Context *on_finish) override;
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * <start> * * > WAITING_FOR_REGISTER --------\
+ * | * (watch not registered) |
+ * | * |
+ * | * * > WAITING_FOR_PEER ------------\
+ * | * (request_lock busy) |
+ * | * |
+ * | * * * * * * * * * * * * * * |
+ * | * |
+ * v (init) (try_lock/request_lock) * |
+ * UNINITIALIZED -------> UNLOCKED ------------------------> ACQUIRING <--/
+ * ^ |
+ * | v
+ * RELEASING POST_ACQUIRING
+ * | |
+ * | |
+ * | (release_lock) v
+ * PRE_RELEASING <------------------------ LOCKED
+ *
+ * <LOCKED state>
+ * |
+ * v
+ * REACQUIRING -------------------------------------> <finish>
+ * . ^
+ * . |
+ * . . . > <RELEASE action> ---> <ACQUIRE action> ---/
+ *
+ * <UNLOCKED/LOCKED states>
+ * |
+ * |
+ * v
+ * PRE_SHUTTING_DOWN ---> SHUTTING_DOWN ---> SHUTDOWN ---> <finish>
+ *
+ * @endverbatim
+ */
+
+ // completion context used by init(), defined in ExclusiveLock.cc
+ struct C_InitComplete;
+
+ ImageCtxT& m_image_ctx;
+ // on_finish saved by post_acquire_lock_handler() until the post-acquire
+ // request completes
+ Context *m_pre_post_callback = nullptr;
+
+ // tracks ops started through start_op()
+ AsyncOpTracker m_async_op_tracker;
+
+ // number of outstanding block_requests() calls and the error they report
+ uint32_t m_request_blocked_count = 0;
+ int m_request_blocked_ret_val = 0;
+
+ // return value recorded by handle_peer_notification()
+ int m_acquire_lock_peer_ret_val = 0;
+
+ // variant for callers that already hold the managed lock's mutex
+ bool accept_ops(const Mutex &lock) const;
+
+ void handle_init_complete(uint64_t features);
+ void handle_post_acquiring_lock(int r);
+ void handle_post_acquired_lock(int r);
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_H
diff --git a/src/librbd/Features.cc b/src/librbd/Features.cc
new file mode 100644
index 00000000..8623adb4
--- /dev/null
+++ b/src/librbd/Features.cc
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/lexical_cast.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "librbd/Features.h"
+#include "include/rbd/features.h"
+
+#include <map>
+#include <vector>
+
+static const std::map<std::string, uint64_t> RBD_FEATURE_MAP = { // feature name -> feature bit; std::map iterates in name order
+ {RBD_FEATURE_NAME_LAYERING, RBD_FEATURE_LAYERING},
+ {RBD_FEATURE_NAME_STRIPINGV2, RBD_FEATURE_STRIPINGV2},
+ {RBD_FEATURE_NAME_EXCLUSIVE_LOCK, RBD_FEATURE_EXCLUSIVE_LOCK},
+ {RBD_FEATURE_NAME_OBJECT_MAP, RBD_FEATURE_OBJECT_MAP},
+ {RBD_FEATURE_NAME_FAST_DIFF, RBD_FEATURE_FAST_DIFF},
+ {RBD_FEATURE_NAME_DEEP_FLATTEN, RBD_FEATURE_DEEP_FLATTEN},
+ {RBD_FEATURE_NAME_JOURNALING, RBD_FEATURE_JOURNALING},
+ {RBD_FEATURE_NAME_DATA_POOL, RBD_FEATURE_DATA_POOL},
+ {RBD_FEATURE_NAME_OPERATIONS, RBD_FEATURE_OPERATIONS},
+ {RBD_FEATURE_NAME_MIGRATING, RBD_FEATURE_MIGRATING},
+};
+static_assert((RBD_FEATURE_MIGRATING << 1) > RBD_FEATURES_ALL, // fails to compile once RBD_FEATURES_ALL has a bit above MIGRATING, as a reminder to extend the table
+ "new RBD feature added");
+
+
+namespace librbd {
+
+// Render a feature bitmask as a comma-separated list of feature names.
+//
+// Names are emitted in RBD_FEATURE_MAP iteration order. Bits without an
+// entry in the table are left out of the result; if any remain and err is
+// non-null, one "ignoring unknown feature mask" message carrying the
+// leftover bits in hex is written to *err.
+std::string rbd_features_to_string(uint64_t features,
+                                   std::ostream *err)
+{
+  std::string r;
+  for (const auto& i : RBD_FEATURE_MAP) {
+    if (features & i.second) {
+      // the separator goes between names only, never before the first one
+      if (!r.empty()) {
+        r += ",";
+      }
+      r += i.first;
+      // clear the bit so only unrecognised bits are left for the report below
+      features &= ~i.second;
+    }
+  }
+  if (err && features) {
+    *err << "ignoring unknown feature mask 0x"
+         << std::hex << features << std::dec;
+  }
+  return r;
+}
+
+// Parse a feature specification into a feature bitmask.
+//
+// orig_value is trimmed first and may be:
+//   - empty: RBD_FEATURES_DEFAULT is returned;
+//   - a decimal number: bits outside RBD_FEATURES_ALL and bits inside
+//     RBD_FEATURES_INTERNAL are dropped (each drop is reported to *err);
+//   - a comma-separated list of feature names: every known name sets its
+//     bit, unknown names are reported to *err and skipped. The internal
+//     feature filter is applied to the numeric form only.
+// err may be null, in which case nothing is reported.
+uint64_t rbd_features_from_string(const std::string& orig_value,
+                                  std::ostream *err)
+{
+  uint64_t features = 0;
+  std::string value = orig_value;
+  boost::trim(value);
+
+  // empty string means default features
+  if (value.empty()) {
+    return RBD_FEATURES_DEFAULT;
+  }
+
+  try {
+    // numeric?
+    features = boost::lexical_cast<uint64_t>(value);
+
+    // drop unrecognized bits
+    uint64_t unsupported_features = (features & ~RBD_FEATURES_ALL);
+    if (unsupported_features != 0ull) {
+      features &= RBD_FEATURES_ALL;
+      if (err) {
+        *err << "ignoring unknown feature mask 0x"
+             << std::hex << unsupported_features << std::dec;
+      }
+    }
+    uint64_t internal_features = (features & RBD_FEATURES_INTERNAL);
+    if (internal_features != 0ULL) {
+      features &= ~RBD_FEATURES_INTERNAL;
+      if (err) {
+        // restore decimal output so the caller's stream is not left in hex
+        *err << "ignoring internal feature mask 0x"
+             << std::hex << internal_features << std::dec;
+      }
+    }
+  } catch (const boost::bad_lexical_cast&) {
+    // feature name list?
+    bool errors = false;
+    std::vector<std::string> feature_names;
+    boost::split(feature_names, value, boost::is_any_of(","));
+    for (auto& feature_name : feature_names) {
+      boost::trim(feature_name);
+      auto feature_it = RBD_FEATURE_MAP.find(feature_name);
+      if (feature_it != RBD_FEATURE_MAP.end()) {
+        // OR rather than add: a name listed twice must not carry into a
+        // neighbouring feature bit
+        features |= feature_it->second;
+      } else if (err) {
+        if (errors) {
+          *err << ", ";
+        } else {
+          errors = true;
+        }
+        *err << "ignoring unknown feature " << feature_name;
+      }
+    }
+  }
+  return features;
+}
+
+} // namespace librbd
diff --git a/src/librbd/Features.h b/src/librbd/Features.h
new file mode 100644
index 00000000..6a88827c
--- /dev/null
+++ b/src/librbd/Features.h
@@ -0,0 +1,16 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <ostream>
+
+namespace librbd {
+
+ std::string rbd_features_to_string(uint64_t features, // feature bitmask -> comma-separated feature names
+ std::ostream *err); // optional sink for the "unknown feature mask" note; may be null
+ uint64_t rbd_features_from_string(const std::string& value, // numeric mask or comma-separated names -> feature bitmask
+ std::ostream *err); // optional sink for the "ignoring ..." notes; may be null
+
+} // librbd
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
new file mode 100644
index 00000000..ed4bc218
--- /dev/null
+++ b/src/librbd/ImageCtx.cc
@@ -0,0 +1,927 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <errno.h>
+#include <boost/assign/list_of.hpp>
+#include <stddef.h>
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+
+#include "librbd/AsyncRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/internal.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/LibrbdAdminSocketHook.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Operations.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/LibrbdWriteback.h"
+#include "librbd/exclusive_lock/AutomaticPolicy.h"
+#include "librbd/exclusive_lock/StandardPolicy.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/journal/StandardPolicy.h"
+
+#include "osdc/Striper.h"
+#include <boost/bind.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageCtx: "
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+
+namespace librbd {
+
+namespace {
+
+class ThreadPoolSingleton : public ThreadPool { // thread pool that also owns the op work queue running on it
+public:
+ ContextWQ *op_work_queue; // allocated in the constructor, freed in the destructor
+
+ explicit ThreadPoolSingleton(CephContext *cct)
+ : ThreadPool(cct, "librbd::thread_pool", "tp_librbd", 1,
+ "rbd_op_threads"),
+ op_work_queue(new ContextWQ("librbd::op_work_queue",
+ cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"),
+ this)) {
+ start(); // the pool is started as part of construction
+ }
+ ~ThreadPoolSingleton() override {
+ op_work_queue->drain(); // drain and free the queue first, then stop the pool
+ delete op_work_queue;
+
+ stop();
+ }
+};
+
+class SafeTimerSingleton : public SafeTimer { // timer bundled with the mutex it is constructed with
+public:
+ Mutex lock;
+
+ explicit SafeTimerSingleton(CephContext *cct)
+ : SafeTimer(cct, lock, true), // NOTE(review): base receives `lock` before that member is constructed; presumably it only stores the reference - confirm
+ lock("librbd::Journal::SafeTimerSingleton::lock") {
+ init();
+ }
+ ~SafeTimerSingleton() {
+ Mutex::Locker locker(lock); // shutdown() is invoked with the timer lock held
+ shutdown();
+ }
+};
+
+} // anonymous namespace
+
+ const string ImageCtx::METADATA_CONF_PREFIX = "conf_"; // image-metadata keys starting with this prefix are treated as config overrides by apply_metadata()
+
+ ImageCtx::ImageCtx(const string &image_name, const string &image_id, // primary constructor: sets defaults and allocates helper objects
+ const char *snap, IoCtx& p, bool ro) // snap may be null (no snapshot name recorded)
+ : cct((CephContext*)p.cct()),
+ config(cct->_conf),
+ perfcounter(NULL),
+ snap_id(CEPH_NOSNAP),
+ snap_exists(true),
+ read_only(ro),
+ exclusive_locked(false),
+ name(image_name),
+ image_watcher(NULL),
+ journal(NULL),
+ owner_lock(util::unique_lock_name("librbd::ImageCtx::owner_lock", this)),
+ md_lock(util::unique_lock_name("librbd::ImageCtx::md_lock", this)),
+ snap_lock(util::unique_lock_name("librbd::ImageCtx::snap_lock", this)),
+ timestamp_lock(util::unique_lock_name("librbd::ImageCtx::timestamp_lock", this)),
+ parent_lock(util::unique_lock_name("librbd::ImageCtx::parent_lock", this)),
+ object_map_lock(util::unique_lock_name("librbd::ImageCtx::object_map_lock", this)),
+ async_ops_lock(util::unique_lock_name("librbd::ImageCtx::async_ops_lock", this)),
+ copyup_list_lock(util::unique_lock_name("librbd::ImageCtx::copyup_list_lock", this)),
+ completed_reqs_lock(util::unique_lock_name("librbd::ImageCtx::completed_reqs_lock", this)),
+ extra_read_flags(0),
+ old_format(false),
+ order(0), size(0), features(0),
+ format_string(NULL),
+ id(image_id), parent(NULL),
+ stripe_unit(0), stripe_count(0), flags(0),
+ readahead(),
+ total_bytes_read(0),
+ state(new ImageState<>(this)),
+ operations(new Operations<>(*this)),
+ exclusive_lock(nullptr), object_map(nullptr),
+ io_work_queue(nullptr), op_work_queue(nullptr),
+ asok_hook(nullptr),
+ trace_endpoint("librbd")
+ {
+ md_ctx.dup(p); // metadata and data contexts both start as duplicates of p
+ data_ctx.dup(p);
+ if (snap)
+ snap_name = snap;
+
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&header, 0, sizeof(header));
+
+ ThreadPool *thread_pool;
+ get_thread_pool_instance(cct, &thread_pool, &op_work_queue); // pool and op queue are looked up through the CephContext, not created per image
+ io_work_queue = new io::ImageRequestWQ<>(
+ this, "librbd::io_work_queue",
+ cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"),
+ thread_pool);
+ io_object_dispatcher = new io::ObjectDispatcher<>(this);
+
+ if (cct->_conf.get_val<bool>("rbd_auto_exclusive_lock_until_manual_request")) { // config option selects the exclusive-lock policy implementation
+ exclusive_lock_policy = new exclusive_lock::AutomaticPolicy(this);
+ } else {
+ exclusive_lock_policy = new exclusive_lock::StandardPolicy(this);
+ }
+ journal_policy = new journal::StandardPolicy<ImageCtx>(this);
+ }
+
+ ImageCtx::ImageCtx(const string &image_name, const string &image_id, // snapshot-id overload
+ uint64_t snap_id, IoCtx& p, bool ro)
+ : ImageCtx(image_name, image_id, "", p, ro) { // delegates with an empty snapshot name
+ open_snap_id = snap_id; // only the requested id is recorded here
+ }
+
+ ImageCtx::~ImageCtx() { // expects the watchers, lock, object map, journal and asok hook to be gone already
+ ceph_assert(config_watcher == nullptr);
+ ceph_assert(image_watcher == NULL);
+ ceph_assert(exclusive_lock == NULL);
+ ceph_assert(object_map == NULL);
+ ceph_assert(journal == NULL);
+ ceph_assert(asok_hook == NULL);
+
+ if (perfcounter) {
+ perf_stop();
+ }
+ delete[] format_string; // allocated with new char[] in init_layout()
+
+ md_ctx.aio_flush();
+ if (data_ctx.is_valid()) {
+ data_ctx.aio_flush();
+ }
+ io_work_queue->drain(); // drained here, deleted further down
+
+ delete io_object_dispatcher;
+
+ delete journal_policy;
+ delete exclusive_lock_policy;
+ delete io_work_queue;
+ delete operations;
+ delete state;
+ }
+
+ // Second-stage setup. Requires header_oid to be set and, unless this is an
+ // old-format image, a non-empty id. Installs the admin socket hook, names
+ // the trace endpoint and perf counters after the image, and creates the
+ // image watcher.
+ void ImageCtx::init() {
+   ceph_assert(!header_oid.empty());
+   ceph_assert(old_format || !id.empty());
+
+   asok_hook = new LibrbdAdminSocketHook(this);
+
+   // "librbd-<id>-<pool>-<name>", plus "-<snap>" when a snapshot name is set
+   string counters_name("librbd-");
+   counters_name += id;
+   counters_name += "-";
+   counters_name += md_ctx.get_pool_name();
+   counters_name += "-";
+   counters_name += name;
+   if (!snap_name.empty()) {
+     counters_name.append("-").append(snap_name);
+   }
+
+   trace_endpoint.copy_name(counters_name);
+   perf_start(counters_name);
+
+   ceph_assert(image_watcher == NULL);
+   image_watcher = new ImageWatcher<>(*this);
+ }
+
+ void ImageCtx::shutdown() { // frees the two objects allocated by init()
+ delete image_watcher;
+ image_watcher = nullptr; // reset so the destructor's assertions hold
+
+ delete asok_hook;
+ asok_hook = nullptr;
+ }
+
+ // Derive everything that depends on order / striping: default stripe
+ // settings, read-ahead alignments, the file layout and the printf-style
+ // format used to build object names.
+ void ImageCtx::init_layout(int64_t pool_id)
+ {
+   // no explicit striping: one stripe unit spanning a whole object
+   const bool striping_unset = (stripe_unit == 0 || stripe_count == 0);
+   if (striping_unset) {
+     stripe_unit = 1ull << order;
+     stripe_count = 1;
+   }
+
+   vector<uint64_t> readahead_alignments;
+   // object set (in file striping terminology)
+   readahead_alignments.push_back(stripe_count << order);
+   // stripe
+   readahead_alignments.push_back(stripe_unit * stripe_count);
+   // stripe unit
+   readahead_alignments.push_back(stripe_unit);
+   readahead.set_alignments(readahead_alignments);
+
+   layout = file_layout_t();
+   layout.stripe_unit = stripe_unit;
+   layout.stripe_count = stripe_count;
+   layout.object_size = 1ull << order;
+   layout.pool_id = pool_id;	// FIXME: pool id overflow?
+
+   // rebuild the object-name format: "<prefix>.%012llx" for old-format
+   // images, "<prefix>.%016llx" otherwise
+   delete[] format_string;
+   const size_t format_len = object_prefix.length() + 16;
+   format_string = new char[format_len];
+   if (old_format) {
+     snprintf(format_string, format_len, "%s.%%012llx",
+              object_prefix.c_str());
+   } else {
+     snprintf(format_string, format_len, "%s.%%016llx",
+              object_prefix.c_str());
+   }
+
+   ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
+                  << " stripe_count " << stripe_count
+                  << " object_size " << layout.object_size
+                  << " prefix " << object_prefix
+                  << " format " << format_string
+                  << dendl;
+ }
+
+ void ImageCtx::perf_start(string name) { // builds and registers the per-image perf counter set under `name`
+ auto perf_prio = PerfCountersBuilder::PRIO_DEBUGONLY;
+ if (child == nullptr) {
+ // ensure top-level IO stats are exported for librbd daemons
+ perf_prio = PerfCountersBuilder::PRIO_USEFUL;
+ }
+
+ PerfCountersBuilder plb(cct, name, l_librbd_first, l_librbd_last);
+
+ plb.add_u64_counter(l_librbd_rd, "rd", "Reads", "r", perf_prio); // read/write counters carry a nick and the priority chosen above
+ plb.add_u64_counter(l_librbd_rd_bytes, "rd_bytes", "Data size in reads",
+ "rb", perf_prio, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_rd_latency, "rd_latency", "Latency of reads",
+ "rl", perf_prio);
+ plb.add_u64_counter(l_librbd_wr, "wr", "Writes", "w", perf_prio);
+ plb.add_u64_counter(l_librbd_wr_bytes, "wr_bytes", "Written data",
+ "wb", perf_prio, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_wr_latency, "wr_latency", "Write latency",
+ "wl", perf_prio);
+ plb.add_u64_counter(l_librbd_discard, "discard", "Discards"); // remaining counters are registered without an explicit nick/priority
+ plb.add_u64_counter(l_librbd_discard_bytes, "discard_bytes", "Discarded data", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_discard_latency, "discard_latency", "Discard latency");
+ plb.add_u64_counter(l_librbd_flush, "flush", "Flushes");
+ plb.add_time_avg(l_librbd_flush_latency, "flush_latency", "Latency of flushes");
+ plb.add_u64_counter(l_librbd_ws, "ws", "WriteSames");
+ plb.add_u64_counter(l_librbd_ws_bytes, "ws_bytes", "WriteSame data", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_ws_latency, "ws_latency", "WriteSame latency");
+ plb.add_u64_counter(l_librbd_cmp, "cmp", "CompareAndWrites");
+ plb.add_u64_counter(l_librbd_cmp_bytes, "cmp_bytes", "Data size in cmps", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_cmp_latency, "cmp_latency", "Latency of cmps");
+ plb.add_u64_counter(l_librbd_snap_create, "snap_create", "Snap creations");
+ plb.add_u64_counter(l_librbd_snap_remove, "snap_remove", "Snap removals");
+ plb.add_u64_counter(l_librbd_snap_rollback, "snap_rollback", "Snap rollbacks");
+ plb.add_u64_counter(l_librbd_snap_rename, "snap_rename", "Snap rename");
+ plb.add_u64_counter(l_librbd_notify, "notify", "Updated header notifications");
+ plb.add_u64_counter(l_librbd_resize, "resize", "Resizes");
+ plb.add_u64_counter(l_librbd_readahead, "readahead", "Read ahead");
+ plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes", "Data size in read ahead", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_u64_counter(l_librbd_invalidate_cache, "invalidate_cache", "Cache invalidates");
+
+ plb.add_time(l_librbd_opened_time, "opened_time", "Opened time",
+ "ots", perf_prio);
+ plb.add_time(l_librbd_lock_acquired_time, "lock_acquired_time",
+ "Lock acquired time", "lats", perf_prio);
+
+ perfcounter = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perfcounter);
+
+ perfcounter->tset(l_librbd_opened_time, ceph_clock_now()); // stamp the open time immediately
+ }
+
+ void ImageCtx::perf_stop() { // unregisters and frees the counters created by perf_start()
+ ceph_assert(perfcounter);
+ cct->get_perfcounters_collection()->remove(perfcounter);
+ delete perfcounter; // NOTE(review): the pointer is not reset here, so it is left dangling after this call
+ }
+
+ // Add a flag that get_read_flags() will include in every result.
+ void ImageCtx::set_read_flag(unsigned flag) {
+   extra_read_flags = extra_read_flags | flag;
+ }
+
+ // Compute the librados operation flags for a read of the given snapshot.
+ // Reads of the head get only the extra flags; snapshot reads additionally
+ // get BALANCE_READS or, failing that, LOCALIZE_READS when the matching
+ // config option is enabled.
+ int ImageCtx::get_read_flags(snap_t snap_id) {
+   int flags = librados::OPERATION_NOFLAG | extra_read_flags;
+   if (snap_id != LIBRADOS_SNAP_HEAD) {
+     if (config.get_val<bool>("rbd_balance_snap_reads")) {
+       flags |= librados::OPERATION_BALANCE_READS;
+     } else if (config.get_val<bool>("rbd_localize_snap_reads")) {
+       flags |= librados::OPERATION_LOCALIZE_READS;
+     }
+   }
+   return flags;
+ }
+
+ // Switch the image context to an existing snapshot. Requires snap_lock to
+ // be write-locked. Returns 0 on success, -ENOENT when in_snap_id is
+ // CEPH_NOSNAP or not a known snapshot.
+ int ImageCtx::snap_set(uint64_t in_snap_id) {
+   ceph_assert(snap_lock.is_wlocked());
+   auto entry = snap_info.find(in_snap_id);
+   if (in_snap_id == CEPH_NOSNAP || entry == snap_info.end()) {
+     return -ENOENT;
+   }
+
+   snap_id = in_snap_id;
+   snap_namespace = entry->second.snap_namespace;
+   snap_name = entry->second.name;
+   snap_exists = true;
+   // keep reads on the data pool pointed at the selected snapshot
+   if (data_ctx.is_valid()) {
+     data_ctx.snap_set_read(snap_id);
+   }
+   return 0;
+ }
+
+ // Return the image context to the head (no snapshot selected). Requires
+ // snap_lock to be write-locked.
+ void ImageCtx::snap_unset()
+ {
+   ceph_assert(snap_lock.is_wlocked());
+
+   snap_id = CEPH_NOSNAP;
+   snap_namespace = {};
+   snap_name.clear();
+   snap_exists = true;
+
+   if (!data_ctx.is_valid()) {
+     return;
+   }
+   data_ctx.snap_set_read(snap_id);
+ }
+
+ // Look up a snapshot id by (namespace, name). Returns CEPH_NOSNAP when no
+ // such snapshot is known. snap_lock must be held.
+ snap_t ImageCtx::get_snap_id(const cls::rbd::SnapshotNamespace& in_snap_namespace,
+                              const string& in_snap_name) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   auto found = snap_ids.find({in_snap_namespace, in_snap_name});
+   if (found == snap_ids.end()) {
+     return CEPH_NOSNAP;
+   }
+   return found->second;
+ }
+
+ // Return the stored SnapInfo for a snapshot id, or nullptr when the id is
+ // unknown. snap_lock must be held.
+ const SnapInfo* ImageCtx::get_snap_info(snap_t in_snap_id) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   auto entry = snap_info.find(in_snap_id);
+   if (entry == snap_info.end()) {
+     return nullptr;
+   }
+   return &entry->second;
+ }
+
+ // Copy the name of snapshot in_snap_id into *out_snap_name.
+ // Returns 0 on success, -ENOENT for an unknown id. snap_lock must be held.
+ int ImageCtx::get_snap_name(snap_t in_snap_id,
+                             string *out_snap_name) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   const SnapInfo *snap = get_snap_info(in_snap_id);
+   if (snap == nullptr) {
+     return -ENOENT;
+   }
+   *out_snap_name = snap->name;
+   return 0;
+ }
+
+ // Copy the namespace of snapshot in_snap_id into *out_snap_namespace.
+ // Returns 0 on success, -ENOENT for an unknown id. snap_lock must be held.
+ int ImageCtx::get_snap_namespace(snap_t in_snap_id,
+                                  cls::rbd::SnapshotNamespace *out_snap_namespace) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   const SnapInfo *snap = get_snap_info(in_snap_id);
+   if (snap == nullptr) {
+     return -ENOENT;
+   }
+   *out_snap_namespace = snap->snap_namespace;
+   return 0;
+ }
+
+ // Copy the parent image spec recorded for snapshot in_snap_id into
+ // *out_pspec. Returns 0 on success, -ENOENT for an unknown id.
+ int ImageCtx::get_parent_spec(snap_t in_snap_id,
+                               cls::rbd::ParentImageSpec *out_pspec) const
+ {
+   const SnapInfo *snap = get_snap_info(in_snap_id);
+   if (snap == nullptr) {
+     return -ENOENT;
+   }
+   *out_pspec = snap->parent.spec;
+   return 0;
+ }
+
+ uint64_t ImageCtx::get_current_size() const // raw `size` member; snap_lock must be held
+ {
+ ceph_assert(snap_lock.is_locked());
+ return size;
+ }
+
+ uint64_t ImageCtx::get_object_size() const // object size in bytes: 2^order
+ {
+ return 1ull << order;
+ }
+
+ string ImageCtx::get_object_name(uint64_t num) const { // formats object number `num` with the format built by init_layout()
+ char buf[object_prefix.length() + 32]; // variable-length array (compiler extension in C++), prefix length plus 32 bytes
+ snprintf(buf, sizeof(buf), format_string, num);
+ return string(buf);
+ }
+
+ uint64_t ImageCtx::get_stripe_unit() const // plain accessor
+ {
+ return stripe_unit;
+ }
+
+ uint64_t ImageCtx::get_stripe_count() const // plain accessor
+ {
+ return stripe_count;
+ }
+
+ uint64_t ImageCtx::get_stripe_period() const // stripe_count objects of 2^order bytes each
+ {
+ return stripe_count * (1ull << order);
+ }
+
+ utime_t ImageCtx::get_create_timestamp() const // plain accessor, no lock assertion
+ {
+ return create_timestamp;
+ }
+
+ utime_t ImageCtx::get_access_timestamp() const // plain accessor, no lock assertion
+ {
+ return access_timestamp;
+ }
+
+ utime_t ImageCtx::get_modify_timestamp() const // plain accessor, no lock assertion
+ {
+ return modify_timestamp;
+ }
+
+ void ImageCtx::set_access_timestamp(utime_t at) // caller must hold timestamp_lock for writing
+ {
+ ceph_assert(timestamp_lock.is_wlocked());
+ access_timestamp = at;
+ }
+
+ void ImageCtx::set_modify_timestamp(utime_t mt) // caller must hold timestamp_lock
+ {
+ ceph_assert(timestamp_lock.is_locked()); // NOTE(review): checks is_locked() while the access-timestamp setter checks is_wlocked() - confirm the weaker check is intended
+ modify_timestamp = mt;
+ }
+
+ // Report whether snapshot in_snap_id is in the PROTECTED state.
+ // Returns 0 and sets *is_protected on success, -ENOENT for an unknown id.
+ // snap_lock must be held.
+ int ImageCtx::is_snap_protected(snap_t in_snap_id,
+                                 bool *is_protected) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   const SnapInfo *snap = get_snap_info(in_snap_id);
+   if (snap == nullptr) {
+     return -ENOENT;
+   }
+   *is_protected =
+     (snap->protection_status == RBD_PROTECTION_STATUS_PROTECTED);
+   return 0;
+ }
+
+ // Report whether snapshot in_snap_id is in the UNPROTECTED state.
+ // Returns 0 and sets *is_unprotected on success, -ENOENT for an unknown id.
+ // snap_lock must be held.
+ int ImageCtx::is_snap_unprotected(snap_t in_snap_id,
+                                   bool *is_unprotected) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   const SnapInfo *snap = get_snap_info(in_snap_id);
+   if (snap == nullptr) {
+     return -ENOENT;
+   }
+   *is_unprotected =
+     (snap->protection_status == RBD_PROTECTION_STATUS_UNPROTECTED);
+   return 0;
+ }
+
+ // Record a snapshot in all three in-memory indexes: the id list, the
+ // id -> SnapInfo map and the (namespace, name) -> id map. Requires
+ // snap_lock to be write-locked.
+ void ImageCtx::add_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+                         string in_snap_name,
+                         snap_t id, uint64_t in_size,
+                         const ParentImageInfo &parent,
+                         uint8_t protection_status, uint64_t flags,
+                         utime_t timestamp)
+ {
+   ceph_assert(snap_lock.is_wlocked());
+   snaps.push_back(id);
+   snap_info.insert({id, SnapInfo(in_snap_name, in_snap_namespace,
+                                  in_size, parent, protection_status,
+                                  flags, timestamp)});
+   snap_ids.insert({{in_snap_namespace, in_snap_name}, id});
+ }
+
+ // Drop a snapshot from the id list, the id -> SnapInfo map and the
+ // (namespace, name) -> id map. Requires snap_lock to be write-locked.
+ void ImageCtx::rm_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+                        string in_snap_name,
+                        snap_t id)
+ {
+   ceph_assert(snap_lock.is_wlocked());
+
+   // erase-remove: compact the matching ids to the tail, then cut it off
+   auto tail = std::remove(snaps.begin(), snaps.end(), id);
+   snaps.erase(tail, snaps.end());
+
+   snap_info.erase(id);
+   snap_ids.erase({in_snap_namespace, in_snap_name});
+ }
+
+ // Size of the image as seen at in_snap_id. For the head, the target size
+ // of the front resize request is reported while that request is a shrink;
+ // otherwise the current size. For a snapshot, its recorded size, or 0 when
+ // the id is unknown. snap_lock must be held.
+ uint64_t ImageCtx::get_image_size(snap_t in_snap_id) const
+ {
+   ceph_assert(snap_lock.is_locked());
+
+   if (in_snap_id != CEPH_NOSNAP) {
+     const SnapInfo *snap = get_snap_info(in_snap_id);
+     if (snap == nullptr) {
+       return 0;
+     }
+     return snap->size;
+   }
+
+   const bool shrink_in_flight =
+     !resize_reqs.empty() && resize_reqs.front()->shrinking();
+   if (shrink_in_flight) {
+     return resize_reqs.front()->get_image_size();
+   }
+   return size;
+ }
+
+ // Number of objects needed to hold the image at in_snap_id, computed from
+ // the current layout. snap_lock must be held.
+ uint64_t ImageCtx::get_object_count(snap_t in_snap_id) const {
+   ceph_assert(snap_lock.is_locked());
+   return Striper::get_num_objects(layout, get_image_size(in_snap_id));
+ }
+
+ // True when every bit in the requested mask is set in the image's
+ // features. This overload takes snap_lock for reading itself.
+ bool ImageCtx::test_features(uint64_t features) const
+ {
+   RWLock::RLocker snap_locker(snap_lock);
+   return test_features(features, snap_lock);
+ }
+
+ // Same check for callers that already hold snap_lock; the lock argument
+ // only documents that requirement.
+ bool ImageCtx::test_features(uint64_t in_features,
+                              const RWLock &in_snap_lock) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   const uint64_t present = features & in_features;
+   return present == in_features;
+ }
+
+ // True when every bit in in_op_features is set in the image's op
+ // features. This overload takes snap_lock for reading itself.
+ bool ImageCtx::test_op_features(uint64_t in_op_features) const
+ {
+   RWLock::RLocker l(snap_lock);
+   return test_op_features(in_op_features, snap_lock);
+ }
+
+ // Same check for callers that already hold snap_lock; the lock argument
+ // only documents that requirement.
+ bool ImageCtx::test_op_features(uint64_t in_op_features,
+                                 const RWLock &in_snap_lock) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   const uint64_t present = op_features & in_op_features;
+   return present == in_op_features;
+ }
+
+ // Fetch the flags of the head (CEPH_NOSNAP) or of a snapshot into *_flags.
+ // Returns 0 on success, -ENOENT for an unknown snapshot id.
+ // snap_lock must be held.
+ int ImageCtx::get_flags(librados::snap_t _snap_id, uint64_t *_flags) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   if (_snap_id != CEPH_NOSNAP) {
+     const SnapInfo *snap = get_snap_info(_snap_id);
+     if (snap == nullptr) {
+       return -ENOENT;
+     }
+     *_flags = snap->flags;
+     return 0;
+   }
+   *_flags = flags;
+   return 0;
+ }
+
+ // Check whether all bits of `flags` are set for the head or a snapshot.
+ // This overload takes snap_lock for reading itself.
+ int ImageCtx::test_flags(librados::snap_t in_snap_id,
+                          uint64_t flags, bool *flags_set) const
+ {
+   RWLock::RLocker snap_locker(snap_lock);
+   return test_flags(in_snap_id, flags, snap_lock, flags_set);
+ }
+
+ // Same check for callers that already hold snap_lock. Returns 0 and sets
+ // *flags_set on success; a negative error from get_flags() is passed
+ // through and *flags_set is left untouched.
+ int ImageCtx::test_flags(librados::snap_t in_snap_id,
+                          uint64_t flags, const RWLock &in_snap_lock,
+                          bool *flags_set) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   uint64_t current_flags;
+   const int r = get_flags(in_snap_id, &current_flags);
+   if (r >= 0) {
+     *flags_set = ((current_flags & flags) == flags);
+     return 0;
+   }
+   return r;
+ }
+
+ // Set or clear `flag` in the flags of the head (CEPH_NOSNAP) or of a
+ // snapshot. Returns 0 on success, -ENOENT for an unknown snapshot id.
+ // Requires snap_lock to be write-locked.
+ int ImageCtx::update_flags(snap_t in_snap_id, uint64_t flag, bool enabled)
+ {
+   ceph_assert(snap_lock.is_wlocked());
+
+   // start with the head's flags; retarget when a snapshot was requested
+   uint64_t *target = &flags;
+   if (in_snap_id != CEPH_NOSNAP) {
+     auto entry = snap_info.find(in_snap_id);
+     if (entry == snap_info.end()) {
+       return -ENOENT;
+     }
+     target = &entry->second.flags;
+   }
+
+   if (enabled) {
+     *target |= flag;
+   } else {
+     *target &= ~flag;
+   }
+   return 0;
+ }
+
+ // Parent information for the head (CEPH_NOSNAP) or for a snapshot; null
+ // when the snapshot id is unknown. Both snap_lock and parent_lock must be
+ // held.
+ const ParentImageInfo* ImageCtx::get_parent_info(snap_t in_snap_id) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   ceph_assert(parent_lock.is_locked());
+   if (in_snap_id == CEPH_NOSNAP) {
+     return &parent_md;
+   }
+   const SnapInfo *snap = get_snap_info(in_snap_id);
+   if (snap == nullptr) {
+     return NULL;
+   }
+   return &snap->parent;
+ }
+
+ // Pool id of the parent at in_snap_id, or -1 when there is no parent info.
+ int64_t ImageCtx::get_parent_pool_id(snap_t in_snap_id) const
+ {
+   const auto parent_info = get_parent_info(in_snap_id);
+   if (!parent_info) {
+     return -1;
+   }
+   return parent_info->spec.pool_id;
+ }
+
+ // Image id of the parent at in_snap_id, or "" when there is no parent info.
+ string ImageCtx::get_parent_image_id(snap_t in_snap_id) const
+ {
+   const auto parent_info = get_parent_info(in_snap_id);
+   if (!parent_info) {
+     return "";
+   }
+   return parent_info->spec.image_id;
+ }
+
+ // Snapshot id of the parent at in_snap_id, or CEPH_NOSNAP when there is no
+ // parent info.
+ uint64_t ImageCtx::get_parent_snap_id(snap_t in_snap_id) const
+ {
+   const auto parent_info = get_parent_info(in_snap_id);
+   if (!parent_info) {
+     return CEPH_NOSNAP;
+   }
+   return parent_info->spec.snap_id;
+ }
+
+ // Fetch the parent overlap recorded for the head or a snapshot into
+ // *overlap. Returns 0 on success, -ENOENT when there is no parent info.
+ // snap_lock must be held.
+ int ImageCtx::get_parent_overlap(snap_t in_snap_id, uint64_t *overlap) const
+ {
+   ceph_assert(snap_lock.is_locked());
+   const auto parent_info = get_parent_info(in_snap_id);
+   if (!parent_info) {
+     return -ENOENT;
+   }
+   *overlap = parent_info->overlap;
+   return 0;
+ }
+
+ void ImageCtx::register_watch(Context *on_finish) { // forwards to the image watcher, which must already exist (created in init())
+ ceph_assert(image_watcher != NULL);
+ image_watcher->register_watch(on_finish);
+ }
+
+ // Clip a list of (offset, length) extents to the parent overlap, in place.
+ // Trailing extents that start at or beyond `overlap` are removed and the
+ // last remaining extent is shortened if it crosses the boundary. Returns
+ // the total length of what is left.
+ uint64_t ImageCtx::prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
+                                         uint64_t overlap)
+ {
+   // drop extents completely beyond the overlap
+   while (!objectx.empty() && objectx.back().first >= overlap) {
+     objectx.pop_back();
+   }
+
+   // trim final overlapping extent
+   if (!objectx.empty()) {
+     pair<uint64_t,uint64_t>& last = objectx.back();
+     if (last.first + last.second > overlap) {
+       last.second = overlap - last.first;
+     }
+   }
+
+   uint64_t len = 0;
+   for (const auto& extent : objectx) {
+     len += extent.second;
+   }
+   ldout(cct, 10) << "prune_parent_extents image overlap " << overlap
+                  << ", object overlap " << len
+                  << " from image extents " << objectx << dendl;
+   return len;
+ }
+
+ // Blocking variant: cancel outstanding async requests and wait until the
+ // completion fires.
+ void ImageCtx::cancel_async_requests() {
+   C_SaferCond cond;
+   cancel_async_requests(&cond);
+   cond.wait();
+ }
+
+ // Asynchronous variant. If requests are outstanding, each one is asked to
+ // cancel and on_finish is queued on async_requests_waiters; otherwise
+ // on_finish is completed with 0 right away, outside async_ops_lock.
+ void ImageCtx::cancel_async_requests(Context *on_finish) {
+   bool requests_pending = false;
+   {
+     Mutex::Locker async_ops_locker(async_ops_lock);
+     requests_pending = !async_requests.empty();
+     if (requests_pending) {
+       ldout(cct, 10) << "canceling async requests: count="
+                      << async_requests.size() << dendl;
+       for (auto req : async_requests) {
+         ldout(cct, 10) << "canceling async request: " << req << dendl;
+         req->cancel();
+       }
+       async_requests_waiters.push_back(on_finish);
+     }
+   }
+
+   if (!requests_pending) {
+     on_finish->complete(0);
+   }
+ }
+
+ void ImageCtx::clear_pending_completions() { // empties completed_reqs under completed_reqs_lock
+ Mutex::Locker l(completed_reqs_lock);
+ ldout(cct, 10) << "clear pending AioCompletion: count="
+ << completed_reqs.size() << dendl; // logs the count before the list is cleared
+ completed_reqs.clear();
+ }
+
+ void ImageCtx::apply_metadata(const std::map<std::string, bufferlist> &meta, // applies "conf_"-prefixed metadata as per-image config overrides, then refreshes cached option values
+ bool thread_safe) { // NOTE(review): thread_safe is not referenced anywhere in this body
+ ldout(cct, 20) << __func__ << dendl;
+
+ RWLock::WLocker md_locker(md_lock);
+
+ // reset settings back to global defaults
+ for (auto& key : config_overrides) {
+ std::string value;
+ int r = cct->_conf.get_val(key, &value);
+ ceph_assert(r == 0); // a previously overridden key is expected to exist in the global config
+
+ config.set_val(key, value);
+ }
+ config_overrides.clear();
+
+ // extract config overrides
+ for (auto meta_pair : meta) {
+ if (!boost::starts_with(meta_pair.first, METADATA_CONF_PREFIX)) {
+ continue;
+ }
+
+ std::string key = meta_pair.first.substr(METADATA_CONF_PREFIX.size()); // option name is the metadata key with the prefix stripped
+ if (!boost::starts_with(key, "rbd_")) {
+ // ignore non-RBD configuration keys
+ // TODO use option schema to determine applicable subsystem
+ ldout(cct, 0) << __func__ << ": ignoring config " << key << dendl;
+ continue;
+ }
+
+ if (config.find_option(key) != nullptr) { // keys that are not known options are skipped without a log message
+ std::string val(meta_pair.second.c_str(), meta_pair.second.length());
+ int r = config.set_val(key, val);
+ if (r >= 0) {
+ ldout(cct, 20) << __func__ << ": " << key << "=" << val << dendl;
+ config_overrides.insert(key); // remembered so the next call can reset it to the global value
+ } else {
+ lderr(cct) << __func__ << ": failed to set config " << key << " "
+ << "with value " << val << ": " << cpp_strerror(r)
+ << dendl;
+ }
+ }
+ }
+
+ md_locker.unlock(); // everything below runs without md_lock held
+
+#define ASSIGN_OPTION(param, type) \
+ param = config.get_val<type>("rbd_"#param)
+
+ bool skip_partial_discard = true; // local target for ASSIGN_OPTION(skip_partial_discard, bool) below
+ ASSIGN_OPTION(non_blocking_aio, bool);
+ ASSIGN_OPTION(cache, bool);
+ ASSIGN_OPTION(cache_writethrough_until_flush, bool);
+ ASSIGN_OPTION(cache_max_dirty, Option::size_t);
+ ASSIGN_OPTION(sparse_read_threshold_bytes, Option::size_t);
+ ASSIGN_OPTION(readahead_max_bytes, Option::size_t);
+ ASSIGN_OPTION(readahead_disable_after_bytes, Option::size_t);
+ ASSIGN_OPTION(clone_copy_on_read, bool);
+ ASSIGN_OPTION(enable_alloc_hint, bool);
+ ASSIGN_OPTION(mirroring_replay_delay, uint64_t);
+ ASSIGN_OPTION(mtime_update_interval, uint64_t);
+ ASSIGN_OPTION(atime_update_interval, uint64_t);
+ ASSIGN_OPTION(skip_partial_discard, bool);
+ ASSIGN_OPTION(discard_granularity_bytes, uint64_t);
+ ASSIGN_OPTION(blkin_trace_all, bool);
+
+#undef ASSIGN_OPTION
+
+ if (sparse_read_threshold_bytes == 0) { // a zero threshold falls back to the object size
+ sparse_read_threshold_bytes = get_object_size();
+ }
+ if (!skip_partial_discard) { // with skip_partial_discard off, the discard granularity is forced to 0
+ discard_granularity_bytes = 0;
+ }
+
+ alloc_hint_flags = 0;
+ auto compression_hint = config.get_val<std::string>("rbd_compression_hint");
+ if (compression_hint == "compressible") {
+ alloc_hint_flags |= librados::ALLOC_HINT_FLAG_COMPRESSIBLE;
+ } else if (compression_hint == "incompressible") {
+ alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+ } // any other hint value leaves alloc_hint_flags at 0
+
+ io_work_queue->apply_qos_schedule_tick_min(
+ config.get_val<uint64_t>("rbd_qos_schedule_tick_min"));
+
+ io_work_queue->apply_qos_limit( // one limit/burst pair is pushed per throttle type
+ RBD_QOS_IOPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_iops_limit"),
+ config.get_val<uint64_t>("rbd_qos_iops_burst"));
+ io_work_queue->apply_qos_limit(
+ RBD_QOS_BPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_bps_limit"),
+ config.get_val<uint64_t>("rbd_qos_bps_burst"));
+ io_work_queue->apply_qos_limit(
+ RBD_QOS_READ_IOPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_read_iops_limit"),
+ config.get_val<uint64_t>("rbd_qos_read_iops_burst"));
+ io_work_queue->apply_qos_limit(
+ RBD_QOS_WRITE_IOPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_write_iops_limit"),
+ config.get_val<uint64_t>("rbd_qos_write_iops_burst"));
+ io_work_queue->apply_qos_limit(
+ RBD_QOS_READ_BPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_read_bps_limit"),
+ config.get_val<uint64_t>("rbd_qos_read_bps_burst"));
+ io_work_queue->apply_qos_limit(
+ RBD_QOS_WRITE_BPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_write_bps_limit"),
+ config.get_val<uint64_t>("rbd_qos_write_bps_burst"));
+ }
+
+ ExclusiveLock<ImageCtx> *ImageCtx::create_exclusive_lock() { // factory: returns a newly allocated object bound to this image
+ return new ExclusiveLock<ImageCtx>(*this);
+ }
+
+ ObjectMap<ImageCtx> *ImageCtx::create_object_map(uint64_t snap_id) { // factory: new object map for the given snapshot id
+ return new ObjectMap<ImageCtx>(*this, snap_id);
+ }
+
+ Journal<ImageCtx> *ImageCtx::create_journal() { // factory: returns a newly allocated journal bound to this image
+ return new Journal<ImageCtx>(*this);
+ }
+
+ void ImageCtx::set_image_name(const std::string &image_name) { // takes owner_lock (read) and snap_lock (write) itself
+ // update the name so rename can be invoked repeatedly
+ RWLock::RLocker owner_locker(owner_lock);
+ RWLock::WLocker snap_locker(snap_lock);
+ name = image_name;
+ if (old_format) { // only old-format images recompute header_oid from the name
+ header_oid = util::old_header_name(image_name);
+ }
+ }
+
+ void ImageCtx::notify_update() { // flags the local state as updated, then sends a header-update notification via the static helper
+ state->handle_update_notification();
+ ImageWatcher<>::notify_header_update(md_ctx, header_oid);
+ }
+
+ void ImageCtx::notify_update(Context *on_finish) { // same, but notifies through this image's watcher and passes on_finish along
+ state->handle_update_notification();
+ image_watcher->notify_header_update(on_finish);
+ }
+
+ exclusive_lock::Policy *ImageCtx::get_exclusive_lock_policy() const { // owner_lock must be held
+ ceph_assert(owner_lock.is_locked());
+ ceph_assert(exclusive_lock_policy != nullptr);
+ return exclusive_lock_policy;
+ }
+
+ void ImageCtx::set_exclusive_lock_policy(exclusive_lock::Policy *policy) { // owner_lock must be write-locked; takes ownership of policy
+ ceph_assert(owner_lock.is_wlocked());
+ ceph_assert(policy != nullptr);
+ delete exclusive_lock_policy; // the previous policy is destroyed
+ exclusive_lock_policy = policy;
+ }
+
+ journal::Policy *ImageCtx::get_journal_policy() const { // snap_lock must be held
+ ceph_assert(snap_lock.is_locked());
+ ceph_assert(journal_policy != nullptr);
+ return journal_policy;
+ }
+
+ void ImageCtx::set_journal_policy(journal::Policy *policy) { // snap_lock must be write-locked; takes ownership of policy
+ ceph_assert(snap_lock.is_wlocked());
+ ceph_assert(policy != nullptr);
+ delete journal_policy; // the previous policy is destroyed
+ journal_policy = policy;
+ }
+
+ bool ImageCtx::is_writeback_cache_enabled() const { // true only when the cache is on and cache_max_dirty is non-zero
+ return (cache && cache_max_dirty > 0);
+ }
+
+ // Hand out the thread pool and op work queue stored on the CephContext
+ // under "librbd::thread_pool", creating them on first use.
+ void ImageCtx::get_thread_pool_instance(CephContext *cct,
+                                         ThreadPool **thread_pool,
+                                         ContextWQ **op_work_queue) {
+   ThreadPoolSingleton &singleton =
+     cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
+       "librbd::thread_pool", false, cct);
+   *thread_pool = &singleton;
+   *op_work_queue = singleton.op_work_queue;
+ }
+
+ // Hand out the timer and its lock stored on the CephContext under
+ // "librbd::journal::safe_timer", creating them on first use.
+ void ImageCtx::get_timer_instance(CephContext *cct, SafeTimer **timer,
+                                   Mutex **timer_lock) {
+   SafeTimerSingleton &singleton =
+     cct->lookup_or_create_singleton_object<SafeTimerSingleton>(
+       "librbd::journal::safe_timer", false, cct);
+   *timer = &singleton;
+   *timer_lock = &singleton.lock;
+ }
+}
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
new file mode 100644
index 00000000..df22a0e8
--- /dev/null
+++ b/src/librbd/ImageCtx.h
@@ -0,0 +1,331 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_IMAGECTX_H
+#define CEPH_LIBRBD_IMAGECTX_H
+
+#include "include/int_types.h"
+
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "common/config_proxy.h"
+#include "common/event_socket.h"
+#include "common/Mutex.h"
+#include "common/Readahead.h"
+#include "common/RWLock.h"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+
+#include "include/buffer_fwd.h"
+#include "include/rbd/librbd.hpp"
+#include "include/rbd_types.h"
+#include "include/types.h"
+#include "include/xlist.h"
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsyncRequest.h"
+#include "librbd/Types.h"
+
+class CephContext;
+class ContextWQ;
+class Finisher;
+class PerfCounters;
+class ThreadPool;
+class SafeTimer;
+
+namespace librbd {
+
+ template <typename> class ConfigWatcher;
+ template <typename> class ExclusiveLock;
+ template <typename> class ImageState;
+ template <typename> class ImageWatcher;
+ template <typename> class Journal;
+ class LibrbdAdminSocketHook;
+ template <typename> class ObjectMap;
+ template <typename> class Operations;
+
+ namespace cache { struct ImageCache; }
+ namespace exclusive_lock { struct Policy; }
+ namespace io {
+ class AioCompletion;
+ class AsyncOperation;
+ template <typename> class CopyupRequest;
+ template <typename> class ImageRequestWQ;
+ template <typename> class ObjectDispatcher;
+ }
+ namespace journal { struct Policy; }
+
+ namespace operation {
+ template <typename> class ResizeRequest;
+ }
+
+ struct ImageCtx { // state of one image: config, metadata, snapshots, locks, I/O queues and helper objects
+ static const string METADATA_CONF_PREFIX;
+
+ CephContext *cct;
+ ConfigProxy config;
+ std::set<std::string> config_overrides;
+
+ PerfCounters *perfcounter;
+ struct rbd_obj_header_ondisk header;
+ ::SnapContext snapc;
+ std::vector<librados::snap_t> snaps; // this mirrors snapc.snaps, but is in
+ // a format librados can understand
+ std::map<librados::snap_t, SnapInfo> snap_info;
+ std::map<std::pair<cls::rbd::SnapshotNamespace, std::string>, librados::snap_t> snap_ids;
+ uint64_t open_snap_id = CEPH_NOSNAP;
+ uint64_t snap_id;
+ bool snap_exists; // false if our snap_id was deleted
+ // whether the image was opened read-only. cannot be changed after opening
+ bool read_only;
+
+ std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> lockers;
+ bool exclusive_locked;
+ std::string lock_tag;
+
+ std::string name;
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string snap_name;
+ IoCtx data_ctx, md_ctx;
+
+ ConfigWatcher<ImageCtx> *config_watcher = nullptr;
+ ImageWatcher<ImageCtx> *image_watcher;
+ Journal<ImageCtx> *journal;
+
+ /**
+ * Lock ordering:
+ *
+ * owner_lock, md_lock, snap_lock, parent_lock,
+ * object_map_lock, async_ops_lock, timestamp_lock
+ */
+ RWLock owner_lock; // protects exclusive lock leadership updates
+ RWLock md_lock; // protects access to the mutable image metadata that
+ // isn't guarded by other locks below, and blocks writes
+ // when held exclusively, so snapshots can be consistent.
+ // Fields guarded include:
+ // total_bytes_read
+ // exclusive_locked
+ // lock_tag
+ // lockers
+ RWLock snap_lock; // protects snapshot-related member variables,
+ // features (and associated helper classes), and flags
+ RWLock timestamp_lock; // NOTE(review): presumably guards the *_timestamp members below -- confirm
+ RWLock parent_lock; // protects parent_md and parent
+ RWLock object_map_lock; // protects object map updates and object_map itself
+ Mutex async_ops_lock; // protects async_ops and async_requests
+ Mutex copyup_list_lock; // protects copyup_waiting_list (NOTE(review): no member of that name is declared here; presumably copyup_list -- confirm)
+ Mutex completed_reqs_lock; // protects completed_reqs
+
+ unsigned extra_read_flags;
+
+ bool old_format;
+ uint8_t order;
+ uint64_t size;
+ uint64_t features;
+ std::string object_prefix;
+ char *format_string;
+ std::string header_oid;
+ std::string id; // only used for new-format images
+ ParentImageInfo parent_md;
+ ImageCtx *parent;
+ ImageCtx *child = nullptr;
+ MigrationInfo migration_info;
+ cls::rbd::GroupSpec group_spec;
+ uint64_t stripe_unit, stripe_count;
+ uint64_t flags;
+ uint64_t op_features = 0;
+ bool operations_disabled = false;
+ utime_t create_timestamp;
+ utime_t access_timestamp;
+ utime_t modify_timestamp;
+
+ file_layout_t layout;
+
+ cache::ImageCache *image_cache = nullptr;
+
+ Readahead readahead;
+ uint64_t total_bytes_read;
+
+ std::map<uint64_t, io::CopyupRequest<ImageCtx>*> copyup_list;
+
+ xlist<io::AsyncOperation*> async_ops;
+ xlist<AsyncRequest<>*> async_requests;
+ std::list<Context*> async_requests_waiters;
+
+ ImageState<ImageCtx> *state;
+ Operations<ImageCtx> *operations;
+
+ ExclusiveLock<ImageCtx> *exclusive_lock;
+ ObjectMap<ImageCtx> *object_map;
+
+ xlist<operation::ResizeRequest<ImageCtx>*> resize_reqs;
+
+ io::ImageRequestWQ<ImageCtx> *io_work_queue;
+ io::ObjectDispatcher<ImageCtx> *io_object_dispatcher = nullptr;
+
+ xlist<io::AioCompletion*> completed_reqs;
+ EventSocket event_socket;
+
+ ContextWQ *op_work_queue;
+
+ bool ignore_migrating = false;
+
+ /// Cached latency-sensitive configuration settings
+ bool non_blocking_aio;
+ bool cache;
+ bool cache_writethrough_until_flush;
+ uint64_t cache_max_dirty;
+ uint64_t sparse_read_threshold_bytes;
+ uint64_t readahead_max_bytes;
+ uint64_t readahead_disable_after_bytes;
+ bool clone_copy_on_read;
+ bool enable_alloc_hint;
+ uint32_t alloc_hint_flags = 0U;
+ uint32_t discard_granularity_bytes = 0;
+ bool blkin_trace_all;
+ uint64_t mirroring_replay_delay;
+ uint64_t mtime_update_interval;
+ uint64_t atime_update_interval;
+
+ LibrbdAdminSocketHook *asok_hook;
+
+ exclusive_lock::Policy *exclusive_lock_policy = nullptr; // deleted and replaced by set_exclusive_lock_policy()
+ journal::Policy *journal_policy = nullptr; // deleted and replaced by set_journal_policy()
+
+ ZTracer::Endpoint trace_endpoint;
+
+ // unit test mock helpers: heap-allocating factories that forward to the constructors
+ static ImageCtx* create(const std::string &image_name,
+ const std::string &image_id,
+ const char *snap, IoCtx& p, bool read_only) {
+ return new ImageCtx(image_name, image_id, snap, p, read_only);
+ }
+ static ImageCtx* create(const std::string &image_name,
+ const std::string &image_id,
+ librados::snap_t snap_id, IoCtx& p,
+ bool read_only) {
+ return new ImageCtx(image_name, image_id, snap_id, p, read_only);
+ }
+ void destroy() { // self-deletes, so the object must have been allocated with new (e.g. via create())
+ delete this;
+ }
+
+ /**
+ * Either image_name or image_id must be set.
+ * If id is not known, pass the empty std::string,
+ * and init() will look it up.
+ */
+ ImageCtx(const std::string &image_name, const std::string &image_id,
+ const char *snap, IoCtx& p, bool read_only);
+ ImageCtx(const std::string &image_name, const std::string &image_id,
+ librados::snap_t snap_id, IoCtx& p, bool read_only);
+ ~ImageCtx();
+ void init();
+ void shutdown();
+ void init_layout(int64_t pool_id);
+ void perf_start(std::string name);
+ void perf_stop();
+ void set_read_flag(unsigned flag);
+ int get_read_flags(librados::snap_t snap_id);
+ int snap_set(uint64_t snap_id);
+ void snap_unset();
+ librados::snap_t get_snap_id(const cls::rbd::SnapshotNamespace& in_snap_namespace,
+ const std::string& in_snap_name) const;
+ const SnapInfo* get_snap_info(librados::snap_t in_snap_id) const;
+ int get_snap_name(librados::snap_t in_snap_id,
+ std::string *out_snap_name) const;
+ int get_snap_namespace(librados::snap_t in_snap_id,
+ cls::rbd::SnapshotNamespace *out_snap_namespace) const;
+ int get_parent_spec(librados::snap_t in_snap_id,
+ cls::rbd::ParentImageSpec *pspec) const;
+ int is_snap_protected(librados::snap_t in_snap_id,
+ bool *is_protected) const;
+ int is_snap_unprotected(librados::snap_t in_snap_id,
+ bool *is_unprotected) const;
+
+ uint64_t get_current_size() const;
+ uint64_t get_object_size() const;
+ string get_object_name(uint64_t num) const;
+ uint64_t get_stripe_unit() const;
+ uint64_t get_stripe_count() const;
+ uint64_t get_stripe_period() const;
+ utime_t get_create_timestamp() const;
+ utime_t get_access_timestamp() const;
+ utime_t get_modify_timestamp() const;
+
+ void set_access_timestamp(utime_t at);
+ void set_modify_timestamp(utime_t at);
+
+ void add_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+ std::string in_snap_name,
+ librados::snap_t id,
+ uint64_t in_size, const ParentImageInfo &parent,
+ uint8_t protection_status, uint64_t flags, utime_t timestamp);
+ void rm_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+ std::string in_snap_name,
+ librados::snap_t id);
+ uint64_t get_image_size(librados::snap_t in_snap_id) const;
+ uint64_t get_object_count(librados::snap_t in_snap_id) const;
+ bool test_features(uint64_t test_features) const;
+ bool test_features(uint64_t test_features,
+ const RWLock &in_snap_lock) const;
+ bool test_op_features(uint64_t op_features) const;
+ bool test_op_features(uint64_t op_features,
+ const RWLock &in_snap_lock) const;
+ int get_flags(librados::snap_t in_snap_id, uint64_t *flags) const;
+ int test_flags(librados::snap_t in_snap_id,
+ uint64_t test_flags, bool *flags_set) const;
+ int test_flags(librados::snap_t in_snap_id,
+ uint64_t test_flags, const RWLock &in_snap_lock,
+ bool *flags_set) const;
+ int update_flags(librados::snap_t in_snap_id, uint64_t flag, bool enabled);
+
+ const ParentImageInfo* get_parent_info(librados::snap_t in_snap_id) const;
+ int64_t get_parent_pool_id(librados::snap_t in_snap_id) const;
+ std::string get_parent_image_id(librados::snap_t in_snap_id) const;
+ uint64_t get_parent_snap_id(librados::snap_t in_snap_id) const;
+ int get_parent_overlap(librados::snap_t in_snap_id,
+ uint64_t *overlap) const;
+ void register_watch(Context *on_finish);
+ uint64_t prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
+ uint64_t overlap);
+
+ void cancel_async_requests();
+ void cancel_async_requests(Context *on_finish);
+
+ void apply_metadata(const std::map<std::string, bufferlist> &meta,
+ bool thread_safe);
+
+ ExclusiveLock<ImageCtx> *create_exclusive_lock();
+ ObjectMap<ImageCtx> *create_object_map(uint64_t snap_id);
+ Journal<ImageCtx> *create_journal();
+
+ void clear_pending_completions();
+
+ void set_image_name(const std::string &name);
+
+ void notify_update(); // notifies via ImageWatcher<>::notify_header_update(md_ctx, header_oid)
+ void notify_update(Context *on_finish); // notifies via image_watcher, passing on_finish
+
+ exclusive_lock::Policy *get_exclusive_lock_policy() const; // asserts owner_lock is held
+ void set_exclusive_lock_policy(exclusive_lock::Policy *policy); // asserts owner_lock is write-locked; deletes the old policy
+
+ journal::Policy *get_journal_policy() const; // asserts snap_lock is held
+ void set_journal_policy(journal::Policy *policy); // asserts snap_lock is write-locked; deletes the old policy
+
+ bool is_writeback_cache_enabled() const; // cache && cache_max_dirty > 0
+
+ static void get_thread_pool_instance(CephContext *cct,
+ ThreadPool **thread_pool,
+ ContextWQ **op_work_queue);
+ static void get_timer_instance(CephContext *cct, SafeTimer **timer,
+ Mutex **timer_lock);
+ };
+}
+
+#endif
diff --git a/src/librbd/ImageState.cc b/src/librbd/ImageState.cc
new file mode 100644
index 00000000..1f0535e2
--- /dev/null
+++ b/src/librbd/ImageState.cc
@@ -0,0 +1,741 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ImageState.h"
+#include "include/rbd/librbd.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/OpenRequest.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/image/SetSnapRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageState: " << this << " "
+
+namespace librbd {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+// Registry of the UpdateWatchCtx callbacks attached to an image.
+//
+// Watchers are keyed by a monotonically increasing handle. notify() queues
+// one callback per registered watcher on a ContextWQ, and m_in_flight tracks
+// (per handle) the callbacks that have been queued but not yet finished so
+// that unregister and shut down can be deferred until they drain.
+// All containers are protected by m_lock.
+class ImageUpdateWatchers {
+public:
+
+  explicit ImageUpdateWatchers(CephContext *cct) : m_cct(cct),
+    m_lock(util::unique_lock_name("librbd::ImageUpdateWatchers::m_lock", this)) {
+  }
+
+  // Every watcher must already be gone and every notification finished;
+  // the work queue (if one was created) is drained and released here.
+  ~ImageUpdateWatchers() {
+    ceph_assert(m_watchers.empty());
+    ceph_assert(m_in_flight.empty());
+    ceph_assert(m_pending_unregister.empty());
+    ceph_assert(m_on_shut_down_finish == nullptr);
+
+    destroy_work_queue();
+  }
+
+  // Completes on_finish once the notifications currently in flight have been
+  // processed. With nothing in flight it completes inline with 0; otherwise
+  // the completion is queued on the work queue behind the pending callbacks.
+  void flush(Context *on_finish) {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+    {
+      Mutex::Locker locker(m_lock);
+      if (!m_in_flight.empty()) {
+        Context *ctx = new FunctionContext(
+          [this, on_finish](int r) {
+            // __func__ names the lambda's operator() in here, so the method
+            // name is spelled out to keep the log line meaningful
+            ldout(m_cct, 20) << "ImageUpdateWatchers::flush"
+                             << ": completing flush" << dendl;
+            on_finish->complete(r);
+          });
+        m_work_queue->queue(ctx, 0);
+        return;
+      }
+    }
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                     << ": completing flush" << dendl;
+    on_finish->complete(0);
+  }
+
+  // Drops all registered watchers. on_finish is completed immediately when
+  // nothing is in flight, otherwise by the last handle_notify() to finish.
+  void shut_down(Context *on_finish) {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+    {
+      Mutex::Locker locker(m_lock);
+      ceph_assert(m_on_shut_down_finish == nullptr);
+      m_watchers.clear();
+      if (!m_in_flight.empty()) {
+        m_on_shut_down_finish = on_finish;
+        return;
+      }
+    }
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                     << ": completing shut down" << dendl;
+    on_finish->complete(0);
+  }
+
+  // Adds a watcher and returns its handle through *handle. The work queue is
+  // created lazily on the first registration. Must not be called while a
+  // shut down is pending.
+  void register_watcher(UpdateWatchCtx *watcher, uint64_t *handle) {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": watcher="
+                     << watcher << dendl;
+
+    Mutex::Locker locker(m_lock);
+    ceph_assert(m_on_shut_down_finish == nullptr);
+
+    create_work_queue();
+
+    *handle = m_next_handle++;
+    m_watchers.insert(std::make_pair(*handle, watcher));
+  }
+
+  // Removes the watcher registered under `handle`.
+  // on_finish is completed with -ENOENT when the handle is unknown, with 0
+  // immediately when the watcher has no notification in flight, or later
+  // (from handle_notify) once its in-flight notifications have finished.
+  void unregister_watcher(uint64_t handle, Context *on_finish) {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+                     << handle << dendl;
+    int r = 0;
+    {
+      Mutex::Locker locker(m_lock);
+      auto it = m_watchers.find(handle);
+      if (it == m_watchers.end()) {
+        r = -ENOENT;
+      } else {
+        if (m_in_flight.find(handle) != m_in_flight.end()) {
+          // defer the completion until the watcher's callbacks have run
+          ceph_assert(m_pending_unregister.find(handle) == m_pending_unregister.end());
+          m_pending_unregister[handle] = on_finish;
+          on_finish = nullptr;
+        }
+        m_watchers.erase(it);
+      }
+    }
+
+    if (on_finish) {
+      ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                       << ": completing unregister" << dendl;
+      on_finish->complete(r);
+    }
+  }
+
+  // Queues one notification for every registered watcher.
+  void notify() {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+
+    Mutex::Locker locker(m_lock);
+    for (const auto &it : m_watchers) {
+      send_notify(it.first, it.second);
+    }
+  }
+
+  // Marks `handle` as in flight and queues the watcher callback on the work
+  // queue. Caller must hold m_lock.
+  void send_notify(uint64_t handle, UpdateWatchCtx *watcher) {
+    ceph_assert(m_lock.is_locked());
+
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+                     << handle << ", watcher=" << watcher << dendl;
+
+    m_in_flight.insert(handle);
+
+    Context *ctx = new FunctionContext(
+      [this, handle, watcher](int) {
+        handle_notify(handle, watcher);
+      });
+
+    m_work_queue->queue(ctx, 0);
+  }
+
+  // Runs the watcher callback (without m_lock held), then retires one
+  // in-flight entry for the handle and fires any unregister / shut down
+  // completion that was waiting on it.
+  void handle_notify(uint64_t handle, UpdateWatchCtx *watcher) {
+
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+                     << handle << ", watcher=" << watcher << dendl;
+
+    watcher->handle_notify();
+
+    Context *on_unregister_finish = nullptr;
+    Context *on_shut_down_finish = nullptr;
+
+    {
+      Mutex::Locker locker(m_lock);
+
+      // m_in_flight is a multiset: erase exactly one occurrence
+      auto in_flight_it = m_in_flight.find(handle);
+      ceph_assert(in_flight_it != m_in_flight.end());
+      m_in_flight.erase(in_flight_it);
+
+      // If there are no more in-flight notifications for this watcher
+      // and it is pending unregister, complete it now.
+      if (m_in_flight.find(handle) == m_in_flight.end()) {
+        auto it = m_pending_unregister.find(handle);
+        if (it != m_pending_unregister.end()) {
+          on_unregister_finish = it->second;
+          m_pending_unregister.erase(it);
+        }
+      }
+
+      if (m_in_flight.empty()) {
+        ceph_assert(m_pending_unregister.empty());
+        if (m_on_shut_down_finish != nullptr) {
+          std::swap(m_on_shut_down_finish, on_shut_down_finish);
+        }
+      }
+    }
+
+    // completions are invoked after m_lock has been released
+    if (on_unregister_finish != nullptr) {
+      ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                       << ": completing unregister" << dendl;
+      on_unregister_finish->complete(0);
+    }
+
+    if (on_shut_down_finish != nullptr) {
+      ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                       << ": completing shut down" << dendl;
+      on_shut_down_finish->complete(0);
+    }
+  }
+
+private:
+  // Thread pool shared through the CephContext singleton registry; started
+  // on construction and stopped on destruction.
+  class ThreadPoolSingleton : public ThreadPool {
+  public:
+    explicit ThreadPoolSingleton(CephContext *cct)
+      : ThreadPool(cct, "librbd::ImageUpdateWatchers::thread_pool", "tp_librbd",
+                   1) {
+      start();
+    }
+    ~ThreadPoolSingleton() override {
+      stop();
+    }
+  };
+
+  CephContext *m_cct;
+  Mutex m_lock;
+  ContextWQ *m_work_queue = nullptr;
+  std::map<uint64_t, UpdateWatchCtx*> m_watchers;
+  uint64_t m_next_handle = 0;
+  std::multiset<uint64_t> m_in_flight;
+  std::map<uint64_t, Context*> m_pending_unregister;
+  Context *m_on_shut_down_finish = nullptr;
+
+  // Lazily creates the work queue on top of the shared thread pool.
+  void create_work_queue() {
+    if (m_work_queue != nullptr) {
+      return;
+    }
+    auto& thread_pool = m_cct->lookup_or_create_singleton_object<
+      ThreadPoolSingleton>("librbd::ImageUpdateWatchers::thread_pool",
+                           false, m_cct);
+    m_work_queue = new ContextWQ("librbd::ImageUpdateWatchers::op_work_queue",
+                                 m_cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"),
+                                 &thread_pool);
+  }
+
+  // Drains and frees the work queue, if one was ever created.
+  void destroy_work_queue() {
+    if (m_work_queue == nullptr) {
+      return;
+    }
+    m_work_queue->drain();
+    delete m_work_queue;
+    // do not leave a dangling pointer behind
+    m_work_queue = nullptr;
+  }
+};
+
+// Starts the state machine in STATE_UNINITIALIZED with both refresh
+// counters at zero and allocates the update-watcher registry, which the
+// destructor releases.
+template <typename I>
+ImageState<I>::ImageState(I *image_ctx)
+  : m_image_ctx(image_ctx),
+    m_state(STATE_UNINITIALIZED),
+    m_lock(util::unique_lock_name("librbd::ImageState::m_lock", this)),
+    m_last_refresh(0),
+    m_refresh_seq(0),
+    m_update_watchers(new ImageUpdateWatchers(image_ctx->cct)) {
+}
+
+// The image must never have been opened, or must be fully closed, by the
+// time the state object is destroyed.
+template <typename I>
+ImageState<I>::~ImageState() {
+  ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED);
+
+  // allocated by the constructor
+  delete this->m_update_watchers;
+}
+
+// Blocking open: issues the asynchronous open and waits for its result.
+// On failure the image context is deleted before the error is returned.
+template <typename I>
+int ImageState<I>::open(uint64_t flags) {
+  C_SaferCond open_ctx;
+  open(flags, &open_ctx);
+
+  const int r = open_ctx.wait();
+  if (r >= 0) {
+    return r;
+  }
+
+  delete m_image_ctx;
+  return r;
+}
+
+// Asynchronous open: remembers the open flags and schedules an OPEN action.
+// Only valid while the state machine is still uninitialized.
+template <typename I>
+void ImageState<I>::open(uint64_t flags, Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_lock.Lock();
+  ceph_assert(m_state == STATE_UNINITIALIZED);
+  m_open_flags = flags;
+
+  Action open_action(ACTION_TYPE_OPEN);
+  open_action.refresh_seq = m_refresh_seq;
+
+  // releases m_lock
+  execute_action_unlock(open_action, on_finish);
+}
+
+// Blocking close: issues the asynchronous close, waits for it, and deletes
+// the image context regardless of the result.
+template <typename I>
+int ImageState<I>::close() {
+  C_SaferCond close_ctx;
+  close(&close_ctx);
+
+  const int r = close_ctx.wait();
+  delete m_image_ctx;
+  return r;
+}
+
+// Asynchronous close: schedules a CLOSE action. Asserts that the image is
+// not already closed (or about to be).
+template <typename I>
+void ImageState<I>::close(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_lock.Lock();
+  ceph_assert(!is_closed());
+
+  Action close_action(ACTION_TYPE_CLOSE);
+  close_action.refresh_seq = m_refresh_seq;
+
+  // releases m_lock
+  execute_action_unlock(close_action, on_finish);
+}
+
+// Advances the refresh sequence so that a later refresh check sees the image
+// as stale and, if the image is open, notifies the registered update watchers.
+template <typename I>
+void ImageState<I>::handle_update_notification() {
+  Mutex::Locker locker(m_lock);
+  m_refresh_seq += 1;
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": refresh_seq = " << m_refresh_seq << ", "
+                 << "last_refresh = " << m_last_refresh << dendl;
+
+  if (m_state != STATE_OPEN) {
+    return;
+  }
+  m_update_watchers->notify();
+}
+
+// True when the last completed refresh is behind the current sequence, or
+// when find_pending_refresh() reports a queued refresh.
+template <typename I>
+bool ImageState<I>::is_refresh_required() const {
+  Mutex::Locker locker(m_lock);
+  if (m_last_refresh != m_refresh_seq) {
+    return true;
+  }
+  return find_pending_refresh() != nullptr;
+}
+
+// Blocking refresh: issues the asynchronous refresh and returns its result.
+template <typename I>
+int ImageState<I>::refresh() {
+  C_SaferCond on_refreshed;
+  refresh(&on_refreshed);
+
+  const int r = on_refreshed.wait();
+  return r;
+}
+
+// Asynchronous refresh: schedules a REFRESH action tagged with the current
+// refresh sequence. Completes on_finish with -ESHUTDOWN if the image is
+// closed or closing.
+template <typename I>
+void ImageState<I>::refresh(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_lock.Lock();
+  if (!is_closed()) {
+    Action refresh_action(ACTION_TYPE_REFRESH);
+    refresh_action.refresh_seq = m_refresh_seq;
+
+    // releases m_lock
+    execute_action_unlock(refresh_action, on_finish);
+    return;
+  }
+
+  m_lock.Unlock();
+  on_finish->complete(-ESHUTDOWN);
+}
+
+// Blocking helper that refreshes only when needed.
+//
+// If find_pending_refresh() returns an action, the caller joins it. Otherwise
+// returns 0 when the image is already up to date, -ESHUTDOWN when it is
+// closed or closing, and in the remaining case schedules a refresh for the
+// current sequence and waits for it.
+template <typename I>
+int ImageState<I>::refresh_if_required() {
+  C_SaferCond on_refreshed;
+
+  m_lock.Lock();
+  Action action(ACTION_TYPE_REFRESH);
+  action.refresh_seq = m_refresh_seq;
+
+  const Action *pending_refresh = find_pending_refresh();
+  if (pending_refresh == nullptr) {
+    if (m_last_refresh == m_refresh_seq) {
+      m_lock.Unlock();
+      return 0;
+    }
+    if (is_closed()) {
+      m_lock.Unlock();
+      return -ESHUTDOWN;
+    }
+  } else {
+    // if a refresh is in-flight, delay until it is finished
+    action = *pending_refresh;
+  }
+
+  // releases m_lock
+  execute_action_unlock(action, &on_refreshed);
+
+  return on_refreshed.wait();
+}
+
+// Returns the most recently queued REFRESH action, or nullptr when no
+// refresh is queued. Caller must hold m_lock; the returned pointer refers
+// into m_actions_contexts and is only valid while the lock is held.
+template <typename I>
+const typename ImageState<I>::Action *
+ImageState<I>::find_pending_refresh() const {
+  ceph_assert(m_lock.is_locked());
+
+  // Compare the action type directly. Comparing the Action against the bare
+  // enumerator would implicitly build Action(ACTION_TYPE_REFRESH) with
+  // refresh_seq == 0, and Action::operator== also compares refresh_seq for
+  // refresh actions -- so only refreshes with sequence 0 would ever match.
+  auto it = std::find_if(m_actions_contexts.rbegin(),
+                         m_actions_contexts.rend(),
+                         [](const ActionContexts& action_contexts) {
+      return (action_contexts.first.action_type == ACTION_TYPE_REFRESH);
+    });
+  if (it != m_actions_contexts.rend()) {
+    return &it->first;
+  }
+  return nullptr;
+}
+
+// Schedules a SET_SNAP action for the given snapshot id; on_finish is
+// completed when that action finishes.
+template <typename I>
+void ImageState<I>::snap_set(uint64_t snap_id, Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": snap_id=" << snap_id << dendl;
+
+  Action set_snap_action(ACTION_TYPE_SET_SNAP);
+  set_snap_action.snap_id = snap_id;
+
+  m_lock.Lock();
+  // releases m_lock
+  execute_action_unlock(set_snap_action, on_finish);
+}
+
+// Schedules a LOCK action; on_ready is completed with 0 once that action
+// reaches the front of the queue (see send_prepare_lock_unlock), or with
+// -ESHUTDOWN right away when the image is closed or closing.
+template <typename I>
+void ImageState<I>::prepare_lock(Context *on_ready) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  m_lock.Lock();
+  if (!is_closed()) {
+    Action lock_action(ACTION_TYPE_LOCK);
+    lock_action.on_ready = on_ready;
+
+    // releases m_lock; LOCK actions carry no completion contexts
+    execute_action_unlock(lock_action, nullptr);
+    return;
+  }
+
+  m_lock.Unlock();
+  on_ready->complete(-ESHUTDOWN);
+}
+
+// Ends the PREPARING_LOCK phase and returns the state machine to OPEN.
+// A call made in any other state is ignored.
+template <typename I>
+void ImageState<I>::handle_prepare_lock_complete() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  m_lock.Lock();
+  if (m_state == STATE_PREPARING_LOCK) {
+    // releases m_lock
+    complete_action_unlock(STATE_OPEN, 0);
+    return;
+  }
+
+  m_lock.Unlock();
+}
+
+// Registers an update watcher; the handle to use for unregistering is
+// written to *handle. Always returns 0.
+template <typename I>
+int ImageState<I>::register_update_watcher(UpdateWatchCtx *watcher,
+                                           uint64_t *handle) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  this->m_update_watchers->register_watcher(watcher, handle);
+
+  ldout(cct, 20) << __func__ << ": handle=" << *handle << dendl;
+  return 0;
+}
+
+// Blocking unregister: waits until the watcher registry reports the
+// outcome for `handle` and returns it.
+template <typename I>
+int ImageState<I>::unregister_update_watcher(uint64_t handle) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": handle=" << handle << dendl;
+
+  C_SaferCond on_unregistered;
+  this->m_update_watchers->unregister_watcher(handle, &on_unregistered);
+
+  const int r = on_unregistered.wait();
+  return r;
+}
+
+// Forwards a flush request to the update-watcher registry.
+template <typename I>
+void ImageState<I>::flush_update_watchers(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  this->m_update_watchers->flush(on_finish);
+}
+
+// Forwards a shut down request to the update-watcher registry.
+template <typename I>
+void ImageState<I>::shut_down_update_watchers(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  this->m_update_watchers->shut_down(on_finish);
+}
+
+// Reports whether an action is currently executing, i.e. the state is one
+// of the "-ING" states rather than UNINITIALIZED, OPEN or CLOSED.
+template <typename I>
+bool ImageState<I>::is_transition_state() const {
+  switch (m_state) {
+  case STATE_OPENING:
+  case STATE_CLOSING:
+  case STATE_REFRESHING:
+  case STATE_SETTING_SNAP:
+  case STATE_PREPARING_LOCK:
+    return true;
+  case STATE_UNINITIALIZED:
+  case STATE_OPEN:
+  case STATE_CLOSED:
+    return false;
+  }
+  // any value outside the enumeration is treated as transitional
+  return true;
+}
+
+// True when the image is already closed, or when the last queued action is
+// a close. Caller must hold m_lock.
+template <typename I>
+bool ImageState<I>::is_closed() const {
+  ceph_assert(m_lock.is_locked());
+
+  if (m_state == STATE_CLOSED) {
+    return true;
+  }
+  if (m_actions_contexts.empty()) {
+    return false;
+  }
+  return m_actions_contexts.back().first.action_type == ACTION_TYPE_CLOSE;
+}
+
+// Attaches `context` to the first queued action equal to `action`, queueing
+// a new entry at the back when none matches. A null context still ensures
+// the action is queued. Caller must hold m_lock.
+template <typename I>
+void ImageState<I>::append_context(const Action &action, Context *context) {
+  ceph_assert(m_lock.is_locked());
+
+  auto match = std::find_if(m_actions_contexts.begin(),
+                            m_actions_contexts.end(),
+                            [&action](const ActionContexts &queued) {
+      return action == queued.first;
+    });
+
+  ActionContexts *target = nullptr;
+  if (match != m_actions_contexts.end()) {
+    target = &(*match);
+  } else {
+    m_actions_contexts.push_back({action, {}});
+    target = &m_actions_contexts.back();
+  }
+
+  if (context == nullptr) {
+    return;
+  }
+  target->second.push_back(context);
+}
+
+// Starts the action at the front of the queue by dispatching to the matching
+// send_*_unlock() helper. Caller must hold m_lock; each helper is responsible
+// for releasing it. An unknown action type aborts.
+template <typename I>
+void ImageState<I>::execute_next_action_unlock() {
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(!m_actions_contexts.empty());
+
+  const auto next_type = m_actions_contexts.front().first.action_type;
+  if (next_type == ACTION_TYPE_OPEN) {
+    send_open_unlock();
+  } else if (next_type == ACTION_TYPE_CLOSE) {
+    send_close_unlock();
+  } else if (next_type == ACTION_TYPE_REFRESH) {
+    send_refresh_unlock();
+  } else if (next_type == ACTION_TYPE_SET_SNAP) {
+    send_set_snap_unlock();
+  } else if (next_type == ACTION_TYPE_LOCK) {
+    send_prepare_lock_unlock();
+  } else {
+    ceph_abort();
+  }
+}
+
+// Queues `action` (with its optional completion) and, unless another action
+// is already executing, starts the front of the queue. Caller must hold
+// m_lock; it is released on every path.
+template <typename I>
+void ImageState<I>::execute_action_unlock(const Action &action,
+                                          Context *on_finish) {
+  ceph_assert(m_lock.is_locked());
+
+  append_context(action, on_finish);
+  if (is_transition_state()) {
+    // the running action will pick this one up when it completes
+    m_lock.Unlock();
+    return;
+  }
+
+  execute_next_action_unlock();
+}
+
+// Finishes the action at the front of the queue: pops it, moves to
+// `next_state`, releases m_lock and completes every attached context with
+// `r`. Unless the new state is UNINITIALIZED or CLOSED, the next queued
+// action (if any) is then started. Caller must hold m_lock.
+template <typename I>
+void ImageState<I>::complete_action_unlock(State next_state, int r) {
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(!m_actions_contexts.empty());
+
+  ActionContexts finished(std::move(m_actions_contexts.front()));
+  m_actions_contexts.pop_front();
+
+  m_state = next_state;
+  m_lock.Unlock();
+
+  // contexts run without m_lock held
+  for (Context *on_finish : finished.second) {
+    on_finish->complete(r);
+  }
+
+  // After a failed open or a close no member is touched again.
+  // NOTE(review): presumably because a completed context may already have
+  // destroyed this object (the blocking open()/close() delete the image
+  // context) -- confirm.
+  if (next_state == STATE_UNINITIALIZED || next_state == STATE_CLOSED) {
+    return;
+  }
+
+  m_lock.Lock();
+  if (is_transition_state() || m_actions_contexts.empty()) {
+    m_lock.Unlock();
+    return;
+  }
+  execute_next_action_unlock();
+}
+
+// Enters STATE_OPENING and launches an OpenRequest whose completion is
+// routed to handle_open() through an async context callback. Caller must
+// hold m_lock; it is released before the request is sent.
+template <typename I>
+void ImageState<I>::send_open_unlock() {
+  ceph_assert(m_lock.is_locked());
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_state = STATE_OPENING;
+
+  auto *on_open = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_open>(this);
+  Context *ctx = create_async_context_callback(*m_image_ctx, on_open);
+  auto *req = image::OpenRequest<I>::create(m_image_ctx, m_open_flags, ctx);
+
+  m_lock.Unlock();
+  req->send();
+}
+
+// Completion of the open request: success moves to STATE_OPEN, any failure
+// returns to STATE_UNINITIALIZED. Failures other than -ENOENT are logged.
+template <typename I>
+void ImageState<I>::handle_open(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed && r != -ENOENT) {
+    lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+  }
+
+  const State next_state = failed ? STATE_UNINITIALIZED : STATE_OPEN;
+
+  m_lock.Lock();
+  // releases m_lock
+  complete_action_unlock(next_state, r);
+}
+
+// Enters STATE_CLOSING and launches a CloseRequest that completes into
+// handle_close(). Caller must hold m_lock; it is released before the
+// request is sent.
+template <typename I>
+void ImageState<I>::send_close_unlock() {
+  ceph_assert(m_lock.is_locked());
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_state = STATE_CLOSING;
+
+  Context *on_close = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_close>(this);
+  auto *req = image::CloseRequest<I>::create(m_image_ctx, on_close);
+
+  m_lock.Unlock();
+  req->send();
+}
+
+template <typename I>
+void ImageState<I>::handle_close(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "error occurred while closing image: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ m_lock.Lock();
+ complete_action_unlock(STATE_CLOSED, r);
+}
+
+// Enters STATE_REFRESHING and launches a RefreshRequest whose completion is
+// routed to handle_refresh() through an async context callback. Caller must
+// hold m_lock; it is released before the request is sent.
+template <typename I>
+void ImageState<I>::send_refresh_unlock() {
+  ceph_assert(m_lock.is_locked());
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_state = STATE_REFRESHING;
+  ceph_assert(!m_actions_contexts.empty());
+  // sanity check only: the front of the queue must be the refresh
+  auto &action_context = m_actions_contexts.front().first;
+  ceph_assert(action_context.action_type == ACTION_TYPE_REFRESH);
+
+  auto *on_refresh = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_refresh>(this);
+  Context *ctx = create_async_context_callback(*m_image_ctx, on_refresh);
+  auto *req = image::RefreshRequest<I>::create(*m_image_ctx, false, false,
+                                               ctx);
+
+  m_lock.Unlock();
+  req->send();
+}
+
+// Completion of the refresh request. -ERESTART is reported to waiters as
+// success but leaves m_last_refresh untouched; any other result records the
+// action's sequence as the last refresh. The state returns to STATE_OPEN.
+template <typename I>
+void ImageState<I>::handle_refresh(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  m_lock.Lock();
+  ceph_assert(!m_actions_contexts.empty());
+
+  ActionContexts &action_contexts(m_actions_contexts.front());
+  ceph_assert(action_contexts.first.action_type == ACTION_TYPE_REFRESH);
+  ceph_assert(m_last_refresh <= action_contexts.first.refresh_seq);
+
+  if (r != -ERESTART) {
+    m_last_refresh = action_contexts.first.refresh_seq;
+  } else {
+    ldout(cct, 5) << "incomplete refresh: not updating sequence" << dendl;
+    r = 0;
+  }
+
+  // releases m_lock
+  complete_action_unlock(STATE_OPEN, r);
+}
+
+// Enters STATE_SETTING_SNAP and launches a SetSnapRequest for the snapshot
+// id stored in the front action; completion is routed to handle_set_snap()
+// through an async context callback. Caller must hold m_lock; it is
+// released before the request is sent.
+template <typename I>
+void ImageState<I>::send_set_snap_unlock() {
+  ceph_assert(m_lock.is_locked());
+
+  m_state = STATE_SETTING_SNAP;
+
+  ceph_assert(!m_actions_contexts.empty());
+  ActionContexts &action_contexts(m_actions_contexts.front());
+  ceph_assert(action_contexts.first.action_type == ACTION_TYPE_SET_SNAP);
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "snap_id=" << action_contexts.first.snap_id << dendl;
+
+  auto *on_set_snap = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_set_snap>(this);
+  Context *ctx = create_async_context_callback(*m_image_ctx, on_set_snap);
+  auto *req = image::SetSnapRequest<I>::create(
+    *m_image_ctx, action_contexts.first.snap_id, ctx);
+
+  m_lock.Unlock();
+  req->send();
+}
+
+// Completion of the set-snap request: logs failures other than -ENOENT and
+// returns to STATE_OPEN, passing the result on to the waiting contexts.
+template <typename I>
+void ImageState<I>::handle_set_snap(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << r << dendl;
+
+  const bool unexpected_error = (r < 0 && r != -ENOENT);
+  if (unexpected_error) {
+    lderr(cct) << "failed to set snapshot: " << cpp_strerror(r) << dendl;
+  }
+
+  m_lock.Lock();
+  // releases m_lock
+  complete_action_unlock(STATE_OPEN, r);
+}
+
+// Enters STATE_PREPARING_LOCK for the LOCK action at the front of the queue
+// and wakes the waiter stored in the action (on_ready). The state machine
+// stays in PREPARING_LOCK until handle_prepare_lock_complete() is called.
+// When the action carries no waiter it is completed immediately.
+// Caller must hold m_lock; it is released on every path.
+template <typename I>
+void ImageState<I>::send_prepare_lock_unlock() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  ceph_assert(m_lock.is_locked());
+  m_state = STATE_PREPARING_LOCK;
+
+  ceph_assert(!m_actions_contexts.empty());
+  ActionContexts &action_contexts(m_actions_contexts.front());
+  ceph_assert(action_contexts.first.action_type == ACTION_TYPE_LOCK);
+
+  Context *on_ready = action_contexts.first.on_ready;
+  if (on_ready == nullptr) {
+    // complete_action_unlock() asserts that m_lock is held and releases it
+    // itself, so it has to be called before the lock is dropped
+    complete_action_unlock(STATE_OPEN, 0);
+    return;
+  }
+
+  m_lock.Unlock();
+
+  // wake up the lock handler now that it's safe to proceed
+  on_ready->complete(0);
+}
+
+} // namespace librbd
+
+template class librbd::ImageState<librbd::ImageCtx>;
diff --git a/src/librbd/ImageState.h b/src/librbd/ImageState.h
new file mode 100644
index 00000000..7f28d1ee
--- /dev/null
+++ b/src/librbd/ImageState.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_STATE_H
+#define CEPH_LIBRBD_IMAGE_STATE_H
+
+#include "include/int_types.h"
+#include "common/Mutex.h"
+#include <list>
+#include <string>
+#include <utility>
+#include "cls/rbd/cls_rbd_types.h"
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+class ImageCtx;
+class ImageUpdateWatchers;
+class UpdateWatchCtx;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageState { // serializes open/close/refresh/set-snap/lock actions on one image through a queue guarded by m_lock
+public:
+ ImageState(ImageCtxT *image_ctx);
+ ~ImageState(); // asserts the state is UNINITIALIZED or CLOSED
+
+ int open(uint64_t flags); // blocking; deletes the image context when the open fails
+ void open(uint64_t flags, Context *on_finish); // asserts the state is UNINITIALIZED
+
+ int close(); // blocking; always deletes the image context
+ void close(Context *on_finish); // asserts the image is not already closed/closing
+
+ void handle_update_notification(); // bumps m_refresh_seq; notifies update watchers when OPEN
+
+ bool is_refresh_required() const;
+
+ int refresh(); // blocking
+ int refresh_if_required(); // blocking; 0 if already up to date, -ESHUTDOWN if closed
+ void refresh(Context *on_finish); // completes with -ESHUTDOWN if closed
+
+ void snap_set(uint64_t snap_id, Context *on_finish);
+
+ void prepare_lock(Context *on_ready); // on_ready gets -ESHUTDOWN if closed
+ void handle_prepare_lock_complete(); // ignored unless the state is PREPARING_LOCK
+
+ int register_update_watcher(UpdateWatchCtx *watcher, uint64_t *handle); // always returns 0
+ int unregister_update_watcher(uint64_t handle); // blocking
+ void flush_update_watchers(Context *on_finish);
+ void shut_down_update_watchers(Context *on_finish);
+
+private:
+ enum State { // the first three are stable states; the rest mean an action is executing
+ STATE_UNINITIALIZED,
+ STATE_OPEN,
+ STATE_CLOSED,
+ STATE_OPENING,
+ STATE_CLOSING,
+ STATE_REFRESHING,
+ STATE_SETTING_SNAP,
+ STATE_PREPARING_LOCK
+ };
+
+ enum ActionType {
+ ACTION_TYPE_OPEN,
+ ACTION_TYPE_CLOSE,
+ ACTION_TYPE_REFRESH,
+ ACTION_TYPE_SET_SNAP,
+ ACTION_TYPE_LOCK
+ };
+
+ struct Action {
+ ActionType action_type;
+ uint64_t refresh_seq = 0;
+ uint64_t snap_id = CEPH_NOSNAP;
+ Context *on_ready = nullptr; // only set for ACTION_TYPE_LOCK (see prepare_lock)
+
+ Action(ActionType action_type) : action_type(action_type) { // NOTE(review): not explicit, so a bare ActionType converts to an Action with default fields
+ }
+ inline bool operator==(const Action &action) const { // equality decides whether two requests share one queue entry
+ if (action_type != action.action_type) {
+ return false;
+ }
+ switch (action_type) {
+ case ACTION_TYPE_REFRESH:
+ return (refresh_seq == action.refresh_seq);
+ case ACTION_TYPE_SET_SNAP:
+ return (snap_id == action.snap_id);
+ case ACTION_TYPE_LOCK:
+ return false; // lock actions are never merged
+ default:
+ return true;
+ }
+ }
+ };
+
+ typedef std::list<Context *> Contexts;
+ typedef std::pair<Action, Contexts> ActionContexts;
+ typedef std::list<ActionContexts> ActionsContexts;
+
+ ImageCtxT *m_image_ctx;
+ State m_state;
+
+ mutable Mutex m_lock;
+ ActionsContexts m_actions_contexts; // FIFO of queued actions; the front entry is the one executing
+
+ uint64_t m_last_refresh; // refresh_seq of the last completed refresh
+ uint64_t m_refresh_seq; // incremented by handle_update_notification()
+
+ ImageUpdateWatchers *m_update_watchers; // allocated in the constructor, deleted in the destructor
+
+ uint64_t m_open_flags; // assigned by open(); not initialized by the constructor
+
+ bool is_transition_state() const;
+ bool is_closed() const; // requires m_lock
+
+ const Action *find_pending_refresh() const; // requires m_lock
+
+ void append_context(const Action &action, Context *context); // requires m_lock
+ void execute_next_action_unlock(); // *_unlock helpers are entered with m_lock held and release it
+ void execute_action_unlock(const Action &action, Context *context);
+ void complete_action_unlock(State next_state, int r);
+
+ void send_open_unlock();
+ void handle_open(int r);
+
+ void send_close_unlock();
+ void handle_close(int r);
+
+ void send_refresh_unlock();
+ void handle_refresh(int r);
+
+ void send_set_snap_unlock();
+ void handle_set_snap(int r);
+
+ void send_prepare_lock_unlock();
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ImageState<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_STATE_H
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
new file mode 100644
index 00000000..bda491bc
--- /dev/null
+++ b/src/librbd/ImageWatcher.cc
@@ -0,0 +1,1126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ImageWatcher.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/TaskFinisher.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/image_watcher/NotifyLockOwner.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/watcher/Utils.h"
+#include "include/encoding.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include <boost/bind.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageWatcher: "
+
+namespace librbd {
+
+using namespace image_watcher;
+using namespace watch_notify;
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+using librbd::watcher::util::HandlePayloadVisitor;
+
+using ceph::encode;
+using ceph::decode;
+
+// delay used by schedule_request_lock() when a timer is requested with a
+// negative (i.e. unspecified) timer_delay
+static const double RETRY_DELAY_SECONDS = 1.0;
+
+// Context that dispatches a previously received notification payload once it
+// is completed (handle_notify() passes it to ImageState::refresh()).
+template <typename I>
+struct ImageWatcher<I>::C_ProcessPayload : public Context {
+  ImageWatcher *image_watcher;
+  uint64_t notify_id;
+  uint64_t handle;
+  watch_notify::Payload payload;
+
+  C_ProcessPayload(ImageWatcher *image_watcher_, uint64_t notify_id_,
+                   uint64_t handle_, const watch_notify::Payload &payload)
+    : image_watcher(image_watcher_), notify_id(notify_id_), handle(handle_),
+      payload(payload) {
+  }
+
+  // The completion code is not inspected. The dispatch is bracketed by the
+  // watcher's async op tracker.
+  void finish(int r) override {
+    auto &op_tracker = image_watcher->m_async_op_tracker;
+    op_tracker.start_op();
+    if (!image_watcher->notifications_blocked()) {
+      image_watcher->process_payload(notify_id, handle, payload);
+    } else {
+      // requests are blocked -- just ack the notification
+      bufferlist empty_reply;
+      image_watcher->acknowledge_notify(notify_id, handle, empty_reply);
+    }
+    op_tracker.finish_op();
+  }
+};
+
+// Constructs the watcher on the image's header object (header_oid) using the
+// image's md_ctx and op_work_queue. The TaskFinisher allocated here is
+// deleted in the destructor; both locks get instance-unique names.
+template <typename I>
+ImageWatcher<I>::ImageWatcher(I &image_ctx)
+ : Watcher(image_ctx.md_ctx, image_ctx.op_work_queue, image_ctx.header_oid),
+ m_image_ctx(image_ctx),
+ m_task_finisher(new TaskFinisher<Task>(*m_image_ctx.cct)),
+ m_async_request_lock(util::unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this)),
+ m_owner_client_id_lock(util::unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this))
+{
+}
+
+// Releases the task finisher allocated by the constructor.
+template <typename I>
+ImageWatcher<I>::~ImageWatcher()
+{
+ delete m_task_finisher;
+}
+
+// Cancels outstanding async requests and unregisters the watch.
+// The callbacks are chained in this order:
+//   Watcher::unregister_watch -> m_task_finisher->cancel_all
+//     -> m_async_op_tracker.wait_for_ops -> caller's on_finish
+template <typename I>
+void ImageWatcher<I>::unregister_watch(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " unregistering image watcher" << dendl;
+
+ cancel_async_requests();
+
+ // wrap the caller's callback: it fires only after tracked ops have drained
+ on_finish = new FunctionContext([this, on_finish](int r) {
+ m_async_op_tracker.wait_for_ops(on_finish);
+ });
+ // captures the wrapper created above (on_finish was reassigned)
+ FunctionContext *ctx = new FunctionContext([this, on_finish](int r) {
+ m_task_finisher->cancel_all(on_finish);
+ });
+ Watcher::unregister_watch(ctx);
+}
+
+// Delegates to Watcher::block_notifies(); when that completes, outstanding
+// async requests are cancelled and the caller's on_finish is completed with
+// the same result code.
+template <typename I>
+void ImageWatcher<I>::block_notifies(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ on_finish = new FunctionContext([this, on_finish](int r) {
+ cancel_async_requests();
+ on_finish->complete(r);
+ });
+ Watcher::block_notifies(on_finish);
+}
+
+// Queues a task, keyed by (TASK_CODE_ASYNC_PROGRESS, request), that calls
+// notify_async_progress() with the given offset/total.
+template <typename I>
+void ImageWatcher<I>::schedule_async_progress(const AsyncRequestId &request,
+ uint64_t offset, uint64_t total) {
+ FunctionContext *ctx = new FunctionContext(
+ boost::bind(&ImageWatcher<I>::notify_async_progress, this, request, offset,
+ total));
+ m_task_finisher->queue(Task(TASK_CODE_ASYNC_PROGRESS, request), ctx);
+}
+
+// Sends an AsyncProgressPayload notification for the given request.
+// Always returns 0; no completion callback is passed to send_notify().
+template <typename I>
+int ImageWatcher<I>::notify_async_progress(const AsyncRequestId &request,
+ uint64_t offset, uint64_t total) {
+ ldout(m_image_ctx.cct, 20) << this << " remote async request progress: "
+ << request << " @ " << offset
+ << "/" << total << dendl;
+
+ send_notify(AsyncProgressPayload(request, offset, total));
+ return 0;
+}
+
+// Queues notify_async_complete(request, r) on the task finisher.
+// The tracked op started here is finished in handle_async_complete().
+template <typename I>
+void ImageWatcher<I>::schedule_async_complete(const AsyncRequestId &request,
+ int r) {
+ m_async_op_tracker.start_op();
+ FunctionContext *ctx = new FunctionContext(
+ boost::bind(&ImageWatcher<I>::notify_async_complete, this, request, r));
+ m_task_finisher->queue(ctx);
+}
+
+// Sends an AsyncCompletePayload carrying the request's result r. The outcome
+// of the notification itself is delivered to handle_async_complete() as its
+// last argument (bound via the _1 placeholder).
+template <typename I>
+void ImageWatcher<I>::notify_async_complete(const AsyncRequestId &request,
+ int r) {
+ ldout(m_image_ctx.cct, 20) << this << " remote async request finished: "
+ << request << " = " << r << dendl;
+
+ send_notify(AsyncCompletePayload(request, r),
+ new FunctionContext(boost::bind(&ImageWatcher<I>::handle_async_complete,
+ this, request, r, _1)));
+}
+
+// Completion of the "async complete" notification.
+//   r       - result of the async request that was being reported
+//   ret_val - result of sending the notification
+// A timed-out notification is re-sent while the watcher is not unregistered;
+// otherwise the request is dropped from m_async_pending.
+template <typename I>
+void ImageWatcher<I>::handle_async_complete(const AsyncRequestId &request,
+ int r, int ret_val) {
+ ldout(m_image_ctx.cct, 20) << this << " " << __func__ << ": "
+ << "request=" << request << ", r=" << ret_val
+ << dendl;
+ if (ret_val < 0) {
+ lderr(m_image_ctx.cct) << this << " failed to notify async complete: "
+ << cpp_strerror(ret_val) << dendl;
+ if (ret_val == -ETIMEDOUT && !is_unregistered()) {
+ // the retry starts its own tracked op before this one is finished
+ schedule_async_complete(request, r);
+ m_async_op_tracker.finish_op();
+ return;
+ }
+ }
+
+ RWLock::WLocker async_request_locker(m_async_request_lock);
+ m_async_pending.erase(request);
+ m_async_op_tracker.finish_op();
+}
+
+// Sends a FlattenPayload to the lock owner as an async request identified by
+// (this client's id, request_id). Asserts that owner_lock is held, that an
+// exclusive lock exists and that this client is not the lock owner.
+template <typename I>
+void ImageWatcher<I>::notify_flatten(uint64_t request_id,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id, FlattenPayload(async_request_id),
+ prog_ctx, on_finish);
+}
+
+// Sends a ResizePayload (size, allow_shrink) to the lock owner as an async
+// request identified by (this client's id, request_id). Asserts that
+// owner_lock is held, that an exclusive lock exists and that this client is
+// not the lock owner.
+template <typename I>
+void ImageWatcher<I>::notify_resize(uint64_t request_id, uint64_t size,
+ bool allow_shrink,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ ResizePayload(size, allow_shrink, async_request_id),
+ prog_ctx, on_finish);
+}
+
+// Sends a SnapCreatePayload to the lock owner; on_finish is handed to
+// notify_lock_owner(). Asserts that owner_lock is held, that an exclusive
+// lock exists and that this client is not the lock owner.
+template <typename I>
+void ImageWatcher<I>::notify_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ notify_lock_owner(SnapCreatePayload(snap_namespace, snap_name), on_finish);
+}
+
+// Sends a SnapRenamePayload (source snap id, destination name) to the lock
+// owner. Asserts that owner_lock is held, that an exclusive lock exists and
+// that this client is not the lock owner.
+template <typename I>
+void ImageWatcher<I>::notify_snap_rename(const snapid_t &src_snap_id,
+ const std::string &dst_snap_name,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ notify_lock_owner(SnapRenamePayload(src_snap_id, dst_snap_name), on_finish);
+}
+
+// Sends a SnapRemovePayload to the lock owner. Asserts that owner_lock is
+// held, that an exclusive lock exists and that this client is not the lock
+// owner.
+template <typename I>
+void ImageWatcher<I>::notify_snap_remove(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ notify_lock_owner(SnapRemovePayload(snap_namespace, snap_name), on_finish);
+}
+
+// Sends a SnapProtectPayload to the lock owner. Asserts that owner_lock is
+// held, that an exclusive lock exists and that this client is not the lock
+// owner.
+template <typename I>
+void ImageWatcher<I>::notify_snap_protect(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ notify_lock_owner(SnapProtectPayload(snap_namespace, snap_name), on_finish);
+}
+
+// Sends a SnapUnprotectPayload to the lock owner. Asserts that owner_lock is
+// held, that an exclusive lock exists and that this client is not the lock
+// owner.
+template <typename I>
+void ImageWatcher<I>::notify_snap_unprotect(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ notify_lock_owner(SnapUnprotectPayload(snap_namespace, snap_name), on_finish);
+}
+
+// Sends a RebuildObjectMapPayload to the lock owner as an async request
+// identified by (this client's id, request_id). Asserts that owner_lock is
+// held, that an exclusive lock exists and that this client is not the lock
+// owner.
+template <typename I>
+void ImageWatcher<I>::notify_rebuild_object_map(uint64_t request_id,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ RebuildObjectMapPayload(async_request_id),
+ prog_ctx, on_finish);
+}
+
+// Sends a RenamePayload with the new image name to the lock owner. Asserts
+// that owner_lock is held, that an exclusive lock exists and that this
+// client is not the lock owner.
+template <typename I>
+void ImageWatcher<I>::notify_rename(const std::string &image_name,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ notify_lock_owner(RenamePayload(image_name), on_finish);
+}
+
+// Sends an UpdateFeaturesPayload (feature bits, enabled flag) to the lock
+// owner. Asserts that owner_lock is held, that an exclusive lock exists and
+// that this client is not the lock owner.
+template <typename I>
+void ImageWatcher<I>::notify_update_features(uint64_t features, bool enabled,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ notify_lock_owner(UpdateFeaturesPayload(features, enabled), on_finish);
+}
+
+// Sends a MigratePayload to the lock owner as an async request identified by
+// (this client's id, request_id). Asserts that owner_lock is held, that an
+// exclusive lock exists and that this client is not the lock owner.
+template <typename I>
+void ImageWatcher<I>::notify_migrate(uint64_t request_id,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id, MigratePayload(async_request_id),
+ prog_ctx, on_finish);
+}
+
+// Sends a SparsifyPayload (with sparse_size) to the lock owner as an async
+// request identified by (this client's id, request_id). Asserts that
+// owner_lock is held, that an exclusive lock exists and that this client is
+// not the lock owner.
+template <typename I>
+void ImageWatcher<I>::notify_sparsify(uint64_t request_id, size_t sparse_size,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ SparsifyPayload(async_request_id, sparse_size), prog_ctx,
+ on_finish);
+}
+
+// Broadcasts a HeaderUpdatePayload through this watcher; on_finish is passed
+// to send_notify() as the completion callback.
+template <typename I>
+void ImageWatcher<I>::notify_header_update(Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << this << ": " << __func__ << dendl;
+
+ // supports legacy (empty buffer) clients
+ send_notify(HeaderUpdatePayload(), on_finish);
+}
+
+// Sends a HeaderUpdatePayload notification on object `oid` through the
+// supplied IoCtx. Uses no member state; the return value of notify2() is not
+// checked and no reply buffer is requested (nullptr).
+template <typename I>
+void ImageWatcher<I>::notify_header_update(librados::IoCtx &io_ctx,
+ const std::string &oid) {
+ // supports legacy (empty buffer) clients
+ bufferlist bl;
+ encode(NotifyMessage(HeaderUpdatePayload()), bl);
+ io_ctx.notify2(oid, bl, watcher::Notifier::NOTIFY_TIMEOUT, nullptr);
+}
+
+// Queues cancel_async_requests() on the task finisher under the
+// TASK_CODE_CANCEL_ASYNC_REQUESTS task key instead of running it inline.
+template <typename I>
+void ImageWatcher<I>::schedule_cancel_async_requests() {
+ FunctionContext *ctx = new FunctionContext(
+ boost::bind(&ImageWatcher<I>::cancel_async_requests, this));
+ m_task_finisher->queue(TASK_CODE_CANCEL_ASYNC_REQUESTS, ctx);
+}
+
+// Completes every registered async request with -ERESTART and forgets them
+// all. The completions run while m_async_request_lock is write-locked.
+template <typename I>
+void ImageWatcher<I>::cancel_async_requests() {
+  RWLock::WLocker async_request_locker(m_async_request_lock);
+  for (auto &request : m_async_requests) {
+    request.second.first->complete(-ERESTART);
+  }
+  m_async_requests.clear();
+}
+
+// Records (and logs) the client id of the current lock owner.
+// Caller must hold m_owner_client_id_lock (asserted).
+template <typename I>
+void ImageWatcher<I>::set_owner_client_id(const ClientId& client_id) {
+ ceph_assert(m_owner_client_id_lock.is_locked());
+ m_owner_client_id = client_id;
+ ldout(m_image_ctx.cct, 10) << this << " current lock owner: "
+ << m_owner_client_id << dendl;
+}
+
+// Returns this client's identity: the md_ctx instance id paired with the
+// current watch handle (read under m_watch_lock).
+template <typename I>
+ClientId ImageWatcher<I>::get_client_id() {
+ RWLock::RLocker l(this->m_watch_lock);
+ return ClientId(m_image_ctx.md_ctx.get_instance_id(), this->m_watch_handle);
+}
+
+// Records this client as the lock owner and broadcasts an
+// AcquiredLockPayload carrying its client id.
+template <typename I>
+void ImageWatcher<I>::notify_acquired_lock() {
+ ldout(m_image_ctx.cct, 10) << this << " notify acquired lock" << dendl;
+
+ ClientId client_id = get_client_id();
+ {
+ Mutex::Locker owner_client_id_locker(m_owner_client_id_lock);
+ set_owner_client_id(client_id);
+ }
+
+ send_notify(AcquiredLockPayload(client_id));
+}
+
+// Resets the recorded lock owner to a default-constructed ClientId and
+// broadcasts a ReleasedLockPayload carrying this client's id.
+template <typename I>
+void ImageWatcher<I>::notify_released_lock() {
+ ldout(m_image_ctx.cct, 10) << this << " notify released lock" << dendl;
+
+ {
+ Mutex::Locker owner_client_id_locker(m_owner_client_id_lock);
+ set_owner_client_id(ClientId());
+ }
+
+ send_notify(ReleasedLockPayload(get_client_id()));
+}
+
+// Arranges for notify_request_lock() to run, either right away through the
+// task queue or after a delay. A negative timer_delay selects
+// RETRY_DELAY_SECONDS. Nothing is scheduled when the exclusive lock has been
+// disabled or the watch is not registered. Caller must hold owner_lock.
+template <typename I>
+void ImageWatcher<I>::schedule_request_lock(bool use_timer, int timer_delay) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    // exclusive lock dynamically disabled via image refresh
+    return;
+  }
+  ceph_assert(m_image_ctx.exclusive_lock &&
+              !m_image_ctx.exclusive_lock->is_lock_owner());
+
+  RWLock::RLocker watch_locker(this->m_watch_lock);
+  if (!this->is_registered(this->m_watch_lock)) {
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl;
+
+  FunctionContext *request_ctx = new FunctionContext(
+    boost::bind(&ImageWatcher<I>::notify_request_lock, this));
+  if (!use_timer) {
+    m_task_finisher->queue(TASK_CODE_REQUEST_LOCK, request_ctx);
+    return;
+  }
+
+  if (timer_delay < 0) {
+    timer_delay = RETRY_DELAY_SECONDS;
+  }
+  m_task_finisher->add_event_after(TASK_CODE_REQUEST_LOCK, timer_delay,
+                                   request_ctx);
+}
+
+// Sends a RequestLockPayload (force = false) to the lock owner unless the
+// exclusive lock is disabled or already owned by this client. The result is
+// delivered to handle_request_lock().
+template <typename I>
+void ImageWatcher<I>::notify_request_lock() {
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+
+ // ExclusiveLock state machine can be dynamically disabled or
+ // race with task cancel
+ if (m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner()) {
+ return;
+ }
+
+ ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl;
+
+ notify_lock_owner(RequestLockPayload(get_client_id(), false),
+ create_context_callback<
+ ImageWatcher, &ImageWatcher<I>::handle_request_lock>(this));
+}
+
+// Handles the outcome of a lock request sent by notify_request_lock():
+//   -ETIMEDOUT : peer notification with 0 (see comment below)
+//   -EROFS     : peer notification with the error itself
+//   other < 0  : schedule another request on the timer (default delay)
+//   otherwise  : schedule another request after client_notify_timeout
+template <typename I>
+void ImageWatcher<I>::handle_request_lock(int r) {
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+
+ // ExclusiveLock state machine cannot transition -- but can be
+ // dynamically disabled
+ if (m_image_ctx.exclusive_lock == nullptr) {
+ return;
+ }
+
+ if (r == -ETIMEDOUT) {
+ ldout(m_image_ctx.cct, 5) << this << " timed out requesting lock: retrying"
+ << dendl;
+
+ // treat this as a dead client -- so retest acquiring the lock
+ m_image_ctx.exclusive_lock->handle_peer_notification(0);
+ } else if (r == -EROFS) {
+ ldout(m_image_ctx.cct, 5) << this << " peer will not release lock" << dendl;
+ m_image_ctx.exclusive_lock->handle_peer_notification(r);
+ } else if (r < 0) {
+ lderr(m_image_ctx.cct) << this << " error requesting lock: "
+ << cpp_strerror(r) << dendl;
+ schedule_request_lock(true);
+ } else {
+ // lock owner acked -- but resend if we don't see them release the lock
+ int retry_timeout = m_image_ctx.cct->_conf.template get_val<int64_t>(
+ "client_notify_timeout");
+ ldout(m_image_ctx.cct, 15) << this << " will retry in " << retry_timeout
+ << " seconds" << dendl;
+ schedule_request_lock(true, retry_timeout);
+ }
+}
+
+// Encodes the payload as a NotifyMessage and sends it through a
+// NotifyLockOwner request. on_finish must be non-null and the caller must
+// hold owner_lock (both asserted).
+template <typename I>
+void ImageWatcher<I>::notify_lock_owner(const Payload& payload,
+ Context *on_finish) {
+ ceph_assert(on_finish != nullptr);
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+ bufferlist bl;
+ encode(NotifyMessage(payload), bl);
+
+ NotifyLockOwner *notify_lock_owner = NotifyLockOwner::create(
+ m_image_ctx, this->m_notifier, std::move(bl), on_finish);
+ notify_lock_owner->send();
+}
+
+// Removes the async request registered under `id` and hands back its
+// completion context; returns nullptr when no such request is registered.
+template <typename I>
+Context *ImageWatcher<I>::remove_async_request(const AsyncRequestId &id) {
+  RWLock::WLocker async_request_locker(m_async_request_lock);
+  auto request_it = m_async_requests.find(id);
+  if (request_it == m_async_requests.end()) {
+    return nullptr;
+  }
+
+  Context *completion = request_it->second.first;
+  m_async_requests.erase(request_it);
+  return completion;
+}
+
+// (Re)arms the timeout for an async request: any task already registered
+// under (TASK_CODE_ASYNC_REQUEST, id) is cancelled, then
+// async_request_timed_out(id) is scheduled to run after the configured
+// rbd_request_timed_out_seconds.
+template <typename I>
+void ImageWatcher<I>::schedule_async_request_timed_out(const AsyncRequestId &id) {
+ ldout(m_image_ctx.cct, 20) << "scheduling async request time out: " << id
+ << dendl;
+
+ Context *ctx = new FunctionContext(boost::bind(
+ &ImageWatcher<I>::async_request_timed_out, this, id));
+
+ Task task(TASK_CODE_ASYNC_REQUEST, id);
+ m_task_finisher->cancel(task);
+
+ m_task_finisher->add_event_after(
+ task, m_image_ctx.config.template get_val<uint64_t>("rbd_request_timed_out_seconds"),
+ ctx);
+}
+
+// Timer callback: if the request is still registered, drop it and queue its
+// completion on the op work queue with -ETIMEDOUT.
+template <typename I>
+void ImageWatcher<I>::async_request_timed_out(const AsyncRequestId &id) {
+  Context *completion = remove_async_request(id);
+  if (completion == nullptr) {
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 5) << "async request timed out: " << id << dendl;
+  m_image_ctx.op_work_queue->queue(completion, -ETIMEDOUT);
+}
+
+// Registers an async request and forwards its payload to the lock owner.
+//   on_notify   - runs when the notification itself completes; on failure it
+//                 removes the request and completes it with the error
+//   on_complete - stored in m_async_requests; cancels the request's timeout
+//                 task and then completes the caller's on_finish
+// on_finish must be non-null and the caller must hold owner_lock (asserted).
+// The address of prog_ctx is stored alongside on_complete.
+template <typename I>
+void ImageWatcher<I>::notify_async_request(const AsyncRequestId &async_request_id,
+ const Payload& payload,
+ ProgressContext& prog_ctx,
+ Context *on_finish) {
+ ceph_assert(on_finish != nullptr);
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+ ldout(m_image_ctx.cct, 10) << this << " async request: " << async_request_id
+ << dendl;
+
+ Context *on_notify = new FunctionContext([this, async_request_id](int r) {
+ if (r < 0) {
+ // notification failed -- don't expect updates
+ Context *on_complete = remove_async_request(async_request_id);
+ if (on_complete != nullptr) {
+ on_complete->complete(r);
+ }
+ }
+ });
+
+ Context *on_complete = new FunctionContext(
+ [this, async_request_id, on_finish](int r) {
+ m_task_finisher->cancel(Task(TASK_CODE_ASYNC_REQUEST, async_request_id));
+ on_finish->complete(r);
+ });
+
+ {
+ RWLock::WLocker async_request_locker(m_async_request_lock);
+ m_async_requests[async_request_id] = AsyncRequest(on_complete, &prog_ctx);
+ }
+
+ // register and arm the timeout before the notification is sent
+ schedule_async_request_timed_out(async_request_id);
+ notify_lock_owner(payload, on_notify);
+}
+
+// Prepares to service an async request received from a peer.
+// Returns -ERESTART when the request originated from this very client
+// (outputs untouched), otherwise 0. On 0, *new_request tells whether the id
+// was newly added to m_async_pending; only then are *prog_ctx and *ctx set
+// to freshly allocated remote progress/completion contexts.
+template <typename I>
+int ImageWatcher<I>::prepare_async_request(const AsyncRequestId& async_request_id,
+                                           bool* new_request, Context** ctx,
+                                           ProgressContext** prog_ctx) {
+  if (async_request_id.client_id == get_client_id()) {
+    return -ERESTART;
+  }
+
+  RWLock::WLocker async_request_locker(m_async_request_lock);
+  if (m_async_pending.count(async_request_id) != 0) {
+    *new_request = false;
+    return 0;
+  }
+
+  m_async_pending.insert(async_request_id);
+  *new_request = true;
+  *prog_ctx = new RemoteProgressContext(*this, async_request_id);
+  *ctx = new RemoteContext(*this, async_request_id, *prog_ctx);
+  return 0;
+}
+
+// Header update notification: informs the image state and bumps the
+// l_librbd_notify perf counter. With a non-null ack_ctx, a C_ResponseMessage
+// wrapping it is passed to flush_update_watchers() and false is returned;
+// with a null ack_ctx (as passed by handle_rewatch_complete()) true is
+// returned.
+// NOTE(review): presumably `true` tells the caller to ack immediately --
+// confirm against HandlePayloadVisitor.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const HeaderUpdatePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " image header updated" << dendl;
+
+ m_image_ctx.state->handle_update_notification();
+ m_image_ctx.perfcounter->inc(l_librbd_notify);
+ if (ack_ctx != nullptr) {
+ m_image_ctx.state->flush_update_watchers(new C_ResponseMessage(ack_ctx));
+ return false;
+ }
+ return true;
+}
+
+// A peer announced that it acquired the exclusive lock. Records the new
+// owner (when the payload carries a valid client id), pokes the local
+// exclusive lock state machine, and schedules cancellation of outstanding
+// async requests unless the announced owner equals the recorded owner or
+// this client owns the lock. Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const AcquiredLockPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " image exclusively locked announcement"
+ << dendl;
+
+ bool cancel_async_requests = true;
+ if (payload.client_id.is_valid()) {
+ Mutex::Locker owner_client_id_locker(m_owner_client_id_lock);
+ if (payload.client_id == m_owner_client_id) {
+ // same owner as already recorded -- keep outstanding requests
+ cancel_async_requests = false;
+ }
+ set_owner_client_id(payload.client_id);
+ }
+
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ if (m_image_ctx.exclusive_lock != nullptr) {
+ // potentially wake up the exclusive lock state machine now that
+ // a lock owner has advertised itself
+ m_image_ctx.exclusive_lock->handle_peer_notification(0);
+ }
+ if (cancel_async_requests &&
+ (m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->is_lock_owner())) {
+ schedule_cancel_async_requests();
+ }
+ return true;
+}
+
+// A peer announced that it released the exclusive lock. When the releasing
+// client matches the recorded owner, the recorded owner is cleared; a
+// mismatching (valid) client id suppresses cancellation of outstanding async
+// requests. If this client is not the lock owner, the pending
+// TASK_CODE_REQUEST_LOCK task is cancelled and the exclusive lock state
+// machine is notified. Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const ReleasedLockPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " exclusive lock released" << dendl;
+
+ bool cancel_async_requests = true;
+ if (payload.client_id.is_valid()) {
+ Mutex::Locker l(m_owner_client_id_lock);
+ if (payload.client_id != m_owner_client_id) {
+ ldout(m_image_ctx.cct, 10) << this << " unexpected owner: "
+ << payload.client_id << " != "
+ << m_owner_client_id << dendl;
+ cancel_async_requests = false;
+ } else {
+ set_owner_client_id(ClientId());
+ }
+ }
+
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ if (cancel_async_requests &&
+ (m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->is_lock_owner())) {
+ schedule_cancel_async_requests();
+ }
+
+ // alert the exclusive lock state machine that the lock is available
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ !m_image_ctx.exclusive_lock->is_lock_owner()) {
+ m_task_finisher->cancel(TASK_CODE_REQUEST_LOCK);
+ m_image_ctx.exclusive_lock->handle_peer_notification(0);
+ }
+ return true;
+}
+
+// A peer asked for the exclusive lock. Requests that originated from this
+// client are ignored. If this client owns the lock, the exclusive lock
+// decides whether the request is accepted; when accepted (and a valid owner
+// id is recorded) the lock policy's lock_requested(force) is consulted. The
+// resulting code r is encoded into the ack. Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const RequestLockPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " exclusive lock requested" << dendl;
+ if (payload.client_id == get_client_id()) {
+ return true;
+ }
+
+ RWLock::RLocker l(m_image_ctx.owner_lock);
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ m_image_ctx.exclusive_lock->is_lock_owner()) {
+ int r = 0;
+ bool accept_request = m_image_ctx.exclusive_lock->accept_request(
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r);
+
+ if (accept_request) {
+ ceph_assert(r == 0);
+ Mutex::Locker owner_client_id_locker(m_owner_client_id_lock);
+ if (!m_owner_client_id.is_valid()) {
+ // no valid owner recorded -- return without encoding a response
+ return true;
+ }
+
+ ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
+ << dendl;
+ r = m_image_ctx.get_exclusive_lock_policy()->lock_requested(
+ payload.force);
+ }
+ encode(ResponseMessage(r), ack_ctx->out);
+ }
+ return true;
+}
+
+// Progress update for an async request. If the request is still registered,
+// its timeout is re-armed and the progress is forwarded to the stored
+// ProgressContext, all under the read lock. Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const AsyncProgressPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ RWLock::RLocker l(m_async_request_lock);
+ std::map<AsyncRequestId, AsyncRequest>::iterator req_it =
+ m_async_requests.find(payload.async_request_id);
+ if (req_it != m_async_requests.end()) {
+ ldout(m_image_ctx.cct, 20) << this << " request progress: "
+ << payload.async_request_id << " @ "
+ << payload.offset << "/" << payload.total
+ << dendl;
+ schedule_async_request_timed_out(payload.async_request_id);
+ req_it->second.second->update_progress(payload.offset, payload.total);
+ }
+ return true;
+}
+
+// Completion notice for an async request: if the request is still
+// registered it is removed and completed with the reported result.
+// Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const AsyncCompletePayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  Context *completion = remove_async_request(payload.async_request_id);
+  if (completion == nullptr) {
+    return true;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " request finished: "
+                             << payload.async_request_id << "="
+                             << payload.result << dendl;
+  completion->complete(payload.result);
+  return true;
+}
+
+// Remote flatten request. When the exclusive lock accepts the request and it
+// has not been seen before, a flatten is started with remote
+// progress/completion contexts; the prepare result is encoded into the ack.
+// A refused request is answered only when an error code was supplied.
+// Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const FlattenPayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  bool is_new_request;
+  Context *on_complete;
+  ProgressContext *progress;
+  r = prepare_async_request(payload.async_request_id, &is_new_request,
+                            &on_complete, &progress);
+  if (r == 0 && is_new_request) {
+    ldout(m_image_ctx.cct, 10) << this << " remote flatten request: "
+                               << payload.async_request_id << dendl;
+    m_image_ctx.operations->execute_flatten(*progress, on_complete);
+  }
+
+  encode(ResponseMessage(r), ack_ctx->out);
+  return true;
+}
+
+// Remote resize request. When the exclusive lock accepts the request and it
+// has not been seen before, a resize is started with remote
+// progress/completion contexts; the prepare result is encoded into the ack.
+// A refused request is answered only when an error code was supplied.
+// Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const ResizePayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  bool is_new_request;
+  Context *on_complete;
+  ProgressContext *progress;
+  r = prepare_async_request(payload.async_request_id, &is_new_request,
+                            &on_complete, &progress);
+  if (r == 0 && is_new_request) {
+    ldout(m_image_ctx.cct, 10) << this << " remote resize request: "
+                               << payload.async_request_id << " "
+                               << payload.size << " "
+                               << payload.allow_shrink << dendl;
+    m_image_ctx.operations->execute_resize(payload.size, payload.allow_shrink,
+                                           *progress, on_complete, 0);
+  }
+
+  encode(ResponseMessage(r), ack_ctx->out);
+  return true;
+}
+
+// Remote snap_create request. When accepted, the operation is started with a
+// C_ResponseMessage wrapping ack_ctx and false is returned. Otherwise true
+// is returned, with an error response encoded only when accept_request()
+// supplied a negative code.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapCreatePayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " remote snap_create request: "
+                             << payload.snap_name << dendl;
+
+  m_image_ctx.operations->execute_snap_create(payload.snap_namespace,
+                                              payload.snap_name,
+                                              new C_ResponseMessage(ack_ctx),
+                                              0, false);
+  return false;
+}
+
+// Remote snap_rename request. When accepted, the operation is started with a
+// C_ResponseMessage wrapping ack_ctx and false is returned. Otherwise true
+// is returned, with an error response encoded only when accept_request()
+// supplied a negative code.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapRenamePayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " remote snap_rename request: "
+                             << payload.snap_id << " to "
+                             << payload.snap_name << dendl;
+
+  m_image_ctx.operations->execute_snap_rename(payload.snap_id,
+                                              payload.snap_name,
+                                              new C_ResponseMessage(ack_ctx));
+  return false;
+}
+
+// Remote snap_remove request. Snapshots in the trash namespace are checked
+// with the TRASH_SNAP_REMOVE request type, all others with GENERAL. When
+// accepted, the operation is started with a C_ResponseMessage wrapping
+// ack_ctx and false is returned. Otherwise true is returned, with an error
+// response encoded only when accept_request() supplied a negative code.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapRemovePayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL;
+  if (cls::rbd::get_snap_namespace_type(payload.snap_namespace) ==
+        cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) {
+    request_type = exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(request_type, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " remote snap_remove request: "
+                             << payload.snap_name << dendl;
+
+  m_image_ctx.operations->execute_snap_remove(payload.snap_namespace,
+                                              payload.snap_name,
+                                              new C_ResponseMessage(ack_ctx));
+  return false;
+}
+
+// Remote snap_protect request. When accepted, the operation is started with
+// a C_ResponseMessage wrapping ack_ctx and false is returned. Otherwise true
+// is returned, with an error response encoded only when accept_request()
+// supplied a negative code.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapProtectPayload& payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " remote snap_protect request: "
+                             << payload.snap_name << dendl;
+
+  m_image_ctx.operations->execute_snap_protect(payload.snap_namespace,
+                                               payload.snap_name,
+                                               new C_ResponseMessage(ack_ctx));
+  return false;
+}
+
+// Remote snap_unprotect request. When accepted, the operation is started
+// with a C_ResponseMessage wrapping ack_ctx and false is returned. Otherwise
+// true is returned, with an error response encoded only when
+// accept_request() supplied a negative code.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapUnprotectPayload& payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " remote snap_unprotect request: "
+                             << payload.snap_name << dendl;
+
+  m_image_ctx.operations->execute_snap_unprotect(payload.snap_namespace,
+                                                 payload.snap_name,
+                                                 new C_ResponseMessage(ack_ctx));
+  return false;
+}
+
+// Remote rebuild-object-map request. When the exclusive lock accepts the
+// request and it has not been seen before, the rebuild is started with
+// remote progress/completion contexts; the prepare result is encoded into
+// the ack. A refused request is answered only when an error code was
+// supplied. Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const RebuildObjectMapPayload& payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  bool is_new_request;
+  Context *on_complete;
+  ProgressContext *progress;
+  r = prepare_async_request(payload.async_request_id, &is_new_request,
+                            &on_complete, &progress);
+  if (r == 0 && is_new_request) {
+    ldout(m_image_ctx.cct, 10) << this
+                               << " remote rebuild object map request: "
+                               << payload.async_request_id << dendl;
+    m_image_ctx.operations->execute_rebuild_object_map(*progress, on_complete);
+  }
+
+  encode(ResponseMessage(r), ack_ctx->out);
+  return true;
+}
+
+// Remote rename request. When accepted, the operation is started with a
+// C_ResponseMessage wrapping ack_ctx and false is returned. Otherwise true
+// is returned, with an error response encoded only when accept_request()
+// supplied a negative code.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const RenamePayload& payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " remote rename request: "
+                             << payload.image_name << dendl;
+
+  m_image_ctx.operations->execute_rename(payload.image_name,
+                                         new C_ResponseMessage(ack_ctx));
+  return false;
+}
+
+// Remote update_features request. When accepted, the operation is started
+// with a C_ResponseMessage wrapping ack_ctx and false is returned. Otherwise
+// true is returned, with an error response encoded only when
+// accept_request() supplied a negative code.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const UpdateFeaturesPayload& payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " remote update_features request: "
+                             << payload.features << " "
+                             << (payload.enabled ? "enabled" : "disabled")
+                             << dendl;
+
+  m_image_ctx.operations->execute_update_features(
+    payload.features, payload.enabled, new C_ResponseMessage(ack_ctx), 0);
+  return false;
+}
+
+// Remote migrate request. When the exclusive lock accepts the request and it
+// has not been seen before, the migration is started with remote
+// progress/completion contexts; the prepare result is encoded into the ack.
+// A refused request is answered only when an error code was supplied.
+// Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const MigratePayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  bool is_new_request;
+  Context *on_complete;
+  ProgressContext *progress;
+  r = prepare_async_request(payload.async_request_id, &is_new_request,
+                            &on_complete, &progress);
+  if (r == 0 && is_new_request) {
+    ldout(m_image_ctx.cct, 10) << this << " remote migrate request: "
+                               << payload.async_request_id << dendl;
+    m_image_ctx.operations->execute_migrate(*progress, on_complete);
+  }
+
+  encode(ResponseMessage(r), ack_ctx->out);
+  return true;
+}
+
+// Remote sparsify request. When the exclusive lock accepts the request and
+// it has not been seen before, the sparsify is started with remote
+// progress/completion contexts; the prepare result is encoded into the ack.
+// A refused request is answered only when an error code was supplied.
+// Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SparsifyPayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  if (!m_image_ctx.exclusive_lock->accept_request(
+        exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r)) {
+    if (r < 0) {
+      encode(ResponseMessage(r), ack_ctx->out);
+    }
+    return true;
+  }
+
+  bool is_new_request;
+  Context *on_complete;
+  ProgressContext *progress;
+  r = prepare_async_request(payload.async_request_id, &is_new_request,
+                            &on_complete, &progress);
+  if (r == 0 && is_new_request) {
+    ldout(m_image_ctx.cct, 10) << this << " remote sparsify request: "
+                               << payload.async_request_id << dendl;
+    m_image_ctx.operations->execute_sparsify(payload.sparse_size, *progress,
+                                             on_complete);
+  }
+
+  encode(ResponseMessage(r), ack_ctx->out);
+  return true;
+}
+
+// Unrecognised payload: replies with -EOPNOTSUPP when the exclusive lock
+// either accepts requests or reports an error code; otherwise no response is
+// encoded. Always returns true.
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const UnknownPayload &payload,
+                                     C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    return true;
+  }
+
+  int r;
+  const bool accepted = m_image_ctx.exclusive_lock->accept_request(
+    exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r);
+  if (accepted || r < 0) {
+    encode(ResponseMessage(-EOPNOTSUPP), ack_ctx->out);
+  }
+  return true;
+}
+
+// Applies a HandlePayloadVisitor, built from this watcher plus notify_id and
+// handle, to the payload variant.
+template <typename I>
+void ImageWatcher<I>::process_payload(uint64_t notify_id, uint64_t handle,
+ const Payload &payload) {
+ apply_visitor(HandlePayloadVisitor<ImageWatcher<I>>(this, notify_id, handle),
+ payload);
+}
+
+// Entry point for an incoming watch notification. An empty buffer is treated
+// as a legacy header update; otherwise the buffer is decoded into a
+// NotifyMessage. On a decode error the function logs and returns without
+// processing the notification. If the message asks for a refresh check and
+// the image state requires a refresh, processing is deferred to
+// C_ProcessPayload after the refresh; otherwise the payload is processed
+// inline.
+template <typename I>
+void ImageWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) {
+ NotifyMessage notify_message;
+ if (bl.length() == 0) {
+ // legacy notification for header updates
+ notify_message = NotifyMessage(HeaderUpdatePayload());
+ } else {
+ try {
+ auto iter = bl.cbegin();
+ decode(notify_message, iter);
+ } catch (const buffer::error &err) {
+ lderr(m_image_ctx.cct) << this << " error decoding image notification: "
+ << err.what() << dendl;
+ return;
+ }
+ }
+
+ // if an image refresh is required, refresh before processing the request
+ if (notify_message.check_for_refresh() &&
+ m_image_ctx.state->is_refresh_required()) {
+ m_image_ctx.state->refresh(new C_ProcessPayload(this, notify_id, handle,
+ notify_message.payload));
+ } else {
+ process_payload(notify_id, handle, notify_message.payload);
+ }
+}
+
+// Watch failure: logs the error, resets the recorded lock owner to a
+// default-constructed ClientId and then delegates to Watcher::handle_error().
+template <typename I>
+void ImageWatcher<I>::handle_error(uint64_t handle, int err) {
+ lderr(m_image_ctx.cct) << this << " image watch failed: " << handle << ", "
+ << cpp_strerror(err) << dendl;
+
+ {
+ Mutex::Locker l(m_owner_client_id_lock);
+ set_owner_client_id(ClientId());
+ }
+
+ Watcher::handle_error(handle, err);
+}
+
+// Called with the result of re-establishing the watch. Asks the exclusive
+// lock (if any) to reacquire, then handles a synthetic header update with a
+// null ack context. The result code r is only logged.
+template <typename I>
+void ImageWatcher<I>::handle_rewatch_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ {
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ if (m_image_ctx.exclusive_lock != nullptr) {
+ // update the lock cookie with the new watch handle
+ m_image_ctx.exclusive_lock->reacquire_lock(nullptr);
+ }
+ }
+
+ // image might have been updated while we didn't have active watch
+ handle_payload(HeaderUpdatePayload(), nullptr);
+}
+
+// Encodes the payload as a NotifyMessage and forwards it to
+// Watcher::send_notify() with a null second argument and ctx as the
+// completion callback.
+template <typename I>
+void ImageWatcher<I>::send_notify(const Payload &payload, Context *ctx) {
+ bufferlist bl;
+
+ encode(NotifyMessage(payload), bl);
+ Watcher::send_notify(bl, nullptr, ctx);
+}
+
+// Reports the result r of a remotely requested operation by scheduling an
+// async-complete notification for the associated request id.
+template <typename I>
+void ImageWatcher<I>::RemoteContext::finish(int r) {
+ m_image_watcher.schedule_async_complete(m_async_request_id, r);
+}
+
+// Encodes r as a ResponseMessage into the pending notify ack's output buffer
+// and then completes the ack context with 0.
+template <typename I>
+void ImageWatcher<I>::C_ResponseMessage::finish(int r) {
+ CephContext *cct = notify_ack->cct;
+ ldout(cct, 10) << this << " C_ResponseMessage: r=" << r << dendl;
+
+ encode(ResponseMessage(r), notify_ack->out);
+ notify_ack->complete(0);
+}
+
+} // namespace librbd
+
+template class librbd::ImageWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h
new file mode 100644
index 00000000..7d04f0b4
--- /dev/null
+++ b/src/librbd/ImageWatcher.h
@@ -0,0 +1,268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_WATCHER_H
+#define CEPH_LIBRBD_IMAGE_WATCHER_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/Mutex.h"
+#include "common/RWLock.h"
+#include "common/AsyncOpTracker.h"
+#include "include/Context.h"
+#include "include/rbd/librbd.hpp"
+#include "librbd/Watcher.h"
+#include "librbd/WatchNotifyTypes.h"
+#include <set>
+#include <string>
+#include <utility>
+
+class entity_name_t;
+
+namespace librbd {
+
+namespace watcher {
+namespace util {
+template <typename> struct HandlePayloadVisitor;
+}
+}
+
+class ImageCtx;
+template <typename> class TaskFinisher;
+
+ template <typename ImageCtxT = ImageCtx>
+ class ImageWatcher : public Watcher { // Watcher subclass for one image: sends notify_* requests and dispatches received payloads through the handle_payload() overloads
+ friend struct watcher::util::HandlePayloadVisitor<ImageWatcher<ImageCtxT>>; // presumably so the visitor can reach the private handle_payload() overloads
+
+public:
+ ImageWatcher(ImageCtxT& image_ctx);
+ ~ImageWatcher() override;
+
+ void unregister_watch(Context *on_finish) override;
+ void block_notifies(Context *on_finish) override;
+
+ void notify_flatten(uint64_t request_id, ProgressContext &prog_ctx, // notify_* with a request_id also take a ProgressContext for progress reports
+ Context *on_finish);
+ void notify_resize(uint64_t request_id, uint64_t size, bool allow_shrink,
+ ProgressContext &prog_ctx, Context *on_finish);
+ void notify_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, // snapshot notify_* calls only take a completion context
+ const std::string &snap_name,
+ Context *on_finish);
+ void notify_snap_rename(const snapid_t &src_snap_id,
+ const std::string &dst_snap_name,
+ Context *on_finish);
+ void notify_snap_remove(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+ void notify_snap_protect(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+ void notify_snap_unprotect(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+ void notify_rebuild_object_map(uint64_t request_id,
+ ProgressContext &prog_ctx, Context *on_finish);
+ void notify_rename(const std::string &image_name, Context *on_finish);
+
+ void notify_update_features(uint64_t features, bool enabled,
+ Context *on_finish);
+
+ void notify_migrate(uint64_t request_id, ProgressContext &prog_ctx,
+ Context *on_finish);
+
+ void notify_sparsify(uint64_t request_id, size_t sparse_size,
+ ProgressContext &prog_ctx, Context *on_finish);
+
+ void notify_acquired_lock(); // lock-state notifications take no completion context
+ void notify_released_lock();
+ void notify_request_lock();
+
+ void notify_header_update(Context *on_finish);
+ static void notify_header_update(librados::IoCtx &io_ctx, // static variant: addressed by io_ctx + object id, no ImageWatcher instance needed
+ const std::string &oid);
+
+private:
+ enum TaskCode { // kind of work item wrapped by Task (see m_task_finisher)
+ TASK_CODE_REQUEST_LOCK,
+ TASK_CODE_CANCEL_ASYNC_REQUESTS,
+ TASK_CODE_REREGISTER_WATCH,
+ TASK_CODE_ASYNC_REQUEST,
+ TASK_CODE_ASYNC_PROGRESS
+ };
+
+ typedef std::pair<Context *, ProgressContext *> AsyncRequest; // completion context paired with its progress context
+
+ class Task { // key type for TaskFinisher<Task>: a task code plus an optional async request id
+ public:
+ Task(TaskCode task_code) : m_task_code(task_code) {} // NOTE(review): non-explicit, so a bare TaskCode converts implicitly to Task - confirm callers before adding `explicit`
+ Task(TaskCode task_code, const watch_notify::AsyncRequestId &id)
+ : m_task_code(task_code), m_async_request_id(id) {}
+
+ inline bool operator<(const Task& rhs) const { // orders by task code; the request id only breaks ties for the two async task codes
+ if (m_task_code != rhs.m_task_code) {
+ return m_task_code < rhs.m_task_code;
+ } else if ((m_task_code == TASK_CODE_ASYNC_REQUEST ||
+ m_task_code == TASK_CODE_ASYNC_PROGRESS) &&
+ m_async_request_id != rhs.m_async_request_id) {
+ return m_async_request_id < rhs.m_async_request_id;
+ }
+ return false; // same code (and same id where it matters): neither is less
+ }
+ private:
+ TaskCode m_task_code;
+ watch_notify::AsyncRequestId m_async_request_id;
+ };
+
+ class RemoteProgressContext : public ProgressContext { // forwards each progress update to ImageWatcher::schedule_async_progress()
+ public:
+ RemoteProgressContext(ImageWatcher &image_watcher,
+ const watch_notify::AsyncRequestId &id)
+ : m_image_watcher(image_watcher), m_async_request_id(id)
+ {
+ }
+
+ int update_progress(uint64_t offset, uint64_t total) override {
+ m_image_watcher.schedule_async_progress(m_async_request_id, offset,
+ total);
+ return 0; // always reports success to the caller
+ }
+
+ private:
+ ImageWatcher &m_image_watcher;
+ watch_notify::AsyncRequestId m_async_request_id;
+ };
+
+ class RemoteContext : public Context { // completion that reports via schedule_async_complete(); takes ownership of prog_ctx
+ public:
+ RemoteContext(ImageWatcher &image_watcher,
+ const watch_notify::AsyncRequestId &id,
+ ProgressContext *prog_ctx)
+ : m_image_watcher(image_watcher), m_async_request_id(id),
+ m_prog_ctx(prog_ctx)
+ {
+ }
+
+ ~RemoteContext() override {
+ delete m_prog_ctx; // owned: released together with this context
+ }
+
+ void finish(int r) override;
+
+ private:
+ ImageWatcher &m_image_watcher;
+ watch_notify::AsyncRequestId m_async_request_id;
+ ProgressContext *m_prog_ctx;
+ };
+
+ struct C_ProcessPayload; // only declared here; defined elsewhere
+ struct C_ResponseMessage : public Context { // encodes ResponseMessage(r) into notify_ack->out, then completes notify_ack with 0
+ C_NotifyAck *notify_ack;
+
+ C_ResponseMessage(C_NotifyAck *notify_ack) : notify_ack(notify_ack) {
+ }
+ void finish(int r) override;
+ };
+
+ ImageCtxT &m_image_ctx;
+
+ TaskFinisher<Task> *m_task_finisher;
+
+ RWLock m_async_request_lock; // presumably guards m_async_requests / m_async_pending - TODO confirm in the .cc
+ std::map<watch_notify::AsyncRequestId, AsyncRequest> m_async_requests; // NOTE(review): <map> is not included directly by this header; relies on a transitive include
+ std::set<watch_notify::AsyncRequestId> m_async_pending;
+
+ Mutex m_owner_client_id_lock; // held around set_owner_client_id() in handle_error()
+ watch_notify::ClientId m_owner_client_id;
+
+ AsyncOpTracker m_async_op_tracker;
+
+ void handle_register_watch(int r);
+
+ void schedule_cancel_async_requests();
+ void cancel_async_requests();
+
+ void set_owner_client_id(const watch_notify::ClientId &client_id);
+ watch_notify::ClientId get_client_id();
+
+ void handle_request_lock(int r);
+ void schedule_request_lock(bool use_timer, int timer_delay = -1); // meaning of the -1 default is defined in the .cc - TODO confirm
+
+ void notify_lock_owner(const watch_notify::Payload& payload,
+ Context *on_finish);
+
+ Context *remove_async_request(const watch_notify::AsyncRequestId &id);
+ void schedule_async_request_timed_out(const watch_notify::AsyncRequestId &id);
+ void async_request_timed_out(const watch_notify::AsyncRequestId &id);
+ void notify_async_request(const watch_notify::AsyncRequestId &id,
+ const watch_notify::Payload &payload,
+ ProgressContext& prog_ctx,
+ Context *on_finish);
+
+ void schedule_async_progress(const watch_notify::AsyncRequestId &id, // called by RemoteProgressContext::update_progress()
+ uint64_t offset, uint64_t total);
+ int notify_async_progress(const watch_notify::AsyncRequestId &id,
+ uint64_t offset, uint64_t total);
+ void schedule_async_complete(const watch_notify::AsyncRequestId &id, int r); // called by RemoteContext::finish()
+ void notify_async_complete(const watch_notify::AsyncRequestId &id, int r);
+ void handle_async_complete(const watch_notify::AsyncRequestId &request, int r,
+ int ret_val);
+
+ int prepare_async_request(const watch_notify::AsyncRequestId& id,
+ bool* new_request, Context** ctx,
+ ProgressContext** prog_ctx);
+
+ bool handle_payload(const watch_notify::HeaderUpdatePayload& payload, // one handle_payload() overload per watch_notify payload type
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::AcquiredLockPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::ReleasedLockPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::RequestLockPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::AsyncProgressPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::AsyncCompletePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::FlattenPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::ResizePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapCreatePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapRenamePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapRemovePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapProtectPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapUnprotectPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::RebuildObjectMapPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::RenamePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::UpdateFeaturesPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::MigratePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SparsifyPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::UnknownPayload& payload,
+ C_NotifyAck *ctx);
+ void process_payload(uint64_t notify_id, uint64_t handle,
+ const watch_notify::Payload &payload);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+ void handle_error(uint64_t cookie, int err) override; // the .cc definition names the first parameter `handle`
+ void handle_rewatch_complete(int r) override;
+
+ void send_notify(const watch_notify::Payload& payload, // encodes NotifyMessage(payload) and calls Watcher::send_notify()
+ Context *ctx = nullptr);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ImageWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_WATCHER_H
diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc
new file mode 100644
index 00000000..1be60cbf
--- /dev/null
+++ b/src/librbd/Journal.cc
@@ -0,0 +1,1780 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/Journal.h"
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/Journaler.h"
+#include "journal/Policy.h"
+#include "journal/ReplayEntry.h"
+#include "journal/Settings.h"
+#include "journal/Utils.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/DemoteRequest.h"
+#include "librbd/journal/ObjectDispatch.h"
+#include "librbd/journal/OpenRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/journal/ResetRequest.h"
+#include "librbd/journal/Replay.h"
+#include "librbd/journal/PromoteRequest.h"
+
+#include <boost/scope_exit.hpp>
+#include <utility>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Journal: "
+
+namespace librbd {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using journal::util::C_DecodeTag;
+using journal::util::C_DecodeTags;
+
+namespace {
+
+// TODO: once journaler is 100% async, remove separate threads and
+// reuse ImageCtx's thread pool
+ // Thread pool dedicated to journal work. The pool is started as soon as it is
+ // constructed and stopped again from its destructor.
+ class ThreadPoolSingleton : public ThreadPool {
+ public:
+   explicit ThreadPoolSingleton(CephContext *context)
+     : ThreadPool(context, "librbd::Journal", "tp_librbd_journ", 1) {
+     this->start();
+   }
+
+   ~ThreadPoolSingleton() override {
+     this->stop();
+   }
+ };
+
+ // Completion context for an asynchronous "is this image the tag owner?" query.
+ //
+ // It owns a heap-allocated Journaler plus the scratch storage (client,
+ // client_meta, tag_tid, tag_data) that the tag lookup fills in. On success
+ // *is_tag_owner is set to whether the tag's mirror uuid equals
+ // Journal<>::LOCAL_MIRROR_UUID; on failure it is left untouched. The user
+ // callback and the Journaler deletion are deferred to op_work_queue.
+ template <typename I>
+ struct C_IsTagOwner : public Context {
+   librados::IoCtx &io_ctx;
+   std::string image_id;
+   bool *is_tag_owner;
+   ContextWQ *op_work_queue;
+   Context *on_finish;
+
+   CephContext *cct = nullptr;
+   Journaler *journaler;
+   cls::journal::Client client;
+   journal::ImageClientMeta client_meta;
+   uint64_t tag_tid = 0;
+   journal::TagData tag_data;
+
+   C_IsTagOwner(librados::IoCtx &io_ctx, const std::string &image_id,
+                bool *is_tag_owner, ContextWQ *op_work_queue,
+                Context *on_finish)
+     : io_ctx(io_ctx),
+       image_id(image_id),
+       is_tag_owner(is_tag_owner),
+       op_work_queue(op_work_queue),
+       on_finish(on_finish),
+       cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+       journaler(new Journaler(io_ctx, image_id, Journal<>::IMAGE_CLIENT_ID,
+                               {})) {
+   }
+
+   void finish(int r) override {
+     ldout(cct, 20) << this << " C_IsTagOwner::" << __func__ << ": r=" << r
+                    << dendl;
+     if (r >= 0) {
+       *is_tag_owner = (tag_data.mirror_uuid == Journal<>::LOCAL_MIRROR_UUID);
+     } else {
+       lderr(cct) << this << " C_IsTagOwner::" << __func__ << ": "
+                  << "failed to get tag owner: " << cpp_strerror(r) << dendl;
+     }
+
+     // capture plain pointers rather than `this` in the deferred callback
+     Journaler *owned_journaler = this->journaler;
+     Context *user_ctx = this->on_finish;
+     FunctionContext *deferred = new FunctionContext(
+       [owned_journaler, user_ctx](int result) {
+         user_ctx->complete(result);
+         delete owned_journaler;
+       });
+     op_work_queue->queue(deferred, r);
+   }
+ };
+
+ // Completion context for an asynchronous tag-owner lookup.
+ //
+ // Holds the Journaler by value together with the scratch storage (client,
+ // client_meta, tag_tid, tag_data) that the tag lookup fills in. On success
+ // (r >= 0) the tag's mirror uuid is copied to *mirror_uuid; on failure
+ // *mirror_uuid is left untouched. on_finish is always completed with r.
+ struct C_GetTagOwner : public Context {
+   std::string *mirror_uuid;
+   Context *on_finish;
+
+   Journaler journaler;
+   cls::journal::Client client;
+   journal::ImageClientMeta client_meta;
+   uint64_t tag_tid = 0;
+   journal::TagData tag_data;
+
+   C_GetTagOwner(librados::IoCtx &io_ctx, const std::string &image_id,
+                 std::string *mirror_uuid, Context *on_finish)
+     : mirror_uuid(mirror_uuid), on_finish(on_finish),
+       journaler(io_ctx, image_id, Journal<>::IMAGE_CLIENT_ID, {}) {
+   }
+
+   // marked `override` (instead of a bare `virtual`) so the compiler verifies
+   // that this really overrides Context::finish, matching C_IsTagOwner
+   void finish(int r) override {
+     if (r >= 0) {
+       *mirror_uuid = tag_data.mirror_uuid;
+     }
+     on_finish->complete(r);
+   }
+ };
+
+ // Self-deleting state machine that fetches the image's journal client record
+ // and then its tags.
+ //
+ // All pointer members are caller-owned output/scratch locations; they must
+ // stay valid until on_finish has been completed. The request deletes itself
+ // in complete().
+ template <typename J>
+ struct GetTagsRequest {
+   CephContext *cct;
+   J *journaler;
+   cls::journal::Client *client;
+   journal::ImageClientMeta *client_meta;
+   uint64_t *tag_tid;
+   journal::TagData *tag_data;
+   Context *on_finish;
+
+   Mutex lock;
+
+   GetTagsRequest(CephContext *cct, J *journaler, cls::journal::Client *client,
+                  journal::ImageClientMeta *client_meta, uint64_t *tag_tid,
+                  journal::TagData *tag_data, Context *on_finish)
+     : cct(cct), journaler(journaler), client(client), client_meta(client_meta),
+       tag_tid(tag_tid), tag_data(tag_data), on_finish(on_finish), lock("lock") {
+   }
+
+   /**
+    * @verbatim
+    *
+    * <start>
+    *    |
+    *    v
+    * GET_CLIENT * * * * * * * * * * * *
+    *    |                             *
+    *    v                             *
+    * GET_TAGS * * * * * * * * * * * * * (error)
+    *    |                             *
+    *    v                             *
+    * <finish> * * * * * * * * * * * * *
+    *
+    * @endverbatim
+    */
+
+   // Entry point: starts with the GET_CLIENT step.
+   void send() {
+     send_get_client();
+   }
+
+   // GET_CLIENT: request the image client's registration record.
+   void send_get_client() {
+     ldout(cct, 20) << __func__ << dendl;
+
+     FunctionContext *ctx = new FunctionContext(
+       [this](int r) {
+         handle_get_client(r);
+       });
+     journaler->get_client(Journal<ImageCtx>::IMAGE_CLIENT_ID, client, ctx);
+   }
+
+   // Decodes the client record into *client_meta. Finishes with -EBADMSG if
+   // the record cannot be decoded and -EINVAL if it does not hold an
+   // ImageClientMeta; otherwise continues with GET_TAGS.
+   void handle_get_client(int r) {
+     ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+     if (r < 0) {
+       complete(r);
+       return;
+     }
+
+     librbd::journal::ClientData client_data;
+     auto bl_it = client->data.cbegin();
+     try {
+       decode(client_data, bl_it);
+     } catch (const buffer::error &err) {
+       // log prefix names this request (it previously read
+       // "OpenJournalerRequest", a copy-paste leftover)
+       lderr(cct) << this << " GetTagsRequest::" << __func__ << ": "
+                  << "failed to decode client data" << dendl;
+       complete(-EBADMSG);
+       return;
+     }
+
+     journal::ImageClientMeta *image_client_meta =
+       boost::get<journal::ImageClientMeta>(&client_data.client_meta);
+     if (image_client_meta == nullptr) {
+       lderr(cct) << this << " GetTagsRequest::" << __func__ << ": "
+                  << "failed to get client meta" << dendl;
+       complete(-EINVAL);
+       return;
+     }
+     *client_meta = *image_client_meta;
+
+     send_get_tags();
+   }
+
+   // GET_TAGS: fetch the tags of the client's tag class; C_DecodeTags writes
+   // its results to *tag_tid / *tag_data using `lock`.
+   void send_get_tags() {
+     ldout(cct, 20) << __func__ << dendl;
+
+     FunctionContext *ctx = new FunctionContext(
+       [this](int r) {
+         handle_get_tags(r);
+       });
+     C_DecodeTags *tags_ctx = new C_DecodeTags(cct, &lock, tag_tid, tag_data,
+                                               ctx);
+     journaler->get_tags(client_meta->tag_class, &tags_ctx->tags, tags_ctx);
+   }
+
+   void handle_get_tags(int r) {
+     ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+     complete(r);
+   }
+
+   // Completes the caller's context and destroys this request; no member may
+   // be touched after this call.
+   void complete(int r) {
+     on_finish->complete(r);
+     delete this;
+   }
+ };
+
+ // Convenience wrapper: allocates a GetTagsRequest for the given output
+ // locations and starts it. The request deletes itself when it completes.
+ template <typename J>
+ void get_tags(CephContext *cct, J *journaler,
+               cls::journal::Client *client,
+               journal::ImageClientMeta *client_meta,
+               uint64_t *tag_tid, journal::TagData *tag_data,
+               Context *on_finish) {
+   ldout(cct, 20) << __func__ << dendl;
+
+   auto *request = new GetTagsRequest<J>(cct, journaler, client, client_meta,
+                                         tag_tid, tag_data, on_finish);
+   request->send();
+ }
+
+ // Synchronously allocates a new journal tag of `tag_class` whose payload is
+ // the encoded TagData built from `mirror_uuid` and `predecessor`.
+ // Returns 0 on success or the negative error from the journaler.
+ template <typename J>
+ int allocate_journaler_tag(CephContext *cct, J *journaler,
+                            uint64_t tag_class,
+                            const journal::TagPredecessor &predecessor,
+                            const std::string &mirror_uuid,
+                            cls::journal::Tag *new_tag) {
+   journal::TagData tag_data;
+   tag_data.mirror_uuid = mirror_uuid;
+   tag_data.predecessor = predecessor;
+
+   bufferlist encoded_tag_data;
+   encode(tag_data, encoded_tag_data);
+
+   C_SaferCond allocate_tag_ctx;
+   journaler->allocate_tag(tag_class, encoded_tag_data, new_tag,
+                           &allocate_tag_ctx);
+
+   // block until the journaler reports the outcome
+   const int r = allocate_tag_ctx.wait();
+   if (r >= 0) {
+     return 0;
+   }
+
+   lderr(cct) << __func__ << ": "
+              << "failed to allocate tag: " << cpp_strerror(r) << dendl;
+   return r;
+ }
+
+} // anonymous namespace
+
+ // journal client id under which the local image itself is registered (empty string)
+ template <typename I>
+ const std::string Journal<I>::IMAGE_CLIENT_ID("");
+
+ // mirror uuid recorded in tags owned by the local image (empty string); compared against in is_tag_owner()
+ template <typename I>
+ const std::string Journal<I>::LOCAL_MIRROR_UUID("");
+
+ // mirror uuid to use for orphaned (demoted) images
+ template <typename I>
+ const std::string Journal<I>::ORPHAN_MIRROR_UUID("<orphan>");
+
+ // Streams a human-readable name for a journal state; values without a known
+ // name are printed as "Unknown (<numeric value>)".
+ template <typename I>
+ std::ostream &operator<<(std::ostream &os,
+                          const typename Journal<I>::State &state) {
+   const char *label = nullptr;
+   switch (state) {
+   case Journal<I>::STATE_UNINITIALIZED:
+     label = "Uninitialized";
+     break;
+   case Journal<I>::STATE_INITIALIZING:
+     label = "Initializing";
+     break;
+   case Journal<I>::STATE_REPLAYING:
+     label = "Replaying";
+     break;
+   case Journal<I>::STATE_FLUSHING_RESTART:
+     label = "FlushingRestart";
+     break;
+   case Journal<I>::STATE_RESTARTING_REPLAY:
+     label = "RestartingReplay";
+     break;
+   case Journal<I>::STATE_FLUSHING_REPLAY:
+     label = "FlushingReplay";
+     break;
+   case Journal<I>::STATE_READY:
+     label = "Ready";
+     break;
+   case Journal<I>::STATE_STOPPING:
+     label = "Stopping";
+     break;
+   case Journal<I>::STATE_CLOSING:
+     label = "Closing";
+     break;
+   case Journal<I>::STATE_CLOSED:
+     label = "Closed";
+     break;
+   default:
+     break;
+   }
+
+   if (label != nullptr) {
+     os << label;
+   } else {
+     os << "Unknown (" << static_cast<uint32_t>(state) << ")";
+   }
+   return os;
+ }
+
+ // Sets up an uninitialized journal for `image_ctx`: creates the journal work
+ // queue on top of the shared journal thread pool singleton and looks up the
+ // timer instance and its lock.
+ template <typename I>
+ Journal<I>::Journal(I &image_ctx)
+   : m_image_ctx(image_ctx),
+     m_journaler(nullptr),
+     m_lock("Journal<I>::m_lock"),
+     m_state(STATE_UNINITIALIZED),
+     m_error_result(0),
+     m_replay_handler(this),
+     m_close_pending(false),
+     m_event_lock("Journal<I>::m_event_lock"),
+     m_event_tid(0),
+     m_blocking_writes(false),
+     m_journal_replay(nullptr),
+     m_metadata_listener(this) {
+
+   CephContext *cct = m_image_ctx.cct;
+   ldout(cct, 5) << this << ": ictx=" << &m_image_ctx << dendl;
+
+   // one thread pool is shared per CephContext via the singleton registry
+   auto &journal_thread_pool =
+     cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
+       "librbd::journal::thread_pool", false, cct);
+   m_work_queue = new ContextWQ(
+     "librbd::journal::work_queue",
+     cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"),
+     &journal_thread_pool);
+   ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
+ }
+
+ // Drains and frees the work queue, then asserts that the journal was either
+ // never opened or has been fully closed.
+ template <typename I>
+ Journal<I>::~Journal() {
+   if (m_work_queue) {
+     m_work_queue->drain();
+     delete m_work_queue;
+   }
+
+   ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED);
+   ceph_assert(m_journaler == nullptr);
+   ceph_assert(m_journal_replay == nullptr);
+   ceph_assert(m_wait_for_state_contexts.empty());
+ }
+
+ // True when the image has the journaling feature bit set, is not read-only
+ // and is not opened at a snapshot. The caller must hold snap_lock.
+ template <typename I>
+ bool Journal<I>::is_journal_supported(I &image_ctx) {
+   ceph_assert(image_ctx.snap_lock.is_locked());
+
+   if ((image_ctx.features & RBD_FEATURE_JOURNALING) == 0) {
+     return false;
+   }
+   if (image_ctx.read_only) {
+     return false;
+   }
+   return image_ctx.snap_id == CEPH_NOSNAP;
+ }
+
+ // Synchronously creates the journal for `image_id`, tagged as owned by the
+ // local image. Blocks until the CreateRequest finishes and returns its result.
+ template <typename I>
+ int Journal<I>::create(librados::IoCtx &io_ctx, const std::string &image_id,
+                        uint8_t order, uint8_t splay_width,
+                        const std::string &object_pool) {
+   CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+   ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+   ThreadPool *thread_pool;
+   ContextWQ *op_work_queue;
+   ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+   journal::TagData tag_data(LOCAL_MIRROR_UUID);
+
+   C_SaferCond create_ctx;
+   auto *request = journal::CreateRequest<I>::create(
+     io_ctx, image_id, order, splay_width, object_pool,
+     cls::journal::Tag::TAG_CLASS_NEW, tag_data, IMAGE_CLIENT_ID,
+     op_work_queue, &create_ctx);
+   request->send();
+
+   return create_ctx.wait();
+ }
+
+ // Synchronously removes the journal of `image_id`; blocks until the
+ // RemoveRequest finishes and returns its result.
+ template <typename I>
+ int Journal<I>::remove(librados::IoCtx &io_ctx, const std::string &image_id) {
+   CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+   ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+   ThreadPool *thread_pool;
+   ContextWQ *op_work_queue;
+   ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+   C_SaferCond remove_ctx;
+   auto *request = journal::RemoveRequest<I>::create(
+     io_ctx, image_id, IMAGE_CLIENT_ID, op_work_queue, &remove_ctx);
+   request->send();
+
+   return remove_ctx.wait();
+ }
+
+ // Synchronously resets the journal of `image_id`; blocks until the
+ // ResetRequest finishes and returns its result.
+ template <typename I>
+ int Journal<I>::reset(librados::IoCtx &io_ctx, const std::string &image_id) {
+   CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+   ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+   ThreadPool *thread_pool;
+   ContextWQ *op_work_queue;
+   ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+   C_SaferCond reset_ctx;
+   auto *request = journal::ResetRequest<I>::create(
+     io_ctx, image_id, IMAGE_CLIENT_ID, Journal<>::LOCAL_MIRROR_UUID,
+     op_work_queue, &reset_ctx);
+   request->send();
+
+   return reset_ctx.wait();
+ }
+
+ // Convenience overload: pulls the metadata IoCtx, image id and op work queue
+ // out of `image_ctx` and forwards to the IoCtx-based is_tag_owner().
+ template <typename I>
+ void Journal<I>::is_tag_owner(I *image_ctx, bool *owner,
+                               Context *on_finish) {
+   auto &md_ctx = image_ctx->md_ctx;
+   auto &image_id = image_ctx->id;
+   Journal<I>::is_tag_owner(md_ctx, image_id, owner, image_ctx->op_work_queue,
+                            on_finish);
+ }
+
+ // Asynchronously determines whether the local image owns the journal tag.
+ // The C_IsTagOwner context provides the scratch storage for the tag lookup
+ // and receives its completion; the result is written to *is_tag_owner.
+ template <typename I>
+ void Journal<I>::is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
+                               bool *is_tag_owner, ContextWQ *op_work_queue,
+                               Context *on_finish) {
+   CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+   ldout(cct, 20) << __func__ << dendl;
+
+   auto *owner_ctx = new C_IsTagOwner<I>(io_ctx, image_id, is_tag_owner,
+                                         op_work_queue, on_finish);
+   get_tags(cct, owner_ctx->journaler, &owner_ctx->client,
+            &owner_ctx->client_meta, &owner_ctx->tag_tid,
+            &owner_ctx->tag_data, owner_ctx);
+ }
+
+ // Asynchronously retrieves the mirror uuid stored in the journal tag data
+ // into *mirror_uuid. The C_GetTagOwner completion is wrapped so that it runs
+ // via op_work_queue.
+ template <typename I>
+ void Journal<I>::get_tag_owner(IoCtx& io_ctx, std::string& image_id,
+                                std::string *mirror_uuid,
+                                ContextWQ *op_work_queue, Context *on_finish) {
+   CephContext *cct = static_cast<CephContext *>(io_ctx.cct());
+   ldout(cct, 20) << __func__ << dendl;
+
+   auto *owner_ctx = new C_GetTagOwner(io_ctx, image_id, mirror_uuid,
+                                       on_finish);
+   Context *async_ctx = create_async_context_callback(op_work_queue, owner_ctx);
+   get_tags(cct, &owner_ctx->journaler, &owner_ctx->client,
+            &owner_ctx->client_meta, &owner_ctx->tag_tid,
+            &owner_ctx->tag_data, async_ctx);
+ }
+
+ // Synchronously flags the image's journal client as "resync requested".
+ //
+ // Opens a temporary Journaler for the image, sets resync_requested in the
+ // image client metadata and writes the updated client record back.
+ // Returns 0 on success, or the negative error from opening the journal or
+ // from updating the client.
+ template <typename I>
+ int Journal<I>::request_resync(I *image_ctx) {
+   CephContext *cct = image_ctx->cct;
+   ldout(cct, 20) << __func__ << dendl;
+
+   Journaler journaler(image_ctx->md_ctx, image_ctx->id, IMAGE_CLIENT_ID, {});
+
+   Mutex lock("lock");
+   journal::ImageClientMeta client_meta;
+   // zero-initialised (as in C_IsTagOwner / C_GetTagOwner) so the value is
+   // never indeterminate, even if the open request fails before writing it
+   uint64_t tag_tid = 0;
+   journal::TagData tag_data;
+
+   C_SaferCond open_ctx;
+   auto open_req = journal::OpenRequest<I>::create(image_ctx, &journaler, &lock,
+                                                   &client_meta, &tag_tid,
+                                                   &tag_data, &open_ctx);
+   open_req->send();
+
+   // shut the journaler down on every return path below
+   BOOST_SCOPE_EXIT_ALL(&journaler) {
+     journaler.shut_down();
+   };
+
+   int r = open_ctx.wait();
+   if (r < 0) {
+     return r;
+   }
+
+   client_meta.resync_requested = true;
+
+   journal::ClientData client_data(client_meta);
+   bufferlist client_data_bl;
+   encode(client_data, client_data_bl);
+
+   C_SaferCond update_client_ctx;
+   journaler.update_client(client_data_bl, &update_client_ctx);
+
+   r = update_client_ctx.wait();
+   if (r < 0) {
+     lderr(cct) << __func__ << ": "
+                << "failed to update client: " << cpp_strerror(r) << dendl;
+     return r;
+   }
+   return 0;
+ }
+
+ // Starts an asynchronous PromoteRequest for the image (second argument
+ // `false`); on_finish is handed to the request.
+ template <typename I>
+ void Journal<I>::promote(I *image_ctx, Context *on_finish) {
+   CephContext *cct = image_ctx->cct;
+   ldout(cct, 20) << __func__ << dendl;
+
+   auto *request = journal::PromoteRequest<I>::create(image_ctx, false,
+                                                      on_finish);
+   request->send();
+ }
+
+ // Starts an asynchronous DemoteRequest for the image; on_finish is handed to
+ // the request.
+ template <typename I>
+ void Journal<I>::demote(I *image_ctx, Context *on_finish) {
+   CephContext *cct = image_ctx->cct;
+   ldout(cct, 20) << __func__ << dendl;
+
+   auto *request = journal::DemoteRequest<I>::create(*image_ctx, on_finish);
+   request->send();
+ }
+
+ // Reports, under m_lock, whether the journal is currently in STATE_READY.
+ template <typename I>
+ bool Journal<I>::is_journal_ready() const {
+   Mutex::Locker locker(m_lock);
+   const bool ready = (m_state == STATE_READY);
+   return ready;
+ }
+
+ // Locking wrapper around is_journal_replaying(const Mutex &).
+ template <typename I>
+ bool Journal<I>::is_journal_replaying() const {
+   Mutex::Locker locker(m_lock);
+   const bool replaying = is_journal_replaying(m_lock);
+   return replaying;
+ }
+
+ // True while the journal is in any of the replay-related states. The caller
+ // must already hold m_lock (the Mutex argument only documents this).
+ template <typename I>
+ bool Journal<I>::is_journal_replaying(const Mutex &) const {
+   ceph_assert(m_lock.is_locked());
+
+   if (m_state == STATE_REPLAYING || m_state == STATE_FLUSHING_REPLAY) {
+     return true;
+   }
+   return (m_state == STATE_FLUSHING_RESTART ||
+           m_state == STATE_RESTARTING_REPLAY);
+ }
+
+ // True when the journal is ready and the image's journal policy has not
+ // disabled appends. The caller must hold the image's snap_lock.
+ template <typename I>
+ bool Journal<I>::is_journal_appending() const {
+   ceph_assert(m_image_ctx.snap_lock.is_locked());
+
+   Mutex::Locker locker(m_lock);
+   if (m_state != STATE_READY) {
+     return false;
+   }
+   return !m_image_ctx.get_journal_policy()->append_disabled();
+ }
+
+ // Completes `on_ready` (through an async callback wrapper) with
+ // m_error_result if the journal is already ready; otherwise queues it until
+ // the journal reaches a steady state.
+ template <typename I>
+ void Journal<I>::wait_for_journal_ready(Context *on_ready) {
+   on_ready = create_async_context_callback(m_image_ctx, on_ready);
+
+   Mutex::Locker locker(m_lock);
+   if (m_state != STATE_READY) {
+     wait_for_steady_state(on_ready);
+     return;
+   }
+   on_ready->complete(m_error_result);
+ }
+
+ // Begins opening the journal: registers the journal object-dispatch layer,
+ // queues `on_finish` (async-wrapped) for the next steady state and creates
+ // the journaler. The journal must still be uninitialized.
+ template <typename I>
+ void Journal<I>::open(Context *on_finish) {
+   CephContext *cct = m_image_ctx.cct;
+   ldout(cct, 20) << this << " " << __func__ << dendl;
+
+   on_finish = create_async_context_callback(m_image_ctx, on_finish);
+
+   // inject our handler into the object dispatcher chain
+   auto *journal_dispatch =
+     journal::ObjectDispatch<I>::create(&m_image_ctx, this);
+   m_image_ctx.io_object_dispatcher->register_object_dispatch(
+     journal_dispatch);
+
+   Mutex::Locker locker(m_lock);
+   ceph_assert(m_state == STATE_UNINITIALIZED);
+   wait_for_steady_state(on_finish);
+   create_journaler();
+ }
+
+ template <typename I>
+ void Journal<I>::close(Context *on_finish) { // notifies listeners of the close, then closes the journal (or completes at once if already closed)
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ on_finish = new FunctionContext([this, on_finish](int r) { // wrap the caller's context: the dispatch layer is shut down before it is completed
+ // remove our handler from object dispatcher chain - preserve error
+ auto ctx = new FunctionContext([on_finish, r](int _) {
+ on_finish->complete(r); // report the close result `r`; the shut-down result `_` is ignored
+ });
+ m_image_ctx.io_object_dispatcher->shut_down_object_dispatch(
+ io::OBJECT_DISPATCH_LAYER_JOURNAL, ctx);
+ });
+ on_finish = create_async_context_callback(m_image_ctx, on_finish);
+
+ Mutex::Locker locker(m_lock);
+ while (m_listener_notify) { // wait until no other listener notification is in progress
+ m_listener_cond.Wait(m_lock);
+ }
+
+ Listeners listeners(m_listeners); // copy so the callbacks can run without m_lock held
+ m_listener_notify = true;
+ m_lock.Unlock(); // drop m_lock while invoking listener callbacks
+ for (auto listener : listeners) {
+ listener->handle_close();
+ }
+
+ m_lock.Lock(); // re-take m_lock; `locker` releases it when the function returns
+ m_listener_notify = false;
+ m_listener_cond.Signal();
+
+ ceph_assert(m_state != STATE_UNINITIALIZED);
+ if (m_state == STATE_CLOSED) { // already closed: complete immediately with the stored result
+ on_finish->complete(m_error_result);
+ return;
+ }
+
+ if (m_state == STATE_READY) {
+ stop_recording();
+ }
+
+ m_close_pending = true; // flag the pending close, then wait for the next steady state
+ wait_for_steady_state(on_finish);
+ }
+
+ // Locking wrapper around is_tag_owner(const Mutex &).
+ template <typename I>
+ bool Journal<I>::is_tag_owner() const {
+   Mutex::Locker locker(m_lock);
+   const bool owner = is_tag_owner(m_lock);
+   return owner;
+ }
+
+ // True when the current tag's mirror uuid is the local mirror uuid. The
+ // caller must already hold m_lock (the Mutex argument only documents this).
+ template <typename I>
+ bool Journal<I>::is_tag_owner(const Mutex &) const {
+   ceph_assert(m_lock.is_locked());
+
+   const bool owned_locally = (m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID);
+   return owned_locally;
+ }
+
+ // Returns the current tag tid, read under m_lock.
+ template <typename I>
+ uint64_t Journal<I>::get_tag_tid() const {
+   Mutex::Locker locker(m_lock);
+   const uint64_t current_tag_tid = m_tag_tid;
+   return current_tag_tid;
+ }
+
+ // Returns a copy of the current tag data, taken under m_lock.
+ template <typename I>
+ journal::TagData Journal<I>::get_tag_data() const {
+   Mutex::Locker locker(m_lock);
+   journal::TagData tag_data_copy(m_tag_data);
+   return tag_data_copy;
+ }
+
+ // Allocates a new tag owned by the local image. The predecessor is filled in
+ // from the cached image client's first commit position, when there is one.
+ // If the cached client cannot be retrieved, on_finish is queued with that
+ // error and no tag is allocated.
+ template <typename I>
+ void Journal<I>::allocate_local_tag(Context *on_finish) {
+   CephContext *cct = m_image_ctx.cct;
+   ldout(cct, 20) << this << " " << __func__ << dendl;
+
+   journal::TagPredecessor predecessor;
+   predecessor.mirror_uuid = LOCAL_MIRROR_UUID;
+   {
+     Mutex::Locker locker(m_lock);
+     ceph_assert(m_journaler != nullptr && is_tag_owner(m_lock));
+
+     cls::journal::Client image_client;
+     const int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID,
+                                                  &image_client);
+     if (r < 0) {
+       lderr(cct) << this << " " << __func__ << ": "
+                  << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+       m_image_ctx.op_work_queue->queue(on_finish, r);
+       return;
+     }
+
+     // since we are primary, populate the predecessor with our known commit
+     // position
+     ceph_assert(m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID);
+     const auto &positions = image_client.commit_position.object_positions;
+     if (!positions.empty()) {
+       const auto &front_position = positions.front();
+       predecessor.commit_valid = true;
+       predecessor.tag_tid = front_position.tag_tid;
+       predecessor.entry_tid = front_position.entry_tid;
+     }
+   }
+
+   allocate_tag(LOCAL_MIRROR_UUID, predecessor, on_finish);
+ }
+
+ // Asynchronously allocates a tag in m_tag_class carrying the encoded
+ // (mirror_uuid, predecessor) pair. The C_DecodeTag completion is given
+ // m_lock, &m_tag_tid and &m_tag_data together with on_finish.
+ template <typename I>
+ void Journal<I>::allocate_tag(const std::string &mirror_uuid,
+                               const journal::TagPredecessor &predecessor,
+                               Context *on_finish) {
+   CephContext *cct = m_image_ctx.cct;
+   ldout(cct, 20) << this << " " << __func__ << ": mirror_uuid=" << mirror_uuid
+                  << dendl;
+
+   Mutex::Locker locker(m_lock);
+   ceph_assert(m_journaler != nullptr);
+
+   journal::TagData new_tag_data;
+   new_tag_data.mirror_uuid = mirror_uuid;
+   new_tag_data.predecessor = predecessor;
+
+   bufferlist encoded_tag_data;
+   encode(new_tag_data, encoded_tag_data);
+
+   auto *decode_ctx = new C_DecodeTag(cct, &m_lock, &m_tag_tid, &m_tag_data,
+                                      on_finish);
+   m_journaler->allocate_tag(m_tag_class, encoded_tag_data, &decode_ctx->tag,
+                             decode_ctx);
+ }
+
+ // Forwards a commit-position flush to the journaler while holding m_lock;
+ // the journaler must exist.
+ template <typename I>
+ void Journal<I>::flush_commit_position(Context *on_finish) {
+   CephContext *cct = m_image_ctx.cct;
+   ldout(cct, 20) << this << " " << __func__ << dendl;
+
+   Mutex::Locker locker(m_lock);
+   auto *journaler = m_journaler;
+   ceph_assert(journaler != nullptr);
+   journaler->flush_commit_position(on_finish);
+ }
+
+ template <typename I>
+ void Journal<I>::user_flushed() { // on the first flush seen while ready, applies the configured append batch options to the journaler
+ if (m_state == STATE_READY && !m_user_flushed.exchange(true) && // pre-check without m_lock; exchange(true) only lets the first caller past
+ m_image_ctx.config.template get_val<bool>("rbd_journal_object_writethrough_until_flush")) {
+ Mutex::Locker locker(m_lock);
+ if (m_state == STATE_READY) { // re-check the state now that m_lock is held
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ ceph_assert(m_journaler != nullptr);
+ m_journaler->set_append_batch_options(
+ m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_flush_interval"),
+ m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_object_flush_bytes"),
+ m_image_ctx.config.template get_val<double>("rbd_journal_object_flush_age"));
+ } else {
+ m_user_flushed = false; // no longer ready: clear the flag again so a later call can retry
+ }
+ }
+ }
+
+ // Journals a write of `length` bytes at `offset`. The data is split into as
+ // many AioWriteEvent entries as needed so that each one fits within
+ // m_max_append_size; at least one entry is always produced. Returns the
+ // event tid assigned by append_io_events().
+ template <typename I>
+ uint64_t Journal<I>::append_write_event(uint64_t offset, size_t length,
+                                         const bufferlist &bl,
+                                         bool flush_entry) {
+   ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size());
+   uint64_t max_chunk_size =
+     m_max_append_size - journal::AioWriteEvent::get_fixed_size();
+
+   // ensure that the write event fits within the journal entry
+   Bufferlists encoded_events;
+   uint64_t remaining = length;
+   uint64_t consumed = 0;
+   for (;;) {
+     uint64_t chunk_length = std::min(remaining, max_chunk_size);
+
+     bufferlist chunk_bl;
+     chunk_bl.substr_of(bl, consumed, chunk_length);
+     journal::EventEntry event_entry(
+       journal::AioWriteEvent(offset + consumed, chunk_length, chunk_bl),
+       ceph_clock_now());
+
+     encoded_events.emplace_back();
+     encode(event_entry, encoded_events.back());
+
+     consumed += chunk_length;
+     remaining -= chunk_length;
+     if (remaining == 0) {
+       break;
+     }
+   }
+
+   return append_io_events(journal::EVENT_TYPE_AIO_WRITE, encoded_events,
+                           offset, length, flush_entry, 0);
+ }
+
+ // Stamps `event_entry` with the current time, encodes it and appends it as a
+ // single-entry IO event. Returns the event tid from append_io_events().
+ template <typename I>
+ uint64_t Journal<I>::append_io_event(journal::EventEntry &&event_entry,
+                                      uint64_t offset, size_t length,
+                                      bool flush_entry, int filter_ret_val) {
+   event_entry.timestamp = ceph_clock_now();
+
+   bufferlist encoded_entry;
+   encode(event_entry, encoded_entry);
+
+   const auto event_type = event_entry.get_event_type();
+   return append_io_events(event_type, {encoded_entry}, offset, length,
+                           flush_entry, filter_ret_val);
+ }
+
+ // Appends the pre-encoded entries to the journal as one IO event.
+ //
+ // A new event tid is taken under m_lock (the journal must be ready), every
+ // buffer is appended to the journaler, and the resulting futures are stored
+ // in m_events under m_event_lock. A C_IOEventSafe completion is attached to
+ // the last future: via flush() when `flush_entry` is set, via wait()
+ // otherwise. Returns the new event tid.
+ template <typename I>
+ uint64_t Journal<I>::append_io_events(journal::EventType event_type,
+                                       const Bufferlists &bufferlists,
+                                       uint64_t offset, size_t length,
+                                       bool flush_entry, int filter_ret_val) {
+   ceph_assert(!bufferlists.empty());
+
+   uint64_t tid;
+   {
+     Mutex::Locker locker(m_lock);
+     ceph_assert(m_state == STATE_READY);
+
+     tid = ++m_event_tid;
+     ceph_assert(tid != 0);
+   }
+
+   Futures futures;
+   for (const auto &entry_bl : bufferlists) {
+     ceph_assert(entry_bl.length() <= m_max_append_size);
+     futures.push_back(m_journaler->append(m_tag_tid, entry_bl));
+   }
+
+   {
+     Mutex::Locker event_locker(m_event_lock);
+     m_events[tid] = Event(futures, offset, length, filter_ret_val);
+   }
+
+   CephContext *cct = m_image_ctx.cct;
+   ldout(cct, 20) << this << " " << __func__ << ": "
+                  << "event=" << event_type << ", "
+                  << "offset=" << offset << ", "
+                  << "length=" << length << ", "
+                  << "flush=" << flush_entry << ", tid=" << tid << dendl;
+
+   Context *on_safe = create_async_context_callback(
+     m_image_ctx, new C_IOEventSafe(this, tid));
+   auto &last_future = futures.back();
+   if (!flush_entry) {
+     last_future.wait(on_safe);
+   } else {
+     last_future.flush(on_safe);
+   }
+
+   return tid;
+ }
+
+template <typename I>
+void Journal<I>::commit_io_event(uint64_t tid, int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+                    "r=" << r << dendl;
+
+  Mutex::Locker event_locker(m_event_lock);
+  auto event_it = m_events.find(tid);
+  if (event_it != m_events.end()) {
+    // a tid that is not tracked is silently ignored
+    complete_event(event_it, r);
+  }
+}
+
+template <typename I>  // Reports result r for one extent of event tid; completes the event once no extents remain pending.
+void Journal<I>::commit_io_event_extent(uint64_t tid, uint64_t offset,
+ uint64_t length, int r) {
+ ceph_assert(length > 0);  // empty extents are a caller error
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+ << "offset=" << offset << ", "
+ << "length=" << length << ", "
+ << "r=" << r << dendl;
+
+ Mutex::Locker event_locker(m_event_lock);
+ typename Events::iterator it = m_events.find(tid);
+ if (it == m_events.end()) {  // untracked tid: nothing to do
+ return;
+ }
+
+ Event &event = it->second;
+ if (event.ret_val == 0 && r < 0) {  // keep only the first error reported for this event
+ event.ret_val = r;
+ }
+
+ ExtentInterval extent;
+ extent.insert(offset, length);
+
+ ExtentInterval intersect;
+ intersect.intersection_of(extent, event.pending_extents);  // only the still-pending part of this extent counts
+
+ event.pending_extents.subtract(intersect);
+ if (!event.pending_extents.empty()) {  // other extents are still outstanding
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "pending extents: " << event.pending_extents << dendl;
+ return;
+ }
+ complete_event(it, event.ret_val);  // every extent reported: finish with the accumulated result
+}
+
+template <typename I>  // Appends and flushes an op event for op_tid; on_safe is chained behind a commit-position flush.
+void Journal<I>::append_op_event(uint64_t op_tid,
+ journal::EventEntry &&event_entry,  // entry to record; its timestamp is overwritten below
+ Context *on_safe) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());  // caller must hold the image owner_lock
+
+ bufferlist bl;
+ event_entry.timestamp = ceph_clock_now();
+ encode(event_entry, bl);
+
+ Future future;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_READY);
+
+ future = m_journaler->append(m_tag_tid, bl);
+
+ // delay committing op event to ensure consistent replay
+ ceph_assert(m_op_futures.count(op_tid) == 0);  // an op_tid may only have one outstanding start event
+ m_op_futures[op_tid] = future;  // looked up and erased again by commit_op_event()
+ }
+
+ on_safe = create_async_context_callback(m_image_ctx, on_safe);
+ on_safe = new FunctionContext([this, on_safe](int r) {  // NOTE(review): the flush result r is not forwarded to on_safe
+ // ensure all committed IO before this op is committed
+ m_journaler->flush_commit_position(on_safe);
+ });
+ future.flush(on_safe);
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "op_tid=" << op_tid << ", "
+ << "event=" << event_entry.get_event_type() << dendl;
+}
+
+template <typename I>  // Appends and flushes an OpFinishEvent carrying result r for op_tid; on_safe is handed to C_OpEventSafe.
+void Journal<I>::commit_op_event(uint64_t op_tid, int r, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": op_tid=" << op_tid << ", "
+ << "r=" << r << dendl;
+
+ journal::EventEntry event_entry((journal::OpFinishEvent(op_tid, r)),
+ ceph_clock_now());
+
+ bufferlist bl;
+ encode(event_entry, bl);
+
+ Future op_start_future;
+ Future op_finish_future;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_READY);
+
+ // ready to commit op event
+ auto it = m_op_futures.find(op_tid);
+ ceph_assert(it != m_op_futures.end());  // append_op_event() must have registered this op_tid
+ op_start_future = it->second;
+ m_op_futures.erase(it);
+
+ op_finish_future = m_journaler->append(m_tag_tid, bl);  // appended while m_lock is held
+ }
+
+ op_finish_future.flush(create_async_context_callback(  // both futures travel with the completion context
+ m_image_ctx, new C_OpEventSafe(this, op_tid, op_start_future,
+ op_finish_future, on_safe)));
+}
+
+template <typename I>
+void Journal<I>::replay_op_ready(uint64_t op_tid, Context *on_resume) {
+  // forward the op-ready notification to the active replay handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": op_tid=" << op_tid << dendl;
+
+  // m_lock is held across the call; a replay handler must already exist
+  Mutex::Locker replay_locker(m_lock);
+  ceph_assert(m_journal_replay != nullptr);
+  m_journal_replay->replay_op_ready(op_tid, on_resume);
+}
+
+template <typename I>  // Registers on_safe for event tid and, unless the entry is already safe, flushes its last future.
+void Journal<I>::flush_event(uint64_t tid, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+ << "on_safe=" << on_safe << dendl;
+
+ Future future;
+ {
+ Mutex::Locker event_locker(m_event_lock);  // the wait_event() helper asserts m_event_lock is held
+ future = wait_event(m_lock, tid, on_safe);  // helper returns a default-constructed Future when already safe
+ }
+
+ if (future.is_valid()) {  // flush is issued after m_event_lock has been released
+ future.flush(nullptr);  // no context here: completion reaches on_safe via the event's registered contexts
+ }
+}
+
+template <typename I>
+void Journal<I>::wait_event(uint64_t tid, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+                 << "on_safe=" << on_safe << dendl;
+  // register on_safe only; the future returned by the helper is not needed
+  Mutex::Locker event_guard(m_event_lock);
+  static_cast<void>(wait_event(m_lock, tid, on_safe));
+}
+
+template <typename I>  // Helper (m_event_lock held): registers on_safe for event tid; returns the event's last future, or an empty Future if already safe.
+typename Journal<I>::Future Journal<I>::wait_event(Mutex &lock, uint64_t tid,  // NOTE(review): `lock` is not referenced in this body
+ Context *on_safe) {
+ ceph_assert(m_event_lock.is_locked());
+ CephContext *cct = m_image_ctx.cct;
+
+ typename Events::iterator it = m_events.find(tid);
+ ceph_assert(it != m_events.end());  // tid must refer to a tracked event
+
+ Event &event = it->second;
+ if (event.safe) {
+ // journal entry already safe
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "journal entry already safe" << dendl;
+ m_image_ctx.op_work_queue->queue(on_safe, event.ret_val);  // complete via the op work queue with the recorded result
+ return Future();
+ }
+
+ event.on_safe_contexts.push_back(create_async_context_callback(m_image_ctx,  // completed later by handle_io_event_safe()
+ on_safe));
+ return event.futures.back();
+}
+
+template <typename I>  // Stops appending; handle_start_external_replay() then publishes a Replay instance through *journal_replay.
+void Journal<I>::start_external_replay(journal::Replay<I> **journal_replay,  // out-parameter written by the handler
+ Context *on_start) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_READY);
+ ceph_assert(m_journal_replay == nullptr);  // only one replay handler may exist at a time
+
+ on_start = util::create_async_context_callback(m_image_ctx, on_start);  // the handler completes this while holding m_lock
+ on_start = new FunctionContext(
+ [this, journal_replay, on_start](int r) {
+ handle_start_external_replay(r, journal_replay, on_start);
+ });
+
+ // safely flush all in-flight events before starting external replay
+ m_journaler->stop_append(util::create_async_context_callback(m_image_ctx,  // the handler lambda is itself wrapped in an async callback
+ on_start));
+}
+
+template <typename I>  // Completion of stop_append(): on success enters STATE_REPLAYING with a new Replay; on failure resumes appending.
+void Journal<I>::handle_start_external_replay(int r,
+ journal::Replay<I> **journal_replay,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_READY);
+ ceph_assert(m_journal_replay == nullptr);
+
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to stop recording: " << cpp_strerror(r) << dendl;
+ *journal_replay = nullptr;  // no replay handler is handed out on failure
+
+ // get back to a sane-state
+ start_append();  // start_append() transitions back to STATE_READY
+ on_finish->complete(r);  // completed while m_lock is still held
+ return;
+ }
+
+ transition_state(STATE_REPLAYING, 0);
+ m_journal_replay = journal::Replay<I>::create(m_image_ctx);
+ *journal_replay = m_journal_replay;  // caller receives the same pointer the journal keeps
+ on_finish->complete(0);
+}
+
+template <typename I>  // Destroys the external Replay instance, then closes the journal if a close is pending or resumes appending.
+void Journal<I>::stop_external_replay() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_journal_replay != nullptr);
+ ceph_assert(m_state == STATE_REPLAYING);
+
+ delete m_journal_replay;  // the pointer handed out by start_external_replay is invalid from here on
+ m_journal_replay = nullptr;
+
+ if (m_close_pending) {  // a close was requested during replay: tear down instead of resuming
+ destroy_journaler(0);
+ return;
+ }
+
+ start_append();  // transitions back to STATE_READY
+}
+
+template <typename I>  // Builds a Journaler from the rbd_journal_* options and starts an asynchronous open (m_lock held).
+void Journal<I>::create_journaler() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_RESTARTING_REPLAY);
+  ceph_assert(m_journaler == nullptr);  // any previous journaler must already be gone
+
+  transition_state(STATE_INITIALIZING, 0);
+  ::journal::Settings settings;
+  settings.commit_interval =
+    m_image_ctx.config.template get_val<double>("rbd_journal_commit_age");
+  settings.max_payload_bytes =
+    m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_max_payload_bytes");
+  settings.max_concurrent_object_sets =
+    m_image_ctx.config.template get_val<uint64_t>("rbd_journal_max_concurrent_object_sets");
+  // TODO: a configurable filter to exclude certain peers from being
+  // disconnected.
+  settings.whitelisted_laggy_clients = {IMAGE_CLIENT_ID};
+
+  m_journaler = new Journaler(m_work_queue, m_timer, m_timer_lock,
+                              m_image_ctx.md_ctx, m_image_ctx.id,
+                              IMAGE_CLIENT_ID, settings);
+  m_journaler->add_listener(&m_metadata_listener);  // removed again in destroy_journaler()/recreate_journaler()
+
+  Context *ctx = create_async_context_callback(  // handle_open() is reached through the async wrapper
+    m_image_ctx, create_context_callback<
+      Journal<I>, &Journal<I>::handle_open>(this));
+  auto open_req = journal::OpenRequest<I>::create(&m_image_ctx, m_journaler,
+                                                  &m_lock, &m_client_meta,
+                                                  &m_tag_tid, &m_tag_data, ctx);
+  open_req->send();
+}
+
+template <typename I>  // Begins teardown (m_lock held): drops the replay handler, enters STATE_CLOSING and schedules the journaler shut down.
+void Journal<I>::destroy_journaler(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  ceph_assert(m_lock.is_locked());
+
+  delete m_journal_replay;
+  m_journal_replay = nullptr;
+
+  m_journaler->remove_listener(&m_metadata_listener);
+
+  transition_state(STATE_CLOSING, r);  // a negative r is latched as the journal's error result
+
+  Context *ctx = create_async_context_callback(
+    m_image_ctx, create_context_callback<
+      Journal<I>, &Journal<I>::handle_journal_destroyed>(this));
+  ctx = new FunctionContext(
+    [this, ctx](int r) {  // the tracker's result is ignored
+      Mutex::Locker locker(m_lock);
+      m_journaler->shut_down(ctx);
+    });
+  m_async_journal_op_tracker.wait(m_image_ctx, ctx);  // presumably fires once tracked async ops have finished -- confirm
+}
+
+template <typename I>  // Shuts the journaler down in STATE_RESTARTING_REPLAY so handle_journal_destroyed() can create a fresh one (m_lock held).
+void Journal<I>::recreate_journaler(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(m_state == STATE_FLUSHING_RESTART ||
+              m_state == STATE_FLUSHING_REPLAY);
+
+  delete m_journal_replay;
+  m_journal_replay = nullptr;
+
+  m_journaler->remove_listener(&m_metadata_listener);
+
+  transition_state(STATE_RESTARTING_REPLAY, r);
+  m_journaler->shut_down(create_async_context_callback(  // unlike destroy_journaler(), shut down is issued immediately
+    m_image_ctx, create_context_callback<
+      Journal<I>, &Journal<I>::handle_journal_destroyed>(this)));
+}
+
+template <typename I>  // Marks the IO of an event as committed (m_event_lock held); if the entry is already safe, commits it and drops the event.
+void Journal<I>::complete_event(typename Events::iterator it, int r) {
+ ceph_assert(m_event_lock.is_locked());
+ ceph_assert(m_state == STATE_READY);  // NOTE(review): m_state is read here with only m_event_lock asserted -- confirm
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": tid=" << it->first << " "
+ << "r=" << r << dendl;
+
+ Event &event = it->second;
+ if (r < 0 && r == event.filter_ret_val) {
+ // ignore allowed error codes
+ r = 0;
+ }
+ if (r < 0) {
+ // event recorded to journal but failed to update disk, we cannot
+ // commit this IO event. this event must be replayed.
+ ceph_assert(event.safe);  // an IO result is only expected after the journal entry is safe
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to commit IO to disk, replay required: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ event.committed_io = true;
+ if (event.safe) {  // otherwise handle_io_event_safe() finishes the bookkeeping later
+ if (r >= 0) {  // on failure the entries stay uncommitted
+ for (auto &future : event.futures) {
+ m_journaler->committed(future);
+ }
+ }
+ m_events.erase(it);  // invalidates `it` and the `event` reference
+ }
+}
+
+template <typename I>  // Starts journal appends using the rbd_journal_object_* options and transitions to STATE_READY (m_lock held).
+void Journal<I>::start_append() {
+ ceph_assert(m_lock.is_locked());
+
+ m_journaler->start_append(
+ m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_max_in_flight_appends"));
+ if (!m_image_ctx.config.template get_val<bool>("rbd_journal_object_writethrough_until_flush")) {  // batch options are applied only when this option is off
+ m_journaler->set_append_batch_options(
+ m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_flush_interval"),
+ m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_object_flush_bytes"),
+ m_image_ctx.config.template get_val<double>("rbd_journal_object_flush_age"));
+ }
+
+ transition_state(STATE_READY, 0);  // READY is a steady state, so pending state waiters are completed
+}
+
+template <typename I>  // Completion of the open request: on error starts teardown, otherwise caches tag class / max append size and begins replay.
+void Journal<I>::handle_open(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_INITIALIZING);
+
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to initialize journal: " << cpp_strerror(r)
+ << dendl;
+ destroy_journaler(r);  // r is latched as the error result by the state transition
+ return;
+ }
+
+ m_tag_class = m_client_meta.tag_class;
+ m_max_append_size = m_journaler->get_max_append_size();  // upper bound asserted on every appended entry
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "tag_class=" << m_tag_class << ", "
+ << "max_append_size=" << m_max_append_size << dendl;
+
+ transition_state(STATE_REPLAYING, 0);
+ m_journal_replay = journal::Replay<I>::create(m_image_ctx);
+ m_journaler->start_replay(&m_replay_handler);
+}
+
+template <typename I>  // Pops the next replay entry (if any), decodes it and hands it to journal::Replay for processing.
+void Journal<I>::handle_replay_ready() {
+ CephContext *cct = m_image_ctx.cct;
+ ReplayEntry replay_entry;
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_state != STATE_REPLAYING) {  // ignored once replay has been left
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+ if (!m_journaler->try_pop_front(&replay_entry)) {  // nothing available right now
+ return;
+ }
+
+ // only one entry should be in-flight at a time
+ ceph_assert(!m_processing_entry);
+ m_processing_entry = true;  // cleared again by handle_replay_process_ready()
+ }
+
+ bufferlist data = replay_entry.get_data();
+ auto it = data.cbegin();
+
+ journal::EventEntry event_entry;
+ int r = m_journal_replay->decode(&it, &event_entry);  // decoding happens after m_lock has been released
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to decode journal event entry" << dendl;
+ handle_replay_process_safe(replay_entry, r);  // route the decode failure through the regular error path
+ return;
+ }
+
+ Context *on_ready = create_context_callback<
+ Journal<I>, &Journal<I>::handle_replay_process_ready>(this);
+ Context *on_commit = new C_ReplayProcessSafe(this, std::move(replay_entry));  // replay_entry is moved-from after this line
+ m_journal_replay->process(event_entry, on_ready, on_commit);
+}
+
+template <typename I>  // Replay finished (r < 0 on error): stops replay, shuts down journal::Replay, flushes the commit position, then dispatches on the state.
+void Journal<I>::handle_replay_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+
+ bool cancel_ops = false;
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_state != STATE_REPLAYING) {  // e.g. an error path already moved the state on
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+ if (r < 0) {
+ cancel_ops = true;  // forwarded to journal::Replay::shut_down() below
+ transition_state(STATE_FLUSHING_RESTART, r);
+ } else {
+ // state might change back to FLUSHING_RESTART on flush error
+ transition_state(STATE_FLUSHING_REPLAY, 0);
+ }
+ }
+
+ Context *ctx = new FunctionContext([this, cct](int r) {  // built first, runs last; its r is ignored
+ ldout(cct, 20) << this << " handle_replay_complete: "
+ << "handle shut down replay" << dendl;
+
+ State state;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_FLUSHING_RESTART ||
+ m_state == STATE_FLUSHING_REPLAY);
+ state = m_state;  // snapshot: the handlers below take m_lock themselves
+ }
+
+ if (state == STATE_FLUSHING_RESTART) {
+ handle_flushing_restart(0);
+ } else {
+ handle_flushing_replay();
+ }
+ });
+ ctx = new FunctionContext([this, ctx](int r) {  // runs second; the shut down result r is ignored
+ // ensure the commit position is flushed to disk
+ m_journaler->flush_commit_position(ctx);
+ });
+ ctx = new FunctionContext([this, cct, cancel_ops, ctx](int r) {  // runs first, after stop_replay()
+ ldout(cct, 20) << this << " handle_replay_complete: "
+ << "shut down replay" << dendl;
+ m_journal_replay->shut_down(cancel_ops, ctx);
+ });
+ m_journaler->stop_replay(ctx);  // kicks off the chain assembled above
+}
+
+template <typename I>  // journal::Replay signalled readiness: clears the in-flight flag and tries to pop the next entry.
+void Journal<I>::handle_replay_process_ready(int r) {
+ // journal::Replay is ready for more events -- attempt to pop another
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ ceph_assert(r == 0);  // readiness is never expected to carry an error
+ {
+ Mutex::Locker locker(m_lock);  // scoped: handle_replay_ready() below takes m_lock itself
+ ceph_assert(m_processing_entry);
+ m_processing_entry = false;
+ }
+ handle_replay_ready();
+}
+
+template <typename I>  // Result of replaying one entry: commits it on success, otherwise aborts the replay or flags a restart.
+void Journal<I>::handle_replay_process_safe(ReplayEntry replay_entry, int r) {
+ CephContext *cct = m_image_ctx.cct;
+
+ m_lock.Lock();  // manual locking: every exit path below calls Unlock()
+ ceph_assert(m_state == STATE_REPLAYING ||
+ m_state == STATE_FLUSHING_RESTART ||
+ m_state == STATE_FLUSHING_REPLAY);
+
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+ if (r < 0) {
+ if (r != -ECANCELED) {  // cancellation is not reported as an error
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to commit journal event to disk: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ if (m_state == STATE_REPLAYING) {
+ // abort the replay if we have an error
+ transition_state(STATE_FLUSHING_RESTART, r);
+ m_lock.Unlock();  // released before calling into the journaler
+
+ // stop replay, shut down, and restart
+ Context* ctx = create_context_callback<  // built first, runs last
+ Journal<I>, &Journal<I>::handle_flushing_restart>(this);
+ ctx = new FunctionContext([this, ctx](int r) {  // runs second; the shut down result r is ignored
+ // ensure the commit position is flushed to disk
+ m_journaler->flush_commit_position(ctx);
+ });
+ ctx = new FunctionContext([this, cct, ctx](int r) {  // runs first, after stop_replay()
+ ldout(cct, 20) << this << " handle_replay_process_safe: "
+ << "shut down replay" << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_FLUSHING_RESTART);
+ }
+
+ m_journal_replay->shut_down(true, ctx);
+ });
+ m_journaler->stop_replay(ctx);
+ return;
+ } else if (m_state == STATE_FLUSHING_REPLAY) {
+ // end-of-replay flush in-progress -- we need to restart replay
+ transition_state(STATE_FLUSHING_RESTART, r);  // handle_flushing_replay() checks for this state
+ m_lock.Unlock();
+ return;
+ }
+ } else {
+ // only commit the entry if written successfully
+ m_journaler->committed(replay_entry);
+ }
+ m_lock.Unlock();  // reached on success, or on failure while already in STATE_FLUSHING_RESTART
+}
+
+template <typename I>
+void Journal<I>::handle_flushing_restart(int r) {
+  // replay has been shut down after an error: close if requested, else restart
+  Mutex::Locker state_locker(m_lock);
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  ceph_assert(r == 0);
+  ceph_assert(m_state == STATE_FLUSHING_RESTART);
+  if (!m_close_pending) {
+    recreate_journaler(r);
+  } else {
+    destroy_journaler(r);
+  }
+}
+
+template <typename I>  // End-of-replay flush finished: closes, restarts replay, or drops the replay handler and starts appending.
+void Journal<I>::handle_flushing_replay() {
+  Mutex::Locker locker(m_lock);
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  ceph_assert(m_state == STATE_FLUSHING_REPLAY ||
+              m_state == STATE_FLUSHING_RESTART);
+  if (m_close_pending) {  // a pending close takes precedence over a restart
+    destroy_journaler(0);
+    return;
+  } else if (m_state == STATE_FLUSHING_RESTART) {
+    // failed to replay one-or-more events -- restart
+    recreate_journaler(0);
+    return;
+  }
+
+  delete m_journal_replay;
+  m_journal_replay = nullptr;
+
+  m_error_result = 0;  // clear any previously latched error before going ready
+  start_append();
+}
+
+template <typename I>
+void Journal<I>::handle_recording_stopped(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  // appends have stopped (see stop_recording()); begin tearing the journaler down
+  Mutex::Locker state_locker(m_lock);
+  ceph_assert(m_state == STATE_STOPPING);
+  destroy_journaler(r);  // r is passed on to the STATE_CLOSING transition
+}
+
+template <typename I>  // Completion of the journaler shut down: frees the journaler, then re-creates it (restart) or enters STATE_CLOSED.
+void Journal<I>::handle_journal_destroyed(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << this << " " << __func__ << ": "  // same "<func>: " prefix as every other log line
+               << "error detected while closing journal: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  Mutex::Locker locker(m_lock);
+  delete m_journaler;
+  m_journaler = nullptr;  // create_journaler() asserts the pointer is null
+
+  ceph_assert(m_state == STATE_CLOSING || m_state == STATE_RESTARTING_REPLAY);
+  if (m_state == STATE_RESTARTING_REPLAY) {  // restart path: r is only logged, not propagated
+    create_journaler();
+    return;
+  }
+
+  transition_state(STATE_CLOSED, r);  // CLOSED is a steady state, so pending state waiters are completed
+}
+
+template <typename I>  // Journal write for IO event tid completed with r: updates the event bookkeeping and completes the registered on-safe contexts.
+void Journal<I>::handle_io_event_safe(int r, uint64_t tid) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", "
+ << "tid=" << tid << dendl;
+
+ // journal will be flushed before closing
+ ceph_assert(m_state == STATE_READY || m_state == STATE_STOPPING);  // NOTE(review): m_state is read without m_lock here -- confirm
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to commit IO event: " << cpp_strerror(r) << dendl;
+ }
+
+ Contexts on_safe_contexts;
+ {
+ Mutex::Locker event_locker(m_event_lock);
+ typename Events::iterator it = m_events.find(tid);
+ ceph_assert(it != m_events.end());
+
+ Event &event = it->second;
+ on_safe_contexts.swap(event.on_safe_contexts);  // take ownership so they can be completed outside the lock
+
+ if (r < 0 || event.committed_io) {
+ // failed journal write so IO won't be sent -- or IO extent was
+ // overwritten by future IO operations so this was a no-op IO event
+ event.ret_val = r;
+ for (auto &future : event.futures) {
+ m_journaler->committed(future);
+ }
+ }
+
+ if (event.committed_io) {  // IO side already reported: the event is finished
+ m_events.erase(it);
+ } else {
+ event.safe = true;  // complete_event() erases the event once the IO reports
+ }
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "completing tid=" << tid << dendl;
+
+ // alert the cache about the journal event status
+ for (Contexts::iterator it = on_safe_contexts.begin();
+ it != on_safe_contexts.end(); ++it) {
+ (*it)->complete(r);  // m_event_lock is no longer held at this point
+ }
+}
+
+template <typename I>  // Op-finish entry flushed: commits both the op-start and op-finish futures and flushes the commit position.
+void Journal<I>::handle_op_event_safe(int r, uint64_t tid,
+ const Future &op_start_future,
+ const Future &op_finish_future,
+ Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", "
+ << "tid=" << tid << dendl;
+
+ // journal will be flushed before closing
+ ceph_assert(m_state == STATE_READY || m_state == STATE_STOPPING);  // NOTE(review): m_state is read without m_lock here -- confirm
+ if (r < 0) {  // NOTE(review): r is only logged; both futures are still committed and r is not forwarded to on_safe
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to commit op event: " << cpp_strerror(r) << dendl;
+ }
+
+ m_journaler->committed(op_start_future);
+ m_journaler->committed(op_finish_future);
+
+ // reduce the replay window after committing an op event
+ m_journaler->flush_commit_position(on_safe);
+}
+
+template <typename I>  // Leaves STATE_READY for STATE_STOPPING and stops journal appends (m_lock held).
+void Journal<I>::stop_recording() {
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(m_journaler != nullptr);
+
+  ceph_assert(m_state == STATE_READY);
+  transition_state(STATE_STOPPING, 0);
+
+  m_journaler->stop_append(util::create_async_context_callback(  // handle_recording_stopped() continues the teardown
+    m_image_ctx, create_context_callback<
+      Journal<I>, &Journal<I>::handle_recording_stopped>(this)));
+}
+
+template <typename I>  // Sets m_state (m_lock held), latches the first error into m_error_result and completes steady-state waiters.
+void Journal<I>::transition_state(State state, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": new state=" << state << dendl;
+ ceph_assert(m_lock.is_locked());
+ m_state = state;
+
+ if (m_error_result == 0 && r < 0) {  // only the first error is remembered
+ m_error_result = r;
+ }
+
+ if (is_steady_state()) {  // READY or CLOSED
+ Contexts wait_for_state_contexts(std::move(m_wait_for_state_contexts));  // iterate over a local list, not the member
+ for (auto ctx : wait_for_state_contexts) {
+ ctx->complete(m_error_result);  // completed while m_lock is held, with the latched error (or 0)
+ }
+ }
+}
+
+template <typename I>
+bool Journal<I>::is_steady_state() const {
+  ceph_assert(m_lock.is_locked());
+  switch (m_state) {
+  case STATE_UNINITIALIZED:
+  case STATE_INITIALIZING:
+  case STATE_REPLAYING:
+  case STATE_FLUSHING_RESTART:
+  case STATE_RESTARTING_REPLAY:
+  case STATE_FLUSHING_REPLAY:
+  case STATE_STOPPING:
+  case STATE_CLOSING:
+    return false;  // transitional state
+  case STATE_READY:
+  case STATE_CLOSED:
+    return true;  // the only two resting states
+  }
+  return false;  // value outside the enumerators
+}
+
+template <typename I>
+void Journal<I>::wait_for_steady_state(Context *on_state) {
+  // callers hold m_lock and may only wait while a transition is in progress
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(!is_steady_state());
+
+  ldout(m_image_ctx.cct, 20) << this << " " << __func__
+                             << ": on_state=" << on_state << dendl;
+  m_wait_for_state_contexts.push_back(on_state);  // completed by transition_state()
+}
+
+template <typename I>
+int Journal<I>::is_resync_requested(bool *do_resync) {
+  Mutex::Locker resync_locker(m_lock);  // check_resync_requested() asserts that m_lock is held
+  return check_resync_requested(do_resync);
+}
+
+template <typename I>  // Reads the cached image client record and reports its resync_requested flag via *do_resync (m_lock held); returns 0 or a negative error.
+int Journal<I>::check_resync_requested(bool *do_resync) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(do_resync != nullptr);
+
+ cls::journal::Client client;
+ int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client);
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+ return r;  // journaler error is passed through unchanged
+ }
+
+ librbd::journal::ClientData client_data;
+ auto bl_it = client.data.cbegin();
+ try {
+ decode(client_data, bl_it);
+ } catch (const buffer::error &err) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to decode client data: " << err << dendl;
+ return -EINVAL;  // undecodable client data
+ }
+
+ journal::ImageClientMeta *image_client_meta =
+ boost::get<journal::ImageClientMeta>(&client_data.client_meta);
+ if (image_client_meta == nullptr) {  // client_meta does not hold an ImageClientMeta
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to access image client meta struct" << dendl;
+ return -EINVAL;
+ }
+
+ *do_resync = image_client_meta->resync_requested;  // only written on the success path
+
+ return 0;
+}
+
+struct C_RefreshTags : public Context {  // Context that holds the outputs of a tag refresh and registers itself with the op tracker for its lifetime.
+ util::AsyncOpTracker &async_op_tracker;
+ Context *on_finish = nullptr;  // must be assigned before completion: finish() dereferences it unconditionally
+
+ Mutex lock;  // handed to C_DecodeTags by handle_metadata_updated()
+ uint64_t tag_tid = 0;  // output slot, passed by pointer to C_DecodeTags
+ journal::TagData tag_data;  // output slot, passed by pointer to C_DecodeTags
+
+ explicit C_RefreshTags(util::AsyncOpTracker &async_op_tracker)
+ : async_op_tracker(async_op_tracker),
+ lock("librbd::Journal::C_RefreshTags::lock") {
+ async_op_tracker.start_op();  // paired with finish_op() in the destructor
+ }
+ ~C_RefreshTags() override {
+ async_op_tracker.finish_op();
+ }
+
+ void finish(int r) override {
+ on_finish->complete(r);  // result is forwarded unchanged
+ }
+};
+
+template <typename I>  // Journal metadata changed: for a non-primary image with listeners, re-fetches the tags and forwards them to handle_refresh_metadata().
+void Journal<I>::handle_metadata_updated() {
+ CephContext *cct = m_image_ctx.cct;
+ Mutex::Locker locker(m_lock);
+
+ if (m_state != STATE_READY && !is_journal_replaying(m_lock)) {  // only relevant while ready or replaying
+ return;
+ } else if (is_tag_owner(m_lock)) {
+ ldout(cct, 20) << this << " " << __func__ << ": primary image" << dendl;
+ return;
+ } else if (m_listeners.empty()) {
+ ldout(cct, 20) << this << " " << __func__ << ": no listeners" << dendl;
+ return;
+ }
+
+ uint64_t refresh_sequence = ++m_refresh_sequence;  // lets handle_refresh_metadata() discard superseded refreshes
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "refresh_sequence=" << refresh_sequence << dendl;
+
+ // pull the most recent tags from the journal, decode, and
+ // update the internal tag state
+ C_RefreshTags *refresh_ctx = new C_RefreshTags(m_async_journal_op_tracker);  // constructor registers an op with the tracker
+ refresh_ctx->on_finish = new FunctionContext(
+ [this, refresh_sequence, refresh_ctx](int r) {  // reads the outputs stored in refresh_ctx
+ handle_refresh_metadata(refresh_sequence, refresh_ctx->tag_tid,
+ refresh_ctx->tag_data, r);
+ });
+ C_DecodeTags *decode_tags_ctx = new C_DecodeTags(
+ cct, &refresh_ctx->lock, &refresh_ctx->tag_tid,
+ &refresh_ctx->tag_data, refresh_ctx);
+ m_journaler->get_tags(m_tag_tid == 0 ? 0 : m_tag_tid - 1, m_tag_class,  // start one tag before the current one, without underflowing at 0
+ &decode_tags_ctx->tags, decode_tags_ctx);
+}
+
+template <typename I>  // Applies refreshed tag state and notifies listeners about a promotion to primary or a requested resync.
+void Journal<I>::handle_refresh_metadata(uint64_t refresh_sequence,
+ uint64_t tag_tid,
+ journal::TagData tag_data, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ Mutex::Locker locker(m_lock);  // m_lock is dropped and re-taken manually around the listener callbacks below
+
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": failed to refresh metadata: "
+ << cpp_strerror(r) << dendl;
+ return;
+ } else if (m_state != STATE_READY && !is_journal_replaying(m_lock)) {
+ return;
+ } else if (refresh_sequence != m_refresh_sequence) {
+ // another, more up-to-date refresh is in-flight
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "refresh_sequence=" << refresh_sequence << ", "
+ << "tag_tid=" << tag_tid << ", "
+ << "tag_data=" << tag_data << dendl;
+ while (m_listener_notify) {  // wait for any notification round already in progress
+ m_listener_cond.Wait(m_lock);
+ }
+
+ bool was_tag_owner = is_tag_owner(m_lock);
+ if (m_tag_tid < tag_tid) {  // tag state only ever moves forward
+ m_tag_tid = tag_tid;
+ m_tag_data = tag_data;
+ }
+ bool promoted_to_primary = (!was_tag_owner && is_tag_owner(m_lock));
+
+ bool resync_requested = false;
+ r = check_resync_requested(&resync_requested);
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to check if a resync was requested" << dendl;
+ return;
+ }
+
+ Listeners listeners(m_listeners);  // copy so the callbacks can run without m_lock
+ m_listener_notify = true;  // makes remove_listener() wait until the callbacks are done
+ m_lock.Unlock();
+
+ if (promoted_to_primary) {  // promotion takes precedence over a resync request
+ for (auto listener : listeners) {
+ listener->handle_promoted();
+ }
+ } else if (resync_requested) {
+ for (auto listener : listeners) {
+ listener->handle_resync();
+ }
+ }
+
+ m_lock.Lock();  // re-acquired so the Locker's destructor releases a held lock
+ m_listener_notify = false;
+ m_listener_cond.Signal();
+}
+
+template <typename I>
+void Journal<I>::add_listener(journal::Listener *listener) {
+  Mutex::Locker listener_locker(m_lock);  // the insert is done under m_lock
+  m_listeners.insert(listener);
+}
+
+template <typename I>
+void Journal<I>::remove_listener(journal::Listener *listener) {
+  Mutex::Locker listener_locker(m_lock);
+  while (m_listener_notify) {  // a notification round is running: wait until it signals completion
+    m_listener_cond.Wait(m_lock);
+  }
+  m_listeners.erase(listener);  // no callback is in flight for this listener any more
+}
+
+} // namespace librbd
+
+#ifndef TEST_F  // NOTE(review): presumably skipped when a unit-test translation unit (TEST_F defined) includes this file -- confirm
+template class librbd::Journal<librbd::ImageCtx>;  // explicit instantiation of Journal for librbd::ImageCtx
+#endif
diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h
new file mode 100644
index 00000000..e63cc4a7
--- /dev/null
+++ b/src/librbd/Journal.h
@@ -0,0 +1,386 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_H
+#define CEPH_LIBRBD_JOURNAL_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "include/interval_set.h"
+#include "include/rados/librados_fwd.hpp"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/WorkQueue.h"
+#include "journal/Future.h"
+#include "journal/JournalMetadataListener.h"
+#include "journal/ReplayEntry.h"
+#include "journal/ReplayHandler.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <atomic>
+#include <unordered_map>
+
+class SafeTimer;
+namespace journal {
+class Journaler;
+}
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace journal { template <typename> class Replay; }
+
+template <typename ImageCtxT = ImageCtx>
+class Journal { // journal state machine bound to one image context (m_image_ctx); see the state diagram below
+public:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UNINITIALIZED ---> INITIALIZING ---> REPLAYING ------> FLUSHING ---> READY
+ * | * . ^ * . * |
+ * | * . | * . * |
+ * | * . | (error) * . . . . . . . * |
+ * | * . | * . * |
+ * | * . | v . * |
+ * | * . | FLUSHING_RESTART . * |
+ * | * . | | . * |
+ * | * . | | . * |
+ * | * . | v . * v
+ * | * . | RESTARTING < * * * * * STOPPING
+ * | * . | | . |
+ * | * . | | . |
+ * | * * * * * * . \-------------/ . |
+ * | * (error) . . |
+ * | * . . . . . . . . . . . . . . . . |
+ * | * . . |
+ * | v v v |
+ * | CLOSED <----- CLOSING <---------------------------------------/
+ * | |
+ * | v
+ * \---> <finish>
+ *
+ * @endverbatim
+ */
+ enum State { // NOTE(review): diagram labels RESTARTING / FLUSHING presumably map to STATE_RESTARTING_REPLAY / STATE_FLUSHING_REPLAY -- confirm
+ STATE_UNINITIALIZED,
+ STATE_INITIALIZING,
+ STATE_REPLAYING,
+ STATE_FLUSHING_RESTART,
+ STATE_RESTARTING_REPLAY,
+ STATE_FLUSHING_REPLAY,
+ STATE_READY,
+ STATE_STOPPING,
+ STATE_CLOSING,
+ STATE_CLOSED
+ };
+
+ static const std::string IMAGE_CLIENT_ID;
+ static const std::string LOCAL_MIRROR_UUID;
+ static const std::string ORPHAN_MIRROR_UUID;
+
+ Journal(ImageCtxT &image_ctx);
+ ~Journal();
+
+ static bool is_journal_supported(ImageCtxT &image_ctx); // static helpers below operate on an image/pool without needing a Journal instance
+ static int create(librados::IoCtx &io_ctx, const std::string &image_id,
+ uint8_t order, uint8_t splay_width,
+ const std::string &object_pool);
+ static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
+ static int reset(librados::IoCtx &io_ctx, const std::string &image_id);
+
+ static void is_tag_owner(ImageCtxT *image_ctx, bool *is_tag_owner, // asynchronous: result is delivered through on_finish
+ Context *on_finish);
+ static void is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
+ bool *is_tag_owner, ContextWQ *op_work_queue,
+ Context *on_finish);
+ static void get_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
+ std::string *mirror_uuid,
+ ContextWQ *op_work_queue, Context *on_finish);
+ static int request_resync(ImageCtxT *image_ctx);
+ static void promote(ImageCtxT *image_ctx, Context *on_finish);
+ static void demote(ImageCtxT *image_ctx, Context *on_finish);
+
+ bool is_journal_ready() const;
+ bool is_journal_replaying() const;
+ bool is_journal_appending() const;
+
+ void wait_for_journal_ready(Context *on_ready);
+
+ void open(Context *on_finish);
+ void close(Context *on_finish);
+
+ bool is_tag_owner() const;
+ uint64_t get_tag_tid() const;
+ journal::TagData get_tag_data() const;
+
+ void allocate_local_tag(Context *on_finish);
+ void allocate_tag(const std::string &mirror_uuid,
+ const journal::TagPredecessor &predecessor,
+ Context *on_finish);
+
+ void flush_commit_position(Context *on_finish);
+
+ void user_flushed();
+
+ uint64_t append_write_event(uint64_t offset, size_t length, // NOTE(review): returned value is presumably the journal event tid -- confirm in Journal.cc
+ const bufferlist &bl,
+ bool flush_entry);
+ uint64_t append_io_event(journal::EventEntry &&event_entry,
+ uint64_t offset, size_t length,
+ bool flush_entry, int filter_ret_val);
+ void commit_io_event(uint64_t tid, int r);
+ void commit_io_event_extent(uint64_t tid, uint64_t offset, uint64_t length,
+ int r);
+
+ void append_op_event(uint64_t op_tid, journal::EventEntry &&event_entry,
+ Context *on_safe);
+ void commit_op_event(uint64_t tid, int r, Context *on_safe);
+ void replay_op_ready(uint64_t op_tid, Context *on_resume);
+
+ void flush_event(uint64_t tid, Context *on_safe);
+ void wait_event(uint64_t tid, Context *on_safe);
+
+ uint64_t allocate_op_tid() { // hands out the next operation tid
+ uint64_t op_tid = ++m_op_tid; // m_op_tid is std::atomic, so the increment is atomic
+ ceph_assert(op_tid != 0); // 0 is never returned; a wrap-around trips this assert
+ return op_tid;
+ }
+
+ void start_external_replay(journal::Replay<ImageCtxT> **journal_replay,
+ Context *on_start);
+ void stop_external_replay();
+
+ void add_listener(journal::Listener *listener); // listeners receive handle_promoted()/handle_resync() callbacks
+ void remove_listener(journal::Listener *listener); // waits for any in-progress notification round before erasing
+
+ int is_resync_requested(bool *do_resync);
+
+ inline ContextWQ *get_work_queue() {
+ return m_work_queue;
+ }
+
+private:
+ ImageCtxT &m_image_ctx;
+
+ // mock unit testing support
+ typedef journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+ typedef typename TypeTraits::Future Future;
+ typedef typename TypeTraits::ReplayEntry ReplayEntry;
+
+ typedef std::list<bufferlist> Bufferlists;
+ typedef std::list<Context *> Contexts;
+ typedef std::list<Future> Futures;
+ typedef interval_set<uint64_t> ExtentInterval;
+
+ struct Event { // bookkeeping for one in-flight journal event (stored in m_events, keyed by tid)
+ Futures futures;
+ Contexts on_safe_contexts;
+ ExtentInterval pending_extents;
+ int filter_ret_val = 0;
+ bool committed_io = false;
+ bool safe = false;
+ int ret_val = 0;
+
+ Event() {
+ }
+ Event(const Futures &_futures, uint64_t offset, size_t length,
+ int filter_ret_val)
+ : futures(_futures), filter_ret_val(filter_ret_val) {
+ if (length > 0) { // zero-length events record no pending extent
+ pending_extents.insert(offset, length);
+ }
+ }
+ };
+
+ typedef std::unordered_map<uint64_t, Event> Events;
+ typedef std::unordered_map<uint64_t, Future> TidToFutures;
+
+ struct C_IOEventSafe : public Context { // forwards completion to Journal::handle_io_event_safe
+ Journal *journal;
+ uint64_t tid;
+
+ C_IOEventSafe(Journal *_journal, uint64_t _tid)
+ : journal(_journal), tid(_tid) {
+ }
+
+ void finish(int r) override {
+ journal->handle_io_event_safe(r, tid);
+ }
+ };
+
+ struct C_OpEventSafe : public Context { // forwards completion to Journal::handle_op_event_safe
+ Journal *journal;
+ uint64_t tid;
+ Future op_start_future;
+ Future op_finish_future;
+ Context *on_safe;
+
+ C_OpEventSafe(Journal *journal, uint64_t tid, const Future &op_start_future,
+ const Future &op_finish_future, Context *on_safe)
+ : journal(journal), tid(tid), op_start_future(op_start_future),
+ op_finish_future(op_finish_future), on_safe(on_safe) {
+ }
+
+ void finish(int r) override {
+ journal->handle_op_event_safe(r, tid, op_start_future, op_finish_future,
+ on_safe);
+ }
+ };
+
+ struct C_ReplayProcessSafe : public Context { // forwards completion to Journal::handle_replay_process_safe
+ Journal *journal;
+ ReplayEntry replay_entry;
+
+ C_ReplayProcessSafe(Journal *journal, ReplayEntry &&replay_entry) :
+ journal(journal), replay_entry(std::move(replay_entry)) {
+ }
+ void finish(int r) override {
+ journal->handle_replay_process_safe(replay_entry, r);
+ }
+ };
+
+ struct ReplayHandler : public ::journal::ReplayHandler { // adapter routing journaler replay callbacks to this Journal
+ Journal *journal;
+ ReplayHandler(Journal *_journal) : journal(_journal) {
+ }
+
+ void get() override {
+ // TODO (currently a no-op)
+ }
+ void put() override {
+ // TODO (currently a no-op)
+ }
+
+ void handle_entries_available() override {
+ journal->handle_replay_ready();
+ }
+ void handle_complete(int r) override {
+ journal->handle_replay_complete(r);
+ }
+ };
+
+ ContextWQ *m_work_queue = nullptr;
+ SafeTimer *m_timer = nullptr;
+ Mutex *m_timer_lock = nullptr;
+
+ Journaler *m_journaler;
+ mutable Mutex m_lock; // mutable so const accessors can lock it
+ State m_state;
+ uint64_t m_max_append_size = 0;
+ uint64_t m_tag_class = 0;
+ uint64_t m_tag_tid = 0;
+ journal::ImageClientMeta m_client_meta;
+ journal::TagData m_tag_data;
+
+ int m_error_result;
+ Contexts m_wait_for_state_contexts;
+
+ ReplayHandler m_replay_handler;
+ bool m_close_pending;
+
+ Mutex m_event_lock;
+ uint64_t m_event_tid;
+ Events m_events;
+
+ std::atomic<bool> m_user_flushed = false;
+
+ std::atomic<uint64_t> m_op_tid = { 0 }; // source of tids returned by allocate_op_tid()
+ TidToFutures m_op_futures;
+
+ bool m_processing_entry = false;
+ bool m_blocking_writes;
+
+ journal::Replay<ImageCtxT> *m_journal_replay;
+
+ util::AsyncOpTracker m_async_journal_op_tracker;
+
+ struct MetadataListener : public ::journal::JournalMetadataListener {
+ Journal<ImageCtxT> *journal;
+
+ MetadataListener(Journal<ImageCtxT> *journal) : journal(journal) { }
+
+ void handle_update(::journal::JournalMetadata *) override { // does not handle the update inline
+ FunctionContext *ctx = new FunctionContext([this](int r) {
+ journal->handle_metadata_updated();
+ });
+ journal->m_work_queue->queue(ctx, 0); // handle_metadata_updated() runs from the journal's work queue
+ }
+ } m_metadata_listener;
+
+ typedef std::set<journal::Listener *> Listeners;
+ Listeners m_listeners;
+ Cond m_listener_cond; // signalled when a listener notification round finishes
+ bool m_listener_notify = false; // true while listeners are being invoked with m_lock released
+
+ uint64_t m_refresh_sequence = 0; // compared in handle_refresh_metadata() to discard stale refreshes
+
+ bool is_journal_replaying(const Mutex &) const; // Mutex-taking overloads are called with m_lock already held
+ bool is_tag_owner(const Mutex &) const;
+
+ uint64_t append_io_events(journal::EventType event_type,
+ const Bufferlists &bufferlists,
+ uint64_t offset, size_t length, bool flush_entry,
+ int filter_ret_val);
+ Future wait_event(Mutex &lock, uint64_t tid, Context *on_safe);
+
+ void create_journaler();
+ void destroy_journaler(int r);
+ void recreate_journaler(int r);
+
+ void complete_event(typename Events::iterator it, int r);
+
+ void start_append();
+
+ void handle_open(int r);
+
+ void handle_replay_ready();
+ void handle_replay_complete(int r);
+ void handle_replay_process_ready(int r);
+ void handle_replay_process_safe(ReplayEntry replay_entry, int r);
+
+ void handle_start_external_replay(int r,
+ journal::Replay<ImageCtxT> **journal_replay,
+ Context *on_finish);
+
+ void handle_flushing_restart(int r);
+ void handle_flushing_replay();
+
+ void handle_recording_stopped(int r);
+
+ void handle_journal_destroyed(int r);
+
+ void handle_io_event_safe(int r, uint64_t tid);
+ void handle_op_event_safe(int r, uint64_t tid, const Future &op_start_future,
+ const Future &op_finish_future, Context *on_safe);
+
+ void stop_recording();
+
+ void transition_state(State state, int r);
+
+ bool is_steady_state() const;
+ void wait_for_steady_state(Context *on_state);
+
+ int check_resync_requested(bool *do_resync);
+
+ void handle_metadata_updated();
+ void handle_refresh_metadata(uint64_t refresh_sequence, uint64_t tag_tid,
+ journal::TagData tag_data, int r);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::Journal<librbd::ImageCtx>; // instantiation declaration only; the explicit instantiation lives in Journal.cc
+
+#endif // CEPH_LIBRBD_JOURNAL_H
diff --git a/src/librbd/LibrbdAdminSocketHook.cc b/src/librbd/LibrbdAdminSocketHook.cc
new file mode 100644
index 00000000..15b8fb97
--- /dev/null
+++ b/src/librbd/LibrbdAdminSocketHook.cc
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+
+#include "librbd/ImageCtx.h"
+#include "librbd/LibrbdAdminSocketHook.h"
+#include "librbd/internal.h"
+#include "librbd/io/ImageRequestWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbdadminsocket: "
+
+namespace librbd {
+
+// Interface for a single admin-socket command registered by
+// LibrbdAdminSocketHook.
+class LibrbdAdminSocketCommand {
+public:
+  virtual ~LibrbdAdminSocketCommand() = default;
+
+  // Executes the command. The implementations in this file return true on
+  // success, or write an error message to `ss` and return false.
+  virtual bool call(stringstream *ss) = 0;
+};
+
+// Admin-socket command that flushes the image's IO work queue.
+class FlushCacheCommand : public LibrbdAdminSocketCommand {
+public:
+  explicit FlushCacheCommand(ImageCtx *ictx) : ictx(ictx) {}
+
+  // Returns true when the flush succeeds; otherwise writes
+  // "flush: <error>" to `ss` and returns false.
+  bool call(stringstream *ss) override {
+    const int ret = ictx->io_work_queue->flush();
+    if (ret >= 0) {
+      return true;
+    }
+
+    *ss << "flush: " << cpp_strerror(ret);
+    return false;
+  }
+
+private:
+  ImageCtx *ictx;
+};
+
+// Admin-socket command that invalidates the image cache through
+// invalidate_cache(). Declared with the `class` key, like its base class and
+// FlushCacheCommand, so the explicit access labels are meaningful.
+class InvalidateCacheCommand : public LibrbdAdminSocketCommand {
+public:
+  explicit InvalidateCacheCommand(ImageCtx *ictx) : ictx(ictx) {}
+
+  // Returns true on success; otherwise writes "invalidate_cache: <error>"
+  // to `ss` and returns false.
+  bool call(stringstream *ss) override {
+    int r = invalidate_cache(ictx);
+    if (r < 0) {
+      *ss << "invalidate_cache: " << cpp_strerror(r);
+      return false;
+    }
+    return true;
+  }
+
+private:
+  ImageCtx *ictx;
+};
+
+// Registers the "rbd cache flush <pool>/<image>" and
+// "rbd cache invalidate <pool>/<image>" admin-socket commands for `ictx`.
+// A command object is only created (and later dispatched to by call()) when
+// its registration succeeds.
+LibrbdAdminSocketHook::LibrbdAdminSocketHook(ImageCtx *ictx) :
+  admin_socket(ictx->cct->get_admin_socket()) {
+
+  const std::string imagename =
+    ictx->md_ctx.get_pool_name() + "/" + ictx->name;
+
+  const std::string flush_command = "rbd cache flush " + imagename;
+  int r = admin_socket->register_command(flush_command, flush_command, this,
+                                         "flush rbd image " + imagename +
+                                         " cache");
+  if (r == 0) {
+    commands[flush_command] = new FlushCacheCommand(ictx);
+  }
+
+  const std::string invalidate_command = "rbd cache invalidate " + imagename;
+  r = admin_socket->register_command(invalidate_command, invalidate_command,
+                                     this,
+                                     "invalidate rbd image " + imagename +
+                                     " cache");
+  if (r == 0) {
+    commands[invalidate_command] = new InvalidateCacheCommand(ictx);
+  }
+}
+
+// Unregisters every command that was successfully registered and frees the
+// command objects owned by this hook.
+LibrbdAdminSocketHook::~LibrbdAdminSocketHook() {
+  for (const auto &entry : commands) {
+    // unregistration failures are deliberately ignored during teardown
+    (void)admin_socket->unregister_command(entry.first);
+    delete entry.second;
+  }
+}
+
+// Dispatches an admin-socket request to the command object registered under
+// `command` and appends whatever the command wrote to its stream to `out`.
+// `cmdmap` and `format` are not used. Asserts that the command is known.
+bool LibrbdAdminSocketHook::call(std::string_view command,
+                                 const cmdmap_t& cmdmap,
+                                 std::string_view format,
+                                 bufferlist& out) {
+  const auto i = commands.find(command);
+  ceph_assert(i != commands.end());
+
+  stringstream ss;
+  const bool success = i->second->call(&ss);
+  out.append(ss);
+  return success;
+}
+
+} // namespace librbd
diff --git a/src/librbd/LibrbdAdminSocketHook.h b/src/librbd/LibrbdAdminSocketHook.h
new file mode 100644
index 00000000..8b1b5b94
--- /dev/null
+++ b/src/librbd/LibrbdAdminSocketHook.h
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H
+#define CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H
+
+#include <map>
+
+#include "common/admin_socket.h"
+
+namespace librbd {
+
+ struct ImageCtx;
+ class LibrbdAdminSocketCommand;
+
+ // Admin-socket hook exposing per-image cache commands (flush / invalidate).
+ // The constructor registers the commands; the destructor unregisters them
+ // and deletes the command objects stored in `commands`.
+ class LibrbdAdminSocketHook : public AdminSocketHook {
+ public:
+   LibrbdAdminSocketHook(ImageCtx *ictx);
+   ~LibrbdAdminSocketHook() override;
+
+   // The hook owns the raw pointers in `commands` and deletes them in its
+   // destructor, so a copy would lead to a double delete.
+   LibrbdAdminSocketHook(const LibrbdAdminSocketHook&) = delete;
+   LibrbdAdminSocketHook& operator=(const LibrbdAdminSocketHook&) = delete;
+
+   // Runs the registered command named `command`; output text is appended
+   // to `out`. Returns the command's success flag.
+   bool call(std::string_view command, const cmdmap_t& cmdmap,
+             std::string_view format, bufferlist& out) override;
+
+ private:
+   // std::less<> enables lookups by std::string_view without building a
+   // temporary std::string
+   typedef std::map<std::string,LibrbdAdminSocketCommand*,
+                    std::less<>> Commands;
+
+   AdminSocket *admin_socket;
+   Commands commands;
+ };
+}
+
+#endif
diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc
new file mode 100644
index 00000000..acd7e874
--- /dev/null
+++ b/src/librbd/LibrbdWriteback.cc
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <errno.h>
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/Mutex.h"
+#include "common/WorkQueue.h"
+#include "osdc/Striper.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/LibrbdWriteback.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/io/ReadResult.h"
+
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbdwriteback: "
+
+namespace librbd {
+
+ /**
+  * Context wrapper that completes another context while holding a mutex.
+  *
+  * @param cct        CephContext used for debug logging
+  * @param c          wrapped context, completed from finish()
+  * @param cache_lock mutex held while @p c is completed
+  */
+ class C_ReadRequest : public Context {
+ public:
+   C_ReadRequest(CephContext *cct, Context *c, Mutex *cache_lock)
+     : m_cct(cct),
+       m_ctx(c),
+       m_cache_lock(cache_lock) {
+   }
+
+   void finish(int r) override {
+     ldout(m_cct, 20) << "aio_cb completing " << dendl;
+     {
+       // the lock is scoped so it is dropped before the final log line
+       Mutex::Locker locker(*m_cache_lock);
+       m_ctx->complete(r);
+     }
+     ldout(m_cct, 20) << "aio_cb finished" << dendl;
+   }
+
+ private:
+   CephContext *m_cct;
+   Context *m_ctx;
+   Mutex *m_cache_lock;
+ };
+
+ // Completion for a single writeback request. Records the result in the
+ // shared write_result_d and asks the handler to complete, in submission
+ // order, every finished write queued for the same object.
+ class C_OrderedWrite : public Context {
+ public:
+   C_OrderedWrite(CephContext *cct, LibrbdWriteback::write_result_d *result,
+                  const ZTracer::Trace &trace, LibrbdWriteback *wb)
+     : m_cct(cct), m_result(result), m_trace(trace), m_wb_handler(wb) {
+   }
+   ~C_OrderedWrite() override = default;
+
+   void finish(int r) override {
+     ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl;
+     {
+       // complete_writes() requires the handler lock to be held
+       Mutex::Locker locker(m_wb_handler->m_lock);
+       ceph_assert(!m_result->done);
+       m_result->done = true;
+       m_result->ret = r;
+       m_wb_handler->complete_writes(m_result->oid);
+     }
+     // only the pointer value is logged here; it is not dereferenced
+     ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl;
+     m_trace.event("finish");
+   }
+
+ private:
+   CephContext *m_cct;
+   LibrbdWriteback::write_result_d *m_result;
+   ZTracer::Trace m_trace;
+   LibrbdWriteback *m_wb_handler;
+ };
+
+ struct C_CommitIOEventExtent : public Context { // on completion, commits one extent of a journal IO event
+ ImageCtx *image_ctx;
+ uint64_t journal_tid; // tid of the journal event being committed
+ uint64_t offset; // extent start, forwarded unchanged to commit_io_event_extent()
+ uint64_t length; // extent length, forwarded unchanged to commit_io_event_extent()
+
+ C_CommitIOEventExtent(ImageCtx *image_ctx, uint64_t journal_tid,
+ uint64_t offset, uint64_t length)
+ : image_ctx(image_ctx), journal_tid(journal_tid), offset(offset),
+ length(length) {
+ }
+
+ void finish(int r) override {
+ // all IO operations are flushed prior to closing the journal
+ ceph_assert(image_ctx->journal != nullptr);
+
+ image_ctx->journal->commit_io_event_extent(journal_tid, offset, length, // r is passed through as the commit result
+ r);
+ }
+ };
+
+ LibrbdWriteback::LibrbdWriteback(ImageCtx *ictx, Mutex& lock) // lock: the mutex read completions take and complete_writes() asserts is held
+ : m_tid(0), m_lock(lock), m_ictx(ictx) { // m_lock is stored by reference; write tids start from 0
+ }
+
+ void LibrbdWriteback::read(const object_t& oid, uint64_t object_no, // issues an object read; onfinish is completed with m_lock held
+ const object_locator_t& oloc,
+ uint64_t off, uint64_t len, snapid_t snapid,
+ bufferlist *pbl, uint64_t trunc_size,
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *onfinish)
+ {
+ ZTracer::Trace trace; // stays default-constructed unless the parent trace is valid
+ if (parent_trace.valid()) {
+ trace.init("", &m_ictx->trace_endpoint, &parent_trace);
+ trace.copy_name("cache read " + oid.name);
+ trace.event("start");
+ }
+
+ // on completion, take the mutex and then call onfinish.
+ onfinish = new C_ReadRequest(m_ictx->cct, onfinish, &m_lock);
+
+ // re-use standard object read state machine
+ auto aio_comp = io::AioCompletion::create_and_start(onfinish, m_ictx,
+ io::AIO_TYPE_READ);
+ aio_comp->read_result = io::ReadResult{pbl}; // read data is delivered into the caller's bufferlist
+ aio_comp->set_request_count(1); // exactly one object request is dispatched below
+
+ auto req_comp = new io::ReadResult::C_ObjectReadRequest(
+ aio_comp, off, len, {{0, len}});
+
+ auto req = io::ObjectDispatchSpec::create_read(
+ m_ictx, io::OBJECT_DISPATCH_LAYER_CACHE, oid.name, object_no, off, len,
+ snapid, op_flags, trace, &req_comp->bl, &req_comp->extent_map, req_comp);
+ req->send(); // oloc, trunc_size and trunc_seq are never referenced in this function
+ }
+
+ // Reports whether the object named by `oid` overlaps the parent image, i.e.
+ // whether a write to it could trigger a copy-on-write. The check covers the
+ // whole object; read_off/read_len are only logged and snapid is unused.
+ bool LibrbdWriteback::may_copy_on_write(const object_t& oid, uint64_t read_off, uint64_t read_len, snapid_t snapid)
+ {
+   uint64_t overlap = 0;
+   {
+     // scoped read locks: acquired snap_lock -> parent_lock and released in
+     // reverse order when the scope ends, even on an early exit
+     RWLock::RLocker snap_locker(m_ictx->snap_lock);
+     librados::snap_t snap_id = m_ictx->snap_id;
+     RWLock::RLocker parent_locker(m_ictx->parent_lock);
+     // return value is ignored: overlap stays 0 if nothing is written to it
+     m_ictx->get_parent_overlap(snap_id, &overlap);
+   }
+
+   uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
+
+   // reverse map this object extent onto the parent
+   vector<pair<uint64_t,uint64_t> > objectx;
+   Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
+                           object_no, 0, m_ictx->layout.object_size,
+                           objectx);
+   uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap);
+   bool may = object_overlap > 0;
+   ldout(m_ictx->cct, 10) << "may_copy_on_write " << oid << " " << read_off
+                          << "~" << read_len << " = " << may << dendl;
+   return may;
+ }
+
+ ceph_tid_t LibrbdWriteback::write(const object_t& oid, // dispatches an object write; oncommit is completed in per-object order
+ const object_locator_t& oloc,
+ uint64_t off, uint64_t len,
+ const SnapContext& snapc,
+ const bufferlist &bl,
+ ceph::real_time mtime, uint64_t trunc_size,
+ __u32 trunc_seq, ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
+ Context *oncommit)
+ {
+ ZTracer::Trace trace; // stays default-constructed unless the parent trace is valid
+ if (parent_trace.valid()) {
+ trace.init("", &m_ictx->trace_endpoint, &parent_trace);
+ trace.copy_name("writeback " + oid.name);
+ trace.event("start");
+ }
+
+ uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
+
+ write_result_d *result = new write_result_d(oid.name, oncommit); // freed by complete_writes()
+ m_writes[oid.name].push(result); // queued per object name; complete_writes() drains finished results from the front
+ ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
+
+ bufferlist bl_copy(bl); // bl is const; the copy is what gets moved into the request
+
+ Context *ctx = new C_OrderedWrite(m_ictx->cct, result, trace, this);
+ ctx = util::create_async_context_callback(*m_ictx, ctx); // NOTE(review): presumably defers the completion to a work queue -- confirm in Utils.h
+
+ auto req = io::ObjectDispatchSpec::create_write(
+ m_ictx, io::OBJECT_DISPATCH_LAYER_CACHE, oid.name, object_no, off,
+ std::move(bl_copy), snapc, 0, journal_tid, trace, ctx);
+ req->object_dispatch_flags = (
+ io::OBJECT_DISPATCH_FLAG_FLUSH |
+ io::OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR);
+ req->send();
+
+ return ++m_tid; // tid is incremented after dispatch; oloc, len, mtime, trunc_size and trunc_seq are never referenced here
+ }
+
+
+ // Commits the portion of journal event `original_journal_tid` covering the
+ // given object extent. When a newer journal event (`new_journal_tid`)
+ // replaces the data, the commit is deferred until that event has been
+ // flushed; otherwise it is committed immediately with result 0.
+ void LibrbdWriteback::overwrite_extent(const object_t& oid, uint64_t off,
+                                        uint64_t len,
+                                        ceph_tid_t original_journal_tid,
+                                        ceph_tid_t new_journal_tid) {
+   using Extents = std::vector<std::pair<uint64_t,uint64_t> >;
+
+   ldout(m_ictx->cct, 20) << __func__ << ": " << oid << " "
+                          << off << "~" << len << " "
+                          << "journal_tid=" << original_journal_tid << ", "
+                          << "new_journal_tid=" << new_journal_tid << dendl;
+
+   const uint64_t object_no = oid_to_object_no(oid.name,
+                                               m_ictx->object_prefix);
+
+   // all IO operations are flushed prior to closing the journal
+   ceph_assert(original_journal_tid != 0 && m_ictx->journal != NULL);
+
+   // translate the object extent into image (file) extents
+   Extents file_extents;
+   Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, object_no, off,
+                           len, file_extents);
+
+   for (const auto &extent : file_extents) {
+     if (new_journal_tid == 0) {
+       m_ictx->journal->commit_io_event_extent(original_journal_tid,
+                                               extent.first, extent.second, 0);
+       continue;
+     }
+
+     // ensure new journal event is safely committed to disk before
+     // committing old event
+     m_ictx->journal->flush_event(
+       new_journal_tid, new C_CommitIOEventExtent(m_ictx,
+                                                  original_journal_tid,
+                                                  extent.first,
+                                                  extent.second));
+   }
+ }
+
+ // Completes, in submission order, every write queued for `oid` that has
+ // already finished, stopping at the first one still in flight. Must be
+ // called with m_lock held.
+ void LibrbdWriteback::complete_writes(const std::string& oid)
+ {
+   ceph_assert(m_lock.is_locked());
+   std::queue<write_result_d*>& pending = m_writes[oid];
+   ldout(m_ictx->cct, 20) << "complete_writes() oid " << oid << dendl;
+   std::list<write_result_d*> finished;
+
+   // pop the leading run of completed results
+   while (!pending.empty() && pending.front()->done) {
+     finished.push_back(pending.front());
+     pending.pop();
+   }
+
+   // drop the per-object queue once nothing is outstanding; `pending` must
+   // not be used past this point
+   if (pending.empty()) {
+     m_writes.erase(oid);
+   }
+
+   for (write_result_d *result : finished) {
+     ldout(m_ictx->cct, 20) << "complete_writes() completing " << result
+                            << dendl;
+     result->oncommit->complete(result->ret);
+     delete result;
+   }
+ }
+}
diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h
new file mode 100644
index 00000000..6ffba511
--- /dev/null
+++ b/src/librbd/LibrbdWriteback.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H
+#define CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H
+
+#include <queue>
+
+#include "common/snap_types.h"
+#include "osd/osd_types.h"
+#include "osdc/WritebackHandler.h"
+
+class Mutex;
+class Context;
+
+namespace librbd {
+
+ struct ImageCtx;
+
+ // WritebackHandler implementation that routes cache reads and writebacks
+ // through the image's object dispatch layer and completes the writes
+ // queued for one object in submission order.
+ class LibrbdWriteback : public WritebackHandler {
+ public:
+   // `lock` is stored by reference; it is taken around read completions and
+   // must be held when queued writes are completed.
+   LibrbdWriteback(ImageCtx *ictx, Mutex& lock);
+
+   // Note that oloc, trunc_size, and trunc_seq are ignored
+   void read(const object_t& oid, uint64_t object_no,
+             const object_locator_t& oloc, uint64_t off, uint64_t len,
+             snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
+             __u32 trunc_seq, int op_flags,
+             const ZTracer::Trace &parent_trace, Context *onfinish) override;
+
+   // Determine whether a read to this extent could be affected by a
+   // write-triggered copy-on-write
+   bool may_copy_on_write(const object_t& oid, uint64_t read_off,
+                          uint64_t read_len, snapid_t snapid) override;
+
+   // Note that oloc, trunc_size, and trunc_seq are ignored
+   ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
+                    uint64_t off, uint64_t len,
+                    const SnapContext& snapc, const bufferlist &bl,
+                    ceph::real_time mtime, uint64_t trunc_size,
+                    __u32 trunc_seq, ceph_tid_t journal_tid,
+                    const ZTracer::Trace &parent_trace,
+                    Context *oncommit) override;
+   using WritebackHandler::write;
+
+   // Commits (or defers committing) the journal event extent that was
+   // overwritten in the cache
+   void overwrite_extent(const object_t& oid, uint64_t off,
+                         uint64_t len, ceph_tid_t original_journal_tid,
+                         ceph_tid_t new_journal_tid) override;
+
+   // Result slot for one in-flight write; shared between the per-object
+   // queue and the write's completion context.
+   struct write_result_d {
+     bool done;          // set once the write has completed
+     int ret;            // completion result, valid when done is true
+     std::string oid;    // object name, used to locate the per-object queue
+     Context *oncommit;  // completed with ret once the write is at the front
+     write_result_d(const std::string& oid, Context *oncommit) :
+       done(false), ret(0), oid(oid), oncommit(oncommit) {}
+   private:
+     // non-copyable: explicitly deleted so a misuse fails at compile time
+     // rather than at link time
+     write_result_d(const write_result_d& rhs) = delete;
+     const write_result_d& operator=(const write_result_d& rhs) = delete;
+   };
+
+ private:
+   // Completes finished writes for `oid` in order; requires m_lock held
+   void complete_writes(const std::string& oid);
+
+   ceph_tid_t m_tid;
+   Mutex& m_lock;
+   librbd::ImageCtx *m_ictx;
+   // per-object FIFO of outstanding write results
+   ceph::unordered_map<std::string, std::queue<write_result_d*> > m_writes;
+   friend class C_OrderedWrite;
+ };
+}
+
+#endif
diff --git a/src/librbd/ManagedLock.cc b/src/librbd/ManagedLock.cc
new file mode 100644
index 00000000..15d2016f
--- /dev/null
+++ b/src/librbd/ManagedLock.cc
@@ -0,0 +1,852 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ManagedLock.h"
+#include "librbd/managed_lock/AcquireRequest.h"
+#include "librbd/managed_lock/BreakRequest.h"
+#include "librbd/managed_lock/GetLockerRequest.h"
+#include "librbd/managed_lock/ReleaseRequest.h"
+#include "librbd/managed_lock/ReacquireRequest.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/managed_lock/Utils.h"
+#include "librbd/Watcher.h"
+#include "librbd/ImageCtx.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "common/WorkQueue.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ManagedLock: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+using std::string;
+using namespace managed_lock;
+
+namespace {
+
+template <typename R>
+struct C_SendLockRequest : public Context { // Context that starts a request object (via send()) when completed
+ R* request;
+ explicit C_SendLockRequest(R* request) : request(request) {
+ }
+ void finish(int r) override { // r is ignored
+ request->send();
+ }
+};
+
+struct C_Tracked : public Context { // wraps ctx so the tracker counts an op for this object's whole lifetime
+ AsyncOpTracker &tracker;
+ Context *ctx;
+ C_Tracked(AsyncOpTracker &tracker, Context *ctx)
+ : tracker(tracker), ctx(ctx) {
+ tracker.start_op(); // op starts at construction ...
+ }
+ ~C_Tracked() override {
+ tracker.finish_op(); // ... and finishes at destruction
+ }
+ void finish(int r) override {
+ ctx->complete(r); // forwards the result to the wrapped context
+ }
+};
+
+} // anonymous namespace
+
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+using managed_lock::util::decode_lock_cookie;
+using managed_lock::util::encode_lock_cookie;
+
+template <typename I>
+ManagedLock<I>::ManagedLock(librados::IoCtx &ioctx, ContextWQ *work_queue, // stores its arguments; no lock request is issued here
+ const string& oid, Watcher *watcher, Mode mode,
+ bool blacklist_on_break_lock,
+ uint32_t blacklist_expire_seconds)
+ : m_lock(unique_lock_name("librbd::ManagedLock<I>::m_lock", this)),
+ m_ioctx(ioctx), m_cct(reinterpret_cast<CephContext *>(ioctx.cct())), // CephContext is taken from the IoCtx
+ m_work_queue(work_queue),
+ m_oid(oid),
+ m_watcher(watcher),
+ m_mode(mode),
+ m_blacklist_on_break_lock(blacklist_on_break_lock),
+ m_blacklist_expire_seconds(blacklist_expire_seconds),
+ m_state(STATE_UNLOCKED) { // initial state is unlocked
+}
+
+template <typename I>
+ManagedLock<I>::~ManagedLock() { // asserts the lock is in a terminal/idle state with no tracked async ops left
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_SHUTDOWN || m_state == STATE_UNLOCKED ||
+ m_state == STATE_UNINITIALIZED);
+ if (m_state == STATE_UNINITIALIZED) {
+ // never initialized -- ensure any in-flight ops are complete
+ // since we wouldn't expect shut_down to be invoked
+ C_SaferCond ctx;
+ m_async_op_tracker.wait_for_ops(&ctx);
+ ctx.wait(); // blocks (with m_lock held) until the tracker reports all ops finished
+ }
+ ceph_assert(m_async_op_tracker.empty());
+}
+
+template <typename I>
+bool ManagedLock<I>::is_lock_owner() const { // locking wrapper around is_lock_owner(Mutex&)
+ Mutex::Locker locker(m_lock);
+
+ return is_lock_owner(m_lock);
+}
+
+// Reports whether the current state counts as owning the lock. Requires
+// m_lock to be held; the `lock` argument itself is not inspected.
+template <typename I>
+bool ManagedLock<I>::is_lock_owner(Mutex &lock) const {
+
+  ceph_assert(m_lock.is_locked());
+
+  // states in which this client is treated as the lock owner
+  const bool lock_owner = (m_state == STATE_LOCKED ||
+                           m_state == STATE_REACQUIRING ||
+                           m_state == STATE_PRE_SHUTTING_DOWN ||
+                           m_state == STATE_POST_ACQUIRING ||
+                           m_state == STATE_PRE_RELEASING);
+
+  ldout(m_cct, 20) << "=" << lock_owner << dendl;
+  return lock_owner;
+}
+
+template <typename I>
+void ManagedLock<I>::shut_down(Context *on_shut_down) { // queues ACTION_SHUT_DOWN; must not be called once already shut down
+ ldout(m_cct, 10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!is_state_shutdown());
+
+ if (m_state == STATE_WAITING_FOR_REGISTER) {
+ // abort stalled acquire lock state
+ ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl;
+ Action active_action = get_active_action();
+ ceph_assert(active_action == ACTION_TRY_LOCK ||
+ active_action == ACTION_ACQUIRE_LOCK);
+ complete_active_action(STATE_UNLOCKED, -ESHUTDOWN); // the pending acquire is failed with -ESHUTDOWN
+ }
+
+ execute_action(ACTION_SHUT_DOWN, on_shut_down);
+}
+
+// Requests the lock. If the lock is shut down, `on_acquired` completes with
+// -ESHUTDOWN; if it is already held with no queued actions, it completes
+// with 0; otherwise ACTION_ACQUIRE_LOCK is queued and completes it later.
+template <typename I>
+void ManagedLock<I>::acquire_lock(Context *on_acquired) {
+  int r;
+  {
+    Mutex::Locker guard(m_lock);
+    const bool shut_down = is_state_shutdown();
+    if (!shut_down &&
+        (m_state != STATE_LOCKED || !m_actions_contexts.empty())) {
+      ldout(m_cct, 10) << dendl;
+      execute_action(ACTION_ACQUIRE_LOCK, on_acquired);
+      return;
+    }
+    r = shut_down ? -ESHUTDOWN : 0;
+  }
+
+  // complete outside of m_lock; the callback is optional
+  if (on_acquired == nullptr) {
+    return;
+  }
+  on_acquired->complete(r);
+}
+
+// Same fast paths as acquire_lock(), but queues ACTION_TRY_LOCK when work
+// is required: -ESHUTDOWN if shut down, 0 if already locked with nothing
+// queued.
+template <typename I>
+void ManagedLock<I>::try_acquire_lock(Context *on_acquired) {
+  int r;
+  {
+    Mutex::Locker guard(m_lock);
+    const bool shut_down = is_state_shutdown();
+    if (!shut_down &&
+        (m_state != STATE_LOCKED || !m_actions_contexts.empty())) {
+      ldout(m_cct, 10) << dendl;
+      execute_action(ACTION_TRY_LOCK, on_acquired);
+      return;
+    }
+    r = shut_down ? -ESHUTDOWN : 0;
+  }
+
+  // complete outside of m_lock; the callback is optional
+  if (on_acquired == nullptr) {
+    return;
+  }
+  on_acquired->complete(r);
+}
+
+// Releases the lock. If the lock is shut down, `on_released` completes with
+// -ESHUTDOWN; if it is already unlocked with no queued actions, it completes
+// with 0; otherwise ACTION_RELEASE_LOCK is queued and completes it later.
+template <typename I>
+void ManagedLock<I>::release_lock(Context *on_released) {
+  int r;
+  {
+    Mutex::Locker guard(m_lock);
+    const bool shut_down = is_state_shutdown();
+    if (!shut_down &&
+        (m_state != STATE_UNLOCKED || !m_actions_contexts.empty())) {
+      ldout(m_cct, 10) << dendl;
+      execute_action(ACTION_RELEASE_LOCK, on_released);
+      return;
+    }
+    r = shut_down ? -ESHUTDOWN : 0;
+  }
+
+  // complete outside of m_lock; the callback is optional
+  if (on_released == nullptr) {
+    return;
+  }
+  on_released->complete(r);
+}
+
+template <typename I>
+void ManagedLock<I>::reacquire_lock(Context *on_reacquired) { // resumes a stalled acquire, queues a reacquire, or completes immediately with 0
+ {
+ Mutex::Locker locker(m_lock);
+
+ if (m_state == STATE_WAITING_FOR_REGISTER) {
+ // restart the acquire lock process now that watch is valid
+ ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl;
+ Action active_action = get_active_action();
+ ceph_assert(active_action == ACTION_TRY_LOCK ||
+ active_action == ACTION_ACQUIRE_LOCK);
+ execute_next_action(); // no return here: on_reacquired is completed with 0 below
+ } else if (!is_state_shutdown() &&
+ (m_state == STATE_LOCKED ||
+ m_state == STATE_ACQUIRING ||
+ m_state == STATE_POST_ACQUIRING ||
+ m_state == STATE_WAITING_FOR_LOCK)) {
+ // interlock the lock operation with other state ops
+ ldout(m_cct, 10) << dendl;
+ execute_action(ACTION_REACQUIRE_LOCK, on_reacquired); // the queued action now owns on_reacquired
+ return;
+ }
+ }
+
+ // ignore request if shutdown or not in a locked-related state
+ if (on_reacquired != nullptr) {
+ on_reacquired->complete(0);
+ }
+}
+
+template <typename I>
+void ManagedLock<I>::get_locker(managed_lock::Locker *locker, // sends a GetLockerRequest; fails with -ESHUTDOWN once shut down
+ Context *on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ int r;
+ {
+ Mutex::Locker l(m_lock);
+ if (is_state_shutdown()) {
+ r = -ESHUTDOWN;
+ } else {
+ on_finish = new C_Tracked(m_async_op_tracker, on_finish); // counted by m_async_op_tracker until the wrapper is destroyed
+ auto req = managed_lock::GetLockerRequest<I>::create(
+ m_ioctx, m_oid, m_mode == EXCLUSIVE, locker, on_finish);
+ req->send();
+ return;
+ }
+ }
+
+ on_finish->complete(r); // only reached on the shutdown path; on_finish is dereferenced without a null check
+}
+
+template <typename I>
+void ManagedLock<I>::break_lock(const managed_lock::Locker &locker, // sends a BreakRequest against `locker`
+ bool force_break_lock, Context *on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ int r;
+ {
+ Mutex::Locker l(m_lock);
+ if (is_state_shutdown()) {
+ r = -ESHUTDOWN;
+ } else if (is_lock_owner(m_lock)) {
+ r = -EBUSY; // refuses to break the lock while this instance counts as its owner
+ } else {
+ on_finish = new C_Tracked(m_async_op_tracker, on_finish); // counted by m_async_op_tracker until the wrapper is destroyed
+ auto req = managed_lock::BreakRequest<I>::create(
+ m_ioctx, m_work_queue, m_oid, locker, m_mode == EXCLUSIVE,
+ m_blacklist_on_break_lock, m_blacklist_expire_seconds, force_break_lock,
+ on_finish);
+ req->send();
+ return;
+ }
+ }
+
+ on_finish->complete(r); // error paths only; on_finish is dereferenced without a null check
+}
+
+// Synchronously runs a cls lock assert_locked read operation against m_oid
+// using the current mode and cookie. Returns 0 when the operation succeeds,
+// otherwise the negative error code from the operation (after logging why
+// the client is not considered the lock owner).
+template <typename I>
+int ManagedLock<I>::assert_header_locked() {
+  ldout(m_cct, 10) << dendl;
+
+  librados::ObjectReadOperation op;
+  {
+    // m_mode and m_cookie are sampled under m_lock
+    Mutex::Locker locker(m_lock);
+    auto lock_type = (m_mode == EXCLUSIVE ? LOCK_EXCLUSIVE : LOCK_SHARED);
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, lock_type, m_cookie,
+                                    managed_lock::util::get_watcher_lock_tag());
+  }
+
+  int r = m_ioctx.operate(m_oid, &op, nullptr);
+  if (r >= 0) {
+    return 0;
+  }
+
+  switch (r) {
+  case -EBLACKLISTED:
+    ldout(m_cct, 5) << "client is not lock owner -- client blacklisted"
+                    << dendl;
+    break;
+  case -ENOENT:
+    ldout(m_cct, 5) << "client is not lock owner -- no lock detected"
+                    << dendl;
+    break;
+  case -EBUSY:
+    ldout(m_cct, 5) << "client is not lock owner -- owned by different client"
+                    << dendl;
+    break;
+  default:
+    lderr(m_cct) << "failed to verify lock ownership: " << cpp_strerror(r)
+                 << dendl;
+    break;
+  }
+  return r;
+}
+
+// Default shutdown hook (declared virtual in ManagedLock.h): performs no
+// additional work and completes on_finish with the incoming result r.
+template <typename I>
+void ManagedLock<I>::shutdown_handler(int r, Context *on_finish) {
+ on_finish->complete(r);
+}
+
+// Default pre-acquire hook (declared virtual in ManagedLock.h): nothing to
+// prepare, so on_finish is completed immediately with 0.
+template <typename I>
+void ManagedLock<I>::pre_acquire_lock_handler(Context *on_finish) {
+ on_finish->complete(0);
+}
+
+// Default post-acquire hook (declared virtual in ManagedLock.h): forwards
+// the acquire result r to on_finish unchanged.
+template <typename I>
+void ManagedLock<I>::post_acquire_lock_handler(int r, Context *on_finish) {
+ on_finish->complete(r);
+}
+
+// Default pre-release hook (declared virtual in ManagedLock.h): ignores
+// shutting_down and completes on_finish immediately with 0.
+template <typename I>
+void ManagedLock<I>::pre_release_lock_handler(bool shutting_down,
+ Context *on_finish) {
+ on_finish->complete(0);
+}
+
+// Default post-release hook (declared virtual in ManagedLock.h): ignores
+// shutting_down and forwards the release result r to on_finish unchanged.
+template <typename I>
+void ManagedLock<I>::post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish) {
+ on_finish->complete(r);
+}
+
+// Default post-reacquire hook (declared virtual in ManagedLock.h): forwards
+// the reacquire result r to on_finish unchanged. The completion happens
+// synchronously, in the caller's context.
+template <typename I>
+void ManagedLock<I>::post_reacquire_lock_handler(int r, Context *on_finish) {
+ on_finish->complete(r);
+}
+
+// Returns true while the state machine is in the middle of an operation,
+// i.e. in any state other than UNINITIALIZED, UNLOCKED, LOCKED or SHUTDOWN.
+// execute_action() and complete_active_action() only start the next queued
+// action when this returns false.
+template <typename I>
+bool ManagedLock<I>::is_transition_state() const {
+  switch (m_state) {
+  // resting states: a new action may be started
+  case STATE_UNINITIALIZED:
+  case STATE_UNLOCKED:
+  case STATE_LOCKED:
+  case STATE_SHUTDOWN:
+    return false;
+  // in-flight states: an action is currently being processed
+  case STATE_INITIALIZING:
+  case STATE_ACQUIRING:
+  case STATE_POST_ACQUIRING:
+  case STATE_WAITING_FOR_REGISTER:
+  case STATE_WAITING_FOR_LOCK:
+  case STATE_REACQUIRING:
+  case STATE_PRE_RELEASING:
+  case STATE_RELEASING:
+  case STATE_PRE_SHUTTING_DOWN:
+  case STATE_SHUTTING_DOWN:
+    return true;
+  }
+  return false;
+}
+
+// Registers ctx (may be nullptr) as a callback for `action`. If the action
+// is already queued anywhere in m_actions_contexts the callback is merged
+// into that entry; otherwise a new entry is appended to the back of the
+// queue. Requires m_lock to be held.
+template <typename I>
+void ManagedLock<I>::append_context(Action action, Context *ctx) {
+  ceph_assert(m_lock.is_locked());
+
+  for (auto it = m_actions_contexts.begin(); it != m_actions_contexts.end();
+       ++it) {
+    if (it->first != action) {
+      continue;
+    }
+
+    // coalesce with the already-queued instance of this action
+    if (ctx != nullptr) {
+      it->second.push_back(ctx);
+    }
+    return;
+  }
+
+  // first request for this action: enqueue it, with or without a callback
+  m_actions_contexts.emplace_back(action, Contexts{});
+  if (ctx != nullptr) {
+    m_actions_contexts.back().second.push_back(ctx);
+  }
+}
+
+// Queues `action` (with optional callback ctx) and, if no other operation is
+// currently in flight, immediately starts the action at the front of the
+// queue. Requires m_lock to be held.
+template <typename I>
+void ManagedLock<I>::execute_action(Action action, Context *ctx) {
+  ceph_assert(m_lock.is_locked());
+
+  append_context(action, ctx);
+  if (is_transition_state()) {
+    // the in-flight operation will pick up the queue when it completes
+    return;
+  }
+  execute_next_action();
+}
+
+// Starts processing the action at the front of m_actions_contexts by
+// dispatching to the matching send_*() method. Aborts on an unknown action.
+// Requires m_lock to be held and a non-empty action queue.
+template <typename I>
+void ManagedLock<I>::execute_next_action() {
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(!m_actions_contexts.empty());
+
+  const Action next_action = get_active_action();
+  if (next_action == ACTION_ACQUIRE_LOCK || next_action == ACTION_TRY_LOCK) {
+    send_acquire_lock();
+  } else if (next_action == ACTION_REACQUIRE_LOCK) {
+    send_reacquire_lock();
+  } else if (next_action == ACTION_RELEASE_LOCK) {
+    send_release_lock();
+  } else if (next_action == ACTION_SHUT_DOWN) {
+    send_shutdown();
+  } else {
+    ceph_abort();
+  }
+}
+
+// Returns the action at the front of the queue, i.e. the one currently
+// being (or about to be) processed. Requires m_lock to be held and a
+// non-empty action queue.
+template <typename I>
+typename ManagedLock<I>::Action ManagedLock<I>::get_active_action() const {
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(!m_actions_contexts.empty());
+
+  const ActionContexts &active = m_actions_contexts.front();
+  return active.first;
+}
+
+// Finishes the action at the front of the queue: pops it, moves the state
+// machine to next_state, fires all of the action's callbacks with r, and then
+// starts the next queued action if the new state permits it.
+//
+// Requires m_lock to be held on entry. The lock is temporarily released
+// while the callbacks run and is held again on return.
+template <typename I>
+void ManagedLock<I>::complete_active_action(State next_state, int r) {
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(!m_actions_contexts.empty());
+
+  // detach the callbacks before the entry is removed from the queue
+  Contexts finished_contexts(std::move(m_actions_contexts.front().second));
+  m_actions_contexts.pop_front();
+  m_state = next_state;
+
+  // callbacks are invoked without m_lock held
+  m_lock.Unlock();
+  for (Context *on_finish : finished_contexts) {
+    on_finish->complete(r);
+  }
+  m_lock.Lock();
+
+  // state and queue are re-checked since the lock was dropped above
+  if (is_transition_state() || m_actions_contexts.empty()) {
+    return;
+  }
+  execute_next_action();
+}
+
+// Returns true if the lock is shutting down, already shut down, or has a
+// shutdown as the most recently queued action. Requires m_lock to be held.
+template <typename I>
+bool ManagedLock<I>::is_state_shutdown() const {
+  ceph_assert(m_lock.is_locked());
+
+  if (m_state == STATE_PRE_SHUTTING_DOWN ||
+      m_state == STATE_SHUTTING_DOWN ||
+      m_state == STATE_SHUTDOWN) {
+    return true;
+  }
+
+  // a shutdown that is queued but not yet started also counts
+  if (m_actions_contexts.empty()) {
+    return false;
+  }
+  return m_actions_contexts.back().first == ACTION_SHUT_DOWN;
+}
+
+// First step of the acquire / try-lock action. Completes immediately if the
+// lock is already held. If the watcher has no watch handle yet the request is
+// parked in STATE_WAITING_FOR_REGISTER (or failed with -ESHUTDOWN when a
+// shutdown is pending). Otherwise the cookie is derived from the watch handle
+// and pre_acquire_lock_handler() is scheduled on the work queue.
+// Requires m_lock to be held.
+template <typename I>
+void ManagedLock<I>::send_acquire_lock() {
+  ceph_assert(m_lock.is_locked());
+  if (m_state == STATE_LOCKED) {
+    complete_active_action(STATE_LOCKED, 0);
+    return;
+  }
+
+  ldout(m_cct, 10) << dendl;
+
+  const uint64_t handle = m_watcher->get_watch_handle();
+  if (handle != 0) {
+    m_state = STATE_ACQUIRING;
+    m_cookie = encode_lock_cookie(handle);
+
+    m_work_queue->queue(new FunctionContext([this](int r) {
+        pre_acquire_lock_handler(create_context_callback<
+          ManagedLock<I>, &ManagedLock<I>::handle_pre_acquire_lock>(this));
+      }));
+    return;
+  }
+
+  lderr(m_cct) << "watcher not registered - delaying request" << dendl;
+  m_state = STATE_WAITING_FOR_REGISTER;
+
+  // shut down might race w/ release/re-acquire of the lock
+  if (is_state_shutdown()) {
+    complete_active_action(STATE_UNLOCKED, -ESHUTDOWN);
+  }
+}
+
+// Completion of pre_acquire_lock_handler(). A negative r skips the lock
+// request and is fed straight into handle_acquire_lock(); otherwise an
+// AcquireRequest is created and scheduled on the work queue.
+template <typename I>
+void ManagedLock<I>::handle_pre_acquire_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    using managed_lock::AcquireRequest;
+    Context *on_acquired = create_context_callback<
+      ManagedLock<I>, &ManagedLock<I>::handle_acquire_lock>(this);
+    AcquireRequest<I>* acquire_req = AcquireRequest<I>::create(
+      m_ioctx, m_watcher, m_work_queue, m_oid, m_cookie, m_mode == EXCLUSIVE,
+      m_blacklist_on_break_lock, m_blacklist_expire_seconds, on_acquired);
+    m_work_queue->queue(new C_SendLockRequest<AcquireRequest<I>>(acquire_req),
+                        0);
+    return;
+  }
+
+  // the pre-acquire hook failed: report it as the acquire result
+  handle_acquire_lock(r);
+}
+
+// Completion of the acquire request (or of a failed pre-acquire hook).
+// Logs the outcome, records in m_post_next_state whether the state machine
+// should end up LOCKED (r >= 0) or UNLOCKED (r < 0), and schedules
+// post_acquire_lock_handler() with the same r on the work queue.
+//
+// -EBUSY and -EAGAIN are logged at debug level only; any other negative r is
+// logged as an error.
+template <typename I>
+void ManagedLock<I>::handle_acquire_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r == -EBUSY || r == -EAGAIN) {
+    ldout(m_cct, 5) << "unable to acquire exclusive lock" << dendl;
+  } else if (r < 0) {
+    // keep a space after the colon so the strerror text is not glued to it
+    lderr(m_cct) << "failed to acquire exclusive lock: " << cpp_strerror(r)
+                 << dendl;
+  } else {
+    ldout(m_cct, 5) << "successfully acquired exclusive lock" << dendl;
+  }
+
+  m_post_next_state = (r < 0 ? STATE_UNLOCKED : STATE_LOCKED);
+
+  // the lambda's own argument (ret) is unused; the captured r is forwarded
+  m_work_queue->queue(new FunctionContext([this, r](int ret) {
+      post_acquire_lock_handler(r, create_context_callback<
+        ManagedLock<I>, &ManagedLock<I>::handle_post_acquire_lock>(this));
+    }));
+}
+
+// Completion of post_acquire_lock_handler(). Three outcomes:
+//  - the lock was obtained but the post hook failed: the lock is released
+//    again via revert_to_unlock_state();
+//  - r == -ECANCELED (without the above): the active action is left pending
+//    (NOTE(review): presumably a subclass resumes it later -- confirm);
+//  - otherwise the active action is completed with r and the state recorded
+//    in m_post_next_state. This covers both success and ordinary failure.
+template <typename I>
+void ManagedLock<I>::handle_post_acquire_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  Mutex::Locker locker(m_lock);
+
+  const bool post_hook_failed = (r < 0 && m_post_next_state == STATE_LOCKED);
+  if (post_hook_failed) {
+    // release_lock without calling pre and post handlers
+    revert_to_unlock_state(r);
+    return;
+  }
+
+  if (r == -ECANCELED) {
+    return;
+  }
+  complete_active_action(m_post_next_state, r);
+}
+
+// Releases a lock that was just acquired, without invoking the pre/post
+// release hooks, and then completes the active action with the original
+// error r in STATE_UNLOCKED. The release itself is asserted to succeed.
+template <typename I>
+void ManagedLock<I>::revert_to_unlock_state(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  // ret is the release result; r is the error that triggered the revert
+  Context *on_released = new FunctionContext([this, r](int ret) {
+      Mutex::Locker locker(m_lock);
+      ceph_assert(ret == 0);
+      complete_active_action(STATE_UNLOCKED, r);
+    });
+
+  using managed_lock::ReleaseRequest;
+  ReleaseRequest<I>* release_req = ReleaseRequest<I>::create(
+    m_ioctx, m_watcher, m_work_queue, m_oid, m_cookie, on_released);
+  m_work_queue->queue(new C_SendLockRequest<ReleaseRequest<I>>(release_req));
+}
+
+// First step of the reacquire action. Requires m_lock to be held.
+//  - not LOCKED: nothing to refresh, the action completes with 0 and the
+//    state is left unchanged;
+//  - no valid watch handle: falls back to a queued release + acquire;
+//  - cookie derived from the watch handle is unchanged: only the
+//    post-reacquire hook runs;
+//  - otherwise a ReacquireRequest swapping m_cookie for m_new_cookie is
+//    scheduled on the work queue.
+template <typename I>
+void ManagedLock<I>::send_reacquire_lock() {
+  ceph_assert(m_lock.is_locked());
+
+  if (m_state != STATE_LOCKED) {
+    complete_active_action(m_state, 0);
+    return;
+  }
+
+  ldout(m_cct, 10) << dendl;
+  m_state = STATE_REACQUIRING;
+
+  const uint64_t handle = m_watcher->get_watch_handle();
+  if (handle == 0) {
+    // watch (re)failed while recovering
+    lderr(m_cct) << "aborting reacquire due to invalid watch handle"
+                 << dendl;
+
+    // treat double-watch failure as a lost lock and invoke the
+    // release/acquire handlers
+    release_acquire_lock();
+    complete_active_action(STATE_LOCKED, 0);
+    return;
+  }
+
+  m_new_cookie = encode_lock_cookie(handle);
+  if (m_cookie == m_new_cookie) {
+    ldout(m_cct, 10) << "skipping reacquire since cookie still valid"
+                     << dendl;
+    Context *on_no_op = create_context_callback<
+      ManagedLock, &ManagedLock<I>::handle_no_op_reacquire_lock>(this);
+    post_reacquire_lock_handler(0, on_no_op);
+    return;
+  }
+
+  // request result -> post-reacquire hook -> handle_reacquire_lock()
+  Context *on_reacquired = create_context_callback<
+    ManagedLock, &ManagedLock<I>::handle_reacquire_lock>(this);
+  Context *on_request_done = new FunctionContext(
+    [this, on_reacquired](int r) {
+      post_reacquire_lock_handler(r, on_reacquired);
+    });
+
+  using managed_lock::ReacquireRequest;
+  ReacquireRequest<I>* reacquire_req = ReacquireRequest<I>::create(
+    m_ioctx, m_oid, m_cookie, m_new_cookie, m_mode == EXCLUSIVE,
+    on_request_done);
+  m_work_queue->queue(
+    new C_SendLockRequest<ReacquireRequest<I>>(reacquire_req));
+}
+
+// Completion of the reacquire request (after the post-reacquire hook).
+// On success the new cookie becomes the current one. On failure a release
+// followed by an acquire is queued instead. Either way the reacquire action
+// itself is completed with 0 and the state returns to LOCKED.
+template <typename I>
+void ManagedLock<I>::handle_reacquire_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  Mutex::Locker locker(m_lock);
+  ceph_assert(m_state == STATE_REACQUIRING);
+
+  if (r >= 0) {
+    m_cookie = m_new_cookie;
+  } else {
+    if (r != -EOPNOTSUPP) {
+      lderr(m_cct) << "failed to update lock cookie: " << cpp_strerror(r)
+                   << dendl;
+    } else {
+      ldout(m_cct, 10) << "updating lock is not supported" << dendl;
+    }
+
+    release_acquire_lock();
+  }
+
+  complete_active_action(STATE_LOCKED, 0);
+}
+
+// Completion for the reacquire path in which the cookie did not change:
+// no request was sent, so the action is simply completed with 0 and the
+// state returns to LOCKED. r must be non-negative.
+//
+// NOTE(review): this function does not take m_lock itself, yet it reads
+// m_state and calls complete_active_action(), which asserts that m_lock is
+// held. It therefore relies on post_reacquire_lock_handler() completing its
+// context synchronously while send_reacquire_lock()'s caller still holds the
+// lock -- confirm this holds for every override of that hook.
+template <typename I>
+void ManagedLock<I>::handle_no_op_reacquire_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+ ceph_assert(m_state == STATE_REACQUIRING);
+ ceph_assert(r >= 0);
+ complete_active_action(STATE_LOCKED, 0);
+}
+
+// Fallback used when the lock cookie cannot be updated in place: queues a
+// RELEASE_LOCK action followed by an ACQUIRE_LOCK action, and moves the
+// callbacks of the action at the front of the queue onto that ACQUIRE_LOCK
+// action so they fire once the lock has been obtained again.
+//
+// Does nothing when a shutdown is in progress or queued. Requires m_lock to
+// be held.
+template <typename I>
+void ManagedLock<I>::release_acquire_lock() {
+  // ceph_assert (not plain assert) so the check matches the rest of the
+  // class and is not compiled out by NDEBUG
+  ceph_assert(m_lock.is_locked());
+
+  if (!is_state_shutdown()) {
+    // queue a release and re-acquire of the lock since cookie cannot
+    // be updated on older OSDs
+    execute_action(ACTION_RELEASE_LOCK, nullptr);
+
+    ceph_assert(!m_actions_contexts.empty());
+    ActionContexts &action_contexts(m_actions_contexts.front());
+
+    // reacquire completes when the request lock completes
+    Contexts contexts;
+    std::swap(contexts, action_contexts.second);
+    if (contexts.empty()) {
+      // no waiters: still queue the acquire so the lock is re-obtained
+      execute_action(ACTION_ACQUIRE_LOCK, nullptr);
+    } else {
+      for (auto ctx : contexts) {
+        execute_action(ACTION_ACQUIRE_LOCK, ctx);
+      }
+    }
+  }
+}
+
+// First step of the release action. Completes immediately when already
+// UNLOCKED; otherwise enters STATE_PRE_RELEASING and schedules
+// pre_release_lock_handler(false, ...) on the work queue.
+// Requires m_lock to be held.
+template <typename I>
+void ManagedLock<I>::send_release_lock() {
+  ceph_assert(m_lock.is_locked());
+
+  const bool already_unlocked = (m_state == STATE_UNLOCKED);
+  if (already_unlocked) {
+    complete_active_action(STATE_UNLOCKED, 0);
+    return;
+  }
+
+  ldout(m_cct, 10) << dendl;
+  m_state = STATE_PRE_RELEASING;
+
+  Context *run_pre_release = new FunctionContext([this](int r) {
+      pre_release_lock_handler(false, create_context_callback<
+        ManagedLock<I>, &ManagedLock<I>::handle_pre_release_lock>(this));
+    });
+  m_work_queue->queue(run_pre_release);
+}
+
+// Completion of pre_release_lock_handler(). Moves the state machine from
+// PRE_RELEASING to RELEASING regardless of r. A negative r skips the release
+// request and is fed straight into handle_release_lock(); otherwise a
+// ReleaseRequest is scheduled on the work queue.
+template <typename I>
+void ManagedLock<I>::handle_pre_release_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  {
+    Mutex::Locker locker(m_lock);
+    ceph_assert(m_state == STATE_PRE_RELEASING);
+    m_state = STATE_RELEASING;
+  }
+
+  if (r >= 0) {
+    using managed_lock::ReleaseRequest;
+    Context *on_released = create_context_callback<
+      ManagedLock<I>, &ManagedLock<I>::handle_release_lock>(this);
+    ReleaseRequest<I>* release_req = ReleaseRequest<I>::create(
+      m_ioctx, m_watcher, m_work_queue, m_oid, m_cookie, on_released);
+    m_work_queue->queue(new C_SendLockRequest<ReleaseRequest<I>>(release_req),
+                        0);
+    return;
+  }
+
+  // the pre-release hook failed: report it as the release result
+  handle_release_lock(r);
+}
+
+// Completion of the release request (or of a failed pre-release hook).
+// The lock is treated as gone when r is non-negative, -EBLACKLISTED or
+// -ENOENT: the cookie is cleared and the next state will be UNLOCKED. Any
+// other error leaves the next state as LOCKED. In all cases
+// post_release_lock_handler(false, r, ...) is scheduled on the work queue.
+template <typename I>
+void ManagedLock<I>::handle_release_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  Mutex::Locker locker(m_lock);
+  ceph_assert(m_state == STATE_RELEASING);
+
+  const bool lock_gone = (r >= 0 || r == -EBLACKLISTED || r == -ENOENT);
+  if (!lock_gone) {
+    m_post_next_state = STATE_LOCKED;
+  } else {
+    m_cookie = "";
+    m_post_next_state = STATE_UNLOCKED;
+  }
+
+  // the lambda's own argument (ret) is unused; the captured r is forwarded
+  Context *run_post_release = new FunctionContext([this, r](int ret) {
+      post_release_lock_handler(false, r, create_context_callback<
+        ManagedLock<I>, &ManagedLock<I>::handle_post_release_lock>(this));
+    });
+  m_work_queue->queue(run_post_release);
+}
+
+// Completion of post_release_lock_handler(): takes m_lock and completes the
+// active action with r, moving to the state that handle_release_lock()
+// stored in m_post_next_state.
+template <typename I>
+void ManagedLock<I>::handle_post_release_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+ complete_active_action(m_post_next_state, r);
+}
+
+// First step of the shutdown action. Requires m_lock to be held and the
+// state to be either UNLOCKED or LOCKED.
+//  - UNLOCKED: enters SHUTTING_DOWN and schedules shutdown_handler();
+//  - LOCKED: enters PRE_SHUTTING_DOWN and schedules the shutdown release
+//    sequence (C_ShutDownRelease). m_lock is dropped around that queue call
+//    and re-taken before returning.
+template <typename I>
+void ManagedLock<I>::send_shutdown() {
+  ldout(m_cct, 10) << dendl;
+  ceph_assert(m_lock.is_locked());
+
+  if (m_state != STATE_UNLOCKED) {
+    ceph_assert(m_state == STATE_LOCKED);
+    m_state = STATE_PRE_SHUTTING_DOWN;
+
+    m_lock.Unlock();
+    m_work_queue->queue(new C_ShutDownRelease(this), 0);
+    m_lock.Lock();
+    return;
+  }
+
+  m_state = STATE_SHUTTING_DOWN;
+  Context *run_shutdown = new FunctionContext([this](int r) {
+      shutdown_handler(r, create_context_callback<
+        ManagedLock<I>, &ManagedLock<I>::handle_shutdown>(this));
+    });
+  m_work_queue->queue(run_shutdown);
+}
+
+// Completion of shutdown_handler() on the unlocked shutdown path: proceeds
+// to wait for outstanding tracked operations, carrying r along.
+template <typename I>
+void ManagedLock<I>::handle_shutdown(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ wait_for_tracked_ops(r);
+}
+
+// Shutdown path for a held lock (invoked through C_ShutDownRelease):
+// schedules pre_release_lock_handler(true, ...) on the work queue, with
+// handle_shutdown_pre_release() as its completion. The queueing is done
+// while holding m_lock.
+template <typename I>
+void ManagedLock<I>::send_shutdown_release() {
+  ldout(m_cct, 10) << dendl;
+
+  Mutex::Locker locker(m_lock);
+
+  Context *run_pre_release = new FunctionContext([this](int r) {
+      pre_release_lock_handler(true, create_context_callback<
+        ManagedLock<I>, &ManagedLock<I>::handle_shutdown_pre_release>(this));
+    });
+  m_work_queue->queue(run_pre_release);
+}
+
+// Completion of the pre-release hook on the shutdown path. Moves the state
+// from PRE_SHUTTING_DOWN to SHUTTING_DOWN and sends a ReleaseRequest for the
+// current cookie directly (not via the work queue). When the release
+// finishes, post_release_lock_handler(true, ...) receives the pre-release
+// error r if it was negative, otherwise the release result.
+template <typename I>
+void ManagedLock<I>::handle_shutdown_pre_release(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  std::string cookie;
+  {
+    // snapshot the cookie under m_lock; the request is built outside of it
+    Mutex::Locker locker(m_lock);
+    cookie = m_cookie;
+
+    ceph_assert(m_state == STATE_PRE_SHUTTING_DOWN);
+    m_state = STATE_SHUTTING_DOWN;
+  }
+
+  Context *on_released = new FunctionContext([this, r](int release_r) {
+      // an earlier pre-release failure takes precedence
+      const int post_r = (r < 0 ? r : release_r);
+      post_release_lock_handler(true, post_r, create_context_callback<
+        ManagedLock<I>, &ManagedLock<I>::handle_shutdown_post_release>(this));
+    });
+
+  using managed_lock::ReleaseRequest;
+  ReleaseRequest<I>* release_req = ReleaseRequest<I>::create(
+    m_ioctx, m_watcher, m_work_queue, m_oid, cookie, on_released);
+  release_req->send();
+}
+
+// Completion of post_release_lock_handler() on the shutdown path: proceeds
+// to wait for outstanding tracked operations, carrying r along.
+template <typename I>
+void ManagedLock<I>::handle_shutdown_post_release(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ wait_for_tracked_ops(r);
+}
+
+// Registers a waiter on m_async_op_tracker that finishes the shutdown with r
+// once the tracker invokes it. The tracker's own result (ret) is ignored.
+template <typename I>
+void ManagedLock<I>::wait_for_tracked_ops(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  m_async_op_tracker.wait_for_ops(new FunctionContext([this, r](int ret) {
+      complete_shutdown(r);
+    }));
+}
+
+// Final step of the shutdown action: logs a failure, removes the shutdown
+// action (which must be the only one left in the queue), enters
+// STATE_SHUTDOWN, and fires the action's callbacks with r outside of m_lock.
+template <typename I>
+void ManagedLock<I>::complete_shutdown(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to shut down lock: " << cpp_strerror(r)
+                 << dendl;
+  }
+
+  Contexts shutdown_contexts;
+  {
+    Mutex::Locker locker(m_lock);
+    ceph_assert(m_lock.is_locked());
+    ceph_assert(m_actions_contexts.size() == 1);
+
+    shutdown_contexts = std::move(m_actions_contexts.front().second);
+    m_actions_contexts.pop_front();
+    m_state = STATE_SHUTDOWN;
+  }
+
+  // expect to be destroyed after firing callback
+  for (Context *on_shutdown : shutdown_contexts) {
+    on_shutdown->complete(r);
+  }
+}
+
+} // namespace librbd
+
+template class librbd::ManagedLock<librbd::ImageCtx>;
diff --git a/src/librbd/ManagedLock.h b/src/librbd/ManagedLock.h
new file mode 100644
index 00000000..b27e549d
--- /dev/null
+++ b/src/librbd/ManagedLock.h
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_H
+#define CEPH_LIBRBD_MANAGED_LOCK_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/AsyncOpTracker.h"
+#include "common/Mutex.h"
+#include "cls/lock/cls_lock_types.h"
+#include "librbd/watcher/Types.h"
+#include "librbd/managed_lock/Types.h"
+#include <list>
+#include <string>
+#include <utility>
+
+class ContextWQ;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace managed_lock { struct Locker; }
+
+// State machine around a lock on the object m_oid in m_ioctx. Public
+// requests (acquire, try-acquire, reacquire, release, shut down) are queued
+// as actions together with their completion callbacks and are processed one
+// at a time. Subclasses can hook into each phase through the protected
+// virtual *_handler() methods. All state is guarded by m_lock.
+template <typename ImageCtxT = librbd::ImageCtx>
+class ManagedLock {
+private:
+ typedef watcher::Traits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Watcher Watcher;
+
+public:
+ // heap factory; pair with destroy()
+ static ManagedLock *create(librados::IoCtx& ioctx, ContextWQ *work_queue,
+ const std::string& oid, Watcher *watcher,
+ managed_lock::Mode mode,
+ bool blacklist_on_break_lock,
+ uint32_t blacklist_expire_seconds) {
+ return new ManagedLock(ioctx, work_queue, oid, watcher, mode,
+ blacklist_on_break_lock, blacklist_expire_seconds);
+ }
+ // deletes this instance
+ void destroy() {
+ delete this;
+ }
+
+ ManagedLock(librados::IoCtx& ioctx, ContextWQ *work_queue,
+ const std::string& oid, Watcher *watcher,
+ managed_lock::Mode mode, bool blacklist_on_break_lock,
+ uint32_t blacklist_expire_seconds);
+ virtual ~ManagedLock();
+
+ bool is_lock_owner() const;
+
+ // asynchronous requests; each completes its Context when done
+ void shut_down(Context *on_shutdown);
+ void acquire_lock(Context *on_acquired);
+ void try_acquire_lock(Context *on_acquired);
+ void release_lock(Context *on_released);
+ void reacquire_lock(Context *on_reacquired);
+ void get_locker(managed_lock::Locker *locker, Context *on_finish);
+ void break_lock(const managed_lock::Locker &locker, bool force_break_lock,
+ Context *on_finish);
+
+ // synchronous check; returns 0 or a negative error code
+ int assert_header_locked();
+
+ // true once a shutdown is in progress, finished, or queued
+ bool is_shutdown() const {
+ Mutex::Locker l(m_lock);
+ return is_state_shutdown();
+ }
+
+protected:
+ mutable Mutex m_lock;
+
+ // State setters for subclasses. Each asserts that m_lock is held and that
+ // the transition starts from the expected state.
+ inline void set_state_uninitialized() {
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_state == STATE_UNLOCKED);
+ m_state = STATE_UNINITIALIZED;
+ }
+ inline void set_state_initializing() {
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_state == STATE_UNINITIALIZED);
+ m_state = STATE_INITIALIZING;
+ }
+ inline void set_state_unlocked() {
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_state == STATE_INITIALIZING || m_state == STATE_RELEASING);
+ m_state = STATE_UNLOCKED;
+ }
+ inline void set_state_waiting_for_lock() {
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_state == STATE_ACQUIRING);
+ m_state = STATE_WAITING_FOR_LOCK;
+ }
+ inline void set_state_post_acquiring() {
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_state == STATE_ACQUIRING);
+ m_state = STATE_POST_ACQUIRING;
+ }
+
+ // State queries for subclasses; all require m_lock to be held.
+ bool is_state_shutdown() const;
+ inline bool is_state_acquiring() const {
+ ceph_assert(m_lock.is_locked());
+ return m_state == STATE_ACQUIRING;
+ }
+ inline bool is_state_post_acquiring() const {
+ ceph_assert(m_lock.is_locked());
+ return m_state == STATE_POST_ACQUIRING;
+ }
+ inline bool is_state_releasing() const {
+ ceph_assert(m_lock.is_locked());
+ return m_state == STATE_RELEASING;
+ }
+ inline bool is_state_pre_releasing() const {
+ ceph_assert(m_lock.is_locked());
+ return m_state == STATE_PRE_RELEASING;
+ }
+ inline bool is_state_locked() const {
+ ceph_assert(m_lock.is_locked());
+ return m_state == STATE_LOCKED;
+ }
+ inline bool is_state_waiting_for_lock() const {
+ ceph_assert(m_lock.is_locked());
+ return m_state == STATE_WAITING_FOR_LOCK;
+ }
+
+ // true when the action at the front of the queue is ACTION_ACQUIRE_LOCK
+ inline bool is_action_acquire_lock() const {
+ ceph_assert(m_lock.is_locked());
+ return get_active_action() == ACTION_ACQUIRE_LOCK;
+ }
+
+ // Phase hooks. The base implementations complete on_finish immediately
+ // (with 0 for the pre-* hooks, with r otherwise).
+ virtual void shutdown_handler(int r, Context *on_finish);
+ virtual void pre_acquire_lock_handler(Context *on_finish);
+ virtual void post_acquire_lock_handler(int r, Context *on_finish);
+ virtual void pre_release_lock_handler(bool shutting_down,
+ Context *on_finish);
+ virtual void post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish);
+ virtual void post_reacquire_lock_handler(int r, Context *on_finish);
+
+ // starts the action at the front of the queue; requires m_lock
+ void execute_next_action();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |
+ * v (acquire_lock)
+ * UNLOCKED -----------------------------------------> ACQUIRING
+ * ^ |
+ * | |
+ * RELEASING |
+ * | |
+ * | |
+ * | (release_lock) v
+ * PRE_RELEASING <----------------------------------------- LOCKED
+ *
+ * <LOCKED state>
+ * |
+ * v
+ * REACQUIRING -------------------------------------> <finish>
+ * . ^
+ * . |
+ * . . . > <RELEASE action> ---> <ACQUIRE action> ---/
+ *
+ * <UNLOCKED/LOCKED states>
+ * |
+ * |
+ * v
+ * PRE_SHUTTING_DOWN ---> SHUTTING_DOWN ---> SHUTDOWN ---> <finish>
+ *
+ * @endverbatim
+ */
+ enum State {
+ STATE_UNINITIALIZED,
+ STATE_INITIALIZING,
+ STATE_UNLOCKED,
+ STATE_LOCKED,
+ STATE_ACQUIRING,
+ STATE_POST_ACQUIRING,
+ STATE_WAITING_FOR_REGISTER,
+ STATE_WAITING_FOR_LOCK,
+ STATE_REACQUIRING,
+ STATE_PRE_RELEASING,
+ STATE_RELEASING,
+ STATE_PRE_SHUTTING_DOWN,
+ STATE_SHUTTING_DOWN,
+ STATE_SHUTDOWN,
+ };
+
+ // kinds of queued requests
+ enum Action {
+ ACTION_TRY_LOCK,
+ ACTION_ACQUIRE_LOCK,
+ ACTION_REACQUIRE_LOCK,
+ ACTION_RELEASE_LOCK,
+ ACTION_SHUT_DOWN
+ };
+
+ // an action plus the callbacks waiting on it; the list is the action queue
+ typedef std::list<Context *> Contexts;
+ typedef std::pair<Action, Contexts> ActionContexts;
+ typedef std::list<ActionContexts> ActionsContexts;
+
+ // work-queue item that starts the shutdown release sequence
+ struct C_ShutDownRelease : public Context {
+ ManagedLock *lock;
+ C_ShutDownRelease(ManagedLock *lock)
+ : lock(lock) {
+ }
+ void finish(int r) override {
+ lock->send_shutdown_release();
+ }
+ };
+
+ librados::IoCtx& m_ioctx;
+ CephContext *m_cct;
+ ContextWQ *m_work_queue;
+ std::string m_oid;
+ Watcher *m_watcher;
+ managed_lock::Mode m_mode;
+ bool m_blacklist_on_break_lock;
+ uint32_t m_blacklist_expire_seconds;
+
+ // cookie of the current lock, and the candidate cookie during a reacquire
+ std::string m_cookie;
+ std::string m_new_cookie;
+
+ State m_state;
+ // state to enter once the post-acquire / post-release hook has completed
+ State m_post_next_state;
+
+ // pending actions; the front entry is the active one
+ ActionsContexts m_actions_contexts;
+ AsyncOpTracker m_async_op_tracker;
+
+ bool is_lock_owner(Mutex &lock) const;
+ bool is_transition_state() const;
+
+ void append_context(Action action, Context *ctx);
+ void execute_action(Action action, Context *ctx);
+
+ Action get_active_action() const;
+ void complete_active_action(State next_state, int r);
+
+ void send_acquire_lock();
+ void handle_pre_acquire_lock(int r);
+ void handle_acquire_lock(int r);
+ void handle_no_op_reacquire_lock(int r);
+
+ void handle_post_acquire_lock(int r);
+ void revert_to_unlock_state(int r);
+
+ void send_reacquire_lock();
+ void handle_reacquire_lock(int r);
+ void release_acquire_lock();
+
+ void send_release_lock();
+ void handle_pre_release_lock(int r);
+ void handle_release_lock(int r);
+ void handle_post_release_lock(int r);
+
+ void send_shutdown();
+ void handle_shutdown(int r);
+ void send_shutdown_release();
+ void handle_shutdown_pre_release(int r);
+ void handle_shutdown_post_release(int r);
+ void wait_for_tracked_ops(int r);
+ void complete_shutdown(int r);
+};
+
+} // namespace librbd
+
+extern template class librbd::ManagedLock<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_H
diff --git a/src/librbd/MirroringWatcher.cc b/src/librbd/MirroringWatcher.cc
new file mode 100644
index 00000000..f22dc149
--- /dev/null
+++ b/src/librbd/MirroringWatcher.cc
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/MirroringWatcher.h"
+#include "include/rbd_types.h"
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "librbd/Utils.h"
+#include "librbd/watcher/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MirroringWatcher: "
+
+namespace librbd {
+
+using namespace mirroring_watcher;
+using namespace watcher;
+
+using librbd::util::create_rados_callback;
+
+namespace {
+
+// timeout handed to IoCtx::aio_notify() for mirroring notifications
+// (milliseconds, per the name)
+static const uint64_t NOTIFY_TIMEOUT_MS = 5000;
+
+} // anonymous namespace
+
+// Constructs a watcher bound to the RBD_MIRRORING object in io_ctx; all
+// arguments are forwarded to the Watcher base class.
+template <typename I>
+MirroringWatcher<I>::MirroringWatcher(librados::IoCtx &io_ctx,
+ ContextWQ *work_queue)
+ : Watcher(io_ctx, work_queue, RBD_MIRRORING) {
+}
+
+// Blocking variant: sends the mode-updated notification and waits for the
+// asynchronous variant to complete, returning its result.
+template <typename I>
+int MirroringWatcher<I>::notify_mode_updated(librados::IoCtx &io_ctx,
+                                             cls::rbd::MirrorMode mirror_mode) {
+  C_SaferCond on_notified;
+  notify_mode_updated(io_ctx, mirror_mode, &on_notified);
+  return on_notified.wait();
+}
+
+// Asynchronous variant: encodes a NotifyMessage carrying a
+// ModeUpdatedPayload and sends it to the RBD_MIRRORING object with
+// aio_notify(). on_finish is attached to the AIO completion via
+// create_rados_callback(). Submission of the notify is asserted to succeed.
+template <typename I>
+void MirroringWatcher<I>::notify_mode_updated(librados::IoCtx &io_ctx,
+                                              cls::rbd::MirrorMode mirror_mode,
+                                              Context *on_finish) {
+  CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  bufferlist payload_bl;
+  encode(NotifyMessage{ModeUpdatedPayload{mirror_mode}}, payload_bl);
+
+  librados::AioCompletion *aio_comp = create_rados_callback(on_finish);
+  int r = io_ctx.aio_notify(RBD_MIRRORING, aio_comp, payload_bl,
+                            NOTIFY_TIMEOUT_MS, nullptr);
+  ceph_assert(r == 0);
+
+  // drop the local reference to the completion
+  aio_comp->release();
+}
+
+// Blocking variant: sends the image-updated notification and waits for the
+// asynchronous variant to complete, returning its result.
+template <typename I>
+int MirroringWatcher<I>::notify_image_updated(
+    librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state,
+    const std::string &image_id, const std::string &global_image_id) {
+  C_SaferCond on_notified;
+  notify_image_updated(io_ctx, mirror_image_state, image_id, global_image_id,
+                       &on_notified);
+  return on_notified.wait();
+}
+
+// Asynchronous variant: encodes a NotifyMessage carrying an
+// ImageUpdatedPayload (state, image id, global image id) and sends it to the
+// RBD_MIRRORING object with aio_notify(). on_finish is attached to the AIO
+// completion via create_rados_callback(). Submission of the notify is
+// asserted to succeed.
+template <typename I>
+void MirroringWatcher<I>::notify_image_updated(
+    librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state,
+    const std::string &image_id, const std::string &global_image_id,
+    Context *on_finish) {
+  CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  bufferlist payload_bl;
+  encode(NotifyMessage{ImageUpdatedPayload{
+           mirror_image_state, image_id, global_image_id}}, payload_bl);
+
+  librados::AioCompletion *aio_comp = create_rados_callback(on_finish);
+  int r = io_ctx.aio_notify(RBD_MIRRORING, aio_comp, payload_bl,
+                            NOTIFY_TIMEOUT_MS, nullptr);
+  ceph_assert(r == 0);
+
+  // drop the local reference to the completion
+  aio_comp->release();
+}
+
+// Watch callback: decodes the incoming NotifyMessage and dispatches its
+// payload to the matching handle_payload() overload through
+// HandlePayloadVisitor. If decoding fails, the error is logged and a
+// C_NotifyAck for this notification is completed with 0 right away.
+template <typename I>
+void MirroringWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+                                        uint64_t notifier_id, bufferlist &bl) {
+  CephContext *cct = this->m_cct;
+  ldout(cct, 15) << ": notify_id=" << notify_id << ", "
+                 << "handle=" << handle << dendl;
+
+  NotifyMessage message;
+  try {
+    auto bl_it = bl.cbegin();
+    decode(message, bl_it);
+  } catch (const buffer::error &err) {
+    lderr(cct) << ": error decoding image notification: " << err.what()
+               << dendl;
+
+    Context *on_ack = new C_NotifyAck(this, notify_id, handle);
+    on_ack->complete(0);
+    return;
+  }
+
+  apply_visitor(watcher::util::HandlePayloadVisitor<MirroringWatcher<I>>(
+                  this, notify_id, handle), message.payload);
+}
+
+// Handles a mode-updated notification by forwarding the new mirror mode to
+// handle_mode_updated(). on_notify_ack is not used here. Always returns true
+// (NOTE(review): presumably tells the visitor to ack immediately -- confirm
+// against HandlePayloadVisitor).
+template <typename I>
+bool MirroringWatcher<I>::handle_payload(const ModeUpdatedPayload &payload,
+                                         Context *on_notify_ack) {
+  ldout(this->m_cct, 20) << ": mode updated: " << payload.mirror_mode
+                         << dendl;
+  handle_mode_updated(payload.mirror_mode);
+  return true;
+}
+
+// Handles an image-updated notification by forwarding the state and both
+// image ids to handle_image_updated(). on_notify_ack is not used here.
+// Always returns true (NOTE(review): presumably tells the visitor to ack
+// immediately -- confirm against HandlePayloadVisitor).
+template <typename I>
+bool MirroringWatcher<I>::handle_payload(const ImageUpdatedPayload &payload,
+                                         Context *on_notify_ack) {
+  ldout(this->m_cct, 20) << ": image state updated" << dendl;
+  handle_image_updated(payload.mirror_image_state, payload.image_id,
+                       payload.global_image_id);
+  return true;
+}
+
+// Unrecognized payload types are ignored: nothing is dispatched and true is
+// returned, same as for the handled payload types.
+template <typename I>
+bool MirroringWatcher<I>::handle_payload(const UnknownPayload &payload,
+ Context *on_notify_ack) {
+ return true;
+}
+
+} // namespace librbd
+
+template class librbd::MirroringWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/MirroringWatcher.h b/src/librbd/MirroringWatcher.h
new file mode 100644
index 00000000..4c72080b
--- /dev/null
+++ b/src/librbd/MirroringWatcher.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_H
+#define CEPH_LIBRBD_MIRRORING_WATCHER_H
+
+#include "include/int_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Watcher.h"
+#include "librbd/mirroring_watcher/Types.h"
+
+namespace librbd {
+
+namespace watcher {
+namespace util {
+template <typename> struct HandlePayloadVisitor;
+}
+}
+
+// Watcher for mirroring notifications. The static notify_*() helpers send
+// notifications; received notifications are decoded and delivered to the
+// pure-virtual handle_mode_updated() / handle_image_updated() callbacks that
+// subclasses must implement.
+template <typename ImageCtxT = librbd::ImageCtx>
+class MirroringWatcher : public Watcher {
+ // the visitor calls the private handle_payload() overloads
+ friend struct watcher::util::HandlePayloadVisitor<MirroringWatcher<ImageCtxT>>;
+
+public:
+ MirroringWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue);
+
+ // mirror mode changed: blocking (returns result) and async (on_finish)
+ static int notify_mode_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorMode mirror_mode);
+ static void notify_mode_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorMode mirror_mode,
+ Context *on_finish);
+
+ // mirror image state changed: blocking (returns result) and async
+ static int notify_image_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorImageState mirror_image_state,
+ const std::string &image_id,
+ const std::string &global_image_id);
+ static void notify_image_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorImageState mirror_image_state,
+ const std::string &image_id,
+ const std::string &global_image_id,
+ Context *on_finish);
+
+ // callbacks invoked for received notifications
+ virtual void handle_mode_updated(cls::rbd::MirrorMode mirror_mode) = 0;
+ virtual void handle_image_updated(cls::rbd::MirrorImageState state,
+ const std::string &image_id,
+ const std::string &global_image_id) = 0;
+
+private:
+ // one overload per payload type carried by a NotifyMessage
+ bool handle_payload(const mirroring_watcher::ModeUpdatedPayload &payload,
+ Context *on_notify_ack);
+ bool handle_payload(const mirroring_watcher::ImageUpdatedPayload &payload,
+ Context *on_notify_ack);
+ bool handle_payload(const mirroring_watcher::UnknownPayload &payload,
+ Context *on_notify_ack);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+};
+
+} // namespace librbd
+
+extern template class librbd::MirroringWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRRORING_WATCHER_H
diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc
new file mode 100644
index 00000000..50cbfda8
--- /dev/null
+++ b/src/librbd/ObjectMap.cc
@@ -0,0 +1,364 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ObjectMap.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/object_map/RefreshRequest.h"
+#include "librbd/object_map/ResizeRequest.h"
+#include "librbd/object_map/SnapshotCreateRequest.h"
+#include "librbd/object_map/SnapshotRemoveRequest.h"
+#include "librbd/object_map/SnapshotRollbackRequest.h"
+#include "librbd/object_map/UnlockRequest.h"
+#include "librbd/object_map/UpdateRequest.h"
+#include "librbd/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+
+#include "include/rados/librados.hpp"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/stringify.h"
+#include "osdc/Striper.h"
+#include <iomanip>
+#include <sstream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ObjectMap: " << this << " " << __func__ \
+ << ": "
+
+namespace librbd {
+
+// Binds the object map to its image context and snapshot id and allocates
+// the guard used to track in-flight updates.
+template <typename I>
+ObjectMap<I>::ObjectMap(I &image_ctx, uint64_t snap_id)
+  : m_image_ctx(image_ctx),
+    m_snap_id(snap_id),
+    m_update_guard(new UpdateGuard(m_image_ctx.cct))
+{
+}
+
+// Releases the update guard allocated by the constructor.
+template <typename I>
+ObjectMap<I>::~ObjectMap() {
+ delete m_update_guard;
+}
+
+// Asynchronously removes the object map object that carries no snapshot
+// suffix (CEPH_NOSNAP) for the given image id.  Returns the result of
+// IoCtx::aio_remove().
+template <typename I>
+int ObjectMap<I>::aio_remove(librados::IoCtx &io_ctx,
+                             const std::string &image_id,
+                             librados::AioCompletion *c) {
+  const std::string head_oid = object_map_name(image_id, CEPH_NOSNAP);
+  return io_ctx.aio_remove(head_oid, c);
+}
+
+// Builds the RADOS object name of an object map.
+//
+// The name is RBD_OBJECT_MAP_PREFIX followed by the image id.  For any
+// snap_id other than CEPH_NOSNAP a suffix of the form ".<snap id>" is
+// appended, where the id is rendered as 16 zero-padded hex digits.
+//
+// The stream manipulators used here (std::setfill / std::setw) live in
+// <iomanip>, which this file must include explicitly.
+template <typename I>
+std::string ObjectMap<I>::object_map_name(const std::string &image_id,
+                                          uint64_t snap_id) {
+  std::string oid(RBD_OBJECT_MAP_PREFIX + image_id);
+  if (snap_id == CEPH_NOSNAP) {
+    return oid;
+  }
+
+  std::stringstream snap_suffix;
+  snap_suffix << "." << std::setfill('0') << std::setw(16) << std::hex
+              << snap_id;
+  oid += snap_suffix.str();
+  return oid;
+}
+
+// Returns true when an image of the given layout and size needs no more
+// objects than cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT.
+template <typename I>
+bool ObjectMap<I>::is_compatible(const file_layout_t& layout, uint64_t size) {
+  const uint64_t num_objects = Striper::get_num_objects(layout, size);
+  if (num_objects > cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT) {
+    return false;
+  }
+  return true;
+}
+
+// Mutable element access.  The caller must hold object_map_lock for write
+// and object_no must be within the bounds of the in-memory map.
+template <typename I>
+ceph::BitVector<2u>::Reference ObjectMap<I>::operator[](uint64_t object_no) {
+  ceph_assert(m_image_ctx.object_map_lock.is_wlocked());
+
+  const uint64_t object_count = m_object_map.size();
+  ceph_assert(object_no < object_count);
+
+  return m_object_map[object_no];
+}
+
+// Read-only element access.  The caller must hold object_map_lock (read or
+// write) and object_no must be within the bounds of the in-memory map.
+template <typename I>
+uint8_t ObjectMap<I>::operator[](uint64_t object_no) const {
+  ceph_assert(m_image_ctx.object_map_lock.is_locked());
+
+  const uint64_t object_count = m_object_map.size();
+  ceph_assert(object_no < object_count);
+
+  return m_object_map[object_no];
+}
+
+// Returns false only when the object map positively records the object as
+// OBJECT_NONEXISTENT.  Requires snap_lock to be held by the caller.
+template <typename I>
+bool ObjectMap<I>::object_may_exist(uint64_t object_no) const {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+
+  // With the feature disabled the map cannot be consulted: assume the
+  // object might exist.
+  const bool feature_enabled = m_image_ctx.test_features(
+    RBD_FEATURE_OBJECT_MAP, m_image_ctx.snap_lock);
+  if (!feature_enabled) {
+    return true;
+  }
+
+  // Same conservative answer when the flag lookup fails or the map is
+  // flagged invalid.
+  bool map_invalid;
+  const int r = m_image_ctx.test_flags(m_image_ctx.snap_id,
+                                       RBD_FLAG_OBJECT_MAP_INVALID,
+                                       m_image_ctx.snap_lock, &map_invalid);
+  if (r < 0 || map_invalid) {
+    return true;
+  }
+
+  RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock);
+  const uint8_t object_state = (*this)[object_no];
+  const bool may_exist = (object_state != OBJECT_NONEXISTENT);
+  ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r=" << may_exist
+                             << dendl;
+  return may_exist;
+}
+
+// Returns false only when the object map records the object as
+// OBJECT_EXISTS or OBJECT_EXISTS_CLEAN.  Requires snap_lock to be held by
+// the caller.
+template <typename I>
+bool ObjectMap<I>::object_may_not_exist(uint64_t object_no) const {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+
+  // With the feature disabled the map cannot be consulted: assume the
+  // object might be missing.
+  const bool feature_enabled = m_image_ctx.test_features(
+    RBD_FEATURE_OBJECT_MAP, m_image_ctx.snap_lock);
+  if (!feature_enabled) {
+    return true;
+  }
+
+  // Same conservative answer when the flag lookup fails or the map is
+  // flagged invalid.
+  bool map_invalid;
+  const int r = m_image_ctx.test_flags(m_image_ctx.snap_id,
+                                       RBD_FLAG_OBJECT_MAP_INVALID,
+                                       m_image_ctx.snap_lock, &map_invalid);
+  if (r < 0 || map_invalid) {
+    return true;
+  }
+
+  RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock);
+  const uint8_t object_state = (*this)[object_no];
+  const bool known_to_exist = (object_state == OBJECT_EXISTS ||
+                               object_state == OBJECT_EXISTS_CLEAN);
+  const bool nonexistent = !known_to_exist;
+  ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r="
+                             << nonexistent << dendl;
+  return nonexistent;
+}
+
+// Decides whether the entry referenced by `it` has to change in order to
+// reach new_state.  Requires object_map_lock to be held for write.
+template <typename I>
+bool ObjectMap<I>::update_required(const ceph::BitVector<2>::Iterator& it,
+                                   uint8_t new_state) {
+  ceph_assert(m_image_ctx.object_map_lock.is_wlocked());
+
+  const uint8_t current = *it;
+
+  // already in the requested state
+  if (current == new_state) {
+    return false;
+  }
+
+  // a nonexistent object is never moved to pending
+  if (new_state == OBJECT_PENDING && current == OBJECT_NONEXISTENT) {
+    return false;
+  }
+
+  // only a pending object may transition to nonexistent
+  if (new_state == OBJECT_NONEXISTENT && current != OBJECT_PENDING) {
+    return false;
+  }
+
+  return true;
+}
+
+// Issues a RefreshRequest that fills m_object_map for m_snap_id and
+// completes on_finish.
+template <typename I>
+void ObjectMap<I>::open(Context *on_finish) {
+  auto refresh_req = object_map::RefreshRequest<I>::create(
+    m_image_ctx, &m_object_map, m_snap_id, on_finish);
+
+  refresh_req->send();
+}
+
+// Closes the object map.  Only the map opened with CEPH_NOSNAP issues an
+// UnlockRequest; for any other snap id on_finish is simply queued with
+// result 0.
+template <typename I>
+void ObjectMap<I>::close(Context *on_finish) {
+  const bool is_head = (m_snap_id == CEPH_NOSNAP);
+  if (is_head) {
+    auto unlock_req = object_map::UnlockRequest<I>::create(m_image_ctx,
+                                                           on_finish);
+    unlock_req->send();
+    return;
+  }
+
+  m_image_ctx.op_work_queue->queue(on_finish, 0);
+}
+
+// Replaces the in-memory object map with a copy of target_object_map.
+//
+// The caller must hold owner_lock and snap_lock, and the object-map feature
+// must be enabled.  Always returns true.
+//
+// The assignment mutates m_object_map, so object_map_lock is taken for
+// write: a shared (read) lock would allow concurrent readers such as
+// object_may_exist() to observe the map while it is being overwritten.
+template <typename I>
+bool ObjectMap<I>::set_object_map(ceph::BitVector<2> &target_object_map) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                        m_image_ctx.snap_lock));
+  RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+  m_object_map = target_object_map;
+  return true;
+}
+
+// Starts a SnapshotRollbackRequest for snap_id.  The caller must hold
+// snap_lock and hold object_map_lock for write.
+template <typename I>
+void ObjectMap<I>::rollback(uint64_t snap_id, Context *on_finish) {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert(m_image_ctx.object_map_lock.is_wlocked());
+
+  auto *rollback_req = new object_map::SnapshotRollbackRequest(
+    m_image_ctx, snap_id, on_finish);
+  rollback_req->send();
+}
+
+// Starts a SnapshotCreateRequest for snap_id.  The caller must hold
+// snap_lock, the object-map feature bit must be set and snap_id must not be
+// CEPH_NOSNAP.
+template <typename I>
+void ObjectMap<I>::snapshot_add(uint64_t snap_id, Context *on_finish) {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
+  ceph_assert(snap_id != CEPH_NOSNAP);
+
+  auto *create_req = new object_map::SnapshotCreateRequest(
+    m_image_ctx, &m_object_map, snap_id, on_finish);
+  create_req->send();
+}
+
+// Starts a SnapshotRemoveRequest for snap_id.  The caller must hold
+// snap_lock for write, the object-map feature bit must be set and snap_id
+// must not be CEPH_NOSNAP.
+template <typename I>
+void ObjectMap<I>::snapshot_remove(uint64_t snap_id, Context *on_finish) {
+  ceph_assert(m_image_ctx.snap_lock.is_wlocked());
+  ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
+  ceph_assert(snap_id != CEPH_NOSNAP);
+
+  auto *remove_req = new object_map::SnapshotRemoveRequest(
+    m_image_ctx, &m_object_map, snap_id, on_finish);
+  remove_req->send();
+}
+
+// Asynchronously writes the in-memory object map to its RADOS object and
+// completes on_finish with the result.  The caller must hold owner_lock and
+// snap_lock, and the object-map feature must be enabled.
+template <typename I>
+void ObjectMap<I>::aio_save(Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                        m_image_ctx.snap_lock));
+  RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock);
+
+  librados::ObjectWriteOperation save_op;
+  const bool is_head = (m_snap_id == CEPH_NOSNAP);
+  if (is_head) {
+    // the CEPH_NOSNAP map additionally asserts the exclusive lock is held
+    rados::cls::lock::assert_locked(&save_op, RBD_LOCK_NAME, LOCK_EXCLUSIVE,
+                                    "", "");
+  }
+  cls_client::object_map_save(&save_op, m_object_map);
+
+  const std::string oid = object_map_name(m_image_ctx.id, m_snap_id);
+  librados::AioCompletion *rados_completion =
+    util::create_rados_callback(on_finish);
+
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &save_op);
+  ceph_assert(r == 0);
+  rados_completion->release();
+}
+
+// Starts a ResizeRequest that resizes the object map to new_size, passing
+// default_object_state through to the request.  The caller must hold
+// owner_lock and snap_lock, the object-map feature must be enabled, an
+// image watcher must exist and, if exclusive locking is in use, this client
+// must own the lock.
+template <typename I>
+void ObjectMap<I>::aio_resize(uint64_t new_size, uint8_t default_object_state,
+                              Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                        m_image_ctx.snap_lock));
+  ceph_assert(m_image_ctx.image_watcher != nullptr);
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  auto *resize_req = new object_map::ResizeRequest(
+    m_image_ctx, &m_object_map, m_snap_id, new_size, default_object_state,
+    on_finish);
+  resize_req->send();
+}
+
+// Submits an update through the update guard.  If the guard reports an
+// overlapping in-flight update (detain() > 0) the operation is left with
+// the guard; on a detain error the operation's callback is queued with the
+// error; otherwise the update is started and the guard cell is released
+// from handle_detained_aio_update() once it completes.
+//
+// The caller must hold snap_lock and hold object_map_lock for write.
+template <typename I>
+void ObjectMap<I>::detained_aio_update(UpdateOperation &&op) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert(m_image_ctx.object_map_lock.is_wlocked());
+
+  BlockGuardCell *cell;
+  const int r = m_update_guard->detain({op.start_object_no, op.end_object_no},
+                                       &op, &cell);
+  if (r < 0) {
+    lderr(cct) << "failed to detain object map update: " << cpp_strerror(r)
+               << dendl;
+    m_image_ctx.op_work_queue->queue(op.on_finish, r);
+    return;
+  }
+
+  if (r > 0) {
+    ldout(cct, 20) << "detaining object map update due to in-flight update: "
+                   << "start=" << op.start_object_no << ", "
+                   << "end=" << op.end_object_no << ", "
+                   << (op.current_state ?
+                         stringify(static_cast<uint32_t>(*op.current_state)) :
+                         "")
+                   << "->" << static_cast<uint32_t>(op.new_state) << dendl;
+    return;
+  }
+
+  ldout(cct, 20) << "in-flight update cell: " << cell << dendl;
+
+  // wrap the caller's callback so the cell is released first
+  Context *user_ctx = op.on_finish;
+  Context *release_ctx = new FunctionContext([this, cell, user_ctx](int r) {
+      handle_detained_aio_update(cell, r, user_ctx);
+    });
+  aio_update(CEPH_NOSNAP, op.start_object_no, op.end_object_no, op.new_state,
+             op.current_state, op.parent_trace, op.ignore_enoent, release_ctx);
+}
+
+// Completion handler for an update started by detained_aio_update().
+// Releases the guard cell, resubmits every operation the guard had queued
+// behind it, and finally completes on_finish with r.
+template <typename I>
+void ObjectMap<I>::handle_detained_aio_update(BlockGuardCell *cell, int r,
+                                              Context *on_finish) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "cell=" << cell << ", r=" << r << dendl;
+
+  typename UpdateGuard::BlockOperations pending_ops;
+  m_update_guard->release(cell, &pending_ops);
+
+  {
+    // detained_aio_update() requires both locks
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+    for (auto op_it = pending_ops.begin(); op_it != pending_ops.end();
+         ++op_it) {
+      detained_aio_update(std::move(*op_it));
+    }
+  }
+
+  on_finish->complete(r);
+}
+
+// Starts an UpdateRequest that moves objects [start_object_no,
+// end_object_no) to new_state.
+//
+// For CEPH_NOSNAP the range is first clamped to the in-memory map; when the
+// clamped range is empty, or no object in it requires a change, on_finish
+// is queued with result 0 and no request is sent.
+//
+// The caller must hold snap_lock (and object_map_lock for write when
+// snap_id is CEPH_NOSNAP).
+template <typename I>
+void ObjectMap<I>::aio_update(uint64_t snap_id, uint64_t start_object_no,
+                              uint64_t end_object_no, uint8_t new_state,
+                              const boost::optional<uint8_t> &current_state,
+                              const ZTracer::Trace &parent_trace,
+                              bool ignore_enoent, Context *on_finish) {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
+  ceph_assert(m_image_ctx.image_watcher != nullptr);
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+  ceph_assert(snap_id != CEPH_NOSNAP ||
+              m_image_ctx.object_map_lock.is_wlocked());
+  ceph_assert(start_object_no < end_object_no);
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "start=" << start_object_no << ", "
+                 << "end=" << end_object_no << ", "
+                 << (current_state ?
+                       stringify(static_cast<uint32_t>(*current_state)) : "")
+                 << "->" << static_cast<uint32_t>(new_state) << dendl;
+
+  if (snap_id == CEPH_NOSNAP) {
+    end_object_no = std::min(end_object_no, m_object_map.size());
+    if (start_object_no >= end_object_no) {
+      ldout(cct, 20) << "skipping update of invalid object map" << dendl;
+      m_image_ctx.op_work_queue->queue(on_finish, 0);
+      return;
+    }
+
+    // look for the first object whose state would actually change
+    bool change_needed = false;
+    const auto range_end = m_object_map.begin() + end_object_no;
+    for (auto cursor = m_object_map.begin() + start_object_no;
+         cursor != range_end; ++cursor) {
+      if (update_required(cursor, new_state)) {
+        change_needed = true;
+        break;
+      }
+    }
+
+    if (!change_needed) {
+      ldout(cct, 20) << "object map update not required" << dendl;
+      m_image_ctx.op_work_queue->queue(on_finish, 0);
+      return;
+    }
+  }
+
+  auto update_req = object_map::UpdateRequest<I>::create(
+    m_image_ctx, &m_object_map, snap_id, start_object_no, end_object_no,
+    new_state, current_state, parent_trace, ignore_enoent, on_finish);
+  update_req->send();
+}
+
+} // namespace librbd
+
+template class librbd::ObjectMap<librbd::ImageCtx>;
+
diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h
new file mode 100644
index 00000000..c930a5b5
--- /dev/null
+++ b/src/librbd/ObjectMap.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_H
+#define CEPH_LIBRBD_OBJECT_MAP_H
+
+#include "include/int_types.h"
+#include "include/fs_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/object_map_types.h"
+#include "common/bit_vector.hpp"
+#include "librbd/Utils.h"
+#include <boost/optional.hpp>
+
+class Context;
+class RWLock;
+namespace ZTracer { struct Trace; }
+
+namespace librbd {
+
+template <typename Op> class BlockGuard;
+struct BlockGuardCell;
+class ImageCtx;
+
+// In-memory view of an image's object map (a 2-bit-per-object BitVector)
+// for a given snapshot id, together with the asynchronous operations that
+// load, save, resize and update it.
+template <typename ImageCtxT = ImageCtx>
+class ObjectMap {
+public:
+ // Heap-allocates a new ObjectMap; the caller owns the returned pointer.
+ static ObjectMap *create(ImageCtxT &image_ctx, uint64_t snap_id) {
+ return new ObjectMap(image_ctx, snap_id);
+ }
+
+ ObjectMap(ImageCtxT &image_ctx, uint64_t snap_id);
+ ~ObjectMap();
+
+ static int aio_remove(librados::IoCtx &io_ctx, const std::string &image_id, librados::AioCompletion *c);
+ static std::string object_map_name(const std::string &image_id,
+ uint64_t snap_id);
+
+ static bool is_compatible(const file_layout_t& layout, uint64_t size);
+
+ // Per-object state accessors (mutable reference / read-only value).
+ ceph::BitVector<2u>::Reference operator[](uint64_t object_no);
+ uint8_t operator[](uint64_t object_no) const;
+ // Number of entries in the in-memory map.
+ inline uint64_t size() const {
+ return m_object_map.size();
+ }
+
+ void open(Context *on_finish);
+ void close(Context *on_finish);
+ bool set_object_map(ceph::BitVector<2> &target_object_map);
+ bool object_may_exist(uint64_t object_no) const;
+ bool object_may_not_exist(uint64_t object_no) const;
+
+ void aio_save(Context *on_finish);
+ void aio_resize(uint64_t new_size, uint8_t default_object_state,
+ Context *on_finish);
+
+ // Single-object convenience overload: forwards to the range overload
+ // below with the range [start_object_no, start_object_no + 1).
+ template <typename T, void(T::*MF)(int) = &T::complete>
+ bool aio_update(uint64_t snap_id, uint64_t start_object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace, bool ignore_enoent,
+ T *callback_object) {
+ return aio_update<T, MF>(snap_id, start_object_no, start_object_no + 1,
+ new_state, current_state, parent_trace,
+ ignore_enoent, callback_object);
+ }
+
+ // Requests that objects [start_object_no, end_object_no) be moved to
+ // new_state; MF is the member of callback_object wrapped as the
+ // completion context.
+ //
+ // Returns true when an update was dispatched.  Returns false -- without
+ // creating a completion context -- when, for CEPH_NOSNAP, the range
+ // clamped to the in-memory map is empty or no object in it requires a
+ // change.
+ template <typename T, void(T::*MF)(int) = &T::complete>
+ bool aio_update(uint64_t snap_id, uint64_t start_object_no,
+ uint64_t end_object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace, bool ignore_enoent,
+ T *callback_object) {
+ ceph_assert(start_object_no < end_object_no);
+ if (snap_id == CEPH_NOSNAP) {
+ // never update past the end of the in-memory map
+ end_object_no = std::min(end_object_no, m_object_map.size());
+ if (start_object_no >= end_object_no) {
+ return false;
+ }
+
+ // stop at the first object whose state would actually change
+ auto it = m_object_map.begin() + start_object_no;
+ auto end_it = m_object_map.begin() + end_object_no;
+ for (; it != end_it; ++it) {
+ if (update_required(it, new_state)) {
+ break;
+ }
+ }
+
+ if (it == end_it) {
+ return false;
+ }
+
+ // CEPH_NOSNAP updates are routed through the update guard
+ UpdateOperation update_operation(start_object_no, end_object_no,
+ new_state, current_state, parent_trace,
+ ignore_enoent,
+ util::create_context_callback<T, MF>(
+ callback_object));
+ detained_aio_update(std::move(update_operation));
+ } else {
+ // updates for a specific snapshot bypass the guard
+ aio_update(snap_id, start_object_no, end_object_no, new_state,
+ current_state, parent_trace, ignore_enoent,
+ util::create_context_callback<T, MF>(callback_object));
+ }
+ return true;
+ }
+
+ void rollback(uint64_t snap_id, Context *on_finish);
+ void snapshot_add(uint64_t snap_id, Context *on_finish);
+ void snapshot_remove(uint64_t snap_id, Context *on_finish);
+
+private:
+ // Parameters of one queued update, as handed to the update guard.
+ struct UpdateOperation {
+ uint64_t start_object_no;
+ uint64_t end_object_no;
+ uint8_t new_state;
+ boost::optional<uint8_t> current_state;
+ ZTracer::Trace parent_trace;
+ bool ignore_enoent;
+ Context *on_finish;
+
+ UpdateOperation(uint64_t start_object_no, uint64_t end_object_no,
+ uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace,
+ bool ignore_enoent, Context *on_finish)
+ : start_object_no(start_object_no), end_object_no(end_object_no),
+ new_state(new_state), current_state(current_state),
+ parent_trace(parent_trace), ignore_enoent(ignore_enoent),
+ on_finish(on_finish) {
+ }
+ };
+
+ typedef BlockGuard<UpdateOperation> UpdateGuard;
+
+ ImageCtxT &m_image_ctx;
+ ceph::BitVector<2> m_object_map;
+ uint64_t m_snap_id;
+
+ // allocated in the constructor, deleted in the destructor
+ UpdateGuard *m_update_guard = nullptr;
+
+ void detained_aio_update(UpdateOperation &&update_operation);
+ void handle_detained_aio_update(BlockGuardCell *cell, int r,
+ Context *on_finish);
+
+ void aio_update(uint64_t snap_id, uint64_t start_object_no,
+ uint64_t end_object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace, bool ignore_enoent,
+ Context *on_finish);
+ bool update_required(const ceph::BitVector<2>::Iterator &it,
+ uint8_t new_state);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ObjectMap<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_H
diff --git a/src/librbd/Operations.cc b/src/librbd/Operations.cc
new file mode 100644
index 00000000..ffb8f5c8
--- /dev/null
+++ b/src/librbd/Operations.cc
@@ -0,0 +1,1793 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Operations.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/WorkQueue.h"
+#include "osdc/Striper.h"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/journal/DisabledPolicy.h"
+#include "librbd/journal/StandardPolicy.h"
+#include "librbd/operation/DisableFeaturesRequest.h"
+#include "librbd/operation/EnableFeaturesRequest.h"
+#include "librbd/operation/FlattenRequest.h"
+#include "librbd/operation/MetadataRemoveRequest.h"
+#include "librbd/operation/MetadataSetRequest.h"
+#include "librbd/operation/MigrateRequest.h"
+#include "librbd/operation/ObjectMapIterate.h"
+#include "librbd/operation/RebuildObjectMapRequest.h"
+#include "librbd/operation/RenameRequest.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/operation/SnapshotCreateRequest.h"
+#include "librbd/operation/SnapshotProtectRequest.h"
+#include "librbd/operation/SnapshotRemoveRequest.h"
+#include "librbd/operation/SnapshotRenameRequest.h"
+#include "librbd/operation/SnapshotRollbackRequest.h"
+#include "librbd/operation/SnapshotUnprotectRequest.h"
+#include "librbd/operation/SnapshotLimitRequest.h"
+#include "librbd/operation/SparsifyRequest.h"
+#include <set>
+#include <boost/bind.hpp>
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Operations: "
+
+namespace librbd {
+
+namespace {
+
+// Completion wrapper that, after a successful operation, sends an image
+// update notification before completing the wrapped context.
+//
+// complete() runs twice on the success path: the first call (operation
+// result) triggers notify_update(this); the second call (notification
+// result) finishes the wrapped context.  A failed operation skips the
+// notification.
+template <typename I>
+struct C_NotifyUpdate : public Context {
+  I &image_ctx;
+  Context *on_finish;
+  bool notified = false;
+
+  C_NotifyUpdate(I &image_ctx, Context *on_finish)
+    : image_ctx(image_ctx), on_finish(on_finish) {
+  }
+
+  void complete(int r) override {
+    CephContext *cct = image_ctx.cct;
+
+    if (!notified) {
+      if (r >= 0) {
+        // op succeeded -- notify and wait to be completed a second time
+        notified = true;
+        image_ctx.notify_update(this);
+        return;
+      }
+
+      // op failed -- no need to send update notification
+      Context::complete(r);
+      return;
+    }
+
+    // second pass: r is the result of the notification
+    if (r == -ETIMEDOUT) {
+      // don't fail the op if a peer fails to get the update notification
+      lderr(cct) << "update notification timed-out" << dendl;
+      r = 0;
+    } else if (r == -ENOENT) {
+      // don't fail if header is missing (e.g. v1 image rename)
+      ldout(cct, 5) << "update notification on missing header" << dendl;
+      r = 0;
+    } else if (r < 0) {
+      lderr(cct) << "update notification failed: " << cpp_strerror(r)
+                 << dendl;
+    }
+    Context::complete(r);
+  }
+
+  void finish(int r) override {
+    on_finish->complete(r);
+  }
+};
+
+// State machine that runs a maintenance operation either locally (when no
+// exclusive lock is in use or this client owns it) or by forwarding the
+// request to the current lock owner.  See the diagram below for the flow.
+template <typename I>
+struct C_InvokeAsyncRequest : public Context {
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * . . . . . . | . . . . . . . . . . . . . . . . . .
+ * . . | . .
+ * . v v v .
+ * . REFRESH_IMAGE (skip if not needed) .
+ * . | .
+ * . v .
+ * . ACQUIRE_LOCK (skip if exclusive lock .
+ * . | disabled or has lock) .
+ * . | .
+ * . /--------/ \--------\ . . . . . . . . . . . . .
+ * . | | .
+ * . v v .
+ * LOCAL_REQUEST REMOTE_REQUEST
+ * | |
+ * | |
+ * \--------\ /--------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ I &image_ctx;
+ std::string name;
+ exclusive_lock::OperationRequestType request_type;
+ bool permit_snapshot;
+ boost::function<void(Context*)> local;
+ boost::function<void(Context*)> remote;
+ std::set<int> filter_error_codes;
+ Context *on_finish;
+ // set after the lock owner answered -EOPNOTSUPP: the next lock attempt
+ // uses acquire_lock() instead of try_acquire_lock()
+ bool request_lock = false;
+
+ C_InvokeAsyncRequest(I &image_ctx, const std::string& name,
+ exclusive_lock::OperationRequestType request_type,
+ bool permit_snapshot,
+ const boost::function<void(Context*)>& local,
+ const boost::function<void(Context*)>& remote,
+ const std::set<int> &filter_error_codes,
+ Context *on_finish)
+ : image_ctx(image_ctx), name(name), request_type(request_type),
+ permit_snapshot(permit_snapshot), local(local), remote(remote),
+ filter_error_codes(filter_error_codes), on_finish(on_finish) {
+ }
+
+ // Entry point of the state machine.
+ void send() {
+ send_refresh_image();
+ }
+
+ // Refreshes the image if the image state reports that a refresh is
+ // required; otherwise proceeds directly to lock acquisition.
+ void send_refresh_image() {
+ if (!image_ctx.state->is_refresh_required()) {
+ send_acquire_exclusive_lock();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = util::create_context_callback<
+ C_InvokeAsyncRequest<I>,
+ &C_InvokeAsyncRequest<I>::handle_refresh_image>(this);
+ image_ctx.state->refresh(ctx);
+ }
+
+ void handle_refresh_image(int r) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl;
+ complete(r);
+ return;
+ }
+
+ send_acquire_exclusive_lock();
+ }
+
+ // Decides how to proceed under owner_lock:
+ //  - read-only image, or a snapshot is open and snapshots are not
+ //    permitted: fail with -EROFS
+ //  - no exclusive lock object: run the local request
+ //  - exclusive lock but no image watcher: fail with -EROFS
+ //  - lock owned and the request type is accepted: run the local request
+ //  - otherwise: (try to) acquire the exclusive lock
+ void send_acquire_exclusive_lock() {
+ // context can complete before owner_lock is unlocked
+ RWLock &owner_lock(image_ctx.owner_lock);
+ owner_lock.get_read();
+ image_ctx.snap_lock.get_read();
+ if (image_ctx.read_only ||
+ (!permit_snapshot && image_ctx.snap_id != CEPH_NOSNAP)) {
+ image_ctx.snap_lock.put_read();
+ owner_lock.put_read();
+ complete(-EROFS);
+ return;
+ }
+ image_ctx.snap_lock.put_read();
+
+ if (image_ctx.exclusive_lock == nullptr) {
+ send_local_request();
+ owner_lock.put_read();
+ return;
+ } else if (image_ctx.image_watcher == nullptr) {
+ owner_lock.put_read();
+ complete(-EROFS);
+ return;
+ }
+
+ if (image_ctx.exclusive_lock->is_lock_owner() &&
+ image_ctx.exclusive_lock->accept_request(request_type, nullptr)) {
+ send_local_request();
+ owner_lock.put_read();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = util::create_async_context_callback(
+ image_ctx, util::create_context_callback<
+ C_InvokeAsyncRequest<I>,
+ &C_InvokeAsyncRequest<I>::handle_acquire_exclusive_lock>(this));
+
+ if (request_lock) {
+ // current lock owner doesn't support op -- try to perform
+ // the action locally
+ request_lock = false;
+ image_ctx.exclusive_lock->acquire_lock(ctx);
+ } else {
+ image_ctx.exclusive_lock->try_acquire_lock(ctx);
+ }
+ owner_lock.put_read();
+ }
+
+ // Any lock acquisition error other than -EBLACKLISTED is reported to the
+ // caller as -EROFS.  On success the request runs locally if this client
+ // now owns the lock (or locking has been disabled), remotely otherwise.
+ void handle_acquire_exclusive_lock(int r) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ complete(r == -EBLACKLISTED ? -EBLACKLISTED : -EROFS);
+ return;
+ }
+
+ // context can complete before owner_lock is unlocked
+ RWLock &owner_lock(image_ctx.owner_lock);
+ owner_lock.get_read();
+ if (image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner()) {
+ send_local_request();
+ owner_lock.put_read();
+ return;
+ }
+
+ send_remote_request();
+ owner_lock.put_read();
+ }
+
+ // Invokes the `remote` functor; requires owner_lock to be held.
+ void send_remote_request() {
+ ceph_assert(image_ctx.owner_lock.is_locked());
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = util::create_async_context_callback(
+ image_ctx, util::create_context_callback<
+ C_InvokeAsyncRequest<I>,
+ &C_InvokeAsyncRequest<I>::handle_remote_request>(this));
+ remote(ctx);
+ }
+
+ // -EOPNOTSUPP: restart and request the lock so the op can run locally.
+ // -ETIMEDOUT / -ERESTART: restart from the refresh step.
+ // Any other result is final.
+ void handle_remote_request(int r) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ ldout(cct, 5) << name << " not supported by current lock owner" << dendl;
+ request_lock = true;
+ send_refresh_image();
+ return;
+ } else if (r != -ETIMEDOUT && r != -ERESTART) {
+ image_ctx.state->handle_update_notification();
+
+ complete(r);
+ return;
+ }
+
+ ldout(cct, 5) << name << " timed out notifying lock owner" << dendl;
+ send_refresh_image();
+ }
+
+ // Invokes the `local` functor; requires owner_lock to be held.
+ void send_local_request() {
+ ceph_assert(image_ctx.owner_lock.is_locked());
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = util::create_async_context_callback(
+ image_ctx, util::create_context_callback<
+ C_InvokeAsyncRequest<I>,
+ &C_InvokeAsyncRequest<I>::handle_local_request>(this));
+ local(ctx);
+ }
+
+ // -ERESTART restarts the state machine; any other result is final.
+ void handle_local_request(int r) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ if (r == -ERESTART) {
+ send_refresh_image();
+ return;
+ }
+ complete(r);
+ }
+
+ // Results listed in filter_error_codes are reported as success (0).
+ void finish(int r) override {
+ if (filter_error_codes.count(r) != 0) {
+ r = 0;
+ }
+ on_finish->complete(r);
+ }
+};
+
+// Returns false when current_state is OBJECT_EXISTS / OBJECT_EXISTS_CLEAN
+// and new_state is OBJECT_NONEXISTENT / OBJECT_PENDING; returns true for
+// every other combination.  image_ctx and object_no are not consulted.
+template <typename I>
+bool needs_invalidate(I& image_ctx, uint64_t object_no,
+                      uint8_t current_state, uint8_t new_state) {
+  const bool recorded_as_existing = (current_state == OBJECT_EXISTS ||
+                                     current_state == OBJECT_EXISTS_CLEAN);
+  const bool found_absent = (new_state == OBJECT_NONEXISTENT ||
+                             new_state == OBJECT_PENDING);
+  return !(recorded_as_existing && found_absent);
+}
+
+} // anonymous namespace
+
+// Binds the operations helper to its image context; the async request
+// sequence counter starts at zero.
+template <typename I>
+Operations<I>::Operations(I &image_ctx)
+  : m_image_ctx(image_ctx),
+    m_async_request_seq(0)
+{
+}
+
+// Synchronous flatten entry point.
+//
+// Returns -EROFS for a read-only image and -EINVAL when the image has no
+// parent at the time of the initial check.  A later -EINVAL from the
+// asynchronous request is treated as success.
+template <typename I>
+int Operations<I>::flatten(ProgressContext &prog_ctx) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "flatten" << dendl;
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  {
+    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+    const bool has_parent = (m_image_ctx.parent_md.spec.pool_id != -1);
+    if (!has_parent) {
+      lderr(cct) << "image has no parent" << dendl;
+      return -EINVAL;
+    }
+  }
+
+  const uint64_t request_id = ++m_async_request_seq;
+  r = invoke_async_request(
+    "flatten", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, false,
+    boost::bind(&Operations<I>::execute_flatten, this,
+                boost::ref(prog_ctx), _1),
+    boost::bind(&ImageWatcher<I>::notify_flatten,
+                m_image_ctx.image_watcher, request_id,
+                boost::ref(prog_ctx), _1));
+
+  const bool failed = (r < 0 && r != -EINVAL);
+  if (failed) {
+    return r;
+  }
+
+  ldout(cct, 20) << "flatten finished" << dendl;
+  return 0;
+}
+
+// Validates the image under snap_lock/parent_lock and starts a
+// FlattenRequest whose completion is wrapped in C_NotifyUpdate.
+//
+// The caller must hold owner_lock and, if exclusive locking is in use, own
+// the lock.  Both locks taken here are dropped before on_finish is
+// completed on any error path and before the request is sent.
+template <typename I>
+void Operations<I>::execute_flatten(ProgressContext &prog_ctx,
+                                    Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "flatten" << dendl;
+
+  if (m_image_ctx.read_only || m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  // released in the reverse order of acquisition
+  auto release_locks = [this]() {
+    m_image_ctx.parent_lock.put_read();
+    m_image_ctx.snap_lock.put_read();
+  };
+
+  m_image_ctx.snap_lock.get_read();
+  m_image_ctx.parent_lock.get_read();
+
+  // can't flatten a non-clone
+  if (m_image_ctx.parent_md.spec.pool_id == -1) {
+    lderr(cct) << "image has no parent" << dendl;
+    release_locks();
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  if (m_image_ctx.snap_id != CEPH_NOSNAP) {
+    lderr(cct) << "snapshots cannot be flattened" << dendl;
+    release_locks();
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  ::SnapContext snapc = m_image_ctx.snapc;
+
+  uint64_t overlap;
+  int r = m_image_ctx.get_parent_overlap(CEPH_NOSNAP, &overlap);
+  ceph_assert(r == 0);
+  ceph_assert(overlap <= m_image_ctx.size);
+
+  const uint64_t overlap_objects =
+    Striper::get_num_objects(m_image_ctx.layout, overlap);
+
+  release_locks();
+
+  Context *notify_ctx = new C_NotifyUpdate<I>(m_image_ctx, on_finish);
+  auto *flatten_req = new operation::FlattenRequest<I>(
+    m_image_ctx, notify_ctx, overlap_objects, snapc, prog_ctx);
+  flatten_req->send();
+}
+
+template <typename I>
+int Operations<I>::rebuild_object_map(ProgressContext &prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "rebuild_object_map" << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t request_id = ++m_async_request_seq;
+ r = invoke_async_request("rebuild object map",
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+ boost::bind(&Operations<I>::execute_rebuild_object_map,
+ this, boost::ref(prog_ctx), _1),
+ boost::bind(&ImageWatcher<I>::notify_rebuild_object_map,
+ m_image_ctx.image_watcher, request_id,
+ boost::ref(prog_ctx), _1));
+
+ ldout(cct, 10) << "rebuild object map finished" << dendl;
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+// Starts a RebuildObjectMapRequest whose completion is wrapped in
+// C_NotifyUpdate.  Completes on_finish with -EROFS when the image is
+// read-only or operations are disabled, and with -EINVAL when the
+// object-map feature is not enabled.
+//
+// The caller must hold owner_lock and, if exclusive locking is in use, own
+// the lock.
+template <typename I>
+void Operations<I>::execute_rebuild_object_map(ProgressContext &prog_ctx,
+                                               Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  const bool writes_blocked = (m_image_ctx.read_only ||
+                               m_image_ctx.operations_disabled);
+  if (writes_blocked) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  const bool has_object_map =
+    m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP);
+  if (!has_object_map) {
+    lderr(cct) << "image must support object-map feature" << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  Context *notify_ctx = new C_NotifyUpdate<I>(m_image_ctx, on_finish);
+  auto *rebuild_req = new operation::RebuildObjectMapRequest<I>(
+    m_image_ctx, notify_ctx, prog_ctx);
+  rebuild_req->send();
+}
+
+// Synchronous check-object-map entry point.  The remote handler always
+// queues -EOPNOTSUPP, i.e. the check is never forwarded to a remote lock
+// owner.
+template <typename I>
+int Operations<I>::check_object_map(ProgressContext &prog_ctx) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  const int refresh_r = m_image_ctx.state->refresh_if_required();
+  if (refresh_r < 0) {
+    return refresh_r;
+  }
+
+  return invoke_async_request(
+    "check object map",
+    exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+    boost::bind(&Operations<I>::check_object_map, this,
+                boost::ref(prog_ctx), _1),
+    [this](Context *c) {
+      m_image_ctx.op_work_queue->queue(c, -EOPNOTSUPP);
+    });
+}
+
+// Starts an ObjectMapIterateRequest that calls handle_mismatch as its
+// per-object work function.  Completes on_finish with -EINVAL when the
+// object-map feature is not enabled.
+//
+// The caller must hold owner_lock and, if exclusive locking is in use, own
+// the lock.
+template <typename I>
+void Operations<I>::object_map_iterate(ProgressContext &prog_ctx,
+                                       operation::ObjectIterateWork<I> handle_mismatch,
+                                       Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  const bool has_object_map =
+    m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP);
+  if (!has_object_map) {
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  auto *iterate_req = new operation::ObjectMapIterateRequest<I>(
+    m_image_ctx, on_finish, prog_ctx, handle_mismatch);
+  iterate_req->send();
+}
+
+// Asynchronous check: iterates the object map using needs_invalidate as the
+// per-object mismatch handler.
+template <typename I>
+void Operations<I>::check_object_map(ProgressContext &prog_ctx,
+ Context *on_finish) {
+ object_map_iterate(prog_ctx, needs_invalidate, on_finish);
+}
+
+// Synchronous rename entry point.
+//
+// Fails with -EEXIST when an image named dstname already exists.  With
+// journaling enabled the rename goes through invoke_async_request and an
+// -EEXIST result from it is tolerated; otherwise execute_rename() is run
+// directly under owner_lock.  On success the cached image name is updated.
+template <typename I>
+int Operations<I>::rename(const char *dstname) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": dest_name=" << dstname
+                << dendl;
+
+  int r = librbd::detect_format(m_image_ctx.md_ctx, dstname, NULL, NULL);
+  if (r == 0) {
+    lderr(cct) << "rbd image " << dstname << " already exists" << dendl;
+    return -EEXIST;
+  }
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "error checking for existing image called "
+               << dstname << ":" << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  const bool journaled = m_image_ctx.test_features(RBD_FEATURE_JOURNALING);
+  if (!journaled) {
+    C_SaferCond rename_done;
+    {
+      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+      execute_rename(dstname, &rename_done);
+    }
+
+    r = rename_done.wait();
+    if (r < 0) {
+      return r;
+    }
+  } else {
+    r = invoke_async_request(
+      "rename", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+      boost::bind(&Operations<I>::execute_rename, this, dstname, _1),
+      boost::bind(&ImageWatcher<I>::notify_rename,
+                  m_image_ctx.image_watcher, dstname, _1));
+    if (r < 0 && r != -EEXIST) {
+      return r;
+    }
+  }
+
+  m_image_ctx.set_image_name(dstname);
+  return 0;
+}
+
+// Issues a RenameRequest that renames the image to dest_name and reports
+// the outcome through on_finish.
+//
+// The caller must hold owner_lock (asserted). When journaling is enabled
+// the caller must also own the exclusive lock, if one exists (asserted).
+//
+// on_finish is completed immediately with:
+//   -EROFS   if operations are disabled on the image
+//   -EEXIST  if the image is already named dest_name
+template <typename I>
+void Operations<I>::execute_rename(const std::string &dest_name,
+ Context *on_finish) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+ }
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ // the current name is compared under snap_lock; the lock is released
+ // before on_finish is completed
+ m_image_ctx.snap_lock.get_read();
+ if (m_image_ctx.name == dest_name) {
+ m_image_ctx.snap_lock.put_read();
+ on_finish->complete(-EEXIST);
+ return;
+ }
+ m_image_ctx.snap_lock.put_read();
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": dest_name=" << dest_name
+ << dendl;
+
+ if (m_image_ctx.old_format) {
+ // unregister watch before and register back after rename
+ //
+ // The contexts below are built back-to-front, each one receiving the
+ // previously built context as its completion:
+ //   unregister_watch -> RenameRequest -> set_oid + register_watch
+ //     -> C_NotifyUpdate -> caller's on_finish
+ on_finish = new C_NotifyUpdate<I>(m_image_ctx, on_finish);
+ // NOTE(review): neither lambda below inspects its `r` argument, so a
+ // failure of the preceding step is not propagated from here -- confirm
+ // this is intended.
+ on_finish = new FunctionContext([this, on_finish](int r) {
+ // point the watcher at the current header_oid before re-registering
+ if (m_image_ctx.old_format) {
+ m_image_ctx.image_watcher->set_oid(m_image_ctx.header_oid);
+ }
+ m_image_ctx.image_watcher->register_watch(on_finish);
+ });
+ on_finish = new FunctionContext([this, dest_name, on_finish](int r) {
+ // owner_lock is taken here because this runs from a callback
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ operation::RenameRequest<I> *req = new operation::RenameRequest<I>(
+ m_image_ctx, on_finish, dest_name);
+ req->send();
+ });
+ m_image_ctx.image_watcher->unregister_watch(on_finish);
+ return;
+ }
+ // non old-format images: send the rename request directly
+ operation::RenameRequest<I> *req = new operation::RenameRequest<I>(
+ m_image_ctx, on_finish, dest_name);
+ req->send();
+}
+
+// Synchronously resizes the image to `size` bytes.
+//
+// Returns the refresh error if the image cannot be refreshed, -EINVAL if
+// the object-map feature is enabled and the new size is not compatible
+// with it, otherwise the result of the "resize" async request. The
+// resize perf counter is incremented whatever that result is.
+template <typename I>
+int Operations<I>::resize(uint64_t size, bool allow_shrink,
+                          ProgressContext& prog_ctx) {
+  CephContext *cct = m_image_ctx.cct;
+
+  {
+    // the current size is read under snap_lock for the log message
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    ldout(cct, 5) << this << " " << __func__ << ": "
+                  << "size=" << m_image_ctx.size << ", "
+                  << "new_size=" << size << dendl;
+  }
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  const bool object_map_enabled =
+    m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP);
+  if (object_map_enabled &&
+      !ObjectMap<>::is_compatible(m_image_ctx.layout, size)) {
+    lderr(cct) << "New size not compatible with object map" << dendl;
+    return -EINVAL;
+  }
+
+  // id passed along with the remote (notify) variant of the request
+  const uint64_t request_id = ++m_async_request_seq;
+  r = invoke_async_request(
+    "resize", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, false,
+    boost::bind(&Operations<I>::execute_resize, this, size, allow_shrink,
+                boost::ref(prog_ctx), _1, 0),
+    boost::bind(&ImageWatcher<I>::notify_resize, m_image_ctx.image_watcher,
+                request_id, size, allow_shrink, boost::ref(prog_ctx), _1));
+
+  m_image_ctx.perfcounter->inc(l_librbd_resize);
+  ldout(cct, 2) << "resize finished" << dendl;
+  return r;
+}
+
+// Sends a ResizeRequest for the image; on_finish is wrapped in a
+// C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock and own the exclusive lock if one
+// exists (both asserted). on_finish is completed immediately with:
+//   -EROFS   if a snapshot is open, the image is read-only or
+//            operations are disabled
+//   -EINVAL  if the object map is enabled and incompatible with `size`
+template <typename I>
+void Operations<I>::execute_resize(uint64_t size, bool allow_shrink,
+                                   ProgressContext &prog_ctx,
+                                   Context *on_finish,
+                                   uint64_t journal_op_tid) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  CephContext *cct = m_image_ctx.cct;
+  int r = 0;
+  {
+    // validation happens under snap_lock; the lock is dropped again
+    // before on_finish is completed or the request is sent
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    ldout(cct, 5) << this << " " << __func__ << ": "
+                  << "size=" << m_image_ctx.size << ", "
+                  << "new_size=" << size << dendl;
+
+    if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only ||
+        m_image_ctx.operations_disabled) {
+      r = -EROFS;
+    } else if (m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                         m_image_ctx.snap_lock) &&
+               !ObjectMap<>::is_compatible(m_image_ctx.layout, size)) {
+      r = -EINVAL;
+    }
+  }
+
+  if (r < 0) {
+    on_finish->complete(r);
+    return;
+  }
+
+  auto resize_req = new operation::ResizeRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), size,
+    allow_shrink, prog_ctx, journal_op_tid, false);
+  resize_req->send();
+}
+
+// Synchronous snapshot creation: refreshes the image if required, runs
+// the asynchronous snap_create() overload and waits for it.
+//
+// Returns -EROFS for a read-only image, otherwise the refresh error or
+// the result of the asynchronous operation. The snap_create perf counter
+// is only incremented when the operation did not fail.
+template <typename I>
+int Operations<I>::snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+                               const std::string& snap_name) {
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  C_SaferCond create_ctx;
+  snap_create(snap_namespace, snap_name, &create_ctx);
+  r = create_ctx.wait();
+
+  if (r >= 0) {
+    m_image_ctx.perfcounter->inc(l_librbd_snap_create);
+  }
+  return r;
+}
+
+// Asynchronous snapshot creation.
+//
+// Completes on_finish with -EROFS for a read-only image and with -EEXIST
+// when a snapshot with this namespace/name is already known locally.
+// Otherwise a C_InvokeAsyncRequest is sent with execute_snap_create() as
+// the local request and ImageWatcher::notify_snap_create() as the remote
+// one; -EEXIST is passed to it as its error-code list.
+template <typename I>
+void Operations<I>::snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+                                const std::string& snap_name,
+                                Context *on_finish) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  if (m_image_ctx.read_only) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  bool already_exists;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    already_exists =
+      (m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP);
+  }
+  if (already_exists) {
+    // completed only after snap_lock has been released
+    on_finish->complete(-EEXIST);
+    return;
+  }
+
+  auto async_req = new C_InvokeAsyncRequest<I>(
+    m_image_ctx, "snap_create", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+    true,
+    boost::bind(&Operations<I>::execute_snap_create, this, snap_namespace,
+                snap_name, _1, 0, false),
+    boost::bind(&ImageWatcher<I>::notify_snap_create,
+                m_image_ctx.image_watcher, snap_namespace, snap_name, _1),
+    {-EEXIST}, on_finish);
+  async_req->send();
+}
+
+// Sends a SnapshotCreateRequest; on_finish is wrapped in a
+// C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock and own the exclusive lock if one
+// exists (both asserted). on_finish is completed immediately with -EROFS
+// when operations are disabled and with -EEXIST when the snapshot
+// already exists.
+template <typename I>
+void Operations<I>::execute_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+                                        const std::string &snap_name,
+                                        Context *on_finish,
+                                        uint64_t journal_op_tid,
+                                        bool skip_object_map) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  bool already_exists;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    already_exists =
+      (m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP);
+  }
+  if (already_exists) {
+    on_finish->complete(-EEXIST);
+    return;
+  }
+
+  auto create_req = new operation::SnapshotCreateRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+    snap_namespace, snap_name, journal_op_tid, skip_object_map);
+  create_req->send();
+}
+
+// Synchronously rolls the image back to the named snapshot.
+//
+// Returns:
+//   the refresh error if the image cannot be refreshed
+//   -ENOENT  if m_image_ctx.snap_exists is false or the snapshot name is
+//            unknown
+//   -EROFS   if a snapshot is currently open or the image is read-only
+//   the prepare_image_update() error, or the result of the rollback
+//   request otherwise
+// The snap_rollback perf counter is only incremented on success.
+template <typename I>
+int Operations<I>::snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name,
+ ProgressContext& prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ C_SaferCond cond_ctx;
+ {
+ // owner_lock stays read-locked until the request has been issued
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ {
+ // need to drop snap_lock before invalidating cache
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ if (!m_image_ctx.snap_exists) {
+ return -ENOENT;
+ }
+
+ if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ // only checks that the snapshot exists; execute_snap_rollback() looks
+ // the id up again
+ uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name);
+ if (snap_id == CEPH_NOSNAP) {
+ lderr(cct) << "No such snapshot found." << dendl;
+ return -ENOENT;
+ }
+ }
+
+ // second argument (request_lock) is false here
+ r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false);
+ if (r < 0) {
+ return r;
+ }
+
+ execute_snap_rollback(snap_namespace, snap_name, prog_ctx, &cond_ctx);
+ }
+
+ // wait for the rollback outside of owner_lock
+ r = cond_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ m_image_ctx.perfcounter->inc(l_librbd_snap_rollback);
+ return r;
+}
+
+// Sends a SnapshotRollbackRequest for the named snapshot; on_finish is
+// wrapped in a C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock (asserted). on_finish is completed
+// immediately with -EROFS when operations are disabled and with -ENOENT
+// when the snapshot cannot be found.
+template <typename I>
+void Operations<I>::execute_snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace,
+                                          const std::string &snap_name,
+                                          ProgressContext& prog_ctx,
+                                          Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  uint64_t snap_id;
+  uint64_t new_size = 0;
+  {
+    // snapshot id and its image size are read under snap_lock
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name);
+    if (snap_id == CEPH_NOSNAP) {
+      lderr(cct) << "No such snapshot found." << dendl;
+    } else {
+      new_size = m_image_ctx.get_image_size(snap_id);
+    }
+  }
+
+  if (snap_id == CEPH_NOSNAP) {
+    on_finish->complete(-ENOENT);
+    return;
+  }
+
+  // async mode used for journal replay
+  auto rollback_req = new operation::SnapshotRollbackRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+    snap_namespace, snap_name, snap_id, new_size, prog_ctx);
+  rollback_req->send();
+}
+
+// Synchronous snapshot removal: refreshes the image if required, runs
+// the asynchronous snap_remove() overload and waits for it.
+//
+// Returns -EROFS for a read-only image, a negative error from the
+// refresh or from the removal, and 0 otherwise. The snap_remove perf
+// counter is only incremented when the removal did not fail.
+template <typename I>
+int Operations<I>::snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+                               const std::string& snap_name) {
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  C_SaferCond remove_ctx;
+  snap_remove(snap_namespace, snap_name, &remove_ctx);
+  r = remove_ctx.wait();
+  if (r < 0) {
+    return r;
+  }
+
+  m_image_ctx.perfcounter->inc(l_librbd_snap_remove);
+  return 0;
+}
+
+// Asynchronous snapshot removal.
+//
+// Completes on_finish with -EROFS for a read-only image and with -ENOENT
+// when the snapshot is not known locally. When the fast-diff or
+// journaling feature is enabled the removal is issued through a
+// C_InvokeAsyncRequest (using the TRASH_SNAP_REMOVE request type for
+// snapshots in the trash namespace); otherwise execute_snap_remove() is
+// called directly under owner_lock.
+template <typename I>
+void Operations<I>::snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+                                const std::string& snap_name,
+                                Context *on_finish) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  if (m_image_ctx.read_only) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  // quickly filter out duplicate ops
+  bool snap_found;
+  bool proxy_op = false;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    snap_found =
+      (m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP);
+    if (snap_found) {
+      const uint64_t proxy_features =
+        RBD_FEATURE_FAST_DIFF | RBD_FEATURE_JOURNALING;
+      proxy_op = (m_image_ctx.features & proxy_features) != 0;
+    }
+  }
+  if (!snap_found) {
+    on_finish->complete(-ENOENT);
+    return;
+  }
+
+  if (!proxy_op) {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    execute_snap_remove(snap_namespace, snap_name, on_finish);
+    return;
+  }
+
+  const bool trash_snap =
+    (cls::rbd::get_snap_namespace_type(snap_namespace) ==
+       cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH);
+  const auto request_type = trash_snap ?
+    exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE :
+    exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL;
+
+  auto async_req = new C_InvokeAsyncRequest<I>(
+    m_image_ctx, "snap_remove", request_type, true,
+    boost::bind(&Operations<I>::execute_snap_remove, this, snap_namespace,
+                snap_name, _1),
+    boost::bind(&ImageWatcher<I>::notify_snap_remove,
+                m_image_ctx.image_watcher, snap_namespace, snap_name, _1),
+    {-ENOENT}, on_finish);
+  async_req->send();
+}
+
+// Sends a SnapshotRemoveRequest for the named snapshot; on_finish is
+// wrapped in a C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock (asserted); with fast-diff enabled it
+// must also own the exclusive lock if one exists (asserted).
+// on_finish is completed immediately with:
+//   -EROFS   if operations are disabled
+//   -ENOENT  if the snapshot cannot be found
+//   -EBUSY   if the snapshot is protected
+//   the is_snap_protected() error if that lookup fails
+template <typename I>
+void Operations<I>::execute_snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+                                        const std::string &snap_name,
+                                        Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+    ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+                m_image_ctx.exclusive_lock->is_lock_owner());
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  uint64_t snap_id;
+  int r = 0;
+  {
+    // all snapshot lookups happen under snap_lock; on_finish is only
+    // completed after the lock has been released
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name);
+    if (snap_id == CEPH_NOSNAP) {
+      lderr(cct) << "No such snapshot found." << dendl;
+      r = -ENOENT;
+    } else {
+      bool is_protected;
+      r = m_image_ctx.is_snap_protected(snap_id, &is_protected);
+      if (r >= 0 && is_protected) {
+        lderr(cct) << "snapshot is protected" << dendl;
+        r = -EBUSY;
+      }
+    }
+  }
+
+  if (r < 0) {
+    on_finish->complete(r);
+    return;
+  }
+
+  auto remove_req = new operation::SnapshotRemoveRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+    snap_namespace, snap_name, snap_id);
+  remove_req->send();
+}
+
+// Synchronously renames a snapshot in the user snapshot namespace from
+// srcname to dstname.
+//
+// Returns -EROFS for a read-only image, the refresh error, -ENOENT if
+// srcname is unknown and -EEXIST if dstname is already in use. With
+// journaling enabled the rename goes through invoke_async_request() and
+// a -EEXIST result is tolerated; otherwise execute_snap_rename() is run
+// directly under owner_lock. The snap_rename perf counter is
+// incremented before returning 0.
+template <typename I>
+int Operations<I>::snap_rename(const char *srcname, const char *dstname) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": "
+                << "snap_name=" << srcname << ", "
+                << "new_snap_name=" << dstname << dendl;
+
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  snapid_t snap_id;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    const cls::rbd::UserSnapshotNamespace user_ns;
+    snap_id = m_image_ctx.get_snap_id(user_ns, srcname);
+    if (snap_id == CEPH_NOSNAP) {
+      return -ENOENT;
+    }
+    if (m_image_ctx.get_snap_id(user_ns, dstname) != CEPH_NOSNAP) {
+      return -EEXIST;
+    }
+  }
+
+  const bool journaling = m_image_ctx.test_features(RBD_FEATURE_JOURNALING);
+  if (!journaling) {
+    C_SaferCond rename_ctx;
+    {
+      RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+      execute_snap_rename(snap_id, dstname, &rename_ctx);
+    }
+
+    r = rename_ctx.wait();
+    if (r < 0) {
+      return r;
+    }
+  } else {
+    r = invoke_async_request(
+      "snap_rename", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+      boost::bind(&Operations<I>::execute_snap_rename, this, snap_id,
+                  dstname, _1),
+      boost::bind(&ImageWatcher<I>::notify_snap_rename,
+                  m_image_ctx.image_watcher, snap_id, dstname, _1));
+    // -EEXIST is not treated as a failure on this path
+    if (r < 0 && r != -EEXIST) {
+      return r;
+    }
+  }
+
+  m_image_ctx.perfcounter->inc(l_librbd_snap_rename);
+  return 0;
+}
+
+// Sends a SnapshotRenameRequest renaming snapshot src_snap_id to
+// dest_snap_name; on_finish is wrapped in a C_NotifyUpdate before being
+// handed to the request.
+//
+// The caller must hold owner_lock (asserted); with journaling enabled it
+// must also own the exclusive lock if one exists (asserted).
+// on_finish is completed immediately with -EROFS when operations are
+// disabled and with -EEXIST when dest_snap_name is already used in the
+// user snapshot namespace.
+template <typename I>
+void Operations<I>::execute_snap_rename(const uint64_t src_snap_id,
+                                        const std::string &dest_snap_name,
+                                        Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  if ((m_image_ctx.features & RBD_FEATURE_JOURNALING) != 0) {
+    ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+                m_image_ctx.exclusive_lock->is_lock_owner());
+  }
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  bool name_in_use;
+  {
+    // Renaming is supported for snapshots from user namespace only.
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    name_in_use = (m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(),
+                                           dest_snap_name) != CEPH_NOSNAP);
+  }
+  if (name_in_use) {
+    on_finish->complete(-EEXIST);
+    return;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": "
+                << "snap_id=" << src_snap_id << ", "
+                << "new_snap_name=" << dest_snap_name << dendl;
+
+  auto rename_req = new operation::SnapshotRenameRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), src_snap_id,
+    dest_snap_name);
+  rename_req->send();
+}
+
+// Synchronously protects the named snapshot.
+//
+// Returns -EROFS for a read-only image, -ENOSYS if the image lacks the
+// layering feature, the refresh or is_snap_protected() error, and -EBUSY
+// if the snapshot is already protected. With journaling enabled the
+// request goes through invoke_async_request() and a -EBUSY result is
+// tolerated; otherwise execute_snap_protect() is run directly under
+// owner_lock.
+template <typename I>
+int Operations<I>::snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace,
+                                const std::string& snap_name) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  if (!m_image_ctx.test_features(RBD_FEATURE_LAYERING)) {
+    lderr(cct) << "image must support layering" << dendl;
+    return -ENOSYS;
+  }
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    const auto snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name);
+    bool is_protected;
+    r = m_image_ctx.is_snap_protected(snap_id, &is_protected);
+    if (r < 0) {
+      return r;
+    } else if (is_protected) {
+      return -EBUSY;
+    }
+  }
+
+  const bool journaling = m_image_ctx.test_features(RBD_FEATURE_JOURNALING);
+  if (!journaling) {
+    C_SaferCond protect_ctx;
+    {
+      RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+      execute_snap_protect(snap_namespace, snap_name, &protect_ctx);
+    }
+
+    r = protect_ctx.wait();
+    if (r < 0) {
+      return r;
+    }
+  } else {
+    r = invoke_async_request(
+      "snap_protect", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+      boost::bind(&Operations<I>::execute_snap_protect, this, snap_namespace,
+                  snap_name, _1),
+      boost::bind(&ImageWatcher<I>::notify_snap_protect,
+                  m_image_ctx.image_watcher, snap_namespace, snap_name, _1));
+    // -EBUSY is not treated as a failure on this path
+    if (r < 0 && r != -EBUSY) {
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Sends a SnapshotProtectRequest for the named snapshot; on_finish is
+// wrapped in a C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock (asserted); with journaling enabled it
+// must also own the exclusive lock if one exists (asserted).
+// on_finish is completed immediately with -EROFS when operations are
+// disabled, with the is_snap_protected() error if that lookup fails, and
+// with -EBUSY when the snapshot is already protected.
+template <typename I>
+void Operations<I>::execute_snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace,
+                                         const std::string &snap_name,
+                                         Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+    ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+                m_image_ctx.exclusive_lock->is_lock_owner());
+  }
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  int r;
+  {
+    // protection status is read under snap_lock; on_finish is only
+    // completed after the lock has been released
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    bool is_protected;
+    r = m_image_ctx.is_snap_protected(
+      m_image_ctx.get_snap_id(snap_namespace, snap_name), &is_protected);
+    if (r >= 0 && is_protected) {
+      r = -EBUSY;
+    }
+  }
+  if (r < 0) {
+    on_finish->complete(r);
+    return;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  auto protect_req = new operation::SnapshotProtectRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+    snap_namespace, snap_name);
+  protect_req->send();
+}
+
+// Synchronously unprotects the named snapshot.
+//
+// Returns -EROFS for a read-only image, the refresh or
+// is_snap_unprotected() error, and -EINVAL if the snapshot is already
+// unprotected. With journaling enabled the request goes through
+// invoke_async_request() and a -EINVAL result is tolerated; otherwise
+// execute_snap_unprotect() is run directly under owner_lock.
+template <typename I>
+int Operations<I>::snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace,
+                                  const std::string& snap_name) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    const auto snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name);
+    bool is_unprotected;
+    r = m_image_ctx.is_snap_unprotected(snap_id, &is_unprotected);
+    if (r < 0) {
+      return r;
+    } else if (is_unprotected) {
+      return -EINVAL;
+    }
+  }
+
+  const bool journaling = m_image_ctx.test_features(RBD_FEATURE_JOURNALING);
+  if (!journaling) {
+    C_SaferCond unprotect_ctx;
+    {
+      RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+      execute_snap_unprotect(snap_namespace, snap_name, &unprotect_ctx);
+    }
+
+    r = unprotect_ctx.wait();
+    if (r < 0) {
+      return r;
+    }
+  } else {
+    r = invoke_async_request(
+      "snap_unprotect", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+      boost::bind(&Operations<I>::execute_snap_unprotect, this,
+                  snap_namespace, snap_name, _1),
+      boost::bind(&ImageWatcher<I>::notify_snap_unprotect,
+                  m_image_ctx.image_watcher, snap_namespace, snap_name, _1));
+    // -EINVAL is not treated as a failure on this path
+    if (r < 0 && r != -EINVAL) {
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Sends a SnapshotUnprotectRequest for the named snapshot; on_finish is
+// wrapped in a C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock (asserted); with journaling enabled it
+// must also own the exclusive lock if one exists (asserted).
+// on_finish is completed immediately with -EROFS when operations are
+// disabled, with the is_snap_unprotected() error if that lookup fails,
+// and with -EINVAL when the snapshot is already unprotected.
+template <typename I>
+void Operations<I>::execute_snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace,
+                                           const std::string &snap_name,
+                                           Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+    ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+                m_image_ctx.exclusive_lock->is_lock_owner());
+  }
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  int r;
+  {
+    // protection status is read under snap_lock; on_finish is only
+    // completed after the lock has been released
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    bool is_unprotected;
+    r = m_image_ctx.is_snap_unprotected(
+      m_image_ctx.get_snap_id(snap_namespace, snap_name), &is_unprotected);
+    if (r >= 0 && is_unprotected) {
+      r = -EINVAL;
+    }
+  }
+  if (r < 0) {
+    on_finish->complete(r);
+    return;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+                << dendl;
+
+  auto unprotect_req = new operation::SnapshotUnprotectRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+    snap_namespace, snap_name);
+  unprotect_req->send();
+}
+
+// Synchronously sets the snapshot limit of the image.
+//
+// Returns -EROFS for a read-only image, the refresh error, the
+// prepare_image_update() error, or the result of the limit request.
+template <typename I>
+int Operations<I>::snap_set_limit(uint64_t limit) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": limit=" << limit << dendl;
+
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  C_SaferCond limit_ctx;
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+                             true);
+    if (r < 0) {
+      return r;
+    }
+
+    execute_snap_set_limit(limit, &limit_ctx);
+  }
+
+  // wait for the request outside of owner_lock
+  return limit_ctx.wait();
+}
+
+// Sends a SnapshotLimitRequest with the given limit; on_finish is passed
+// to the request unchanged. The caller must hold owner_lock (asserted).
+template <typename I>
+void Operations<I>::execute_snap_set_limit(const uint64_t limit,
+                                           Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": limit=" << limit
+                << dendl;
+
+  auto limit_req =
+    new operation::SnapshotLimitRequest<I>(m_image_ctx, on_finish, limit);
+  limit_req->send();
+}
+
+// Synchronously enables (enabled == true) or disables (enabled == false)
+// the given feature bits on the image.
+//
+// Returns:
+//   the refresh error if the image cannot be refreshed
+//   -EROFS   for a read-only image
+//   -EINVAL  for an old-format image, for feature bits outside the
+//            permitted mask, for an empty feature set, or when a
+//            requested feature is already in the requested state
+//   the prepare_image_update() error or the result of the request
+//   otherwise
+template <typename I>
+int Operations<I>::update_features(uint64_t features, bool enabled) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": features=" << features
+ << ", enabled=" << enabled << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ } else if (m_image_ctx.old_format) {
+ lderr(cct) << "old-format images do not support features" << dendl;
+ return -EINVAL;
+ }
+
+ // enabling accepts only RBD_FEATURES_MUTABLE bits; disabling additionally
+ // accepts RBD_FEATURES_DISABLE_ONLY bits
+ uint64_t disable_mask = (RBD_FEATURES_MUTABLE |
+ RBD_FEATURES_DISABLE_ONLY);
+ if ((enabled && (features & RBD_FEATURES_MUTABLE) != features) ||
+ (!enabled && (features & disable_mask) != features)) {
+ lderr(cct) << "cannot update immutable features" << dendl;
+ return -EINVAL;
+ }
+
+ // NOTE(review): m_image_ctx.features is read here without snap_lock,
+ // while the checks further down read it under snap_lock -- confirm
+ // whether the lock is required here as well.
+ bool set_object_map = (features & RBD_FEATURE_OBJECT_MAP) == RBD_FEATURE_OBJECT_MAP;
+ bool set_fast_diff = (features & RBD_FEATURE_FAST_DIFF) == RBD_FEATURE_FAST_DIFF;
+ bool exist_fast_diff = (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0;
+ bool exist_object_map = (m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0;
+
+ // object-map and fast-diff are coupled: when enabling one while the
+ // other is not yet enabled, or when disabling object-map while fast-diff
+ // is enabled, both bits are added to the request
+ if ((enabled && ((set_object_map && !exist_fast_diff) || (set_fast_diff && !exist_object_map)))
+ || (!enabled && (set_object_map && exist_fast_diff))) {
+ features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF);
+ }
+
+ if (features == 0) {
+ lderr(cct) << "update requires at least one feature" << dendl;
+ return -EINVAL;
+ }
+ {
+ // reject requests that would not change every requested bit
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ if (enabled && (features & m_image_ctx.features) != 0) {
+ lderr(cct) << "one or more requested features are already enabled"
+ << dendl;
+ return -EINVAL;
+ }
+ if (!enabled && (features & ~m_image_ctx.features) != 0) {
+ lderr(cct) << "one or more requested features are already disabled"
+ << dendl;
+ return -EINVAL;
+ }
+ }
+
+ // if disabling journaling, avoid attempting to open the journal
+ // when acquiring the exclusive lock in case the journal is corrupt
+ bool disabling_journal = false;
+ if (!enabled && ((features & RBD_FEATURE_JOURNALING) != 0)) {
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ m_image_ctx.set_journal_policy(new journal::DisabledPolicy());
+ disabling_journal = true;
+ }
+ // on every exit path from here on, put a StandardPolicy back if the
+ // DisabledPolicy was installed above
+ BOOST_SCOPE_EXIT_ALL( (this)(disabling_journal) ) {
+ if (disabling_journal) {
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ m_image_ctx.set_journal_policy(
+ new journal::StandardPolicy<I>(&m_image_ctx));
+ }
+ };
+
+ // The journal options are not passed to the lock owner in the
+ // update features request. Therefore, if journaling is being
+ // enabled, the lock should be locally acquired instead of
+ // attempting to send the request to the peer.
+ if (enabled && (features & RBD_FEATURE_JOURNALING) != 0) {
+ C_SaferCond cond_ctx;
+ {
+ RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+ r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ true);
+ if (r < 0) {
+ return r;
+ }
+
+ execute_update_features(features, enabled, &cond_ctx, 0);
+ }
+
+ // wait for the request outside of owner_lock
+ r = cond_ctx.wait();
+ } else {
+ r = invoke_async_request("update_features",
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false,
+ boost::bind(&Operations<I>::execute_update_features,
+ this, features, enabled, _1, 0),
+ boost::bind(&ImageWatcher<I>::notify_update_features,
+ m_image_ctx.image_watcher, features,
+ enabled, _1));
+ }
+ // logged regardless of the value of r
+ ldout(cct, 2) << "update_features finished" << dendl;
+ return r;
+}
+
+// Sends an EnableFeaturesRequest or DisableFeaturesRequest for the given
+// feature bits, depending on `enabled`; on_finish is passed to the
+// request unchanged.
+//
+// The caller must hold owner_lock and own the exclusive lock if one
+// exists (both asserted). on_finish is completed immediately with -EROFS
+// when operations are disabled.
+template <typename I>
+void Operations<I>::execute_update_features(uint64_t features, bool enabled,
+                                            Context *on_finish,
+                                            uint64_t journal_op_tid) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": features=" << features
+                << ", enabled=" << enabled << dendl;
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  if (!enabled) {
+    auto disable_req = new operation::DisableFeaturesRequest<I>(
+      m_image_ctx, on_finish, journal_op_tid, features, false);
+    disable_req->send();
+    return;
+  }
+
+  auto enable_req = new operation::EnableFeaturesRequest<I>(
+    m_image_ctx, on_finish, journal_op_tid, features);
+  enable_req->send();
+}
+
+// Synchronously sets the image metadata entry `key` to `value`.
+//
+// When the key is a config override (per is_metadata_config_override())
+// the option name is checked with api::Config::is_option_name() and the
+// value with ConfigProxy::set_val() before anything is written; an
+// unknown option yields -EINVAL. Further failures: the refresh error,
+// -EROFS for a read-only image, the prepare_image_update() error, or the
+// result of the metadata request. After a successful config override the
+// image is refreshed again and that result is returned.
+template <typename I>
+int Operations<I>::metadata_set(const std::string &key,
+                                const std::string &value) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": key=" << key << ", value="
+                << value << dendl;
+
+  int r;
+  std::string config_key;
+  const bool config_override =
+    util::is_metadata_config_override(key, &config_key);
+  if (config_override) {
+    // validate config setting
+    if (!librbd::api::Config<I>::is_option_name(&m_image_ctx, config_key)) {
+      lderr(cct) << "validation for " << key
+                 << " failed: not allowed image level override" << dendl;
+      return -EINVAL;
+    }
+    r = ConfigProxy{false}.set_val(config_key.c_str(), value);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  C_SaferCond metadata_ctx;
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+                             true);
+    if (r < 0) {
+      return r;
+    }
+
+    execute_metadata_set(key, value, &metadata_ctx);
+  }
+
+  r = metadata_ctx.wait();
+  if (config_override && r >= 0) {
+    // apply new config key immediately
+    r = m_image_ctx.state->refresh_if_required();
+  }
+  return r;
+}
+
+// Sends a MetadataSetRequest for key/value; on_finish is wrapped in a
+// C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock (asserted). on_finish is completed
+// immediately with -EROFS when operations are disabled.
+template <typename I>
+void Operations<I>::execute_metadata_set(const std::string &key,
+                                         const std::string &value,
+                                         Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": key=" << key << ", value="
+                << value << dendl;
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  auto set_req = new operation::MetadataSetRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), key, value);
+  set_req->send();
+}
+
+// Synchronously removes the image metadata entry `key`.
+//
+// Returns the refresh error, -EROFS for a read-only image, the
+// cls_client::metadata_get() error when the key cannot be read from the
+// image header, the prepare_image_update() error, or the result of the
+// removal request. When the key is a config override and the removal
+// succeeded, the image is refreshed again and that result is returned.
+template <typename I>
+int Operations<I>::metadata_remove(const std::string &key) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": key=" << key << dendl;
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  // the fetched value itself is not used; only the result code matters
+  std::string current_value;
+  r = cls_client::metadata_get(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
+                               key, &current_value);
+  if (r < 0) {
+    return r;
+  }
+
+  C_SaferCond metadata_ctx;
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+                             true);
+    if (r < 0) {
+      return r;
+    }
+
+    execute_metadata_remove(key, &metadata_ctx);
+  }
+
+  r = metadata_ctx.wait();
+
+  std::string config_key;
+  const bool config_override =
+    util::is_metadata_config_override(key, &config_key);
+  if (config_override && r >= 0) {
+    // apply new config key immediately
+    r = m_image_ctx.state->refresh_if_required();
+  }
+  return r;
+}
+
+// Sends a MetadataRemoveRequest for `key`; on_finish is wrapped in a
+// C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock (asserted). on_finish is completed
+// immediately with -EROFS when operations are disabled.
+template <typename I>
+void Operations<I>::execute_metadata_remove(const std::string &key,
+                                            Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": key=" << key << dendl;
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  auto remove_req = new operation::MetadataRemoveRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), key);
+  remove_req->send();
+}
+
+// Synchronously runs the "migrate" operation on the image.
+//
+// Returns the refresh error, -EROFS for a read-only image, and -EINVAL
+// when migration_info is empty. The request itself goes through
+// invoke_async_request(); a -EINVAL result from it is tolerated and
+// reported as 0.
+template <typename I>
+int Operations<I>::migrate(ProgressContext &prog_ctx) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "migrate" << dendl;
+
+  int r = m_image_ctx.state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  if (m_image_ctx.read_only) {
+    return -EROFS;
+  }
+
+  {
+    // migration_info is checked under parent_lock
+    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+    const bool no_migration = m_image_ctx.migration_info.empty();
+    if (no_migration) {
+      lderr(cct) << "image has no migrating parent" << dendl;
+      return -EINVAL;
+    }
+  }
+
+  // id passed along with the remote (notify) variant of the request
+  const uint64_t request_id = ++m_async_request_seq;
+  r = invoke_async_request(
+    "migrate", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, false,
+    boost::bind(&Operations<I>::execute_migrate, this, boost::ref(prog_ctx),
+                _1),
+    boost::bind(&ImageWatcher<I>::notify_migrate, m_image_ctx.image_watcher,
+                request_id, boost::ref(prog_ctx), _1));
+
+  const bool failed = (r < 0 && r != -EINVAL);
+  if (failed) {
+    return r;
+  }
+  ldout(cct, 20) << "migrate finished" << dendl;
+  return 0;
+}
+
+// Sends a MigrateRequest for the image; on_finish is wrapped in a
+// C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock and own the exclusive lock if one
+// exists (both asserted). on_finish is completed immediately with:
+//   -EROFS   if the image is read-only, operations are disabled, or a
+//            snapshot is currently open
+//   -EINVAL  if migration_info is empty
+template <typename I>
+void Operations<I>::execute_migrate(ProgressContext &prog_ctx,
+                                    Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "migrate" << dendl;
+
+  if (m_image_ctx.read_only || m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  int r = 0;
+  {
+    // snap_lock is taken before parent_lock; the lockers release them in
+    // the reverse order when the scope ends, before on_finish is completed
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+
+    if (m_image_ctx.migration_info.empty()) {
+      lderr(cct) << "image has no migrating parent" << dendl;
+      r = -EINVAL;
+    } else if (m_image_ctx.snap_id != CEPH_NOSNAP) {
+      lderr(cct) << "snapshots cannot be migrated" << dendl;
+      r = -EROFS;
+    }
+  }
+
+  if (r < 0) {
+    on_finish->complete(r);
+    return;
+  }
+
+  auto migrate_req = new operation::MigrateRequest<I>(
+    m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), prog_ctx);
+  migrate_req->send();
+}
+
+// Synchronously runs the "sparsify" operation on the image.
+//
+// sparse_size must be a power of two, at least 4096 and no larger than
+// the image object size; otherwise -EINVAL is returned. The request
+// goes through invoke_async_request(); a -EINVAL result from it is
+// tolerated and reported as 0, any other negative result is returned.
+//
+// Fixes relative to the previous revision: the validation error message
+// was missing the word "than", and the completion log line said
+// "resparsify finished" although the operation is named "sparsify".
+template <typename I>
+int Operations<I>::sparsify(size_t sparse_size, ProgressContext &prog_ctx) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "sparsify" << dendl;
+
+  // (x & (x - 1)) == 0 holds exactly for powers of two (x >= 4096 here)
+  if (sparse_size < 4096 || sparse_size > m_image_ctx.get_object_size() ||
+      (sparse_size & (sparse_size - 1)) != 0) {
+    lderr(cct) << "sparse size should be power of two not less than 4096"
+               << " and not larger than image object size" << dendl;
+    return -EINVAL;
+  }
+
+  // id passed along with the remote (notify) variant of the request
+  uint64_t request_id = ++m_async_request_seq;
+  int r = invoke_async_request(
+    "sparsify", exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, false,
+    boost::bind(&Operations<I>::execute_sparsify, this, sparse_size,
+                boost::ref(prog_ctx), _1),
+    boost::bind(&ImageWatcher<I>::notify_sparsify, m_image_ctx.image_watcher,
+                request_id, sparse_size, boost::ref(prog_ctx), _1));
+  if (r < 0 && r != -EINVAL) {
+    return r;
+  }
+  ldout(cct, 20) << "sparsify finished" << dendl;
+  return 0;
+}
+
+// Sends a SparsifyRequest with the given sparse size; on_finish is
+// wrapped in a C_NotifyUpdate before being handed to the request.
+//
+// The caller must hold owner_lock and own the exclusive lock if one
+// exists (both asserted). on_finish is completed immediately with -EROFS
+// when operations are disabled.
+template <typename I>
+void Operations<I>::execute_sparsify(size_t sparse_size,
+                                     ProgressContext &prog_ctx,
+                                     Context *on_finish) {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "sparsify" << dendl;
+
+  if (m_image_ctx.operations_disabled) {
+    on_finish->complete(-EROFS);
+    return;
+  }
+
+  Context *notify_ctx = new C_NotifyUpdate<I>(m_image_ctx, on_finish);
+  operation::SparsifyRequest<I> *sparsify_req =
+    new operation::SparsifyRequest<I>(m_image_ctx, sparse_size, notify_ctx,
+                                      prog_ctx);
+  sparsify_req->send();
+}
+
+// Ensures the exclusive lock (if the image has one) is held before a
+// local image update is executed.
+//
+// The caller must hold owner_lock for read and must not hold it for
+// write (asserted). owner_lock is read-held again on return, but it is
+// released in between, so state guarded by it may have changed.
+//
+// request_lock selects acquire_lock() (true) or try_acquire_lock()
+// (false) when an acquisition attempt is needed.
+//
+// Returns:
+//   -EROFS  if the image has no image watcher
+//   the acquisition error, except that -EAGAIN and -EBUSY are mapped to 0
+//   get_unlocked_op_error() if an exclusive lock exists but is still
+//   not owned afterwards
+//   0 otherwise
+template <typename I>
+int Operations<I>::prepare_image_update(
+ exclusive_lock::OperationRequestType request_type, bool request_lock) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked() &&
+ !m_image_ctx.owner_lock.is_wlocked());
+ if (m_image_ctx.image_watcher == nullptr) {
+ return -EROFS;
+ }
+
+ // need to upgrade to a write lock
+ C_SaferCond ctx;
+ m_image_ctx.owner_lock.put_read();
+ bool attempting_lock = false;
+ {
+ RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
+ // an attempt is only made when an exclusive lock exists and either is
+ // not owned or does not accept this request type
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ (!m_image_ctx.exclusive_lock->is_lock_owner() ||
+ !m_image_ctx.exclusive_lock->accept_request(request_type, nullptr))) {
+
+ attempting_lock = true;
+ m_image_ctx.exclusive_lock->block_requests(0);
+
+ if (request_lock) {
+ m_image_ctx.exclusive_lock->acquire_lock(&ctx);
+ } else {
+ m_image_ctx.exclusive_lock->try_acquire_lock(&ctx);
+ }
+ }
+ }
+
+ // wait for the acquisition attempt with owner_lock released
+ int r = 0;
+ if (attempting_lock) {
+ r = ctx.wait();
+ }
+
+ // restore the read lock the caller expects to hold
+ m_image_ctx.owner_lock.get_read();
+ if (attempting_lock && m_image_ctx.exclusive_lock != nullptr) {
+ m_image_ctx.exclusive_lock->unblock_requests();
+ }
+
+ // -EAGAIN / -EBUSY are not reported as errors; lock ownership is
+ // re-checked below instead
+ if (r == -EAGAIN || r == -EBUSY) {
+ r = 0;
+ }
+ if (r < 0) {
+ return r;
+ } else if (m_image_ctx.exclusive_lock != nullptr &&
+ !m_image_ctx.exclusive_lock->is_lock_owner()) {
+ return m_image_ctx.exclusive_lock->get_unlocked_op_error();
+ }
+
+ return 0;
+}
+
+// Synchronous wrapper around C_InvokeAsyncRequest: builds the request
+// from the given name, request type, permit_snapshot flag and the
+// local/remote request callbacks (with an empty error-code list), sends
+// it and blocks until it completes. Returns the completion result.
+template <typename I>
+int Operations<I>::invoke_async_request(
+    const std::string& name, exclusive_lock::OperationRequestType request_type,
+    bool permit_snapshot, const boost::function<void(Context*)>& local_request,
+    const boost::function<void(Context*)>& remote_request) {
+  C_SaferCond wait_ctx;
+  auto async_req = new C_InvokeAsyncRequest<I>(
+    m_image_ctx, name, request_type, permit_snapshot, local_request,
+    remote_request, {}, &wait_ctx);
+  async_req->send();
+  return wait_ctx.wait();
+}
+
+} // namespace librbd
+
+template class librbd::Operations<librbd::ImageCtx>;
diff --git a/src/librbd/Operations.h b/src/librbd/Operations.h
new file mode 100644
index 00000000..253c9f92
--- /dev/null
+++ b/src/librbd/Operations.h
@@ -0,0 +1,129 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATIONS_H
+#define CEPH_LIBRBD_OPERATIONS_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/int_types.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/operation/ObjectMapIterate.h"
+#include <atomic>
+#include <string>
+#include <boost/function.hpp>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+/**
+ * Maintenance operations that can be run against one image: flatten, object
+ * map rebuild/check, rename, resize, snapshot handling, feature and metadata
+ * updates, migrate and sparsify.
+ *
+ * Most operations are declared as a pair: an int-returning entry point and an
+ * execute_*() variant that reports its result through a Context *on_finish.
+ * NOTE(review): how the two forms divide the work (locking, forwarding to the
+ * lock owner) is implemented in Operations.cc -- confirm there before relying
+ * on it.
+ */
+template <typename ImageCtxT = ImageCtx>
+class Operations {
+public:
+ Operations(ImageCtxT &image_ctx);
+
+ int flatten(ProgressContext &prog_ctx);
+ void execute_flatten(ProgressContext &prog_ctx, Context *on_finish);
+
+ int rebuild_object_map(ProgressContext &prog_ctx);
+ void execute_rebuild_object_map(ProgressContext &prog_ctx,
+ Context *on_finish);
+
+ int check_object_map(ProgressContext &prog_ctx);
+ void check_object_map(ProgressContext &prog_ctx, Context *on_finish);
+
+ void object_map_iterate(ProgressContext &prog_ctx,
+ operation::ObjectIterateWork<ImageCtxT> handle_mismatch,
+ Context* on_finish);
+
+ int rename(const char *dstname);
+ void execute_rename(const std::string &dest_name, Context *on_finish);
+
+ int resize(uint64_t size, bool allow_shrink, ProgressContext& prog_ctx);
+ void execute_resize(uint64_t size, bool allow_shrink, ProgressContext &prog_ctx,
+ Context *on_finish, uint64_t journal_op_tid);
+
+ // snapshot operations: each takes the snapshot namespace plus the snapshot name
+ int snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string& snap_name);
+ void snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string& snap_name, Context *on_finish);
+ void execute_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish,
+ uint64_t journal_op_tid, bool skip_object_map);
+
+ int snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name,
+ ProgressContext& prog_ctx);
+ void execute_snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ ProgressContext& prog_ctx, Context *on_finish);
+
+ int snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name);
+ void snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name,
+ Context *on_finish);
+ void execute_snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+
+ // note: the execute form identifies the source snapshot by id, not by name
+ int snap_rename(const char *srcname, const char *dstname);
+ void execute_snap_rename(const uint64_t src_snap_id,
+ const std::string &dest_snap_name,
+ Context *on_finish);
+
+ int snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name);
+ void execute_snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+
+ int snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name);
+ void execute_snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+
+ int snap_set_limit(uint64_t limit);
+ void execute_snap_set_limit(uint64_t limit, Context *on_finish);
+
+ int update_features(uint64_t features, bool enabled);
+ void execute_update_features(uint64_t features, bool enabled,
+ Context *on_finish, uint64_t journal_op_tid);
+
+ int metadata_set(const std::string &key, const std::string &value);
+ void execute_metadata_set(const std::string &key, const std::string &value,
+ Context *on_finish);
+
+ int metadata_remove(const std::string &key);
+ void execute_metadata_remove(const std::string &key, Context *on_finish);
+
+ int migrate(ProgressContext &prog_ctx);
+ void execute_migrate(ProgressContext &prog_ctx, Context *on_finish);
+
+ int sparsify(size_t sparse_size, ProgressContext &prog_ctx);
+ void execute_sparsify(size_t sparse_size, ProgressContext &prog_ctx,
+ Context *on_finish);
+
+ // caller must hold owner_lock for read; the implementation drops it, may acquire the exclusive lock, and re-takes it for read
+ int prepare_image_update(exclusive_lock::OperationRequestType request_type,
+ bool request_lock);
+
+private:
+ ImageCtxT &m_image_ctx; // reference to the image context this object was constructed with (NOTE(review): presumably -- constructor body not shown here)
+ std::atomic<int> m_async_request_seq; // NOTE(review): presumably a sequence number for async request ids -- confirm in Operations.cc
+
+ // builds a C_InvokeAsyncRequest from these arguments, sends it and waits for its result
+ int invoke_async_request(const std::string& name,
+ exclusive_lock::OperationRequestType request_type,
+ bool permit_snapshot,
+ const boost::function<void(Context*)>& local,
+ const boost::function<void(Context*)>& remote);
+};
+
+} // namespace librbd
+
+extern template class librbd::Operations<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATIONS_H
diff --git a/src/librbd/TaskFinisher.h b/src/librbd/TaskFinisher.h
new file mode 100644
index 00000000..410b8ee8
--- /dev/null
+++ b/src/librbd/TaskFinisher.h
@@ -0,0 +1,159 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef LIBRBD_TASK_FINISHER_H
+#define LIBRBD_TASK_FINISHER_H
+
+#include "include/Context.h"
+#include "common/ceph_context.h"
+#include "common/Finisher.h"
+#include "common/Mutex.h"
+#include "common/Timer.h"
+#include <map>
+#include <utility>
+
+class CephContext;
+
+namespace librbd {
+/**
+ * Bundle of the lock, timer and finisher used by TaskFinisher.
+ * TaskFinisher's constructor obtains one through
+ * CephContext::lookup_or_create_singleton_object() and copies out pointers
+ * to the three members.
+ */
+struct TaskFinisherSingleton {
+ Mutex m_lock; // handed to the SafeTimer; TaskFinisher also takes it around its task map
+ SafeTimer *m_safe_timer; // owned: created in the constructor, deleted in the destructor
+ Finisher *m_finisher; // owned: created and started in the constructor, stopped and deleted in the destructor
+
+ explicit TaskFinisherSingleton(CephContext *cct)
+ : m_lock("librbd::TaskFinisher::m_lock") {
+ m_safe_timer = new SafeTimer(cct, m_lock, false);
+ m_safe_timer->init();
+ m_finisher = new Finisher(cct, "librbd::TaskFinisher::m_finisher", "taskfin_librbd");
+ m_finisher->start();
+ }
+ virtual ~TaskFinisherSingleton() {
+ {
+ Mutex::Locker l(m_lock); // the timer is shut down and deleted while its lock is held
+ m_safe_timer->shutdown();
+ delete m_safe_timer;
+ }
+ m_finisher->wait_for_empty(); // finisher is torn down after the timer, outside the lock
+ m_finisher->stop();
+ delete m_finisher;
+ }
+};
+
+/**
+ * Runs Context callbacks keyed by a Task value, either after a delay (timer)
+ * or as soon as possible (finisher). At most one callback is tracked per
+ * Task.
+ *
+ * m_task_contexts maps Task -> (user context, timer event). The second
+ * element is the C_Task handed to the SafeTimer, or null when the task was
+ * put straight on the finisher by queue(task, ctx).
+ *
+ * The lock, timer and finisher belong to the TaskFinisherSingleton looked up
+ * on the CephContext; every access to m_task_contexts is made under that
+ * lock.
+ */
+template <typename Task>
+class TaskFinisher {
+public:
+ TaskFinisher(CephContext &cct) : m_cct(cct) {
+ auto& singleton =
+ cct.lookup_or_create_singleton_object<TaskFinisherSingleton>(
+ "librbd::TaskFinisher::m_safe_timer", false, &cct);
+ m_lock = &singleton.m_lock;
+ m_safe_timer = singleton.m_safe_timer;
+ m_finisher = singleton.m_finisher;
+ }
+
+ /// Forget the pending callback for task, if any: the user context is deleted without being completed.
+ void cancel(const Task& task) {
+ Mutex::Locker l(*m_lock);
+ typename TaskContexts::iterator it = m_task_contexts.find(task);
+ if (it != m_task_contexts.end()) {
+ delete it->second.first;
+ m_safe_timer->cancel_event(it->second.second);
+ m_task_contexts.erase(it);
+ }
+ }
+
+ /// Drop every tracked task as cancel() does, then queue comp on the finisher.
+ void cancel_all(Context *comp) {
+ {
+ Mutex::Locker l(*m_lock);
+ for (typename TaskContexts::iterator it = m_task_contexts.begin();
+ it != m_task_contexts.end(); ++it) {
+ delete it->second.first;
+ m_safe_timer->cancel_event(it->second.second);
+ }
+ m_task_contexts.clear();
+ }
+ m_finisher->queue(comp); // queued after the lock has been released
+ }
+
+ /// Schedule ctx for task after `seconds`. Returns false (and deletes ctx) when task is already tracked.
+ bool add_event_after(const Task& task, double seconds, Context *ctx) {
+ Mutex::Locker l(*m_lock);
+ if (m_task_contexts.count(task) != 0) {
+ // task already scheduled on finisher or timer
+ delete ctx;
+ return false;
+ }
+ C_Task *timer_ctx = new C_Task(this, task);
+ m_task_contexts[task] = std::make_pair(ctx, timer_ctx);
+
+ m_safe_timer->add_event_after(seconds, timer_ctx);
+ return true;
+ }
+
+ /// Queue ctx on the finisher without associating it with any task.
+ void queue(Context *ctx) {
+ m_finisher->queue(ctx);
+ }
+
+ /**
+  * Queue ctx for task on the finisher right away. A timer-scheduled entry
+  * for the same task is cancelled and its context deleted in favour of ctx.
+  * Returns false (and deletes ctx) when the task already sits on the
+  * finisher.
+  */
+ bool queue(const Task& task, Context *ctx) {
+ Mutex::Locker l(*m_lock);
+ typename TaskContexts::iterator it = m_task_contexts.find(task);
+ if (it != m_task_contexts.end()) {
+ if (it->second.second != NULL) {
+ ceph_assert(m_safe_timer->cancel_event(it->second.second));
+ delete it->second.first;
+ } else {
+ // task already scheduled on the finisher
+ delete ctx;
+ return false;
+ }
+ }
+ m_task_contexts[task] = std::make_pair(ctx, reinterpret_cast<Context *>(0)); // null timer event marks "on the finisher"
+
+ m_finisher->queue(new C_Task(this, task));
+ return true;
+ }
+
+private:
+ class C_Task : public Context { // trampoline given to the timer/finisher; calls back into complete(task)
+ public:
+ C_Task(TaskFinisher *task_finisher, const Task& task)
+ : m_task_finisher(task_finisher), m_task(task)
+ {
+ }
+ protected:
+ void finish(int r) override {
+ m_task_finisher->complete(m_task); // r is not used
+ }
+ private:
+ TaskFinisher *m_task_finisher;
+ Task m_task;
+ };
+
+ CephContext &m_cct;
+
+ Mutex *m_lock; // not owned: points into the TaskFinisherSingleton
+ Finisher *m_finisher; // not owned: taken from the TaskFinisherSingleton
+ SafeTimer *m_safe_timer; // not owned: taken from the TaskFinisherSingleton
+
+ typedef std::map<Task, std::pair<Context *, Context *> > TaskContexts; // task -> (user context, timer event or null)
+ TaskContexts m_task_contexts;
+
+ /// Called from C_Task: detach the task's entry under the lock, then complete its user context with 0.
+ void complete(const Task& task) {
+ Context *ctx = NULL;
+ {
+ Mutex::Locker l(*m_lock);
+ typename TaskContexts::iterator it = m_task_contexts.find(task);
+ if (it != m_task_contexts.end()) {
+ ctx = it->second.first;
+ m_task_contexts.erase(it);
+ }
+ }
+
+ if (ctx != NULL) {
+ ctx->complete(0); // invoked with the lock already released
+ }
+ }
+};
+
+} // namespace librbd
+
+#endif // LIBRBD_TASK_FINISHER_H
diff --git a/src/librbd/TrashWatcher.cc b/src/librbd/TrashWatcher.cc
new file mode 100644
index 00000000..74e12e48
--- /dev/null
+++ b/src/librbd/TrashWatcher.cc
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/TrashWatcher.h"
+#include "include/rbd_types.h"
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/watcher/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::TrashWatcher: " << __func__ << ": "
+
+namespace librbd {
+
+using namespace trash_watcher;
+using namespace watcher;
+
+using librbd::util::create_rados_callback;
+
+namespace {
+
+static const uint64_t NOTIFY_TIMEOUT_MS = 5000; // timeout argument given to aio_notify() by the notify_* helpers (milliseconds, per its name)
+
+} // anonymous namespace
+// Forwards io_ctx and work_queue to the Watcher base class together with RBD_TRASH, the object the notify_* helpers send to.
+template <typename I>
+TrashWatcher<I>::TrashWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue)
+ : Watcher(io_ctx, work_queue, RBD_TRASH) {
+}
+
+/**
+ * Send an "image added" notification on the RBD_TRASH object.
+ *
+ * The image id and trash spec are wrapped in a NotifyMessage, encoded and
+ * handed to aio_notify() with NOTIFY_TIMEOUT_MS; on_finish is wrapped in the
+ * rados completion passed to that call.
+ */
+template <typename I>
+void TrashWatcher<I>::notify_image_added(
+    librados::IoCtx &io_ctx, const std::string& image_id,
+    const cls::rbd::TrashImageSpec& trash_image_spec, Context *on_finish) {
+  auto *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  // serialize the payload
+  const NotifyMessage message{ImageAddedPayload{image_id, trash_image_spec}};
+  bufferlist payload_bl;
+  encode(message, payload_bl);
+
+  librados::AioCompletion *aio_comp = create_rados_callback(on_finish);
+  const int r = io_ctx.aio_notify(RBD_TRASH, aio_comp, payload_bl,
+                                  NOTIFY_TIMEOUT_MS, nullptr);
+  ceph_assert(r == 0);
+
+  // drop our reference to the completion
+  aio_comp->release();
+}
+
+/**
+ * Send an "image removed" notification on the RBD_TRASH object.
+ *
+ * The image id is wrapped in a NotifyMessage, encoded and handed to
+ * aio_notify() with NOTIFY_TIMEOUT_MS; on_finish is wrapped in the rados
+ * completion passed to that call.
+ */
+template <typename I>
+void TrashWatcher<I>::notify_image_removed(librados::IoCtx &io_ctx,
+                                           const std::string& image_id,
+                                           Context *on_finish) {
+  auto *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  // serialize the payload
+  const NotifyMessage message{ImageRemovedPayload{image_id}};
+  bufferlist payload_bl;
+  encode(message, payload_bl);
+
+  librados::AioCompletion *aio_comp = create_rados_callback(on_finish);
+  const int r = io_ctx.aio_notify(RBD_TRASH, aio_comp, payload_bl,
+                                  NOTIFY_TIMEOUT_MS, nullptr);
+  ceph_assert(r == 0);
+
+  // drop our reference to the completion
+  aio_comp->release();
+}
+
+/**
+ * Entry point for an incoming notification.
+ *
+ * Decodes bl into a NotifyMessage and dispatches its payload to the matching
+ * handle_payload() overload through HandlePayloadVisitor. If decoding fails
+ * the error is logged and a C_NotifyAck is completed immediately.
+ * notifier_id is not used.
+ */
+template <typename I>
+void TrashWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+                                    uint64_t notifier_id, bufferlist &bl) {
+  CephContext *cct = this->m_cct;
+  ldout(cct, 15) << "notify_id=" << notify_id << ", "
+                 << "handle=" << handle << dendl;
+
+  NotifyMessage message;
+  try {
+    auto bl_it = bl.cbegin();
+    decode(message, bl_it);
+  } catch (const buffer::error &err) {
+    lderr(cct) << "error decoding image notification: " << err.what()
+               << dendl;
+
+    // nothing can be dispatched: acknowledge right away
+    Context *ack_ctx = new C_NotifyAck(this, notify_id, handle);
+    ack_ctx->complete(0);
+    return;
+  }
+
+  const watcher::util::HandlePayloadVisitor<TrashWatcher<I>> visitor(
+    this, notify_id, handle);
+  apply_visitor(visitor, message.payload);
+}
+
+/**
+ * Payload handler for "image added": passes the image id and trash spec to
+ * handle_image_added(). on_notify_ack is left untouched and the handler
+ * always returns true.
+ */
+template <typename I>
+bool TrashWatcher<I>::handle_payload(const ImageAddedPayload &payload,
+                                     Context *on_notify_ack) {
+  auto *ceph_ctx = this->m_cct;
+  ldout(ceph_ctx, 20) << dendl;
+
+  const auto &added_id = payload.image_id;
+  const auto &added_spec = payload.trash_image_spec;
+  handle_image_added(added_id, added_spec);
+  return true;
+}
+
+/**
+ * Payload handler for "image removed": passes the image id to
+ * handle_image_removed(). on_notify_ack is left untouched and the handler
+ * always returns true.
+ */
+template <typename I>
+bool TrashWatcher<I>::handle_payload(const ImageRemovedPayload &payload,
+                                     Context *on_notify_ack) {
+  auto *ceph_ctx = this->m_cct;
+  ldout(ceph_ctx, 20) << dendl;
+
+  const auto &removed_id = payload.image_id;
+  handle_image_removed(removed_id);
+  return true;
+}
+// Unrecognised payloads are ignored: nothing is dispatched and neither argument is used.
+template <typename I>
+bool TrashWatcher<I>::handle_payload(const UnknownPayload &payload,
+ Context *on_notify_ack) {
+ return true; // NOTE(review): presumably tells HandlePayloadVisitor to ack the notification -- confirm in watcher/Utils.h
+}
+
+} // namespace librbd
+
+template class librbd::TrashWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/TrashWatcher.h b/src/librbd/TrashWatcher.h
new file mode 100644
index 00000000..8f5fd139
--- /dev/null
+++ b/src/librbd/TrashWatcher.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_WATCHER_H
+#define CEPH_LIBRBD_TRASH_WATCHER_H
+
+#include "include/int_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Watcher.h"
+#include "librbd/trash_watcher/Types.h"
+
+namespace librbd {
+
+namespace watcher {
+namespace util {
+template <typename> struct HandlePayloadVisitor;
+} // namespace util
+} // namespace watcher
+/**
+ * Watcher for the RBD_TRASH object. Incoming notifications are decoded into
+ * trash_watcher payloads and dispatched to the handle_payload() overloads,
+ * which forward to the pure virtual handle_image_added() /
+ * handle_image_removed() hooks that a subclass must implement.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class TrashWatcher : public Watcher {
+ friend struct watcher::util::HandlePayloadVisitor<TrashWatcher<ImageCtxT>>;
+public:
+ TrashWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue);
+
+ // static senders: encode the payload and aio_notify() it on RBD_TRASH; on_finish is wrapped in the rados completion
+ static void notify_image_added(librados::IoCtx &io_ctx,
+ const std::string& image_id,
+ const cls::rbd::TrashImageSpec& spec,
+ Context *on_finish);
+ static void notify_image_removed(librados::IoCtx &io_ctx,
+ const std::string& image_id,
+ Context *on_finish);
+
+protected: // subclass hooks, called from the matching handle_payload() overload
+ virtual void handle_image_added(const std::string &image_id,
+ const cls::rbd::TrashImageSpec& spec) = 0;
+ virtual void handle_image_removed(const std::string &image_id) = 0;
+
+private:
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+
+ // one overload per payload type; selected by HandlePayloadVisitor (hence the friend declaration above)
+ bool handle_payload(const trash_watcher::ImageAddedPayload &payload,
+ Context *on_notify_ack);
+ bool handle_payload(const trash_watcher::ImageRemovedPayload &payload,
+ Context *on_notify_ack);
+ bool handle_payload(const trash_watcher::UnknownPayload &payload,
+ Context *on_notify_ack);
+};
+
+} // namespace librbd
+
+extern template class librbd::TrashWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_TRASH_WATCHER_H
diff --git a/src/librbd/Types.h b/src/librbd/Types.h
new file mode 100644
index 00000000..3f110447
--- /dev/null
+++ b/src/librbd/Types.h
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_TYPES_H
+#define LIBRBD_TYPES_H
+
+#include "include/types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "deep_copy/Types.h"
+#include <map>
+#include <string>
+#include <utility>
+
+namespace librbd {
+
+// Performance counters: ids for librbd's counters, bracketed by l_librbd_first and l_librbd_last
+enum {
+ l_librbd_first = 26000, // start of the id range; the counters below number upward from here
+
+ l_librbd_rd, // read ops
+ l_librbd_rd_bytes, // bytes read
+ l_librbd_rd_latency, // average latency
+ l_librbd_wr, // wr_*: presumably the write counterparts of the three rd counters above
+ l_librbd_wr_bytes,
+ l_librbd_wr_latency,
+ l_librbd_discard, // discard_*: presumably the same ops / bytes / latency triple for discards
+ l_librbd_discard_bytes,
+ l_librbd_discard_latency,
+ l_librbd_flush,
+ l_librbd_flush_latency,
+
+ l_librbd_ws, // NOTE(review): "ws" presumably stands for write-same -- confirm where the counters are registered
+ l_librbd_ws_bytes,
+ l_librbd_ws_latency,
+
+ l_librbd_cmp, // NOTE(review): "cmp" presumably stands for compare-and-write -- confirm where the counters are registered
+ l_librbd_cmp_bytes,
+ l_librbd_cmp_latency,
+
+ l_librbd_snap_create,
+ l_librbd_snap_remove,
+ l_librbd_snap_rollback,
+ l_librbd_snap_rename,
+
+ l_librbd_notify,
+ l_librbd_resize,
+
+ l_librbd_readahead,
+ l_librbd_readahead_bytes,
+
+ l_librbd_invalidate_cache,
+
+ l_librbd_opened_time,
+ l_librbd_lock_acquired_time,
+
+ l_librbd_last, // end marker; new counters go above this line
+};
+
+typedef std::map<uint64_t, uint64_t> SnapSeqs; // NOTE(review): presumably maps one snapshot sequence/id to another -- confirm against its users
+
+/// Full information about an image's parent.
+struct ParentImageInfo {
+ /// Identification of the parent.
+ cls::rbd::ParentImageSpec spec;
+
+ /** @brief Where the portion of data shared with the child image ends.
+ * Since images can be resized multiple times, the portion of data shared
+ * with the child image is not necessarily min(parent size, child size).
+ * If the child image is first shrunk and then enlarged, the common portion
+ * will be shorter.
+ * Defaults to 0 when not set explicitly. */
+ uint64_t overlap = 0;
+};
+
+/// Per-snapshot record: name and namespace, size, parent information,
+/// protection status, flags and timestamp. All fields are set by the
+/// constructor; there is no default constructor.
+struct SnapInfo {
+  std::string name;
+  cls::rbd::SnapshotNamespace snap_namespace;
+  uint64_t size;              // NOTE(review): presumably the image size at this snapshot -- confirm
+  ParentImageInfo parent;
+  uint8_t protection_status;  // NOTE(review): presumably one of the RBD_PROTECTION_STATUS_* values -- confirm
+  uint64_t flags;
+  utime_t timestamp;
+
+  /// Each argument initialises the member of the same name (without the
+  /// leading underscore).
+  SnapInfo(std::string _name,
+           const cls::rbd::SnapshotNamespace &_snap_namespace,
+           uint64_t _size, const ParentImageInfo &_parent,
+           uint8_t _protection_status, uint64_t _flags, utime_t _timestamp)
+    // _name is received by value, so it is moved into place rather than
+    // copied a second time
+    : name(std::move(_name)), snap_namespace(_snap_namespace), size(_size),
+      parent(_parent), protection_status(_protection_status), flags(_flags),
+      timestamp(_timestamp) {
+  }
+};
+
+enum { // image open flags; each value is a distinct bit, so they can be OR-ed together
+ OPEN_FLAG_SKIP_OPEN_PARENT = 1 << 0,
+ OPEN_FLAG_OLD_FORMAT = 1 << 1,
+ OPEN_FLAG_IGNORE_MIGRATING = 1 << 2
+};
+
+/// Identifies an image taking part in a migration (pool, namespace, name and
+/// id) together with a snapshot map, an overlap and a flatten flag.
+/// NOTE(review): whether this describes the migration source or destination
+/// is not visible here -- confirm against its users.
+struct MigrationInfo {
+  int64_t pool_id = -1;
+  std::string pool_namespace;
+  std::string image_name;
+  std::string image_id;
+  deep_copy::SnapMap snap_map;
+  uint64_t overlap = 0;
+  bool flatten = false;
+
+  /// Leaves every member at its default; empty() then returns true.
+  MigrationInfo() = default;
+
+  /// Member-wise constructor: each argument initialises the member of the
+  /// same name.
+  MigrationInfo(int64_t pool_id, const std::string& pool_namespace,
+                const std::string& image_name, const std::string& image_id,
+                const deep_copy::SnapMap &snap_map, uint64_t overlap,
+                bool flatten)
+    : pool_id(pool_id),
+      pool_namespace(pool_namespace),
+      image_name(image_name),
+      image_id(image_id),
+      snap_map(snap_map),
+      overlap(overlap),
+      flatten(flatten) {
+  }
+
+  /// True while pool_id still holds its -1 default.
+  bool empty() const {
+    return -1 == pool_id;
+  }
+};
+
+} // namespace librbd
+
+#endif // LIBRBD_TYPES_H
diff --git a/src/librbd/Utils.cc b/src/librbd/Utils.cc
new file mode 100644
index 00000000..c9d7b041
--- /dev/null
+++ b/src/librbd/Utils.cc
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "librbd/Utils.h"
+#include "include/rbd_types.h"
+#include "include/stringify.h"
+#include "include/rbd/features.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Features.h"
+#include <random>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::util::" << __func__ << ": "
+
+namespace librbd {
+namespace util {
+
+// Name of a group's header object: RBD_GROUP_HEADER_PREFIX followed by the group id.
+const std::string group_header_name(const std::string &group_id)
+{
+  std::string oid(RBD_GROUP_HEADER_PREFIX);
+  oid += group_id;
+  return oid;
+}
+
+// Name of the id object for an image name: RBD_ID_PREFIX followed by the name.
+const std::string id_obj_name(const std::string &name)
+{
+  std::string oid(RBD_ID_PREFIX);
+  oid += name;
+  return oid;
+}
+
+// Name of an image's header object: RBD_HEADER_PREFIX followed by the image id.
+const std::string header_name(const std::string &image_id)
+{
+  std::string oid(RBD_HEADER_PREFIX);
+  oid += image_id;
+  return oid;
+}
+
+// Old-style header object name: the image name followed by RBD_SUFFIX.
+const std::string old_header_name(const std::string &image_name)
+{
+  std::string oid(image_name);
+  oid += RBD_SUFFIX;
+  return oid;
+}
+
+// Builds "<name> (<address>)", where the address is rendered by stringify().
+std::string unique_lock_name(const std::string &name, void *address) {
+  std::string lock_name(name);
+  lock_name += " (";
+  lock_name += stringify(address);
+  lock_name += ")";
+  return lock_name;
+}
+// Context flavour: selects the member-function template with Context::complete, so on_finish->complete() is called with the op's return value.
+librados::AioCompletion *create_rados_callback(Context *on_finish) {
+ return create_rados_callback<Context, &Context::complete>(on_finish);
+}
+
+// Produces an image id made of the rados instance id followed by a random
+// 32-bit value, both written in hexadecimal. If the result is longer than
+// RBD_MAX_IMAGE_ID_LENGTH only its trailing characters are kept.
+std::string generate_image_id(librados::IoCtx &ioctx) {
+  librados::Rados rados(ioctx);
+  const uint64_t instance_id = rados.get_instance_id();
+
+  std::mt19937 rng{std::random_device{}()};
+  std::uniform_int_distribution<uint32_t> dist{0, 0xFFFFFFFF};
+  const uint32_t random_part = dist(rng);
+
+  // std::hex stays in effect for both insertions
+  std::ostringstream id_ss;
+  id_ss << std::hex << instance_id << random_part;
+  std::string image_id = id_ss.str();
+
+  // ensure the image id won't overflow the fixed block name size
+  if (image_id.length() > RBD_MAX_IMAGE_ID_LENGTH) {
+    image_id.erase(0, image_id.length() - RBD_MAX_IMAGE_ID_LENGTH);
+  }
+  return image_id;
+}
+
+// Reads the "rbd_default_features" option as a string and converts it with
+// rbd_features_from_string(); the second argument of that call is nullptr.
+uint64_t get_rbd_default_features(CephContext* cct)
+{
+  const auto features_str =
+    cct->_conf.get_val<std::string>("rbd_default_features");
+  return librbd::rbd_features_from_string(features_str, nullptr);
+}
+
+
+/**
+ * Examine the next extent of bp and update the pending write window.
+ *
+ * The extent starts at *offset and is sparse_size bytes long, shortened so
+ * that it does not run past length. *offset is always advanced by the extent
+ * size. A non-zero extent grows *write_length; a zero extent seen while
+ * *write_length is still 0 moves *write_offset forward instead.
+ *
+ * @return true when *write_length is non-zero and either this extent was all
+ *         zeroes or *offset has reached length; false otherwise.
+ */
+bool calc_sparse_extent(const bufferptr &bp,
+                        size_t sparse_size,
+                        uint64_t length,
+                        size_t *write_offset,
+                        size_t *write_length,
+                        size_t *offset) {
+  // clamp the final extent to the end of the data
+  const size_t extent_size =
+    (*offset + sparse_size > length) ? length - *offset : sparse_size;
+
+  bufferptr extent(bp, *offset, extent_size);
+  *offset += extent_size;
+
+  const bool all_zero = extent.is_zero();
+  if (all_zero) {
+    if (*write_length == 0) {
+      // still skipping leading zeroes
+      *write_offset += extent_size;
+    }
+  } else {
+    *write_length += extent_size;
+  }
+
+  return *write_length != 0 && (all_zero || *offset == length);
+}
+
+// Returns true when metadata_key starts with ImageCtx::METADATA_CONF_PREFIX
+// and has at least one character after it; the remainder is stored in
+// *config_key. Otherwise returns false and leaves *config_key untouched.
+bool is_metadata_config_override(const std::string& metadata_key,
+                                 std::string* config_key) {
+  const auto &prefix = librbd::ImageCtx::METADATA_CONF_PREFIX;
+  const size_t prefix_len = prefix.size();
+
+  if (metadata_key.size() <= prefix_len ||
+      metadata_key.compare(0, prefix_len, prefix) != 0) {
+    return false;
+  }
+
+  *config_key = metadata_key.substr(prefix_len);
+  return true;
+}
+
+/**
+ * Open an IoCtx for pool_id on the cluster that src_io_ctx belongs to.
+ *
+ * On success the namespace of *dst_io_ctx is set to *pool_namespace when one
+ * is given, otherwise to the namespace of src_io_ctx, and 0 is returned.
+ * On failure the error from ioctx_create2() is logged (using pool_desc in
+ * the message) and returned; -ENOENT is logged at level 1 rather than as an
+ * error.
+ */
+int create_ioctx(librados::IoCtx& src_io_ctx, const std::string& pool_desc,
+                 int64_t pool_id,
+                 const std::optional<std::string>& pool_namespace,
+                 librados::IoCtx* dst_io_ctx) {
+  auto *cct = reinterpret_cast<CephContext *>(src_io_ctx.cct());
+
+  librados::Rados rados(src_io_ctx);
+  const int r = rados.ioctx_create2(pool_id, *dst_io_ctx);
+  if (r < 0) {
+    if (r == -ENOENT) {
+      ldout(cct, 1) << pool_desc << " pool " << pool_id << " no longer exists"
+                    << dendl;
+    } else {
+      lderr(cct) << "error accessing " << pool_desc << " pool " << pool_id
+                 << dendl;
+    }
+    return r;
+  }
+
+  dst_io_ctx->set_namespace(
+    pool_namespace.has_value() ? *pool_namespace
+                               : src_io_ctx.get_namespace());
+  return 0;
+}
+
+} // namespace util
+} // namespace librbd
diff --git a/src/librbd/Utils.h b/src/librbd/Utils.h
new file mode 100644
index 00000000..c0b68b9f
--- /dev/null
+++ b/src/librbd/Utils.h
@@ -0,0 +1,221 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_UTILS_H
+#define CEPH_LIBRBD_UTILS_H
+
+#include "include/rados/librados.hpp"
+#include "include/rbd_types.h"
+#include "include/Context.h"
+#include "common/zipkin_trace.h"
+
+#include <atomic>
+#include <type_traits>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace util {
+namespace detail {
+
+/// rados completion trampoline: arg is a T*, and T::complete() receives the
+/// op's return value.
+template <typename T>
+void rados_callback(rados_completion_t c, void *arg) {
+  T *target = reinterpret_cast<T*>(arg);
+  target->complete(rados_aio_get_return_value(c));
+}
+
+/// rados completion trampoline: arg is a T*, and member function MF receives
+/// the op's return value.
+template <typename T, void(T::*MF)(int)>
+void rados_callback(rados_completion_t c, void *arg) {
+  const int r = rados_aio_get_return_value(c);
+  T *target = reinterpret_cast<T*>(arg);
+  (target->*MF)(r);
+}
+
+/// rados completion trampoline for state-machine style handlers: MF gets a
+/// pointer to the return value (which it may rewrite) and returns either
+/// nullptr or a Context to complete with that value. When a Context is
+/// returned and destroy is set, the T object is deleted afterwards.
+template <typename T, Context*(T::*MF)(int*), bool destroy>
+void rados_state_callback(rados_completion_t c, void *arg) {
+  T *target = reinterpret_cast<T*>(arg);
+  int r = rados_aio_get_return_value(c);
+
+  Context *on_finish = (target->*MF)(&r);
+  if (on_finish == nullptr) {
+    return;
+  }
+
+  on_finish->complete(r);
+  if (destroy) {
+    delete target;
+  }
+}
+
+/// Context whose finish() calls member function MF on the wrapped object.
+template <typename T, void (T::*MF)(int)>
+class C_CallbackAdapter : public Context {
+  T *m_target;
+public:
+  C_CallbackAdapter(T *obj) : m_target(obj) {
+  }
+
+protected:
+  void finish(int r) override {
+    (m_target->*MF)(r);
+  }
+};
+
+/// Context counterpart of rados_state_callback(): complete() runs MF, then
+/// completes the Context it returned (if any), deletes the wrapped object
+/// when destroy is set, and finally hands over to Context::complete().
+template <typename T, Context*(T::*MF)(int*), bool destroy>
+class C_StateCallbackAdapter : public Context {
+  T *m_target;
+public:
+  C_StateCallbackAdapter(T *obj) : m_target(obj) {
+  }
+
+protected:
+  void complete(int r) override {
+    Context *on_finish = (m_target->*MF)(&r);
+    if (on_finish != nullptr) {
+      on_finish->complete(r);
+      if (destroy) {
+        delete m_target;
+      }
+    }
+    // always runs, whether or not a Context was returned
+    Context::complete(r);
+  }
+  void finish(int r) override {
+    // all of the work happens in complete()
+  }
+};
+
+/// Context that re-queues on_finish, with the received result, on the given
+/// work queue instead of completing it inline.
+template <typename WQ>
+struct C_AsyncCallback : public Context {
+  WQ *op_work_queue;
+  Context *on_finish;
+
+  C_AsyncCallback(WQ *op_work_queue, Context *on_finish)
+    : op_work_queue(op_work_queue), on_finish(on_finish) {
+  }
+
+  void finish(int r) override {
+    op_work_queue->queue(on_finish, r);
+  }
+};
+
+} // namespace detail
+
+std::string generate_image_id(librados::IoCtx &ioctx);
+// Templated overload: T is unused and the call goes to the non-template generate_image_id() above (NOTE(review): presumably a substitution point for tests -- confirm).
+template <typename T>
+inline std::string generate_image_id(librados::IoCtx &ioctx) {
+ return generate_image_id(ioctx);
+}
+
+const std::string group_header_name(const std::string &group_id);
+const std::string id_obj_name(const std::string &name);
+const std::string header_name(const std::string &image_id);
+const std::string old_header_name(const std::string &image_name);
+std::string unique_lock_name(const std::string &name, void *address);
+
+librados::AioCompletion *create_rados_callback(Context *on_finish);
+/**
+ * create_rados_callback() overloads: each returns a librados AioCompletion
+ * created with obj as the callback argument and one of the detail::
+ * trampolines as the callback. This first form calls obj->complete(r).
+ */
+template <typename T>
+librados::AioCompletion *create_rados_callback(T *obj) {
+ return librados::Rados::aio_create_completion(
+ obj, &detail::rados_callback<T>, nullptr);
+}
+
+// This form calls the member function MF on obj with the op's return value.
+template <typename T, void(T::*MF)(int)>
+librados::AioCompletion *create_rados_callback(T *obj) {
+ return librados::Rados::aio_create_completion(
+ obj, &detail::rados_callback<T, MF>, nullptr);
+}
+
+// State-machine form: MF may return a Context to complete; with destroy (the default) obj is deleted after that Context completes.
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+librados::AioCompletion *create_rados_callback(T *obj) {
+ return librados::Rados::aio_create_completion(
+ obj, &detail::rados_state_callback<T, MF, destroy>, nullptr);
+}
+// Returns a new Context whose finish() calls MF (T::complete by default) on obj.
+template <typename T, void(T::*MF)(int) = &T::complete>
+Context *create_context_callback(T *obj) {
+ return new detail::C_CallbackAdapter<T, MF>(obj);
+}
+
+// State-machine form: returns a new detail::C_StateCallbackAdapter, which completes whatever Context MF returns.
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+Context *create_context_callback(T *obj) {
+ return new detail::C_StateCallbackAdapter<T, MF, destroy>(obj);
+}
+// Wraps on_finish so that completing the returned Context queues on_finish on image_ctx.op_work_queue instead of running it inline.
+template <typename I>
+Context *create_async_context_callback(I &image_ctx, Context *on_finish) {
+ // use async callback to acquire a clean lock context
+ return new detail::C_AsyncCallback<
+ typename std::decay<decltype(*image_ctx.op_work_queue)>::type>(
+ image_ctx.op_work_queue, on_finish);
+}
+
+// Same wrapper, for callers that already hold the work queue pointer.
+template <typename WQ>
+Context *create_async_context_callback(WQ *work_queue, Context *on_finish) {
+ // use async callback to acquire a clean lock context
+ return new detail::C_AsyncCallback<WQ>(work_queue, on_finish);
+}
+// Identity helper: returns the pointer it is given.
+// TODO: temporary until AioCompletion supports templated ImageCtx
+inline ImageCtx *get_image_ctx(ImageCtx *image_ctx) {
+ return image_ctx;
+}
+
+/// helper for tracking in-flight async ops when coordinating
+/// a shut down of the invoking class instance
+class AsyncOpTracker {
+public:
+ void start_op() { // one call per op started; must be balanced by finish_op()
+ m_refs++;
+ }
+
+ // Drops one reference; the call that brings the count to zero fires (and clears) a waiter registered by wait().
+ void finish_op() {
+ if (--m_refs == 0 && m_on_finish != nullptr) {
+ Context *on_finish = nullptr;
+ std::swap(on_finish, m_on_finish);
+ on_finish->complete(0);
+ }
+ }
+
+ // Registers on_finish (wrapped so it runs via image_ctx's work queue); it is completed at once if no ops are in flight. Only one waiter may be pending.
+ template <typename I>
+ void wait(I &image_ctx, Context *on_finish) {
+ ceph_assert(m_on_finish == nullptr);
+
+ on_finish = create_async_context_callback(image_ctx, on_finish);
+ if (m_refs == 0) {
+ on_finish->complete(0);
+ return;
+ }
+ m_on_finish = on_finish;
+ }
+
+ // NOTE(review): m_refs is atomic but m_on_finish is a plain pointer with no lock here -- callers presumably serialise wait() against finish_op(); confirm.
+private:
+ std::atomic<uint64_t> m_refs = { 0 }; // number of ops started and not yet finished
+ Context *m_on_finish = nullptr; // pending waiter from wait(), or nullptr
+};
+
+uint64_t get_rbd_default_features(CephContext* cct);
+
+bool calc_sparse_extent(const bufferptr &bp,
+ size_t sparse_size,
+ uint64_t length,
+ size_t *write_offset,
+ size_t *write_length,
+ size_t *offset);
+
+/// Returns a child trace of parent_trace named trace_name, attached to
+/// image_ctx.trace_endpoint. When parent_trace is not valid a
+/// default-constructed trace is returned instead.
+template <typename I>
+inline ZTracer::Trace create_trace(const I &image_ctx, const char *trace_name,
+                                   const ZTracer::Trace &parent_trace) {
+  if (!parent_trace.valid()) {
+    return ZTracer::Trace();
+  }
+  return ZTracer::Trace(trace_name, &image_ctx.trace_endpoint, &parent_trace);
+}
+
+bool is_metadata_config_override(const std::string& metadata_key,
+ std::string* config_key);
+
+int create_ioctx(librados::IoCtx& src_io_ctx, const std::string& pool_desc,
+ int64_t pool_id,
+ const std::optional<std::string>& pool_namespace,
+ librados::IoCtx* dst_io_ctx);
+
+} // namespace util
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_UTILS_H
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
new file mode 100644
index 00000000..2ed85f78
--- /dev/null
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -0,0 +1,525 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "librbd/WatchNotifyTypes.h"
+#include "librbd/watcher/Utils.h"
+
+namespace librbd {
+namespace watch_notify {
+
+namespace {
+
+class CheckForRefreshVisitor : public boost::static_visitor<bool> {
+public:
+ template <typename Payload>
+ inline bool operator()(const Payload &payload) const {
+ return Payload::CHECK_FOR_REFRESH;
+ }
+};
+
+class GetNotifyOpVisitor : public boost::static_visitor<NotifyOp> {
+public:
+ template <typename Payload>
+ NotifyOp operator()(const Payload &payload) const {
+ return Payload::NOTIFY_OP;
+ }
+};
+
+class DumpPayloadVisitor : public boost::static_visitor<void> { // variant visitor that dumps whichever payload is active
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ NotifyOp notify_op = Payload::NOTIFY_OP; // op is a static property of the payload type
+ m_formatter->dump_string("notify_op", stringify(notify_op)); // emitted as text via stringify()
+ payload.dump(m_formatter); // then the payload's own fields
+ }
+
+private:
+ ceph::Formatter *m_formatter; // not owned; only used for the duration of the visit
+};
+
+} // anonymous namespace
+
+void AsyncRequestId::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(client_id, bl);
+ encode(request_id, bl);
+}
+
+void AsyncRequestId::decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(client_id, iter);
+ decode(request_id, iter);
+}
+
+void AsyncRequestId::dump(Formatter *f) const {
+ f->open_object_section("client_id");
+ client_id.dump(f);
+ f->close_section();
+ f->dump_unsigned("request_id", request_id);
+}
+
+void AcquiredLockPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(client_id, bl);
+}
+
+void AcquiredLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ if (version >= 2) {
+ decode(client_id, iter);
+ }
+}
+
+void AcquiredLockPayload::dump(Formatter *f) const {
+ f->open_object_section("client_id");
+ client_id.dump(f);
+ f->close_section();
+}
+
+void ReleasedLockPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(client_id, bl);
+}
+
+void ReleasedLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ if (version >= 2) {
+ decode(client_id, iter);
+ }
+}
+
+void ReleasedLockPayload::dump(Formatter *f) const {
+ f->open_object_section("client_id");
+ client_id.dump(f);
+ f->close_section();
+}
+
+void RequestLockPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(client_id, bl);
+ encode(force, bl);
+}
+
+void RequestLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) { // fields are read only if the message version carries them
+ using ceph::decode;
+ if (version >= 2) { // client_id is read only for version 2 and later
+ decode(client_id, iter);
+ }
+ if (version >= 3) { // force is read only for version 3 and later; otherwise it keeps its current value
+ decode(force, iter);
+ }
+}
+
+void RequestLockPayload::dump(Formatter *f) const {
+ f->open_object_section("client_id");
+ client_id.dump(f);
+ f->close_section();
+ f->dump_bool("force", force);
+}
+
+void HeaderUpdatePayload::encode(bufferlist &bl) const {
+}
+
+void HeaderUpdatePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void HeaderUpdatePayload::dump(Formatter *f) const {
+}
+
+void AsyncRequestPayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(async_request_id, bl);
+}
+
+void AsyncRequestPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(async_request_id, iter);
+}
+
+void AsyncRequestPayloadBase::dump(Formatter *f) const {
+ f->open_object_section("async_request_id");
+ async_request_id.dump(f);
+ f->close_section();
+}
+
+void AsyncProgressPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ AsyncRequestPayloadBase::encode(bl);
+ encode(offset, bl);
+ encode(total, bl);
+}
+
+void AsyncProgressPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ AsyncRequestPayloadBase::decode(version, iter);
+ decode(offset, iter);
+ decode(total, iter);
+}
+
+void AsyncProgressPayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("total", total);
+}
+
+void AsyncCompletePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ AsyncRequestPayloadBase::encode(bl);
+ encode(result, bl);
+}
+
+void AsyncCompletePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ AsyncRequestPayloadBase::decode(version, iter);
+ decode(result, iter);
+}
+
+void AsyncCompletePayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_int("result", result);
+}
+
+void ResizePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(size, bl);
+ AsyncRequestPayloadBase::encode(bl);
+ encode(allow_shrink, bl);
+}
+
+void ResizePayload::decode(__u8 version, bufferlist::const_iterator &iter) { // mirrors encode(): size, async request id, allow_shrink
+ using ceph::decode;
+ decode(size, iter); // size precedes the base-class fields on the wire
+ AsyncRequestPayloadBase::decode(version, iter);
+
+ if (version >= 4) { // allow_shrink is read only for version 4 and later; otherwise it keeps its current value
+ decode(allow_shrink, iter);
+ }
+}
+
+void ResizePayload::dump(Formatter *f) const {
+ f->dump_unsigned("size", size);
+ f->dump_bool("allow_shrink", allow_shrink);
+ AsyncRequestPayloadBase::dump(f);
+}
+
+void SnapPayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(snap_name, bl);
+ encode(snap_namespace, bl);
+}
+
+void SnapPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(snap_name, iter);
+ if (version >= 6) {
+ decode(snap_namespace, iter);
+ }
+}
+
+void SnapPayloadBase::dump(Formatter *f) const {
+ f->dump_string("snap_name", snap_name);
+ snap_namespace.dump(f);
+}
+
+void SnapCreatePayload::encode(bufferlist &bl) const {
+ SnapPayloadBase::encode(bl);
+}
+
+void SnapCreatePayload::decode(__u8 version, bufferlist::const_iterator &iter) { // base decode plus a version-5-only namespace read
+ using ceph::decode;
+ SnapPayloadBase::decode(version, iter); // reads snap_name, and snap_namespace when version >= 6
+ if (version == 5) { // exactly version 5: the base skipped the namespace, so it is read here instead
+ decode(snap_namespace, iter); // NOTE(review): presumably v5 carried the namespace only for snap-create -- confirm against history
+ }
+}
+
+void SnapCreatePayload::dump(Formatter *f) const {
+ SnapPayloadBase::dump(f);
+}
+
+void SnapRenamePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(snap_id, bl);
+ SnapPayloadBase::encode(bl);
+}
+
+void SnapRenamePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(snap_id, iter);
+ SnapPayloadBase::decode(version, iter);
+}
+
+void SnapRenamePayload::dump(Formatter *f) const {
+ f->dump_unsigned("src_snap_id", snap_id);
+ SnapPayloadBase::dump(f);
+}
+
+void RenamePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(image_name, bl);
+}
+
+void RenamePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(image_name, iter);
+}
+
+void RenamePayload::dump(Formatter *f) const {
+ f->dump_string("image_name", image_name);
+}
+
+void UpdateFeaturesPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(features, bl);
+ encode(enabled, bl);
+}
+
+void UpdateFeaturesPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(features, iter);
+ decode(enabled, iter);
+}
+
+void UpdateFeaturesPayload::dump(Formatter *f) const {
+ f->dump_unsigned("features", features);
+ f->dump_bool("enabled", enabled);
+}
+
+void SparsifyPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ AsyncRequestPayloadBase::encode(bl);
+ encode(sparse_size, bl);
+}
+
+void SparsifyPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ AsyncRequestPayloadBase::decode(version, iter);
+ decode(sparse_size, iter);
+}
+
+void SparsifyPayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_unsigned("sparse_size", sparse_size);
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+bool NotifyMessage::check_for_refresh() const {
+ return boost::apply_visitor(CheckForRefreshVisitor(), payload);
+}
+
+void NotifyMessage::encode(bufferlist& bl) const { // wrap the active payload's encoding in a versioned envelope
+ ENCODE_START(6, 1, bl); // presumably struct version 6 / compat version 1 -- confirm ENCODE_START's argument order in include/encoding.h
+ boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload); // dispatches on the variant's current alternative
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) { // read the op code, pick the matching payload type, then decode into it
+ DECODE_START(1, iter); // struct_v used below is not declared here, so it is supplied by this macro
+
+ uint32_t notify_op; // raw op code as found on the wire
+ decode(notify_op, iter);
+
+ // select the correct payload variant based upon the encoded op
+ switch (notify_op) {
+ case NOTIFY_OP_ACQUIRED_LOCK:
+ payload = AcquiredLockPayload();
+ break;
+ case NOTIFY_OP_RELEASED_LOCK:
+ payload = ReleasedLockPayload();
+ break;
+ case NOTIFY_OP_REQUEST_LOCK:
+ payload = RequestLockPayload();
+ break;
+ case NOTIFY_OP_HEADER_UPDATE:
+ payload = HeaderUpdatePayload();
+ break;
+ case NOTIFY_OP_ASYNC_PROGRESS:
+ payload = AsyncProgressPayload();
+ break;
+ case NOTIFY_OP_ASYNC_COMPLETE:
+ payload = AsyncCompletePayload();
+ break;
+ case NOTIFY_OP_FLATTEN:
+ payload = FlattenPayload();
+ break;
+ case NOTIFY_OP_RESIZE:
+ payload = ResizePayload();
+ break;
+ case NOTIFY_OP_SNAP_CREATE:
+ payload = SnapCreatePayload();
+ break;
+ case NOTIFY_OP_SNAP_REMOVE:
+ payload = SnapRemovePayload();
+ break;
+ case NOTIFY_OP_SNAP_RENAME:
+ payload = SnapRenamePayload();
+ break;
+ case NOTIFY_OP_SNAP_PROTECT:
+ payload = SnapProtectPayload();
+ break;
+ case NOTIFY_OP_SNAP_UNPROTECT:
+ payload = SnapUnprotectPayload();
+ break;
+ case NOTIFY_OP_REBUILD_OBJECT_MAP:
+ payload = RebuildObjectMapPayload();
+ break;
+ case NOTIFY_OP_RENAME:
+ payload = RenamePayload();
+ break;
+ case NOTIFY_OP_UPDATE_FEATURES:
+ payload = UpdateFeaturesPayload();
+ break;
+ case NOTIFY_OP_MIGRATE:
+ payload = MigratePayload();
+ break;
+ case NOTIFY_OP_SPARSIFY:
+ payload = SparsifyPayload();
+ break;
+ default: // op code not listed above: fall back to UnknownPayload, whose decode() reads nothing
+ payload = UnknownPayload();
+ break;
+ }
+
+ apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload); // payload-specific fields, gated on struct_v
+ DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+ apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+NotifyOp NotifyMessage::get_notify_op() const {
+ return apply_visitor(GetNotifyOpVisitor(), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) { // appends heap-allocated samples to o; nothing here frees them
+ o.push_back(new NotifyMessage(AcquiredLockPayload(ClientId(1, 2))));
+ o.push_back(new NotifyMessage(ReleasedLockPayload(ClientId(1, 2))));
+ o.push_back(new NotifyMessage(RequestLockPayload(ClientId(1, 2), true)));
+ o.push_back(new NotifyMessage(HeaderUpdatePayload()));
+ o.push_back(new NotifyMessage(AsyncProgressPayload(AsyncRequestId(ClientId(0, 1), 2), 3, 4)));
+ o.push_back(new NotifyMessage(AsyncCompletePayload(AsyncRequestId(ClientId(0, 1), 2), 3)));
+ o.push_back(new NotifyMessage(FlattenPayload(AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(ResizePayload(123, true, AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(SnapCreatePayload(cls::rbd::UserSnapshotNamespace(),
+ "foo")));
+ o.push_back(new NotifyMessage(SnapRemovePayload(cls::rbd::UserSnapshotNamespace(), "foo"))); // NOTE(review): no SnapRenamePayload sample appears in this list
+ o.push_back(new NotifyMessage(SnapProtectPayload(cls::rbd::UserSnapshotNamespace(), "foo")));
+ o.push_back(new NotifyMessage(SnapUnprotectPayload(cls::rbd::UserSnapshotNamespace(), "foo")));
+ o.push_back(new NotifyMessage(RebuildObjectMapPayload(AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(RenamePayload("foo")));
+ o.push_back(new NotifyMessage(UpdateFeaturesPayload(1, true)));
+ o.push_back(new NotifyMessage(MigratePayload(AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(SparsifyPayload(AsyncRequestId(ClientId(0, 1), 2), 1)));
+}
+
+void ResponseMessage::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(result, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ResponseMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(1, iter);
+ decode(result, iter);
+ DECODE_FINISH(iter);
+}
+
+void ResponseMessage::dump(Formatter *f) const {
+ f->dump_int("result", result);
+}
+
+void ResponseMessage::generate_test_instances(std::list<ResponseMessage *> &o) {
+ o.push_back(new ResponseMessage(1));
+}
+
+std::ostream &operator<<(std::ostream &out,
+ const librbd::watch_notify::NotifyOp &op) {
+ using namespace librbd::watch_notify;
+
+ switch (op) {
+ case NOTIFY_OP_ACQUIRED_LOCK:
+ out << "AcquiredLock";
+ break;
+ case NOTIFY_OP_RELEASED_LOCK:
+ out << "ReleasedLock";
+ break;
+ case NOTIFY_OP_REQUEST_LOCK:
+ out << "RequestLock";
+ break;
+ case NOTIFY_OP_HEADER_UPDATE:
+ out << "HeaderUpdate";
+ break;
+ case NOTIFY_OP_ASYNC_PROGRESS:
+ out << "AsyncProgress";
+ break;
+ case NOTIFY_OP_ASYNC_COMPLETE:
+ out << "AsyncComplete";
+ break;
+ case NOTIFY_OP_FLATTEN:
+ out << "Flatten";
+ break;
+ case NOTIFY_OP_RESIZE:
+ out << "Resize";
+ break;
+ case NOTIFY_OP_SNAP_CREATE:
+ out << "SnapCreate";
+ break;
+ case NOTIFY_OP_SNAP_REMOVE:
+ out << "SnapRemove";
+ break;
+ case NOTIFY_OP_SNAP_RENAME:
+ out << "SnapRename";
+ break;
+ case NOTIFY_OP_SNAP_PROTECT:
+ out << "SnapProtect";
+ break;
+ case NOTIFY_OP_SNAP_UNPROTECT:
+ out << "SnapUnprotect";
+ break;
+ case NOTIFY_OP_REBUILD_OBJECT_MAP:
+ out << "RebuildObjectMap";
+ break;
+ case NOTIFY_OP_RENAME:
+ out << "Rename";
+ break;
+ case NOTIFY_OP_UPDATE_FEATURES:
+ out << "UpdateFeatures";
+ break;
+ case NOTIFY_OP_MIGRATE:
+ out << "Migrate";
+ break;
+ case NOTIFY_OP_SPARSIFY:
+ out << "Sparsify";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+ break;
+ }
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out,
+ const librbd::watch_notify::AsyncRequestId &request) {
+ const auto &client = request.client_id; // rendered as "[gid,handle,request_id]"
+ return out << "[" << client.gid << "," << client.handle << ","
+ << request.request_id << "]";
+}
+} // namespace watch_notify
+} // namespace librbd
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
new file mode 100644
index 00000000..02d2b60b
--- /dev/null
+++ b/src/librbd/WatchNotifyTypes.h
@@ -0,0 +1,400 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef LIBRBD_WATCH_NOTIFY_TYPES_H
+#define LIBRBD_WATCH_NOTIFY_TYPES_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "librbd/watcher/Types.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace ceph {
+class Formatter;
+}
+
+namespace librbd {
+namespace watch_notify {
+
+using librbd::watcher::ClientId;
+
+WRITE_CLASS_ENCODER(ClientId);
+
+struct AsyncRequestId { // (client, per-client request number) pair identifying an async request
+ ClientId client_id;
+ uint64_t request_id;
+
+ AsyncRequestId() : request_id() {} // request_id value-initialized to 0
+ AsyncRequestId(const ClientId &client_id_, uint64_t request_id_)
+ : client_id(client_id_), request_id(request_id_) {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ inline bool operator<(const AsyncRequestId &rhs) const { // lexicographic: client_id first, request_id breaks ties
+ if (client_id != rhs.client_id) {
+ return client_id < rhs.client_id;
+ } else {
+ return request_id < rhs.request_id;
+ }
+ }
+ inline bool operator!=(const AsyncRequestId &rhs) const { // differs if either component differs; no operator== is declared here
+ return (client_id != rhs.client_id || request_id != rhs.request_id);
+ }
+};
+
+enum NotifyOp : int32_t { // fixed underlying type: casts of out-of-range values (e.g. the -1 used by UnknownPayload) stay well-defined
+ NOTIFY_OP_ACQUIRED_LOCK = 0, // numeric values are the on-wire op codes; never renumber
+ NOTIFY_OP_RELEASED_LOCK = 1,
+ NOTIFY_OP_REQUEST_LOCK = 2,
+ NOTIFY_OP_HEADER_UPDATE = 3,
+ NOTIFY_OP_ASYNC_PROGRESS = 4,
+ NOTIFY_OP_ASYNC_COMPLETE = 5,
+ NOTIFY_OP_FLATTEN = 6,
+ NOTIFY_OP_RESIZE = 7,
+ NOTIFY_OP_SNAP_CREATE = 8,
+ NOTIFY_OP_SNAP_REMOVE = 9,
+ NOTIFY_OP_REBUILD_OBJECT_MAP = 10, // note: declared between SNAP_REMOVE and SNAP_RENAME
+ NOTIFY_OP_SNAP_RENAME = 11,
+ NOTIFY_OP_SNAP_PROTECT = 12,
+ NOTIFY_OP_SNAP_UNPROTECT = 13,
+ NOTIFY_OP_RENAME = 14,
+ NOTIFY_OP_UPDATE_FEATURES = 15,
+ NOTIFY_OP_MIGRATE = 16,
+ NOTIFY_OP_SPARSIFY = 17,
+};
+
+struct AcquiredLockPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_ACQUIRED_LOCK;
+ static const bool CHECK_FOR_REFRESH = false;
+
+ ClientId client_id;
+
+ AcquiredLockPayload() {}
+ AcquiredLockPayload(const ClientId &client_id_) : client_id(client_id_) {}
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct ReleasedLockPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_RELEASED_LOCK;
+ static const bool CHECK_FOR_REFRESH = false;
+
+ ClientId client_id;
+
+ ReleasedLockPayload() {}
+ ReleasedLockPayload(const ClientId &client_id_) : client_id(client_id_) {}
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct RequestLockPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_REQUEST_LOCK;
+ static const bool CHECK_FOR_REFRESH = false;
+
+ ClientId client_id;
+ bool force = false;
+
+ RequestLockPayload() {}
+ RequestLockPayload(const ClientId &client_id_, bool force_)
+ : client_id(client_id_), force(force_) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct HeaderUpdatePayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_HEADER_UPDATE;
+ static const bool CHECK_FOR_REFRESH = false;
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct AsyncRequestPayloadBase {
+public:
+ AsyncRequestId async_request_id;
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+
+protected:
+ AsyncRequestPayloadBase() {}
+ AsyncRequestPayloadBase(const AsyncRequestId &id) : async_request_id(id) {}
+};
+
+struct AsyncProgressPayload : public AsyncRequestPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_ASYNC_PROGRESS;
+ static const bool CHECK_FOR_REFRESH = false;
+
+ AsyncProgressPayload() : offset(0), total(0) {}
+ AsyncProgressPayload(const AsyncRequestId &id, uint64_t offset_, uint64_t total_)
+ : AsyncRequestPayloadBase(id), offset(offset_), total(total_) {}
+
+ uint64_t offset;
+ uint64_t total;
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct AsyncCompletePayload : public AsyncRequestPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_ASYNC_COMPLETE;
+ static const bool CHECK_FOR_REFRESH = false;
+
+ AsyncCompletePayload() : result(0) {}
+ AsyncCompletePayload(const AsyncRequestId &id, int r)
+ : AsyncRequestPayloadBase(id), result(r) {}
+
+ int result;
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct FlattenPayload : public AsyncRequestPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_FLATTEN;
+ static const bool CHECK_FOR_REFRESH = true;
+
+ FlattenPayload() {}
+ FlattenPayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+};
+
+struct ResizePayload : public AsyncRequestPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_RESIZE;
+ static const bool CHECK_FOR_REFRESH = true;
+
+ ResizePayload() : size(0), allow_shrink(true) {}
+ ResizePayload(uint64_t size_, bool allow_shrink_, const AsyncRequestId &id)
+ : AsyncRequestPayloadBase(id), size(size_), allow_shrink(allow_shrink_) {}
+
+ uint64_t size;
+ bool allow_shrink;
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct SnapPayloadBase {
+public:
+ static const bool CHECK_FOR_REFRESH = true;
+
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string snap_name;
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+
+protected:
+ SnapPayloadBase() {}
+ SnapPayloadBase(const cls::rbd::SnapshotNamespace& _snap_namespace,
+ const std::string &name)
+ : snap_namespace(_snap_namespace), snap_name(name) {}
+};
+
+struct SnapCreatePayload : public SnapPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_CREATE;
+
+ SnapCreatePayload() {}
+ SnapCreatePayload(const cls::rbd::SnapshotNamespace &_snap_namespace,
+ const std::string &name)
+ : SnapPayloadBase(_snap_namespace, name) {}
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct SnapRenamePayload : public SnapPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_RENAME;
+
+ SnapRenamePayload() {}
+ SnapRenamePayload(const uint64_t &src_snap_id,
+ const std::string &dst_name)
+ : SnapPayloadBase(cls::rbd::UserSnapshotNamespace(), dst_name), snap_id(src_snap_id) {}
+
+ uint64_t snap_id = 0;
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct SnapRemovePayload : public SnapPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_REMOVE;
+
+ SnapRemovePayload() {}
+ SnapRemovePayload(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &name)
+ : SnapPayloadBase(snap_namespace, name) {}
+};
+
+struct SnapProtectPayload : public SnapPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_PROTECT;
+
+ SnapProtectPayload() {}
+ SnapProtectPayload(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &name)
+ : SnapPayloadBase(snap_namespace, name) {}
+};
+
+struct SnapUnprotectPayload : public SnapPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_UNPROTECT;
+
+ SnapUnprotectPayload() {}
+ SnapUnprotectPayload(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &name)
+ : SnapPayloadBase(snap_namespace, name) {}
+};
+
+struct RebuildObjectMapPayload : public AsyncRequestPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_REBUILD_OBJECT_MAP;
+ static const bool CHECK_FOR_REFRESH = true;
+
+ RebuildObjectMapPayload() {}
+ RebuildObjectMapPayload(const AsyncRequestId &id)
+ : AsyncRequestPayloadBase(id) {}
+};
+
+struct RenamePayload { // notification that the image is being renamed to image_name
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_RENAME;
+ static const bool CHECK_FOR_REFRESH = true;
+
+ RenamePayload() {}
+ RenamePayload(const std::string &_image_name) : image_name(_image_name) {} // by const reference: one copy into the member instead of two
+
+ std::string image_name; // name carried by the notification
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct UpdateFeaturesPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_UPDATE_FEATURES;
+ static const bool CHECK_FOR_REFRESH = true;
+
+ UpdateFeaturesPayload() : features(0), enabled(false) {}
+ UpdateFeaturesPayload(uint64_t features_, bool enabled_)
+ : features(features_), enabled(enabled_) {}
+
+ uint64_t features;
+ bool enabled;
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct MigratePayload : public AsyncRequestPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_MIGRATE;
+ static const bool CHECK_FOR_REFRESH = true;
+
+ MigratePayload() {}
+ MigratePayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+};
+
+struct SparsifyPayload : public AsyncRequestPayloadBase { // async request to sparsify the image using the given sparse size
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SPARSIFY;
+ static const bool CHECK_FOR_REFRESH = true;
+
+ SparsifyPayload() {}
+ SparsifyPayload(const AsyncRequestId &id, uint64_t sparse_size) // size_t arguments still convert implicitly
+ : AsyncRequestPayloadBase(id), sparse_size(sparse_size) {}
+
+ uint64_t sparse_size = 0; // fixed 64-bit width: this field is encoded on the wire, so it must not vary with the platform's size_t
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownPayload { // placeholder alternative used when a message's op code is not recognized
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1); // sentinel that matches none of the declared NotifyOp enumerators
+ static const bool CHECK_FOR_REFRESH = false;
+
+ void encode(bufferlist &bl) const; // the definition in WatchNotifyTypes.cc calls ceph_abort()
+ void decode(__u8 version, bufferlist::const_iterator &iter); // defined with an empty body: consumes nothing
+ void dump(Formatter *f) const; // defined with an empty body
+};
+
+typedef boost::variant<AcquiredLockPayload,
+ ReleasedLockPayload,
+ RequestLockPayload,
+ HeaderUpdatePayload,
+ AsyncProgressPayload,
+ AsyncCompletePayload,
+ FlattenPayload,
+ ResizePayload,
+ SnapCreatePayload,
+ SnapRemovePayload,
+ SnapRenamePayload,
+ SnapProtectPayload,
+ SnapUnprotectPayload,
+ RebuildObjectMapPayload,
+ RenamePayload,
+ UpdateFeaturesPayload,
+ MigratePayload,
+ SparsifyPayload,
+ UnknownPayload> Payload;
+
+struct NotifyMessage {
+ NotifyMessage() : payload(UnknownPayload()) {}
+ NotifyMessage(const Payload &payload_) : payload(payload_) {}
+
+ Payload payload;
+
+ bool check_for_refresh() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+ NotifyOp get_notify_op() const;
+
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+struct ResponseMessage {
+ ResponseMessage() : result(0) {}
+ ResponseMessage(int result_) : result(result_) {}
+
+ int result;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<ResponseMessage *> &o);
+};
+
+std::ostream &operator<<(std::ostream &out,
+ const NotifyOp &op);
+std::ostream &operator<<(std::ostream &out,
+ const AsyncRequestId &request);
+
+WRITE_CLASS_ENCODER(AsyncRequestId);
+WRITE_CLASS_ENCODER(NotifyMessage);
+WRITE_CLASS_ENCODER(ResponseMessage);
+
+} // namespace watch_notify
+} // namespace librbd
+
+
+#endif // LIBRBD_WATCH_NOTIFY_TYPES_H
diff --git a/src/librbd/Watcher.cc b/src/librbd/Watcher.cc
new file mode 100644
index 00000000..9f866f25
--- /dev/null
+++ b/src/librbd/Watcher.cc
@@ -0,0 +1,367 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/Watcher.h"
+#include "librbd/watcher/RewatchRequest.h"
+#include "librbd/Utils.h"
+#include "librbd/TaskFinisher.h"
+#include "include/encoding.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include <boost/bind.hpp>
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+
+using namespace watcher;
+
+using util::create_context_callback;
+using util::create_rados_callback;
+using std::string;
+
+namespace {
+
+struct C_UnwatchAndFlush : public Context { // two-phase completion: receives the unwatch result, then issues and awaits a watch flush
+ librados::Rados rados; // constructed from the IoCtx; used to issue aio_watch_flush
+ Context *on_finish; // completed exactly once, after the flush phase
+ bool flushing = false; // true once aio_watch_flush has been issued
+ int ret_val = 0; // first negative result seen across both phases
+
+ C_UnwatchAndFlush(librados::IoCtx &io_ctx, Context *on_finish)
+ : rados(io_ctx), on_finish(on_finish) {
+ }
+
+ void complete(int r) override { // overridden (instead of finish) so the object controls its own deletion
+ if (ret_val == 0 && r < 0) { // remember only the first error
+ ret_val = r;
+ }
+
+ if (!flushing) { // first pass: start the flush and wait to be completed again
+ flushing = true;
+
+ librados::AioCompletion *aio_comp = create_rados_callback(this); // presumably calls this->complete() again when the flush finishes -- confirm in librbd/Utils.h
+ r = rados.aio_watch_flush(aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+ return;
+ }
+
+ // ensure our reference to the RadosClient is released prior
+ // to completing the callback to avoid racing an explicit
+ // librados shutdown
+ Context *ctx = on_finish; // copy what is needed out of *this before deleting it
+ r = ret_val;
+ delete this;
+
+ ctx->complete(r);
+ }
+
+ void finish(int r) override { // unused: all work happens in complete()
+ }
+};
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Watcher::C_NotifyAck " << this << " " \
+ << __func__ << ": "
+
+Watcher::C_NotifyAck::C_NotifyAck(Watcher *watcher, uint64_t notify_id,
+ uint64_t handle)
+ : watcher(watcher), cct(watcher->m_cct), notify_id(notify_id),
+ handle(handle) {
+ ldout(cct, 10) << "id=" << notify_id << ", " << "handle=" << handle << dendl;
+}
+
+void Watcher::C_NotifyAck::finish(int r) {
+ ldout(cct, 10) << "r=" << r << dendl;
+ ceph_assert(r == 0);
+ watcher->acknowledge_notify(notify_id, handle, out);
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Watcher: " << this << " " << __func__ \
+ << ": "
+
+Watcher::Watcher(librados::IoCtx& ioctx, ContextWQ *work_queue,
+ const string& oid)
+ : m_ioctx(ioctx), m_work_queue(work_queue), m_oid(oid),
+ m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
+ m_watch_lock(util::unique_lock_name("librbd::Watcher::m_watch_lock", this)),
+ m_watch_handle(0), m_notifier(work_queue, ioctx, oid),
+ m_watch_state(WATCH_STATE_IDLE), m_watch_ctx(*this) {
+}
+
+Watcher::~Watcher() {
+ RWLock::RLocker l(m_watch_lock);
+ ceph_assert(is_unregistered(m_watch_lock));
+}
+
+void Watcher::register_watch(Context *on_finish) { // start an async watch on m_oid; must currently be unregistered
+ ldout(m_cct, 10) << dendl;
+
+ RWLock::WLocker watch_locker(m_watch_lock);
+ ceph_assert(is_unregistered(m_watch_lock));
+ m_watch_state = WATCH_STATE_REGISTERING; // stays REGISTERING until handle_register_watch() resets it
+ m_watch_blacklisted = false;
+
+ librados::AioCompletion *aio_comp = create_rados_callback(
+ new C_RegisterWatch(this, on_finish)); // presumably routes the result to handle_register_watch() -- confirm in Watcher.h
+ int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_watch_handle, &m_watch_ctx);
+ ceph_assert(r == 0);
+ aio_comp->release(); // drop our reference to the completion now that the request is submitted
+}
+
+void Watcher::handle_register_watch(int r, Context *on_finish) { // finish registration, then run any deferred unregister or rewatch
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ { // state is updated under the lock; callbacks are invoked after it is released
+ RWLock::WLocker watch_locker(m_watch_lock);
+ ceph_assert(m_watch_state == WATCH_STATE_REGISTERING);
+
+ m_watch_state = WATCH_STATE_IDLE;
+ if (r < 0) {
+ lderr(m_cct) << "failed to register watch: " << cpp_strerror(r)
+ << dendl;
+ m_watch_handle = 0; // a zero handle marks the watch as not registered
+ }
+
+ if (m_unregister_watch_ctx != nullptr) { // unregister_watch() was called while registering: take its deferred context
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == 0 && m_watch_error) { // handle_error() fired during registration: go straight to rewatch
+ lderr(m_cct) << "re-registering watch after error" << dendl;
+ m_watch_state = WATCH_STATE_REWATCHING;
+ watch_error = true;
+ } else {
+ m_watch_blacklisted = (r == -EBLACKLISTED);
+ }
+ }
+
+ on_finish->complete(r); // caller is notified before any follow-up action
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
+ }
+}
+
+void Watcher::unregister_watch(Context *on_finish) { // remove the watch, deferring if a register/rewatch is still in progress
+ ldout(m_cct, 10) << dendl;
+
+ {
+ RWLock::WLocker watch_locker(m_watch_lock);
+ if (m_watch_state != WATCH_STATE_IDLE) { // register or rewatch in flight: retry once it hands the context back
+ ldout(m_cct, 10) << "delaying unregister until register completed"
+ << dendl;
+
+ ceph_assert(m_unregister_watch_ctx == nullptr); // only one deferred unregister may be pending
+ m_unregister_watch_ctx = new FunctionContext([this, on_finish](int r) {
+ unregister_watch(on_finish); // re-enter from the top when the deferred context is completed
+ });
+ return;
+ } else if (is_registered(m_watch_lock)) { // idle and registered: issue the unwatch followed by a flush
+ librados::AioCompletion *aio_comp = create_rados_callback(
+ new C_UnwatchAndFlush(m_ioctx, on_finish));
+ int r = m_ioctx.aio_unwatch(m_watch_handle, aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+
+ m_watch_handle = 0; // cleared immediately, before the unwatch completes
+ m_watch_blacklisted = false;
+ return;
+ }
+ }
+
+ on_finish->complete(0); // idle and not registered: nothing to do
+}
+
+bool Watcher::notifications_blocked() const {
+ RWLock::RLocker locker(m_watch_lock);
+
+ bool blocked = (m_blocked_count > 0);
+ ldout(m_cct, 5) << "blocked=" << blocked << dendl;
+ return blocked;
+}
+
+void Watcher::block_notifies(Context *on_finish) {
+ {
+ RWLock::WLocker locker(m_watch_lock);
+ ++m_blocked_count;
+ ldout(m_cct, 5) << "blocked_count=" << m_blocked_count << dendl;
+ }
+ m_async_op_tracker.wait_for_ops(on_finish);
+}
+
+void Watcher::unblock_notifies() {
+ RWLock::WLocker locker(m_watch_lock);
+ ceph_assert(m_blocked_count > 0);
+ --m_blocked_count;
+ ldout(m_cct, 5) << "blocked_count=" << m_blocked_count << dendl;
+}
+
+void Watcher::flush(Context *on_finish) {
+ m_notifier.flush(on_finish);
+}
+
+std::string Watcher::get_oid() const {
+ RWLock::RLocker locker(m_watch_lock);
+ return m_oid;
+}
+
+void Watcher::set_oid(const string& oid) {
+ RWLock::WLocker watch_locker(m_watch_lock);
+ ceph_assert(is_unregistered(m_watch_lock));
+
+ m_oid = oid;
+}
+
+void Watcher::handle_error(uint64_t handle, int err) { // watch error callback: flag the error and schedule a rewatch if registered
+ lderr(m_cct) << "handle=" << handle << ": " << cpp_strerror(err) << dendl;
+
+ RWLock::WLocker watch_locker(m_watch_lock);
+ m_watch_error = true; // set unconditionally; in-flight register/rewatch handlers check this flag
+
+ if (is_registered(m_watch_lock)) { // only queue a rewatch when is_registered() holds; otherwise just leave the flag set
+ m_watch_state = WATCH_STATE_REWATCHING;
+ if (err == -EBLACKLISTED) {
+ m_watch_blacklisted = true;
+ }
+
+ FunctionContext *ctx = new FunctionContext(
+ boost::bind(&Watcher::rewatch, this));
+ m_work_queue->queue(ctx); // rewatch() runs later on the work queue, not in this callback
+ }
+}
+
+void Watcher::acknowledge_notify(uint64_t notify_id, uint64_t handle,
+ bufferlist &out) {
+ m_ioctx.notify_ack(m_oid, notify_id, handle, out);
+}
+
+void Watcher::rewatch() { // issue a RewatchRequest, unless an unregister is waiting to run
+ ldout(m_cct, 10) << dendl;
+
+ Context *unregister_watch_ctx = nullptr;
+ {
+ RWLock::WLocker watch_locker(m_watch_lock);
+ ceph_assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) { // deferred unregister wins: go idle and hand it back
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else {
+ m_watch_error = false; // cleared before sending so a new error during the rewatch is detectable
+ auto ctx = create_context_callback<
+ Watcher, &Watcher::handle_rewatch>(this);
+ auto req = RewatchRequest::create(m_ioctx, m_oid, m_watch_lock,
+ &m_watch_ctx, &m_watch_handle, ctx);
+ req->send(); // sent while m_watch_lock is still held
+ return;
+ }
+ }
+
+ unregister_watch_ctx->complete(0); // only reached on the deferred-unregister path, after the lock is released
+}
+
+void Watcher::handle_rewatch(int r) { // classify the rewatch result, then retry, unregister, or queue the completion callback
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ RWLock::WLocker watch_locker(m_watch_lock);
+ ceph_assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+ m_watch_blacklisted = false; // reset first; set again below only for -EBLACKLISTED
+ if (m_unregister_watch_ctx != nullptr) {
+ ldout(m_cct, 10) << "image is closing, skip rewatch" << dendl;
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLACKLISTED) { // no retry; state stays REWATCHING until the queued callback runs
+ lderr(m_cct) << "client blacklisted" << dendl;
+ m_watch_blacklisted = true;
+ } else if (r == -ENOENT) { // no retry either
+ ldout(m_cct, 5) << "object does not exist" << dendl;
+ } else if (r < 0) { // any other failure: retry
+ lderr(m_cct) << "failed to rewatch: " << cpp_strerror(r) << dendl;
+ watch_error = true;
+ } else if (m_watch_error) { // succeeded, but handle_error() fired meanwhile: retry
+ lderr(m_cct) << "re-registering watch after error" << dendl;
+ watch_error = true;
+ }
+ }
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ return;
+ } else if (watch_error) {
+ rewatch();
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ Watcher, &Watcher::handle_rewatch_callback>(this);
+ m_work_queue->queue(ctx, r); // handle_rewatch_callback(r) runs on the work queue and moves the state out of REWATCHING
+}
+
+void Watcher::handle_rewatch_callback(int r) { // report the rewatch result, then settle the final state
+ ldout(m_cct, 10) << "r=" << r << dendl;
+ handle_rewatch_complete(r); // called first, without m_watch_lock held
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ RWLock::WLocker watch_locker(m_watch_lock);
+ ceph_assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) { // unregister requested meanwhile: go idle and run it
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLACKLISTED || r == -ENOENT) { // these results go idle without retrying, even if m_watch_error is set
+ m_watch_state = WATCH_STATE_IDLE;
+ } else if (r < 0 || m_watch_error) { // failure, or a new error arrived: stay REWATCHING and retry
+ watch_error = true;
+ } else {
+ m_watch_state = WATCH_STATE_IDLE;
+ }
+ }
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
+ }
+}
+
+void Watcher::send_notify(bufferlist& payload,
+ watcher::NotifyResponse *response,
+ Context *on_finish) {
+ m_notifier.notify(payload, response, on_finish);
+}
+
+void Watcher::WatchCtx::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) {
+ // track the notification as an in-flight op (block_notifies() waits on the tracker);
+ // if notifications are blocked, ack with an empty payload w/o bubbling it up
+ watcher.m_async_op_tracker.start_op();
+ if (watcher.notifications_blocked()) {
+ bufferlist empty_ack; // named distinctly from the 'bl' parameter to avoid shadowing it
+ watcher.acknowledge_notify(notify_id, handle, empty_ack);
+ } else {
+ watcher.handle_notify(notify_id, handle, notifier_id, bl); // forward to the derived class's handler
+ }
+ watcher.m_async_op_tracker.finish_op();
+}
+
+void Watcher::WatchCtx::handle_error(uint64_t handle, int err) {
+ watcher.handle_error(handle, err);
+}
+
+} // namespace librbd
diff --git a/src/librbd/Watcher.h b/src/librbd/Watcher.h
new file mode 100644
index 00000000..69e87ad3
--- /dev/null
+++ b/src/librbd/Watcher.h
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_H
+#define CEPH_LIBRBD_WATCHER_H
+
+#include "common/AsyncOpTracker.h"
+#include "common/Mutex.h"
+#include "common/RWLock.h"
+#include "include/rados/librados.hpp"
+#include "librbd/watcher/Notifier.h"
+#include "librbd/watcher/Types.h"
+#include <string>
+#include <utility>
+
+class ContextWQ;
+
+namespace librbd {
+
+namespace watcher { struct NotifyResponse; }
+
+// Base class that owns the state of a watch on a single object (m_oid) and
+// dispatches notifications and watch errors to a derived class.
+class Watcher {
+public:
+  // Context carrying the ids of a received notification plus an output
+  // payload. finish() is defined out of line; presumably it acknowledges the
+  // notification -- confirm against Watcher.cc.
+  struct C_NotifyAck : public Context {
+    Watcher *watcher;
+    CephContext *cct;
+    uint64_t notify_id;
+    uint64_t handle;
+    bufferlist out;
+
+    C_NotifyAck(Watcher *watcher, uint64_t notify_id, uint64_t handle);
+    void finish(int r) override;
+  };
+
+  Watcher(librados::IoCtx& ioctx, ContextWQ *work_queue,
+          const std::string& oid);
+  virtual ~Watcher();
+
+  void register_watch(Context *on_finish);
+  virtual void unregister_watch(Context *on_finish);
+  void flush(Context *on_finish);
+
+  bool notifications_blocked() const;
+  virtual void block_notifies(Context *on_finish);
+  void unblock_notifies();
+
+  std::string get_oid() const;
+  // std:: qualified so the header does not depend on a using-directive
+  // leaking in from another include
+  void set_oid(const std::string& oid);
+
+  uint64_t get_watch_handle() const {
+    RWLock::RLocker watch_locker(m_watch_lock);
+    return m_watch_handle;
+  }
+
+  // The three predicates below take m_watch_lock for read themselves.
+  bool is_registered() const {
+    RWLock::RLocker locker(m_watch_lock);
+    return is_registered(m_watch_lock);
+  }
+  bool is_unregistered() const {
+    RWLock::RLocker locker(m_watch_lock);
+    return is_unregistered(m_watch_lock);
+  }
+  bool is_blacklisted() const {
+    RWLock::RLocker locker(m_watch_lock);
+    return m_watch_blacklisted;
+  }
+
+protected:
+  enum WatchState {
+    WATCH_STATE_IDLE,
+    WATCH_STATE_REGISTERING,
+    WATCH_STATE_REWATCHING
+  };
+
+  librados::IoCtx& m_ioctx;
+  ContextWQ *m_work_queue;
+  std::string m_oid;
+  CephContext *m_cct;
+  mutable RWLock m_watch_lock;
+  uint64_t m_watch_handle;
+  watcher::Notifier m_notifier;
+
+  WatchState m_watch_state;
+  bool m_watch_blacklisted = false;
+
+  AsyncOpTracker m_async_op_tracker;
+
+  // Variants for callers that already hold m_watch_lock; the lock parameter
+  // is unused and only serves as a reminder at the call site.
+  // registered == idle with a non-zero watch handle
+  bool is_registered(const RWLock&) const {
+    return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle != 0);
+  }
+  // unregistered == idle with a zero watch handle
+  bool is_unregistered(const RWLock&) const {
+    return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle == 0);
+  }
+
+  void send_notify(bufferlist &payload,
+                   watcher::NotifyResponse *response = nullptr,
+                   Context *on_finish = nullptr);
+
+  // Invoked for each received notification unless notifications are blocked.
+  virtual void handle_notify(uint64_t notify_id, uint64_t handle,
+                             uint64_t notifier_id, bufferlist &bl) = 0;
+
+  virtual void handle_error(uint64_t cookie, int err);
+
+  void acknowledge_notify(uint64_t notify_id, uint64_t handle,
+                          bufferlist &out);
+
+  // Hook invoked with the result of a rewatch attempt before the watcher
+  // decides whether to retry; the default implementation does nothing.
+  virtual void handle_rewatch_complete(int r) { }
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * UNREGISTERED
+   *    |
+   *    | (register_watch)
+   *    |
+   * REGISTERING
+   *    |
+   *    v      (watch error)
+   * REGISTERED * * * * * * * > ERROR
+   *    |   ^                     |
+   *    |   |                     | (rewatch)
+   *    |   |                     v
+   *    |   |                   REWATCHING
+   *    |   |                     |
+   *    |   |                     |
+   *    |   \---------------------/
+   *    |
+   *    | (unregister_watch)
+   *    |
+   *    v
+   * UNREGISTERED
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  // Adapter that forwards librados watch callbacks to the owning Watcher.
+  struct WatchCtx : public librados::WatchCtx2 {
+    Watcher &watcher;
+
+    WatchCtx(Watcher &parent) : watcher(parent) {}
+
+    void handle_notify(uint64_t notify_id,
+                       uint64_t handle,
+                       uint64_t notifier_id,
+                       bufferlist& bl) override;
+    void handle_error(uint64_t handle, int err) override;
+  };
+
+  // Completion that routes the result to handle_register_watch() together
+  // with the caller's on_finish context.
+  struct C_RegisterWatch : public Context {
+    Watcher *watcher;
+    Context *on_finish;
+
+    C_RegisterWatch(Watcher *watcher, Context *on_finish)
+       : watcher(watcher), on_finish(on_finish) {
+    }
+    void finish(int r) override {
+      watcher->handle_register_watch(r, on_finish);
+    }
+  };
+
+  WatchCtx m_watch_ctx;
+  // unregister context stored while a (re)watch is in flight; completed once
+  // that operation finishes
+  Context *m_unregister_watch_ctx = nullptr;
+
+  bool m_watch_error = false;
+
+  uint32_t m_blocked_count = 0;
+
+  void handle_register_watch(int r, Context *on_finish);
+
+  void rewatch();
+  void handle_rewatch(int r);
+  void handle_rewatch_callback(int r);
+
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_H
diff --git a/src/librbd/api/Config.cc b/src/librbd/api/Config.cc
new file mode 100644
index 00000000..08be9d2c
--- /dev/null
+++ b/src/librbd/api/Config.cc
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Config.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/api/PoolMetadata.h"
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Config: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+const uint32_t MAX_KEYS = 64;
+
+typedef std::map<std::string, std::pair<std::string, config_source_t>> Parent;
+
+// "rbd_*" options that are never exposed as overridable (checked for both the
+// pool-level and the image-level option sets). Read-only lookup table, hence
+// const.
+static const std::set<std::string> EXCLUDE_OPTIONS {
+    "rbd_auto_exclusive_lock_until_manual_request",
+    "rbd_default_format",
+    "rbd_default_map_options",
+    "rbd_default_pool",
+    "rbd_discard_on_zeroed_write_same",
+    "rbd_op_thread_timeout",
+    "rbd_op_threads",
+    "rbd_tracing",
+    "rbd_validate_names",
+    "rbd_validate_pool",
+    "rbd_mirror_pool_replayers_refresh_interval",
+    "rbd_config_pool_override_update_timestamp"
+  };
+// Additional "rbd_*" options that are skipped only when building the
+// image-level option set. Read-only lookup table, hence const.
+static const std::set<std::string> EXCLUDE_IMAGE_OPTIONS {
+    "rbd_default_clone_format",
+    "rbd_default_data_pool",
+    "rbd_default_features",
+    "rbd_default_format",
+    "rbd_default_order",
+    "rbd_default_stripe_count",
+    "rbd_default_stripe_unit",
+    "rbd_journal_order",
+    "rbd_journal_pool",
+    "rbd_journal_splay_width"
+  };
+
+// Map of overridable "rbd_*" option name -> (value, source).
+//
+// The constructor only collects the option names from the config schema;
+// init() fills in the values from the local configuration and then applies
+// any pool-level metadata overrides on top.
+struct Options : Parent {
+  librados::IoCtx m_io_ctx;
+
+  // @param io_ctx                    pool to read overrides from (duplicated,
+  //                                  with the namespace reset to "")
+  // @param image_apply_only_options  when true, additionally drop the options
+  //                                  in EXCLUDE_IMAGE_OPTIONS and every
+  //                                  "rbd_mirror_*" option
+  Options(librados::IoCtx& io_ctx, bool image_apply_only_options) {
+    m_io_ctx.dup(io_ctx);
+    m_io_ctx.set_namespace("");
+
+    CephContext *cct = reinterpret_cast<CephContext *>(m_io_ctx.cct());
+
+    const std::string rbd_key_prefix("rbd_");
+    const std::string rbd_mirror_key_prefix("rbd_mirror_");
+    auto& schema = cct->_conf.get_schema();
+    for (auto& pair : schema) {
+      if (!boost::starts_with(pair.first, rbd_key_prefix)) {
+        continue;
+      } else if (EXCLUDE_OPTIONS.count(pair.first) != 0) {
+        continue;
+      } else if (image_apply_only_options &&
+                 EXCLUDE_IMAGE_OPTIONS.count(pair.first) != 0) {
+        continue;
+      } else if (image_apply_only_options &&
+                 boost::starts_with(pair.first, rbd_mirror_key_prefix)) {
+        continue;
+      }
+
+      insert({pair.first, {}});
+    }
+  }
+
+  // Populate every entry with its current config value, then overlay the
+  // pool metadata overrides.
+  //
+  // @return 0 on success, or the negative error returned while listing the
+  //         pool metadata
+  int init() {
+    CephContext *cct = reinterpret_cast<CephContext *>(m_io_ctx.cct());
+
+    for (auto &it : *this) {
+      int r = cct->_conf.get_val(it.first.c_str(), &it.second.first);
+      ceph_assert(r == 0);
+      it.second.second = RBD_CONFIG_SOURCE_CONFIG;
+    }
+
+    std::string last_key = ImageCtx::METADATA_CONF_PREFIX;
+    bool more_results = true;
+
+    // page through the metadata in chunks of MAX_KEYS, resuming after the
+    // last key of the previous chunk
+    while (more_results) {
+      std::map<std::string, bufferlist> pairs;
+
+      int r = librbd::api::PoolMetadata<>::list(m_io_ctx, last_key, MAX_KEYS,
+                                                &pairs);
+      if (r < 0) {
+        return r;
+      }
+
+      if (pairs.empty()) {
+        break;
+      }
+
+      // a short page means there is nothing left to fetch
+      more_results = (pairs.size() == MAX_KEYS);
+      last_key = pairs.rbegin()->first;
+
+      // iterate by reference: copying each entry would duplicate both the
+      // key string and the bufferlist
+      for (auto& kv : pairs) {
+        std::string key;
+        if (!util::is_metadata_config_override(kv.first, &key)) {
+          // first key that is not a config override: stop paging
+          more_results = false;
+          break;
+        }
+        auto it = find(key);
+        if (it != end()) {
+          it->second = {{kv.second.c_str(), kv.second.length()},
+                        RBD_CONFIG_SOURCE_POOL};
+        }
+      }
+    }
+    return 0;
+  }
+};
+
+} // anonymous namespace
+
+// Returns true when `name` is one of the overridable pool-level options.
+template <typename I>
+bool Config<I>::is_option_name(librados::IoCtx& io_ctx,
+                               const std::string &name) {
+  const Options pool_opts(io_ctx, false);
+  return pool_opts.count(name) != 0;
+}
+
+// Appends every overridable pool-level option (name, value, source) to
+// `options`. Returns 0 on success or the negative error from Options::init().
+template <typename I>
+int Config<I>::list(librados::IoCtx& io_ctx,
+                    std::vector<config_option_t> *options) {
+  Options pool_opts(io_ctx, false);
+
+  const int r = pool_opts.init();
+  if (r < 0) {
+    return r;
+  }
+
+  for (auto it = pool_opts.begin(); it != pool_opts.end(); ++it) {
+    const auto &value_and_source = it->second;
+    options->push_back({it->first, value_and_source.first,
+                        value_and_source.second});
+  }
+  return 0;
+}
+
+// Returns true when `name` is one of the overridable image-level options.
+template <typename I>
+bool Config<I>::is_option_name(I *image_ctx, const std::string &name) {
+  const Options image_opts(image_ctx->md_ctx, true);
+  return image_opts.count(name) != 0;
+}
+
+// Appends every overridable image-level option (name, value, source) to
+// `options`. Values start from the config / pool overrides computed by
+// Options::init() and are then overlaid with the overrides stored in the
+// image header metadata.
+//
+// @return 0 on success, negative error code if the pool or image metadata
+//         cannot be read
+template <typename I>
+int Config<I>::list(I *image_ctx, std::vector<config_option_t> *options) {
+  CephContext *cct = image_ctx->cct;
+  Options opts(image_ctx->md_ctx, true);
+
+  int r = opts.init();
+  if (r < 0) {
+    return r;
+  }
+
+  std::string last_key = ImageCtx::METADATA_CONF_PREFIX;
+  bool more_results = true;
+
+  // page through the image metadata in chunks of MAX_KEYS, resuming after the
+  // last key of the previous chunk
+  while (more_results) {
+    std::map<std::string, bufferlist> pairs;
+
+    r = cls_client::metadata_list(&image_ctx->md_ctx, image_ctx->header_oid,
+                                  last_key, MAX_KEYS, &pairs);
+    if (r < 0) {
+      lderr(cct) << "failed reading image metadata: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    if (pairs.empty()) {
+      break;
+    }
+
+    // a short page means there is nothing left to fetch
+    more_results = (pairs.size() == MAX_KEYS);
+    last_key = pairs.rbegin()->first;
+
+    // iterate by reference: copying each entry would duplicate both the key
+    // string and the bufferlist
+    for (auto& kv : pairs) {
+      std::string key;
+      if (!util::is_metadata_config_override(kv.first, &key)) {
+        // first key that is not a config override: stop paging
+        more_results = false;
+        break;
+      }
+      auto it = opts.find(key);
+      if (it != opts.end()) {
+        it->second = {{kv.second.c_str(), kv.second.length()},
+                      RBD_CONFIG_SOURCE_IMAGE};
+      }
+    }
+  }
+
+  for (auto &it : opts) {
+    options->push_back({it.first, it.second.first, it.second.second});
+  }
+
+  return 0;
+}
+
+// Applies every pool-sourced option override to `config`. Failures are logged
+// and otherwise ignored: a failed read aborts the whole operation, a failed
+// set_val only skips that option.
+template <typename I>
+void Config<I>::apply_pool_overrides(librados::IoCtx& io_ctx,
+                                     ConfigProxy* config) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+  Options opts(io_ctx, false);
+  int r = opts.init();
+  if (r < 0) {
+    lderr(cct) << "failed to read pool config overrides: " << cpp_strerror(r)
+               << dendl;
+    return;
+  }
+
+  for (auto& entry : opts) {
+    const std::string& name = entry.first;
+    const std::string& value = entry.second.first;
+    if (entry.second.second != RBD_CONFIG_SOURCE_POOL) {
+      // value came from the local config, nothing to override
+      continue;
+    }
+
+    r = config->set_val(name, value);
+    if (r >= 0) {
+      continue;
+    }
+    lderr(cct) << "failed to override pool config " << name << "="
+               << value << ": " << cpp_strerror(r) << dendl;
+  }
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Config<librbd::ImageCtx>;
diff --git a/src/librbd/api/Config.h b/src/librbd/api/Config.h
new file mode 100644
index 00000000..daa718c9
--- /dev/null
+++ b/src/librbd/api/Config.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_CONFIG_H
+#define CEPH_LIBRBD_API_CONFIG_H
+
+#include "include/rbd/librbd.hpp"
+#include "include/rados/librados_fwd.hpp"
+
+struct ConfigProxy;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+// Static helpers for listing and applying "rbd_*" configuration overrides at
+// pool and image level.
+template <typename ImageCtxT = librbd::ImageCtx>
+class Config {
+public:
+  // --- pool-level ---
+  static bool is_option_name(librados::IoCtx& io_ctx, const std::string &name);
+  static int list(librados::IoCtx& io_ctx,
+                  std::vector<config_option_t> *options);
+  static void apply_pool_overrides(librados::IoCtx& io_ctx,
+                                   ConfigProxy* config);
+
+  // --- image-level ---
+  static bool is_option_name(ImageCtxT *image_ctx, const std::string &name);
+  static int list(ImageCtxT *image_ctx, std::vector<config_option_t> *options);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Config<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_CONFIG_H
diff --git a/src/librbd/api/DiffIterate.cc b/src/librbd/api/DiffIterate.cc
new file mode 100644
index 00000000..f96264e3
--- /dev/null
+++ b/src/librbd/api/DiffIterate.cc
@@ -0,0 +1,554 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/DiffIterate.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/internal.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "include/rados/librados.hpp"
+#include "include/interval_set.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "osdc/Striper.h"
+#include "librados/snap_set_diff.h"
+#include <boost/tuple/tuple.hpp>
+#include <list>
+#include <map>
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DiffIterate: "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+// Per-object result of the object-map based diff; the values are stored in a
+// BitVector<2>, so they must fit in two bits.
+enum ObjectDiffState {
+  OBJECT_DIFF_STATE_NONE    = 0,  // no change recorded for the object
+  OBJECT_DIFF_STATE_UPDATED = 1,  // reported to the callback with exists=true
+  OBJECT_DIFF_STATE_HOLE    = 2   // reported to the callback with exists=false
+};
+
+// State shared by all per-object diff requests of a single diff iteration.
+struct DiffContext {
+  DiffIterate<>::Callback callback;    // user callback invoked per extent
+  void *callback_arg;                  // opaque argument passed to callback
+  bool whole_object;                   // report whole objects, not sub-extents
+  uint64_t from_snap_id;
+  uint64_t end_snap_id;
+  interval_set<uint64_t> parent_diff;  // filled in by the caller when needed
+  OrderedThrottle throttle;            // bounds the in-flight object requests
+
+  // The throttle limit is read from the image's
+  // "rbd_concurrent_management_ops" setting.
+  template <typename I>
+  DiffContext(I &image_ctx, DiffIterate<>::Callback callback,
+              void *callback_arg, bool _whole_object, uint64_t _from_snap_id,
+              uint64_t _end_snap_id)
+    : callback(callback),
+      callback_arg(callback_arg),
+      whole_object(_whole_object),
+      from_snap_id(_from_snap_id),
+      end_snap_id(_end_snap_id),
+      throttle(
+        image_ctx.config.template get_val<uint64_t>(
+          "rbd_concurrent_management_ops"),
+        true) {
+  }
+};
+
+// Asynchronous per-object diff request: lists the snapshots of one object,
+// derives the changed extents between the two snapshots of the DiffContext and
+// reports them through the user callback.
+class C_DiffObject : public Context {
+public:
+  template <typename I>
+  C_DiffObject(I &image_ctx, librados::IoCtx &head_ctx,
+               DiffContext &diff_context, const std::string &oid,
+               uint64_t offset, const std::vector<ObjectExtent> &object_extents)
+    : m_cct(image_ctx.cct), m_head_ctx(head_ctx),
+      m_diff_context(diff_context), m_oid(oid), m_offset(offset),
+      m_object_extents(object_extents), m_snap_ret(0) {
+  }
+
+  // Registers this request with the throttle and issues the list_snaps read.
+  void send() {
+    C_OrderedThrottle *throttle_ctx = m_diff_context.throttle.start_op(this);
+    librados::AioCompletion *rados_completion =
+      util::create_rados_callback(throttle_ctx);
+
+    librados::ObjectReadOperation read_op;
+    read_op.list_snaps(&m_snap_set, &m_snap_ret);
+
+    int r = m_head_ctx.aio_operate(m_oid, rados_completion, &read_op, NULL);
+    ceph_assert(r == 0);
+    rados_completion->release();
+  }
+
+protected:
+  // (image offset, length, exists)
+  typedef boost::tuple<uint64_t, size_t, bool> Diff;
+  typedef std::list<Diff> Diffs;
+
+  // Turns the list_snaps result into diffs, feeds them to the callback and
+  // reports the final result to the throttle.
+  void finish(int r) override {
+    CephContext *cct = m_cct;
+    if (r == 0 && m_snap_ret < 0) {
+      // the operation succeeded but list_snaps itself reported an error
+      r = m_snap_ret;
+    }
+
+    Diffs diffs;
+    if (r == 0) {
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps complete" << dendl;
+      compute_diffs(&diffs);
+    } else if (r == -ENOENT) {
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps (not found)"
+                     << dendl;
+      // a missing object is not an error: only the parent may contribute
+      r = 0;
+      compute_parent_overlap(&diffs);
+    } else {
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps failed: "
+                     << cpp_strerror(r) << dendl;
+    }
+
+    if (r == 0) {
+      for (const Diff &entry : diffs) {
+        r = m_diff_context.callback(entry.get<0>(), entry.get<1>(),
+                                    entry.get<2>(),
+                                    m_diff_context.callback_arg);
+        if (r < 0) {
+          // the callback aborted the iteration
+          break;
+        }
+      }
+    }
+    m_diff_context.throttle.end_op(r);
+  }
+
+private:
+  CephContext *m_cct;
+  librados::IoCtx &m_head_ctx;
+  DiffContext &m_diff_context;
+  std::string m_oid;
+  uint64_t m_offset;
+  std::vector<ObjectExtent> m_object_extents;
+
+  librados::snap_set_t m_snap_set;
+  int m_snap_ret;
+
+  // Converts the snap set diff of this object into image-relative diffs.
+  void compute_diffs(Diffs *diffs) {
+    CephContext *cct = m_cct;
+
+    // calc diff from from_snap_id -> to_snap_id
+    interval_set<uint64_t> diff;
+    uint64_t end_size;
+    bool end_exists;
+    librados::snap_t clone_end_snap_id;
+    bool whole_object;
+    calc_snap_set_diff(cct, m_snap_set, m_diff_context.from_snap_id,
+                       m_diff_context.end_snap_id, &diff, &end_size,
+                       &end_exists, &clone_end_snap_id, &whole_object);
+    if (whole_object) {
+      ldout(cct, 1) << "object " << m_oid << ": need to provide full object"
+                    << dendl;
+    }
+    ldout(cct, 20) << "  diff " << diff << " end_exists=" << end_exists
+                   << dendl;
+
+    if (diff.empty() && !whole_object) {
+      // nothing changed in the object itself
+      if (m_diff_context.from_snap_id == 0 && !end_exists) {
+        compute_parent_overlap(diffs);
+      }
+      return;
+    }
+
+    if (m_diff_context.whole_object || whole_object) {
+      // provide the full object extents to the callback
+      for (auto &extent : m_object_extents) {
+        diffs->push_back(boost::make_tuple(m_offset + extent.offset,
+                                           extent.length, end_exists));
+      }
+      return;
+    }
+
+    for (auto &extent : m_object_extents) {
+      ldout(cct, 20) << "diff_iterate object " << m_oid << " extent "
+                     << extent.offset << "~" << extent.length << " from "
+                     << extent.buffer_extents << dendl;
+      // running position within the object while walking the buffer extents
+      uint64_t opos = extent.offset;
+      for (auto &buf : extent.buffer_extents) {
+        interval_set<uint64_t> overlap;  // object extents
+        overlap.insert(opos, buf.second);
+        overlap.intersection_of(diff);
+        ldout(cct, 20) << " opos " << opos
+                       << " buf " << buf.first << "~" << buf.second
+                       << " overlap " << overlap << dendl;
+        for (auto s = overlap.begin(); s != overlap.end(); ++s) {
+          uint64_t su_off = s.get_start() - opos;
+          uint64_t logical_off = m_offset + buf.first + su_off;
+          ldout(cct, 20) << "   overlap extent " << s.get_start() << "~"
+                         << s.get_len() << " logical " << logical_off << "~"
+                         << s.get_len() << dendl;
+          diffs->push_back(boost::make_tuple(logical_off, s.get_len(),
+                                             end_exists));
+        }
+        opos += buf.second;
+      }
+      ceph_assert(opos == extent.offset + extent.length);
+    }
+  }
+
+  // When diffing from the beginning of time, reports the parts of this
+  // object's extents that intersect the parent diff.
+  void compute_parent_overlap(Diffs *diffs) {
+    if (m_diff_context.from_snap_id != 0 ||
+        m_diff_context.parent_diff.empty()) {
+      return;
+    }
+
+    // report parent diff instead
+    for (auto &extent : m_object_extents) {
+      for (auto &buf : extent.buffer_extents) {
+        interval_set<uint64_t> o;
+        o.insert(m_offset + buf.first, buf.second);
+        o.intersection_of(m_diff_context.parent_diff);
+        ldout(m_cct, 20) << " reporting parent overlap " << o << dendl;
+        for (auto s = o.begin(); s != o.end(); ++s) {
+          diffs->push_back(boost::make_tuple(s.get_start(), s.get_len(),
+                                             true));
+        }
+      }
+    }
+  }
+};
+
+// Diff callback that accumulates existing extents into the
+// interval_set<uint64_t> passed via `arg`. Always returns 0.
+int simple_diff_cb(uint64_t off, size_t len, int exists, void *arg) {
+  // it's possible for a discard to create a hole in the parent image -- ignore
+  if (!exists) {
+    return 0;
+  }
+
+  auto *extents = static_cast<interval_set<uint64_t> *>(arg);
+  extents->insert(off, len);
+  return 0;
+}
+
+} // anonymous namespace
+
+/**
+ * Public entry point: report the extents of the image that differ between
+ * the snapshot named `fromsnapname` (or the beginning of time when it is
+ * null) and the image's current snapshot.
+ *
+ * Flushes in-flight IO, refreshes the image if required, clips the requested
+ * range against the image and then runs the diff.
+ *
+ * @param ictx                 image to diff
+ * @param from_snap_namespace  namespace of the starting snapshot
+ * @param fromsnapname         starting snapshot name, may be null
+ * @param off, len             image range to examine
+ * @param include_parent       also consider the parent image's data
+ * @param whole_object         report whole objects instead of sub-extents
+ * @param cb, arg              callback invoked per extent and its argument
+ * @return 0 on success, -ENODEV if the data pool context is invalid, or the
+ *         negative error from the flush / refresh / clip / diff steps
+ */
+template <typename I>
+int DiffIterate<I>::diff_iterate(I *ictx,
+				 const cls::rbd::SnapshotNamespace& from_snap_namespace,
+				 const char *fromsnapname,
+                                 uint64_t off, uint64_t len,
+                                 bool include_parent, bool whole_object,
+                                 int (*cb)(uint64_t, size_t, int, void *),
+                                 void *arg)
+{
+  ldout(ictx->cct, 20) << "diff_iterate " << ictx << " off = " << off
+  		       << " len = " << len << dendl;
+
+  if (!ictx->data_ctx.is_valid()) {
+    return -ENODEV;
+  }
+
+  // ensure previous writes are visible to listsnaps
+  C_SaferCond flush_ctx;
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, ictx,
+                                                        io::AIO_TYPE_FLUSH);
+    auto req = io::ImageDispatchSpec<I>::create_flush_request(
+      *ictx, aio_comp, io::FLUSH_SOURCE_INTERNAL, {});
+    req->send();
+    delete req;
+  }
+  int r = flush_ctx.wait();
+  if (r < 0) {
+    return r;
+  }
+
+  r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  {
+    // scoped read lock instead of manual get_read()/put_read(), matching the
+    // locking style used by the rest of this file
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    r = clip_io(ictx, off, &len);
+  }
+  if (r < 0) {
+    return r;
+  }
+
+  DiffIterate command(*ictx, from_snap_namespace, fromsnapname, off, len,
+                      include_parent, whole_object, cb, arg);
+  r = command.execute();
+  return r;
+}
+
+template <typename I>
+int DiffIterate<I>::execute() { // walks [m_offset, m_offset + m_length) one stripe period at a time and reports changed extents; returns 0 or a negative error
+ CephContext* cct = m_image_ctx.cct;
+
+ ceph_assert(m_image_ctx.data_ctx.is_valid());
+
+ librados::IoCtx head_ctx;
+ librados::snap_t from_snap_id = 0; // 0 == diff from the beginning of time
+ librados::snap_t end_snap_id;
+ uint64_t from_size = 0;
+ uint64_t end_size;
+ {
+ RWLock::RLocker md_locker(m_image_ctx.md_lock);
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ head_ctx.dup(m_image_ctx.data_ctx);
+ if (m_from_snap_name) {
+ from_snap_id = m_image_ctx.get_snap_id(m_from_snap_namespace, m_from_snap_name);
+ from_size = m_image_ctx.get_image_size(from_snap_id);
+ }
+ end_snap_id = m_image_ctx.snap_id;
+ end_size = m_image_ctx.get_image_size(end_snap_id);
+ }
+
+ if (from_snap_id == CEPH_NOSNAP) { // presumably get_snap_id() yields CEPH_NOSNAP for an unknown snapshot name -- TODO confirm
+ return -ENOENT;
+ }
+ if (from_snap_id == end_snap_id) {
+ // no diff.
+ return 0;
+ }
+ if (from_snap_id >= end_snap_id) { // the starting snapshot must precede the end snapshot
+ return -EINVAL;
+ }
+
+ int r;
+ bool fast_diff_enabled = false;
+ BitVector<2> object_diff_state;
+ {
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ if (m_whole_object && (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+ r = diff_object_map(from_snap_id, end_snap_id, &object_diff_state);
+ if (r < 0) { // object-map diff unavailable: fall back to per-object list_snaps below
+ ldout(cct, 5) << "fast diff disabled" << dendl;
+ } else {
+ ldout(cct, 5) << "fast diff enabled" << dendl;
+ fast_diff_enabled = true;
+ }
+ }
+ }
+
+ // we must list snaps via the head, not end snap
+ head_ctx.snap_set_read(CEPH_SNAPDIR);
+
+ ldout(cct, 5) << "diff_iterate from " << from_snap_id << " to "
+ << end_snap_id << " size from " << from_size
+ << " to " << end_size << dendl;
+
+ // check parent overlap only if we are comparing to the beginning of time
+ DiffContext diff_context(m_image_ctx, m_callback, m_callback_arg,
+ m_whole_object, from_snap_id, end_snap_id);
+ if (m_include_parent && from_snap_id == 0) {
+ RWLock::RLocker l(m_image_ctx.snap_lock);
+ RWLock::RLocker l2(m_image_ctx.parent_lock);
+ uint64_t overlap = 0;
+ m_image_ctx.get_parent_overlap(m_image_ctx.snap_id, &overlap);
+ r = 0; // stays 0 when there is no parent or no overlap to diff
+ if (m_image_ctx.parent && overlap > 0) {
+ ldout(cct, 10) << " first getting parent diff" << dendl;
+ DiffIterate diff_parent(*m_image_ctx.parent, {}, // recursive diff of the parent, collected into diff_context.parent_diff
+ nullptr, 0, overlap,
+ m_include_parent, m_whole_object,
+ &simple_diff_cb,
+ &diff_context.parent_diff);
+ r = diff_parent.execute();
+ }
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ uint64_t period = m_image_ctx.get_stripe_period();
+ uint64_t off = m_offset;
+ uint64_t left = m_length;
+
+ while (left > 0) {
+ uint64_t period_off = off - (off % period); // start of the stripe period containing off
+ uint64_t read_len = min(period_off + period - off, left); // never cross the end of the current stripe period
+
+ // map to extents
+ map<object_t,vector<ObjectExtent> > object_extents;
+ Striper::file_to_extents(cct, m_image_ctx.format_string,
+ &m_image_ctx.layout, off, read_len, 0,
+ object_extents, 0);
+
+ // get snap info for each object
+ for (map<object_t,vector<ObjectExtent> >::iterator p =
+ object_extents.begin();
+ p != object_extents.end(); ++p) {
+ ldout(cct, 20) << "object " << p->first << dendl;
+
+ if (fast_diff_enabled) { // fast path: the callback is invoked synchronously from the object-map diff state
+ const uint64_t object_no = p->second.front().objectno;
+ if (object_diff_state[object_no] == OBJECT_DIFF_STATE_NONE &&
+ from_snap_id == 0 && !diff_context.parent_diff.empty()) {
+ // no data in child object -- report parent diff instead
+ for (auto& oe : p->second) {
+ for (auto& be : oe.buffer_extents) {
+ interval_set<uint64_t> o;
+ o.insert(off + be.first, be.second);
+ o.intersection_of(diff_context.parent_diff);
+ ldout(cct, 20) << " reporting parent overlap " << o << dendl;
+ for (auto e = o.begin(); e != o.end(); ++e) {
+ r = m_callback(e.get_start(), e.get_len(), true,
+ m_callback_arg);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+ }
+ } else if (object_diff_state[object_no] != OBJECT_DIFF_STATE_NONE) {
+ bool updated = (object_diff_state[object_no] == // false means the object is reported as a hole
+ OBJECT_DIFF_STATE_UPDATED);
+ for (std::vector<ObjectExtent>::iterator q = p->second.begin();
+ q != p->second.end(); ++q) {
+ r = m_callback(off + q->offset, q->length, updated, m_callback_arg);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+ } else { // slow path: one throttled list_snaps request per object
+ C_DiffObject *diff_object = new C_DiffObject(m_image_ctx, head_ctx,
+ diff_context,
+ p->first.name, off,
+ p->second);
+ diff_object->send();
+
+ if (diff_context.throttle.pending_error()) { // stop issuing requests once an earlier one has failed
+ r = diff_context.throttle.wait_for_ret();
+ return r;
+ }
+ }
+ }
+
+ left -= read_len;
+ off += read_len;
+ }
+
+ r = diff_context.throttle.wait_for_ret(); // collect the result of the outstanding per-object requests
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+/**
+ * Compute a per-object diff state between two snapshots from the object maps
+ * of every snapshot in between.
+ *
+ * Walks the snapshots from `from_snap_id` (or the last entry of
+ * m_image_ctx.snaps / HEAD when diffing from the beginning of time) towards
+ * `to_snap_id`, loading each object map and folding the state transitions
+ * into `object_diff_state`.
+ *
+ * The caller must hold m_image_ctx.snap_lock.
+ *
+ * @param from_snap_id       starting snapshot id, 0 for beginning of time
+ * @param to_snap_id         ending snapshot id
+ * @param object_diff_state  out: one ObjectDiffState per object
+ * @return 0 on success; -EINVAL if an object map is flagged invalid or is too
+ *         small; otherwise the negative error from reading flags / object map
+ */
+template <typename I>
+int DiffIterate<I>::diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id,
+                                    BitVector<2>* object_diff_state) {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  CephContext* cct = m_image_ctx.cct;
+
+  bool diff_from_start = (from_snap_id == 0);
+  if (from_snap_id == 0) {
+    if (!m_image_ctx.snaps.empty()) {
+      from_snap_id = m_image_ctx.snaps.back();
+    } else {
+      from_snap_id = CEPH_NOSNAP;
+    }
+  }
+
+  object_diff_state->clear();
+  uint64_t current_snap_id = from_snap_id;
+  uint64_t next_snap_id = to_snap_id;
+  BitVector<2> prev_object_map;
+  bool prev_object_map_valid = false;
+  while (true) {
+    uint64_t current_size = m_image_ctx.size;
+    if (current_snap_id != CEPH_NOSNAP) {
+      std::map<librados::snap_t, SnapInfo>::const_iterator snap_it =
+        m_image_ctx.snap_info.find(current_snap_id);
+      ceph_assert(snap_it != m_image_ctx.snap_info.end());
+      current_size = snap_it->second.size;
+
+      // the following snap_info entry (or HEAD) is processed next
+      ++snap_it;
+      if (snap_it != m_image_ctx.snap_info.end()) {
+        next_snap_id = snap_it->first;
+      } else {
+        next_snap_id = CEPH_NOSNAP;
+      }
+    }
+
+    // validate the flags of the snapshot whose object map is loaded in this
+    // iteration; checking only the starting snapshot would let an invalid
+    // intermediate object map slip through
+    uint64_t flags;
+    int r = m_image_ctx.get_flags(current_snap_id, &flags);
+    if (r < 0) {
+      lderr(cct) << "diff_object_map: failed to retrieve image flags" << dendl;
+      return r;
+    }
+    if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
+      ldout(cct, 1) << "diff_object_map: cannot perform fast diff on invalid "
+                    << "object map" << dendl;
+      return -EINVAL;
+    }
+
+    BitVector<2> object_map;
+    std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id,
+                                                 current_snap_id));
+    r = cls_client::object_map_load(&m_image_ctx.md_ctx, oid, &object_map);
+    if (r < 0) {
+      lderr(cct) << "diff_object_map: failed to load object map " << oid
+                 << dendl;
+      return r;
+    }
+    ldout(cct, 20) << "diff_object_map: loaded object map " << oid << dendl;
+
+    uint64_t num_objs = Striper::get_num_objects(m_image_ctx.layout,
+                                                 current_size);
+    if (object_map.size() < num_objs) {
+      ldout(cct, 1) << "diff_object_map: object map too small: "
+                    << object_map.size() << " < " << num_objs << dendl;
+      return -EINVAL;
+    }
+    object_map.resize(num_objs);
+    object_diff_state->resize(object_map.size());
+
+    // objects present in both the previous and the current object map
+    uint64_t overlap = std::min(object_map.size(), prev_object_map.size());
+    auto it = object_map.begin();
+    auto overlap_end_it = it + overlap;
+    auto pre_it = prev_object_map.begin();
+    auto diff_it = object_diff_state->begin();
+    uint64_t i = 0;
+    for (; it != overlap_end_it; ++it, ++pre_it, ++diff_it, ++i) {
+      ldout(cct, 20) << __func__ << ": object state: " << i << " "
+                     << static_cast<uint32_t>(*pre_it)
+                     << "->" << static_cast<uint32_t>(*it) << dendl;
+      if (*it == OBJECT_NONEXISTENT) {
+        if (*pre_it != OBJECT_NONEXISTENT) {
+          *diff_it = OBJECT_DIFF_STATE_HOLE;
+        }
+      } else if (*it == OBJECT_EXISTS ||
+                 (*pre_it != *it &&
+                  !(*pre_it == OBJECT_EXISTS &&
+                    *it == OBJECT_EXISTS_CLEAN))) {
+        *diff_it = OBJECT_DIFF_STATE_UPDATED;
+      }
+    }
+    ldout(cct, 20) << "diff_object_map: computed overlap diffs" << dendl;
+
+    // objects beyond the previous map: `it`, `diff_it` and `i` continue from
+    // where the overlap loop stopped
+    auto end_it = object_map.end();
+    if (object_map.size() > prev_object_map.size() &&
+        (diff_from_start || prev_object_map_valid)) {
+      for (; it != end_it; ++it,++diff_it, ++i) {
+        ldout(cct, 20) << __func__ << ": object state: " << i << " "
+                       << "->" << static_cast<uint32_t>(*it) << dendl;
+        if (*it == OBJECT_NONEXISTENT) {
+          *diff_it = OBJECT_DIFF_STATE_NONE;
+        } else {
+          *diff_it = OBJECT_DIFF_STATE_UPDATED;
+        }
+      }
+    }
+    ldout(cct, 20) << "diff_object_map: computed resize diffs" << dendl;
+
+    if (current_snap_id == next_snap_id || next_snap_id > to_snap_id) {
+      break;
+    }
+    current_snap_id = next_snap_id;
+    prev_object_map = object_map;
+    prev_object_map_valid = true;
+  }
+  return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::DiffIterate<librbd::ImageCtx>;
diff --git a/src/librbd/api/DiffIterate.h b/src/librbd/api/DiffIterate.h
new file mode 100644
index 00000000..e6074d9c
--- /dev/null
+++ b/src/librbd/api/DiffIterate.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_DIFF_ITERATE_H
+#define CEPH_LIBRBD_API_DIFF_ITERATE_H
+
+#include "include/int_types.h"
+#include "common/bit_vector.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+// Computes the changed extents of an image between a starting snapshot (or
+// the beginning of time) and the image's current snapshot. Instances are only
+// created internally; callers use the static diff_iterate() entry point.
+template <typename ImageCtxT = librbd::ImageCtx>
+class DiffIterate {
+public:
+  // Per-extent callback: (offset, length, exists, user argument).
+  typedef int (*Callback)(uint64_t, size_t, int, void *);
+
+  static int diff_iterate(ImageCtxT *ictx,
+			  const cls::rbd::SnapshotNamespace& from_snap_namespace,
+			  const char *fromsnapname,
+                          uint64_t off, uint64_t len, bool include_parent,
+                          bool whole_object,
+                          int (*cb)(uint64_t, size_t, int, void *),
+                          void *arg);
+
+private:
+  ImageCtxT &m_image_ctx;
+  cls::rbd::SnapshotNamespace m_from_snap_namespace;
+  const char* m_from_snap_name;  // not owned; null == from beginning of time
+  uint64_t m_offset;
+  uint64_t m_length;
+  bool m_include_parent;
+  bool m_whole_object;
+  Callback m_callback;
+  void *m_callback_arg;
+
+  DiffIterate(ImageCtxT &image_ctx,
+	      const cls::rbd::SnapshotNamespace& from_snap_namespace,
+	      const char *from_snap_name, uint64_t off, uint64_t len,
+              bool include_parent, bool whole_object, Callback callback,
+              void *callback_arg)
+    : m_image_ctx(image_ctx),
+      m_from_snap_namespace(from_snap_namespace),
+      m_from_snap_name(from_snap_name),
+      m_offset(off),
+      m_length(len),
+      m_include_parent(include_parent),
+      m_whole_object(whole_object),
+      m_callback(callback),
+      m_callback_arg(callback_arg)
+  {
+  }
+
+  // Runs the diff over [m_offset, m_offset + m_length).
+  int execute();
+
+  // Object-map based diff; fills one state per object.
+  int diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id,
+                      BitVector<2>* object_diff_state);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::DiffIterate<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_DIFF_ITERATE_H
diff --git a/src/librbd/api/Group.cc b/src/librbd/api/Group.cc
new file mode 100644
index 00000000..8b75bd94
--- /dev/null
+++ b/src/librbd/api/Group.cc
@@ -0,0 +1,1207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/api/Group.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Group: " << __func__ << ": "
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+// list binds to list() here, so std::list is explicitly used below
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+using librados::Rados;
+
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+// Returns the id of the first image snapshot recorded under the given snapshot
+// namespace, or CEPH_NOSNAP when the image has none. The caller must hold
+// ictx->snap_lock.
+template <typename I>
+snap_t get_group_snap_id(I* ictx,
+                         const cls::rbd::SnapshotNamespace& in_snap_namespace) {
+  ceph_assert(ictx->snap_lock.is_locked());
+
+  // snap_ids is keyed by (namespace, name): seek to the smallest key that is
+  // not below (namespace, "") and check whether it is in the same namespace.
+  auto entry = ictx->snap_ids.lower_bound({in_snap_namespace, ""});
+  if (entry == ictx->snap_ids.end() ||
+      !(entry->first.first == in_snap_namespace)) {
+    return CEPH_NOSNAP;
+  }
+  return entry->second;
+}
+
+// Builds an id string by concatenating, in hex, the rados instance id of the
+// cluster handle behind `io_ctx` and a value drawn from rand().
+string generate_uuid(librados::IoCtx& io_ctx)
+{
+  Rados rados(io_ctx);
+  const uint64_t instance_id = rados.get_instance_id();
+  const uint32_t random_part = rand() % 0xFFFFFFFF;
+
+  ostringstream id_stream;
+  id_stream << std::hex << instance_id << std::hex << random_part;
+  return id_stream.str();
+}
+
+// Appends every snapshot record of the group called `group_name` to
+// `*cls_snaps` (the vector is not cleared first).
+//
+// The group id is resolved through RBD_GROUP_DIRECTORY and the records are
+// fetched from the group header in pages of `max_read` entries; a short page
+// ends the listing.
+//
+// Returns 0 on success or the negative error code of the failing call.
+int group_snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+                    std::vector<cls::rbd::GroupSnapshot> *cls_snaps)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+
+  string group_id;
+
+  int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+                                 group_name, &group_id);
+  if (r < 0) {
+    lderr(cct) << "error reading group id object: "
+               << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+  string group_header_oid = util::group_header_name(group_id);
+
+  const int max_read = 1024;
+  // Last record of the previous page; a default-constructed value starts the
+  // listing from the beginning.
+  cls::rbd::GroupSnapshot snap_last;
+
+  for (;;) {
+    vector<cls::rbd::GroupSnapshot> snaps_page;
+
+    r = cls_client::group_snap_list(&group_ioctx, group_header_oid,
+                                    snap_last, max_read, &snaps_page);
+
+    if (r < 0) {
+      lderr(cct) << "error reading snap list from group: "
+                 << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    cls_snaps->insert(cls_snaps->end(), snaps_page.begin(), snaps_page.end());
+    // Explicit cast: compare sizes as unsigned instead of relying on the
+    // implicit int -> size_t conversion.
+    if (snaps_page.size() < static_cast<size_t>(max_read)) {
+      break;
+    }
+    snap_last = *snaps_page.rbegin();
+  }
+
+  return 0;
+}
+
+// Composes the name used for the per-image snapshot that backs a group
+// snapshot: ".group.<pool id in hex>_<group id>_<snap id>".
+std::string calc_ind_image_snap_name(uint64_t pool_id,
+                                     const std::string &group_id,
+                                     const std::string &snap_id)
+{
+  std::stringstream name;
+  name << ".group." << std::hex << pool_id;
+  name << "_" << group_id;
+  name << "_" << snap_id;
+  return name.str();
+}
+
+// Replaces the contents of `*image_ids` with the image link records of the
+// group called `group_name`.
+//
+// The group id is resolved through RBD_GROUP_DIRECTORY; records are read from
+// the group header in pages of `max_read` entries until a page comes back
+// that is not completely full.
+//
+// Returns 0 on success or the negative error code of the failing call.
+int group_image_list(librados::IoCtx& group_ioctx, const char *group_name,
+                     std::vector<cls::rbd::GroupImageStatus> *image_ids)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+
+  string group_id;
+  int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+                                 group_name, &group_id);
+  if (r < 0) {
+    lderr(cct) << "error reading group id object: "
+               << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+  string group_header_oid = util::group_header_name(group_id);
+
+  ldout(cct, 20) << "listing images in group name "
+                 << group_name << " group id " << group_header_oid << dendl;
+  image_ids->clear();
+
+  const int max_read = 1024;
+  // Resume marker: spec of the last record seen so far.
+  cls::rbd::GroupImageSpec start_last;
+  for (;;) {
+    std::vector<cls::rbd::GroupImageStatus> page;
+
+    r = cls_client::group_image_list(&group_ioctx, group_header_oid,
+                                     start_last, max_read, &page);
+    if (r < 0) {
+      lderr(cct) << "error reading image list from group: "
+                 << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    image_ids->insert(image_ids->end(), page.begin(), page.end());
+
+    if (page.empty()) {
+      break;
+    }
+    start_last = page.rbegin()->spec;
+
+    // Only a completely full page suggests that more records may follow.
+    if (static_cast<int>(page.size()) != max_read) {
+      break;
+    }
+  }
+
+  return 0;
+}
+
+// Unlinks image `image_id` (living in `image_ioctx`) from group `group_id`
+// (living in `group_ioctx`).
+//
+// Steps, in order:
+//   1. mark the link INCOMPLETE in the group header,
+//   2. drop the group reference from the image header (-ENOENT is tolerated)
+//      and, if something was removed, notify image header watchers,
+//   3. delete the link record from the group header.
+//
+// Returns 0 on success or the negative error code of the failing step.
+int group_image_remove(librados::IoCtx& group_ioctx, string group_id,
+                       librados::IoCtx& image_ioctx, string image_id)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+
+  string group_header_oid = util::group_header_name(group_id);
+
+  string image_header_oid = util::header_name(image_id);
+
+  ldout(cct, 20) << "removing image " << image_id
+                 << " image id " << image_header_oid << dendl;
+
+  cls::rbd::GroupSpec group_spec(group_id, group_ioctx.get_id());
+
+  cls::rbd::GroupImageStatus incomplete_st(image_id, image_ioctx.get_id(),
+                                cls::rbd::GROUP_IMAGE_LINK_STATE_INCOMPLETE);
+
+  cls::rbd::GroupImageSpec spec(image_id, image_ioctx.get_id());
+
+  int r = cls_client::group_image_set(&group_ioctx, group_header_oid,
+                                      incomplete_st);
+
+  if (r < 0) {
+    lderr(cct) << "couldn't put image into removing state: "
+               << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = cls_client::image_group_remove(&image_ioctx, image_header_oid,
+                                     group_spec);
+  if ((r < 0) && (r != -ENOENT)) {
+    // Separator added: the message used to run straight into the strerror
+    // text.
+    lderr(cct) << "couldn't remove group reference from image: "
+               << cpp_strerror(-r) << dendl;
+    return r;
+  } else if (r >= 0) {
+    ImageWatcher<>::notify_header_update(image_ioctx, image_header_oid);
+  }
+
+  r = cls_client::group_image_remove(&group_ioctx, group_header_oid, spec);
+  if (r < 0) {
+    lderr(cct) << "couldn't remove image from group: "
+               << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Removes the group snapshot described by `group_snap`: opens every image
+// listed in the record, removes the image snapshot that lives in this group
+// snapshot's namespace from each of them, and finally deletes the record from
+// the group header `group_header_oid`.
+//
+// Missing image snapshots are ignored. If any image cannot be opened, or any
+// snapshot removal fails with something other than -ENOENT, the record is
+// kept and the (last) error is returned. Every image that was opened
+// successfully is closed before returning.
+int group_snap_remove_by_record(librados::IoCtx& group_ioctx,
+                                const cls::rbd::GroupSnapshot& group_snap,
+                                const std::string& group_id,
+                                const std::string& group_header_oid) {
+
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+  std::vector<C_SaferCond*> on_finishes;
+  int r, ret_code;
+
+  std::vector<librbd::ImageCtx*> ictxs;
+
+  cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id,
+                                      group_snap.id};
+
+  ldout(cct, 20) << "Removing snapshots" << dendl;
+  int snap_count = group_snap.snaps.size();
+
+  // Issue all opens asynchronously.
+  ret_code = 0;
+  for (int i = 0; i < snap_count; ++i) {
+    librbd::IoCtx image_io_ctx;
+    r = util::create_ioctx(group_ioctx, "image", group_snap.snaps[i].pool, {},
+                           &image_io_ctx);
+    if (r < 0) {
+      // Do not return here: opens issued by earlier iterations are still in
+      // flight and must be waited for and cleaned up below.
+      ret_code = r;
+      break;
+    }
+
+    librbd::ImageCtx* image_ctx = new ImageCtx("", group_snap.snaps[i].image_id,
+                                               nullptr, image_io_ctx, false);
+
+    C_SaferCond* on_finish = new C_SaferCond;
+
+    image_ctx->state->open(0, on_finish);
+
+    ictxs.push_back(image_ctx);
+    on_finishes.push_back(on_finish);
+  }
+
+  // Wait for every open that was actually issued (possibly fewer than
+  // snap_count if create_ioctx failed above).
+  for (int i = 0, n = ictxs.size(); i < n; ++i) {
+    r = on_finishes[i]->wait();
+    delete on_finishes[i];
+    if (r < 0) {
+      delete ictxs[i];
+      ictxs[i] = nullptr;
+      ret_code = r;
+    }
+  }
+  if (ret_code != 0) {
+    goto finish;
+  }
+
+  ldout(cct, 20) << "Opened participating images. " <<
+                    "Deleting snapshots themselves." << dendl;
+
+  // From here on ictxs holds exactly snap_count open images.
+  for (int i = 0; i < snap_count; ++i) {
+    ImageCtx *ictx = ictxs[i];
+    on_finishes[i] = new C_SaferCond;
+
+    std::string snap_name;
+    ictx->snap_lock.get_read();
+    snap_t snap_id = get_group_snap_id(ictx, ne);
+    r = ictx->get_snap_name(snap_id, &snap_name);
+    ictx->snap_lock.put_read();
+
+    if (r >= 0) {
+      ldout(cct, 20) << "removing individual snapshot from image " << ictx->name
+                     << dendl;
+      ictx->operations->snap_remove(ne, snap_name, on_finishes[i]);
+    } else {
+      // We are ok to ignore missing image snapshots. The snapshot could have
+      // been inconsistent in the first place.
+      on_finishes[i]->complete(0);
+    }
+  }
+
+  for (int i = 0; i < snap_count; ++i) {
+    r = on_finishes[i]->wait();
+    delete on_finishes[i];
+    if (r < 0 && r != -ENOENT) {
+      // if previous attempts to remove this snapshot failed then the image's
+      // snapshot may not exist
+      lderr(cct) << "Failed deleting image snapshot. Ret code: " << r << dendl;
+      ret_code = r;
+    }
+  }
+
+  if (ret_code != 0) {
+    goto finish;
+  }
+
+  ldout(cct, 20) << "Removed images snapshots removing snapshot record."
+                 << dendl;
+
+  r = cls_client::group_snap_remove(&group_ioctx, group_header_oid,
+                                    group_snap.id);
+  if (r < 0) {
+    ret_code = r;
+    goto finish;
+  }
+
+finish:
+  // Bounded by ictxs.size(): fewer than snap_count images may have been
+  // created when create_ioctx failed part-way through.
+  for (int i = 0, n = ictxs.size(); i < n; ++i) {
+    if (ictxs[i] != nullptr) {
+      ictxs[i]->state->close();
+    }
+  }
+  return ret_code;
+}
+
+// Rolls every image listed in `group_snap` back to the image snapshot that
+// lives in this group snapshot's namespace.
+//
+// Stages: open all images, block lock requests and acquire the exclusive lock
+// on each image that has one, then start a snapshot rollback per image and
+// wait for all of them. `pctx` is handed to every per-image rollback.
+//
+// Returns 0 on success, otherwise the (last) error seen in the failing stage.
+// An image for which no snapshot name could be resolved completes with that
+// lookup error; -ENOENT results are not treated as failures. Every image that
+// was opened successfully is closed before returning.
+int group_snap_rollback_by_record(librados::IoCtx& group_ioctx,
+                                  const cls::rbd::GroupSnapshot& group_snap,
+                                  const std::string& group_id,
+                                  const std::string& group_header_oid,
+                                  ProgressContext& pctx) {
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+  std::vector<C_SaferCond*> on_finishes;
+  int r, ret_code;
+
+  std::vector<librbd::ImageCtx*> ictxs;
+
+  cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id,
+                                      group_snap.id};
+
+  ldout(cct, 20) << "Rolling back snapshots" << dendl;
+  int snap_count = group_snap.snaps.size();
+
+  // Issue all opens asynchronously.
+  ret_code = 0;
+  for (int i = 0; i < snap_count; ++i) {
+    librados::IoCtx image_io_ctx;
+    r = util::create_ioctx(group_ioctx, "image", group_snap.snaps[i].pool, {},
+                           &image_io_ctx);
+    if (r < 0) {
+      // Do not return here: opens issued by earlier iterations are still in
+      // flight and must be waited for and cleaned up below.
+      ret_code = r;
+      break;
+    }
+
+    librbd::ImageCtx* image_ctx = new ImageCtx("", group_snap.snaps[i].image_id,
+                                               nullptr, image_io_ctx, false);
+
+    C_SaferCond* on_finish = new C_SaferCond;
+
+    image_ctx->state->open(0, on_finish);
+
+    ictxs.push_back(image_ctx);
+    on_finishes.push_back(on_finish);
+  }
+
+  // Wait for every open that was actually issued (possibly fewer than
+  // snap_count if create_ioctx failed above).
+  for (int i = 0, n = ictxs.size(); i < n; ++i) {
+    r = on_finishes[i]->wait();
+    delete on_finishes[i];
+    if (r < 0) {
+      delete ictxs[i];
+      ictxs[i] = nullptr;
+      ret_code = r;
+    }
+  }
+  if (ret_code != 0) {
+    goto finish;
+  }
+
+  // From here on ictxs holds exactly snap_count open images.
+  ldout(cct, 20) << "Requesting exclusive locks for images" << dendl;
+  for (auto ictx: ictxs) {
+    RWLock::RLocker owner_lock(ictx->owner_lock);
+    if (ictx->exclusive_lock != nullptr) {
+      ictx->exclusive_lock->block_requests(-EBUSY);
+    }
+  }
+  for (int i = 0; i < snap_count; ++i) {
+    ImageCtx *ictx = ictxs[i];
+    RWLock::RLocker owner_lock(ictx->owner_lock);
+
+    on_finishes[i] = new C_SaferCond;
+    if (ictx->exclusive_lock != nullptr) {
+      ictx->exclusive_lock->acquire_lock(on_finishes[i]);
+    }
+  }
+
+  ret_code = 0;
+  for (int i = 0; i < snap_count; ++i) {
+    r = 0;
+    ImageCtx *ictx = ictxs[i];
+    // Only wait when a lock request was issued for this image; otherwise the
+    // completion was never handed out and is simply freed.
+    if (ictx->exclusive_lock != nullptr) {
+      r = on_finishes[i]->wait();
+    }
+    delete on_finishes[i];
+    if (r < 0) {
+      ret_code = r;
+    }
+  }
+  if (ret_code != 0) {
+    goto finish;
+  }
+
+  for (int i = 0; i < snap_count; ++i) {
+    ImageCtx *ictx = ictxs[i];
+    on_finishes[i] = new C_SaferCond;
+
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    std::string snap_name;
+    ictx->snap_lock.get_read();
+    snap_t snap_id = get_group_snap_id(ictx, ne);
+    r = ictx->get_snap_name(snap_id, &snap_name);
+    ictx->snap_lock.put_read();
+
+    if (r >= 0) {
+      ldout(cct, 20) << "rolling back to individual snapshot for image " << ictx->name
+                     << dendl;
+      ictx->operations->execute_snap_rollback(ne, snap_name, pctx, on_finishes[i]);
+    } else {
+      on_finishes[i]->complete(r);
+    }
+  }
+
+  for (int i = 0; i < snap_count; ++i) {
+    r = on_finishes[i]->wait();
+    delete on_finishes[i];
+    if (r < 0 && r != -ENOENT) {
+      lderr(cct) << "Failed rolling back group to snapshot. Ret code: " << r << dendl;
+      ret_code = r;
+    }
+  }
+
+finish:
+  // Bounded by ictxs.size(): fewer than snap_count images may have been
+  // created when create_ioctx failed part-way through.
+  for (int i = 0, n = ictxs.size(); i < n; ++i) {
+    if (ictxs[i] != nullptr) {
+      ictxs[i]->state->close();
+    }
+  }
+  return ret_code;
+}
+
+} // anonymous namespace
+
+// Unlinks the image with id `image_id` from the group called `group_name`.
+// The group id is looked up in RBD_GROUP_DIRECTORY; the actual unlinking is
+// delegated to group_image_remove(). Returns 0 or a negative error code.
+template <typename I>
+int Group<I>::image_remove_by_id(librados::IoCtx& group_ioctx,
+                                 const char *group_name,
+                                 librados::IoCtx& image_ioctx,
+                                 const char *image_id)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+  ldout(cct, 20) << "io_ctx=" << &group_ioctx
+                 << " group name " << group_name << " image "
+                 << &image_ioctx << " id " << image_id << dendl;
+
+  string resolved_group_id;
+  const int lookup_r = cls_client::dir_get_id(&group_ioctx,
+                                              RBD_GROUP_DIRECTORY, group_name,
+                                              &resolved_group_id);
+  if (lookup_r < 0) {
+    lderr(cct) << "error reading group id object: "
+               << cpp_strerror(lookup_r)
+               << dendl;
+    return lookup_r;
+  }
+
+  ldout(cct, 20) << "removing image from group name " << group_name
+                 << " group id " << resolved_group_id << dendl;
+
+  return group_image_remove(group_ioctx, resolved_group_id, image_ioctx,
+                            string(image_id));
+}
+
+// Creates a group called `group_name`: registers it in RBD_GROUP_DIRECTORY
+// under a freshly generated id and then creates the group header object
+// exclusively. If the header cannot be created, the directory entry is
+// removed again (best effort) and the header creation error is returned.
+template <typename I>
+int Group<I>::create(librados::IoCtx& io_ctx, const char *group_name)
+{
+  CephContext *cct = (CephContext *)io_ctx.cct();
+
+  string id = generate_uuid(io_ctx);
+
+  ldout(cct, 2) << "adding group to directory..." << dendl;
+
+  int r = cls_client::group_dir_add(&io_ctx, RBD_GROUP_DIRECTORY, group_name,
+                                    id);
+  if (r < 0) {
+    lderr(cct) << "error adding group to directory: "
+               << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  string header_oid = util::group_header_name(id);
+  r = io_ctx.create(header_oid, true);
+  if (r >= 0) {
+    return 0;
+  }
+  lderr(cct) << "error creating group header: " << cpp_strerror(r) << dendl;
+
+  // Roll back the directory entry; a failure here is only logged so that the
+  // caller still sees the original error.
+  int remove_r = cls_client::group_dir_remove(&io_ctx, RBD_GROUP_DIRECTORY,
+                                              group_name, id);
+  if (remove_r < 0) {
+    lderr(cct) << "error cleaning up group from rbd_directory "
+               << "object after creation failed: " << cpp_strerror(remove_r)
+               << dendl;
+  }
+
+  return r;
+}
+
+// Removes the group called `group_name`: deletes all of its snapshots,
+// unlinks all of its images, then removes the group header object and the
+// directory entry. -ENOENT from the lookup, the listings and the removal
+// steps is tolerated, so the function continues past pieces that are already
+// gone. Returns 0 or the first other error.
+template <typename I>
+int Group<I>::remove(librados::IoCtx& io_ctx, const char *group_name)
+{
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << "group_remove " << &io_ctx << " " << group_name << dendl;
+
+  std::string group_id;
+  int r = cls_client::dir_get_id(&io_ctx, RBD_GROUP_DIRECTORY,
+                                 std::string(group_name), &group_id);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "error getting id of group" << dendl;
+    return r;
+  }
+  string group_header_oid = util::group_header_name(group_id);
+
+  // Stage 1: drop every group snapshot.
+  std::vector<cls::rbd::GroupSnapshot> group_snaps;
+  r = group_snap_list(io_ctx, group_name, &group_snaps);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "error listing group snapshots" << dendl;
+    return r;
+  }
+  for (auto it = group_snaps.begin(); it != group_snaps.end(); ++it) {
+    r = group_snap_remove_by_record(io_ctx, *it, group_id, group_header_oid);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  // Stage 2: unlink every member image.
+  std::vector<cls::rbd::GroupImageStatus> member_images;
+  r = group_image_list(io_ctx, group_name, &member_images);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "error listing group images" << dendl;
+    return r;
+  }
+  for (const auto &member : member_images) {
+    IoCtx image_ioctx;
+    r = util::create_ioctx(io_ctx, "image", member.spec.pool_id, {},
+                           &image_ioctx);
+    if (r < 0) {
+      return r;
+    }
+
+    r = group_image_remove(io_ctx, group_id, image_ioctx,
+                           member.spec.image_id);
+    if (r < 0 && r != -ENOENT) {
+      lderr(cct) << "error removing image from a group" << dendl;
+      return r;
+    }
+  }
+
+  // Stage 3: remove the header object, then the directory entry.
+  string header_oid = util::group_header_name(group_id);
+  r = io_ctx.remove(header_oid);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "error removing header: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = cls_client::group_dir_remove(&io_ctx, RBD_GROUP_DIRECTORY,
+                                   group_name, group_id);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "error removing group from directory" << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Appends the names of all groups registered in RBD_GROUP_DIRECTORY to
+// `*names`. The directory is read in pages of `max_read` entries, resuming
+// after the last name of the previous page.
+//
+// A missing directory (-ENOENT) is not an error: it means no groups exist
+// yet, and 0 is returned. Any other failure is logged and returned.
+template <typename I>
+int Group<I>::list(IoCtx& io_ctx, vector<string> *names)
+{
+  CephContext *cct = (CephContext *)io_ctx.cct();
+  ldout(cct, 20) << "io_ctx=" << &io_ctx << dendl;
+
+  int max_read = 1024;
+  string last_read = "";
+  int r;
+  do {
+    map<string, string> groups;
+    r = cls_client::group_dir_list(&io_ctx, RBD_GROUP_DIRECTORY, last_read,
+                                   max_read, &groups);
+    if (r == -ENOENT) {
+      return 0; // Ignore missing rbd group directory. It means we don't have any groups yet.
+    }
+    if (r < 0) {
+      // -ENOENT was handled above, so every error reaching this point is
+      // reported to the caller.
+      lderr(cct) << "error listing group in directory: "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+    // Bind by const reference: no per-entry copy of the (name, id) pair.
+    for (const auto &group : groups) {
+      names->push_back(group.first);
+    }
+    if (!groups.empty()) {
+      last_read = groups.rbegin()->first;
+    }
+    r = groups.size();
+  } while (r == max_read);
+
+  return 0;
+}
+
+// Links the image called `image_name` into the group called `group_name`.
+//
+// Group and image must be in the same namespace (-EINVAL otherwise). The link
+// is first recorded as INCOMPLETE in the group header, then the group
+// reference is added to the image header (header watchers are notified), and
+// finally the link is switched to ATTACHED. If adding the reference to the
+// image fails, the group-side record is removed again on a best-effort basis.
+//
+// Returns 0 on success or a negative error code.
+template <typename I>
+int Group<I>::image_add(librados::IoCtx& group_ioctx, const char *group_name,
+                        librados::IoCtx& image_ioctx, const char *image_name)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+  ldout(cct, 20) << "io_ctx=" << &group_ioctx
+                 << " group name " << group_name << " image "
+                 << &image_ioctx << " name " << image_name << dendl;
+
+  if (group_ioctx.get_namespace() != image_ioctx.get_namespace()) {
+    lderr(cct) << "group and image cannot be in different namespaces" << dendl;
+    return -EINVAL;
+  }
+
+  // Resolve the group.
+  string group_id;
+  int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name,
+                                 &group_id);
+  if (r < 0) {
+    lderr(cct) << "error reading group id object: "
+               << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+  string group_header_oid = util::group_header_name(group_id);
+
+  ldout(cct, 20) << "adding image to group name " << group_name
+                 << " group id " << group_header_oid << dendl;
+
+  // Resolve the image.
+  string image_id;
+  r = cls_client::dir_get_id(&image_ioctx, RBD_DIRECTORY, image_name,
+                             &image_id);
+  if (r < 0) {
+    lderr(cct) << "error reading image id object: "
+               << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  string image_header_oid = util::header_name(image_id);
+
+  ldout(cct, 20) << "adding image " << image_name
+                 << " image id " << image_header_oid << dendl;
+
+  // Step 1: group -> image link, marked INCOMPLETE.
+  cls::rbd::GroupImageStatus link_incomplete(
+    image_id, image_ioctx.get_id(),
+    cls::rbd::GROUP_IMAGE_LINK_STATE_INCOMPLETE);
+  r = cls_client::group_image_set(&group_ioctx, group_header_oid,
+                                  link_incomplete);
+  if (r < 0) {
+    lderr(cct) << "error adding image reference to group: "
+               << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  // Step 2: image -> group back reference.
+  cls::rbd::GroupSpec group_spec(group_id, group_ioctx.get_id());
+  r = cls_client::image_group_add(&image_ioctx, image_header_oid, group_spec);
+  if (r < 0) {
+    lderr(cct) << "error adding group reference to image: "
+               << cpp_strerror(-r) << dendl;
+    cls::rbd::GroupImageSpec spec(image_id, image_ioctx.get_id());
+    cls_client::group_image_remove(&group_ioctx, group_header_oid, spec);
+    // Ignore errors in the clean up procedure.
+    return r;
+  }
+  ImageWatcher<>::notify_header_update(image_ioctx, image_header_oid);
+
+  // Step 3: promote the link to ATTACHED; its result is the final result.
+  cls::rbd::GroupImageStatus link_attached(
+    image_id, image_ioctx.get_id(), cls::rbd::GROUP_IMAGE_LINK_STATE_ATTACHED);
+  return cls_client::group_image_set(&group_ioctx, group_header_oid,
+                                     link_attached);
+}
+
+// Unlinks the image called `image_name` from the group called `group_name`.
+// Group and image must be in the same namespace (-EINVAL otherwise). Both
+// names are resolved to ids through their directories and the unlinking is
+// delegated to group_image_remove(). Returns 0 or a negative error code.
+template <typename I>
+int Group<I>::image_remove(librados::IoCtx& group_ioctx, const char *group_name,
+                           librados::IoCtx& image_ioctx, const char *image_name)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+  ldout(cct, 20) << "io_ctx=" << &group_ioctx
+                 << " group name " << group_name << " image "
+                 << &image_ioctx << " name " << image_name << dendl;
+
+  if (group_ioctx.get_namespace() != image_ioctx.get_namespace()) {
+    lderr(cct) << "group and image cannot be in different namespaces" << dendl;
+    return -EINVAL;
+  }
+
+  string resolved_group_id;
+  int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name,
+                                 &resolved_group_id);
+  if (r < 0) {
+    lderr(cct) << "error reading group id object: "
+               << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  ldout(cct, 20) << "removing image from group name " << group_name
+                 << " group id " << resolved_group_id << dendl;
+
+  string resolved_image_id;
+  r = cls_client::dir_get_id(&image_ioctx, RBD_DIRECTORY, image_name,
+                             &resolved_image_id);
+  if (r < 0) {
+    lderr(cct) << "error reading image id object: "
+               << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  return group_image_remove(group_ioctx, resolved_group_id, image_ioctx,
+                            resolved_image_id);
+}
+
+// Appends one group_image_info_t (image name, pool id, link state) to
+// `*images` for every image linked to the group called `group_name`.
+//
+// Image ids stored in the group are translated to names through the
+// RBD_DIRECTORY of the pool recorded in each link.
+//
+// Returns 0 on success or the negative error code of the first failing step,
+// including a failure to read the group's image list.
+template <typename I>
+int Group<I>::image_list(librados::IoCtx& group_ioctx,
+                         const char *group_name,
+                         std::vector<group_image_info_t>* images)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+  ldout(cct, 20) << "io_ctx=" << &group_ioctx
+                 << " group name " << group_name << dendl;
+
+  std::vector<cls::rbd::GroupImageStatus> image_ids;
+
+  // Propagate listing failures instead of silently reporting an empty group;
+  // group_image_list() has already logged the cause.
+  int r = group_image_list(group_ioctx, group_name, &image_ids);
+  if (r < 0) {
+    return r;
+  }
+
+  for (const auto &image_id : image_ids) {
+    IoCtx ioctx;
+    r = util::create_ioctx(group_ioctx, "image", image_id.spec.pool_id, {},
+                           &ioctx);
+    if (r < 0) {
+      return r;
+    }
+
+    std::string image_name;
+    r = cls_client::dir_get_name(&ioctx, RBD_DIRECTORY,
+                                 image_id.spec.image_id, &image_name);
+    if (r < 0) {
+      return r;
+    }
+
+    images->push_back(
+      group_image_info_t {
+        image_name,
+        ioctx.get_id(),
+        static_cast<group_image_state_t>(image_id.state)});
+  }
+
+  return 0;
+}
+
+// Renames the group `src_name` to `dest_name` in RBD_GROUP_DIRECTORY.
+// A failed id lookup is returned as-is (logged unless it is -ENOENT). From
+// the rename call itself, -ENOENT is swallowed and reported as success; any
+// other error is logged and returned.
+template <typename I>
+int Group<I>::rename(librados::IoCtx& io_ctx, const char *src_name,
+                     const char *dest_name)
+{
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << "group_rename " << &io_ctx << " " << src_name
+                 << " -> " << dest_name << dendl;
+
+  std::string group_id;
+  int r = cls_client::dir_get_id(&io_ctx, RBD_GROUP_DIRECTORY,
+                                 std::string(src_name), &group_id);
+  if (r == -ENOENT) {
+    return r;
+  }
+  if (r < 0) {
+    lderr(cct) << "error getting id of group" << dendl;
+    return r;
+  }
+
+  r = cls_client::group_dir_rename(&io_ctx, RBD_GROUP_DIRECTORY,
+                                   src_name, dest_name, group_id);
+  const bool rename_failed = (r < 0 && r != -ENOENT);
+  if (rename_failed) {
+    lderr(cct) << "error renaming group from directory" << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+
+// Reports which group the image `ictx` belongs to. The image is refreshed
+// first if required. When the image's group spec carries
+// RBD_GROUP_INVALID_POOL, `group_info` is filled with that pool id and an
+// empty name; otherwise the group name is looked up in the RBD_GROUP_DIRECTORY
+// of the group's pool. Returns 0 or a negative error code.
+template <typename I>
+int Group<I>::image_get_group(I *ictx, group_info_t *group_info)
+{
+  int r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  if (RBD_GROUP_INVALID_POOL == ictx->group_spec.pool_id) {
+    // Not a member of any group.
+    group_info->pool = RBD_GROUP_INVALID_POOL;
+    group_info->name = "";
+    return 0;
+  }
+
+  IoCtx group_pool_ioctx;
+  r = util::create_ioctx(ictx->md_ctx, "group", ictx->group_spec.pool_id, {},
+                         &group_pool_ioctx);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string resolved_name;
+  r = cls_client::dir_get_name(&group_pool_ioctx, RBD_GROUP_DIRECTORY,
+                               ictx->group_spec.group_id, &resolved_name);
+  if (r < 0) {
+    return r;
+  }
+
+  group_info->pool = group_pool_ioctx.get_id();
+  group_info->name = resolved_name;
+  return 0;
+}
+
+// Creates a group snapshot called `snap_name` for the group `group_name`.
+//
+// Sequence:
+//   1. store an INCOMPLETE snapshot record (with a generated id) in the group
+//      header,
+//   2. open every member image,
+//   3. block lock requests and acquire the exclusive lock on each image that
+//      has one,
+//   4. create an image snapshot in this group snapshot's namespace on every
+//      image,
+//   5. store the record again as COMPLETE, now carrying the image snap ids.
+//
+// Failure handling: a failure in steps 2-3 removes the record again
+// (remove_record); a failure in steps 4-5 additionally removes the image
+// snapshots that were created (remove_image_snaps). Cleanup errors are only
+// logged. Every image that was opened successfully is closed before
+// returning.
+//
+// Returns 0 on success or the negative error code of the failing step.
+template <typename I>
+int Group<I>::snap_create(librados::IoCtx& group_ioctx,
+                          const char *group_name, const char *snap_name)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+
+  // All locals that are still needed at the cleanup labels are declared up
+  // front so that no goto jumps across an initialization.
+  string group_id;
+  cls::rbd::GroupSnapshot group_snap;
+  vector<cls::rbd::ImageSnapshotSpec> image_snaps;
+  std::string ind_snap_name;
+
+  std::vector<librbd::ImageCtx*> ictxs;
+  std::vector<C_SaferCond*> on_finishes;
+
+  int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+                                 group_name, &group_id);
+  if (r < 0) {
+    lderr(cct) << "error reading group id object: "
+               << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  std::vector<cls::rbd::GroupImageStatus> images;
+  r = group_image_list(group_ioctx, group_name, &images);
+  if (r < 0) {
+    return r;
+  }
+  int image_count = images.size();
+
+  ldout(cct, 20) << "Found " << image_count << " images in group" << dendl;
+
+  image_snaps = vector<cls::rbd::ImageSnapshotSpec>(image_count,
+                                                    cls::rbd::ImageSnapshotSpec());
+
+  for (int i = 0; i < image_count; ++i) {
+    image_snaps[i].pool = images[i].spec.pool_id;
+    image_snaps[i].image_id = images[i].spec.image_id;
+  }
+
+  string group_header_oid = util::group_header_name(group_id);
+
+  group_snap.id = generate_uuid(group_ioctx);
+  group_snap.name = string(snap_name);
+  group_snap.state = cls::rbd::GROUP_SNAPSHOT_STATE_INCOMPLETE;
+  group_snap.snaps = image_snaps;
+
+  cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id,
+                                      group_snap.id};
+
+  r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap);
+  if (r == -EEXIST) {
+    lderr(cct) << "snapshot with this name already exists: "
+               << cpp_strerror(r)
+               << dendl;
+  }
+  int ret_code = 0;
+  if (r < 0) {
+    ret_code = r;
+    goto finish;
+  }
+
+  // Issue all opens asynchronously.
+  for (const auto &image : images) {
+    librbd::IoCtx image_io_ctx;
+    r = util::create_ioctx(group_ioctx, "image", image.spec.pool_id, {},
+                           &image_io_ctx);
+    if (r < 0) {
+      // Do not jump to finish here: opens issued by earlier iterations are
+      // still in flight. Stop issuing, then wait for them below and take the
+      // regular remove_record path so the INCOMPLETE record is cleaned up.
+      ret_code = r;
+      break;
+    }
+
+    ldout(cct, 20) << "Opening image with id " << image.spec.image_id << dendl;
+
+    librbd::ImageCtx* image_ctx = new ImageCtx("", image.spec.image_id.c_str(),
+                                               nullptr, image_io_ctx, false);
+
+    C_SaferCond* on_finish = new C_SaferCond;
+
+    image_ctx->state->open(0, on_finish);
+
+    ictxs.push_back(image_ctx);
+    on_finishes.push_back(on_finish);
+  }
+  ldout(cct, 20) << "Issued open request waiting for the completion" << dendl;
+  // Wait for every open that was actually issued (possibly fewer than
+  // image_count if create_ioctx failed above).
+  for (int i = 0, n = ictxs.size(); i < n; ++i) {
+
+    ldout(cct, 20) << "Waiting for completion on on_finish: " <<
+                      on_finishes[i] << dendl;
+
+    r = on_finishes[i]->wait();
+    delete on_finishes[i];
+    if (r < 0) {
+      delete ictxs[i];
+      ictxs[i] = nullptr;
+      ret_code = r;
+    }
+  }
+  if (ret_code != 0) {
+    goto remove_record;
+  }
+  // From here on ictxs holds exactly image_count open images.
+  ldout(cct, 20) << "Requesting exclusive locks for images" << dendl;
+
+  for (auto ictx: ictxs) {
+    RWLock::RLocker owner_lock(ictx->owner_lock);
+    if (ictx->exclusive_lock != nullptr) {
+      ictx->exclusive_lock->block_requests(-EBUSY);
+    }
+  }
+  for (int i = 0; i < image_count; ++i) {
+    ImageCtx *ictx = ictxs[i];
+    RWLock::RLocker owner_lock(ictx->owner_lock);
+
+    on_finishes[i] = new C_SaferCond;
+    if (ictx->exclusive_lock != nullptr) {
+      ictx->exclusive_lock->acquire_lock(on_finishes[i]);
+    }
+  }
+
+  ret_code = 0;
+  for (int i = 0; i < image_count; ++i) {
+    r = 0;
+    ImageCtx *ictx = ictxs[i];
+    // Only wait when a lock request was issued for this image; otherwise the
+    // completion was never handed out and is simply freed.
+    if (ictx->exclusive_lock != nullptr) {
+      r = on_finishes[i]->wait();
+    }
+    delete on_finishes[i];
+    if (r < 0) {
+      ret_code = r;
+    }
+  }
+  if (ret_code != 0) {
+    goto remove_record;
+  }
+
+  ind_snap_name = calc_ind_image_snap_name(group_ioctx.get_id(), group_id,
+                                           group_snap.id);
+
+  for (int i = 0; i < image_count; ++i) {
+    ImageCtx *ictx = ictxs[i];
+
+    C_SaferCond* on_finish = new C_SaferCond;
+
+    ictx->operations->snap_create(ne, ind_snap_name.c_str(), on_finish);
+
+    on_finishes[i] = on_finish;
+  }
+
+  ret_code = 0;
+  for (int i = 0; i < image_count; ++i) {
+    r = on_finishes[i]->wait();
+    delete on_finishes[i];
+    if (r < 0) {
+      ret_code = r;
+    } else {
+      // Record the id the new snapshot received on this image.
+      ImageCtx *ictx = ictxs[i];
+      ictx->snap_lock.get_read();
+      snap_t snap_id = get_group_snap_id(ictx, ne);
+      ictx->snap_lock.put_read();
+      if (snap_id == CEPH_NOSNAP) {
+        ldout(cct, 20) << "Couldn't find created snapshot with namespace: "
+                       << ne << dendl;
+        ret_code = -ENOENT;
+      } else {
+        image_snaps[i].snap_id = snapid_t(snap_id);
+        image_snaps[i].pool = ictx->md_ctx.get_id();
+        image_snaps[i].image_id = ictx->id;
+      }
+    }
+  }
+  if (ret_code != 0) {
+    goto remove_image_snaps;
+  }
+
+  group_snap.snaps = image_snaps;
+  group_snap.state = cls::rbd::GROUP_SNAPSHOT_STATE_COMPLETE;
+
+  r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap);
+  if (r < 0) {
+    ret_code = r;
+    goto remove_image_snaps;
+  }
+
+  goto finish;
+
+remove_image_snaps:
+
+  for (int i = 0; i < image_count; ++i) {
+    ImageCtx *ictx = ictxs[i];
+    ldout(cct, 20) << "Removing individual snapshot with name: " <<
+                      ind_snap_name << dendl;
+
+    on_finishes[i] = new C_SaferCond;
+    std::string snap_name;
+    ictx->snap_lock.get_read();
+    snap_t snap_id = get_group_snap_id(ictx, ne);
+    r = ictx->get_snap_name(snap_id, &snap_name);
+    ictx->snap_lock.put_read();
+    if (r >= 0) {
+      ictx->operations->snap_remove(ne, snap_name.c_str(), on_finishes[i]);
+    } else {
+      // Ignore missing image snapshots. The whole snapshot could have been
+      // inconsistent.
+      on_finishes[i]->complete(0);
+    }
+  }
+
+  for (int i = 0, n = on_finishes.size(); i < n; ++i) {
+    r = on_finishes[i]->wait();
+    delete on_finishes[i];
+    if (r < 0 && r != -ENOENT) { // if previous attempts to remove this snapshot failed then the image's snapshot may not exist
+      lderr(cct) << "Failed cleaning up image snapshot. Ret code: " << r << dendl;
+      // just report error, but don't abort the process
+    }
+  }
+
+remove_record:
+  r = cls_client::group_snap_remove(&group_ioctx, group_header_oid,
+                                    group_snap.id);
+  if (r < 0) {
+    lderr(cct) << "error while cleaning up group snapshot" << dendl;
+    // we ignore return value in clean up
+  }
+
+finish:
+  for (int i = 0, n = ictxs.size(); i < n; ++i) {
+    if (ictxs[i] != nullptr) {
+      ictxs[i]->state->close();
+    }
+  }
+  return ret_code;
+}
+
+// Removes the snapshot called `snap_name` from the group `group_name`.
+// The snapshot record is located by name in the group's snapshot list
+// (-ENOENT if there is none) and the removal itself is delegated to
+// group_snap_remove_by_record(). Returns 0 or a negative error code.
+template <typename I>
+int Group<I>::snap_remove(librados::IoCtx& group_ioctx, const char *group_name,
+                          const char *snap_name)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+
+  string group_id;
+  int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+                                 group_name, &group_id);
+  if (r < 0) {
+    lderr(cct) << "error reading group id object: "
+               << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  std::vector<cls::rbd::GroupSnapshot> snaps;
+  r = group_snap_list(group_ioctx, group_name, &snaps);
+  if (r < 0) {
+    return r;
+  }
+
+  // First record carrying the requested name wins.
+  const string wanted_name(snap_name);
+  auto match = snaps.begin();
+  while (match != snaps.end() && !(match->name == wanted_name)) {
+    ++match;
+  }
+  if (match == snaps.end()) {
+    return -ENOENT;
+  }
+
+  string group_header_oid = util::group_header_name(group_id);
+  return group_snap_remove_by_record(group_ioctx, *match, group_id,
+                                     group_header_oid);
+}
+
+// Renames the group snapshot `old_snap_name` to `new_snap_name`.
+// Identical names are rejected with -EEXIST. The record is looked up by its
+// old name (-ENOENT if no record with a non-empty id matches), its name is
+// replaced and the record is written back to the group header.
+// Returns 0 or a negative error code.
+template <typename I>
+int Group<I>::snap_rename(librados::IoCtx& group_ioctx, const char *group_name,
+                          const char *old_snap_name,
+                          const char *new_snap_name) {
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+  if (strcmp(old_snap_name, new_snap_name) == 0) {
+    return -EEXIST;
+  }
+
+  std::string group_id;
+  int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+                                 group_name, &group_id);
+  if (r < 0) {
+    // A missing group is reported to the caller without logging.
+    if (r != -ENOENT) {
+      lderr(cct) << "error reading group id object: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+
+  std::vector<cls::rbd::GroupSnapshot> group_snaps;
+  r = group_snap_list(group_ioctx, group_name, &group_snaps);
+  if (r < 0) {
+    return r;
+  }
+
+  // Copy of the first record carrying the old name; stays default-constructed
+  // when nothing matches.
+  cls::rbd::GroupSnapshot renamed_snap;
+  for (auto it = group_snaps.begin(); it != group_snaps.end(); ++it) {
+    if (it->name == old_snap_name) {
+      renamed_snap = *it;
+      break;
+    }
+  }
+  if (renamed_snap.id.empty()) {
+    return -ENOENT;
+  }
+
+  std::string group_header_oid = util::group_header_name(group_id);
+  renamed_snap.name = new_snap_name;
+  r = cls_client::group_snap_set(&group_ioctx, group_header_oid, renamed_snap);
+  return (r < 0) ? r : 0;
+}
+
+// Appends one group_snap_info_t (name and state) to `*snaps` for every
+// snapshot record of the group `group_name`. Returns 0, or the error from
+// group_snap_list() with `*snaps` left untouched.
+template <typename I>
+int Group<I>::snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+                        std::vector<group_snap_info_t> *snaps)
+{
+  std::vector<cls::rbd::GroupSnapshot> records;
+
+  const int r = group_snap_list(group_ioctx, group_name, &records);
+  if (r < 0) {
+    return r;
+  }
+
+  for (const auto &record : records) {
+    snaps->push_back(
+      group_snap_info_t {
+        record.name,
+        static_cast<group_snap_state_t>(record.state)});
+  }
+  return 0;
+}
+
+// Rolls the group `group_name` back to its snapshot called `snap_name`.
+// The snapshot record is located by name in the group's snapshot list
+// (-ENOENT if there is none) and the rollback is delegated to
+// group_snap_rollback_by_record(), which receives `pctx`.
+// Returns 0 or a negative error code.
+template <typename I>
+int Group<I>::snap_rollback(librados::IoCtx& group_ioctx,
+                            const char *group_name, const char *snap_name,
+                            ProgressContext& pctx)
+{
+  CephContext *cct = (CephContext *)group_ioctx.cct();
+
+  string group_id;
+  int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+                                 group_name, &group_id);
+  if (r < 0) {
+    lderr(cct) << "error reading group id object: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  std::vector<cls::rbd::GroupSnapshot> snaps;
+  r = group_snap_list(group_ioctx, group_name, &snaps);
+  if (r < 0) {
+    return r;
+  }
+
+  // First record carrying the requested name wins.
+  const string wanted_name(snap_name);
+  auto match = snaps.begin();
+  while (match != snaps.end() && !(match->name == wanted_name)) {
+    ++match;
+  }
+  if (match == snaps.end()) {
+    return -ENOENT;
+  }
+
+  string group_header_oid = util::group_header_name(group_id);
+  return group_snap_rollback_by_record(group_ioctx, *match, group_id,
+                                       group_header_oid, pctx);
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Group<librbd::ImageCtx>;
diff --git a/src/librbd/api/Group.h b/src/librbd/api/Group.h
new file mode 100644
index 00000000..59b978a8
--- /dev/null
+++ b/src/librbd/api/Group.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_GROUP_H
+#define CEPH_LIBRBD_API_GROUP_H
+
+#include "include/rbd/librbd.hpp"
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+#include <vector>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+// Collection of static entry points for RBD group operations.  Every
+// function returns an int status.  The template is explicitly instantiated
+// for librbd::ImageCtx (see the extern template declaration that follows
+// this struct).
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Group {
+
+ // operations on the group itself, addressed by group name
+ static int create(librados::IoCtx& io_ctx, const char *group_name);
+ static int remove(librados::IoCtx& io_ctx, const char *group_name);
+ static int list(librados::IoCtx& io_ctx, std::vector<std::string> *names);
+ static int rename(librados::IoCtx& io_ctx, const char *src_group_name,
+ const char *dest_group_name);
+
+ // image membership; the group and the image each come with their own IoCtx
+ static int image_add(librados::IoCtx& group_ioctx, const char *group_name,
+ librados::IoCtx& image_ioctx, const char *image_name);
+ static int image_remove(librados::IoCtx& group_ioctx, const char *group_name,
+ librados::IoCtx& image_ioctx, const char *image_name);
+ static int image_remove_by_id(librados::IoCtx& group_ioctx,
+ const char *group_name,
+ librados::IoCtx& image_ioctx,
+ const char *image_id);
+ static int image_list(librados::IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_image_info_t> *images);
+
+ // presumably reports the group an open image belongs to via *group_info
+ // -- TODO confirm against the implementation
+ static int image_get_group(ImageCtxT *ictx, group_info_t *group_info);
+
+ // group snapshot operations, addressed by group name and snapshot name
+ static int snap_create(librados::IoCtx& group_ioctx,
+ const char *group_name, const char *snap_name);
+ static int snap_remove(librados::IoCtx& group_ioctx,
+ const char *group_name, const char *snap_name);
+ static int snap_rename(librados::IoCtx& group_ioctx, const char *group_name,
+ const char *old_snap_name, const char *new_snap_name);
+ // results are delivered through *snaps
+ static int snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_snap_info_t> *snaps);
+ // pctx is the progress context handed on to the rollback
+ static int snap_rollback(librados::IoCtx& group_ioctx,
+ const char *group_name, const char *snap_name,
+ ProgressContext& pctx);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Group<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_GROUP_H
diff --git a/src/librbd/api/Image.cc b/src/librbd/api/Image.cc
new file mode 100644
index 00000000..aa427535
--- /dev/null
+++ b/src/librbd/api/Image.cc
@@ -0,0 +1,836 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Image.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/DeepCopyRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Trash.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/image/PreRemoveRequest.h"
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Image: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+// Strict-weak ordering used to batch image specs: orders by pool id first
+// and by pool namespace second; image name/id are not considered.
+bool compare_by_pool(const librbd::linked_image_spec_t& lhs,
+                     const librbd::linked_image_spec_t& rhs)
+{
+  if (lhs.pool_id != rhs.pool_id) {
+    return lhs.pool_id < rhs.pool_id;
+  }
+  // same pool: the namespace decides (equal namespaces compare as "not less")
+  return lhs.pool_namespace < rhs.pool_namespace;
+}
+
+// Full ordering of image specs: pool name, then pool id, then pool
+// namespace, then image name, and finally image id.
+bool compare(const librbd::linked_image_spec_t& lhs,
+             const librbd::linked_image_spec_t& rhs)
+{
+  if (lhs.pool_name != rhs.pool_name) {
+    return lhs.pool_name < rhs.pool_name;
+  }
+  if (lhs.pool_id != rhs.pool_id) {
+    return lhs.pool_id < rhs.pool_id;
+  }
+  if (lhs.pool_namespace != rhs.pool_namespace) {
+    return lhs.pool_namespace < rhs.pool_namespace;
+  }
+  if (lhs.image_name != rhs.image_name) {
+    return lhs.image_name < rhs.image_name;
+  }
+  // last tie-breaker; equal ids mean the specs are equivalent (not less)
+  return lhs.image_id < rhs.image_id;
+}
+
+// Opens the image identified by `image_id` (skipping its parent), runs a
+// PreRemoveRequest against it and closes it again.
+// Returns the open() error when the image cannot be opened, otherwise the
+// result of the pre-remove request; the close() result is ignored.
+template <typename I>
+int pre_remove_image(librados::IoCtx& io_ctx, const std::string& image_id) {
+  I *ictx = I::create("", image_id, nullptr, io_ctx, false);
+  const int open_r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+  if (open_r < 0) {
+    return open_r;
+  }
+
+  C_SaferCond pre_remove_ctx;
+  auto pre_remove_req =
+    image::PreRemoveRequest<I>::create(ictx, false, &pre_remove_ctx);
+  pre_remove_req->send();
+
+  const int result = pre_remove_ctx.wait();
+  ictx->state->close();
+  return result;
+}
+
+} // anonymous namespace
+
+// Returns the id of the image's data pool.
+// When the image's data IoCtx is valid its id is returned directly;
+// otherwise the id is fetched from the image header via
+// cls_client::get_data_pool().  A negative value is an error code.
+template <typename I>
+int64_t Image<I>::get_data_pool_id(I *ictx) {
+  if (!ictx->data_ctx.is_valid()) {
+    int64_t pool_id;
+    int r = cls_client::get_data_pool(&ictx->md_ctx, ictx->header_oid,
+                                      &pool_id);
+    if (r >= 0) {
+      return pool_id;
+    }
+
+    CephContext *cct = ictx->cct;
+    lderr(cct) << "error getting data pool ID: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return ictx->data_ctx.get_id();
+}
+
+// Copies the image's op_features bitmask into `*op_features` after
+// refreshing the image if required.  The value is read under the snap_lock
+// read lock.  Returns 0, or the refresh error.
+template <typename I>
+int Image<I>::get_op_features(I *ictx, uint64_t *op_features) {
+  CephContext *cct = ictx->cct;
+  ldout(cct, 20) << "image_ctx=" << ictx << dendl;
+
+  const int refresh_r = ictx->state->refresh_if_required();
+  if (refresh_r >= 0) {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    *op_features = ictx->op_features;
+    return 0;
+  }
+  return refresh_r;
+}
+
+// Fills `*images` (cleared first) with the images visible through `io_ctx`:
+//   1. V1 images decoded from the RBD_DIRECTORY object's data (only when the
+//      IoCtx namespace is empty); these get an empty id,
+//   2. V2 images returned by list_images_v2(),
+//   3. trash entries whose source is RBD_TRASH_IMAGE_SOURCE_REMOVING.
+// Returns 0 on success (including when the directory object does not exist
+// in the empty namespace), otherwise a negative error code.
+template <typename I>
+int Image<I>::list_images(librados::IoCtx& io_ctx,
+                          std::vector<image_spec_t> *images) {
+  CephContext *cct = (CephContext *)io_ctx.cct();
+  ldout(cct, 20) << "list " << &io_ctx << dendl;
+
+  images->clear();
+
+  if (io_ctx.get_namespace().empty()) {
+    bufferlist dir_bl;
+    int read_r = io_ctx.read(RBD_DIRECTORY, dir_bl, 0, 0);
+    if (read_r == -ENOENT) {
+      return 0;
+    }
+    if (read_r < 0) {
+      lderr(cct) << "error listing v1 images: " << cpp_strerror(read_r)
+                 << dendl;
+      return read_r;
+    }
+
+    // V1 format images are in a tmap
+    if (dir_bl.length()) {
+      auto iter = dir_bl.cbegin();
+      bufferlist header;
+      std::map<std::string, bufferlist> v1_entries;
+      decode(header, iter);
+      decode(v1_entries, iter);
+      for (auto& entry : v1_entries) {
+        images->push_back({.id ="", .name = entry.first});
+      }
+    }
+  }
+
+  // V2 format images
+  std::map<std::string, std::string> v2_names_to_ids;
+  int r = list_images_v2(io_ctx, &v2_names_to_ids);
+  if (r < 0) {
+    lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  for (const auto& name_and_id : v2_names_to_ids) {
+    images->push_back({.id = name_and_id.second,
+                       .name = name_and_id.first});
+  }
+
+  // include V2 images in a partially removed state; a missing trash
+  // implementation (-EOPNOTSUPP) is tolerated
+  std::vector<librbd::trash_image_info_t> trash_entries;
+  r = Trash<I>::list(io_ctx, trash_entries, false);
+  if (r < 0 && r != -EOPNOTSUPP) {
+    lderr(cct) << "error listing trash images: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  for (const auto& trash_entry : trash_entries) {
+    if (trash_entry.source != RBD_TRASH_IMAGE_SOURCE_REMOVING) {
+      continue;
+    }
+    images->push_back({.id = trash_entry.id,
+                       .name = trash_entry.name});
+  }
+
+  return 0;
+}
+
+// Reads the V2 image directory in pages of up to 1024 entries and merges
+// every name -> id pair into `*images` (existing keys are left untouched).
+// Paging resumes after the last name of the previous page and stops at the
+// first page that is not full, or when dir_list() reports -ENOENT.
+// Returns 0 on success, otherwise the dir_list() error.
+template <typename I>
+int Image<I>::list_images_v2(librados::IoCtx& io_ctx, ImageNameToIds *images) {
+  CephContext *cct = (CephContext *)io_ctx.cct();
+  ldout(cct, 20) << "io_ctx=" << &io_ctx << dendl;
+
+  // new format images are accessed by class methods
+  const int max_read = 1024;
+  string last_read = "";
+  bool page_was_full = true;
+  while (page_was_full) {
+    map<string, string> page;
+    int r = cls_client::dir_list(&io_ctx, RBD_DIRECTORY, last_read, max_read,
+                                 &page);
+    if (r == -ENOENT) {
+      break;
+    }
+    if (r < 0) {
+      lderr(cct) << "error listing image in directory: "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    for (const auto& name_and_id : page) {
+      images->insert(name_and_id);
+    }
+    if (!page.empty()) {
+      last_read = page.rbegin()->first;
+    }
+
+    const int page_size = page.size();
+    page_was_full = (page_size == max_read);
+  }
+
+  return 0;
+}
+
+// Reports the parent image and parent snapshot of `ictx`.
+//
+// On success `*parent_image` receives pool id/name/namespace, image id/name
+// and the trash flag of the parent, and `*parent_snap` receives the parent
+// snapshot id, namespace type and (when the parent is opened at a snapshot)
+// its name.  While a migration is in progress the parent of the migration
+// source image is reported instead.
+//
+// Returns 0 on success, -ENOENT when there is no parent or the parent
+// snapshot cannot be found, or another negative error code from the refresh
+// or the trash lookup.
+template <typename I>
+int Image<I>::get_parent(I *ictx,
+                         librbd::linked_image_spec_t *parent_image,
+                         librbd::snap_spec_t *parent_snap) {
+  auto cct = ictx->cct;
+  ldout(cct, 20) << "image_ctx=" << ictx << dendl;
+
+  int r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  RWLock::RLocker snap_locker(ictx->snap_lock);
+  RWLock::RLocker parent_locker(ictx->parent_lock);
+
+  // declared after the two lockers above, so on scope exit the manually
+  // taken parent locks are dropped before the lockers release theirs
+  bool release_parent_locks = false;
+  BOOST_SCOPE_EXIT_ALL(ictx, &release_parent_locks) {
+    if (release_parent_locks) {
+      ictx->parent->parent_lock.put_read();
+      ictx->parent->snap_lock.put_read();
+    }
+  };
+
+  // if a migration is in-progress, the true parent is the parent
+  // of the migration source image
+  auto parent = ictx->parent;
+  if (!ictx->migration_info.empty() && ictx->parent != nullptr) {
+    release_parent_locks = true;
+    ictx->parent->snap_lock.get_read();
+    ictx->parent->parent_lock.get_read();
+
+    parent = ictx->parent->parent;
+  }
+
+  if (parent == nullptr) {
+    return -ENOENT;
+  }
+
+  parent_image->pool_id = parent->md_ctx.get_id();
+  parent_image->pool_name = parent->md_ctx.get_pool_name();
+  parent_image->pool_namespace = parent->md_ctx.get_namespace();
+
+  RWLock::RLocker parent_snap_locker(parent->snap_lock);
+  parent_snap->id = parent->snap_id;
+  parent_snap->namespace_type = RBD_SNAP_NAMESPACE_TYPE_USER;
+  if (parent->snap_id != CEPH_NOSNAP) {
+    auto snap_info = parent->get_snap_info(parent->snap_id);
+    if (snap_info == nullptr) {
+      // r still holds 0 from the successful refresh; log the error that is
+      // actually returned instead of "success"
+      r = -ENOENT;
+      lderr(cct) << "error finding parent snap name: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    parent_snap->namespace_type = static_cast<snap_namespace_type_t>(
+      cls::rbd::get_snap_namespace_type(snap_info->snap_namespace));
+    parent_snap->name = snap_info->name;
+  }
+
+  parent_image->image_id = parent->id;
+  parent_image->image_name = parent->name;
+  // assume "in trash" until the lookup below proves otherwise
+  parent_image->trash = true;
+
+  librbd::trash_image_info_t trash_info;
+  r = Trash<I>::get(parent->md_ctx, parent->id, &trash_info);
+  if (r == -ENOENT || r == -EOPNOTSUPP) {
+    parent_image->trash = false;
+  } else if (r < 0) {
+    lderr(cct) << "error looking up trash status: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Lists the direct children of `ictx`: clears `*images` and delegates to
+// list_descendants() with a maximum depth of one level.
+template <typename I>
+int Image<I>::list_children(I *ictx,
+                            std::vector<librbd::linked_image_spec_t> *images) {
+  const std::optional<size_t> one_level{1};
+
+  images->clear();
+  return list_descendants(ictx, one_level, images);
+}
+
+// Lists the direct children attached to the parent snapshot described by
+// `parent_spec`: clears `*images` and delegates to list_descendants() with a
+// maximum depth of one level.
+template <typename I>
+int Image<I>::list_children(I *ictx,
+                            const cls::rbd::ParentImageSpec &parent_spec,
+                            std::vector<librbd::linked_image_spec_t> *images) {
+  const std::optional<size_t> one_level{1};
+
+  images->clear();
+  return list_descendants(ictx, parent_spec, one_level, images);
+}
+
+// Opens the image `image_id` from `io_ctx` (without its parent), collects
+// its descendants into `*images` and closes the image again.
+//
+// An image that no longer exists (-ENOENT on open) is treated as having no
+// descendants and yields 0.  Otherwise returns the open() error, or the
+// result of the nested list_descendants() call; a close() failure is only
+// logged.
+template <typename I>
+int Image<I>::list_descendants(
+    librados::IoCtx& io_ctx, const std::string &image_id,
+    const std::optional<size_t> &max_level,
+    std::vector<librbd::linked_image_spec_t> *images) {
+  ImageCtx *ictx = new librbd::ImageCtx("", image_id, nullptr,
+                                        io_ctx, true);
+  // Grab the CephContext before open()/close(): this function must not touch
+  // ictx for logging once either of them has failed.
+  // NOTE(review): callers in this file never delete an ImageCtx after a
+  // failed open() or after close(), which suggests ImageState releases it --
+  // confirm against ImageState.
+  CephContext *cct = ictx->cct;
+
+  int r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+  if (r < 0) {
+    if (r == -ENOENT) {
+      return 0;
+    }
+    lderr(cct) << "failed to open descendant " << image_id
+               << " from pool " << io_ctx.get_pool_name() << ":"
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = list_descendants(ictx, max_level, images);
+
+  int r1 = ictx->state->close();
+  if (r1 < 0) {
+    // report the close() result itself, not the listing result
+    lderr(cct) << "error when closing descendant " << image_id
+               << " from pool " << io_ctx.get_pool_name() << ":"
+               << cpp_strerror(r1) << dendl;
+  }
+
+  return r;
+}
+
+// Collects the descendants of an open image into `*images`.
+//
+// When the image is opened at a snapshot only that snapshot is considered;
+// otherwise every snapshot id in ictx->snaps is.  For each snapshot a
+// ParentImageSpec is built and the spec-based overload is invoked; the first
+// failure aborts the walk.  snap_lock is held (read) for the whole call.
+template <typename I>
+int Image<I>::list_descendants(
+    I *ictx, const std::optional<size_t> &max_level,
+    std::vector<librbd::linked_image_spec_t> *images) {
+  RWLock::RLocker l(ictx->snap_lock);
+
+  std::vector<librados::snap_t> candidate_snaps;
+  if (ictx->snap_id == CEPH_NOSNAP) {
+    candidate_snaps = ictx->snaps;
+  } else {
+    candidate_snaps.push_back(ictx->snap_id);
+  }
+
+  for (auto candidate : candidate_snaps) {
+    cls::rbd::ParentImageSpec spec{ictx->md_ctx.get_id(),
+                                   ictx->md_ctx.get_namespace(),
+                                   ictx->id, candidate};
+    const int r = list_descendants(ictx, spec, max_level, images);
+    if (r < 0) {
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Collects the images cloned from the snapshot described by `parent_spec`
+// into `*images`, recursing into each child through the IoCtx-based
+// overload.
+//
+// `max_level`, when it holds a value, limits the depth: a value of 0 returns
+// immediately, otherwise the children are visited with the value reduced by
+// one.  Without a value the depth is not limited here.
+//
+// After collection, pool name, image name and trash flag are filled in for
+// every entry of `*images`, and the vector is sorted with compare().
+// Returns 0 on success or a negative error code.
+//
+// NOTE(review): the name lookup and both sorts operate on the whole
+// `*images` vector, including entries appended by recursive calls or already
+// present on entry.
+template <typename I>
+int Image<I>::list_descendants(
+ I *ictx, const cls::rbd::ParentImageSpec &parent_spec,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images) {
+ // local copy so the remaining depth can be decremented for the children
+ auto child_max_level = max_level;
+ if (child_max_level) {
+ if (child_max_level == 0) {
+ return 0;
+ }
+ (*child_max_level)--;
+ }
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ // no children for non-layered or old format image
+ if (!ictx->test_features(RBD_FEATURE_LAYERING, ictx->snap_lock)) {
+ return 0;
+ }
+
+ librados::Rados rados(ictx->md_ctx);
+
+ // search all pools for clone v1 children dependent on this snapshot
+ std::list<std::pair<int64_t, std::string> > pools;
+ int r = rados.pool_list2(pools);
+ if (r < 0) {
+ lderr(cct) << "error listing pools: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // each entry is (pool id, pool name)
+ for (auto& it : pools) {
+ int64_t base_tier;
+ r = rados.pool_get_base_tier(it.first, &base_tier);
+ if (r == -ENOENT) {
+ ldout(cct, 1) << "pool " << it.second << " no longer exists" << dendl;
+ continue;
+ } else if (r < 0) {
+ lderr(cct) << "error retrieving base tier for pool " << it.second
+ << dendl;
+ return r;
+ }
+ if (it.first != base_tier) {
+ // pool is a cache; skip it
+ continue;
+ }
+
+ IoCtx ioctx;
+ r = util::create_ioctx(ictx->md_ctx, "child image", it.first, {}, &ioctx);
+ if (r == -ENOENT) {
+ continue;
+ } else if (r < 0) {
+ return r;
+ }
+
+ std::set<std::string> image_ids;
+ r = cls_client::get_children(&ioctx, RBD_CHILDREN, parent_spec,
+ image_ids);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error reading list of children from pool " << it.second
+ << dendl;
+ return r;
+ }
+
+ // record each child with empty names (resolved further below), then
+ // descend into it
+ for (auto& image_id : image_ids) {
+ images->push_back({
+ it.first, "", ictx->md_ctx.get_namespace(), image_id, "", false});
+ r = list_descendants(ioctx, image_id, child_max_level, images);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ // retrieve clone v2 children attached to this snapshot
+ IoCtx parent_io_ctx;
+ r = util::create_ioctx(ictx->md_ctx, "parent image", parent_spec.pool_id,
+ parent_spec.pool_namespace, &parent_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::ChildImageSpecs child_images;
+ r = cls_client::children_list(&parent_io_ctx,
+ util::header_name(parent_spec.image_id),
+ parent_spec.snap_id, &child_images);
+ // -ENOENT and -EOPNOTSUPP are tolerated: child_images is simply used as is
+ if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) {
+ lderr(cct) << "error retrieving children: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (auto& child_image : child_images) {
+ images->push_back({
+ child_image.pool_id, "", child_image.pool_namespace,
+ child_image.image_id, "", false});
+ // only descend when there is depth left (or no limit at all)
+ if (!child_max_level || *child_max_level > 0) {
+ IoCtx ioctx;
+ r = util::create_ioctx(ictx->md_ctx, "child image", child_image.pool_id,
+ child_image.pool_namespace, &ioctx);
+ if (r == -ENOENT) {
+ continue;
+ } else if (r < 0) {
+ return r;
+ }
+ r = list_descendants(ioctx, child_image.image_id, child_max_level,
+ images);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ // batch lookups by pool + namespace
+ std::sort(images->begin(), images->end(), compare_by_pool);
+
+ // child_io_ctx and the id -> (name, trash flag) map are rebuilt only when
+ // the pool id or namespace changes between consecutive (sorted) entries
+ int64_t child_pool_id = -1;
+ librados::IoCtx child_io_ctx;
+ std::map<std::string, std::pair<std::string, bool>> child_image_id_to_info;
+ for (auto& image : *images) {
+ if (child_pool_id == -1 || child_pool_id != image.pool_id ||
+ child_io_ctx.get_namespace() != image.pool_namespace) {
+ r = util::create_ioctx(ictx->md_ctx, "child image", image.pool_id,
+ image.pool_namespace, &child_io_ctx);
+ if (r == -ENOENT) {
+ // IoCtx could not be created: leave the names empty and move on
+ image.pool_name = "";
+ image.image_name = "";
+ continue;
+ } else if (r < 0) {
+ return r;
+ }
+ child_pool_id = image.pool_id;
+
+ child_image_id_to_info.clear();
+
+ std::map<std::string, std::string> image_names_to_ids;
+ r = list_images_v2(child_io_ctx, &image_names_to_ids);
+ if (r < 0) {
+ lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // images listed in the directory are not in the trash
+ for (auto& [name, id] : image_names_to_ids) {
+ child_image_id_to_info.insert({id, {name, false}});
+ }
+
+ std::vector<librbd::trash_image_info_t> trash_images;
+ r = Trash<I>::list(child_io_ctx, trash_images, false);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ lderr(cct) << "error listing trash images: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // trash entries are flagged as trash unless their source is REMOVING;
+ // insert() keeps an existing entry for the same id
+ for (auto& it : trash_images) {
+ child_image_id_to_info.insert({
+ it.id,
+ {it.name,
+ it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING ? false : true}});
+ }
+ }
+
+ auto it = child_image_id_to_info.find(image.image_id);
+ if (it == child_image_id_to_info.end()) {
+ lderr(cct) << "error looking up name for image id "
+ << image.image_id << " in pool "
+ << child_io_ctx.get_pool_name()
+ << (image.pool_namespace.empty() ?
+ "" : "/" + image.pool_namespace) << dendl;
+ return -ENOENT;
+ }
+
+ image.pool_name = child_io_ctx.get_pool_name();
+ image.image_name = it->second.first;
+ image.trash = it->second.second;
+ }
+
+ // final sort by pool + image names
+ std::sort(images->begin(), images->end(), compare);
+ return 0;
+}
+
+// Deep-copies the open image `src` into a new image `destname` created
+// through `dest_md_ctx`.
+//
+// Steps: validate the source and fill `opts` with defaults taken from the
+// source, create the destination (as a plain image, or as a clone of the
+// source's parent when the copy is not flattened and a parent exists), open
+// it, acquire its exclusive lock if it has one, run the image-to-image
+// deep_copy() overload and close the destination again.
+//
+// Returns 0 on success or a negative error code (-EBUSY for a migrating
+// source, -EINVAL for destination format 1, -ENOSYS for unsupported
+// features, or whatever a later step reports).  `opts` is modified.
+template <typename I>
+int Image<I>::deep_copy(I *src, librados::IoCtx& dest_md_ctx,
+ const char *destname, ImageOptions& opts,
+ ProgressContext &prog_ctx) {
+ CephContext *cct = (CephContext *)dest_md_ctx.cct();
+ ldout(cct, 20) << src->name
+ << (src->snap_name.length() ? "@" + src->snap_name : "")
+ << " -> " << destname << " opts = " << opts << dendl;
+
+ uint64_t features;
+ uint64_t src_size;
+ {
+ // snapshot the source's features and size under its snap_lock
+ RWLock::RLocker snap_locker(src->snap_lock);
+
+ if (!src->migration_info.empty()) {
+ lderr(cct) << "cannot deep copy migrating image" << dendl;
+ return -EBUSY;
+ }
+
+ features = src->features;
+ src_size = src->get_image_size(src->snap_id);
+ }
+ // For each option below: when opts.get() reports non-zero, the default
+ // (format 2, or the value taken from the source) is written into opts.
+ uint64_t format = 2;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+ }
+ if (format == 1) {
+ lderr(cct) << "old format not supported for destination image" << dendl;
+ return -EINVAL;
+ }
+ uint64_t stripe_unit = src->stripe_unit;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ }
+ uint64_t stripe_count = src->stripe_count;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ }
+ uint64_t order = src->order;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+ opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ }
+ if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+ }
+ if (features & ~RBD_FEATURES_ALL) {
+ lderr(cct) << "librbd does not support requested features" << dendl;
+ return -ENOSYS;
+ }
+
+ // the flatten option is consumed here and removed from opts
+ uint64_t flatten = 0;
+ if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
+ opts.unset(RBD_IMAGE_OPTION_FLATTEN);
+ }
+
+ // pool_id == -1 below selects plain creation instead of a clone
+ cls::rbd::ParentImageSpec parent_spec;
+ if (flatten > 0) {
+ parent_spec.pool_id = -1;
+ } else {
+ RWLock::RLocker snap_locker(src->snap_lock);
+ RWLock::RLocker parent_locker(src->parent_lock);
+
+ // use oldest snapshot or HEAD for parent spec
+ if (!src->snap_info.empty()) {
+ parent_spec = src->snap_info.begin()->second.parent.spec;
+ } else {
+ parent_spec = src->parent_md.spec;
+ }
+ }
+
+ int r;
+ if (parent_spec.pool_id == -1) {
+ r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false);
+ } else {
+ librados::IoCtx parent_io_ctx;
+ r = util::create_ioctx(src->md_ctx, "parent image", parent_spec.pool_id,
+ parent_spec.pool_namespace, &parent_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ ConfigProxy config{cct->_conf};
+ api::Config<I>::apply_pool_overrides(dest_md_ctx, &config);
+
+ // create the destination as a clone of the source's parent and wait for
+ // the request to finish
+ C_SaferCond ctx;
+ std::string dest_id = util::generate_image_id(dest_md_ctx);
+ auto *req = image::CloneRequest<I>::create(
+ config, parent_io_ctx, parent_spec.image_id, "", parent_spec.snap_id,
+ dest_md_ctx, destname, dest_id, opts, "", "", src->op_work_queue, &ctx);
+ req->send();
+ r = ctx.wait();
+ }
+ if (r < 0) {
+ lderr(cct) << "header creation failed" << dendl;
+ return r;
+ }
+ opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
+
+ // NOTE(review): dest is not deleted on any path here, neither after a
+ // failed open() nor after close() -- presumably ImageState releases it;
+ // confirm.
+ auto dest = new I(destname, "", nullptr, dest_md_ctx, false);
+ r = dest->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "failed to read newly created header" << dendl;
+ return r;
+ }
+
+ C_SaferCond lock_ctx;
+ {
+ RWLock::WLocker locker(dest->owner_lock);
+
+ // nothing to acquire without an exclusive lock or when already the owner
+ if (dest->exclusive_lock == nullptr ||
+ dest->exclusive_lock->is_lock_owner()) {
+ lock_ctx.complete(0);
+ } else {
+ dest->exclusive_lock->acquire_lock(&lock_ctx);
+ }
+ }
+
+ r = lock_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ dest->state->close();
+ return r;
+ }
+
+ r = deep_copy(src, dest, flatten > 0, prog_ctx);
+
+ // a close failure is only reported when the copy itself succeeded
+ int close_r = dest->state->close();
+ if (r == 0 && close_r < 0) {
+ r = close_r;
+ }
+ return r;
+}
+
+// Copies the contents of the open image `src` into the open image `dest` by
+// running a DeepCopyRequest and waiting for it to complete.
+//
+// The copy covers snapshot ids from 0 up to the snapshot `src` is currently
+// set to (read under src's snap_lock).  Returns 0 on success or the negative
+// result of the request.
+template <typename I>
+int Image<I>::deep_copy(I *src, I *dest, bool flatten,
+                        ProgressContext &prog_ctx) {
+  CephContext *cct = src->cct;
+
+  const librados::snap_t first_snap_id = 0;
+  librados::snap_t last_snap_id;
+  {
+    RWLock::RLocker snap_locker(src->snap_lock);
+    last_snap_id = src->snap_id;
+  }
+
+  // only the work queue is used below; the thread pool is a required output
+  ThreadPool *thread_pool;
+  ContextWQ *op_work_queue;
+  ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+  C_SaferCond copy_done;
+  SnapSeqs snap_seqs;
+  auto copy_req = DeepCopyRequest<I>::create(src, dest, first_snap_id,
+                                             last_snap_id, 0U, flatten,
+                                             boost::none, op_work_queue,
+                                             &snap_seqs, &prog_ctx,
+                                             &copy_done);
+  copy_req->send();
+
+  const int r = copy_done.wait();
+  return (r < 0) ? r : 0;
+}
+
+// Switches the image to the snapshot called `snap_name`; a null or empty
+// name selects the image head (CEPH_NOSNAP).
+//
+// Returns -ENOENT when a non-empty name cannot be resolved to a snapshot id,
+// otherwise the result of the id-based snap_set() overload.
+//
+// NOTE(review): `snap_namespace` is not used; the name is always looked up
+// in the user snapshot namespace -- confirm that this is intended.
+template <typename I>
+int Image<I>::snap_set(I *ictx,
+                       const cls::rbd::SnapshotNamespace &snap_namespace,
+                       const char *snap_name) {
+  ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = "
+                       << (snap_name ? snap_name : "NULL") << dendl;
+
+  // ignore return value, since we may be set to a non-existent
+  // snapshot and the user is trying to fix that
+  ictx->state->refresh_if_required();
+
+  const std::string requested(snap_name == nullptr ? "" : snap_name);
+  if (requested.empty()) {
+    const uint64_t head = CEPH_NOSNAP;
+    return snap_set(ictx, head);
+  }
+
+  uint64_t resolved_id;
+  {
+    // the lookup happens under the snap_lock read lock
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    resolved_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace{},
+                                    snap_name);
+  }
+  if (resolved_id == CEPH_NOSNAP) {
+    return -ENOENT;
+  }
+
+  return snap_set(ictx, resolved_id);
+}
+
+// Switches the image to the snapshot with id `snap_id` and waits for the
+// state change to finish.  Returns 0 on success or the negative result of
+// the request; every failure except -ENOENT is logged.
+template <typename I>
+int Image<I>::snap_set(I *ictx, uint64_t snap_id) {
+  ldout(ictx->cct, 20) << "snap_set " << ictx << " "
+                       << "snap_id=" << snap_id << dendl;
+
+  // ignore return value, since we may be set to a non-existent
+  // snapshot and the user is trying to fix that
+  ictx->state->refresh_if_required();
+
+  C_SaferCond snap_set_done;
+  ictx->state->snap_set(snap_id, &snap_set_done);
+
+  const int r = snap_set_done.wait();
+  if (r >= 0) {
+    return 0;
+  }
+
+  if (r != -ENOENT) {
+    lderr(ictx->cct) << "failed to " << (snap_id == CEPH_NOSNAP ? "un" : "")
+                     << "set snapshot: " << cpp_strerror(r) << dendl;
+  }
+  return r;
+}
+
+// Removes the image called `image_name`.
+//
+// Three paths, tried in this order:
+//   * the name is missing from the V2 directory but a trash entry with the
+//     same name and source REMOVING exists: finish that removal;
+//   * the name is in the V2 directory: move the image to the trash and,
+//     unless "rbd_move_to_trash_on_remove" is set, remove it from there
+//     right away;
+//   * otherwise (name found nowhere, or the trash move reports
+//     -EOPNOTSUPP): issue a RemoveRequest by image name.
+//
+// Returns 0 on success or a negative error code.
+template <typename I>
+int Image<I>::remove(IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext& prog_ctx)
+{
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << "name=" << image_name << dendl;
+
+ // look up the V2 image id based on the image name
+ std::string image_id;
+ int r = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY, image_name,
+ &image_id);
+ if (r == -ENOENT) {
+ // check if it already exists in trash from an aborted trash remove attempt
+ std::vector<trash_image_info_t> trash_entries;
+ r = Trash<I>::list(io_ctx, trash_entries, false);
+ if (r < 0) {
+ return r;
+ } else if (r >= 0) {
+ for (auto& entry : trash_entries) {
+ if (entry.name == image_name &&
+ entry.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {
+ return Trash<I>::remove(io_ctx, entry.id, true, prog_ctx);
+ }
+ }
+ }
+
+ // fall-through if we failed to locate the image in the V2 directory and
+ // trash
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve image id: " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ // attempt to move the image to the trash (and optionally immediately
+ // delete the image)
+ ConfigProxy config(cct->_conf);
+ Config<I>::apply_pool_overrides(io_ctx, &config);
+
+ rbd_trash_image_source_t trash_image_source =
+ RBD_TRASH_IMAGE_SOURCE_REMOVING;
+ uint64_t expire_seconds = 0;
+ if (config.get_val<bool>("rbd_move_to_trash_on_remove")) {
+ // keep the image in the trash upon remove requests
+ trash_image_source = RBD_TRASH_IMAGE_SOURCE_USER;
+ expire_seconds = config.get_val<uint64_t>(
+ "rbd_move_to_trash_on_remove_expire_seconds");
+ } else {
+ // attempt to pre-validate the removal before moving to trash and
+ // removing
+ r = pre_remove_image<I>(io_ctx, image_id);
+ // -ENOENT from the pre-validation is not treated as fatal
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+ }
+
+ r = Trash<I>::move(io_ctx, trash_image_source, image_name, image_id,
+ expire_seconds);
+ if (r >= 0) {
+ if (trash_image_source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {
+ // proceed with attempting to immediately remove the image
+ r = Trash<I>::remove(io_ctx, image_id, true, prog_ctx);
+
+ if (r == -ENOTEMPTY || r == -EBUSY || r == -EMLINK) {
+ // best-effort try to restore the image if the removal
+ // failed for possible expected reasons
+ Trash<I>::restore(io_ctx, {cls::rbd::TRASH_IMAGE_SOURCE_REMOVING},
+ image_id, image_name);
+ }
+ }
+ // the removal error (if any) is returned even when the restore ran
+ return r;
+ } else if (r < 0 && r != -EOPNOTSUPP) {
+ return r;
+ }
+
+ // fall-through if trash isn't supported
+ }
+
+ // only the work queue is used below; the thread pool is a required output
+ ThreadPool *thread_pool;
+ ContextWQ *op_work_queue;
+ ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+ // might be a V1 image format that cannot be moved to the trash
+ // and would not have been listed in the V2 directory -- or the OSDs
+ // are too old and don't support the trash feature
+ C_SaferCond cond;
+ auto req = librbd::image::RemoveRequest<I>::create(
+ io_ctx, image_name, "", false, false, prog_ctx, op_work_queue, &cond);
+ req->send();
+
+ return cond.wait();
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Image<librbd::ImageCtx>;
diff --git a/src/librbd/api/Image.h b/src/librbd/api/Image.h
new file mode 100644
index 00000000..af928c84
--- /dev/null
+++ b/src/librbd/api/Image.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_API_IMAGE_H
+#define LIBRBD_API_IMAGE_H
+
+#include "include/rbd/librbd.hpp"
+#include "include/rados/librados_fwd.hpp"
+#include "librbd/Types.h"
+#include <map>
+#include <set>
+#include <string>
+
+namespace librbd {
+
+class ImageOptions;
+class ProgressContext;
+
+struct ImageCtx;
+
+namespace api {
+
+// Collection of static entry points for image-level operations.  Functions
+// returning int report a status code.  The template is explicitly
+// instantiated for librbd::ImageCtx (see the extern template declaration
+// that follows this struct).
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Image {
+ // image name -> image id
+ typedef std::map<std::string, std::string> ImageNameToIds;
+
+ // returns the data pool id, or a negative error code
+ static int64_t get_data_pool_id(ImageCtxT *ictx);
+
+ static int get_op_features(ImageCtxT *ictx, uint64_t *op_features);
+
+ // list_images() reports image specs (id + name); list_images_v2() reports
+ // a name -> id map
+ static int list_images(librados::IoCtx& io_ctx,
+ std::vector<image_spec_t> *images);
+ static int list_images_v2(librados::IoCtx& io_ctx,
+ ImageNameToIds *images);
+
+ static int get_parent(ImageCtxT *ictx,
+ librbd::linked_image_spec_t *parent_image,
+ librbd::snap_spec_t *parent_snap);
+
+ // direct children only; both overloads clear *images first
+ static int list_children(ImageCtxT *ictx,
+ std::vector<librbd::linked_image_spec_t> *images);
+ static int list_children(ImageCtxT *ictx,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ std::vector<librbd::linked_image_spec_t> *images);
+
+ // descendants down to max_level levels; an empty max_level means no limit
+ static int list_descendants(IoCtx& io_ctx, const std::string &image_id,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images);
+ static int list_descendants(ImageCtxT *ictx,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images);
+ static int list_descendants(ImageCtxT *ictx,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images);
+
+ // the first overload creates the destination image, the second copies
+ // between two already opened images
+ static int deep_copy(ImageCtxT *ictx, librados::IoCtx& dest_md_ctx,
+ const char *destname, ImageOptions& opts,
+ ProgressContext &prog_ctx);
+ static int deep_copy(ImageCtxT *src, ImageCtxT *dest, bool flatten,
+ ProgressContext &prog_ctx);
+
+ // select a snapshot by name or by id
+ static int snap_set(ImageCtxT *ictx,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const char *snap_name);
+ static int snap_set(ImageCtxT *ictx, uint64_t snap_id);
+
+ static int remove(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext& prog_ctx);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Image<librbd::ImageCtx>;
+
+#endif // LIBRBD_API_IMAGE_H
diff --git a/src/librbd/api/Migration.cc b/src/librbd/api/Migration.cc
new file mode 100644
index 00000000..ae09d407
--- /dev/null
+++ b/src/librbd/api/Migration.cc
@@ -0,0 +1,1885 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Migration.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Group.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Snapshot.h"
+#include "librbd/api/Trash.h"
+#include "librbd/deep_copy/ImageCopyRequest.h"
+#include "librbd/deep_copy/MetadataCopyRequest.h"
+#include "librbd/deep_copy/SnapshotCopyRequest.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/image/AttachChildRequest.h"
+#include "librbd/image/AttachParentRequest.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/CreateRequest.h"
+#include "librbd/image/DetachChildRequest.h"
+#include "librbd/image/DetachParentRequest.h"
+#include "librbd/image/ListWatchersRequest.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/internal.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Migration: " << __func__ << ": "
+
+namespace librbd {
+
+// Two linked image specs compare equal when pool id, pool namespace and
+// image id all match; no other fields are taken into account.
+inline bool operator==(const linked_image_spec_t& rhs,
+                       const linked_image_spec_t& lhs) {
+  return rhs.pool_id == lhs.pool_id &&
+         rhs.pool_namespace == lhs.pool_namespace &&
+         rhs.image_id == lhs.image_id;
+}
+
+namespace api {
+
+using util::create_rados_callback;
+
+namespace {
+
+// Progress context used while a migration is executing.
+//
+// Every progress report is forwarded to the caller-supplied ProgressContext
+// and, in addition, an "N% complete" description is written to the migration
+// header object through an asynchronous cls call. At most one header update
+// is in flight at a time: while one is outstanding, newer descriptions are
+// only remembered (m_pending_update) and sent once the outstanding update
+// completes. The destructor blocks until no update is in flight.
+class MigrationProgressContext : public ProgressContext {
+public:
+  // prog_ctx must not be null; io_ctx is held by reference and must outlive
+  // this object.
+  MigrationProgressContext(librados::IoCtx& io_ctx,
+                           const std::string &header_oid,
+                           cls::rbd::MigrationState state,
+                           ProgressContext *prog_ctx)
+    : m_io_ctx(io_ctx), m_header_oid(header_oid), m_state(state),
+      m_prog_ctx(prog_ctx), m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+      m_lock(util::unique_lock_name("librbd::api::MigrationProgressContext",
+                                    this)) {
+    ceph_assert(m_prog_ctx != nullptr);
+  }
+
+  ~MigrationProgressContext() {
+    wait_for_in_flight_updates();
+  }
+
+  // Forwards the progress to the wrapped context and schedules a header
+  // update with the percentage. Always returns 0 (the wrapped context's
+  // return value is not propagated).
+  int update_progress(uint64_t offset, uint64_t total) override {
+    ldout(m_cct, 20) << "offset=" << offset << ", total=" << total << dendl;
+
+    m_prog_ctx->update_progress(offset, total);
+
+    if (total == 0) {
+      // no percentage can be derived; skip the header update instead of
+      // dividing by zero
+      return 0;
+    }
+
+    std::string description = stringify(offset * 100 / total) + "% complete";
+
+    send_state_description_update(description);
+
+    return 0;
+  }
+
+private:
+  librados::IoCtx& m_io_ctx;
+  std::string m_header_oid;
+  cls::rbd::MigrationState m_state;
+  ProgressContext *m_prog_ctx;
+
+  CephContext* m_cct;
+  mutable Mutex m_lock;
+  Cond m_cond;
+  std::string m_state_description;      // last description requested
+  bool m_pending_update = false;        // newer description waiting to be sent
+  int m_in_flight_state_updates = 0;    // outstanding aio header updates
+
+  // Records the new description and sends it right away unless an update
+  // is already in flight, in which case it is marked as pending.
+  void send_state_description_update(const std::string &description) {
+    Mutex::Locker locker(m_lock);
+
+    if (description == m_state_description) {
+      // unchanged percentage: nothing to write
+      return;
+    }
+
+    m_state_description = description;
+
+    if (m_in_flight_state_updates > 0) {
+      m_pending_update = true;
+      return;
+    }
+
+    set_state_description();
+  }
+
+  // Issues the asynchronous header update; m_lock must be held.
+  void set_state_description() {
+    ldout(m_cct, 20) << "state_description=" << m_state_description << dendl;
+
+    ceph_assert(m_lock.is_locked());
+
+    librados::ObjectWriteOperation op;
+    cls_client::migration_set_state(&op, m_state, m_state_description);
+
+    using klass = MigrationProgressContext;
+    librados::AioCompletion *comp =
+      create_rados_callback<klass, &klass::handle_set_state_description>(this);
+    int r = m_io_ctx.aio_operate(m_header_oid, comp, &op);
+    ceph_assert(r == 0);
+    comp->release();
+
+    m_in_flight_state_updates++;
+  }
+
+  // Completion callback for set_state_description().
+  void handle_set_state_description(int r) {
+    ldout(m_cct, 20) << "r=" << r << dendl;
+
+    Mutex::Locker locker(m_lock);
+
+    m_in_flight_state_updates--;
+
+    if (r < 0) {
+      lderr(m_cct) << "failed to update migration state: " << cpp_strerror(r)
+                   << dendl;
+    } else if (m_pending_update) {
+      set_state_description();
+      m_pending_update = false;
+    }
+
+    // Wake up wait_for_in_flight_updates() whenever nothing is outstanding
+    // any more, including after a failed update; otherwise a waiter that is
+    // already blocked on the condition would never be released.
+    if (m_in_flight_state_updates == 0) {
+      m_cond.Signal();
+    }
+  }
+
+  // Drops any pending (not yet sent) update and blocks until the in-flight
+  // one, if any, has completed.
+  void wait_for_in_flight_updates() {
+    Mutex::Locker locker(m_lock);
+
+    ldout(m_cct, 20) << "m_in_flight_state_updates="
+                     << m_in_flight_state_updates << dendl;
+
+    m_pending_update = false;
+    while (m_in_flight_state_updates > 0) {
+      m_cond.Wait(m_lock);
+    }
+  }
+};
+
+// Searches the trash of io_ctx for an entry with the given source and
+// original image name. On a match the entry's id is stored in *image_id and
+// 0 is returned. Returns -ENOENT when no entry matches, or the negative
+// error from listing the trash.
+int trash_search(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+                 const std::string &image_name, std::string *image_id) {
+  std::vector<trash_image_info_t> trash_entries;
+  int r = Trash<>::list(io_ctx, trash_entries, false);
+  if (r < 0) {
+    return r;
+  }
+
+  for (auto it = trash_entries.begin(); it != trash_entries.end(); ++it) {
+    if (it->source == source && it->name == image_name) {
+      // first match wins
+      *image_id = it->id;
+      return 0;
+    }
+  }
+
+  return -ENOENT;
+}
+
+// Opens the source image of an in-progress migration and reads its
+// migration header.
+//
+// image_name may name either side of the migration:
+//  * if an image with that name opens and its migration header is of SRC
+//    type, it is used as is;
+//  * if its header is of DST type, it is closed and the source image
+//    referenced by the header (pool, namespace, name/id) is opened instead;
+//  * if no image with that name exists, the trash is searched for an entry
+//    with source RBD_TRASH_IMAGE_SOURCE_MIGRATION and that name, and the
+//    image is opened by the id found there.
+//
+// On success (return 0) *src_image_ctx is the opened source image (the
+// caller is responsible for closing it), *dst_io_ctx refers to the
+// destination pool/namespace and the remaining out-parameters are copied
+// from the source's migration header. On failure a negative errno is
+// returned and any image still opened by this function is closed.
+template <typename I>
+int open_source_image(librados::IoCtx& io_ctx, const std::string &image_name,
+                      I **src_image_ctx, librados::IoCtx *dst_io_ctx,
+                      std::string *dst_image_name, std::string *dst_image_id,
+                      bool *flatten, bool *mirroring,
+                      cls::rbd::MigrationState *state,
+                      std::string *state_description) {
+  CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+  librados::IoCtx src_io_ctx;
+  std::string src_image_name;
+  std::string src_image_id;
+  cls::rbd::MigrationSpec migration_spec;
+  I *image_ctx = I::create(image_name, "", nullptr, io_ctx, false);
+
+  ldout(cct, 10) << "trying to open image by name " << io_ctx.get_pool_name()
+                 << "/" << image_name << dendl;
+
+  int r = image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING);
+  if (r < 0) {
+    if (r != -ENOENT) {
+      lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    // the pointer is not used again after a failed open
+    image_ctx = nullptr;
+  }
+
+  // From here on: close whatever image is still open if we leave with a
+  // non-zero r. Both variables are captured by reference so later updates
+  // are observed.
+  BOOST_SCOPE_EXIT_TPL(&r, &image_ctx) {
+    if (r != 0 && image_ctx != nullptr) {
+      image_ctx->state->close();
+    }
+  } BOOST_SCOPE_EXIT_END;
+
+  if (r == 0) {
+    // The opened image is either a source (then just proceed) or a
+    // destination (then look for the source image id in the migration
+    // header).
+
+    r = cls_client::migration_get(&image_ctx->md_ctx, image_ctx->header_oid,
+                                  &migration_spec);
+
+    if (r < 0) {
+      lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    ldout(cct, 10) << "migration spec: " << migration_spec << dendl;
+
+    if (migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_SRC &&
+        migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST) {
+      lderr(cct) << "unexpected migration header type: "
+                 << migration_spec.header_type << dendl;
+      r = -EINVAL;
+      return r;
+    }
+
+    if (migration_spec.header_type == cls::rbd::MIGRATION_HEADER_TYPE_DST) {
+      ldout(cct, 10) << "the destination image is opened" << dendl;
+
+      // Close and look for the source image.
+      r = image_ctx->state->close();
+      image_ctx = nullptr;
+      if (r < 0) {
+        lderr(cct) << "failed closing image: " << cpp_strerror(r)
+                   << dendl;
+        return r;
+      }
+
+      // the DST header points back at the source pool/namespace
+      r = util::create_ioctx(io_ctx, "source image", migration_spec.pool_id,
+                             migration_spec.pool_namespace, &src_io_ctx);
+      if (r < 0) {
+        return r;
+      }
+
+      src_image_name = migration_spec.image_name;
+      src_image_id = migration_spec.image_id;
+    } else {
+      ldout(cct, 10) << "the source image is opened" << dendl;
+    }
+  } else {
+    // only -ENOENT can reach this point (other errors returned above)
+    ceph_assert(r == -ENOENT);
+
+    ldout(cct, 10) << "source image is not found. Trying trash" << dendl;
+
+    r = trash_search(io_ctx, RBD_TRASH_IMAGE_SOURCE_MIGRATION, image_name,
+                     &src_image_id);
+    if (r < 0) {
+      lderr(cct) << "failed to determine image id: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    ldout(cct, 10) << "source image id from trash: " << src_image_id << dendl;
+
+    src_io_ctx.dup(io_ctx);
+  }
+
+  if (image_ctx == nullptr) {
+    // the source still has to be opened: by id when one is known, otherwise
+    // by name as an old-format (v1) image
+    int flags = OPEN_FLAG_IGNORE_MIGRATING;
+
+    if (src_image_id.empty()) {
+      ldout(cct, 20) << "trying to open v1 image by name "
+                     << src_io_ctx.get_pool_name() << "/" << src_image_name
+                     << dendl;
+
+      flags |= OPEN_FLAG_OLD_FORMAT;
+    } else {
+      ldout(cct, 20) << "trying to open v2 image by id "
+                     << src_io_ctx.get_pool_name() << "/" << src_image_id
+                     << dendl;
+    }
+
+    image_ctx = I::create(src_image_name, src_image_id, nullptr, src_io_ctx,
+                          false);
+    r = image_ctx->state->open(flags);
+    if (r < 0) {
+      lderr(cct) << "failed to open source image " << src_io_ctx.get_pool_name()
+                 << "/" << (src_image_id.empty() ? src_image_name : src_image_id)
+                 << ": " << cpp_strerror(r) << dendl;
+      image_ctx = nullptr;
+      return r;
+    }
+
+    // re-read the header: from now on migration_spec is the source's header,
+    // i.e. it describes the destination
+    r = cls_client::migration_get(&image_ctx->md_ctx, image_ctx->header_oid,
+                                  &migration_spec);
+    if (r < 0) {
+      lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    ldout(cct, 20) << "migration spec: " << migration_spec << dendl;
+  }
+
+  // migration_spec now refers to the destination, so label the pool
+  // accordingly in any error reported by create_ioctx
+  r = util::create_ioctx(image_ctx->md_ctx, "destination image",
+                         migration_spec.pool_id, migration_spec.pool_namespace,
+                         dst_io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  *src_image_ctx = image_ctx;
+  *dst_image_name = migration_spec.image_name;
+  *dst_image_id = migration_spec.image_id;
+  *flatten = migration_spec.flatten;
+  *mirroring = migration_spec.mirroring;
+  *state = migration_spec.state;
+  *state_description = migration_spec.state_description;
+
+  return 0;
+}
+
+// Maps the progress of one step of a multi-step operation onto a single
+// overall progress range: step k (1-based) of N reports
+// object_number + object_count * (k - 1) out of object_count * N to the
+// wrapped context.
+class SteppedProgressContext : public ProgressContext {
+public:
+  SteppedProgressContext(ProgressContext* progress_ctx, size_t total_steps)
+    : m_progress_ctx(progress_ctx), m_total_steps(total_steps) {
+  }
+
+  // Advances to the following step; must not be called on the last step.
+  void next_step() {
+    ceph_assert(m_current_step < m_total_steps);
+    m_current_step += 1;
+  }
+
+  int update_progress(uint64_t object_number,
+                      uint64_t object_count) override {
+    // work accounted to the steps that already finished
+    uint64_t finished = object_count * (m_current_step - 1);
+    uint64_t overall = object_count * m_total_steps;
+    return m_progress_ctx->update_progress(object_number + finished, overall);
+  }
+
+private:
+  ProgressContext* m_progress_ctx;
+  size_t m_total_steps;
+  size_t m_current_step = 1;
+};
+
+} // anonymous namespace
+
+// Entry point for preparing a migration of io_ctx/image_name to
+// dest_io_ctx/dest_image_name_ (the source name is reused when the
+// destination name is empty).
+//
+// The source image is opened and must have no watchers other than this
+// client and mirror instances (-EBUSY otherwise). The destination options
+// are then completed from the source image where the caller did not specify
+// them (format, features, order, stripe unit/count); only format 2 is
+// accepted (-EINVAL) and unknown feature bits are rejected (-ENOSYS). The
+// flatten option is consumed here and passed to the Migration object rather
+// than being left in opts. Note that opts is modified.
+//
+// Returns 0 on success or a negative errno.
+template <typename I>
+int Migration<I>::prepare(librados::IoCtx& io_ctx,
+                          const std::string &image_name,
+                          librados::IoCtx& dest_io_ctx,
+                          const std::string &dest_image_name_,
+                          ImageOptions& opts) {
+  CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+  std::string dest_image_name = dest_image_name_.empty() ? image_name :
+                                                           dest_image_name_;
+
+  ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << " -> "
+                 << dest_io_ctx.get_pool_name() << "/" << dest_image_name
+                 << ", opts=" << opts << dendl;
+
+  auto image_ctx = I::create(image_name, "", nullptr, io_ctx, false);
+  int r = image_ctx->state->open(0);
+  if (r < 0) {
+    lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  // close the source image on every exit path below
+  BOOST_SCOPE_EXIT_TPL(image_ctx) {
+    image_ctx->state->close();
+  } BOOST_SCOPE_EXIT_END;
+
+  std::list<obj_watch_t> watchers;
+  int flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE |
+              librbd::image::LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES;
+  C_SaferCond on_list_watchers;
+  auto list_watchers_request = librbd::image::ListWatchersRequest<I>::create(
+      *image_ctx, flags, &watchers, &on_list_watchers);
+  list_watchers_request->send();
+  r = on_list_watchers.wait();
+  if (r < 0) {
+    lderr(cct) << "failed listing watchers: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  if (!watchers.empty()) {
+    lderr(cct) << "image has watchers - not migrating" << dendl;
+    return -EBUSY;
+  }
+
+  // default to format 2 when the caller did not request a format
+  uint64_t format = 2;
+  if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
+    opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+  }
+  if (format != 2) {
+    lderr(cct) << "unsupported destination image format: " << format << dendl;
+    return -EINVAL;
+  }
+
+  // start from the source image's features; a caller-supplied value (if
+  // any) overrides them
+  uint64_t features;
+  {
+    RWLock::RLocker snap_locker(image_ctx->snap_lock);
+    features = image_ctx->features;
+  }
+  opts.get(RBD_IMAGE_OPTION_FEATURES, &features);
+  if ((features & ~RBD_FEATURES_ALL) != 0) {
+    lderr(cct) << "librbd does not support requested features" << dendl;
+    return -ENOSYS;
+  }
+  features &= ~RBD_FEATURES_INTERNAL;
+  opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+
+  uint64_t order = image_ctx->order;
+  if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+    opts.set(RBD_IMAGE_OPTION_ORDER, order);
+  }
+  r = image::CreateRequest<I>::validate_order(cct, order);
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t stripe_unit = image_ctx->stripe_unit;
+  if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
+    opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+  }
+  uint64_t stripe_count = image_ctx->stripe_count;
+  if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
+    opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+  }
+
+  // flatten is handed to the Migration object and removed from opts
+  uint64_t flatten = 0;
+  if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
+    opts.unset(RBD_IMAGE_OPTION_FLATTEN);
+  }
+
+  ldout(cct, 20) << "updated opts=" << opts << dendl;
+
+  Migration migration(image_ctx, dest_io_ctx, dest_image_name, "", opts, flatten > 0,
+                      false, cls::rbd::MIGRATION_STATE_PREPARING, "", nullptr);
+  r = migration.prepare();
+
+  return r;
+}
+
+// Entry point for the execute phase of a previously prepared migration.
+//
+// Resolves and opens the migration source through open_source_image(),
+// requires the recorded state to be PREPARED (-EINVAL otherwise) and then
+// delegates to the instance-level execute(). Once opened, the source image
+// is closed on every exit path.
+template <typename I>
+int Migration<I>::execute(librados::IoCtx& io_ctx,
+                          const std::string &image_name,
+                          ProgressContext &prog_ctx) {
+  CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl;
+
+  // filled in by open_source_image()
+  I *src_ctx;
+  librados::IoCtx dst_io_ctx;
+  std::string dst_name;
+  std::string dst_id;
+  bool flatten;
+  bool mirroring;
+  cls::rbd::MigrationState state;
+  std::string state_description;
+
+  int r = open_source_image(io_ctx, image_name, &src_ctx, &dst_io_ctx,
+                            &dst_name, &dst_id, &flatten, &mirroring, &state,
+                            &state_description);
+  if (r < 0) {
+    return r;
+  }
+
+  BOOST_SCOPE_EXIT_TPL(src_ctx) {
+    src_ctx->state->close();
+  } BOOST_SCOPE_EXIT_END;
+
+  if (state != cls::rbd::MIGRATION_STATE_PREPARED) {
+    lderr(cct) << "current migration state is '" << state << "'"
+               << " (should be 'prepared')" << dendl;
+    return -EINVAL;
+  }
+
+  ldout(cct, 5) << "migrating " << src_ctx->md_ctx.get_pool_name() << "/"
+                << src_ctx->name << " -> " << dst_io_ctx.get_pool_name()
+                << "/" << dst_name << dendl;
+
+  ImageOptions opts;
+  Migration migration(src_ctx, dst_io_ctx, dst_name, dst_id, opts, flatten,
+                      mirroring, state, state_description, &prog_ctx);
+  r = migration.execute();
+  return (r < 0) ? r : 0;
+}
+
+// Entry point for cancelling a migration.
+//
+// Resolves and opens the migration source through open_source_image() and
+// delegates to the instance-level abort(); no particular migration state is
+// required here. The source image is closed after abort() returns,
+// regardless of its result.
+template <typename I>
+int Migration<I>::abort(librados::IoCtx& io_ctx, const std::string &image_name,
+                        ProgressContext &prog_ctx) {
+  CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl;
+
+  // filled in by open_source_image()
+  I *src_ctx;
+  librados::IoCtx dst_io_ctx;
+  std::string dst_name;
+  std::string dst_id;
+  bool flatten;
+  bool mirroring;
+  cls::rbd::MigrationState state;
+  std::string state_description;
+
+  int r = open_source_image(io_ctx, image_name, &src_ctx, &dst_io_ctx,
+                            &dst_name, &dst_id, &flatten, &mirroring, &state,
+                            &state_description);
+  if (r < 0) {
+    return r;
+  }
+
+  ldout(cct, 5) << "canceling incomplete migration "
+                << src_ctx->md_ctx.get_pool_name() << "/" << src_ctx->name
+                << " -> " << dst_io_ctx.get_pool_name() << "/" << dst_name
+                << dendl;
+
+  ImageOptions opts;
+  Migration migration(src_ctx, dst_io_ctx, dst_name, dst_id, opts, flatten,
+                      mirroring, state, state_description, &prog_ctx);
+  r = migration.abort();
+
+  src_ctx->state->close();
+
+  return (r < 0) ? r : 0;
+}
+
+// Entry point for committing an executed migration.
+//
+// Resolves and opens the migration source through open_source_image() and
+// requires the recorded state to be EXECUTED; otherwise the source image is
+// closed here and -EINVAL is returned. On the normal path the source image
+// is not closed by this function: that is left to the instance-level
+// commit().
+template <typename I>
+int Migration<I>::commit(librados::IoCtx& io_ctx,
+                         const std::string &image_name,
+                         ProgressContext &prog_ctx) {
+  CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl;
+
+  // filled in by open_source_image()
+  I *src_ctx;
+  librados::IoCtx dst_io_ctx;
+  std::string dst_name;
+  std::string dst_id;
+  bool flatten;
+  bool mirroring;
+  cls::rbd::MigrationState state;
+  std::string state_description;
+
+  int r = open_source_image(io_ctx, image_name, &src_ctx, &dst_io_ctx,
+                            &dst_name, &dst_id, &flatten, &mirroring, &state,
+                            &state_description);
+  if (r < 0) {
+    return r;
+  }
+
+  if (state != cls::rbd::MIGRATION_STATE_EXECUTED) {
+    lderr(cct) << "current migration state is '" << state << "'"
+               << " (should be 'executed')" << dendl;
+    src_ctx->state->close();
+    return -EINVAL;
+  }
+
+  ldout(cct, 5) << "migrating " << src_ctx->md_ctx.get_pool_name() << "/"
+                << src_ctx->name << " -> " << dst_io_ctx.get_pool_name()
+                << "/" << dst_name << dendl;
+
+  ImageOptions opts;
+  Migration migration(src_ctx, dst_io_ctx, dst_name, dst_id, opts, flatten,
+                      mirroring, state, state_description, &prog_ctx);
+  r = migration.commit();
+
+  // the source image is closed inside commit() as part of removing it
+
+  return (r < 0) ? r : 0;
+}
+
+// Entry point for querying migration status.
+//
+// Resolves and opens the migration source through open_source_image(),
+// fills *status via the instance-level status() and closes the source image
+// again. No progress context is used.
+template <typename I>
+int Migration<I>::status(librados::IoCtx& io_ctx,
+                         const std::string &image_name,
+                         image_migration_status_t *status) {
+  CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl;
+
+  // filled in by open_source_image()
+  I *src_ctx;
+  librados::IoCtx dst_io_ctx;
+  std::string dst_name;
+  std::string dst_id;
+  bool flatten;
+  bool mirroring;
+  cls::rbd::MigrationState state;
+  std::string state_description;
+
+  int r = open_source_image(io_ctx, image_name, &src_ctx, &dst_io_ctx,
+                            &dst_name, &dst_id, &flatten, &mirroring, &state,
+                            &state_description);
+  if (r < 0) {
+    return r;
+  }
+
+  ldout(cct, 5) << "migrating " << src_ctx->md_ctx.get_pool_name() << "/"
+                << src_ctx->name << " -> " << dst_io_ctx.get_pool_name()
+                << "/" << dst_name << dendl;
+
+  ImageOptions opts;
+  Migration migration(src_ctx, dst_io_ctx, dst_name, dst_id, opts, flatten,
+                      mirroring, state, state_description, nullptr);
+  r = migration.status(status);
+
+  src_ctx->state->close();
+
+  return (r < 0) ? r : 0;
+}
+
+// Captures everything needed to drive one migration from an already opened
+// source image to the destination pool/name/id.
+//
+// When dst_image_id is empty a new id is generated for the destination.
+// Two migration specs are built up front: the SRC-type spec (stored on the
+// source, describing the destination) and the DST-type spec (describing the
+// source); both carry the same flatten, mirroring, state and
+// state_description values. m_src_io_ctx is a dup of the source image's
+// md_ctx.
+//
+// NOTE(review): several initializers read members initialized earlier in
+// this list (e.g. m_dst_image_id, m_dst_io_ctx); this relies on the member
+// declaration order in the header, which is not visible here.
+template <typename I>
+Migration<I>::Migration(I *src_image_ctx, librados::IoCtx& dst_io_ctx,
+ const std::string &dstname,
+ const std::string &dst_image_id,
+ ImageOptions& opts, bool flatten, bool mirroring,
+ cls::rbd::MigrationState state,
+ const std::string &state_description,
+ ProgressContext *prog_ctx)
+ : m_cct(static_cast<CephContext *>(dst_io_ctx.cct())),
+ m_src_image_ctx(src_image_ctx), m_dst_io_ctx(dst_io_ctx),
+ m_src_old_format(m_src_image_ctx->old_format),
+ // the source name is only recorded for old-format images
+ m_src_image_name(m_src_image_ctx->old_format ? m_src_image_ctx->name : ""),
+ m_src_image_id(m_src_image_ctx->id),
+ m_src_header_oid(m_src_image_ctx->header_oid), m_dst_image_name(dstname),
+ // generate a fresh destination id unless the caller supplied one
+ m_dst_image_id(dst_image_id.empty() ?
+ util::generate_image_id(m_dst_io_ctx) : dst_image_id),
+ m_dst_header_oid(util::header_name(m_dst_image_id)), m_image_options(opts),
+ m_flatten(flatten), m_mirroring(mirroring), m_prog_ctx(prog_ctx),
+ // spec of SRC type: identifies the destination pool/namespace/name/id
+ m_src_migration_spec(cls::rbd::MIGRATION_HEADER_TYPE_SRC,
+ m_dst_io_ctx.get_id(), m_dst_io_ctx.get_namespace(),
+ m_dst_image_name, m_dst_image_id, {}, 0, flatten,
+ mirroring, state, state_description),
+ // spec of DST type: identifies the source pool/namespace/name/id
+ m_dst_migration_spec(cls::rbd::MIGRATION_HEADER_TYPE_DST,
+ src_image_ctx->md_ctx.get_id(),
+ src_image_ctx->md_ctx.get_namespace(),
+ m_src_image_ctx->name, m_src_image_ctx->id, {}, 0,
+ flatten, mirroring, state, state_description) {
+ m_src_io_ctx.dup(src_image_ctx->md_ctx);
+}
+
+// Performs the prepare phase on the already opened source image:
+// validate snapshots, disable mirroring, unlink the source image, write the
+// migration header and finally create the destination image.
+//
+// Each failing step undoes the steps that already succeeded (the results
+// of those undo calls are not checked) and returns the step's error. Once
+// the header is written, a failure to create the destination is rolled back
+// through abort().
+template <typename I>
+int Migration<I>::prepare() {
+  ldout(m_cct, 10) << dendl;
+
+  if (int r = validate_src_snaps(); r < 0) {
+    return r;
+  }
+
+  if (int r = disable_mirroring(m_src_image_ctx, &m_mirroring); r < 0) {
+    return r;
+  }
+
+  if (int r = unlink_src_image(); r < 0) {
+    // undo: mirroring
+    enable_mirroring(m_src_image_ctx, m_mirroring);
+    return r;
+  }
+
+  if (int r = set_migration(); r < 0) {
+    // undo: unlink, mirroring
+    relink_src_image();
+    enable_mirroring(m_src_image_ctx, m_mirroring);
+    return r;
+  }
+
+  if (int r = create_dst_image(); r < 0) {
+    // undo everything done so far
+    abort();
+    return r;
+  }
+
+  ldout(m_cct, 10) << "succeeded" << dendl;
+  return 0;
+}
+
+// Performs the execute phase: opens the destination image, records the
+// EXECUTING state in both migration headers, runs the destination image's
+// migrate operation and, on success, records EXECUTED and sends an update
+// notification for the destination image.
+//
+// Returns 0 on success or the negative errno of the first failing step.
+// The destination image is closed on every exit path after it was opened.
+template <typename I>
+int Migration<I>::execute() {
+ ldout(m_cct, 10) << dendl;
+
+ auto dst_image_ctx = I::create(m_dst_image_name, m_dst_image_id, nullptr,
+ m_dst_io_ctx, false);
+ int r = dst_image_ctx->state->open(0);
+ if (r < 0) {
+ lderr(m_cct) << "failed to open destination image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // close the destination image whenever this function returns
+ BOOST_SCOPE_EXIT_TPL(dst_image_ctx) {
+ dst_image_ctx->state->close();
+ } BOOST_SCOPE_EXIT_END;
+
+ r = set_state(cls::rbd::MIGRATION_STATE_EXECUTING, "");
+ if (r < 0) {
+ return r;
+ }
+
+ while (true) {
+ // a fresh progress context per attempt; it writes progress descriptions
+ // to the source header and is destroyed at the end of each iteration
+ MigrationProgressContext prog_ctx(m_src_io_ctx, m_src_header_oid,
+ cls::rbd::MIGRATION_STATE_EXECUTING,
+ m_prog_ctx);
+ r = dst_image_ctx->operations->migrate(prog_ctx);
+ if (r == -EROFS) {
+ // -EROFS is retried only when an exclusive lock exists and currently
+ // does not accept operations; any other outcome leaves the loop
+ RWLock::RLocker owner_locker(dst_image_ctx->owner_lock);
+ if (dst_image_ctx->exclusive_lock != nullptr &&
+ !dst_image_ctx->exclusive_lock->accept_ops()) {
+ ldout(m_cct, 5) << "lost exclusive lock, retrying remote" << dendl;
+ continue;
+ }
+ }
+ break;
+ }
+ if (r < 0) {
+ lderr(m_cct) << "migration failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = set_state(cls::rbd::MIGRATION_STATE_EXECUTED, "");
+ if (r < 0) {
+ return r;
+ }
+
+ dst_image_ctx->notify_update();
+
+ ldout(m_cct, 10) << "succeeded" << dendl;
+
+ return 0;
+}
+
+// Cancels the migration and restores the source image.
+//
+// Steps:
+//  1. acquire the source image's exclusive lock if one exists and is not
+//     already owned;
+//  2. if the destination image can be opened: refuse (-EBUSY) when it has
+//     watchers other than this client and mirror instances, set the state
+//     to ABORTING, revert data to the source, relink children to the
+//     source, remove the destination's snapshots, group membership and
+//     finally the destination image itself;
+//     if it cannot be opened, the failure is only logged and step 3 still
+//     runs;
+//  3. relink the source image, restore its group membership, remove its
+//     migration header and re-enable mirroring.
+//
+// Returns 0 on success or the negative errno of the first failing step.
+template <typename I>
+int Migration<I>::abort() {
+  ldout(m_cct, 10) << dendl;
+
+  int r;
+
+  // owner_lock is taken manually because it has to be dropped before
+  // waiting for the lock acquisition to complete
+  m_src_image_ctx->owner_lock.get_read();
+  if (m_src_image_ctx->exclusive_lock != nullptr &&
+      !m_src_image_ctx->exclusive_lock->is_lock_owner()) {
+    C_SaferCond ctx;
+    m_src_image_ctx->exclusive_lock->acquire_lock(&ctx);
+    m_src_image_ctx->owner_lock.put_read();
+    r = ctx.wait();
+    if (r < 0) {
+      lderr(m_cct) << "error acquiring exclusive lock: " << cpp_strerror(r)
+                   << dendl;
+      return r;
+    }
+  } else {
+    m_src_image_ctx->owner_lock.put_read();
+  }
+
+  // pool stays -1 unless remove_group() below fills in a group
+  group_info_t group_info;
+  group_info.pool = -1;
+
+  auto dst_image_ctx = I::create(m_dst_image_name, m_dst_image_id, nullptr,
+                                 m_dst_io_ctx, false);
+  r = dst_image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING);
+  if (r < 0) {
+    // not fatal: continue with restoring the source image
+    ldout(m_cct, 1) << "failed to open destination image: " << cpp_strerror(r)
+                    << dendl;
+  } else {
+    // close the destination on early return; the pointer is captured by
+    // reference and reset to nullptr once the remove request has taken over
+    BOOST_SCOPE_EXIT_TPL(&dst_image_ctx) {
+      if (dst_image_ctx != nullptr) {
+        dst_image_ctx->state->close();
+      }
+    } BOOST_SCOPE_EXIT_END;
+
+    std::list<obj_watch_t> watchers;
+    int flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE |
+                librbd::image::LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES;
+    C_SaferCond on_list_watchers;
+    auto list_watchers_request = librbd::image::ListWatchersRequest<I>::create(
+        *dst_image_ctx, flags, &watchers, &on_list_watchers);
+    list_watchers_request->send();
+    r = on_list_watchers.wait();
+    if (r < 0) {
+      lderr(m_cct) << "failed listing watchers: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    if (!watchers.empty()) {
+      lderr(m_cct) << "image has watchers - cannot abort migration" << dendl;
+      return -EBUSY;
+    }
+
+    // ensure destination image is now read-only
+    r = set_state(cls::rbd::MIGRATION_STATE_ABORTING, "");
+    if (r < 0) {
+      return r;
+    }
+
+    // copy dst HEAD -> src HEAD
+    // NOTE(review): the result of revert_data() is not checked here --
+    // confirm whether this is intentionally best-effort.
+    SteppedProgressContext progress_ctx(m_prog_ctx, 2);
+    revert_data(dst_image_ctx, m_src_image_ctx, &progress_ctx);
+    progress_ctx.next_step();
+
+    ldout(m_cct, 10) << "relinking children" << dendl;
+    r = relink_children(dst_image_ctx, m_src_image_ctx);
+    if (r < 0) {
+      return r;
+    }
+
+    ldout(m_cct, 10) << "removing dst image snapshots" << dendl;
+
+    std::vector<librbd::snap_info_t> snaps;
+    r = snap_list(dst_image_ctx, snaps);
+    if (r < 0) {
+      lderr(m_cct) << "failed listing snapshots: " << cpp_strerror(r)
+                   << dendl;
+      return r;
+    }
+
+    for (auto &snap : snaps) {
+      librbd::NoOpProgressContext prog_ctx;
+      // reuse the function-level r instead of shadowing it
+      r = snap_remove(dst_image_ctx, snap.name.c_str(),
+                      RBD_SNAP_REMOVE_UNPROTECT, prog_ctx);
+      if (r < 0) {
+        lderr(m_cct) << "failed removing snapshot: " << cpp_strerror(r)
+                     << dendl;
+        return r;
+      }
+    }
+
+    ldout(m_cct, 10) << "removing group" << dendl;
+
+    r = remove_group(dst_image_ctx, &group_info);
+    if (r < 0 && r != -ENOENT) {
+      return r;
+    }
+
+    ldout(m_cct, 10) << "removing dst image" << dendl;
+
+    ceph_assert(dst_image_ctx->ignore_migrating);
+
+    ThreadPool *thread_pool;
+    ContextWQ *op_work_queue;
+    ImageCtx::get_thread_pool_instance(m_cct, &thread_pool, &op_work_queue);
+    C_SaferCond on_remove;
+    auto req = librbd::image::RemoveRequest<>::create(
+      m_dst_io_ctx, dst_image_ctx, false, false, progress_ctx, op_work_queue,
+      &on_remove);
+    req->send();
+    r = on_remove.wait();
+
+    // the image context was handed to the remove request; make sure the
+    // scope-exit handler above does not close it
+    dst_image_ctx = nullptr;
+
+    if (r < 0) {
+      lderr(m_cct) << "failed removing destination image '"
+                   << m_dst_io_ctx.get_pool_name() << "/" << m_dst_image_name
+                   << " (" << m_dst_image_id << ")': " << cpp_strerror(r)
+                   << dendl;
+      return r;
+    }
+  }
+
+  r = relink_src_image();
+  if (r < 0) {
+    return r;
+  }
+
+  r = add_group(m_src_image_ctx, group_info);
+  if (r < 0) {
+    return r;
+  }
+
+  r = remove_migration(m_src_image_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = enable_mirroring(m_src_image_ctx, m_mirroring);
+  if (r < 0) {
+    return r;
+  }
+
+  ldout(m_cct, 10) << "succeeded" << dendl;
+
+  return 0;
+}
+
+// Performs the commit phase: opens the destination image, removes its
+// migration header, removes the source image and re-enables mirroring on
+// the destination.
+//
+// Returns 0 on success or the negative errno of the first failing step.
+template <typename I>
+int Migration<I>::commit() {
+ ldout(m_cct, 10) << dendl;
+
+ // Close the source image on exit if the pointer is still set. It is
+ // captured by reference, so a later reset to nullptr is honoured.
+ // NOTE(review): presumably remove_src_image() clears m_src_image_ctx once
+ // it has closed the image -- confirm, it is not visible here.
+ BOOST_SCOPE_EXIT_TPL(&m_src_image_ctx) {
+ if (m_src_image_ctx != nullptr) {
+ m_src_image_ctx->state->close();
+ }
+ } BOOST_SCOPE_EXIT_END;
+
+ auto dst_image_ctx = I::create(m_dst_image_name, m_dst_image_id, nullptr,
+ m_dst_io_ctx, false);
+ int r = dst_image_ctx->state->open(0);
+ if (r < 0) {
+ lderr(m_cct) << "failed to open destination image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // close the destination image whenever this function returns from here on
+ BOOST_SCOPE_EXIT_TPL(dst_image_ctx) {
+ dst_image_ctx->state->close();
+ } BOOST_SCOPE_EXIT_END;
+
+ r = remove_migration(dst_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = remove_src_image();
+ if (r < 0) {
+ return r;
+ }
+
+ r = enable_mirroring(dst_image_ctx, m_mirroring);
+ if (r < 0) {
+ return r;
+ }
+
+ ldout(m_cct, 10) << "succeeded" << dendl;
+
+ return 0;
+}
+
+// Fills *status from the two migration specs held by this object and
+// always returns 0.
+//
+// The source fields come from the DST-type spec and the destination fields
+// from the SRC-type spec, since each spec describes the opposite side. The
+// state is translated to the public enum; any state without a case below is
+// reported as UNKNOWN.
+template <typename I>
+int Migration<I>::status(image_migration_status_t *status) {
+  ldout(m_cct, 10) << dendl;
+
+  const auto &source_side = m_dst_migration_spec;
+  const auto &dest_side = m_src_migration_spec;
+
+  status->source_pool_id = source_side.pool_id;
+  status->source_pool_namespace = source_side.pool_namespace;
+  status->source_image_name = source_side.image_name;
+  status->source_image_id = source_side.image_id;
+
+  status->dest_pool_id = dest_side.pool_id;
+  status->dest_pool_namespace = dest_side.pool_namespace;
+  status->dest_image_name = dest_side.image_name;
+  status->dest_image_id = dest_side.image_id;
+
+  // start from UNKNOWN and override for the states handled explicitly
+  status->state = RBD_IMAGE_MIGRATION_STATE_UNKNOWN;
+  switch (dest_side.state) {
+  case cls::rbd::MIGRATION_STATE_ERROR:
+    status->state = RBD_IMAGE_MIGRATION_STATE_ERROR;
+    break;
+  case cls::rbd::MIGRATION_STATE_PREPARING:
+    status->state = RBD_IMAGE_MIGRATION_STATE_PREPARING;
+    break;
+  case cls::rbd::MIGRATION_STATE_PREPARED:
+    status->state = RBD_IMAGE_MIGRATION_STATE_PREPARED;
+    break;
+  case cls::rbd::MIGRATION_STATE_EXECUTING:
+    status->state = RBD_IMAGE_MIGRATION_STATE_EXECUTING;
+    break;
+  case cls::rbd::MIGRATION_STATE_EXECUTED:
+    status->state = RBD_IMAGE_MIGRATION_STATE_EXECUTED;
+    break;
+  default:
+    break;
+  }
+
+  status->state_description = dest_side.state_description;
+
+  return 0;
+}
+
+// Writes the given state and description to the source migration header
+// and then to the destination migration header. Stops at the first failure
+// and returns its negative errno; returns 0 when both updates succeed.
+template <typename I>
+int Migration<I>::set_state(cls::rbd::MigrationState state,
+                            const std::string &description) {
+  if (int r = cls_client::migration_set_state(&m_src_io_ctx, m_src_header_oid,
+                                              state, description);
+      r < 0) {
+    lderr(m_cct) << "failed to set source migration header: " << cpp_strerror(r)
+                 << dendl;
+    return r;
+  }
+
+  if (int r = cls_client::migration_set_state(&m_dst_io_ctx, m_dst_header_oid,
+                                              state, description);
+      r < 0) {
+    lderr(m_cct) << "failed to set destination migration header: "
+                 << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Lists the source image's snapshots into *snaps and verifies that all of
+// them are in the user namespace.
+//
+// Returns 0 when every snapshot is a user snapshot, -EBUSY as soon as a
+// snapshot of any other namespace type is found, or the negative errno of a
+// failed lookup. *snaps may already be populated when an error is returned.
+template <typename I>
+int Migration<I>::list_src_snaps(std::vector<librbd::snap_info_t> *snaps) {
+  ldout(m_cct, 10) << dendl;
+
+  int r = snap_list(m_src_image_ctx, *snaps);
+  if (r < 0) {
+    lderr(m_cct) << "failed listing snapshots: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  for (auto &snap : *snaps) {
+    librbd::snap_namespace_type_t ns_type;
+    r = Snapshot<I>::get_namespace_type(m_src_image_ctx, snap.id, &ns_type);
+    if (r < 0) {
+      lderr(m_cct) << "error getting snap namespace type: " << cpp_strerror(r)
+                   << dendl;
+      return r;
+    }
+
+    if (ns_type == RBD_SNAP_NAMESPACE_TYPE_USER) {
+      continue;
+    }
+
+    // anything else blocks the migration; only the message differs
+    if (ns_type == RBD_SNAP_NAMESPACE_TYPE_TRASH) {
+      lderr(m_cct) << "image has snapshots with linked clones that must be "
+                   << "deleted or flattened before the image can be migrated"
+                   << dendl;
+    } else {
+      lderr(m_cct) << "image has non-user type snapshots "
+                   << "that are not supported by migration" << dendl;
+    }
+    return -EBUSY;
+  }
+
+  return 0;
+}
+
+// Checks that the source image's snapshots allow a migration with the
+// requested destination features.
+//
+// Fails with the error of list_src_snaps() when the snapshots themselves
+// are unsuitable. If the source has the layering feature, every snapshot's
+// children are listed; a snapshot with children requires the destination
+// features to include layering (-EINVAL otherwise). The features option
+// must already be present in the image options.
+template <typename I>
+int Migration<I>::validate_src_snaps() {
+  ldout(m_cct, 10) << dendl;
+
+  std::vector<librbd::snap_info_t> src_snaps;
+  int r = list_src_snaps(&src_snaps);
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t dst_features = 0;
+  r = m_image_options.get(RBD_IMAGE_OPTION_FEATURES, &dst_features);
+  ceph_assert(r == 0);
+
+  // without layering on the source there can be no children to check
+  if (!m_src_image_ctx->test_features(RBD_FEATURE_LAYERING)) {
+    return 0;
+  }
+
+  const bool dst_has_layering = (dst_features & RBD_FEATURE_LAYERING) != 0;
+
+  for (auto &snap : src_snaps) {
+    RWLock::RLocker snap_locker(m_src_image_ctx->snap_lock);
+    cls::rbd::ParentImageSpec parent_spec{m_src_image_ctx->md_ctx.get_id(),
+                                          m_src_image_ctx->md_ctx.get_namespace(),
+                                          m_src_image_ctx->id, snap.id};
+    std::vector<librbd::linked_image_spec_t> children;
+    r = api::Image<I>::list_children(m_src_image_ctx, parent_spec, &children);
+    if (r < 0) {
+      lderr(m_cct) << "failed listing children: " << cpp_strerror(r)
+                   << dendl;
+      return r;
+    }
+    if (children.empty()) {
+      continue;
+    }
+
+    ldout(m_cct, 1) << m_src_image_ctx->name << "@" << snap.name
+                    << " has children" << dendl;
+
+    if (!dst_has_layering) {
+      lderr(m_cct) << "can't migrate to destination without layering feature: "
+                   << "image has children" << dendl;
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+
+// Marks the source image context as ignoring the migrating state, writes
+// the SRC-type migration spec to the source header and, on success, sends
+// an update notification for the source image. Returns 0 or the negative
+// errno of the header write.
+template <typename I>
+int Migration<I>::set_migration() {
+  ldout(m_cct, 10) << dendl;
+
+  m_src_image_ctx->ignore_migrating = true;
+
+  if (int r = cls_client::migration_set(&m_src_io_ctx, m_src_header_oid,
+                                        m_src_migration_spec);
+      r < 0) {
+    lderr(m_cct) << "failed to set migration header: " << cpp_strerror(r)
+                 << dendl;
+    return r;
+  }
+
+  m_src_image_ctx->notify_update();
+  return 0;
+}
+
+// Removes the migration header from the given image and sends an update
+// notification for it. A missing header (-ENOENT) is treated as success.
+// Returns 0 or the negative errno of the removal.
+template <typename I>
+int Migration<I>::remove_migration(I *image_ctx) {
+  ldout(m_cct, 10) << dendl;
+
+  int r = cls_client::migration_remove(&image_ctx->md_ctx,
+                                       image_ctx->header_oid);
+  if (r < 0 && r != -ENOENT) {
+    lderr(m_cct) << "failed removing migration header: " << cpp_strerror(r)
+                 << dendl;
+    return r;
+  }
+
+  image_ctx->notify_update();
+  return 0;
+}
+
+// Unlinks the source image using the variant matching its on-disk format.
+template <typename I>
+int Migration<I>::unlink_src_image() {
+  return m_src_old_format ? v1_unlink_src_image() : v2_unlink_src_image();
+}
+
+// Old-format variant of unlinking: removes the source image name from the
+// tmap. Returns 0 or the negative errno of the removal.
+template <typename I>
+int Migration<I>::v1_unlink_src_image() {
+  ldout(m_cct, 10) << dendl;
+
+  const int r = tmap_rm(m_src_io_ctx, m_src_image_name);
+  if (r >= 0) {
+    return 0;
+  }
+
+  lderr(m_cct) << "failed removing " << m_src_image_name << " from tmap: "
+               << cpp_strerror(r) << dendl;
+  return r;
+}
+
+// New-format variant of unlinking: releases the exclusive lock if this
+// client currently owns it, then moves the source image to the trash with
+// the MIGRATION trash source. Returns 0 or the negative errno of the first
+// failing step.
+template <typename I>
+int Migration<I>::v2_unlink_src_image() {
+ ldout(m_cct, 10) << dendl;
+
+ // owner_lock is taken and dropped manually: it is held while the release
+ // is requested and dropped before waiting for the release to complete
+ m_src_image_ctx->owner_lock.get_read();
+ if (m_src_image_ctx->exclusive_lock != nullptr &&
+ m_src_image_ctx->exclusive_lock->is_lock_owner()) {
+ C_SaferCond ctx;
+ m_src_image_ctx->exclusive_lock->release_lock(&ctx);
+ m_src_image_ctx->owner_lock.put_read();
+ int r = ctx.wait();
+ if (r < 0) {
+ lderr(m_cct) << "error releasing exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ } else {
+ m_src_image_ctx->owner_lock.put_read();
+ }
+
+ // NOTE(review): the trailing 0 is presumably the deferment delay -- confirm
+ // against Trash<I>::move()
+ int r = Trash<I>::move(m_src_io_ctx, RBD_TRASH_IMAGE_SOURCE_MIGRATION,
+ m_src_image_ctx->name, 0);
+ if (r < 0) {
+ lderr(m_cct) << "failed moving image to trash: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+// Picks the relink routine matching the source image format.
+template <typename I>
+int Migration<I>::relink_src_image() {
+  return m_src_old_format ? v1_relink_src_image() : v2_relink_src_image();
+}
+
+// Old-format relink: re-adds the source image name via tmap_set().
+// Returns 0 on success or the negative errno from tmap_set().
+template <typename I>
+int Migration<I>::v1_relink_src_image() {
+  ldout(m_cct, 10) << dendl;
+
+  const int ret = tmap_set(m_src_io_ctx, m_src_image_name);
+  if (ret >= 0) {
+    return 0;
+  }
+
+  lderr(m_cct) << "failed adding " << m_src_image_name << " to tmap: "
+               << cpp_strerror(ret) << dendl;
+  return ret;
+}
+
+// New-format relink: restores the source image from the trash, accepting
+// only entries tagged with the migration trash source.
+// Returns 0 on success or the negative errno from Trash<I>::restore().
+template <typename I>
+int Migration<I>::v2_relink_src_image() {
+  ldout(m_cct, 10) << dendl;
+
+  const int ret = Trash<I>::restore(m_src_io_ctx,
+                                    {cls::rbd::TRASH_IMAGE_SOURCE_MIGRATION},
+                                    m_src_image_ctx->id,
+                                    m_src_image_ctx->name);
+  if (ret >= 0) {
+    return 0;
+  }
+
+  lderr(m_cct) << "failed restoring image from trash: " << cpp_strerror(ret)
+               << dendl;
+  return ret;
+}
+/*
+ * Creates the destination image and prepares it for migration.
+ *
+ * Steps, in order: create (or clone, when the source has a parent) the
+ * destination header, open it, take the exclusive lock, copy snapshots and
+ * image metadata from the source, write the destination migration header
+ * (state PREPARING), move group membership from source to destination,
+ * call set_state(PREPARED), refresh the destination and relink the
+ * source's clone children to it.
+ *
+ * Returns 0 on success or the negative errno of the first failing step.
+ */
+template <typename I>
+int Migration<I>::create_dst_image() {
+ ldout(m_cct, 10) << dendl;
+ // capture the source size and parent spec under the snap/parent read locks
+ uint64_t size;
+ cls::rbd::ParentImageSpec parent_spec;
+ {
+ RWLock::RLocker snap_locker(m_src_image_ctx->snap_lock);
+ RWLock::RLocker parent_locker(m_src_image_ctx->parent_lock);
+ size = m_src_image_ctx->size;
+
+ // use oldest snapshot or HEAD for parent spec
+ if (!m_src_image_ctx->snap_info.empty()) {
+ parent_spec = m_src_image_ctx->snap_info.begin()->second.parent.spec;
+ } else {
+ parent_spec = m_src_image_ctx->parent_md.spec;
+ }
+ }
+ // work queue handed to the create/clone request below
+ ThreadPool *thread_pool;
+ ContextWQ *op_work_queue;
+ ImageCtx::get_thread_pool_instance(m_cct, &thread_pool, &op_work_queue);
+ // start from the context-wide config, then apply destination pool overrides
+ ConfigProxy config{m_cct->_conf};
+ api::Config<I>::apply_pool_overrides(m_dst_io_ctx, &config);
+ // parent_spec.pool_id == -1 is treated as "no parent": plain create, else clone
+ int r;
+ C_SaferCond on_create;
+ librados::IoCtx parent_io_ctx;
+ if (parent_spec.pool_id == -1) {
+ auto *req = image::CreateRequest<I>::create(
+ config, m_dst_io_ctx, m_dst_image_name, m_dst_image_id, size,
+ m_image_options, "", "", true /* skip_mirror_enable */, op_work_queue,
+ &on_create);
+ req->send();
+ } else {
+ r = util::create_ioctx(m_src_image_ctx->md_ctx, "destination image",
+ parent_spec.pool_id, parent_spec.pool_namespace,
+ &parent_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ auto *req = image::CloneRequest<I>::create(
+ config, parent_io_ctx, parent_spec.image_id, "", parent_spec.snap_id,
+ m_dst_io_ctx, m_dst_image_name, m_dst_image_id, m_image_options, "", "",
+ op_work_queue, &on_create);
+ req->send();
+ }
+ // block until whichever request was sent above completes
+ r = on_create.wait();
+ if (r < 0) {
+ lderr(m_cct) << "header creation failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto dst_image_ctx = I::create(m_dst_image_name, m_dst_image_id, nullptr,
+ m_dst_io_ctx, false);
+ // NOTE(review): on open failure dst_image_ctx is not released here; presumably state->open() disposes of it - confirm
+ r = dst_image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING);
+ if (r < 0) {
+ lderr(m_cct) << "failed to open newly created header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ // from here on, every exit path closes the destination image
+ BOOST_SCOPE_EXIT_TPL(dst_image_ctx) {
+ dst_image_ctx->state->close();
+ } BOOST_SCOPE_EXIT_END;
+ // prepare for an image update and, if an exclusive lock exists, call block_requests(0) on it
+ {
+ RWLock::RLocker owner_locker(dst_image_ctx->owner_lock);
+ r = dst_image_ctx->operations->prepare_image_update(
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true);
+ if (r < 0) {
+ lderr(m_cct) << "cannot obtain exclusive lock" << dendl;
+ return r;
+ }
+ if (dst_image_ctx->exclusive_lock != nullptr) {
+ dst_image_ctx->exclusive_lock->block_requests(0);
+ }
+ }
+ // filled in by the snapshot copy and stored in the destination migration header
+ SnapSeqs snap_seqs;
+
+ C_SaferCond on_snapshot_copy;
+ auto snapshot_copy_req = librbd::deep_copy::SnapshotCopyRequest<I>::create(
+ m_src_image_ctx, dst_image_ctx, 0, CEPH_NOSNAP, 0, m_flatten,
+ m_src_image_ctx->op_work_queue, &snap_seqs, &on_snapshot_copy);
+ snapshot_copy_req->send();
+ r = on_snapshot_copy.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to copy snapshots: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ C_SaferCond on_metadata_copy;
+ auto metadata_copy_req = librbd::deep_copy::MetadataCopyRequest<I>::create(
+ m_src_image_ctx, dst_image_ctx, &on_metadata_copy);
+ metadata_copy_req->send();
+ r = on_metadata_copy.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ // destination header records the source pool/namespace/name/id, in state PREPARING
+ m_dst_migration_spec = {cls::rbd::MIGRATION_HEADER_TYPE_DST,
+ m_src_io_ctx.get_id(), m_src_io_ctx.get_namespace(),
+ m_src_image_name, m_src_image_id, snap_seqs, size,
+ m_flatten, m_mirroring,
+ cls::rbd::MIGRATION_STATE_PREPARING, ""};
+
+ r = cls_client::migration_set(&m_dst_io_ctx, m_dst_header_oid,
+ m_dst_migration_spec);
+ if (r < 0) {
+ lderr(m_cct) << "failed to set migration header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ // move group membership (if any) from the source to the destination image
+ r = update_group(m_src_image_ctx, dst_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = set_state(cls::rbd::MIGRATION_STATE_PREPARED, "");
+ if (r < 0) {
+ return r;
+ }
+
+ r = dst_image_ctx->state->refresh();
+ if (r < 0) {
+ lderr(m_cct) << "failed to refresh destination image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ // finally re-point the source's clone children at the destination image
+ r = relink_children(m_src_image_ctx, dst_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+// Looks up the group image_ctx belongs to (written to *group_info) and
+// removes the image from that group. Returns -ENOENT when the image is in
+// no group (group_info->pool == -1), 0 on success, or a negative errno.
+template <typename I>
+int Migration<I>::remove_group(I *image_ctx, group_info_t *group_info) {
+  int ret = librbd::api::Group<I>::image_get_group(image_ctx, group_info);
+  if (ret < 0) {
+    lderr(m_cct) << "failed to get image group: " << cpp_strerror(ret)
+                 << dendl;
+    return ret;
+  }
+
+  const bool in_group = (group_info->pool != -1);
+  if (!in_group) {
+    return -ENOENT;
+  }
+
+  ceph_assert(!image_ctx->id.empty());
+
+  ldout(m_cct, 10) << dendl;
+
+  IoCtx group_ioctx;
+  ret = util::create_ioctx(image_ctx->md_ctx, "group", group_info->pool, {},
+                           &group_ioctx);
+  if (ret < 0) {
+    return ret;
+  }
+
+  const char *group_name = group_info->name.c_str();
+  const char *image_id = image_ctx->id.c_str();
+  ret = librbd::api::Group<I>::image_remove_by_id(group_ioctx, group_name,
+                                                  image_ctx->md_ctx, image_id);
+  if (ret < 0) {
+    lderr(m_cct) << "failed to remove image from group: " << cpp_strerror(ret)
+                 << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Adds image_ctx (by name) to the group described by group_info. Does
+// nothing and returns 0 when group_info.pool == -1. Returns 0 on success
+// or a negative errno.
+template <typename I>
+int Migration<I>::add_group(I *image_ctx, group_info_t &group_info) {
+  const bool has_group = (group_info.pool != -1);
+  if (!has_group) {
+    return 0;
+  }
+
+  ldout(m_cct, 10) << dendl;
+
+  IoCtx group_ioctx;
+  const int ioctx_r = util::create_ioctx(image_ctx->md_ctx, "group",
+                                         group_info.pool, {}, &group_ioctx);
+  if (ioctx_r < 0) {
+    return ioctx_r;
+  }
+
+  const char *group_name = group_info.name.c_str();
+  const char *image_name = image_ctx->name.c_str();
+  const int add_r = librbd::api::Group<I>::image_add(group_ioctx, group_name,
+                                                     image_ctx->md_ctx,
+                                                     image_name);
+  if (add_r < 0) {
+    lderr(m_cct) << "failed to add image to group: " << cpp_strerror(add_r)
+                 << dendl;
+    return add_r;
+  }
+
+  return 0;
+}
+
+// Moves group membership from from_image_ctx to to_image_ctx: the first
+// image is removed from its group and the second is added to the same
+// group. An image that is in no group (-ENOENT from remove_group) is not
+// an error.
+template <typename I>
+int Migration<I>::update_group(I *from_image_ctx, I *to_image_ctx) {
+  ldout(m_cct, 10) << dendl;
+
+  group_info_t group_info;
+
+  const int remove_r = remove_group(from_image_ctx, &group_info);
+  if (remove_r == -ENOENT) {
+    return 0;
+  }
+  if (remove_r < 0) {
+    return remove_r;
+  }
+
+  const int add_r = add_group(to_image_ctx, group_info);
+  return add_r < 0 ? add_r : 0;
+}
+
+// Disables mirroring on image_ctx when the image has the journaling
+// feature and a mirror image record exists. *was_enabled is set to true
+// only if that record was in the ENABLED state. After a successful disable
+// the source migration spec is flagged with mirroring = true.
+// Returns 0 on success (including "nothing to do") or a negative errno.
+template <typename I>
+int Migration<I>::disable_mirroring(I *image_ctx, bool *was_enabled) {
+  *was_enabled = false;
+
+  if (!image_ctx->test_features(RBD_FEATURE_JOURNALING)) {
+    return 0;
+  }
+
+  cls::rbd::MirrorImage mirror_image;
+  int ret = cls_client::mirror_image_get(&image_ctx->md_ctx, image_ctx->id,
+                                         &mirror_image);
+  if (ret == -ENOENT) {
+    ldout(m_cct, 10) << "mirroring is not enabled for this image" << dendl;
+    return 0;
+  } else if (ret < 0) {
+    lderr(m_cct) << "failed to retrieve mirror image: " << cpp_strerror(ret)
+                 << dendl;
+    return ret;
+  }
+
+  *was_enabled = (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED);
+
+  ldout(m_cct, 10) << dendl;
+
+  C_SaferCond disable_ctx;
+  auto disable_req = mirror::DisableRequest<I>::create(image_ctx, false, true,
+                                                       &disable_ctx);
+  disable_req->send();
+  ret = disable_ctx.wait();
+  if (ret < 0) {
+    lderr(m_cct) << "failed to disable mirroring: " << cpp_strerror(ret)
+                 << dendl;
+    return ret;
+  }
+
+  m_src_migration_spec.mirroring = true;
+
+  return 0;
+}
+
+// Enables mirroring on image_ctx when appropriate.
+//
+// Nothing is done (and 0 is returned) when the image lacks the journaling
+// feature, when the pool mirror mode is DISABLED, or when the pool is in
+// IMAGE mode and was_enabled is false. Otherwise an EnableRequest is sent
+// and its result awaited.
+//
+// @param image_ctx    image to enable mirroring on
+// @param was_enabled  whether mirroring was enabled before migration
+// @return 0 on success or nothing to do, negative errno on failure
+template <typename I>
+int Migration<I>::enable_mirroring(I *image_ctx, bool was_enabled) {
+
+  if (!image_ctx->test_features(RBD_FEATURE_JOURNALING)) {
+    return 0;
+  }
+
+  // Start from DISABLED: -ENOENT is tolerated below, and in that case the
+  // lookup may not have written mirror_mode, so it must not be left
+  // indeterminate before it is compared.
+  cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+  int r = cls_client::mirror_mode_get(&image_ctx->md_ctx, &mirror_mode);
+  if (r < 0 && r != -ENOENT) {
+    lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+                 << dendl;
+    return r;
+  }
+
+  if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+    ldout(m_cct, 10) << "mirroring is not enabled for destination pool"
+                     << dendl;
+    return 0;
+  }
+  if (mirror_mode == cls::rbd::MIRROR_MODE_IMAGE && !was_enabled) {
+    ldout(m_cct, 10) << "mirroring is not enabled for image" << dendl;
+    return 0;
+  }
+
+  ldout(m_cct, 10) << dendl;
+
+  C_SaferCond ctx;
+  auto req = mirror::EnableRequest<I>::create(image_ctx->md_ctx, image_ctx->id,
+                                              "", image_ctx->op_work_queue,
+                                              &ctx);
+  req->send();
+  r = ctx.wait();
+  if (r < 0) {
+    lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r)
+                 << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// When relinking children we should be careful as it may be interrupted
+// at any moment for some reason and we may end up in an inconsistent
+// state, which we have to be able to fix with "migration abort". Below
+// are all possible states during migration (P1 - source parent, P2 -
+// destination parent, C - child):
+//
+//   P1  P2    P1  P2    P1  P2    P1  P2
+//   ^\         \  ^      \ /^        /^
+//    \v         v/        v/        v/
+//     C         C         C         C
+//
+//     1         2         3         4
+//
+// (1) and (4) are the initial and the final consistent states. (2)
+// and (3) are intermediate inconsistent states that have to be fixed
+// by relink_children running in "migration abort" mode. For this, it
+// scans P2 for all children attached and relinks (fixes) states (3)
+// and (4) to state (1). Then it scans P1 for remaining children and
+// fixes the states (2).
+/*
+ * Relinks the clone children of every snapshot returned by
+ * list_src_snaps() from from_image_ctx to to_image_ctx.
+ *
+ * When to_image_ctx is the source image the call runs in "migration abort"
+ * mode: snapshot ids are remapped by name onto from_image_ctx and children
+ * already listed under the source are relinked without re-attaching them.
+ *
+ * Returns 0 on success or the first negative errno encountered.
+ */
+template <typename I>
+int Migration<I>::relink_children(I *from_image_ctx, I *to_image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ std::vector<librbd::snap_info_t> snaps;
+ int r = list_src_snaps(&snaps);
+ if (r < 0) {
+ return r;
+ }
+ // relinking back onto the source image means a migration is being aborted
+ bool migration_abort = (to_image_ctx == m_src_image_ctx);
+
+ for (auto it = snaps.begin(); it != snaps.end(); it++) {
+ auto &snap = *it;
+ std::vector<librbd::linked_image_spec_t> src_child_images;
+
+ if (from_image_ctx != m_src_image_ctx) {
+ ceph_assert(migration_abort);
+
+ // We run list snaps against the src image to get only those snapshots
+ // that are migrated. If the "from" image is not the src image
+ // (abort migration case), we need to remap snap ids.
+ // Also collect the list of the children currently attached to the
+ // source, so we could make a proper decision later about relinking.
+
+ RWLock::RLocker src_snap_locker(to_image_ctx->snap_lock);
+ cls::rbd::ParentImageSpec src_parent_spec{to_image_ctx->md_ctx.get_id(),
+ to_image_ctx->md_ctx.get_namespace(),
+ to_image_ctx->id, snap.id};
+ r = api::Image<I>::list_children(to_image_ctx, src_parent_spec,
+ &src_child_images);
+ if (r < 0) {
+ lderr(m_cct) << "failed listing children: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ // snap.id is overwritten with the id of the same-named snapshot on the "from" image
+ RWLock::RLocker snap_locker(from_image_ctx->snap_lock);
+ snap.id = from_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace(),
+ snap.name);
+ if (snap.id == CEPH_NOSNAP) {
+ ldout(m_cct, 5) << "skipping snapshot " << snap.name << dendl;
+ continue;
+ }
+ }
+ // children currently listed under this snapshot of the "from" image
+ std::vector<librbd::linked_image_spec_t> child_images;
+ {
+ RWLock::RLocker snap_locker(from_image_ctx->snap_lock);
+ cls::rbd::ParentImageSpec parent_spec{from_image_ctx->md_ctx.get_id(),
+ from_image_ctx->md_ctx.get_namespace(),
+ from_image_ctx->id, snap.id};
+ r = api::Image<I>::list_children(from_image_ctx, parent_spec,
+ &child_images);
+ if (r < 0) {
+ lderr(m_cct) << "failed listing children: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+ // relink and re-attach each of them, dropping it from the list collected from the "to" image
+ for (auto &child_image : child_images) {
+ r = relink_child(from_image_ctx, to_image_ctx, snap, child_image,
+ migration_abort, true);
+ if (r < 0) {
+ return r;
+ }
+
+ src_child_images.erase(std::remove(src_child_images.begin(),
+ src_child_images.end(), child_image),
+ src_child_images.end());
+ }
+ // whatever remains is listed only under the "to" image: relink with reattach_child=false
+ for (auto &child_image : src_child_images) {
+ r = relink_child(from_image_ctx, to_image_ctx, snap, child_image,
+ migration_abort, false);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ return 0;
+}
+/*
+ * Re-points one clone child from snapshot from_snap of from_image_ctx to
+ * the same-named snapshot of to_image_ctx.
+ *
+ * The child's parent spec is rewritten via AttachParentRequest unless, in
+ * abort mode, it already names the target snapshot. When reattach_child is
+ * true an AttachChildRequest is also issued. notify_update() is then
+ * called on the child.
+ *
+ * Returns 0 on success, -ENOENT if to_image_ctx has no snapshot named
+ * from_snap.name, -ESTALE if the child's recorded parent is not the
+ * expected "from" snapshot, or another negative errno.
+ */
+template <typename I>
+int Migration<I>::relink_child(I *from_image_ctx, I *to_image_ctx,
+ const librbd::snap_info_t &from_snap,
+ const librbd::linked_image_spec_t &child_image,
+ bool migration_abort, bool reattach_child) {
+ ldout(m_cct, 10) << from_snap.name << " " << child_image.pool_name << "/"
+ << child_image.pool_namespace << "/"
+ << child_image.image_name << " (migration_abort="
+ << migration_abort << ", reattach_child=" << reattach_child
+ << ")" << dendl;
+ // resolve the target snapshot by name on the "to" image
+ librados::snap_t to_snap_id;
+ {
+ RWLock::RLocker snap_locker(to_image_ctx->snap_lock);
+ to_snap_id = to_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace(),
+ from_snap.name);
+ if (to_snap_id == CEPH_NOSNAP) {
+ lderr(m_cct) << "no snapshot " << from_snap.name << " on destination image"
+ << dendl;
+ return -ENOENT;
+ }
+ }
+ // open the child by id in its own pool/namespace, without opening its parent
+ librados::IoCtx child_io_ctx;
+ int r = util::create_ioctx(to_image_ctx->md_ctx,
+ "child image " + child_image.image_name,
+ child_image.pool_id, child_image.pool_namespace,
+ &child_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ I *child_image_ctx = I::create("", child_image.image_id, nullptr,
+ child_io_ctx, false);
+ r = child_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+ if (r < 0) {
+ lderr(m_cct) << "failed to open child image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ BOOST_SCOPE_EXIT_TPL(child_image_ctx) {
+ child_image_ctx->state->close();
+ } BOOST_SCOPE_EXIT_END;
+ // clone format 2 is assumed when the child carries the CLONE_CHILD op feature
+ uint32_t clone_format = 1;
+ if (child_image_ctx->test_op_features(RBD_OPERATION_FEATURE_CLONE_CHILD)) {
+ clone_format = 2;
+ }
+
+ cls::rbd::ParentImageSpec parent_spec;
+ uint64_t parent_overlap;
+ {
+ RWLock::RLocker snap_locker(child_image_ctx->snap_lock);
+ RWLock::RLocker parent_locker(child_image_ctx->parent_lock);
+
+ // use oldest snapshot or HEAD for parent spec
+ if (!child_image_ctx->snap_info.empty()) {
+ parent_spec = child_image_ctx->snap_info.begin()->second.parent.spec;
+ parent_overlap = child_image_ctx->snap_info.begin()->second.parent.overlap;
+ } else {
+ parent_spec = child_image_ctx->parent_md.spec;
+ parent_overlap = child_image_ctx->parent_md.overlap;
+ }
+ }
+ // in abort mode a child whose parent already is the target snapshot is left untouched
+ if (migration_abort &&
+ parent_spec.pool_id == to_image_ctx->md_ctx.get_id() &&
+ parent_spec.pool_namespace == to_image_ctx->md_ctx.get_namespace() &&
+ parent_spec.image_id == to_image_ctx->id &&
+ parent_spec.snap_id == to_snap_id) {
+ ldout(m_cct, 10) << "no need for parent re-attach" << dendl;
+ } else {
+ if (parent_spec.pool_id != from_image_ctx->md_ctx.get_id() ||
+ parent_spec.pool_namespace != from_image_ctx->md_ctx.get_namespace() ||
+ parent_spec.image_id != from_image_ctx->id ||
+ parent_spec.snap_id != from_snap.id) {
+ lderr(m_cct) << "parent is not source image: " << parent_spec.pool_id
+ << "/" << parent_spec.pool_namespace << "/"
+ << parent_spec.image_id << "@" << parent_spec.snap_id
+ << dendl;
+ return -ESTALE;
+ }
+ // rewrite the parent spec to the target image/snapshot, keeping the overlap
+ parent_spec.pool_id = to_image_ctx->md_ctx.get_id();
+ parent_spec.pool_namespace = to_image_ctx->md_ctx.get_namespace();
+ parent_spec.image_id = to_image_ctx->id;
+ parent_spec.snap_id = to_snap_id;
+
+ C_SaferCond on_reattach_parent;
+ auto reattach_parent_req = image::AttachParentRequest<I>::create(
+ *child_image_ctx, parent_spec, parent_overlap, true, &on_reattach_parent);
+ reattach_parent_req->send();
+ r = on_reattach_parent.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to re-attach parent: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ // optionally attach the child to the target snapshot (the request also receives the "from" image/snapshot)
+ if (reattach_child) {
+ C_SaferCond on_reattach_child;
+ auto reattach_child_req = image::AttachChildRequest<I>::create(
+ child_image_ctx, to_image_ctx, to_snap_id, from_image_ctx, from_snap.id,
+ clone_format, &on_reattach_child);
+ reattach_child_req->send();
+ r = on_reattach_child.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to re-attach child: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ child_image_ctx->notify_update();
+
+ return 0;
+}
+
+// Deletes the source image: its snapshots are removed in reverse listing
+// order (unprotecting where needed), then the image itself is removed and
+// its entry is dropped from the trash. m_src_image_ctx is reset to nullptr
+// once the remove request has completed. A failure to remove the trash
+// entry is logged but does not fail the call.
+template <typename I>
+int Migration<I>::remove_src_image() {
+  ldout(m_cct, 10) << dendl;
+
+  std::vector<librbd::snap_info_t> snaps;
+  int ret = list_src_snaps(&snaps);
+  if (ret < 0) {
+    return ret;
+  }
+
+  for (auto snap_it = snaps.rbegin(); snap_it != snaps.rend(); ++snap_it) {
+    librbd::NoOpProgressContext no_op_prog_ctx;
+    ret = snap_remove(m_src_image_ctx, snap_it->name.c_str(),
+                      RBD_SNAP_REMOVE_UNPROTECT, no_op_prog_ctx);
+    if (ret < 0) {
+      lderr(m_cct) << "failed removing source image snapshot '"
+                   << snap_it->name << "': " << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+  }
+
+  ceph_assert(m_src_image_ctx->ignore_migrating);
+
+  ThreadPool *thread_pool;
+  ContextWQ *op_work_queue;
+  ImageCtx::get_thread_pool_instance(m_cct, &thread_pool, &op_work_queue);
+
+  C_SaferCond remove_ctx;
+  auto remove_req = librbd::image::RemoveRequest<I>::create(
+    m_src_io_ctx, m_src_image_ctx, false, true, *m_prog_ctx, op_work_queue,
+    &remove_ctx);
+  remove_req->send();
+  ret = remove_ctx.wait();
+
+  m_src_image_ctx = nullptr;
+
+  // For old format image it will return -ENOENT due to expected
+  // tmap_rm failure at the end.
+  const bool remove_failed = (ret < 0 && ret != -ENOENT);
+  if (remove_failed) {
+    lderr(m_cct) << "failed removing source image: " << cpp_strerror(ret)
+                 << dendl;
+    return ret;
+  }
+
+  if (m_src_image_id.empty()) {
+    return 0;
+  }
+
+  ret = cls_client::trash_remove(&m_src_io_ctx, m_src_image_id);
+  if (ret < 0 && ret != -ENOENT) {
+    lderr(m_cct) << "error removing image " << m_src_image_id
+                 << " from rbd_trash object" << dendl;
+  }
+
+  return 0;
+}
+
+// Runs an ImageCopyRequest from src_image_ctx to dst_image_ctx covering the
+// HEAD revision only. The migration header is read from src_image_ctx and
+// must be of type DST (else -EINVAL); the copy starts from the last entry
+// of its snap_seqs mapping, or from 0 when that mapping is empty.
+// Returns 0 on success or a negative errno.
+template <typename I>
+int Migration<I>::revert_data(I* src_image_ctx, I* dst_image_ctx,
+                              ProgressContext* prog_ctx) {
+  ldout(m_cct, 10) << dendl;
+
+  cls::rbd::MigrationSpec migration_spec;
+  int ret = cls_client::migration_get(&src_image_ctx->md_ctx,
+                                      src_image_ctx->header_oid,
+                                      &migration_spec);
+  if (ret < 0) {
+    lderr(m_cct) << "failed retrieving migration header: "
+                 << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  const bool is_dst_header =
+    (migration_spec.header_type == cls::rbd::MIGRATION_HEADER_TYPE_DST);
+  if (!is_dst_header) {
+    lderr(m_cct) << "unexpected migration header type: "
+                 << migration_spec.header_type << dendl;
+    return -EINVAL;
+  }
+
+  uint64_t src_snap_id_start = 0;
+  if (!migration_spec.snap_seqs.empty()) {
+    src_snap_id_start = migration_spec.snap_seqs.rbegin()->second;
+  }
+  uint64_t src_snap_id_end = CEPH_NOSNAP;
+  uint64_t dst_snap_id_start = 0;
+
+  // we only care about the HEAD revision so only add a single mapping to
+  // represent the most recent state
+  SnapSeqs snap_seqs;
+  snap_seqs[CEPH_NOSNAP] = CEPH_NOSNAP;
+
+  ldout(m_cct, 20) << "src_snap_id_start=" << src_snap_id_start << ", "
+                   << "src_snap_id_end=" << src_snap_id_end << ", "
+                   << "snap_seqs=" << snap_seqs << dendl;
+
+  C_SaferCond copy_ctx;
+  auto copy_req = deep_copy::ImageCopyRequest<I>::create(
+    src_image_ctx, dst_image_ctx, src_snap_id_start, src_snap_id_end,
+    dst_snap_id_start, false, {}, snap_seqs, prog_ctx, &copy_ctx);
+  copy_req->send();
+
+  ret = copy_ctx.wait();
+  if (ret < 0) {
+    lderr(m_cct) << "error reverting destination image data blocks back to "
+                 << "source image: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Migration<librbd::ImageCtx>;
diff --git a/src/librbd/api/Migration.h b/src/librbd/api/Migration.h
new file mode 100644
index 00000000..52a4517b
--- /dev/null
+++ b/src/librbd/api/Migration.h
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_MIGRATION_H
+#define CEPH_LIBRBD_API_MIGRATION_H
+
+#include "include/int_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+
+#include <vector>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+// RBD image migration API: static prepare/execute/abort/commit/status entry points; the constructor and per-step helpers are private.
+template <typename ImageCtxT = librbd::ImageCtx>
+class Migration {
+public:
+ static int prepare(librados::IoCtx& io_ctx, const std::string &image_name,
+ librados::IoCtx& dest_io_ctx,
+ const std::string &dest_image_name, ImageOptions& opts);
+ static int execute(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext &prog_ctx);
+ static int abort(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext &prog_ctx);
+ static int commit(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext &prog_ctx);
+ static int status(librados::IoCtx& io_ctx, const std::string &image_name,
+ image_migration_status_t *status);
+ // per-migration state used by the step helpers below
+private:
+ CephContext* m_cct;
+ ImageCtxT *m_src_image_ctx; // reset to nullptr by remove_src_image()
+ librados::IoCtx m_src_io_ctx;
+ librados::IoCtx &m_dst_io_ctx; // reference member: refers to the caller-supplied IoCtx
+ bool m_src_old_format; // selects the v1_* vs v2_* unlink/relink paths
+ std::string m_src_image_name;
+ std::string m_src_image_id;
+ std::string m_src_header_oid;
+ std::string m_dst_image_name;
+ std::string m_dst_image_id;
+ std::string m_dst_header_oid;
+ ImageOptions &m_image_options; // reference member: passed to the create/clone request
+ bool m_flatten; // recorded in the destination migration header
+ bool m_mirroring; // recorded in the destination migration header
+ ProgressContext *m_prog_ctx; // dereferenced by remove_src_image()
+ // migration headers written to the source / destination header objects
+ cls::rbd::MigrationSpec m_src_migration_spec;
+ cls::rbd::MigrationSpec m_dst_migration_spec;
+
+ Migration(ImageCtxT *image_ctx, librados::IoCtx& dest_io_ctx,
+ const std::string &dest_image_name, const std::string &dst_image_id,
+ ImageOptions& opts, bool flatten, bool mirroring,
+ cls::rbd::MigrationState state, const std::string &state_desc,
+ ProgressContext *prog_ctx);
+ // instance-level operations sharing the names of the static entry points
+ int prepare();
+ int execute();
+ int abort();
+ int commit();
+ int status(image_migration_status_t *status);
+
+ int set_state(cls::rbd::MigrationState state, const std::string &description);
+ // individual migration steps; each returns 0 or a negative errno
+ int list_src_snaps(std::vector<librbd::snap_info_t> *snaps);
+ int validate_src_snaps();
+ int disable_mirroring(ImageCtxT *image_ctx, bool *was_enabled);
+ int enable_mirroring(ImageCtxT *image_ctx, bool was_enabled);
+ int set_migration();
+ int unlink_src_image();
+ int relink_src_image();
+ int create_dst_image();
+ int remove_group(ImageCtxT *image_ctx, group_info_t *group_info);
+ int add_group(ImageCtxT *image_ctx, group_info_t &group_info);
+ int update_group(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx);
+ int remove_migration(ImageCtxT *image_ctx);
+ int relink_children(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx);
+ int remove_src_image();
+ // NOTE(review): no v1_set_migration()/v2_set_migration() definitions are visible next to set_migration() in Migration.cc - confirm these two declarations are still needed
+ int v1_set_migration();
+ int v2_set_migration();
+ int v1_unlink_src_image();
+ int v2_unlink_src_image();
+ int v1_relink_src_image();
+ int v2_relink_src_image();
+ // relinks a single clone child; called by relink_children()
+ int relink_child(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx,
+ const librbd::snap_info_t &src_snap,
+ const librbd::linked_image_spec_t &child_image,
+ bool migration_abort, bool reattach_child);
+ // copies the HEAD revision from src_image_ctx to dst_image_ctx via deep_copy::ImageCopyRequest
+ int revert_data(ImageCtxT* src_image_ctx, ImageCtxT* dst_image_ctx,
+ ProgressContext *prog_ctx);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Migration<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_MIGRATION_H
diff --git a/src/librbd/api/Mirror.cc b/src/librbd/api/Mirror.cc
new file mode 100644
index 00000000..4e7b7dc8
--- /dev/null
+++ b/src/librbd/api/Mirror.cc
@@ -0,0 +1,1541 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Mirror.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "common/ceph_json.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Image.h"
+#include "librbd/mirror/DemoteRequest.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "librbd/mirror/GetStatusRequest.h"
+#include "librbd/mirror/PromoteRequest.h"
+#include "librbd/mirror/Types.h"
+#include "librbd/MirroringWatcher.h"
+#include <boost/algorithm/string/trim.hpp>
+#include <boost/algorithm/string/replace.hpp>
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Mirror: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+// Issues a "config-key get" monitor command for `key` and stores the reply
+// payload in *value. -EINVAL from the monitor is reported as -EOPNOTSUPP.
+// -ENOENT is not treated as an error: 0 is returned and *value holds
+// whatever the (possibly empty) reply buffer contains.
+int get_config_key(librados::Rados& rados, const std::string& key,
+                   std::string* value) {
+  const std::string command =
+    "{"
+      "\"prefix\": \"config-key get\", "
+      "\"key\": \"" + key + "\""
+    "}";
+
+  bufferlist request_bl;
+  bufferlist reply_bl;
+  const int ret = rados.mon_command(command, request_bl, &reply_bl, nullptr);
+  if (ret == -EINVAL) {
+    return -EOPNOTSUPP;
+  }
+  if (ret < 0 && ret != -ENOENT) {
+    return ret;
+  }
+
+  *value = reply_bl.to_str();
+  return 0;
+}
+
+// Stores `value` under `key` with a "config-key set" monitor command; an
+// empty value issues "config-key rm" instead. -EINVAL from the monitor is
+// reported as -EOPNOTSUPP; other negative results are returned unchanged.
+// NOTE(review): key and value are spliced into the JSON command without
+// escaping - confirm callers never pass quotes or backslashes.
+int set_config_key(librados::Rados& rados, const std::string& key,
+                   const std::string& value) {
+  const std::string command = value.empty()
+    ? "{\"prefix\": \"config-key rm\", \"key\": \"" + key + "\"}"
+    : "{\"prefix\": \"config-key set\", \"key\": \"" + key +
+        "\", \"val\": \"" + value + "\"}";
+
+  bufferlist request_bl;
+  bufferlist reply_bl;
+  const int ret = rados.mon_command(command, request_bl, &reply_bl, nullptr);
+  if (ret == -EINVAL) {
+    return -EOPNOTSUPP;
+  }
+  return ret < 0 ? ret : 0;
+}
+
+// Builds the config-key name for a mirror peer:
+// <RBD_MIRROR_PEER_CONFIG_KEY_PREFIX><pool_id>/<peer_uuid>
+std::string get_peer_config_key_name(int64_t pool_id,
+                                     const std::string& peer_uuid) {
+  std::string key_name = RBD_MIRROR_PEER_CONFIG_KEY_PREFIX +
+                         stringify(pool_id);
+  key_name += "/";
+  key_name += peer_uuid;
+  return key_name;
+}
+
+// Removes the config-key entry of the given peer in io_ctx's pool by
+// setting it to the empty string. -ENOENT and -EPERM are ignored.
+int remove_peer_config_key(librados::IoCtx& io_ctx,
+                           const std::string& peer_uuid) {
+  const auto key = get_peer_config_key_name(io_ctx.get_id(), peer_uuid);
+
+  librados::Rados rados(io_ctx);
+  const int ret = set_config_key(rados, key, "");
+  const bool ignorable = (ret == -ENOENT || ret == -EPERM);
+  if (ret < 0 && !ignorable) {
+    return ret;
+  }
+  return 0;
+}
+
+// Looks up (or creates) the RBD mirroring bootstrap CephX user and returns
+// its key.
+//
+// The peer client id is read from the peer-client-id config-key; when the
+// stored value is empty it is set to "rbd-mirror-peer" and written back.
+// The user is then created with "auth get-or-create" and the key is
+// extracted from the JSON reply.
+//
+// @param cct             context used for logging
+// @param rados           cluster handle the monitor commands are sent to
+// @param peer_client_id  out: client id (without the "client." prefix)
+// @param cephx_key       out: key of the bootstrap user
+// @return 0 on success; -EACCES on insufficient permissions; -EEXIST when
+//         the monitor reports -EINVAL for the existing user; -EBADMSG when
+//         the reply is not the expected JSON; other negative errno otherwise
+int create_bootstrap_user(CephContext* cct, librados::Rados& rados,
+                          std::string* peer_client_id, std::string* cephx_key) {
+  ldout(cct, 20) << dendl;
+
+  // retrieve peer CephX user from config-key
+  int r = get_config_key(rados, RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY,
+                         peer_client_id);
+  if (r == -EACCES) {
+    ldout(cct, 5) << "insufficient permissions to get peer-client-id "
+                  << "config-key" << dendl;
+    return r;
+  } else if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to retrieve peer client id key: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  } else if (r == -ENOENT || peer_client_id->empty()) {
+    ldout(cct, 20) << "creating new peer-client-id config-key" << dendl;
+
+    *peer_client_id = "rbd-mirror-peer";
+    r = set_config_key(rados, RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY,
+                       *peer_client_id);
+    if (r == -EACCES) {
+      ldout(cct, 5) << "insufficient permissions to update peer-client-id "
+                    << "config-key" << dendl;
+      return r;
+    } else if (r < 0) {
+      lderr(cct) << "failed to update peer client id key: "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+  ldout(cct, 20) << "peer_client_id=" << *peer_client_id << dendl;
+
+  // create peer client user
+  std::string cmd =
+    R"({)" \
+    R"( "prefix": "auth get-or-create",)" \
+    R"( "entity": "client.)" + *peer_client_id + R"(",)" \
+    R"( "caps": [)" \
+    R"( "mon", "profile rbd-mirror-peer",)" \
+    R"( "osd", "profile rbd"],)" \
+    R"( "format": "json")" \
+    R"(})";
+
+  bufferlist in_bl;
+  bufferlist out_bl;
+
+  r = rados.mon_command(cmd, in_bl, &out_bl, nullptr);
+  if (r == -EINVAL) {
+    ldout(cct, 5) << "caps mismatch for existing user" << dendl;
+    return -EEXIST;
+  } else if (r == -EACCES) {
+    ldout(cct, 5) << "insufficient permissions to create user" << dendl;
+    return r;
+  } else if (r < 0) {
+    lderr(cct) << "failed to create or update RBD mirroring bootstrap user: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // extract key from response
+  bool json_valid = false;
+  json_spirit::mValue json_root;
+  if(json_spirit::read(out_bl.to_str(), json_root)) {
+    try {
+      // An empty array has to be rejected explicitly: indexing element 0 of
+      // an empty array is undefined behaviour, not an exception that the
+      // handler below could catch.
+      auto& json_array = json_root.get_array();
+      if (!json_array.empty()) {
+        auto& json_obj = json_array[0].get_obj();
+        *cephx_key = json_obj["key"].get_str();
+        json_valid = true;
+      }
+    } catch (std::runtime_error&) {
+    }
+  }
+
+  if (!json_valid) {
+    lderr(cct) << "invalid auth keyring JSON received" << dendl;
+    return -EBADMSG;
+  }
+
+  return 0;
+}
+
+// Makes sure the pool behind io_ctx has a mirror peer for the remote site
+// and stores the connection attributes (mon_host, key) on it.
+//
+// With no peer registered, one is added for site_name with client
+// "client.<client_id>". With a peer registered, its cluster name must be
+// either site_name or fsid (otherwise -EINVAL); when it is not site_name,
+// renaming it to site_name is attempted and a failure is only logged.
+// cluster1/cluster2 are used in log messages only.
+int create_bootstrap_peer(CephContext* cct, librados::IoCtx& io_ctx,
+                          const std::string& site_name, const std::string& fsid,
+                          const std::string& client_id, const std::string& key,
+                          const std::string& mon_host,
+                          const std::string& cluster1,
+                          const std::string& cluster2) {
+  ldout(cct, 20) << dendl;
+
+  std::vector<mirror_peer_t> peers;
+  int ret = Mirror<>::peer_list(io_ctx, &peers);
+  if (ret < 0 && ret != -ENOENT) {
+    lderr(cct) << "failed to list mirror peers: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  std::string peer_uuid;
+  if (peers.empty()) {
+    ret = Mirror<>::peer_add(io_ctx, &peer_uuid, site_name,
+                             "client." + client_id);
+    if (ret < 0) {
+      lderr(cct) << "failed to add " << cluster1 << " peer to "
+                 << cluster2 << " " << "cluster: " << cpp_strerror(ret)
+                 << dendl;
+      return ret;
+    }
+  } else {
+    const auto& existing_peer = peers[0];
+    const bool site_name_matches = (existing_peer.cluster_name == site_name);
+    if (!site_name_matches && existing_peer.cluster_name != fsid) {
+      // only support a single peer
+      lderr(cct) << "multiple peers are not currently supported" << dendl;
+      return -EINVAL;
+    }
+
+    peer_uuid = existing_peer.uuid;
+    if (!site_name_matches) {
+      ret = Mirror<>::peer_set_cluster(io_ctx, peer_uuid, site_name);
+      if (ret < 0) {
+        // non-fatal attempt to update site name
+        lderr(cct) << "failed to update peer site name" << dendl;
+      }
+    }
+  }
+
+  Mirror<>::Attributes attributes {
+    {"mon_host", mon_host},
+    {"key", key}};
+  ret = Mirror<>::peer_set_attributes(io_ctx, peer_uuid, attributes);
+  if (ret < 0) {
+    lderr(cct) << "failed to update " << cluster1 << " cluster connection "
+               << "attributes in " << cluster2 << " cluster: "
+               << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Returns 0 when the mirror image record of ictx is in the ENABLED state,
+// -EINVAL when it is not, or the negative errno of the lookup. A lookup
+// result of -ENOENT is not returned directly; the state check then runs on
+// the default-constructed record.
+template <typename I>
+int validate_mirroring_enabled(I *ictx) {
+  CephContext *cct = ictx->cct;
+
+  cls::rbd::MirrorImage mirror_image_internal;
+  const int ret = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id,
+                                               &mirror_image_internal);
+  if (ret < 0 && ret != -ENOENT) {
+    lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(ret)
+               << dendl;
+    return ret;
+  }
+
+  const bool enabled =
+    (mirror_image_internal.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED);
+  if (!enabled) {
+    lderr(cct) << "mirroring is not currently enabled" << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+// Collects the keys of all entries in the pool's mirror image directory
+// into mirror_image_ids, reading it in pages of 1024 entries. A page
+// shorter than that ends the listing; -ENOENT from the listing call is
+// not treated as an error.
+int list_mirror_images(librados::IoCtx& io_ctx,
+                       std::set<std::string>& mirror_image_ids) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+  const int max_read = 1024;
+  std::string last_read = "";
+  while (true) {
+    std::map<std::string, std::string> page;
+    const int ret = cls_client::mirror_image_list(&io_ctx, last_read,
+                                                  max_read, &page);
+    if (ret < 0 && ret != -ENOENT) {
+      lderr(cct) << "error listing mirrored image directory: "
+                 << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+
+    for (const auto& entry : page) {
+      mirror_image_ids.insert(entry.first);
+    }
+    if (!page.empty()) {
+      // resume after the last key seen
+      last_read = page.rbegin()->first;
+    }
+
+    const int page_size = page.size();
+    if (page_size != max_read) {
+      break;
+    }
+  }
+
+  return 0;
+}
+
+// Completion that converts the internal mirror image record and promotion
+// state (both filled in by whoever runs the request) into the public
+// mirror_image_info_t, then completes on_finish. Errors are forwarded to
+// on_finish without touching *mirror_image_info.
+struct C_ImageGetInfo : public Context {
+  mirror_image_info_t *mirror_image_info;
+  Context *on_finish;
+
+  cls::rbd::MirrorImage mirror_image;
+  mirror::PromotionState promotion_state = mirror::PROMOTION_STATE_PRIMARY;
+
+  C_ImageGetInfo(mirror_image_info_t *mirror_image_info, Context *on_finish)
+    : mirror_image_info(mirror_image_info), on_finish(on_finish) {
+  }
+
+  void finish(int r) override {
+    if (r >= 0) {
+      const bool is_primary =
+        (promotion_state == mirror::PROMOTION_STATE_PRIMARY);
+
+      mirror_image_info->global_id = mirror_image.global_image_id;
+      mirror_image_info->state = static_cast<rbd_mirror_image_state_t>(
+        mirror_image.state);
+      mirror_image_info->primary = is_primary;
+      r = 0;
+    }
+
+    on_finish->complete(r);
+  }
+};
+
+// Extends C_ImageGetInfo: additionally copies the internal mirror image
+// status into the public mirror_image_status_t (whose embedded `info`
+// member is filled in by the base class).
+struct C_ImageGetStatus : public C_ImageGetInfo {
+  std::string image_name;
+  mirror_image_status_t *mirror_image_status;  // caller-provided output
+
+  cls::rbd::MirrorImageStatus mirror_image_status_internal;
+
+  C_ImageGetStatus(const std::string &image_name,
+                   mirror_image_status_t *mirror_image_status,
+                   Context *on_finish)
+    : C_ImageGetInfo(&mirror_image_status->info, on_finish),
+      image_name(image_name), mirror_image_status(mirror_image_status) {
+  }
+
+  // Errors bypass the conversion and go straight to `on_finish`; on success
+  // the status fields are copied and the base class finishes the job.
+  void finish(int r) override {
+    if (r < 0) {
+      on_finish->complete(r);
+      return;
+    }
+
+    auto& internal = mirror_image_status_internal;
+    auto& status = *mirror_image_status;
+    status.name = image_name;
+    status.state = static_cast<mirror_image_status_state_t>(internal.state);
+    status.description = internal.description;
+    status.last_update = internal.last_update.sec();
+    status.up = internal.up;
+
+    C_ImageGetInfo::finish(0);
+  }
+};
+
+} // anonymous namespace
+
+// Enables mirroring for a single image and waits for the result.
+//
+// Checks performed before the enable request is issued, each failing with
+// -EINVAL:
+//  - the image's IoCtx must not use a namespace,
+//  - the pool mirror mode must be MIRROR_MODE_IMAGE,
+//  - a parent image, if present, must pass the parent check below,
+//  - the image must have the journaling feature.
+//
+// relax_same_pool_parent_check: when true and the parent is in the same
+// pool as the image, the parent only needs the journaling feature instead
+// of an existing mirror image record.
+//
+// Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::image_enable(I *ictx, bool relax_same_pool_parent_check) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ // TODO
+ if (!ictx->md_ctx.get_namespace().empty()) {
+ lderr(cct) << "namespaces are not supported" << dendl;
+ return -EINVAL;
+ }
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::MirrorMode mirror_mode;
+ r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode);
+ if (r < 0) {
+ lderr(cct) << "cannot enable mirroring: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+ lderr(cct) << "cannot enable mirroring in the current pool mirroring mode"
+ << dendl;
+ return -EINVAL;
+ }
+
+ // is mirroring not enabled for the parent?
+ {
+ RWLock::RLocker l(ictx->parent_lock);
+ ImageCtx *parent = ictx->parent;
+ if (parent) {
+ if (relax_same_pool_parent_check &&
+ parent->md_ctx.get_id() == ictx->md_ctx.get_id()) {
+ // relaxed check: same-pool parent only needs journaling
+ if (!parent->test_features(RBD_FEATURE_JOURNALING)) {
+ lderr(cct) << "journaling is not enabled for the parent" << dendl;
+ return -EINVAL;
+ }
+ } else {
+ // strict check: the parent must have a mirror image record
+ cls::rbd::MirrorImage mirror_image_internal;
+ r = cls_client::mirror_image_get(&(parent->md_ctx), parent->id,
+ &mirror_image_internal);
+ // NOTE(review): only -ENOENT is treated as a failure here; any other
+ // error from mirror_image_get is ignored -- confirm this is intended
+ if (r == -ENOENT) {
+ lderr(cct) << "mirroring is not enabled for the parent" << dendl;
+ return -EINVAL;
+ }
+ }
+ }
+ }
+
+ if ((ictx->features & RBD_FEATURE_JOURNALING) == 0) {
+ lderr(cct) << "cannot enable mirroring: journaling is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ // issue the enable request and block until it completes
+ C_SaferCond ctx;
+ auto req = mirror::EnableRequest<ImageCtx>::create(ictx, &ctx);
+ req->send();
+
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "cannot enable mirroring: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+// Disables mirroring for a single image and waits for the result.
+//
+// Only permitted while the pool mirror mode is MIRROR_MODE_IMAGE (-EINVAL
+// otherwise).  If the image has no mirror image record the call is a no-op
+// that returns 0.  Otherwise the record is first flipped to DISABLING; if
+// any later step fails the scope-exit guard attempts to flip it back to
+// ENABLED.  The disable is refused with -EBUSY when the mirror image lookup
+// for any child of any snapshot returns something other than -ENOENT.
+//
+// force: forwarded to mirror::DisableRequest.
+//
+// Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::image_disable(I *ictx, bool force) {
+  CephContext *cct = ictx->cct;
+  ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+  int r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  cls::rbd::MirrorMode mirror_mode;
+  r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode);
+  if (r < 0) {
+    lderr(cct) << "cannot disable mirroring: failed to retrieve pool "
+      "mirroring mode: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+    lderr(cct) << "cannot disable mirroring in the current pool mirroring "
+      "mode" << dendl;
+    return -EINVAL;
+  }
+
+  // is mirroring enabled for the child?
+  cls::rbd::MirrorImage mirror_image_internal;
+  r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id,
+                                   &mirror_image_internal);
+  if (r == -ENOENT) {
+    // mirroring is not enabled for this image
+    ldout(cct, 20) << "ignoring disable command: mirroring is not enabled for "
+                   << "this image" << dendl;
+    return 0;
+  } else if (r == -EOPNOTSUPP) {
+    ldout(cct, 5) << "mirroring not supported by OSD" << dendl;
+    return r;
+  } else if (r < 0) {
+    lderr(cct) << "failed to retrieve mirror image metadata: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // mark the image as DISABLING before doing any further checks
+  mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
+  r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id,
+                                   mirror_image_internal);
+  if (r < 0) {
+    lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
+    return r;
+  } else {
+    // on any failure below, try to restore the ENABLED state on scope exit
+    bool rollback = false;
+    BOOST_SCOPE_EXIT_ALL(ictx, &mirror_image_internal, &rollback) {
+      if (rollback) {
+        CephContext *cct = ictx->cct;
+        mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_ENABLED;
+        int r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id,
+                                             mirror_image_internal);
+        if (r < 0) {
+          lderr(cct) << "failed to re-enable image mirroring: "
+                     << cpp_strerror(r) << dendl;
+        }
+      }
+    };
+
+    {
+      RWLock::RLocker l(ictx->snap_lock);
+      map<librados::snap_t, SnapInfo> snap_info = ictx->snap_info;
+      for (auto &info : snap_info) {
+        cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(),
+                                              ictx->md_ctx.get_namespace(),
+                                              ictx->id, info.first};
+        std::vector<librbd::linked_image_spec_t> child_images;
+        r = Image<I>::list_children(ictx, parent_spec, &child_images);
+        if (r < 0) {
+          rollback = true;
+          return r;
+        }
+
+        if (child_images.empty()) {
+          continue;
+        }
+
+        // reuse the child IoCtx while consecutive children share the same
+        // pool and namespace
+        librados::IoCtx child_io_ctx;
+        int64_t child_pool_id = -1;
+        for (auto &child_image : child_images) {
+          if (child_pool_id == -1 ||
+              child_pool_id != child_image.pool_id ||
+              child_io_ctx.get_namespace() != child_image.pool_namespace) {
+            r = util::create_ioctx(ictx->md_ctx, "child image",
+                                   child_image.pool_id,
+                                   child_image.pool_namespace,
+                                   &child_io_ctx);
+            if (r < 0) {
+              rollback = true;
+              return r;
+            }
+
+            child_pool_id = child_image.pool_id;
+          }
+
+          // NOTE(review): any result other than -ENOENT (including lookup
+          // errors) is reported as "mirroring enabled" -- confirm intended
+          cls::rbd::MirrorImage child_mirror_image;
+          r = cls_client::mirror_image_get(&child_io_ctx, child_image.image_id,
+                                           &child_mirror_image);
+          if (r != -ENOENT) {
+            rollback = true;
+            lderr(cct) << "mirroring is enabled on one or more children "
+                       << dendl;
+            return -EBUSY;
+          }
+        }
+      }
+    }
+
+    // issue the disable request and block until it completes
+    C_SaferCond ctx;
+    auto req = mirror::DisableRequest<ImageCtx>::create(ictx, force, true,
+                                                        &ctx);
+    req->send();
+
+    r = ctx.wait();
+    if (r < 0) {
+      lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
+      rollback = true;
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+// Synchronous wrapper around the asynchronous image_promote(): issues the
+// promotion and blocks until it completes.
+//
+// force: forwarded to the asynchronous overload.
+// Returns 0 on success or the negative error code of the promotion.
+template <typename I>
+int Mirror<I>::image_promote(I *ictx, bool force) {
+  CephContext *cct = ictx->cct;
+
+  C_SaferCond ctx;
+  Mirror<I>::image_promote(ictx, force, &ctx);
+  int r = ctx.wait();
+  if (r < 0) {
+    // include the error code, consistent with the other failure logs here
+    lderr(cct) << "failed to promote image: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Asynchronous promotion: creates a mirror::PromoteRequest for the image
+// and sends it; `on_finish` is handed to the request.
+template <typename I>
+void Mirror<I>::image_promote(I *ictx, bool force, Context *on_finish) {
+  CephContext *cct = ictx->cct;
+  ldout(cct, 20) << "ictx=" << ictx << ", force=" << force << dendl;
+
+  mirror::PromoteRequest<>::create(*ictx, force, on_finish)->send();
+}
+
+// Synchronous wrapper around the asynchronous image_demote(): issues the
+// demotion and blocks until it completes.
+//
+// Returns 0 on success or the negative error code of the demotion.
+template <typename I>
+int Mirror<I>::image_demote(I *ictx) {
+  CephContext *cct = ictx->cct;
+
+  C_SaferCond ctx;
+  Mirror<I>::image_demote(ictx, &ctx);
+  int r = ctx.wait();
+  if (r < 0) {
+    // include the error code, consistent with the other failure logs here
+    lderr(cct) << "failed to demote image: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Asynchronous demotion: creates a mirror::DemoteRequest for the image and
+// sends it; `on_finish` is handed to the request.
+template <typename I>
+void Mirror<I>::image_demote(I *ictx, Context *on_finish) {
+  CephContext *cct = ictx->cct;
+  ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+  mirror::DemoteRequest<>::create(*ictx, on_finish)->send();
+}
+
+// Requests a resync of the local image via its journal.
+//
+// Fails with -EINVAL when this image is the tag owner (reported as
+// "primary"), since it cannot resync to itself.  Returns 0 on success or a
+// negative error code from the refresh, the ownership query or the resync
+// request.
+template <typename I>
+int Mirror<I>::image_resync(I *ictx) {
+  CephContext *cct = ictx->cct;
+  ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+  int r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  // determine whether this image currently owns the journal tag
+  bool owns_tag;
+  C_SaferCond owner_cond;
+  Journal<I>::is_tag_owner(ictx, &owns_tag, &owner_cond);
+  r = owner_cond.wait();
+  if (r < 0) {
+    lderr(cct) << "failed to determine tag ownership: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+  if (owns_tag) {
+    lderr(cct) << "image is primary, cannot resync to itself" << dendl;
+    return -EINVAL;
+  }
+
+  // flag the journal indicating that we want to rebuild the local image
+  r = Journal<I>::request_resync(ictx);
+  if (r < 0) {
+    lderr(cct) << "failed to request resync: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Asynchronously retrieves the mirroring info of an image.
+//
+// If the image state reports that a refresh is required, the image is
+// refreshed first; otherwise the continuation runs immediately.  The
+// continuation issues a mirror::GetInfoRequest whose results are converted
+// into `mirror_image_info` by C_ImageGetInfo, which then completes
+// `on_finish`.  A failed refresh completes `on_finish` with the error.
+template <typename I>
+void Mirror<I>::image_get_info(I *ictx, mirror_image_info_t *mirror_image_info,
+ Context *on_finish) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ // continuation executed once the (optional) refresh has finished
+ auto on_refresh = new FunctionContext(
+ [ictx, mirror_image_info, on_finish](int r) {
+ if (r < 0) {
+ lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ // the request writes into the context's members, and the context is
+ // also the request's completion callback
+ auto ctx = new C_ImageGetInfo(mirror_image_info, on_finish);
+ auto req = mirror::GetInfoRequest<I>::create(*ictx, &ctx->mirror_image,
+ &ctx->promotion_state,
+ ctx);
+ req->send();
+ });
+
+ if (ictx->state->is_refresh_required()) {
+ ictx->state->refresh(on_refresh);
+ } else {
+ // no refresh needed: run the continuation inline with success
+ on_refresh->complete(0);
+ }
+}
+
+// Synchronous wrapper around the asynchronous image_get_info(): blocks
+// until the info has been retrieved.  Returns 0 on success or the negative
+// error code.
+template <typename I>
+int Mirror<I>::image_get_info(I *ictx, mirror_image_info_t *mirror_image_info) {
+  C_SaferCond cond;
+  image_get_info(ictx, mirror_image_info, &cond);
+
+  const int r = cond.wait();
+  return (r < 0) ? r : 0;
+}
+
+// Asynchronously retrieves the mirroring status of an image.
+//
+// Issues a mirror::GetStatusRequest whose results are converted into
+// `status` by C_ImageGetStatus, which then completes `on_finish`.  Unlike
+// image_get_info(), no image refresh is performed here.
+template <typename I>
+void Mirror<I>::image_get_status(I *ictx, mirror_image_status_t *status,
+ Context *on_finish) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ // the request writes into the context's members, and the context is also
+ // the request's completion callback
+ auto ctx = new C_ImageGetStatus(ictx->name, status, on_finish);
+ auto req = mirror::GetStatusRequest<I>::create(
+ *ictx, &ctx->mirror_image_status_internal, &ctx->mirror_image,
+ &ctx->promotion_state, ctx);
+ req->send();
+}
+
+// Synchronous wrapper around the asynchronous image_get_status(): blocks
+// until the status has been retrieved.  Returns 0 on success or the
+// negative error code.
+template <typename I>
+int Mirror<I>::image_get_status(I *ictx, mirror_image_status_t *status) {
+  C_SaferCond cond;
+  image_get_status(ictx, status, &cond);
+
+  const int r = cond.wait();
+  return (r < 0) ? r : 0;
+}
+
+// Looks up the instance currently associated with a mirrored image and
+// stores its numeric name, as a string, in `instance_id`.
+//
+// Returns -EINVAL when the image's mirror state is not ENABLED, the
+// negative error of either lookup on failure (-ENOENT / -ESTALE from the
+// instance lookup are returned without an error log), or 0 on success.
+template <typename I>
+int Mirror<I>::image_get_instance_id(I *ictx, std::string *instance_id) {
+  CephContext *cct = ictx->cct;
+  ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+  cls::rbd::MirrorImage mirror_image;
+  int r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, &mirror_image);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+  if (mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+    lderr(cct) << "mirroring is not currently enabled" << dendl;
+    return -EINVAL;
+  }
+
+  entity_inst_t instance;
+  r = cls_client::mirror_image_instance_get(&ictx->md_ctx,
+                                            mirror_image.global_image_id,
+                                            &instance);
+  if (r < 0) {
+    const bool quiet = (r == -ENOENT || r == -ESTALE);
+    if (!quiet) {
+      lderr(cct) << "failed to get mirror image instance: " << cpp_strerror(r)
+                 << dendl;
+    }
+    return r;
+  }
+
+  *instance_id = stringify(instance.name.num());
+  return 0;
+}
+
+// Retrieves the site name stored under RBD_MIRROR_SITE_NAME_CONFIG_KEY.
+//
+// -EOPNOTSUPP is returned as-is without logging.  When the key is missing
+// or the retrieved name is empty, the cluster fsid is used instead and the
+// result of that lookup is returned.  Any other failure is logged and
+// returned.  Returns 0 when a non-empty site name was read.
+template <typename I>
+int Mirror<I>::site_name_get(librados::Rados& rados, std::string* name) {
+  CephContext *cct = reinterpret_cast<CephContext *>(rados.cct());
+  ldout(cct, 20) << dendl;
+
+  int r = get_config_key(rados, RBD_MIRROR_SITE_NAME_CONFIG_KEY, name);
+  if (r == -EOPNOTSUPP) {
+    return r;
+  }
+
+  if (r == -ENOENT || name->empty()) {
+    // default to the cluster fsid
+    r = rados.cluster_fsid(name);
+    if (r < 0) {
+      lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r)
+                 << dendl;
+    }
+    return r;
+  }
+
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve site name: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Stores the site name under RBD_MIRROR_SITE_NAME_CONFIG_KEY.
+//
+// Leading and trailing whitespace is trimmed from `name`, and the trimmed
+// value is what gets logged and stored.  -EOPNOTSUPP is returned as-is
+// without logging; -ENOENT from the store call is tolerated.  Returns 0 on
+// success or a negative error code.
+template <typename I>
+int Mirror<I>::site_name_set(librados::Rados& rados, const std::string& name) {
+  CephContext *cct = reinterpret_cast<CephContext *>(rados.cct());
+
+  std::string site_name{name};
+  boost::algorithm::trim(site_name);
+  ldout(cct, 20) << "site_name=" << site_name << dendl;
+
+  // store the trimmed name (previously the untrimmed input was written,
+  // making the trim above a no-op)
+  int r = set_config_key(rados, RBD_MIRROR_SITE_NAME_CONFIG_KEY, site_name);
+  if (r == -EOPNOTSUPP) {
+    return r;
+  } else if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to update site name: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Reads the pool's mirror mode and converts it to the public
+// rbd_mirror_mode_t.
+//
+// Returns the negative error of the lookup on failure, -EINVAL when the
+// stored mode is not one of DISABLED / IMAGE / POOL, or 0 on success.
+template <typename I>
+int Mirror<I>::mode_get(librados::IoCtx& io_ctx,
+                        rbd_mirror_mode_t *mirror_mode) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  cls::rbd::MirrorMode internal_mode;
+  int r = cls_client::mirror_mode_get(&io_ctx, &internal_mode);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  // reject anything outside the known set before converting
+  switch (internal_mode) {
+  case cls::rbd::MIRROR_MODE_DISABLED:
+  case cls::rbd::MIRROR_MODE_IMAGE:
+  case cls::rbd::MIRROR_MODE_POOL:
+    break;
+  default:
+    lderr(cct) << "unknown mirror mode ("
+               << static_cast<uint32_t>(internal_mode) << ")"
+               << dendl;
+    return -EINVAL;
+  }
+
+  *mirror_mode = static_cast<rbd_mirror_mode_t>(internal_mode);
+  return 0;
+}
+
+// Changes the mirror mode of a pool.
+//
+// Outline:
+//  1. Reject namespaced IoCtxs and unknown modes (-EINVAL).
+//  2. When disabling, fail early with -EBUSY if peers are still registered.
+//  3. No-op if the pool is already in the requested mode.
+//  4. When leaving DISABLED, allocate a fresh mirroring uuid.
+//  5. Unless already in IMAGE mode, switch to IMAGE mode first and notify
+//     watchers (a failed notification is logged but not fatal).
+//  6. IMAGE requested: done.
+//     POOL requested: enable mirroring on every image that has the
+//     journaling feature.
+//     DISABLED requested: coming from IMAGE mode, fail with -EINVAL if any
+//     image is still ENABLED; otherwise disable every mirrored image,
+//     re-running the pass while -EBUSY results are accompanied by progress.
+//  7. Store the final mode and notify watchers.
+//
+// Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::mode_set(librados::IoCtx& io_ctx,
+                        rbd_mirror_mode_t mirror_mode) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  // TODO
+  if (!io_ctx.get_namespace().empty()) {
+    lderr(cct) << "namespaces are not supported" << dendl;
+    return -EINVAL;
+  }
+
+  cls::rbd::MirrorMode next_mirror_mode;
+  switch (mirror_mode) {
+  case RBD_MIRROR_MODE_DISABLED:
+  case RBD_MIRROR_MODE_IMAGE:
+  case RBD_MIRROR_MODE_POOL:
+    next_mirror_mode = static_cast<cls::rbd::MirrorMode>(mirror_mode);
+    break;
+  default:
+    lderr(cct) << "unknown mirror mode ("
+               << static_cast<uint32_t>(mirror_mode) << ")" << dendl;
+    return -EINVAL;
+  }
+
+  int r;
+  if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+    // fail early if pool still has peers registered and attempting to disable
+    std::vector<cls::rbd::MirrorPeer> mirror_peers;
+    r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers);
+    if (r < 0 && r != -ENOENT) {
+      lderr(cct) << "failed to list peers: " << cpp_strerror(r) << dendl;
+      return r;
+    } else if (!mirror_peers.empty()) {
+      lderr(cct) << "mirror peers still registered" << dendl;
+      return -EBUSY;
+    }
+  }
+
+  cls::rbd::MirrorMode current_mirror_mode;
+  r = cls_client::mirror_mode_get(&io_ctx, &current_mirror_mode);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  if (current_mirror_mode == next_mirror_mode) {
+    return 0;
+  } else if (current_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+    // first time mirroring is enabled on this pool: allocate its uuid
+    uuid_d uuid_gen;
+    uuid_gen.generate_random();
+    r = cls_client::mirror_uuid_set(&io_ctx, uuid_gen.to_string());
+    if (r < 0) {
+      lderr(cct) << "failed to allocate mirroring uuid: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+  }
+
+  // every transition passes through IMAGE mode
+  if (current_mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+    r = cls_client::mirror_mode_set(&io_ctx, cls::rbd::MIRROR_MODE_IMAGE);
+    if (r < 0) {
+      lderr(cct) << "failed to set mirror mode to image: "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    r = MirroringWatcher<>::notify_mode_updated(io_ctx,
+                                                cls::rbd::MIRROR_MODE_IMAGE);
+    if (r < 0) {
+      // best effort: the mode change itself has already been stored
+      lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+                 << dendl;
+    }
+  }
+
+  if (next_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) {
+    return 0;
+  }
+
+  if (next_mirror_mode == cls::rbd::MIRROR_MODE_POOL) {
+    map<string, string> images;
+    r = Image<I>::list_images_v2(io_ctx, &images);
+    if (r < 0) {
+      lderr(cct) << "failed listing images: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    for (const auto& img_pair : images) {
+      uint64_t features;
+      uint64_t incompatible_features;
+      r = cls_client::get_features(&io_ctx, util::header_name(img_pair.second),
+                                   true, &features, &incompatible_features);
+      if (r < 0) {
+        lderr(cct) << "error getting features for image " << img_pair.first
+                   << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      // only images with journaling can be mirrored
+      if ((features & RBD_FEATURE_JOURNALING) != 0) {
+        I *img_ctx = I::create("", img_pair.second, nullptr, io_ctx, false);
+        r = img_ctx->state->open(0);
+        if (r < 0) {
+          lderr(cct) << "error opening image "<< img_pair.first << ": "
+                     << cpp_strerror(r) << dendl;
+          return r;
+        }
+
+        // always close the image; the enable error takes precedence
+        r = image_enable(img_ctx, true);
+        int close_r = img_ctx->state->close();
+        if (r < 0) {
+          lderr(cct) << "error enabling mirroring for image "
+                     << img_pair.first << ": " << cpp_strerror(r) << dendl;
+          return r;
+        } else if (close_r < 0) {
+          lderr(cct) << "failed to close image " << img_pair.first << ": "
+                     << cpp_strerror(close_r) << dendl;
+          return close_r;
+        }
+      }
+    }
+  } else if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+    while (true) {
+      bool retry_busy = false;
+      bool pending_busy = false;
+
+      std::set<std::string> image_ids;
+      r = list_mirror_images(io_ctx, image_ids);
+      if (r < 0) {
+        lderr(cct) << "failed listing images: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      for (const auto& img_id : image_ids) {
+        if (current_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) {
+          // image mode: images must have been disabled individually already
+          cls::rbd::MirrorImage mirror_image;
+          r = cls_client::mirror_image_get(&io_ctx, img_id, &mirror_image);
+          if (r < 0 && r != -ENOENT) {
+            lderr(cct) << "failed to retrieve mirroring state for image id "
+                       << img_id << ": " << cpp_strerror(r) << dendl;
+            return r;
+          }
+          if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+            lderr(cct) << "failed to disable mirror mode: there are still "
+                       << "images with mirroring enabled" << dendl;
+            return -EINVAL;
+          }
+        } else {
+          I *img_ctx = I::create("", img_id, nullptr, io_ctx, false);
+          r = img_ctx->state->open(0);
+          if (r < 0) {
+            lderr(cct) << "error opening image id "<< img_id << ": "
+                       << cpp_strerror(r) << dendl;
+            return r;
+          }
+
+          r = image_disable(img_ctx, false);
+          int close_r = img_ctx->state->close();
+          if (r == -EBUSY) {
+            pending_busy = true;
+          } else if (r < 0) {
+            // fix: separate the image id from the error text
+            lderr(cct) << "error disabling mirroring for image id " << img_id
+                       << ": " << cpp_strerror(r) << dendl;
+            return r;
+          } else if (close_r < 0) {
+            lderr(cct) << "failed to close image id " << img_id << ": "
+                       << cpp_strerror(close_r) << dendl;
+            return close_r;
+          } else if (pending_busy) {
+            // at least one mirrored image was successfully disabled, so we can
+            // retry any failures caused by busy parent/child relationships
+            retry_busy = true;
+          }
+        }
+      }
+
+      if (!retry_busy && pending_busy) {
+        // busy images remain and this pass made no progress after them
+        lderr(cct) << "error disabling mirroring for one or more images"
+                   << dendl;
+        return -EBUSY;
+      } else if (!retry_busy) {
+        break;
+      }
+    }
+  }
+
+  r = cls_client::mirror_mode_set(&io_ctx, next_mirror_mode);
+  if (r < 0) {
+    lderr(cct) << "failed to set mirror mode: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = MirroringWatcher<>::notify_mode_updated(io_ctx, next_mirror_mode);
+  if (r < 0) {
+    // best effort: the mode change itself has already been stored
+    lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+               << dendl;
+  }
+  return 0;
+}
+
+// Creates a bootstrap token that another cluster can import to register
+// this cluster as a mirror peer.
+//
+// The token is the base64 encoding of a JSON object with the keys "fsid",
+// "client_id", "key" and "mon_host".  Fails with -EINVAL when mirroring is
+// disabled on the pool.  Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::peer_bootstrap_create(librados::IoCtx& io_ctx,
+ std::string* token) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ // -ENOENT leaves the default (DISABLED) in place, which is rejected below
+ auto mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ int r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirroring mode: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ } else if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+ return -EINVAL;
+ }
+
+ // retrieve the cluster fsid
+ std::string fsid;
+ librados::Rados rados(io_ctx);
+ r = rados.cluster_fsid(&fsid);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // obtain the client id and key that will be embedded in the token
+ std::string peer_client_id;
+ std::string cephx_key;
+ r = create_bootstrap_user(cct, rados, &peer_client_id, &cephx_key);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string mon_host = cct->_conf.get_val<std::string>("mon_host");
+ ldout(cct, 20) << "mon_host=" << mon_host << dendl;
+
+ // format the token response
+ // (only double quotes inside mon_host are escaped; the other values are
+ // inserted verbatim)
+ bufferlist token_bl;
+ token_bl.append(
+ R"({)" \
+ R"("fsid":")" + fsid + R"(",)" + \
+ R"("client_id":")" + peer_client_id + R"(",)" + \
+ R"("key":")" + cephx_key + R"(",)" + \
+ R"("mon_host":")" + \
+ boost::replace_all_copy(mon_host, "\"", "\\\"") + R"(")" + \
+ R"(})");
+ // NOTE(review): this logs the full token, including the key, at debug
+ // level 20 -- confirm that is acceptable
+ ldout(cct, 20) << "token=" << token_bl.to_str() << dendl;
+
+ bufferlist base64_bl;
+ token_bl.encode_base64(base64_bl);
+ *token = base64_bl.to_str();
+
+ return 0;
+}
+
+// Imports a bootstrap token created by peer_bootstrap_create() on another
+// cluster and registers the mirror peer(s).
+//
+// Outline:
+//  1. Validate `direction` (RX or RX_TX only) and decode the base64 / JSON
+//     token (-EINVAL on any failure).
+//  2. Connect to the remote cluster using the credentials from the token.
+//  3. Sanity checks: the remote fsid must differ from the local one and
+//     match the fsid in the token; the site names must differ (-EINVAL).
+//  4. The remote pool with the same name must exist and have mirroring
+//     enabled (-ENOSYS if disabled).
+//  5. If local mirroring is disabled, copy the remote mirror mode.
+//  6. For RX_TX, create a local bootstrap user and register the local
+//     cluster as a peer in the remote cluster.
+//  7. Register the remote cluster as a peer in the local cluster.
+//
+// Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::peer_bootstrap_import(librados::IoCtx& io_ctx,
+                                     rbd_mirror_peer_direction_t direction,
+                                     const std::string& token) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  if (direction != RBD_MIRROR_PEER_DIRECTION_RX &&
+      direction != RBD_MIRROR_PEER_DIRECTION_RX_TX) {
+    lderr(cct) << "invalid mirror peer direction" << dendl;
+    return -EINVAL;
+  }
+
+  bufferlist token_bl;
+  try {
+    bufferlist base64_bl;
+    base64_bl.append(token);
+    token_bl.decode_base64(base64_bl);
+  } catch (buffer::error& err) {
+    lderr(cct) << "failed to decode base64" << dendl;
+    return -EINVAL;
+  }
+
+  ldout(cct, 20) << "token=" << token_bl.to_str() << dendl;
+
+  bool json_valid = false;
+  std::string expected_remote_fsid;
+  std::string remote_client_id;
+  std::string remote_key;
+  std::string remote_mon_host;
+
+  json_spirit::mValue json_root;
+  if (json_spirit::read(token_bl.to_str(), json_root)) {
+    try {
+      auto& json_obj = json_root.get_obj();
+      expected_remote_fsid = json_obj["fsid"].get_str();
+      remote_client_id = json_obj["client_id"].get_str();
+      remote_key = json_obj["key"].get_str();
+      remote_mon_host = json_obj["mon_host"].get_str();
+      json_valid = true;
+    } catch (std::runtime_error&) {
+      // wrong JSON shape: reported via json_valid below
+    }
+  }
+
+  if (!json_valid) {
+    lderr(cct) << "invalid bootstrap token JSON received" << dendl;
+    return -EINVAL;
+  }
+
+  // sanity check import process
+  std::string local_fsid;
+  librados::Rados rados(io_ctx);
+  int r = rados.cluster_fsid(&local_fsid);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  std::string local_site_name;
+  r = site_name_get(rados, &local_site_name);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve cluster site name: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  // attempt to connect to remote cluster
+  librados::Rados remote_rados;
+  r = remote_rados.init(remote_client_id.c_str());
+  if (r < 0) {
+    // without a valid handle the cct() access below would be invalid
+    lderr(cct) << "failed to initialize remote cluster handle: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  auto remote_cct = reinterpret_cast<CephContext*>(remote_rados.cct());
+  remote_cct->_conf.set_val("mon_host", remote_mon_host);
+  remote_cct->_conf.set_val("key", remote_key);
+
+  r = remote_rados.connect();
+  if (r < 0) {
+    lderr(cct) << "failed to connect to peer cluster: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  std::string remote_fsid;
+  r = remote_rados.cluster_fsid(&remote_fsid);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve remote cluster fsid: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  } else if (local_fsid == remote_fsid) {
+    lderr(cct) << "cannot import token for local cluster" << dendl;
+    return -EINVAL;
+  } else if (expected_remote_fsid != remote_fsid) {
+    lderr(cct) << "unexpected remote cluster fsid" << dendl;
+    return -EINVAL;
+  }
+
+  std::string remote_site_name;
+  r = site_name_get(remote_rados, &remote_site_name);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve remote cluster site name: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  } else if (local_site_name == remote_site_name) {
+    lderr(cct) << "cannot import token for duplicate site name" << dendl;
+    return -EINVAL;
+  }
+
+  // the remote pool is looked up by the local pool's name
+  librados::IoCtx remote_io_ctx;
+  r = remote_rados.ioctx_create(io_ctx.get_pool_name().c_str(), remote_io_ctx);
+  if (r == -ENOENT) {
+    ldout(cct, 10) << "remote pool does not exist" << dendl;
+    return r;
+  } else if (r < 0) {
+    lderr(cct) << "failed to open remote pool '" << io_ctx.get_pool_name()
+               << "': " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  auto remote_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+  r = cls_client::mirror_mode_get(&remote_io_ctx, &remote_mirror_mode);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to retrieve remote mirroring mode: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  } else if (remote_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+    return -ENOSYS;
+  }
+
+  auto local_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+  r = cls_client::mirror_mode_get(&io_ctx, &local_mirror_mode);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to retrieve local mirroring mode: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  } else if (local_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+    // copy mirror mode from remote peer
+    r = mode_set(io_ctx, static_cast<rbd_mirror_mode_t>(remote_mirror_mode));
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (direction == RBD_MIRROR_PEER_DIRECTION_RX_TX) {
+    // create a local mirror peer user and export it to the remote cluster
+    std::string local_client_id;
+    std::string local_key;
+    r = create_bootstrap_user(cct, rados, &local_client_id, &local_key);
+    if (r < 0) {
+      return r;
+    }
+
+    std::string local_mon_host = cct->_conf.get_val<std::string>("mon_host");
+
+    // create local cluster peer in remote cluster
+    r = create_bootstrap_peer(cct, remote_io_ctx, local_site_name, local_fsid,
+                              local_client_id, local_key, local_mon_host,
+                              "local", "remote");
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  // create remote cluster peer in local cluster
+  r = create_bootstrap_peer(cct, io_ctx, remote_site_name, remote_fsid,
+                            remote_client_id, remote_key, remote_mon_host,
+                            "remote", "local");
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Registers a new mirror peer for the pool under a freshly generated random
+// UUID, which is returned through `uuid`.
+//
+// Rejects namespaced IoCtxs and a `cluster_name` equal to the local cluster
+// name (-EINVAL).  A -ESTALE result from the add call is treated as a UUID
+// collision and retried with a new UUID.  Returns 0 on success or a
+// negative error code.
+template <typename I>
+int Mirror<I>::peer_add(librados::IoCtx& io_ctx, std::string *uuid,
+                        const std::string &cluster_name,
+                        const std::string &client_name) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 20) << "name=" << cluster_name << ", "
+                 << "client=" << client_name << dendl;
+
+  // TODO
+  if (!io_ctx.get_namespace().empty()) {
+    lderr(cct) << "namespaces are not supported" << dendl;
+    return -EINVAL;
+  }
+
+  if (cct->_conf->cluster == cluster_name) {
+    lderr(cct) << "cannot add self as remote peer" << dendl;
+    return -EINVAL;
+  }
+
+  int r;
+  do {
+    uuid_d uuid_gen;
+    uuid_gen.generate_random();
+
+    *uuid = uuid_gen.to_string();
+    r = cls_client::mirror_peer_add(&io_ctx, *uuid, cluster_name,
+                                    client_name);
+    if (r == -ESTALE) {
+      ldout(cct, 5) << "duplicate UUID detected, retrying" << dendl;
+    } else if (r < 0) {
+      // log the UUID string itself (streaming the pointer printed an address)
+      lderr(cct) << "failed to add mirror peer '" << *uuid << "': "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+  } while (r == -ESTALE);
+  return 0;
+}
+
+// Removes a mirror peer: first its stored attributes (config key), then
+// the peer record itself.  A missing peer record (-ENOENT) is not an
+// error.  Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::peer_remove(librados::IoCtx& io_ctx, const std::string &uuid) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 20) << "uuid=" << uuid << dendl;
+
+  int r = remove_peer_config_key(io_ctx, uuid);
+  if (r < 0) {
+    lderr(cct) << "failed to remove peer attributes '" << uuid << "': "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = cls_client::mirror_peer_remove(&io_ctx, uuid);
+  const bool already_gone = (r == -ENOENT);
+  if (r < 0 && !already_gone) {
+    lderr(cct) << "failed to remove peer '" << uuid << "': "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Lists the mirror peers registered for the pool.
+//
+// `peers` is cleared and then filled with one entry (uuid, cluster name,
+// client name) per registered peer.  -ENOENT from the listing call is
+// tolerated.  Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::peer_list(librados::IoCtx& io_ctx,
+                         std::vector<mirror_peer_t> *peers) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  std::vector<cls::rbd::MirrorPeer> internal_peers;
+  int r = cls_client::mirror_peer_list(&io_ctx, &internal_peers);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to list peers: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  peers->clear();
+  peers->reserve(internal_peers.size());
+  for (auto &internal_peer : internal_peers) {
+    mirror_peer_t converted;
+    converted.uuid = internal_peer.uuid;
+    converted.cluster_name = internal_peer.cluster_name;
+    converted.client_name = internal_peer.client_name;
+    peers->push_back(converted);
+  }
+
+  return 0;
+}
+
+// Updates the client name of the mirror peer identified by `uuid`.
+// Returns 0 on success or the negative error code of the update.
+template <typename I>
+int Mirror<I>::peer_set_client(librados::IoCtx& io_ctx, const std::string &uuid,
+                               const std::string &client_name) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 20) << "uuid=" << uuid << ", "
+                 << "client=" << client_name << dendl;
+
+  const int r = cls_client::mirror_peer_set_client(&io_ctx, uuid, client_name);
+  if (r >= 0) {
+    return 0;
+  }
+
+  lderr(cct) << "failed to update client '" << uuid << "': "
+             << cpp_strerror(r) << dendl;
+  return r;
+}
+
+// Updates the cluster name of the mirror peer identified by `uuid`.
+//
+// A `cluster_name` equal to the local cluster name is rejected with
+// -EINVAL.  Returns 0 on success or the negative error code of the update.
+template <typename I>
+int Mirror<I>::peer_set_cluster(librados::IoCtx& io_ctx,
+                                const std::string &uuid,
+                                const std::string &cluster_name) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 20) << "uuid=" << uuid << ", "
+                 << "cluster=" << cluster_name << dendl;
+
+  if (cct->_conf->cluster == cluster_name) {
+    lderr(cct) << "cannot set self as remote peer" << dendl;
+    return -EINVAL;
+  }
+
+  const int r = cls_client::mirror_peer_set_cluster(&io_ctx, uuid,
+                                                    cluster_name);
+  if (r >= 0) {
+    return 0;
+  }
+
+  lderr(cct) << "failed to update cluster '" << uuid << "': "
+             << cpp_strerror(r) << dendl;
+  return r;
+}
+
+// Retrieves the attributes stored for the mirror peer `uuid`.
+//
+// The attributes are read from a config key (named from the pool id and the
+// peer uuid) and parsed as a JSON object whose values are all strings.
+// `attributes` is cleared first.  Returns -ENOENT when the key is missing
+// or its value is empty, -EINVAL when the value is not the expected JSON,
+// another negative error code on lookup failure, or 0 on success.
+template <typename I>
+int Mirror<I>::peer_get_attributes(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ Attributes* attributes) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "uuid=" << uuid << dendl;
+
+ attributes->clear();
+
+ librados::Rados rados(io_ctx);
+ std::string value;
+ int r = get_config_key(rados, get_peer_config_key_name(io_ctx.get_id(), uuid),
+ &value);
+ // NOTE(review): because value.empty() is tested first, a lookup error that
+ // leaves `value` empty is reported as -ENOENT and the r < 0 branch is
+ // skipped -- confirm whether callers rely on this
+ if (r == -ENOENT || value.empty()) {
+ return -ENOENT;
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve peer attributes: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ bool json_valid = false;
+ json_spirit::mValue json_root;
+ if(json_spirit::read(value, json_root)) {
+ try {
+ auto& json_obj = json_root.get_obj();
+ for (auto& pairs : json_obj) {
+ (*attributes)[pairs.first] = pairs.second.get_str();
+ }
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ // wrong JSON shape: reported via json_valid below; entries copied before
+ // the exception remain in `attributes`
+ }
+ }
+
+ if (!json_valid) {
+ lderr(cct) << "invalid peer attributes JSON received" << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+// Stores the attributes for the mirror peer `uuid`.
+//
+// The peer must exist in the pool's peer list (-ENOENT otherwise).  The
+// attributes are serialized as a JSON-like object and written to a config
+// key named from the pool id and the peer uuid.  -ENOENT from the store
+// call is tolerated.  Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::peer_set_attributes(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ const Attributes& attributes) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "uuid=" << uuid << ", "
+ << "attributes=" << attributes << dendl;
+
+ std::vector<mirror_peer_t> mirror_peers;
+ int r = peer_list(io_ctx, &mirror_peers);
+ if (r < 0) {
+ return r;
+ }
+
+ // refuse to store attributes for an unknown peer
+ if (std::find_if(mirror_peers.begin(), mirror_peers.end(),
+ [&uuid](const librbd::mirror_peer_t& peer) {
+ return uuid == peer.uuid;
+ }) == mirror_peers.end()) {
+ ldout(cct, 5) << "mirror peer uuid " << uuid << " does not exist" << dendl;
+ return -ENOENT;
+ }
+
+ // Every double quote is emitted as a backslash-quote pair (\").
+ // NOTE(review): presumably because set_config_key embeds the value inside
+ // another quoted string -- confirm against set_config_key.  Keys and
+ // values themselves are written without any escaping.
+ std::stringstream ss;
+ ss << "{";
+ for (auto& pair : attributes) {
+ ss << "\\\"" << pair.first << "\\\": "
+ << "\\\"" << pair.second << "\\\"";
+ // separator after every entry except the last one in the map
+ if (&pair != &(*attributes.rbegin())) {
+ ss << ", ";
+ }
+ }
+ ss << "}";
+
+ librados::Rados rados(io_ctx);
+ r = set_config_key(rados, get_peer_config_key_name(io_ctx.get_id(), uuid),
+ ss.str());
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to update peer attributes: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+// Lists the mirror status of up to `max` images, starting after
+// `start_id`, into `images` (keyed by image id).
+//
+// Images whose mirror state is DISABLED are skipped.  If no name is known
+// for an image id, the id itself is used as the name.  Images without a
+// status entry get an UNKNOWN status with the description
+// "status not found".  The `primary` flag of the returned info is always
+// false.  -ENOENT from the status listing is tolerated.  Returns 0 on
+// success or a negative error code.
+template <typename I>
+int Mirror<I>::image_status_list(librados::IoCtx& io_ctx,
+                                 const std::string &start_id, size_t max,
+                                 IdToMirrorImageStatus *images) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  int r;
+
+  // build the reverse (id -> name) index of the pool's images
+  map<string, string> id_to_name;
+  {
+    map<string, string> name_to_id;
+    r = Image<I>::list_images_v2(io_ctx, &name_to_id);
+    if (r < 0) {
+      return r;
+    }
+    // iterate by const reference to avoid copying both strings per entry
+    for (const auto& entry : name_to_id) {
+      id_to_name[entry.second] = entry.first;
+    }
+  }
+
+  map<std::string, cls::rbd::MirrorImage> images_;
+  map<std::string, cls::rbd::MirrorImageStatus> statuses_;
+
+  r = librbd::cls_client::mirror_image_status_list(&io_ctx, start_id, max,
+                                                   &images_, &statuses_);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to list mirror image statuses: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  cls::rbd::MirrorImageStatus unknown_status(
+    cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN, "status not found");
+
+  for (auto& image : images_) {
+    auto &image_id = image.first;
+    auto &info = image.second;
+    if (info.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLED) {
+      continue;
+    }
+
+    auto &image_name = id_to_name[image_id];
+    if (image_name.empty()) {
+      lderr(cct) << "failed to find image name for image " << image_id << ", "
+                 << "using image id as name" << dendl;
+      image_name = image_id;
+    }
+    auto s_it = statuses_.find(image_id);
+    auto &s = s_it != statuses_.end() ? s_it->second : unknown_status;
+    (*images)[image_id] = mirror_image_status_t{
+      image_name,
+      mirror_image_info_t{
+        info.global_image_id,
+        static_cast<mirror_image_state_t>(info.state),
+        false}, // XXX: To set "primary" right would require an additional call.
+      static_cast<mirror_image_status_state_t>(s.state),
+      s.description,
+      s.last_update.sec(),
+      s.up};
+  }
+
+  return 0;
+}
+
+// Retrieves the per-state summary of mirror image statuses for the pool
+// and stores it in `states`, converting each internal state to the public
+// mirror_image_status_state_t.  -ENOENT from the summary call is
+// tolerated.  Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::image_status_summary(librados::IoCtx& io_ctx,
+                                    MirrorImageStatusStates *states) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+  std::map<cls::rbd::MirrorImageStatusState, int> internal_states;
+  int r = cls_client::mirror_image_status_get_summary(&io_ctx,
+                                                      &internal_states);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to get mirror status summary: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  for (auto &entry : internal_states) {
+    const auto public_state =
+      static_cast<mirror_image_status_state_t>(entry.first);
+    (*states)[public_state] = entry.second;
+  }
+  return 0;
+}
+
+// Lists the instance ids associated with up to `max` mirrored images,
+// starting after `start_image_id`.
+//
+// Each instance's numeric name is stored, as a string, in `instance_ids`
+// under the corresponding key of the listing.  -ENOENT from the listing
+// call is tolerated.  Returns 0 on success or a negative error code.
+template <typename I>
+int Mirror<I>::image_instance_id_list(
+    librados::IoCtx& io_ctx, const std::string &start_image_id, size_t max,
+    std::map<std::string, std::string> *instance_ids) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  std::map<std::string, entity_inst_t> instances;
+
+  int r = librbd::cls_client::mirror_image_instance_list(
+    &io_ctx, start_image_id, max, &instances);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to list mirror image instances: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  // iterate by const reference to avoid copying every map entry
+  for (const auto& entry : instances) {
+    (*instance_ids)[entry.first] = stringify(entry.second.name.num());
+  }
+
+  return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Mirror<librbd::ImageCtx>;
diff --git a/src/librbd/api/Mirror.h b/src/librbd/api/Mirror.h
new file mode 100644
index 00000000..94768acd
--- /dev/null
+++ b/src/librbd/api/Mirror.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_API_MIRROR_H
+#define LIBRBD_API_MIRROR_H
+
+#include "include/rbd/librbd.hpp"
+#include <map>
+#include <string>
+#include <vector>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Mirror {  // static-only mirroring API; member definitions are in Mirror.cc
+ typedef std::map<std::string, std::string> Attributes;  // key/value map used by peer_{get,set}_attributes
+ typedef std::map<std::string, mirror_image_status_t> IdToMirrorImageStatus;  // image id -> mirror status
+ typedef std::map<mirror_image_status_state_t, int> MirrorImageStatusStates;  // status state -> int taken from the cls status summary
+
+ static int site_name_get(librados::Rados& rados, std::string* name);  // site name calls take a Rados handle rather than an IoCtx
+ static int site_name_set(librados::Rados& rados, const std::string& name);
+
+ static int mode_get(librados::IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode);  // mirror mode accessors for the pool behind io_ctx
+ static int mode_set(librados::IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode);
+
+ static int peer_bootstrap_create(librados::IoCtx& io_ctx, std::string* token);  // produces a token string through *token
+ static int peer_bootstrap_import(librados::IoCtx& io_ctx,  // consumes a token string together with a peer direction
+ rbd_mirror_peer_direction_t direction,
+ const std::string& token);
+
+ static int peer_add(librados::IoCtx& io_ctx, std::string *uuid,  // uuid is an output here; the other peer_* calls take it as input
+ const std::string &cluster_name,
+ const std::string &client_name);
+ static int peer_remove(librados::IoCtx& io_ctx, const std::string &uuid);
+ static int peer_list(librados::IoCtx& io_ctx,
+ std::vector<mirror_peer_t> *peers);
+ static int peer_set_client(librados::IoCtx& io_ctx, const std::string &uuid,
+ const std::string &client_name);
+ static int peer_set_cluster(librados::IoCtx& io_ctx, const std::string &uuid,
+ const std::string &cluster_name);
+ static int peer_get_attributes(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ Attributes* attributes);
+ static int peer_set_attributes(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ const Attributes& attributes);
+
+ static int image_status_list(librados::IoCtx& io_ctx,  // listing calls take a start id and a maximum entry count
+ const std::string &start_id, size_t max,
+ IdToMirrorImageStatus *images);
+ static int image_status_summary(librados::IoCtx& io_ctx,
+ MirrorImageStatusStates *states);
+ static int image_instance_id_list(librados::IoCtx& io_ctx,
+ const std::string &start_image_id,
+ size_t max,
+ std::map<std::string, std::string> *ids);  // image id -> instance id string
+
+ static int image_enable(ImageCtxT *ictx, bool relax_same_pool_parent_check);  // per-image operations take an image context
+ static int image_disable(ImageCtxT *ictx, bool force);
+ static int image_promote(ImageCtxT *ictx, bool force);
+ static void image_promote(ImageCtxT *ictx, bool force, Context *on_finish);  // Context* overloads return void; the result is presumably delivered via on_finish
+ static int image_demote(ImageCtxT *ictx);
+ static void image_demote(ImageCtxT *ictx, Context *on_finish);
+ static int image_resync(ImageCtxT *ictx);
+ static int image_get_info(ImageCtxT *ictx,
+ mirror_image_info_t *mirror_image_info);
+ static void image_get_info(ImageCtxT *ictx,
+ mirror_image_info_t *mirror_image_info,
+ Context *on_finish);
+ static int image_get_status(ImageCtxT *ictx, mirror_image_status_t *status);
+ static void image_get_status(ImageCtxT *ictx, mirror_image_status_t *status,
+ Context *on_finish);
+ static int image_get_instance_id(ImageCtxT *ictx, std::string *instance_id);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Mirror<librbd::ImageCtx>;
+
+#endif // LIBRBD_API_MIRROR_H
diff --git a/src/librbd/api/Namespace.cc b/src/librbd/api/Namespace.cc
new file mode 100644
index 00000000..ed7f8e28
--- /dev/null
+++ b/src/librbd/api/Namespace.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/api/Namespace.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Namespace: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+const std::list<std::string> POOL_OBJECTS {  // object names that Namespace<I>::remove() deletes from the namespace being removed
+ RBD_CHILDREN,
+ RBD_GROUP_DIRECTORY,
+ RBD_INFO,
+ RBD_MIRRORING,
+ RBD_TASK,
+ RBD_TRASH,
+ RBD_DIRECTORY
+};
+
+} // anonymous namespace
+
+// Creates the RBD namespace `name` in the pool behind io_ctx.
+//
+// Returns -EINVAL for an empty name and -ENOSYS when the minimum compatible
+// OSD release is older than nautilus. The namespace is first added through
+// the default ("") namespace and its image directory is then marked READY;
+// if that second step fails the added namespace entry is removed again.
+template <typename I>
+int Namespace<I>::create(librados::IoCtx& io_ctx, const std::string& name)
+{
+  auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 5) << "name=" << name << dendl;
+
+  if (name.empty()) {
+    return -EINVAL;
+  }
+
+  int8_t min_osd_release;
+  librados::Rados rados(io_ctx);
+  int r = rados.get_min_compatible_osd(&min_osd_release);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve min OSD release: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+  if (min_osd_release < CEPH_RELEASE_NAUTILUS) {
+    ldout(cct, 1) << "namespace support requires nautilus or later OSD"
+                  << dendl;
+    return -ENOSYS;
+  }
+
+  // namespace_add / namespace_remove are issued against the "" namespace
+  librados::IoCtx registry_ctx;
+  registry_ctx.dup(io_ctx);
+  registry_ctx.set_namespace("");
+  r = cls_client::namespace_add(&registry_ctx, name);
+  if (r < 0) {
+    lderr(cct) << "failed to add namespace: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  librados::IoCtx ns_ctx;
+  ns_ctx.dup(io_ctx);
+  ns_ctx.set_namespace(name);
+  r = cls_client::dir_state_set(&ns_ctx, RBD_DIRECTORY,
+                                cls::rbd::DIRECTORY_STATE_READY);
+  if (r >= 0) {
+    return 0;
+  }
+  lderr(cct) << "failed to initialize image directory: " << cpp_strerror(r)
+             << dendl;
+  // best-effort rollback; the directory error is what gets returned
+  const int rollback_r = cls_client::namespace_remove(&registry_ctx, name);
+  if (rollback_r < 0) {
+    lderr(cct) << "failed to remove namespace: " << cpp_strerror(rollback_r)
+               << dendl;
+  }
+  return r;
+}
+
+template <typename I>
+int Namespace<I>::remove(librados::IoCtx& io_ctx, const std::string& name)  // removes namespace `name`; -EINVAL for an empty name, -EBUSY when images or trash entries remain
+{
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 5) << "name=" << name << dendl;
+
+ if (name.empty()) {
+ return -EINVAL;
+ }
+
+ librados::IoCtx default_ns_ctx;  // "" namespace: used for the final namespace_remove
+ default_ns_ctx.dup(io_ctx);
+ default_ns_ctx.set_namespace("");
+
+ librados::IoCtx ns_ctx;  // the namespace being removed: used for every per-namespace object
+ ns_ctx.dup(io_ctx);
+ ns_ctx.set_namespace(name);
+
+ std::map<std::string, cls::rbd::TrashImageSpec> trash_entries;
+
+ librados::ObjectWriteOperation dir_op;  // one write op: set the directory state to ADD_DISABLED, then remove the object
+ librbd::cls_client::dir_state_set(
+ &dir_op, cls::rbd::DIRECTORY_STATE_ADD_DISABLED);
+ dir_op.remove();
+
+ int r = ns_ctx.operate(RBD_DIRECTORY, &dir_op);
+ if (r == -EBUSY) {  // logged below as a non-empty image directory
+ ldout(cct, 5) << "image directory not empty" << dendl;
+ goto rollback;
+ } else if (r < 0 && r != -ENOENT) {  // a missing directory object (-ENOENT) does not stop the removal
+ lderr(cct) << "failed to disable the namespace: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = cls_client::trash_list(&ns_ctx, "", 1, &trash_entries);  // asks for at most one entry: only emptiness matters
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list trash directory: " << cpp_strerror(r)
+ << dendl;
+ return r;  // NOTE(review): returns without the rollback below although dir_op has already run - confirm intended
+ } else if (!trash_entries.empty()) {
+ ldout(cct, 5) << "image trash not empty" << dendl;
+ goto rollback;
+ }
+
+ for (auto& oid : POOL_OBJECTS) {  // delete the namespace's well-known objects; missing ones are skipped
+ r = ns_ctx.remove(oid);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to remove object '" << oid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ r = cls_client::namespace_remove(&default_ns_ctx, name);  // finally drop the namespace entry itself
+ if (r < 0) {
+ lderr(cct) << "failed to remove namespace: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+
+rollback:  // reached only from the two "not empty" branches above
+
+ r = librbd::cls_client::dir_state_set(  // put the image directory back into the READY state
+ &ns_ctx, RBD_DIRECTORY, cls::rbd::DIRECTORY_STATE_READY);
+ if (r < 0) {
+ lderr(cct) << "failed to restore directory state: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return -EBUSY;  // returned even when restoring the directory state failed
+}
+
+// Appends the names of all namespaces in the pool to *names.
+// Names are read through the default ("") namespace in pages of 1024.
+template <typename I>
+int Namespace<I>::list(IoCtx& io_ctx, vector<string> *names)
+{
+  auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 5) << dendl;
+
+  librados::IoCtx default_ns_ctx;
+  default_ns_ctx.dup(io_ctx);
+  default_ns_ctx.set_namespace("");
+
+  const int page_size = 1024;
+  std::string cursor;
+  while (true) {
+    std::list<std::string> page;
+    int r = cls_client::namespace_list(&default_ns_ctx, cursor, page_size,
+                                       &page);
+    if (r == -ENOENT) {
+      return 0;
+    }
+    if (r < 0) {
+      lderr(cct) << "error listing namespaces: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    names->insert(names->end(), page.begin(), page.end());
+    if (static_cast<int>(page.size()) != page_size) {  // short page: done
+      return 0;
+    }
+    cursor = page.back();  // the next page starts from the last name seen
+  }
+}
+
+// Sets *exists to whether namespace `name` has an image directory in the
+// READY state. -ENOENT from the check yields false with a 0 return; an
+// empty name is rejected with -EINVAL.
+template <typename I>
+int Namespace<I>::exists(librados::IoCtx& io_ctx, const std::string& name, bool *exists)
+{
+  auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 5) << "name=" << name << dendl;
+
+  *exists = false;
+  if (name.empty()) {
+    return -EINVAL;
+  }
+
+  librados::IoCtx ns_ctx;
+  ns_ctx.dup(io_ctx);
+  ns_ctx.set_namespace(name);
+  const int r = librbd::cls_client::dir_state_assert(
+    &ns_ctx, RBD_DIRECTORY, cls::rbd::DIRECTORY_STATE_READY);
+  if (r != 0 && r != -ENOENT) {
+    lderr(cct) << "error asserting namespace: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  *exists = (r == 0);
+  return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Namespace<librbd::ImageCtx>;
diff --git a/src/librbd/api/Namespace.h b/src/librbd/api/Namespace.h
new file mode 100644
index 00000000..220eb28f
--- /dev/null
+++ b/src/librbd/api/Namespace.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_NAMESPACE_H
+#define CEPH_LIBRBD_API_NAMESPACE_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.hpp"
+#include <string>
+#include <vector>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Namespace {  // static-only API for RBD namespaces within a pool; defined in Namespace.cc
+
+ static int create(librados::IoCtx& io_ctx, const std::string& name);
+ static int remove(librados::IoCtx& io_ctx, const std::string& name);
+ static int list(librados::IoCtx& io_ctx, std::vector<std::string>* names);  // appends to *names
+ static int exists(librados::IoCtx& io_ctx, const std::string& name, bool *exists);  // result is delivered through *exists
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Namespace<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_NAMESPACE_H
diff --git a/src/librbd/api/Pool.cc b/src/librbd/api/Pool.cc
new file mode 100644
index 00000000..88adf561
--- /dev/null
+++ b/src/librbd/api/Pool.cc
@@ -0,0 +1,377 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Pool.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "osd/osd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Trash.h"
+#include "librbd/image/ValidatePoolRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Pool::ImageStatRequest: " \
+ << __func__ << " " << this << ": " \
+ << "(id=" << m_image_id << "): "
+
+// Self-deleting asynchronous request that reads one image's head size (and,
+// when scan_snaps is set, its snapshot sizes) from the image header object.
+// The throttle op is started in the constructor and ended in finish().
+template <typename I>
+class ImageStatRequest {
+public:
+  ImageStatRequest(librados::IoCtx& io_ctx, SimpleThrottle& throttle,
+                   const std::string& image_id, bool scan_snaps,
+                   std::atomic<uint64_t>* bytes,
+                   std::atomic<uint64_t>* max_bytes,
+                   std::atomic<uint64_t>* snaps)
+    : m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+      m_io_ctx(io_ctx), m_throttle(throttle), m_image_id(image_id),
+      m_scan_snaps(scan_snaps), m_bytes(bytes), m_max_bytes(max_bytes),
+      m_snaps(snaps) {
+    m_throttle.start_op();
+  }
+
+  void send() {
+    get_head();
+  }
+
+protected:
+  void finish(int r) {
+    if (r == 0) {
+      // Publish exactly once per image. Adding in handle_get_head() counted
+      // an image again whenever the head was re-read after an -ENOENT retry.
+      (*m_bytes) += m_head_size;
+      (*m_snaps) += m_snapc.snaps.size();
+      (*m_max_bytes) += m_max_size;
+    }
+    m_throttle.end_op(r);
+    delete this;
+  }
+
+private:
+  CephContext* m_cct;
+  librados::IoCtx& m_io_ctx;
+  SimpleThrottle& m_throttle;
+  const std::string& m_image_id;  // reference: the caller's string must outlive the request
+  bool m_scan_snaps;
+  std::atomic<uint64_t>* m_bytes;
+  std::atomic<uint64_t>* m_max_bytes;
+  std::atomic<uint64_t>* m_snaps;
+  bufferlist m_out_bl;
+
+  uint64_t m_head_size = 0;  // size decoded for the image head
+  uint64_t m_max_size = 0;   // largest of the head and snapshot sizes
+  ::SnapContext m_snapc;
+
+  void get_head() {
+    ldout(m_cct, 15) << dendl;
+
+    librados::ObjectReadOperation op;
+    cls_client::get_size_start(&op, CEPH_NOSNAP);
+    if (m_scan_snaps) {
+      cls_client::get_snapcontext_start(&op);
+    }
+
+    m_out_bl.clear();
+    auto aio_comp = util::create_rados_callback<
+      ImageStatRequest<I>, &ImageStatRequest<I>::handle_get_head>(this);
+    int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op,
+                                 &m_out_bl);
+    ceph_assert(r == 0);
+    aio_comp->release();
+  }
+
+  void handle_get_head(int r) {
+    ldout(m_cct, 15) << "r=" << r << dendl;
+    auto it = m_out_bl.cbegin();
+    if (r == 0) {
+      uint8_t order;
+      r = cls_client::get_size_finish(&it, &m_head_size, &order);
+    }
+    if (m_scan_snaps && r == 0) {
+      r = cls_client::get_snapcontext_finish(&it, &m_snapc);
+    }
+
+    if (r == -ENOENT) {
+      finish(r);
+      return;
+    } else if (r < 0) {
+      lderr(m_cct) << "failed to stat image: " << cpp_strerror(r) << dendl;
+      finish(r);
+      return;
+    }
+
+    if (!m_snapc.is_valid()) {
+      lderr(m_cct) << "snap context is invalid" << dendl;
+      finish(-EIO);
+      return;
+    }
+
+    m_max_size = m_head_size;  // handle_get_snaps() may raise this
+    get_snaps();
+  }
+
+  void get_snaps() {
+    if (!m_scan_snaps || m_snapc.snaps.empty()) {
+      finish(0);
+      return;
+    }
+    ldout(m_cct, 15) << dendl;
+    librados::ObjectReadOperation op;
+    for (auto snap_seq : m_snapc.snaps) {
+      cls_client::get_size_start(&op, snap_seq);
+    }
+
+    m_out_bl.clear();
+    auto aio_comp = util::create_rados_callback<
+      ImageStatRequest<I>, &ImageStatRequest<I>::handle_get_snaps>(this);
+    int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op,
+                                 &m_out_bl);
+    ceph_assert(r == 0);
+    aio_comp->release();
+  }
+
+  void handle_get_snaps(int r) {
+    ldout(m_cct, 15) << "r=" << r << dendl;
+    auto it = m_out_bl.cbegin();
+    for ([[maybe_unused]] auto snap_seq : m_snapc.snaps) {
+      uint64_t size;
+      if (r == 0) {
+        uint8_t order;
+        r = cls_client::get_size_finish(&it, &size, &order);
+      }
+      if (r == 0 && m_max_size < size) {
+        m_max_size = size;
+      }
+    }
+
+    if (r == -ENOENT) {
+      ldout(m_cct, 15) << "out-of-sync metadata" << dendl;
+      get_head();  // retry from the head; nothing has been published yet
+    } else if (r < 0) {
+      lderr(m_cct) << "failed to retrieve snap size: " << cpp_strerror(r)
+                   << dendl;
+      finish(r);
+    } else {
+      finish(0);
+    }
+  }
+};
+
+// Looks up `option` in the caller's stat option map and hands back the
+// registered destination pointer through *value; an option that is not in
+// the map produces nullptr.
+template <typename I>
+void get_pool_stat_option_value(typename Pool<I>::StatOptions* stat_options,
+                                rbd_pool_stat_option_t option,
+                                uint64_t** value) {
+  const auto entry = stat_options->find(option);
+  const bool requested = (entry != stat_options->end());
+  *value = requested ? entry->second : nullptr;
+}
+
+// Issues one ImageStatRequest per image id, throttled by
+// rbd_concurrent_management_ops, waits for all of them and, on success,
+// writes the aggregated totals to every non-null output pointer.
+template <typename I>
+int get_pool_stats(librados::IoCtx& io_ctx, const ConfigProxy& config,
+                   const std::vector<std::string>& image_ids, uint64_t* image_count,
+                   uint64_t* provisioned_bytes, uint64_t* max_provisioned_bytes,
+                   uint64_t* snapshot_count) {
+  // snapshots are only read when a snapshot-derived total was asked for
+  const bool scan_snaps = (max_provisioned_bytes != nullptr) ||
+                          (snapshot_count != nullptr);
+
+  SimpleThrottle throttle(
+    config.template get_val<uint64_t>("rbd_concurrent_management_ops"), true);
+  std::atomic<uint64_t> total_bytes{0};
+  std::atomic<uint64_t> total_max_bytes{0};
+  std::atomic<uint64_t> total_snaps{0};
+  for (const auto& id : image_ids) {
+    if (throttle.pending_error()) {
+      break;  // stop issuing requests once an error is pending
+    }
+    // the request deletes itself in finish()
+    (new ImageStatRequest<I>(io_ctx, throttle, id, scan_snaps, &total_bytes,
+                             &total_max_bytes, &total_snaps))->send();
+  }
+
+  const int r = throttle.wait_for_ret();
+  if (r < 0) {
+    return r;
+  }
+
+  // copy a total into an output slot unless the caller passed nullptr
+  auto store = [](uint64_t* out, uint64_t total) {
+    if (out != nullptr) {
+      *out = total;
+    }
+  };
+  store(image_count, image_ids.size());
+  store(provisioned_bytes, total_bytes.load());
+  store(max_provisioned_bytes, total_max_bytes.load());
+  store(snapshot_count, total_snaps.load());
+
+  return 0;
+}
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Pool: " << __func__ << ": "
+
+// Enables the RBD application on the pool and, when rbd_validate_pool is set
+// after pool overrides are applied, returns a ValidatePoolRequest's result.
+template <typename I>
+int Pool<I>::init(librados::IoCtx& io_ctx, bool force) {
+  auto cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+  ldout(cct, 10) << dendl;
+
+  int r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RBD, force);
+  if (r < 0) {
+    return r;
+  }
+
+  ConfigProxy config{cct->_conf};
+  api::Config<I>::apply_pool_overrides(io_ctx, &config);
+  const bool validate = config.get_val<bool>("rbd_validate_pool");
+  if (!validate) {
+    return 0;
+  }
+  ThreadPool *thread_pool;
+  ContextWQ *op_work_queue;
+  ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+  C_SaferCond validated;
+  image::ValidatePoolRequest<I>::create(io_ctx, op_work_queue,
+                                        &validated)->send();
+  return validated.wait();
+}
+
+template <typename I>
+int Pool<I>::add_stat_option(StatOptions* stat_options,
+                             rbd_pool_stat_option_t option,
+                             uint64_t* value) {
+  switch (option) {  // registers `value` as the destination for `option`
+  case RBD_POOL_STAT_OPTION_IMAGES:
+  case RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES:
+  case RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES:
+  case RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS:
+  case RBD_POOL_STAT_OPTION_TRASH_IMAGES:
+  case RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES:
+  case RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES:
+  case RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS:
+    break;  // known option: fall out of the switch and register it
+  default:
+    return -ENOENT;  // unrecognised option value
+  }
+  stat_options->emplace(option, value);  // emplace keeps an existing entry
+  return 0;
+}
+
+template <typename I>
+int Pool<I>::get_stats(librados::IoCtx& io_ctx, StatOptions* stat_options) {  // fills every destination registered in stat_options; returns 0 or the first error hit
+ auto cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+ ldout(cct, 10) << dendl;
+
+ ConfigProxy config{cct->_conf};
+ api::Config<I>::apply_pool_overrides(io_ctx, &config);
+
+ uint64_t* image_count;  // these four pointers are assigned by get_pool_stat_option_value() before each use
+ uint64_t* provisioned_bytes;
+ uint64_t* max_provisioned_bytes;
+ uint64_t* snapshot_count;
+
+ std::vector<trash_image_info_t> trash_entries;  // listed once, consumed by both passes below
+ int r = Trash<I>::list(io_ctx, trash_entries, false);
+ if (r < 0 && r != -EOPNOTSUPP) {  // -EOPNOTSUPP from the trash listing is tolerated
+ return r;
+ }
+
+ get_pool_stat_option_value<I>(  // pass 1: destinations for the IMAGE_* options (nullptr when not requested)
+ stat_options, RBD_POOL_STAT_OPTION_IMAGES, &image_count);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES,
+ &provisioned_bytes);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES,
+ &max_provisioned_bytes);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, &snapshot_count);
+ if (image_count != nullptr || provisioned_bytes != nullptr ||  // skip the image scan when no IMAGE_* stat was requested
+ max_provisioned_bytes != nullptr || snapshot_count != nullptr) {
+ typename Image<I>::ImageNameToIds images;
+ int r = Image<I>::list_images_v2(io_ctx, &images);  // note: this r shadows the outer r
+ if (r < 0) {
+ return r;
+ }
+
+ std::vector<std::string> image_ids;
+ image_ids.reserve(images.size() + trash_entries.size());
+ for (auto& it : images) {
+ image_ids.push_back(std::move(it.second));  // images is local to this block, so its ids can be moved out
+ }
+ for (auto& it : trash_entries) {
+ if (it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {  // trash entries with source REMOVING are counted with the images
+ image_ids.push_back(std::move(it.id));  // moved-from ids are not read again: pass 2 skips REMOVING entries
+ }
+ }
+
+ r = get_pool_stats<I>(io_ctx, config, image_ids, image_count,
+ provisioned_bytes, max_provisioned_bytes,
+ snapshot_count);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ get_pool_stat_option_value<I>(  // pass 2: the same four pointers are re-pointed at the TRASH_* destinations
+ stat_options, RBD_POOL_STAT_OPTION_TRASH_IMAGES, &image_count);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES,
+ &provisioned_bytes);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES,
+ &max_provisioned_bytes);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS, &snapshot_count);
+ if (image_count != nullptr || provisioned_bytes != nullptr ||  // skip the trash scan when no TRASH_* stat was requested
+ max_provisioned_bytes != nullptr || snapshot_count != nullptr) {
+
+ std::vector<std::string> image_ids;
+ image_ids.reserve(trash_entries.size());
+ for (auto& it : trash_entries) {
+ if (it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {  // already accounted for in the image pass
+ continue;
+ }
+ image_ids.push_back(std::move(it.id));
+ }
+
+ r = get_pool_stats<I>(io_ctx, config, image_ids, image_count,
+ provisioned_bytes, max_provisioned_bytes,
+ snapshot_count);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Pool<librbd::ImageCtx>;
diff --git a/src/librbd/api/Pool.h b/src/librbd/api/Pool.h
new file mode 100644
index 00000000..7b607ab6
--- /dev/null
+++ b/src/librbd/api/Pool.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_POOL_H
+#define CEPH_LIBRBD_API_POOL_H
+
+#include "include/int_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.h"
+#include <map>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class Pool {  // static-only pool-level API; defined in Pool.cc
+public:
+ typedef std::map<rbd_pool_stat_option_t, uint64_t*> StatOptions;  // stat option -> caller-owned destination for its value
+
+ static int init(librados::IoCtx& io_ctx, bool force);
+
+ static int add_stat_option(StatOptions* stat_options,  // registers `value` as the destination for `option`
+ rbd_pool_stat_option_t option,
+ uint64_t* value);
+
+ static int get_stats(librados::IoCtx& io_ctx, StatOptions* stat_options);  // writes results through the registered pointers
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Pool<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_POOL_H
diff --git a/src/librbd/api/PoolMetadata.cc b/src/librbd/api/PoolMetadata.cc
new file mode 100644
index 00000000..f3d53944
--- /dev/null
+++ b/src/librbd/api/PoolMetadata.cc
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/PoolMetadata.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::PoolMetadata: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+// Sends a "config set" monitor command that sets the global option
+// rbd_config_pool_override_update_timestamp to the current time in seconds.
+// A failure is only logged; nothing is reported back to the caller.
+void update_pool_timestamp(librados::IoCtx& io_ctx) {
+  auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  const std::string timestamp = stringify(ceph_clock_now().sec());
+  const std::string cmd =
+    R"({"prefix": "config set", )"
+    R"("who": "global", )"
+    R"("name": "rbd_config_pool_override_update_timestamp", )"
+    R"("value": ")" + timestamp + R"("})";
+
+  bufferlist in_bl;
+  std::string status;
+  librados::Rados rados(io_ctx);
+  const int r = rados.mon_command(cmd, in_bl, nullptr, &status);
+  if (r < 0) {
+    lderr(cct) << "failed to notify clients of pool config update: "
+               << cpp_strerror(r) << dendl;
+  }
+}
+
+} // anonymous namespace
+
+// Reads pool metadata `key` from the RBD_INFO object into *value.
+// The cls result is returned unchanged.
+template <typename I>
+int PoolMetadata<I>::get(librados::IoCtx& io_ctx,
+                         const std::string &key, std::string *value) {
+  auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  const int r = cls_client::metadata_get(&io_ctx, RBD_INFO, key, value);
+  if (r < 0 && r != -ENOENT) {  // -ENOENT is returned without an error log
+    lderr(cct) << "failed reading metadata " << key << ": " << cpp_strerror(r)
+               << dendl;
+  }
+  return r;
+}
+
+// Stores pool metadata `key` = `value` in the RBD_INFO object.
+//
+// Keys that are config overrides are validated first: the option must pass
+// Config<I>::is_option_name and the value must be accepted by set_val on a
+// temporary ConfigProxy, otherwise -EINVAL is returned. After a successful
+// override write the pool update timestamp is refreshed.
+template <typename I>
+int PoolMetadata<I>::set(librados::IoCtx& io_ctx, const std::string &key,
+                         const std::string &value) {
+  auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  std::string config_key;
+  const bool is_override = util::is_metadata_config_override(key, &config_key);
+  if (is_override) {
+    if (!librbd::api::Config<I>::is_option_name(io_ctx, config_key)) {
+      lderr(cct) << "validation for " << key
+                 << " failed: not allowed pool level override" << dendl;
+      return -EINVAL;
+    }
+    const int r = ConfigProxy{false}.set_val(config_key.c_str(), value);
+    if (r < 0) {
+      lderr(cct) << "validation for " << key << " failed: " << cpp_strerror(r)
+                 << dendl;
+      return -EINVAL;
+    }
+  }
+
+  ceph::bufferlist value_bl;
+  value_bl.append(value);
+  const int r = cls_client::metadata_set(&io_ctx, RBD_INFO, {{key, value_bl}});
+  if (r < 0) {
+    lderr(cct) << "failed setting metadata " << key << ": " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  if (is_override) {
+    update_pool_timestamp(io_ctx);
+  }
+  return 0;
+}
+
+// Removes pool metadata `key` from the RBD_INFO object.
+// The key is read first, so a read error (including -ENOENT) is returned
+// as-is; removing a config override refreshes the pool update timestamp.
+template <typename I>
+int PoolMetadata<I>::remove(librados::IoCtx& io_ctx, const std::string &key) {
+  auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  std::string current_value;
+  int r = cls_client::metadata_get(&io_ctx, RBD_INFO, key, &current_value);
+  if (r == -ENOENT) {
+    ldout(cct, 1) << "metadata " << key << " does not exist" << dendl;
+    return r;
+  } else if (r < 0) {
+    lderr(cct) << "failed reading metadata " << key << ": " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  r = cls_client::metadata_remove(&io_ctx, RBD_INFO, key);
+  if (r < 0) {
+    lderr(cct) << "failed removing metadata " << key << ": " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  std::string config_key;
+  if (util::is_metadata_config_override(key, &config_key)) {
+    update_pool_timestamp(io_ctx);
+  }
+  return 0;
+}
+
+// Lists pool metadata pairs from the RBD_INFO object into *pairs, passing
+// `start` and `max` through to the cls call. -ENOENT is reported as
+// success; any other error is logged and returned.
+template <typename I>
+int PoolMetadata<I>::list(librados::IoCtx& io_ctx, const std::string &start,
+                          uint64_t max,
+                          std::map<std::string, ceph::bufferlist> *pairs) {
+  auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  int r = cls_client::metadata_list(&io_ctx, RBD_INFO, start, max, pairs);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed listing metadata: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::PoolMetadata<librbd::ImageCtx>;
diff --git a/src/librbd/api/PoolMetadata.h b/src/librbd/api/PoolMetadata.h
new file mode 100644
index 00000000..c0a81735
--- /dev/null
+++ b/src/librbd/api/PoolMetadata.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_POOL_METADATA_H
+#define CEPH_LIBRBD_API_POOL_METADATA_H
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+
+#include <map>
+#include <string>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PoolMetadata {  // static-only API for pool-level metadata key/value pairs; defined in PoolMetadata.cc
+public:
+ static int get(librados::IoCtx& io_ctx, const std::string &key,
+ std::string *value);  // value is an output parameter
+ static int set(librados::IoCtx& io_ctx, const std::string &key,
+ const std::string &value);
+ static int remove(librados::IoCtx& io_ctx, const std::string &key);
+ static int list(librados::IoCtx& io_ctx, const std::string &start,
+ uint64_t max, std::map<std::string, ceph::bufferlist> *pairs);  // pairs is an output map: key -> raw value buffer
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::PoolMetadata<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_POOL_METADATA_H
diff --git a/src/librbd/api/Snapshot.cc b/src/librbd/api/Snapshot.cc
new file mode 100644
index 00000000..3d29cfb2
--- /dev/null
+++ b/src/librbd/api/Snapshot.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Snapshot.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include <boost/variant.hpp>
+#include "include/Context.h"
+#include "common/Cond.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Snapshot: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+// Visitor that fills in a snap_group_namespace_t from a group snapshot
+// namespace; every other namespace type is rejected with -EINVAL.
+class GetGroupVisitor : public boost::static_visitor<int> {
+public:
+  CephContext* cct;
+  librados::IoCtx *image_ioctx;
+  snap_group_namespace_t *group_snap;
+
+  explicit GetGroupVisitor(CephContext* cct, librados::IoCtx *_image_ioctx,
+                           snap_group_namespace_t *group_snap)
+    : cct(cct), image_ioctx(_image_ioctx), group_snap(group_snap) {}
+
+  template <typename T>
+  inline int operator()(const T&) const {
+    return -EINVAL;  // not a group snapshot namespace
+  }
+
+  inline int operator()(
+      const cls::rbd::GroupSnapshotNamespace& snap_namespace) {
+    // IoCtx for the group's pool, created from the image's IoCtx
+    IoCtx group_ioctx;
+    int r = util::create_ioctx(*image_ioctx, "group", snap_namespace.group_pool,
+                               {}, &group_ioctx);
+    if (r < 0) {
+      return r;
+    }
+
+    std::string name;
+    r = cls_client::dir_get_name(&group_ioctx, RBD_GROUP_DIRECTORY,
+                                 snap_namespace.group_id, &name);
+    if (r < 0) {
+      lderr(cct) << "failed to retrieve group name: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    cls::rbd::GroupSnapshot snapshot;
+    const string header_oid = util::group_header_name(snap_namespace.group_id);
+    r = cls_client::group_snap_get_by_id(&group_ioctx, header_oid,
+                                         snap_namespace.group_snapshot_id,
+                                         &snapshot);
+    if (r < 0) {
+      lderr(cct) << "failed to retrieve group snapshot: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    group_snap->group_pool = group_ioctx.get_id();
+    group_snap->group_name = name;
+    group_snap->group_snap_name = snapshot.name;
+    return 0;
+  }
+};
+
+// Visitor that copies the original_name field out of a trash snapshot
+// namespace; every other namespace type is rejected with -EINVAL.
+class GetTrashVisitor : public boost::static_visitor<int> {
+public:
+  std::string* original_name;
+
+  explicit GetTrashVisitor(std::string* name_out) : original_name(name_out) {}
+
+  template <typename T>
+  inline int operator()(const T&) const {
+    return -EINVAL;  // not a trash snapshot namespace
+  }
+
+  inline int operator()(
+      const cls::rbd::TrashSnapshotNamespace& trash_namespace) {
+    *original_name = trash_namespace.original_name;
+    return 0;
+  }
+};
+
+} // anonymous namespace
+
+// Resolves the group pool, group name and group snapshot name for snapshot
+// `snap_id`. Returns -ENOENT for an unknown snapshot and -EINVAL (from the
+// visitor) when the snapshot is not in a group namespace.
+template <typename I>
+int Snapshot<I>::get_group_namespace(I *ictx, uint64_t snap_id,
+                                     snap_group_namespace_t *group_snap) {
+  int r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  RWLock::RLocker snap_locker(ictx->snap_lock);
+  const auto info = ictx->get_snap_info(snap_id);
+  if (info == nullptr) {
+    return -ENOENT;
+  }
+
+  // the visitor performs its cls lookups while snap_lock is still held
+  GetGroupVisitor visitor(ictx->cct, &ictx->md_ctx, group_snap);
+  r = boost::apply_visitor(visitor, info->snap_namespace);
+  return (r < 0) ? r : 0;
+}
+
+// Retrieves the original_name recorded in the trash namespace of snapshot
+// `snap_id`. Returns -ENOENT for an unknown snapshot and -EINVAL (from the
+// visitor) when the snapshot is not in the trash namespace.
+template <typename I>
+int Snapshot<I>::get_trash_namespace(I *ictx, uint64_t snap_id,
+                                     std::string* original_name) {
+  int r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  RWLock::RLocker snap_locker(ictx->snap_lock);
+  const auto info = ictx->get_snap_info(snap_id);
+  if (info == nullptr) {
+    return -ENOENT;
+  }
+
+  // any non-negative visitor result is reported as 0
+  GetTrashVisitor visitor(original_name);
+  r = boost::apply_visitor(visitor, info->snap_namespace);
+  return (r < 0) ? r : 0;
+}
+
+// Reports the namespace type of snapshot `snap_id` through *namespace_type;
+// returns -ENOENT when the snapshot is unknown.
+template <typename I>
+int Snapshot<I>::get_namespace_type(I *ictx, uint64_t snap_id,
+                                    snap_namespace_type_t *namespace_type) {
+  const int r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+  RWLock::RLocker snap_locker(ictx->snap_lock);
+  const auto info = ictx->get_snap_info(snap_id);
+  if (info == nullptr) {
+    return -ENOENT;
+  }
+  const auto type = cls::rbd::get_snap_namespace_type(info->snap_namespace);
+  *namespace_type = static_cast<snap_namespace_type_t>(type);
+  return 0;
+}
+
+// Removes snapshot `snap_id`: copies its namespace and name under snap_lock,
+// then waits for Operations::snap_remove. -ENOENT if the id is unknown.
+template <typename I>
+int Snapshot<I>::remove(I *ictx, uint64_t snap_id) {
+  ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_id << dendl;
+  const int r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  std::string name;
+  cls::rbd::SnapshotNamespace snap_namespace;
+  {
+    // the lock is released at the end of this scope, before the removal
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    const auto found = ictx->snap_info.find(snap_id);
+    if (found == ictx->snap_info.end()) {
+      return -ENOENT;
+    }
+    snap_namespace = found->second.snap_namespace;
+    name = found->second.name;
+  }
+
+  C_SaferCond removed;
+  ictx->operations->snap_remove(snap_namespace, name, &removed);
+  return removed.wait();
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Snapshot<librbd::ImageCtx>;
diff --git a/src/librbd/api/Snapshot.h b/src/librbd/api/Snapshot.h
new file mode 100644
index 00000000..453861c4
--- /dev/null
+++ b/src/librbd/api/Snapshot.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_SNAPSHOT_H
+#define CEPH_LIBRBD_API_SNAPSHOT_H
+
+#include "include/rbd/librbd.hpp"
+#include <string>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+// Static snapshot helpers, templated on the image context type.
+// Each function addresses a snapshot by its numeric snap_id and returns an
+// int status code (0 on success, negative on failure).
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Snapshot {
+
+ // Fetch group namespace details of the snapshot into group_snap.
+ static int get_group_namespace(ImageCtxT *ictx, uint64_t snap_id,
+ snap_group_namespace_t *group_snap);
+
+ // Run the trash-namespace visitor for the snapshot, with original_name as
+ // its output argument; returns -ENOENT when snap_id is not found.
+ static int get_trash_namespace(ImageCtxT *ictx, uint64_t snap_id,
+ std::string *original_name);
+
+ // Store the snapshot's namespace type in namespace_type; returns -ENOENT
+ // when snap_id is not found.
+ static int get_namespace_type(ImageCtxT *ictx, uint64_t snap_id,
+ snap_namespace_type_t *namespace_type);
+
+ // Remove the snapshot, blocking until the operation completes; returns
+ // -ENOENT when snap_id is not found.
+ static int remove(ImageCtxT *ictx, uint64_t snap_id);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Snapshot<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_SNAPSHOT_H
diff --git a/src/librbd/api/Trash.cc b/src/librbd/api/Trash.cc
new file mode 100644
index 00000000..0664461e
--- /dev/null
+++ b/src/librbd/api/Trash.cc
@@ -0,0 +1,681 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Trash.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/TrashWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/api/DiffIterate.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "librbd/trash/MoveRequest.h"
+#include "librbd/trash/RemoveRequest.h"
+#include <json_spirit/json_spirit.h>
+#include "librbd/journal/DisabledPolicy.h"
+#include "librbd/image/ListWatchersRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Trash: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+// Set of trash sources (user and mirroring) exported as the restore
+// whitelist; restore() compares a trash entry's source against the set its
+// caller supplies.
+template <typename I>
+const typename Trash<I>::TrashImageSources Trash<I>::RESTORE_SOURCE_WHITELIST {
+ cls::rbd::TRASH_IMAGE_SOURCE_USER,
+ cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING
+ };
+
+namespace {
+
+/**
+ * Disable mirroring for an open image.
+ *
+ * Nothing is done when the journaling feature is off or when no mirror
+ * image record exists (-ENOENT). Otherwise a mirror::DisableRequest is
+ * sent and waited on.
+ *
+ * @return 0 on success or when there was nothing to disable, otherwise a
+ *         negative error code.
+ */
+template <typename I>
+int disable_mirroring(I *ictx) {
+  const bool journaling = ictx->test_features(RBD_FEATURE_JOURNALING);
+  if (!journaling) {
+    return 0;
+  }
+
+  cls::rbd::MirrorImage mirror_image;
+  int r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, &mirror_image);
+  if (r == -ENOENT) {
+    ldout(ictx->cct, 10) << "mirroring is not enabled for this image" << dendl;
+    return 0;
+  } else if (r < 0) {
+    lderr(ictx->cct) << "failed to retrieve mirror image: " << cpp_strerror(r)
+                     << dendl;
+    return r;
+  }
+
+  ldout(ictx->cct, 10) << dendl;
+
+  // issue the disable request and block until it has completed
+  C_SaferCond disable_ctx;
+  auto request = mirror::DisableRequest<I>::create(ictx, false, true,
+                                                   &disable_ctx);
+  request->send();
+  r = disable_ctx.wait();
+  if (r >= 0) {
+    return 0;
+  }
+
+  lderr(ictx->cct) << "failed to disable mirroring: " << cpp_strerror(r)
+                   << dendl;
+  return r;
+}
+
+/**
+ * Enable mirroring for the image @p image_id if it has the journaling
+ * feature and the pool is in pool mirroring mode.
+ *
+ * @param io_ctx    IoCtx used for the feature / mirror-mode lookups and
+ *                  passed to the enable request
+ * @param image_id  id of the image whose header is inspected
+ * @return 0 on success or when there was nothing to do, otherwise a
+ *         negative error code.
+ */
+template <typename I>
+int enable_mirroring(IoCtx &io_ctx, const std::string &image_id) {
+  auto cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+
+  uint64_t features = 0;
+  uint64_t incompatible_features = 0;
+  int r = cls_client::get_features(&io_ctx, util::header_name(image_id), true,
+                                   &features, &incompatible_features);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve features: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  if ((features & RBD_FEATURE_JOURNALING) == 0) {
+    return 0;
+  }
+
+  // -ENOENT is tolerated below, so the mode needs a defined value in case
+  // the lookup returns without writing its output argument
+  cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+  r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  if (mirror_mode != cls::rbd::MIRROR_MODE_POOL) {
+    ldout(cct, 10) << "not pool mirroring mode" << dendl;
+    return 0;
+  }
+
+  ldout(cct, 10) << dendl;
+
+  ThreadPool *thread_pool;
+  ContextWQ *op_work_queue;
+  ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+  // send the enable request and block until it has completed
+  C_SaferCond ctx;
+  auto req = mirror::EnableRequest<I>::create(io_ctx, image_id, "",
+                                              op_work_queue, &ctx);
+  req->send();
+  r = ctx.wait();
+  if (r < 0) {
+    lderr(cct) << "failed to enable mirroring: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+} // anonymous namespace
+
+// Move the image (image_name, image_id) into the trash.
+//
+// The image is first opened so that it can be checked and prepared; an open
+// failure of -ENOENT skips that preparation, any other open failure aborts.
+// A trash spec is then built from source/name/timestamps and handed to
+// trash::MoveRequest, after which the trash state is switched from MOVING
+// to NORMAL and an "image added" notification is sent.
+//
+// delay is added to the deletion time to form the deferment end time
+// (presumably seconds -- TODO confirm against utime_t semantics).
+//
+// Returns 0 on success, -EBUSY if the exclusive lock cannot be obtained or
+// the image is migrating, otherwise a negative error code. A failed
+// notification is logged but does not fail the call.
+template <typename I>
+int Trash<I>::move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+ const std::string &image_name, const std::string &image_id,
+ uint64_t delay) {
+ ceph_assert(!image_name.empty() && !image_id.empty());
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << &io_ctx << " name=" << image_name << ", id=" << image_id
+ << dendl;
+
+ auto ictx = new I("", image_id, nullptr, io_ctx, false);
+ int r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+
+ // NOTE(review): ictx is neither closed nor deleted on this path; this
+ // presumably relies on open() disposing of it on failure -- confirm.
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (r == 0) {
+ // install a disabled journal policy when the journaling feature is on
+ if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+ RWLock::WLocker snap_locker(ictx->snap_lock);
+ ictx->set_journal_policy(new journal::DisabledPolicy());
+ }
+
+ // owner_lock is taken/released manually; every exit path below must
+ // release it before closing the image
+ ictx->owner_lock.get_read();
+ if (ictx->exclusive_lock != nullptr) {
+ ictx->exclusive_lock->block_requests(0);
+
+ r = ictx->operations->prepare_image_update(
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, false);
+ if (r < 0) {
+ lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl;
+ ictx->owner_lock.put_read();
+ ictx->state->close();
+ return -EBUSY;
+ }
+ }
+ ictx->owner_lock.put_read();
+
+ // refuse to trash an image that has migration info attached
+ ictx->snap_lock.get_read();
+ if (!ictx->migration_info.empty()) {
+ lderr(cct) << "cannot move migrating image to trash" << dendl;
+ ictx->snap_lock.put_read();
+ ictx->state->close();
+ return -EBUSY;
+ }
+ ictx->snap_lock.put_read();
+
+ r = disable_mirroring<I>(ictx);
+ if (r < 0) {
+ ictx->state->close();
+ return r;
+ }
+
+ ictx->state->close();
+ }
+
+ // deferment ends `delay` after the deletion timestamp
+ utime_t delete_time{ceph_clock_now()};
+ utime_t deferment_end_time{delete_time};
+ deferment_end_time += delay;
+ cls::rbd::TrashImageSpec trash_image_spec{
+ static_cast<cls::rbd::TrashImageSource>(source), image_name,
+ delete_time, deferment_end_time};
+
+ // the entry is created in the MOVING state ...
+ trash_image_spec.state = cls::rbd::TRASH_IMAGE_STATE_MOVING;
+ C_SaferCond ctx;
+ auto req = trash::MoveRequest<I>::create(io_ctx, image_id, trash_image_spec,
+ &ctx);
+ req->send();
+
+ // ... and flipped to NORMAL afterwards, regardless of the move result;
+ // -EOPNOTSUPP from the state update is tolerated
+ r = ctx.wait();
+ trash_image_spec.state = cls::rbd::TRASH_IMAGE_STATE_NORMAL;
+ int ret = cls_client::trash_state_set(&io_ctx, image_id,
+ trash_image_spec.state,
+ cls::rbd::TRASH_IMAGE_STATE_MOVING);
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ lderr(cct) << "error setting trash image state: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ // best-effort notification: a failure is only logged
+ C_SaferCond notify_ctx;
+ TrashWatcher<I>::notify_image_added(io_ctx, image_id, trash_image_spec,
+ &notify_ctx);
+ r = notify_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return 0;
+}
+
+/**
+ * Move an image to the trash by name.
+ *
+ * The name is resolved to an image id through the RBD directory and the
+ * call is then forwarded to the (name, id) overload.
+ *
+ * @return -EOPNOTSUPP if only an old-format header object exists for the
+ *         name, -ENOENT if the image cannot be found, another negative
+ *         code on lookup failure, otherwise the result of the forwarded
+ *         call.
+ */
+template <typename I>
+int Trash<I>::move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+                   const std::string &image_name, uint64_t delay) {
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << &io_ctx << " name=" << image_name << dendl;
+
+  // try to get image id from the directory
+  std::string image_id;
+  const int lookup_result = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY,
+                                                   image_name, &image_id);
+  if (lookup_result == -ENOENT) {
+    const int stat_result = io_ctx.stat(util::old_header_name(image_name),
+                                        nullptr, nullptr);
+    if (stat_result != 0) {
+      // image doesn't exist -- perhaps already in the trash since removing
+      // from the directory is the last step
+      return -ENOENT;
+    }
+
+    // cannot move V1 image to trash
+    ldout(cct, 10) << "cannot move v1 image to trash" << dendl;
+    return -EOPNOTSUPP;
+  }
+
+  if (lookup_result < 0) {
+    lderr(cct) << "failed to retrieve image id: "
+               << cpp_strerror(lookup_result) << dendl;
+    return lookup_result;
+  }
+
+  ceph_assert(!image_name.empty() && !image_id.empty());
+  return Trash<I>::move(io_ctx, source, image_name, image_id, delay);
+}
+
+/**
+ * Fetch the trash entry for image @p id and convert it into the public
+ * trash_image_info_t representation stored in @p info.
+ *
+ * @return 0 on success; a negative code otherwise (-ENOENT is returned
+ *         without logging an error).
+ */
+template <typename I>
+int Trash<I>::get(IoCtx &io_ctx, const std::string &id,
+                  trash_image_info_t *info) {
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << __func__ << " " << &io_ctx << dendl;
+
+  cls::rbd::TrashImageSpec trash_spec;
+  const int r = cls_client::trash_get(&io_ctx, id, &trash_spec);
+  if (r < 0) {
+    if (r != -ENOENT) {
+      lderr(cct) << "error retrieving trash entry: " << cpp_strerror(r)
+                 << dendl;
+    }
+    return r;
+  }
+
+  *info = trash_image_info_t{
+    id, trash_spec.name,
+    static_cast<rbd_trash_image_source_t>(trash_spec.source),
+    trash_spec.deletion_time.sec(), trash_spec.deferment_end_time.sec()};
+  return 0;
+}
+
+/**
+ * Append every trash entry of the pool to @p entries.
+ *
+ * Entries are fetched in pages of up to 1024, resuming after the last key
+ * of the previous page. When @p exclude_user_remove_source is set, entries
+ * whose source is RBD_TRASH_IMAGE_SOURCE_REMOVING are left out.
+ *
+ * @return 0 on success (including when the listing reports -ENOENT),
+ *         otherwise a negative error code.
+ */
+template <typename I>
+int Trash<I>::list(IoCtx &io_ctx, vector<trash_image_info_t> &entries,
+                   bool exclude_user_remove_source) {
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << "trash_list " << &io_ctx << dendl;
+
+  const uint32_t page_size = 1024;
+  std::string resume_after = "";
+  while (true) {
+    map<string, cls::rbd::TrashImageSpec> page;
+    int r = cls_client::trash_list(&io_ctx, resume_after, page_size, &page);
+    if (r == -ENOENT) {
+      break;
+    }
+    if (r < 0) {
+      lderr(cct) << "error listing rbd trash entries: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    if (page.empty()) {
+      break;
+    }
+
+    for (const auto &item : page) {
+      const auto &spec = item.second;
+      rbd_trash_image_source_t source =
+        static_cast<rbd_trash_image_source_t>(spec.source);
+      const bool skip = exclude_user_remove_source &&
+                        source == RBD_TRASH_IMAGE_SOURCE_REMOVING;
+      if (!skip) {
+        entries.push_back({item.first, spec.name, source,
+                           spec.deletion_time.sec(),
+                           spec.deferment_end_time.sec()});
+      }
+    }
+
+    // a short page means there is nothing further to read
+    resume_after = page.rbegin()->first;
+    if (page.size() < page_size) {
+      break;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Purge user-trashed images from the pool's trash.
+ *
+ * Two selection criteria are combined:
+ *  - when @p threshold is not -1 it must lie in [0, 1]; the "df" mon
+ *    command is queried and, for every pool whose percent_used exceeds the
+ *    threshold, trashed images (ordered by deferment end time) are selected
+ *    until their accumulated allocated bytes reach the amount needed to get
+ *    back under the threshold;
+ *  - every entry whose deferment end time is not later than @p expire_ts
+ *    is selected (@p expire_ts == 0 means "now").
+ *
+ * Selected images are removed with force; @p pctx is advanced once per
+ * removed image.
+ *
+ * @return 0 on success (or when a threshold was given and nothing has to
+ *         be freed), -EINVAL for an out-of-range threshold, -EBADMSG if
+ *         the df output cannot be parsed, otherwise the first negative
+ *         error encountered.
+ */
+template <typename I>
+int Trash<I>::purge(IoCtx& io_ctx, time_t expire_ts,
+                    float threshold, ProgressContext& pctx) {
+  auto *cct((CephContext *) io_ctx.cct());
+  ldout(cct, 20) << &io_ctx << dendl;
+
+  std::vector<librbd::trash_image_info_t> trash_entries;
+  int r = librbd::api::Trash<I>::list(io_ctx, trash_entries, true);
+  if (r < 0) {
+    return r;
+  }
+
+  // only images that were trashed by the user are purge candidates
+  trash_entries.erase(
+      std::remove_if(trash_entries.begin(), trash_entries.end(),
+                     [](const librbd::trash_image_info_t &info) {
+                       return info.source != RBD_TRASH_IMAGE_SOURCE_USER;
+                     }),
+      trash_entries.end());
+
+  std::set<std::string> to_be_removed;
+  if (threshold != -1) {
+    if (threshold < 0 || threshold > 1) {
+      lderr(cct) << "argument 'threshold' is out of valid range"
+                 << dendl;
+      return -EINVAL;
+    }
+
+    librados::bufferlist inbl;
+    librados::bufferlist outbl;
+    std::string pool_name = io_ctx.get_pool_name();
+
+    librados::Rados rados(io_ctx);
+    r = rados.mon_command(R"({"prefix": "df", "format": "json"})", inbl,
+                          &outbl, nullptr);
+    if (r < 0) {
+      // report the real failure instead of a misleading parse error below
+      lderr(cct) << "failed to query pool usage: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+
+    json_spirit::mValue json;
+    if (!json_spirit::read(outbl.to_str(), json)) {
+      lderr(cct) << "ceph df json output could not be parsed"
+                 << dendl;
+      return -EBADMSG;
+    }
+
+    json_spirit::mArray arr = json.get_obj()["pools"].get_array();
+
+    double pool_percent_used = 0;
+    uint64_t pool_total_bytes = 0;
+
+    // pool name -> ids of trashed images whose data lives in that pool
+    std::map<std::string, std::vector<std::string>> datapools;
+
+    // earliest deferment end first, so those images are selected first
+    std::sort(trash_entries.begin(), trash_entries.end(),
+        [](const librbd::trash_image_info_t &a,
+           const librbd::trash_image_info_t &b) {
+          return a.deferment_end_time < b.deferment_end_time;
+        }
+    );
+
+    for (const auto &entry : trash_entries) {
+      int64_t data_pool_id = -1;
+      r = cls_client::get_data_pool(&io_ctx, util::header_name(entry.id),
+                                    &data_pool_id);
+      if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) {
+        lderr(cct) << "failed to query data pool: " << cpp_strerror(r) << dendl;
+        return r;
+      } else if (data_pool_id == -1) {
+        // no separate data pool recorded: data lives in this pool
+        data_pool_id = io_ctx.get_id();
+      }
+
+      if (data_pool_id != io_ctx.get_id()) {
+        librados::IoCtx data_io_ctx;
+        r = util::create_ioctx(io_ctx, "image", data_pool_id,
+                               {}, &data_io_ctx);
+        if (r < 0) {
+          lderr(cct) << "error accessing data pool" << dendl;
+          continue;
+        }
+        auto data_pool = data_io_ctx.get_pool_name();
+        datapools[data_pool].push_back(entry.id);
+      } else {
+        datapools[pool_name].push_back(entry.id);
+      }
+    }
+
+    uint64_t bytes_to_free = 0;
+
+    // size_t index: a narrower counter would wrap around (and loop forever)
+    // once the cluster reports more pools than it can represent
+    for (size_t i = 0; i < arr.size(); ++i) {
+      json_spirit::mObject obj = arr[i].get_obj();
+      std::string name = obj.find("name")->second.get_str();
+      auto img = datapools.find(name);
+      if (img != datapools.end()) {
+        json_spirit::mObject stats = arr[i].get_obj()["stats"].get_obj();
+        pool_percent_used = stats["percent_used"].get_real();
+        if (pool_percent_used <= threshold) continue;
+
+        bytes_to_free = 0;
+
+        pool_total_bytes = stats["max_avail"].get_uint64() +
+                           stats["bytes_used"].get_uint64();
+
+        auto bytes_threshold = (uint64_t) (pool_total_bytes *
+                               (pool_percent_used - threshold));
+
+        for (const auto &it : img->second) {
+          auto ictx = new I("", it, nullptr, io_ctx, false);
+          r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+          if (r == -ENOENT) {
+            continue;
+          } else if (r < 0) {
+            lderr(cct) << "failed to open image " << it << ": "
+                       << cpp_strerror(r) << dendl;
+            // the image was not opened, so it must not be used or closed.
+            // NOTE(review): open() appears to dispose of the ImageCtx on
+            // failure (move() never closes it on that path) -- confirm.
+            continue;
+          }
+
+          // accumulate the allocated extents of the image
+          r = librbd::api::DiffIterate<I>::diff_iterate(
+            ictx, cls::rbd::UserSnapshotNamespace(), nullptr, 0, ictx->size,
+            false, true,
+            [](uint64_t offset, size_t len, int exists, void *arg) {
+                auto *to_free = reinterpret_cast<uint64_t *>(arg);
+                if (exists)
+                  (*to_free) += len;
+                return 0;
+            }, &bytes_to_free);
+
+          ictx->state->close();
+          if (r < 0) {
+            lderr(cct) << "failed to calculate disk usage for image " << it
+                       << ": " << cpp_strerror(r) << dendl;
+            continue;
+          }
+
+          to_be_removed.insert(it);
+          if (bytes_to_free >= bytes_threshold) {
+            break;
+          }
+        }
+      }
+    }
+
+    if (bytes_to_free == 0) {
+      ldout(cct, 10) << "pool usage is lower than or equal to "
+                     << (threshold * 100)
+                     << "%" << dendl;
+      return 0;
+    }
+  }
+
+  if (expire_ts == 0) {
+    struct timespec now;
+    clock_gettime(CLOCK_REALTIME, &now);
+    expire_ts = now.tv_sec;
+  }
+
+  for (const auto &entry : trash_entries) {
+    if (expire_ts >= entry.deferment_end_time) {
+      to_be_removed.insert(entry.id);
+    }
+  }
+
+  // per-image progress is discarded; pctx counts removed images instead
+  NoOpProgressContext remove_pctx;
+  uint64_t list_size = to_be_removed.size(), i = 0;
+  for (const auto &entry_id : to_be_removed) {
+    r = librbd::api::Trash<I>::remove(io_ctx, entry_id, true, remove_pctx);
+    if (r < 0) {
+      if (r == -ENOTEMPTY) {
+        ldout(cct, 5) << "image has snapshots - these must be deleted "
+                      << "with 'rbd snap purge' before the image can be "
+                      << "removed." << dendl;
+      } else if (r == -EBUSY) {
+        ldout(cct, 5) << "error: image still has watchers" << std::endl
+                      << "This means the image is still open or the client "
+                      << "using it crashed. Try again after closing/unmapping "
+                      << "it or waiting 30s for the crashed client to timeout."
+                      << dendl;
+      } else if (r == -EMLINK) {
+        ldout(cct, 5) << "Remove the image from the group and try again."
+                      << dendl;
+      } else {
+        lderr(cct) << "remove error: " << cpp_strerror(r) << dendl;
+      }
+      return r;
+    }
+    pctx.update_progress(++i, list_size);
+  }
+
+  return 0;
+}
+
+/**
+ * Permanently remove image @p image_id from the trash.
+ *
+ * @param force     bypass the deferment-time check and forwarded to the
+ *                  remove request
+ * @param prog_ctx  progress sink handed to the remove request
+ * @return 0 on success, -EPERM if the deferment time has not expired and
+ *         @p force is not set, -EBUSY if the entry is neither in the
+ *         NORMAL nor the REMOVING state, otherwise a negative error code.
+ *         A failed notification is logged but does not fail the call.
+ */
+template <typename I>
+int Trash<I>::remove(IoCtx &io_ctx, const std::string &image_id, bool force,
+                     ProgressContext& prog_ctx) {
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << "trash_remove " << &io_ctx << " " << image_id
+                 << " " << force << dendl;
+
+  cls::rbd::TrashImageSpec spec;
+  int r = cls_client::trash_get(&io_ctx, image_id, &spec);
+  if (r < 0) {
+    lderr(cct) << "error getting image id " << image_id
+               << " info from trash: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  const bool deferment_pending = (ceph_clock_now() < spec.deferment_end_time);
+  if (deferment_pending && !force) {
+    lderr(cct) << "error: deferment time has not expired." << dendl;
+    return -EPERM;
+  }
+
+  const bool removable_state =
+    (spec.state == cls::rbd::TRASH_IMAGE_STATE_NORMAL ||
+     spec.state == cls::rbd::TRASH_IMAGE_STATE_REMOVING);
+  if (!removable_state) {
+    lderr(cct) << "error: image is pending restoration." << dendl;
+    return -EBUSY;
+  }
+
+  ThreadPool *thread_pool;
+  ContextWQ *op_work_queue;
+  ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+  // run the removal and block until it has completed
+  C_SaferCond remove_ctx;
+  auto request = librbd::trash::RemoveRequest<I>::create(
+    io_ctx, image_id, op_work_queue, force, prog_ctx, &remove_ctx);
+  request->send();
+
+  r = remove_ctx.wait();
+  if (r < 0) {
+    return r;
+  }
+
+  // best-effort notification: a failure is only logged
+  C_SaferCond notify_ctx;
+  TrashWatcher<I>::notify_image_removed(io_ctx, image_id, &notify_ctx);
+  const int notify_result = notify_ctx.wait();
+  if (notify_result < 0) {
+    lderr(cct) << "failed to send update notification: "
+               << cpp_strerror(notify_result) << dendl;
+  }
+
+  return 0;
+}
+
+// Restore image image_id from the trash under image_new_name (or under the
+// name recorded in the trash entry when image_new_name is empty).
+//
+// Steps: validate the entry's source against trash_image_sources, switch
+// the trash state NORMAL -> RESTORING, create the id object (unless one
+// already maps the name to this image), add the image to the v2 directory,
+// try to re-enable mirroring (failure ignored), remove the trash entry and
+// send a "removed" notification (failure only logged).
+//
+// Returns 0 on success, -EINVAL if the entry's source is not in
+// trash_image_sources, -EBUSY if the entry is neither NORMAL nor RESTORING,
+// -EEXIST if a different image already uses the name, otherwise a negative
+// error code.
+template <typename I>
+int Trash<I>::restore(librados::IoCtx &io_ctx,
+ const TrashImageSources& trash_image_sources,
+ const std::string &image_id,
+ const std::string &image_new_name) {
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << "trash_restore " << &io_ctx << " " << image_id << " "
+ << image_new_name << dendl;
+
+ cls::rbd::TrashImageSpec trash_spec;
+ int r = cls_client::trash_get(&io_ctx, image_id, &trash_spec);
+ if (r < 0) {
+ lderr(cct) << "error getting image id " << image_id
+ << " info from trash: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (trash_image_sources.count(trash_spec.source) == 0) {
+ lderr(cct) << "Current trash source '" << trash_spec.source << "' "
+ << "does not match expected: "
+ << trash_image_sources << dendl;
+ return -EINVAL;
+ }
+
+ // RESTORING is accepted as well as NORMAL, so an earlier restore that
+ // stopped part-way can be retried
+ std::string image_name = image_new_name;
+ if (trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL &&
+ trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_RESTORING) {
+ lderr(cct) << "error restoring image id " << image_id
+ << ", which is pending deletion" << dendl;
+ return -EBUSY;
+ }
+ // -EOPNOTSUPP from the state update is tolerated
+ r = cls_client::trash_state_set(&io_ctx, image_id,
+ cls::rbd::TRASH_IMAGE_STATE_RESTORING,
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ lderr(cct) << "error setting trash image state: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (image_name.empty()) {
+ // if user didn't specify a new name, let's try using the old name
+ image_name = trash_spec.name;
+ ldout(cct, 20) << "restoring image id " << image_id << " with name "
+ << image_name << dendl;
+ }
+
+ // check if no image exists with the same name
+ bool create_id_obj = true;
+ std::string existing_id;
+ r = cls_client::get_id(&io_ctx, util::id_obj_name(image_name), &existing_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error checking if image " << image_name << " exists: "
+ << cpp_strerror(r) << dendl;
+ // roll the trash state back to NORMAL before reporting the failure
+ int ret = cls_client::trash_state_set(&io_ctx, image_id,
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL,
+ cls::rbd::TRASH_IMAGE_STATE_RESTORING);
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ lderr(cct) << "error setting trash image state: "
+ << cpp_strerror(ret) << dendl;
+ }
+ return r;
+ } else if (r != -ENOENT){
+ // checking if we are recovering from an incomplete restore
+ if (existing_id != image_id) {
+ ldout(cct, 2) << "an image with the same name already exists" << dendl;
+ // name is taken by another image: roll the trash state back to NORMAL
+ int r2 = cls_client::trash_state_set(&io_ctx, image_id,
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL,
+ cls::rbd::TRASH_IMAGE_STATE_RESTORING);
+ if (r2 < 0 && r2 != -EOPNOTSUPP) {
+ lderr(cct) << "error setting trash image state: "
+ << cpp_strerror(r2) << dendl;
+ }
+ return -EEXIST;
+ }
+ // the id object already points at this image; do not create it again
+ create_id_obj = false;
+ }
+
+ if (create_id_obj) {
+ ldout(cct, 2) << "adding id object" << dendl;
+ librados::ObjectWriteOperation op;
+ op.create(true);
+ cls_client::set_id(&op, image_id);
+ r = io_ctx.operate(util::id_obj_name(image_name), &op);
+ if (r < 0) {
+ lderr(cct) << "error adding id object for image " << image_name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ // -EEXIST is tolerated here (directory entry already present)
+ ldout(cct, 2) << "adding rbd image to v2 directory..." << dendl;
+ r = cls_client::dir_add_image(&io_ctx, RBD_DIRECTORY, image_name,
+ image_id);
+ if (r < 0 && r != -EEXIST) {
+ lderr(cct) << "error adding image to v2 directory: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = enable_mirroring<I>(io_ctx, image_id);
+ if (r < 0) {
+ // not fatal -- ignore
+ }
+
+ // -ENOENT is tolerated here (trash entry already gone)
+ ldout(cct, 2) << "removing image from trash..." << dendl;
+ r = cls_client::trash_remove(&io_ctx, image_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error removing image id " << image_id << " from trash: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // best-effort notification: a failure is only logged
+ C_SaferCond notify_ctx;
+ TrashWatcher<I>::notify_image_removed(io_ctx, image_id, &notify_ctx);
+ r = notify_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Trash<librbd::ImageCtx>;
diff --git a/src/librbd/api/Trash.h b/src/librbd/api/Trash.h
new file mode 100644
index 00000000..65b6b8bc
--- /dev/null
+++ b/src/librbd/api/Trash.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_API_TRASH_H
+#define LIBRBD_API_TRASH_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <set>
+#include <string>
+#include <vector>
+
+namespace librbd {
+
+class ProgressContext;
+
+struct ImageCtx;
+
+namespace api {
+
+// Static trash helpers, templated on the image context type.
+// Every function returns 0 on success or a negative error code.
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Trash {
+ typedef std::set<cls::rbd::TrashImageSource> TrashImageSources;
+ // trash sources (user, mirroring) offered as the default restore filter
+ static const TrashImageSources RESTORE_SOURCE_WHITELIST;
+
+ // Move an image to the trash by name; the id is looked up in the RBD
+ // directory and the call is forwarded to the overload below.
+ static int move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+ const std::string &image_name, uint64_t delay);
+ // Move an image to the trash; delay is added to the deletion time to form
+ // the deferment end time.
+ static int move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+ const std::string &image_name, const std::string &image_id,
+ uint64_t delay);
+ // Fetch a single trash entry by image id.
+ static int get(librados::IoCtx &io_ctx, const std::string &id,
+ trash_image_info_t *info);
+ // Append all trash entries to entries; optionally leaves out entries whose
+ // source is RBD_TRASH_IMAGE_SOURCE_REMOVING.
+ static int list(librados::IoCtx &io_ctx,
+ std::vector<trash_image_info_t> &entries,
+ bool exclude_user_remove_source);
+ // Remove user-trashed images that are expired (relative to expire_ts, 0
+ // meaning "now") and/or needed to bring pool usage under threshold
+ // (a fraction in [0, 1]; -1 disables the usage check).
+ static int purge(IoCtx& io_ctx, time_t expire_ts,
+ float threshold, ProgressContext& pctx);
+ // Permanently delete a trashed image; force bypasses the deferment check.
+ static int remove(librados::IoCtx &io_ctx, const std::string &image_id,
+ bool force, ProgressContext& prog_ctx);
+ // Restore a trashed image whose source is in trash_image_sources; an empty
+ // image_new_name restores under the name recorded in the trash entry.
+ static int restore(librados::IoCtx &io_ctx,
+ const TrashImageSources& trash_image_sources,
+ const std::string &image_id,
+ const std::string &image_new_name);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Trash<librbd::ImageCtx>;
+
+#endif // LIBRBD_API_TRASH_H
diff --git a/src/librbd/cache/ImageCache.h b/src/librbd/cache/ImageCache.h
new file mode 100644
index 00000000..71af11f2
--- /dev/null
+++ b/src/librbd/cache/ImageCache.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_IMAGE_CACHE
+#define CEPH_LIBRBD_CACHE_IMAGE_CACHE
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include <vector>
+
+class Context;
+
+namespace librbd {
+namespace cache {
+
+/**
+ * client-side, image extent cache interface
+ *
+ * Pure-virtual interface: every operation is asynchronous and takes an
+ * on_finish Context through which completion is reported.
+ */
+struct ImageCache {
+ // list of (offset, length) pairs
+ typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+
+ virtual ~ImageCache() {
+ }
+
+ /// client AIO methods
+ virtual void aio_read(Extents&& image_extents, ceph::bufferlist* bl,
+ int fadvise_flags, Context *on_finish) = 0;
+ virtual void aio_write(Extents&& image_extents, ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish) = 0;
+ virtual void aio_discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes,
+ Context *on_finish) = 0;
+ virtual void aio_flush(Context *on_finish) = 0;
+ virtual void aio_writesame(uint64_t offset, uint64_t length,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish) = 0;
+ // mismatch_offset is an output argument of the compare step
+ virtual void aio_compare_and_write(Extents&& image_extents,
+ ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags,
+ Context *on_finish) = 0;
+
+ /// internal state methods
+ virtual void init(Context *on_finish) = 0;
+ virtual void shut_down(Context *on_finish) = 0;
+
+ virtual void invalidate(Context *on_finish) = 0;
+ virtual void flush(Context *on_finish) = 0;
+
+};
+
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_IMAGE_CACHE
diff --git a/src/librbd/cache/ImageWriteback.cc b/src/librbd/cache/ImageWriteback.cc
new file mode 100644
index 00000000..ad479fbd
--- /dev/null
+++ b/src/librbd/cache/ImageWriteback.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ImageWriteback.h"
+#include "include/buffer.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ReadResult.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageWriteback: " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+
+// Bind the writeback handler to image_ctx; only a reference is stored, so
+// image_ctx has to outlive this object.
+template <typename I>
+ImageWriteback<I>::ImageWriteback(I &image_ctx) : m_image_ctx(image_ctx) {
+}
+
+/**
+ * Issue an image read for @p image_extents into @p bl with the image cache
+ * bypassed; @p on_finish is wrapped in the AioCompletion created for the
+ * request.
+ */
+template <typename I>
+void ImageWriteback<I>::aio_read(Extents &&image_extents, bufferlist *bl,
+                                 int fadvise_flags, Context *on_finish) {
+  ldout(m_image_ctx.cct, 20) << "image_extents=" << image_extents
+                             << ", on_finish=" << on_finish << dendl;
+
+  auto completion = io::AioCompletion::create_and_start(
+    on_finish, &m_image_ctx, io::AIO_TYPE_READ);
+
+  io::ImageReadRequest<I> request(m_image_ctx, completion,
+                                  std::move(image_extents),
+                                  io::ReadResult{bl}, fadvise_flags, {});
+  request.set_bypass_image_cache();
+  request.send();
+}
+
+/**
+ * Issue an image write of @p bl to @p image_extents with the image cache
+ * bypassed; @p on_finish is wrapped in the AioCompletion created for the
+ * request.
+ */
+template <typename I>
+void ImageWriteback<I>::aio_write(Extents &&image_extents,
+                                  ceph::bufferlist&& bl,
+                                  int fadvise_flags, Context *on_finish) {
+  ldout(m_image_ctx.cct, 20) << "image_extents=" << image_extents
+                             << ", on_finish=" << on_finish << dendl;
+
+  auto completion = io::AioCompletion::create_and_start(
+    on_finish, &m_image_ctx, io::AIO_TYPE_WRITE);
+
+  io::ImageWriteRequest<I> request(m_image_ctx, completion,
+                                   std::move(image_extents), std::move(bl),
+                                   fadvise_flags, {});
+  request.set_bypass_image_cache();
+  request.send();
+}
+
+/**
+ * Issue an image discard of the single extent (@p offset, @p length) with
+ * the image cache bypassed; @p on_finish is wrapped in the AioCompletion
+ * created for the request.
+ */
+template <typename I>
+void ImageWriteback<I>::aio_discard(uint64_t offset, uint64_t length,
+                                    uint32_t discard_granularity_bytes,
+                                    Context *on_finish) {
+  ldout(m_image_ctx.cct, 20) << "offset=" << offset
+                             << ", length=" << length
+                             << ", on_finish=" << on_finish << dendl;
+
+  auto completion = io::AioCompletion::create_and_start(
+    on_finish, &m_image_ctx, io::AIO_TYPE_DISCARD);
+
+  io::ImageDiscardRequest<I> request(m_image_ctx, completion,
+                                     {{offset, length}},
+                                     discard_granularity_bytes, {});
+  request.set_bypass_image_cache();
+  request.send();
+}
+
+/**
+ * Issue an internally-sourced image flush with the image cache bypassed;
+ * @p on_finish is wrapped in the AioCompletion created for the request.
+ */
+template <typename I>
+void ImageWriteback<I>::aio_flush(Context *on_finish) {
+  ldout(m_image_ctx.cct, 20) << "on_finish=" << on_finish << dendl;
+
+  auto completion = io::AioCompletion::create_and_start(
+    on_finish, &m_image_ctx, io::AIO_TYPE_FLUSH);
+
+  io::ImageFlushRequest<I> request(m_image_ctx, completion,
+                                   io::FLUSH_SOURCE_INTERNAL, {});
+  request.set_bypass_image_cache();
+  request.send();
+}
+
+/**
+ * Issue an image write-same of @p bl over the single extent
+ * (@p offset, @p length) with the image cache bypassed; @p on_finish is
+ * wrapped in the AioCompletion created for the request.
+ */
+template <typename I>
+void ImageWriteback<I>::aio_writesame(uint64_t offset, uint64_t length,
+                                      ceph::bufferlist&& bl,
+                                      int fadvise_flags, Context *on_finish) {
+  ldout(m_image_ctx.cct, 20) << "offset=" << offset
+                             << ", length=" << length
+                             << ", data_len=" << bl.length()
+                             << ", on_finish=" << on_finish << dendl;
+
+  auto completion = io::AioCompletion::create_and_start(
+    on_finish, &m_image_ctx, io::AIO_TYPE_WRITESAME);
+
+  io::ImageWriteSameRequest<I> request(m_image_ctx, completion,
+                                       {{offset, length}}, std::move(bl),
+                                       fadvise_flags, {});
+  request.set_bypass_image_cache();
+  request.send();
+}
+
+/**
+ * Issue an image compare-and-write (compare against @p cmp_bl, write
+ * @p bl) on @p image_extents with the image cache bypassed.
+ * @p mismatch_offset is forwarded to the request as its output argument;
+ * @p on_finish is wrapped in the AioCompletion created for the request.
+ */
+template <typename I>
+void ImageWriteback<I>::aio_compare_and_write(Extents &&image_extents,
+                                              ceph::bufferlist&& cmp_bl,
+                                              ceph::bufferlist&& bl,
+                                              uint64_t *mismatch_offset,
+                                              int fadvise_flags,
+                                              Context *on_finish) {
+  ldout(m_image_ctx.cct, 20) << "image_extents=" << image_extents
+                             << ", on_finish=" << on_finish << dendl;
+
+  auto completion = io::AioCompletion::create_and_start(
+    on_finish, &m_image_ctx, io::AIO_TYPE_COMPARE_AND_WRITE);
+
+  io::ImageCompareAndWriteRequest<I> request(
+    m_image_ctx, completion, std::move(image_extents), std::move(cmp_bl),
+    std::move(bl), mismatch_offset, fadvise_flags, {});
+  request.set_bypass_image_cache();
+  request.send();
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::ImageWriteback<librbd::ImageCtx>;
+
diff --git a/src/librbd/cache/ImageWriteback.h b/src/librbd/cache/ImageWriteback.h
new file mode 100644
index 00000000..382c57c1
--- /dev/null
+++ b/src/librbd/cache/ImageWriteback.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK
+#define CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+/**
+ * client-side, image extent cache writeback handler
+ *
+ * Each aio_* method builds the matching io::Image*Request against the
+ * bound image context, marks it to bypass the image cache and sends it;
+ * completion is reported through on_finish.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageWriteback {
+public:
+ // list of (offset, length) pairs
+ typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+
+ explicit ImageWriteback(ImageCtxT &image_ctx);
+
+ void aio_read(Extents &&image_extents, ceph::bufferlist *bl,
+ int fadvise_flags, Context *on_finish);
+ void aio_write(Extents &&image_extents, ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish);
+ void aio_discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes, Context *on_finish);
+ void aio_flush(Context *on_finish);
+ void aio_writesame(uint64_t offset, uint64_t length,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish);
+ void aio_compare_and_write(Extents &&image_extents,
+ ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags, Context *on_finish);
+private:
+ // image the requests are issued against; held by reference (not owned)
+ ImageCtxT &m_image_ctx;
+
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::ImageWriteback<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK
diff --git a/src/librbd/cache/ObjectCacherObjectDispatch.cc b/src/librbd/cache/ObjectCacherObjectDispatch.cc
new file mode 100644
index 00000000..5bced71b
--- /dev/null
+++ b/src/librbd/cache/ObjectCacherObjectDispatch.cc
@@ -0,0 +1,408 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/ObjectCacherObjectDispatch.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/LibrbdWriteback.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/io/Utils.h"
+#include "osd/osd_types.h"
+#include "osdc/WritebackHandler.h"
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::ObjectCacherObjectDispatch: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+
+namespace {
+
+// collection of object extents handed to the ObjectCacher set operations
+using ObjectExtents = std::vector<ObjectExtent>;
+
+} // anonymous namespace
+
+// Completion run once a cache flush has finished: optionally purges the
+// object set when the flush failed, then releases every object in the set
+// and forwards the final result to on_finish. Expects the dispatcher's
+// cache lock to be held when it fires.
+template <typename I>
+struct ObjectCacherObjectDispatch<I>::C_InvalidateCache : public Context {
+  ObjectCacherObjectDispatch* dispatcher;
+  bool purge_on_error;
+  Context *on_finish;
+
+  C_InvalidateCache(ObjectCacherObjectDispatch* dispatcher,
+                    bool purge_on_error, Context *on_finish)
+    : dispatcher(dispatcher), purge_on_error(purge_on_error),
+      on_finish(on_finish) {
+  }
+
+  void finish(int r) override {
+    ceph_assert(dispatcher->m_cache_lock.is_locked());
+    auto cct = dispatcher->m_image_ctx->cct;
+    auto object_cacher = dispatcher->m_object_cacher;
+    auto object_set = dispatcher->m_object_set;
+
+    // a blacklisted client always purges; other failures purge only when
+    // the caller asked for it
+    const bool blacklisted = (r == -EBLACKLISTED);
+    if (blacklisted) {
+      lderr(cct) << "blacklisted during flush (purging)" << dendl;
+      object_cacher->purge_set(object_set);
+    } else if (r < 0 && purge_on_error) {
+      lderr(cct) << "failed to invalidate cache (purging): "
+                 << cpp_strerror(r) << dendl;
+      object_cacher->purge_set(object_set);
+    } else if (r != 0) {
+      lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r) << dendl;
+    }
+
+    auto unclean = object_cacher->release_set(object_set);
+    if (unclean != 0) {
+      lderr(cct) << "could not release all objects from cache: "
+                 << unclean << " bytes remain" << dendl;
+      if (r == 0) {
+        r = -EBUSY;
+      }
+    } else {
+      // everything was released: report success regardless of the flush
+      // result
+      r = 0;
+    }
+
+    on_finish->complete(r);
+  }
+};
+
+// Stores the image context and creates the cache lock with a name made
+// unique per instance; the cacher itself is created later by init().
+template <typename I>
+ObjectCacherObjectDispatch<I>::ObjectCacherObjectDispatch(
+ I* image_ctx)
+ : m_image_ctx(image_ctx),
+ m_cache_lock(util::unique_lock_name(
+ "librbd::cache::ObjectCacherObjectDispatch::cache_lock", this)) {
+ // the data pool IO context must already be usable
+ ceph_assert(m_image_ctx->data_ctx.is_valid());
+}
+
+// Frees the objects allocated by init(); all three pointers default to
+// nullptr, so destruction without init() is a no-op.
+template <typename I>
+ObjectCacherObjectDispatch<I>::~ObjectCacherObjectDispatch() {
+ delete m_object_cacher;
+ delete m_object_set;
+
+ // the cacher was constructed with a reference to the writeback handler,
+ // so the handler is released after the cacher
+ delete m_writeback_handler;
+}
+
+// Creates the writeback handler, the ObjectCacher and its object set from
+// the image's rbd_cache_* configuration, starts the cacher and registers
+// this layer with the image's IO object dispatcher.
+template <typename I>
+void ObjectCacherObjectDispatch<I>::init() {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  {
+    Mutex::Locker cache_locker(m_cache_lock);
+    ldout(cct, 5) << "enabling caching..." << dendl;
+    m_writeback_handler = new LibrbdWriteback(m_image_ctx, m_cache_lock);
+
+    // writethrough-until-flush starts with a dirty limit of zero; flush()
+    // raises it on the first user flush
+    uint64_t init_max_dirty =
+      m_image_ctx->cache_writethrough_until_flush ?
+        0 : m_image_ctx->cache_max_dirty;
+
+    auto& config = m_image_ctx->config;
+    auto cache_size =
+      config.template get_val<Option::size_t>("rbd_cache_size");
+    auto target_dirty =
+      config.template get_val<Option::size_t>("rbd_cache_target_dirty");
+    auto max_dirty_age =
+      config.template get_val<double>("rbd_cache_max_dirty_age");
+    auto block_writes_upfront =
+      config.template get_val<bool>("rbd_cache_block_writes_upfront");
+    auto max_dirty_object =
+      config.template get_val<uint64_t>("rbd_cache_max_dirty_object");
+
+    ldout(cct, 5) << "Initial cache settings:"
+                  << " size=" << cache_size
+                  << " num_objects=" << 10
+                  << " max_dirty=" << init_max_dirty
+                  << " target_dirty=" << target_dirty
+                  << " max_dirty_age=" << max_dirty_age << dendl;
+
+    // the object count of 10 is a placeholder; it is replaced below
+    m_object_cacher = new ObjectCacher(
+      cct, m_image_ctx->perfcounter->get_name(), *m_writeback_handler,
+      m_cache_lock, nullptr, nullptr, cache_size,
+      10, /* reset this in init */
+      init_max_dirty, target_dirty, max_dirty_age, block_writes_upfront);
+
+    // size object cache appropriately: when not configured, derive the
+    // object limit from the cache size, clamped to the range [10, 2000]
+    if (max_dirty_object == 0) {
+      max_dirty_object = std::min<uint64_t>(
+        2000, std::max<uint64_t>(10, cache_size / 100 /
+                                       sizeof(ObjectCacher::Object)));
+    }
+    ldout(cct, 5) << " cache bytes " << cache_size
+                  << " -> about " << max_dirty_object << " objects" << dendl;
+    m_object_cacher->set_max_objects(max_dirty_object);
+
+    m_object_set = new ObjectCacher::ObjectSet(
+      nullptr, m_image_ctx->data_ctx.get_id(), 0);
+    m_object_cacher->start();
+  }
+
+  // add ourself to the IO object dispatcher chain (outside the cache lock)
+  m_image_ctx->io_object_dispatcher->register_object_dispatch(this);
+}
+
+// Flushes and invalidates the cache, then stops the cacher and completes
+// on_finish. The completion chain is assembled back-to-front: the context
+// created last is the one that fires first.
+template <typename I>
+void ObjectCacherObjectDispatch<I>::shut_down(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  // final step: shut down the cache and notify the caller
+  Context* stop_ctx = new FunctionContext([this, on_finish](int r) {
+      m_object_cacher->stop();
+      on_finish->complete(r);
+    });
+
+  // ensure we aren't holding the cache lock post-flush
+  Context* async_ctx = util::create_async_context_callback(*m_image_ctx,
+                                                           stop_ctx);
+
+  // first step after the flush: invalidate any remaining cache entries,
+  // purging them if the flush failed
+  Context* invalidate_ctx = new C_InvalidateCache(this, true, async_ctx);
+
+  // flush all pending writeback state
+  {
+    Mutex::Locker cache_locker(m_cache_lock);
+    m_object_cacher->release_set(m_object_set);
+    m_object_cacher->flush_set(m_object_set, invalidate_ctx);
+  }
+}
+
+// Serves an object read through the cache. The request is always consumed
+// by this layer (DISPATCH_RESULT_COMPLETE); on_dispatched is completed by
+// the cacher, or directly here when readx() returns a non-zero result.
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::read(
+    const std::string &oid, uint64_t object_no, uint64_t object_off,
+    uint64_t object_len, librados::snap_t snap_id, int op_flags,
+    const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
+    io::ExtentMap* extent_map, int* object_dispatch_flags,
+    io::DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  // IO chained in reverse order
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+                 << object_len << dendl;
+
+  // ensure we aren't holding the cache lock post-read
+  on_dispatched = util::create_async_context_callback(*m_image_ctx,
+                                                      on_dispatched);
+
+  m_image_ctx->snap_lock.get_read();
+  auto rd = m_object_cacher->prepare_read(snap_id, read_data, op_flags);
+  m_image_ctx->snap_lock.put_read();
+
+  // a single extent covering the requested range, mapped to the start of
+  // the output buffer
+  ObjectExtent object_extent(oid, object_no, object_off, object_len, 0);
+  object_extent.oloc.pool = m_image_ctx->data_ctx.get_id();
+  object_extent.buffer_extents.push_back({0, object_len});
+  rd->extents.push_back(object_extent);
+
+  ZTracer::Trace trace(parent_trace);
+  *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+
+  int r;
+  {
+    Mutex::Locker cache_locker(m_cache_lock);
+    r = m_object_cacher->readx(rd, m_object_set, on_dispatched, &trace);
+  }
+
+  // non-zero means the cacher will not invoke the callback itself
+  // (NOTE(review): inferred from this call site -- confirm against readx)
+  if (r != 0) {
+    on_dispatched->complete(r);
+  }
+  return true;
+}
+
+// Lets the discard continue down the dispatch chain once in-flight
+// writeback for the extent has been handled, and drops the cached state
+// for the extent when the discard itself finishes.
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::discard(
+    const std::string &oid, uint64_t object_no, uint64_t object_off,
+    uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
+    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+    uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+                 << object_len << dendl;
+
+  *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+
+  ObjectExtents object_extents;
+  object_extents.emplace_back(oid, object_no, object_off, object_len, 0);
+
+  // discard the cache state after changes are committed to disk (and to
+  // prevent races w/ readahead): wrap the caller's completion
+  Context* next_on_finish = *on_finish;
+  *on_finish = new FunctionContext(
+    [this, object_extents, next_on_finish](int r) {
+      {
+        Mutex::Locker cache_locker(m_cache_lock);
+        m_object_cacher->discard_set(m_object_set, object_extents);
+      }
+
+      next_on_finish->complete(r);
+    });
+
+  // ensure we aren't holding the cache lock post-write
+  on_dispatched = util::create_async_context_callback(*m_image_ctx,
+                                                      on_dispatched);
+
+  // ensure any in-flight writeback is complete before advancing
+  // the discard request
+  {
+    Mutex::Locker cache_locker(m_cache_lock);
+    m_object_cacher->discard_writeback(m_object_set, object_extents,
+                                       on_dispatched);
+  }
+  return true;
+}
+
+// Writes object data into the cache. The request is consumed by this layer
+// (DISPATCH_RESULT_COMPLETE) and on_dispatched is handed to the cacher's
+// writex() call.
+//
+// data is passed to prepare_write() by name (not moved); journal_tid is
+// dereferenced and must be non-null.
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::write(
+    const std::string &oid, uint64_t object_no, uint64_t object_off,
+    ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+    uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+                 << data.length() << dendl;
+
+  // ensure we aren't holding the cache lock post-write
+  on_dispatched = util::create_async_context_callback(*m_image_ctx,
+                                                      on_dispatched);
+
+  m_image_ctx->snap_lock.get_read();
+  ObjectCacher::OSDWrite *wr = m_object_cacher->prepare_write(
+    snapc, data, ceph::real_time::min(), op_flags, *journal_tid);
+  m_image_ctx->snap_lock.put_read();
+
+  // tag the extent with the real object number, exactly as read() does, so
+  // an object that first enters the cache through a write is not recorded
+  // as object 0
+  ObjectExtent extent(oid, object_no, object_off, data.length(), 0);
+  extent.oloc.pool = m_image_ctx->data_ctx.get_id();
+  extent.buffer_extents.push_back({0, data.length()});
+  wr->extents.push_back(extent);
+
+  ZTracer::Trace trace(parent_trace);
+  *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+
+  m_cache_lock.Lock();
+  m_object_cacher->writex(wr, m_object_set, on_dispatched, &trace);
+  m_cache_lock.Unlock();
+  return true;
+}
+
+// Expands a write-same request into an ordinary write and forwards it to
+// write(), returning whatever write() returns.
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::write_same(
+    const std::string &oid, uint64_t object_no, uint64_t object_off,
+    uint64_t object_len, io::Extents&& buffer_extents, ceph::bufferlist&& data,
+    const ::SnapContext &snapc, int op_flags,
+    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+    uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+                 << object_len << dendl;
+
+  // ObjectCacher doesn't support write-same so convert to regular write;
+  // this extent only serves as input for assembling the expanded payload
+  ObjectExtent ws_extent(oid, 0, object_off, object_len, 0);
+  ws_extent.buffer_extents = std::move(buffer_extents);
+
+  bufferlist expanded_data;
+  io::util::assemble_write_same_extent(ws_extent, data, &expanded_data, true);
+
+  return write(oid, object_no, object_off, std::move(expanded_data), snapc,
+               op_flags, parent_trace, object_dispatch_flags, journal_tid,
+               dispatch_result, on_finish, on_dispatched);
+}
+
+// Compare-and-write is not handled by the ObjectCacher: the request is
+// passed on to the next layer (DISPATCH_RESULT_CONTINUE) after the cache
+// has been asked to flush the extent covered by the compare buffer.
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::compare_and_write(
+    const std::string &oid, uint64_t object_no, uint64_t object_off,
+    ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+    const ::SnapContext &snapc, int op_flags,
+    const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+    int* object_dispatch_flags, uint64_t* journal_tid,
+    io::DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+                 << cmp_data.length() << dendl;
+
+  // pass-through the compare-and-write request since it's not a supported
+  // operation of the ObjectCacher
+  *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+
+  // the extent to flush spans the length of the compare buffer
+  ObjectExtents flush_extents;
+  flush_extents.emplace_back(oid, object_no, object_off, cmp_data.length(),
+                             0);
+
+  // ensure we aren't holding the cache lock post-flush
+  on_dispatched = util::create_async_context_callback(*m_image_ctx,
+                                                      on_dispatched);
+
+  // flush any pending writes from the cache
+  ZTracer::Trace trace(parent_trace);
+  Mutex::Locker cache_locker(m_cache_lock);
+  m_object_cacher->flush_set(m_object_set, flush_extents, &trace,
+                             on_dispatched);
+  return true;
+}
+
+// Flushes the whole object set and lets the flush continue down the chain.
+// The first user-initiated flush also switches a writethrough-until-flush
+// cache over to writeback by raising the dirty limit.
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::flush(
+    io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+    io::DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  // ensure we aren't holding the cache lock post-flush
+  on_dispatched = util::create_async_context_callback(*m_image_ctx,
+                                                      on_dispatched);
+
+  {
+    Mutex::Locker cache_locker(m_cache_lock);
+
+    const bool first_user_flush = (
+      flush_source == io::FLUSH_SOURCE_USER && !m_user_flushed &&
+      m_image_ctx->cache_writethrough_until_flush &&
+      m_image_ctx->cache_max_dirty > 0);
+    if (first_user_flush) {
+      m_user_flushed = true;
+      m_object_cacher->set_max_dirty(m_image_ctx->cache_max_dirty);
+      ldout(cct, 5) << "saw first user flush, enabling writeback" << dendl;
+    }
+
+    *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+    m_object_cacher->flush_set(m_object_set, on_dispatched);
+  }
+  return true;
+}
+
+// Releases and flushes the object set, then invalidates what remains
+// without purging on flush errors. Always returns true.
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::invalidate_cache(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  // ensure we aren't holding the cache lock post-flush
+  Context* async_ctx = util::create_async_context_callback(*m_image_ctx,
+                                                           on_finish);
+
+  // invalidate any remaining cache entries (purge_on_error == false)
+  Context* invalidate_ctx = new C_InvalidateCache(this, false, async_ctx);
+
+  {
+    Mutex::Locker cache_locker(m_cache_lock);
+    m_object_cacher->release_set(m_object_set);
+    m_object_cacher->flush_set(m_object_set, invalidate_ctx);
+  }
+  return true;
+}
+
+// Clears the cacher's non-existence state for the object set. The work is
+// done synchronously: on_finish is never used and false is returned.
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::reset_existence_cache(
+    Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  {
+    Mutex::Locker cache_locker(m_cache_lock);
+    m_object_cacher->clear_nonexistence(m_object_set);
+  }
+
+  return false;
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::ObjectCacherObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/cache/ObjectCacherObjectDispatch.h b/src/librbd/cache/ObjectCacherObjectDispatch.h
new file mode 100644
index 00000000..cb145681
--- /dev/null
+++ b/src/librbd/cache/ObjectCacherObjectDispatch.h
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H
+
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "common/Mutex.h"
+#include "osdc/ObjectCacher.h"
+
+struct WritebackHandler;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace cache {
+
+/**
+ * Facade around the OSDC object cacher to make it align with
+ * the object dispatcher interface.
+ *
+ * Registers itself at the OBJECT_DISPATCH_LAYER_CACHE layer. The cacher,
+ * its object set and the writeback handler are created by init() and
+ * released by the destructor.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectCacherObjectDispatch : public io::ObjectDispatchInterface {
+public:
+ // heap-allocating factory; the caller receives a raw pointer
+ static ObjectCacherObjectDispatch* create(ImageCtxT* image_ctx) {
+ return new ObjectCacherObjectDispatch(image_ctx);
+ }
+
+ ObjectCacherObjectDispatch(ImageCtxT* image_ctx);
+ ~ObjectCacherObjectDispatch() override;
+
+ io::ObjectDispatchLayer get_object_dispatch_layer() const override {
+ return io::OBJECT_DISPATCH_LAYER_CACHE;
+ }
+
+ // init() creates the cache and registers this layer; shut_down() flushes,
+ // invalidates and stops the cache before completing on_finish
+ void init();
+ void shut_down(Context* on_finish) override;
+
+ // served from the cache; the request is completed by this layer
+ bool read(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, librados::snap_t snap_id, int op_flags,
+ const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
+ io::ExtentMap* extent_map, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ // continues to the next layer; cached state for the extent is dropped
+ // when the discard finishes
+ bool discard(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ // written into the cache; the request is completed by this layer
+ bool write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ // converted into a regular write()
+ bool write_same(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, io::Extents&& buffer_extents,
+ ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ // passed through to the next layer after flushing the affected extent
+ bool compare_and_write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ // flushes the cache and continues to the next layer
+ bool flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool invalidate_cache(Context* on_finish) override;
+ bool reset_existence_cache(Context* on_finish) override;
+
+ // intentionally empty: this layer takes no action on overwritten extents
+ // NOTE(review): the other interface methods are marked override and this
+ // one is not -- confirm whether the base class declares it virtual
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) {
+ }
+
+private:
+ // completion that invalidates the cache once a flush has finished
+ struct C_InvalidateCache;
+
+ ImageCtxT* m_image_ctx;
+
+ // taken around every ObjectCacher call made by this class
+ Mutex m_cache_lock;
+ ObjectCacher *m_object_cacher = nullptr;
+ ObjectCacher::ObjectSet *m_object_set = nullptr;
+
+ WritebackHandler *m_writeback_handler = nullptr;
+
+ // set on the first user flush when writethrough-until-flush is enabled
+ bool m_user_flushed = false;
+
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::ObjectCacherObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H
diff --git a/src/librbd/cache/PassthroughImageCache.cc b/src/librbd/cache/PassthroughImageCache.cc
new file mode 100644
index 00000000..c3672f53
--- /dev/null
+++ b/src/librbd/cache/PassthroughImageCache.cc
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PassthroughImageCache.h"
+#include "include/buffer.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::PassthroughImageCache: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+
+// Binds the cache to an image and constructs the writeback helper for it.
+// NOTE(review): the parameter is ImageCtx& rather than I&, while the
+// members are declared in terms of the template parameter; instantiating
+// with a type other than ImageCtx looks unlikely to compile -- confirm.
+template <typename I>
+PassthroughImageCache<I>::PassthroughImageCache(ImageCtx &image_ctx)
+ : m_image_ctx(image_ctx), m_image_writeback(image_ctx) {
+}
+
+// Logs the request and forwards the read unchanged to the image writeback.
+template <typename I>
+void PassthroughImageCache<I>::aio_read(Extents &&image_extents, bufferlist *bl,
+                                        int fadvise_flags, Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "image_extents=" << image_extents << ", "
+                 << "on_finish=" << on_finish << dendl;
+
+  // nothing is cached: hand the extents straight through
+  m_image_writeback.aio_read(
+    std::move(image_extents), bl, fadvise_flags, on_finish);
+}
+
+// Logs the request and forwards the write unchanged to the image writeback.
+template <typename I>
+void PassthroughImageCache<I>::aio_write(Extents &&image_extents,
+                                         bufferlist&& bl,
+                                         int fadvise_flags,
+                                         Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "image_extents=" << image_extents << ", "
+                 << "on_finish=" << on_finish << dendl;
+
+  // nothing is cached: hand extents and payload straight through
+  m_image_writeback.aio_write(
+    std::move(image_extents), std::move(bl), fadvise_flags, on_finish);
+}
+
+// Logs the request and forwards the discard unchanged to the image
+// writeback.
+template <typename I>
+void PassthroughImageCache<I>::aio_discard(uint64_t offset, uint64_t length,
+                                           uint32_t discard_granularity_bytes,
+                                           Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "offset=" << offset << ", "
+                 << "length=" << length << ", "
+                 << "on_finish=" << on_finish << dendl;
+
+  m_image_writeback.aio_discard(
+    offset, length, discard_granularity_bytes, on_finish);
+}
+
+// Logs the request and forwards the flush to the image writeback.
+template <typename I>
+void PassthroughImageCache<I>::aio_flush(Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "on_finish=" << on_finish << dendl;
+
+  // no cached state of our own to write back first
+  m_image_writeback.aio_flush(on_finish);
+}
+
+// Logs the request and forwards the write-same unchanged to the image
+// writeback.
+template <typename I>
+void PassthroughImageCache<I>::aio_writesame(uint64_t offset, uint64_t length,
+                                             bufferlist&& bl, int fadvise_flags,
+                                             Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "offset=" << offset << ", "
+                 << "length=" << length << ", "
+                 << "data_len=" << bl.length() << ", "
+                 << "on_finish=" << on_finish << dendl;
+
+  m_image_writeback.aio_writesame(
+    offset, length, std::move(bl), fadvise_flags, on_finish);
+}
+
+// Logs the request and forwards the compare-and-write unchanged to the
+// image writeback.
+template <typename I>
+void PassthroughImageCache<I>::aio_compare_and_write(Extents &&image_extents,
+                                                     bufferlist&& cmp_bl,
+                                                     bufferlist&& bl,
+                                                     uint64_t *mismatch_offset,
+                                                     int fadvise_flags,
+                                                     Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "image_extents=" << image_extents << ", "
+                 << "on_finish=" << on_finish << dendl;
+
+  // both buffers and the extent list are moved through untouched
+  m_image_writeback.aio_compare_and_write(
+    std::move(image_extents), std::move(cmp_bl), std::move(bl),
+    mismatch_offset, fadvise_flags, on_finish);
+}
+
+// There is nothing to set up: completes on_finish immediately with 0.
+template <typename I>
+void PassthroughImageCache<I>::init(Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  on_finish->complete(0);
+}
+
+// There is nothing to tear down: completes on_finish immediately with 0.
+template <typename I>
+void PassthroughImageCache<I>::shut_down(Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  on_finish->complete(0);
+}
+
+// Invalidation request: completes on_finish immediately with 0.
+template <typename I>
+void PassthroughImageCache<I>::invalidate(Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  // dump cache contents (don't have anything)
+  on_finish->complete(0);
+}
+
+// Internal flush entry point: delegates to aio_flush().
+template <typename I>
+void PassthroughImageCache<I>::flush(Context *on_finish) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  // internal flush -- nothing to writeback but make sure
+  // in-flight IO is flushed
+  this->aio_flush(on_finish);
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::PassthroughImageCache<librbd::ImageCtx>;
diff --git a/src/librbd/cache/PassthroughImageCache.h b/src/librbd/cache/PassthroughImageCache.h
new file mode 100644
index 00000000..4be69a50
--- /dev/null
+++ b/src/librbd/cache/PassthroughImageCache.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PASSTHROUGH_IMAGE_CACHE
+#define CEPH_LIBRBD_CACHE_PASSTHROUGH_IMAGE_CACHE
+
+#include "ImageCache.h"
+#include "ImageWriteback.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+/**
+ * Example passthrough client-side, image extent cache.
+ *
+ * Holds no cached data: every AIO method is declared as an override of the
+ * ImageCache interface and the class owns an ImageWriteback for the same
+ * image.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class PassthroughImageCache : public ImageCache {
+public:
+ // NOTE(review): takes ImageCtx& although m_image_ctx is ImageCtxT&; this
+ // only lines up when ImageCtxT is ImageCtx -- confirm whether ImageCtxT&
+ // was intended
+ explicit PassthroughImageCache(ImageCtx &image_ctx);
+
+ /// client AIO methods
+ void aio_read(Extents&& image_extents, ceph::bufferlist *bl,
+ int fadvise_flags, Context *on_finish) override;
+ void aio_write(Extents&& image_extents, ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish) override;
+ void aio_discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes,
+ Context *on_finish) override;
+ void aio_flush(Context *on_finish) override;
+ void aio_writesame(uint64_t offset, uint64_t length,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish) override;
+ void aio_compare_and_write(Extents&& image_extents,
+ ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,int fadvise_flags,
+ Context *on_finish) override;
+
+ /// internal state methods
+ void init(Context *on_finish) override;
+ void shut_down(Context *on_finish) override;
+
+ void invalidate(Context *on_finish) override;
+ void flush(Context *on_finish) override;
+
+private:
+ // image this cache is attached to (not owned)
+ ImageCtxT &m_image_ctx;
+ // helper that issues the actual image IO
+ ImageWriteback<ImageCtxT> m_image_writeback;
+
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::PassthroughImageCache<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PASSTHROUGH_IMAGE_CACHE
diff --git a/src/librbd/deep_copy/ImageCopyRequest.cc b/src/librbd/deep_copy/ImageCopyRequest.cc
new file mode 100644
index 00000000..62e5409c
--- /dev/null
+++ b/src/librbd/deep_copy/ImageCopyRequest.cc
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ImageCopyRequest.h"
+#include "ObjectCopyRequest.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/Utils.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/OpenRequest.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::ImageCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+
+// Captures the copy parameters. The request starts with a reference count
+// of one, which finish() drops. The CephContext is taken from the
+// destination image.
+template <typename I>
+ImageCopyRequest<I>::ImageCopyRequest(I *src_image_ctx, I *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten,
+ const ObjectNumber &object_number,
+ const SnapSeqs &snap_seqs,
+ ProgressContext *prog_ctx,
+ Context *on_finish)
+ : RefCountedObject(dst_image_ctx->cct, 1), m_src_image_ctx(src_image_ctx),
+ m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start),
+ m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start),
+ m_flatten(flatten), m_object_number(object_number), m_snap_seqs(snap_seqs),
+ m_prog_ctx(prog_ctx), m_on_finish(on_finish), m_cct(dst_image_ctx->cct),
+ m_lock(unique_lock_name("ImageCopyRequest::m_lock", this)) {
+}
+
+// Entry point: computes the snapshot map for the requested snapshot range
+// and starts the object copies. Fails with -EINVAL when no snapshots map.
+template <typename I>
+void ImageCopyRequest<I>::send() {
+  {
+    // the destination's snapshot list is read under its snap lock
+    RWLock::RLocker snap_locker(m_dst_image_ctx->snap_lock);
+    util::compute_snap_map(m_dst_image_ctx->cct, m_src_snap_id_start,
+                           m_src_snap_id_end, m_dst_image_ctx->snaps,
+                           m_snap_seqs, &m_snap_map);
+  }
+
+  if (!m_snap_map.empty()) {
+    send_object_copies();
+    return;
+  }
+
+  lderr(m_cct) << "failed to map snapshots within boundary" << dendl;
+  finish(-EINVAL);
+}
+
+// Marks the request as canceled. The flag is checked in
+// send_next_object_copy(), so no further object copies are started.
+template <typename I>
+void ImageCopyRequest<I>::cancel() {
+  Mutex::Locker locker(m_lock);
+
+  ldout(m_cct, 20) << dendl;
+
+  // picked up (under the same lock) by send_next_object_copy()
+  m_canceled = true;
+}
+
+// Works out the range of objects to copy and starts the initial batch of
+// object copies, up to rbd_concurrent_management_ops at once. If nothing
+// could be started (and no progress update is running) the request is
+// finished immediately.
+template <typename I>
+void ImageCopyRequest<I>::send_object_copies() {
+  // resume after the last object recorded as copied, if any
+  m_object_no = 0;
+  if (m_object_number) {
+    m_object_no = *m_object_number + 1;
+  }
+
+  // the copy must cover the largest size the source image has across its
+  // head and all of its snapshots
+  uint64_t size;
+  {
+    RWLock::RLocker snap_locker(m_src_image_ctx->snap_lock);
+    size = m_src_image_ctx->get_image_size(CEPH_NOSNAP);
+    for (auto snap_id : m_src_image_ctx->snaps) {
+      size = std::max(size, m_src_image_ctx->get_image_size(snap_id));
+    }
+  }
+  m_end_object_no = Striper::get_num_objects(m_dst_image_ctx->layout, size);
+
+  ldout(m_cct, 20) << "start_object=" << m_object_no << ", "
+                   << "end_object=" << m_end_object_no << dendl;
+
+  // read the concurrency limit once instead of looking the option up on
+  // every loop iteration while holding m_lock
+  const uint64_t max_concurrent_ops =
+    m_src_image_ctx->config.template get_val<uint64_t>(
+      "rbd_concurrent_management_ops");
+
+  bool complete;
+  {
+    Mutex::Locker locker(m_lock);
+    for (uint64_t i = 0; i < max_concurrent_ops; ++i) {
+      send_next_object_copy();
+      // stop early once an error is recorded and nothing is in flight
+      if (m_ret_val < 0 && m_current_ops == 0) {
+        break;
+      }
+    }
+    complete = (m_current_ops == 0) && !m_updating_progress;
+  }
+
+  if (complete) {
+    finish(m_ret_val);
+  }
+}
+
+// Starts the copy of the next pending object, if there is one. Must be
+// called with m_lock held. Does nothing once an error (including
+// cancellation) is recorded or all objects have been issued.
+template <typename I>
+void ImageCopyRequest<I>::send_next_object_copy() {
+  ceph_assert(m_lock.is_locked());
+
+  // turn a pending cancel into an error the first time it is observed
+  if (m_canceled && m_ret_val == 0) {
+    ldout(m_cct, 10) << "image copy canceled" << dendl;
+    m_ret_val = -ECANCELED;
+  }
+
+  const bool failed = (m_ret_val < 0);
+  const bool exhausted = (m_object_no >= m_end_object_no);
+  if (failed || exhausted) {
+    return;
+  }
+
+  // claim the next object number
+  uint64_t ono = m_object_no++;
+
+  ldout(m_cct, 20) << "object_num=" << ono << dendl;
+
+  ++m_current_ops;
+
+  auto ctx = new FunctionContext([this, ono](int r) {
+      handle_object_copy(ono, r);
+    });
+  auto req = ObjectCopyRequest<I>::create(
+    m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start,
+    m_dst_snap_id_start, m_snap_map, ono, m_flatten, ctx);
+  req->send();
+}
+
+// Completion of a single object copy: records the result, reports progress
+// for the contiguous run of objects copied so far, starts the next object
+// copy and finishes the request once nothing is left in flight.
+template <typename I>
+void ImageCopyRequest<I>::handle_object_copy(uint64_t object_no, int r) {
+ ldout(m_cct, 20) << "object_no=" << object_no << ", r=" << r << dendl;
+
+ bool complete;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_current_ops > 0);
+ --m_current_ops;
+
+ // -ENOENT is not treated as a failure; only the first error is kept
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "object copy failed: " << cpp_strerror(r) << dendl;
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ } else {
+ // m_copied_objects yields the smallest object number first; advance
+ // m_object_number only while the next expected object has completed, so
+ // progress never skips over an object that is still being copied
+ m_copied_objects.push(object_no);
+ while (!m_updating_progress && !m_copied_objects.empty() &&
+ m_copied_objects.top() ==
+ (m_object_number ? *m_object_number + 1 : 0)) {
+ m_object_number = m_copied_objects.top();
+ m_copied_objects.pop();
+ uint64_t progress_object_no = *m_object_number + 1;
+ // the progress callback runs without m_lock held; m_updating_progress
+ // keeps other completions from reporting progress or finishing the
+ // request in the meantime
+ m_updating_progress = true;
+ m_lock.Unlock();
+ m_prog_ctx->update_progress(progress_object_no, m_end_object_no);
+ m_lock.Lock();
+ ceph_assert(m_updating_progress);
+ m_updating_progress = false;
+ }
+ }
+
+ send_next_object_copy();
+ complete = (m_current_ops == 0) && !m_updating_progress;
+ }
+
+ // finish() is called after the scoped lock has been released
+ if (complete) {
+ finish(m_ret_val);
+ }
+}
+
+// Completes the caller's context with r and drops a reference on this
+// request.
+template <typename I>
+void ImageCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ // no member may be touched after this call
+ // NOTE(review): presumably this releases the initial reference taken in
+ // the constructor and may destroy the request -- confirm
+ put();
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::ImageCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/ImageCopyRequest.h b/src/librbd/deep_copy/ImageCopyRequest.h
new file mode 100644
index 00000000..189cb237
--- /dev/null
+++ b/src/librbd/deep_copy/ImageCopyRequest.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/Mutex.h"
+#include "common/RefCountedObj.h"
+#include "librbd/Types.h"
+#include "librbd/deep_copy/Types.h"
+#include <functional>
+#include <map>
+#include <queue>
+#include <vector>
+#include <boost/optional.hpp>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace deep_copy {
+
+// Copies the data objects of a source image to a destination image for a
+// range of snapshots, running several object copies concurrently and
+// reporting progress through a ProgressContext. Reference counted.
+template <typename ImageCtxT = ImageCtx>
+class ImageCopyRequest : public RefCountedObject {
+public:
+ // heap-allocating factory; arguments are forwarded to the constructor
+ static ImageCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten,
+ const ObjectNumber &object_number,
+ const SnapSeqs &snap_seqs,
+ ProgressContext *prog_ctx,
+ Context *on_finish) {
+ return new ImageCopyRequest(src_image_ctx, dst_image_ctx, src_snap_id_start,
+ src_snap_id_end, dst_snap_id_start, flatten,
+ object_number, snap_seqs, prog_ctx, on_finish);
+ }
+
+ ImageCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten, const ObjectNumber &object_number,
+ const SnapSeqs &snap_seqs, ProgressContext *prog_ctx,
+ Context *on_finish);
+
+ // send() starts the copy; cancel() asks it to stop issuing object copies
+ void send();
+ void cancel();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . . . .
+ * | . . (parallel execution of
+ * v v . multiple objects at once)
+ * COPY_OBJECT . . . .
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ librados::snap_t m_src_snap_id_start;
+ librados::snap_t m_src_snap_id_end;
+ librados::snap_t m_dst_snap_id_start;
+ bool m_flatten;
+ // last object recorded as copied, if any; copying resumes after it
+ ObjectNumber m_object_number;
+ SnapSeqs m_snap_seqs;
+ ProgressContext *m_prog_ctx;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ // protects the mutable state below
+ Mutex m_lock;
+ bool m_canceled = false;
+
+ // next object number to issue / one past the last object to copy
+ uint64_t m_object_no = 0;
+ uint64_t m_end_object_no = 0;
+ // number of object copies currently in flight
+ uint64_t m_current_ops = 0;
+ // completed object numbers, smallest first (std::greater comparator)
+ std::priority_queue<
+ uint64_t, std::vector<uint64_t>, std::greater<uint64_t>> m_copied_objects;
+ // true while the progress callback is running with m_lock released
+ bool m_updating_progress = false;
+ SnapMap m_snap_map;
+ // first error recorded, or 0
+ int m_ret_val = 0;
+
+ void send_object_copies();
+ void send_next_object_copy();
+ void handle_object_copy(uint64_t object_no, int r);
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::ImageCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/MetadataCopyRequest.cc b/src/librbd/deep_copy/MetadataCopyRequest.cc
new file mode 100644
index 00000000..cbce9a19
--- /dev/null
+++ b/src/librbd/deep_copy/MetadataCopyRequest.cc
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MetadataCopyRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::MetadataCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+namespace {
+
+// Batch size handed to metadata_list_start() for every listing of the source
+// image header. A reply holding at least this many entries is treated as
+// "more may remain" by handle_list_src_metadata().
+const uint64_t MAX_METADATA_ITEMS = 128;
+
+} // anonymous namespace
+
+using librbd::util::create_rados_callback;
+
+// Stores the source/destination image contexts and the completion callback.
+// The CephContext used for logging is taken from the destination image.
+template <typename I>
+MetadataCopyRequest<I>::MetadataCopyRequest(I *src_image_ctx, I *dst_image_ctx,
+ Context *on_finish)
+ : m_src_image_ctx(src_image_ctx), m_dst_image_ctx(dst_image_ctx),
+ m_on_finish(on_finish), m_cct(dst_image_ctx->cct) {
+}
+
+// Entry point of the state machine: starts with the first listing of the
+// source image metadata.
+template <typename I>
+void MetadataCopyRequest<I>::send() {
+ list_src_metadata();
+}
+
+// Issues an asynchronous listing of the source image header metadata, using
+// m_last_metadata_key as the start key and MAX_METADATA_ITEMS as the batch
+// size. The raw reply is stored in m_out_bl and handed to
+// handle_list_src_metadata() on completion.
+template <typename I>
+void MetadataCopyRequest<I>::list_src_metadata() {
+  ldout(m_cct, 20) << "start_key=" << m_last_metadata_key << dendl;
+
+  // discard whatever the previous iteration left in the reply buffer
+  m_out_bl.clear();
+
+  librados::ObjectReadOperation list_op;
+  librbd::cls_client::metadata_list_start(&list_op, m_last_metadata_key,
+                                          MAX_METADATA_ITEMS);
+
+  auto completion = create_rados_callback<
+    MetadataCopyRequest<I>,
+    &MetadataCopyRequest<I>::handle_list_src_metadata>(this);
+  m_src_image_ctx->md_ctx.aio_operate(m_src_image_ctx->header_oid, completion,
+                                      &list_op, &m_out_bl);
+  completion->release();
+}
+
+// Completion handler for list_src_metadata(). Decodes the reply, then either
+// fails the request, finishes it (empty batch) or forwards the batch to the
+// destination image via set_dst_metadata().
+template <typename I>
+void MetadataCopyRequest<I>::handle_list_src_metadata(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  Metadata batch;
+  if (r == 0) {
+    auto reply_it = m_out_bl.cbegin();
+    r = librbd::cls_client::metadata_list_finish(&reply_it, &batch);
+  }
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to retrieve metadata: " << cpp_strerror(r) << dendl;
+    finish(r);
+  } else if (batch.empty()) {
+    // nothing (more) to copy
+    finish(0);
+  } else {
+    // remember where to resume and whether a full batch was returned
+    m_last_metadata_key = batch.rbegin()->first;
+    m_more_metadata = !(batch.size() < MAX_METADATA_ITEMS);
+    set_dst_metadata(batch);
+  }
+}
+
+// Asynchronously writes one batch of metadata key/value pairs to the
+// destination image header; handle_set_dst_metadata() runs on completion.
+template <typename I>
+void MetadataCopyRequest<I>::set_dst_metadata(const Metadata& metadata) {
+  ldout(m_cct, 20) << "count=" << metadata.size() << dendl;
+
+  librados::ObjectWriteOperation set_op;
+  librbd::cls_client::metadata_set(&set_op, metadata);
+
+  auto completion = create_rados_callback<
+    MetadataCopyRequest<I>,
+    &MetadataCopyRequest<I>::handle_set_dst_metadata>(this);
+  m_dst_image_ctx->md_ctx.aio_operate(m_dst_image_ctx->header_oid, completion,
+                                      &set_op);
+  completion->release();
+}
+
+// Completion handler for set_dst_metadata(). Fails the request on error,
+// finishes it when the last listing was not a full batch, and otherwise
+// loops back to list the next batch.
+template <typename I>
+void MetadataCopyRequest<I>::handle_set_dst_metadata(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to set metadata: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  if (!m_more_metadata) {
+    finish(0);
+    return;
+  }
+
+  list_src_metadata();
+}
+
+// Terminal state: reports the result to the completion callback and then
+// destroys this request. No member may be touched after this call returns.
+template <typename I>
+void MetadataCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::MetadataCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/MetadataCopyRequest.h b/src/librbd/deep_copy/MetadataCopyRequest.h
new file mode 100644
index 00000000..5bd69d8b
--- /dev/null
+++ b/src/librbd/deep_copy/MetadataCopyRequest.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/ImageCtx.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+namespace deep_copy {
+
+// Asynchronous state machine that copies the metadata key/value pairs of a
+// source image header to a destination image header in batches and then
+// completes on_finish. The request deletes itself when it finishes.
+template <typename ImageCtxT = librbd::ImageCtx>
+class MetadataCopyRequest {
+public:
+ // Heap-allocates a request; start it with send().
+ static MetadataCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ Context *on_finish) {
+ return new MetadataCopyRequest(src_image_ctx, dst_image_ctx, on_finish);
+ }
+
+ MetadataCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ Context *on_finish);
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * LIST_SRC_METADATA <------\
+ * | | (repeat if additional
+ * v | metadata)
+ * SET_DST_METADATA --------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ // one batch of metadata: key -> encoded value
+ typedef std::map<std::string, bufferlist> Metadata;
+
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ // raw reply of the most recent source metadata listing
+ bufferlist m_out_bl;
+
+ // last key received so far; passed as the start key of the next listing
+ std::string m_last_metadata_key;
+ // set when the most recent listing returned a full batch
+ bool m_more_metadata = false;
+
+ void list_src_metadata();
+ void handle_list_src_metadata(int r);
+
+ void set_dst_metadata(const Metadata& metadata);
+ void handle_set_dst_metadata(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::MetadataCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/ObjectCopyRequest.cc b/src/librbd/deep_copy/ObjectCopyRequest.cc
new file mode 100644
index 00000000..2400757a
--- /dev/null
+++ b/src/librbd/deep_copy/ObjectCopyRequest.cc
@@ -0,0 +1,998 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ObjectCopyRequest.h"
+#include "common/errno.h"
+#include "librados/snap_set_diff.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ReadResult.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::ObjectCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librados {
+
+// Member-wise equality of two clone descriptors: clone id, snapshot list,
+// overlap extents and size must all match.
+inline bool operator==(const clone_info_t& rhs, const clone_info_t& lhs) {
+  if (!(rhs.cloneid == lhs.cloneid)) {
+    return false;
+  }
+  if (!(rhs.snaps == lhs.snaps)) {
+    return false;
+  }
+  if (!(rhs.overlap == lhs.overlap)) {
+    return false;
+  }
+  return rhs.size == lhs.size;
+}
+
+// Two snap sets are equal when their clone lists and sequence numbers match.
+inline bool operator==(const snap_set_t& rhs, const snap_set_t& lhs) {
+  if (!(rhs.clones == lhs.clones)) {
+    return false;
+  }
+  return rhs.seq == lhs.seq;
+}
+
+} // namespace librados
+
+namespace librbd {
+namespace deep_copy {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+// Prepares the copy of a single destination object.
+//
+// Both images must have a valid data IoCtx and snap_map must not be empty
+// (asserted). The constructor registers an async operation against the source
+// image, duplicates both data IoCtxs for private use, resolves the
+// destination object name and pre-computes which source object extents back
+// the destination object.
+template <typename I>
+ObjectCopyRequest<I>::ObjectCopyRequest(I *src_image_ctx,
+ I *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t dst_snap_id_start,
+ const SnapMap &snap_map,
+ uint64_t dst_object_number,
+ bool flatten, Context *on_finish)
+ : m_src_image_ctx(src_image_ctx),
+ m_dst_image_ctx(dst_image_ctx), m_cct(dst_image_ctx->cct),
+ m_src_snap_id_start(src_snap_id_start),
+ m_dst_snap_id_start(dst_snap_id_start), m_snap_map(snap_map),
+ m_dst_object_number(dst_object_number), m_flatten(flatten),
+ m_on_finish(on_finish) {
+ ceph_assert(src_image_ctx->data_ctx.is_valid());
+ ceph_assert(dst_image_ctx->data_ctx.is_valid());
+ ceph_assert(!m_snap_map.empty());
+
+ // released (finish_op + delete) in finish()
+ m_src_async_op = new io::AsyncOperation();
+ m_src_async_op->start_op(*util::get_image_ctx(m_src_image_ctx));
+
+ // private IoCtx copies: this request changes the read snapshot on them
+ m_src_io_ctx.dup(m_src_image_ctx->data_ctx);
+ m_dst_io_ctx.dup(m_dst_image_ctx->data_ctx);
+
+ m_dst_oid = m_dst_image_ctx->get_object_name(dst_object_number);
+
+ ldout(m_cct, 20) << "dst_oid=" << m_dst_oid << dendl;
+
+ compute_src_object_extents();
+}
+
+// Entry point of the state machine: begins by listing the snapshots of the
+// first source object.
+template <typename I>
+void ObjectCopyRequest<I>::send() {
+ send_list_snaps();
+}
+
+// Selects the first pending source object and asynchronously fetches its
+// snap set (read against CEPH_SNAPDIR). The result is delivered to
+// handle_list_snaps() through m_snap_set / m_snap_ret.
+template <typename I>
+void ObjectCopyRequest<I>::send_list_snaps() {
+  ceph_assert(!m_src_objects.empty());
+  m_src_ono = *m_src_objects.begin();
+  m_src_oid = m_src_image_ctx->get_object_name(m_src_ono);
+
+  ldout(m_cct, 20) << "src_oid=" << m_src_oid << dendl;
+
+  // reset the output slots before handing them to the operation
+  m_snap_set = {};
+  m_snap_ret = 0;
+
+  librados::ObjectReadOperation list_op;
+  list_op.list_snaps(&m_snap_set, &m_snap_ret);
+
+  m_src_io_ctx.snap_set_read(CEPH_SNAPDIR);
+
+  auto completion = create_rados_callback<
+    ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_list_snaps>(this);
+  int r = m_src_io_ctx.aio_operate(m_src_oid, completion, &list_op, nullptr);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Completion handler for send_list_snaps().
+//
+// A missing source object (-ENOENT) is not an error: its extents are flagged
+// as 'noent' so they can later be considered for a parent read. When this
+// listing is a retry after a failed read, an unchanged snap set means the
+// object really is missing and the request fails with -ENOENT.
+template <typename I>
+void ObjectCopyRequest<I>::handle_list_snaps(int r) {
+  // surface the per-op result when the overall operation reported success
+  if (r == 0 && m_snap_ret < 0) {
+    r = m_snap_ret;
+  }
+
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool src_object_missing = (r == -ENOENT);
+  if (r < 0 && !src_object_missing) {
+    lderr(m_cct) << "failed to list snaps: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  if (m_retry_missing_read) {
+    if (m_snap_set == m_retry_snap_set) {
+      lderr(m_cct) << "read encountered missing object using up-to-date snap set"
+                   << dendl;
+      finish(-ENOENT);
+      return;
+    }
+
+    ldout(m_cct, 20) << "retrying using updated snap set" << dendl;
+    m_retry_missing_read = false;
+    m_retry_snap_set = {};
+  }
+
+  if (!src_object_missing) {
+    compute_read_ops();
+  } else {
+    // flag every extent backed by the missing source object
+    for (auto &extent_entry : m_src_object_extents) {
+      auto &src_extent = extent_entry.second;
+      if (src_extent.object_no == m_src_ono) {
+        src_extent.noent = true;
+      }
+    }
+    m_read_ops = {};
+    m_read_snaps = {};
+    m_zero_interval = {};
+  }
+  send_read_object();
+}
+
+// Issues the sparse reads for the next pending snapshot of the current source
+// object. Once every snapshot of the object has been read, the results are
+// merged and the state machine advances to the next source object or, when
+// none remain, to the parent read.
+template <typename I>
+void ObjectCopyRequest<I>::send_read_object() {
+
+  if (m_read_snaps.empty()) {
+    // all snapshots have been read
+    merge_write_ops();
+
+    ceph_assert(!m_src_objects.empty());
+    m_src_objects.erase(m_src_objects.begin());
+
+    if (m_src_objects.empty()) {
+      // all objects have been read
+      send_read_from_parent();
+    } else {
+      send_list_snaps();
+    }
+    return;
+  }
+
+  auto read_key = *m_read_snaps.begin();
+  auto src_snap_seq = read_key.second;
+
+  librados::ObjectReadOperation op;
+
+  auto &snap_copy_ops = m_read_ops[read_key];
+  if (snap_copy_ops.empty()) {
+    // nothing written to this object for this snapshot (must be trunc/remove)
+    handle_read_object(0);
+    return;
+  }
+
+  // map the copy op start snap id back to the necessary read snap id
+  m_src_io_ctx.snap_set_read(src_snap_seq);
+  ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << dendl;
+
+  for (auto &copy_op : snap_copy_ops) {
+    ldout(m_cct, 20) << "read op: " << copy_op.src_offset << "~"
+                     << copy_op.length << dendl;
+    op.sparse_read(copy_op.src_offset, copy_op.length, &copy_op.src_extent_map,
+                   &copy_op.out_bl, nullptr);
+    op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+                     LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+  }
+
+  auto on_read = create_context_callback<
+    ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_read_object>(this);
+  auto completion = create_rados_callback(on_read);
+
+  ldout(m_cct, 20) << "read " << m_src_oid << dendl;
+
+  int r = m_src_io_ctx.aio_operate(m_src_oid, completion, &op, nullptr);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Completion handler for send_read_object(). A missing object triggers one
+// more snap listing (remembering the snap set that was used), any other
+// error fails the request, and success moves on to the next snapshot.
+template <typename I>
+void ObjectCopyRequest<I>::handle_read_object(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -ENOENT) {
+      // keep the snap set used for this read so the retry can detect
+      // whether anything actually changed
+      m_retry_missing_read = true;
+      m_retry_snap_set = m_snap_set;
+
+      ldout(m_cct, 5) << "object missing potentially due to removed snapshot"
+                      << dendl;
+      send_list_snaps();
+    } else {
+      lderr(m_cct) << "failed to read from source object: " << cpp_strerror(r)
+                   << dendl;
+      finish(r);
+    }
+    return;
+  }
+
+  ceph_assert(!m_read_snaps.empty());
+  m_read_snaps.erase(m_read_snaps.begin());
+
+  send_read_object();
+}
+
+// Reads from the source image's parent the extents that could not be found in
+// the source objects. The needed image extents are computed under the source
+// snap_lock and parent_lock; if there is nothing to read the handler is
+// invoked directly with 0.
+template <typename I>
+void ObjectCopyRequest<I>::send_read_from_parent() {
+ m_src_image_ctx->snap_lock.get_read();
+ m_src_image_ctx->parent_lock.get_read();
+ io::Extents image_extents;
+ compute_read_from_parent_ops(&image_extents);
+ // snap_lock is only needed for the computation above; parent_lock stays held
+ m_src_image_ctx->snap_lock.put_read();
+
+ if (image_extents.empty()) {
+ m_src_image_ctx->parent_lock.put_read();
+ handle_read_from_parent(0);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ ceph_assert(m_src_image_ctx->parent != nullptr);
+
+ auto ctx = create_context_callback<
+ ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_read_from_parent>(this);
+ auto comp = io::AioCompletion::create_and_start(
+ ctx, util::get_image_ctx(m_src_image_ctx->parent), io::AIO_TYPE_READ);
+ ldout(m_cct, 20) << "completion " << comp << ", extents " << image_extents
+ << dendl;
+
+ // NOTE(review): the image context pointer is copied to a local before the
+ // read is issued, presumably because the request may complete (and delete
+ // itself) before aio_read returns -- confirm
+ auto src_image_ctx = m_src_image_ctx;
+ io::ImageRequest<I>::aio_read(src_image_ctx->parent, comp,
+ std::move(image_extents),
+ io::ReadResult{&m_read_from_parent_data}, 0,
+ ZTracer::Trace());
+ src_image_ctx->parent_lock.put_read();
+}
+
+// Completion handler for send_read_from_parent().
+//
+// Slices the parent data back into the individual copy ops (in order),
+// converts all-zero slices into zero intervals, merges the remaining ops into
+// the write ops and then derives the zero/remove/truncate ops. Finishes with
+// -ENOENT when there is nothing to write.
+template <typename I>
+void ObjectCopyRequest<I>::handle_read_from_parent(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to read from parent: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  if (!m_read_ops.empty()) {
+    ceph_assert(m_read_ops.size() == 1);
+    auto parent_read_it = m_read_ops.begin();
+    auto src_snap_seq = parent_read_it->first.first;
+    auto &copy_ops = parent_read_it->second;
+
+    // the parent data is the concatenation of all requested extents
+    uint64_t data_offset = 0;
+    auto op_it = copy_ops.begin();
+    while (op_it != copy_ops.end()) {
+      op_it->out_bl.substr_of(m_read_from_parent_data, data_offset,
+                              op_it->length);
+      data_offset += op_it->length;
+      if (!op_it->out_bl.is_zero()) {
+        ++op_it;
+        continue;
+      }
+      // all-zero payload: record a zero interval instead of writing it
+      m_zero_interval[src_snap_seq].insert(op_it->dst_offset, op_it->length);
+      op_it = copy_ops.erase(op_it);
+    }
+    merge_write_ops();
+  }
+
+  compute_dst_object_may_exist();
+  compute_zero_ops();
+
+  if (m_write_ops.empty()) {
+    // nothing to copy
+    finish(-ENOENT);
+    return;
+  }
+
+  send_write_object();
+}
+
+// Applies the copy ops of the oldest pending source snapshot to the
+// destination object.
+//
+// The ops in m_write_ops.begin() are translated into one librados write
+// operation (write / zero / truncate / create+truncate / remove) and sent
+// with the destination snapshot context that precedes the mapped snapshot.
+// When the destination image is being migrated, the operation is prefixed
+// with a snap-context sequence assertion. If no op was added beyond that
+// optional assertion, the handler is invoked directly with 0. The write is
+// wrapped in an exclusive-lock op; if that cannot be started the request
+// fails with the reported error.
+template <typename I>
+void ObjectCopyRequest<I>::send_write_object() {
+  ceph_assert(!m_write_ops.empty());
+  auto& copy_ops = m_write_ops.begin()->second;
+
+  // retrieve the destination snap context for the op
+  SnapIds dst_snap_ids;
+  librados::snap_t dst_snap_seq = 0;
+  librados::snap_t src_snap_seq = m_write_ops.begin()->first;
+  if (src_snap_seq != 0) {
+    auto snap_map_it = m_snap_map.find(src_snap_seq);
+    ceph_assert(snap_map_it != m_snap_map.end());
+    // must hold before front()/begin() below touch the first element
+    ceph_assert(!snap_map_it->second.empty());
+
+    auto dst_snap_id = snap_map_it->second.front();
+    auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_id);
+    ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end());
+    if (!dst_may_exist_it->second && !copy_ops.empty()) {
+      // if the object cannot exist, the only valid op is to remove it
+      ceph_assert(copy_ops.size() == 1U);
+      ceph_assert(copy_ops.begin()->type == COPY_OP_TYPE_REMOVE);
+    }
+
+    // write snapshot context should be before actual snapshot
+    auto dst_snap_ids_it = snap_map_it->second.begin();
+    ++dst_snap_ids_it;
+
+    dst_snap_ids = SnapIds{dst_snap_ids_it, snap_map_it->second.end()};
+    if (!dst_snap_ids.empty()) {
+      dst_snap_seq = dst_snap_ids.front();
+    }
+    ceph_assert(dst_snap_seq != CEPH_NOSNAP);
+  }
+
+  ldout(m_cct, 20) << "dst_snap_seq=" << dst_snap_seq << ", "
+                   << "dst_snaps=" << dst_snap_ids << dendl;
+
+  librados::ObjectWriteOperation op;
+  uint64_t buffer_offset;
+
+  if (!m_dst_image_ctx->migration_info.empty()) {
+    cls_client::assert_snapc_seq(&op, dst_snap_seq,
+                                 cls::rbd::ASSERT_SNAPC_SEQ_GT_SNAPSET_SEQ);
+  }
+
+  for (auto &copy_op : copy_ops) {
+    switch (copy_op.type) {
+    case COPY_OP_TYPE_WRITE:
+      // out_bl holds the data of all destination extents back to back
+      buffer_offset = 0;
+      for (auto &e : copy_op.dst_extent_map) {
+        ldout(m_cct, 20) << "write op: " << e.first << "~" << e.second
+                         << dendl;
+        bufferlist tmpbl;
+        tmpbl.substr_of(copy_op.out_bl, buffer_offset, e.second);
+        op.write(e.first, tmpbl);
+        op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+                         LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+        buffer_offset += e.second;
+      }
+      break;
+    case COPY_OP_TYPE_ZERO:
+      ldout(m_cct, 20) << "zero op: " << copy_op.dst_offset << "~"
+                       << copy_op.length << dendl;
+      op.zero(copy_op.dst_offset, copy_op.length);
+      break;
+    case COPY_OP_TYPE_REMOVE_TRUNC:
+      ldout(m_cct, 20) << "create op" << dendl;
+      op.create(false);
+      // fall through
+    case COPY_OP_TYPE_TRUNC:
+      ldout(m_cct, 20) << "trunc op: " << copy_op.dst_offset << dendl;
+      op.truncate(copy_op.dst_offset);
+      break;
+    case COPY_OP_TYPE_REMOVE:
+      ldout(m_cct, 20) << "remove op" << dendl;
+      op.remove();
+      break;
+    default:
+      ceph_abort();
+    }
+  }
+
+  // only the optional snapc assertion was queued: nothing to send
+  if (op.size() == (m_dst_image_ctx->migration_info.empty() ? 0 : 1)) {
+    handle_write_object(0);
+    return;
+  }
+
+  int r;
+  Context *finish_op_ctx;
+  {
+    RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+    finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r);
+  }
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    finish(r);
+    return;
+  }
+
+  // release the lock op only after the write result has been handled
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_write_object(r);
+      finish_op_ctx->complete(0);
+    });
+  librados::AioCompletion *comp = create_rados_callback(ctx);
+  r = m_dst_io_ctx.aio_operate(m_dst_oid, comp, &op, dst_snap_seq, dst_snap_ids,
+                               nullptr);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion handler for send_write_object(). -ENOENT and -ERANGE results
+// are treated as success; any other error fails the request. On success the
+// processed snapshot is dropped and either the next snapshot is written or
+// the object map update starts.
+template <typename I>
+void ObjectCopyRequest<I>::handle_write_object(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  switch (r) {
+  case -ENOENT:
+    r = 0;
+    break;
+  case -ERANGE:
+    ldout(m_cct, 10) << "concurrent deep copy" << dendl;
+    r = 0;
+    break;
+  default:
+    break;
+  }
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to write to destination object: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  m_write_ops.erase(m_write_ops.begin());
+  if (m_write_ops.empty()) {
+    send_update_object_map();
+    return;
+  }
+
+  send_write_object();
+}
+
+// Pushes the next pending entry of m_dst_object_state into the destination
+// image's object map. Finishes immediately with 0 when the object map feature
+// is disabled or nothing is pending, and with -EINVAL when the object map is
+// not loaded. The update runs inside an exclusive-lock op.
+template <typename I>
+void ObjectCopyRequest<I>::send_update_object_map() {
+ if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP) ||
+ m_dst_object_state.empty()) {
+ finish(0);
+ return;
+ }
+
+ m_dst_image_ctx->owner_lock.get_read();
+ m_dst_image_ctx->snap_lock.get_read();
+ if (m_dst_image_ctx->object_map == nullptr) {
+ // possible that exclusive lock was lost in background
+ lderr(m_cct) << "object map is not initialized" << dendl;
+
+ m_dst_image_ctx->snap_lock.put_read();
+ m_dst_image_ctx->owner_lock.put_read();
+ finish(-EINVAL);
+ return;
+ }
+
+ // consume the first pending state; the key is a source snap id that is
+ // mapped to its destination snap id through m_snap_map
+ auto &dst_object_state = *m_dst_object_state.begin();
+ auto it = m_snap_map.find(dst_object_state.first);
+ ceph_assert(it != m_snap_map.end());
+ auto dst_snap_id = it->second.front();
+ auto object_state = dst_object_state.second;
+ m_dst_object_state.erase(m_dst_object_state.begin());
+
+ ldout(m_cct, 20) << "dst_snap_id=" << dst_snap_id << ", object_state="
+ << static_cast<uint32_t>(object_state) << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ m_dst_image_ctx->snap_lock.put_read();
+ m_dst_image_ctx->owner_lock.put_read();
+ finish(r);
+ return;
+ }
+
+ // release the lock op only after the update result has been handled
+ auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+ handle_update_object_map(r);
+ finish_op_ctx->complete(0);
+ });
+
+ // work on a local copy of the pointer: members must not be read once the
+ // update has been submitted (see NOTE below)
+ auto dst_image_ctx = m_dst_image_ctx;
+ dst_image_ctx->object_map_lock.get_write();
+ bool sent = dst_image_ctx->object_map->template aio_update<
+ Context, &Context::complete>(dst_snap_id, m_dst_object_number, object_state,
+ {}, {}, false, ctx);
+
+ // NOTE: state machine might complete before we reach here
+ dst_image_ctx->object_map_lock.put_write();
+ dst_image_ctx->snap_lock.put_read();
+ dst_image_ctx->owner_lock.put_read();
+ if (!sent) {
+ // no update was issued; complete the callback ourselves
+ ceph_assert(dst_snap_id == CEPH_NOSNAP);
+ ctx->complete(0);
+ }
+}
+
+// Completion handler for send_update_object_map(): fails the request on
+// error, otherwise keeps updating until no object state is pending.
+template <typename I>
+void ObjectCopyRequest<I>::handle_update_object_map(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to update object map: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  if (m_dst_object_state.empty()) {
+    finish(0);
+    return;
+  }
+
+  send_update_object_map();
+}
+
+// Starts an operation guarded by the destination image's exclusive lock and
+// returns the context that must be completed when the operation is done.
+// Without an exclusive lock a no-op context is returned. The destination
+// owner_lock must be held by the caller (asserted); *r is only written by
+// the exclusive lock's start_op().
+template <typename I>
+Context *ObjectCopyRequest<I>::start_lock_op(RWLock &owner_lock, int* r) {
+  ceph_assert(m_dst_image_ctx->owner_lock.is_locked());
+  if (m_dst_image_ctx->exclusive_lock != nullptr) {
+    return m_dst_image_ctx->exclusive_lock->start_op(r);
+  }
+  // no exclusive lock in use: nothing to track
+  return new FunctionContext([](int r) {});
+}
+
+// Translates a byte position inside a source object into the matching byte
+// offset inside the destination object handled by this request. The position
+// is first mapped to an image offset using the source layout and then back to
+// an object extent using the destination layout; the result must fall into
+// m_dst_object_number (asserted).
+template <typename I>
+uint64_t ObjectCopyRequest<I>::src_to_dst_object_offset(uint64_t objectno,
+                                                        uint64_t offset) {
+  // source object position -> image offset
+  std::vector<std::pair<uint64_t, uint64_t>> src_image_extents;
+  Striper::extent_to_file(m_cct, &m_src_image_ctx->layout, objectno, offset, 1,
+                          src_image_extents);
+  ceph_assert(src_image_extents.size() == 1);
+  auto image_offset = src_image_extents.begin()->first;
+
+  // image offset -> destination object position
+  std::map<object_t, std::vector<ObjectExtent>> dst_extents_by_object;
+  Striper::file_to_extents(m_cct, m_dst_image_ctx->format_string,
+                           &m_dst_image_ctx->layout, image_offset, 1, 0,
+                           dst_extents_by_object);
+  ceph_assert(dst_extents_by_object.size() == 1);
+  auto &dst_extents = dst_extents_by_object.begin()->second;
+  ceph_assert(dst_extents.size() == 1);
+  auto &dst_extent = *dst_extents.begin();
+  ceph_assert(dst_extent.objectno == m_dst_object_number);
+
+  return dst_extent.offset;
+}
+
+// Builds m_src_objects and m_src_object_extents for the destination object.
+//
+// The full destination object is mapped to image extents with the destination
+// layout, and those are mapped to source object extents with the source
+// layout. Each source extent is then cut into pieces of the smaller of the
+// two stripe units and stored keyed by its destination object offset. The
+// pieces must add up to exactly one destination object (asserted).
+template <typename I>
+void ObjectCopyRequest<I>::compute_src_object_extents() {
+ std::vector<std::pair<uint64_t, uint64_t>> image_extents;
+ Striper::extent_to_file(m_cct, &m_dst_image_ctx->layout, m_dst_object_number,
+ 0, m_dst_image_ctx->layout.object_size, image_extents);
+
+ size_t total = 0;
+ for (auto &e : image_extents) {
+ std::map<object_t, std::vector<ObjectExtent>> src_object_extents;
+ Striper::file_to_extents(m_cct, m_src_image_ctx->format_string,
+ &m_src_image_ctx->layout, e.first, e.second, 0,
+ src_object_extents);
+ auto stripe_unit = std::min(m_src_image_ctx->layout.stripe_unit,
+ m_dst_image_ctx->layout.stripe_unit);
+ for (auto &p : src_object_extents) {
+ for (auto &s : p.second) {
+ m_src_objects.insert(s.objectno);
+ total += s.length;
+ // consume the extent one stripe unit at a time (s is modified in place)
+ while (s.length > 0) {
+ ceph_assert(s.length >= stripe_unit);
+ auto dst_object_offset = src_to_dst_object_offset(s.objectno, s.offset);
+ m_src_object_extents[dst_object_offset] = {s.objectno, s.offset,
+ stripe_unit};
+ s.offset += stripe_unit;
+ s.length -= stripe_unit;
+ }
+ }
+ }
+ }
+
+ ceph_assert(total == m_dst_image_ctx->layout.object_size);
+
+ ldout(m_cct, 20) << m_src_object_extents.size() << " src extents" << dendl;
+}
+
+// Rebuilds m_read_ops, m_read_snaps and m_zero_interval for the source object
+// currently being processed (m_src_ono).
+//
+// For every entry of m_snap_map, calc_snap_set_diff() is evaluated on
+// m_snap_set between the previous and the current source snapshot id. The
+// resulting diff is intersected with each source extent that belongs to
+// m_src_ono to create COPY_OP_TYPE_WRITE ops, and the part of an extent that
+// was cut off by a shrinking object is recorded as a zero interval. Read ops
+// are keyed by {source snap id to write for, snap id to read from}.
+template <typename I>
+void ObjectCopyRequest<I>::compute_read_ops() {
+ m_read_ops = {};
+ m_read_snaps = {};
+ m_zero_interval = {};
+
+ m_src_image_ctx->parent_lock.get_read();
+ bool hide_parent = (m_src_image_ctx->parent != nullptr);
+ m_src_image_ctx->parent_lock.put_read();
+
+ librados::snap_t src_copy_point_snap_id = m_snap_map.rbegin()->first;
+ // with a parent or a non-zero start snapshot the object is treated as if it
+ // previously existed at full size
+ bool prev_exists = (hide_parent || m_src_snap_id_start > 0);
+ uint64_t prev_end_size = prev_exists ?
+ m_src_image_ctx->layout.object_size : 0;
+ librados::snap_t start_src_snap_id = m_src_snap_id_start;
+
+ for (auto &pair : m_snap_map) {
+ ceph_assert(!pair.second.empty());
+ librados::snap_t end_src_snap_id = pair.first;
+ librados::snap_t end_dst_snap_id = pair.second.front();
+
+ interval_set<uint64_t> diff;
+ uint64_t end_size;
+ bool exists;
+ librados::snap_t clone_end_snap_id;
+ calc_snap_set_diff(m_cct, m_snap_set, start_src_snap_id,
+ end_src_snap_id, &diff, &end_size, &exists,
+ &clone_end_snap_id, &m_read_whole_object);
+
+ if (m_read_whole_object) {
+ ldout(m_cct, 1) << "need to read full object" << dendl;
+ diff.insert(0, m_src_image_ctx->layout.object_size);
+ exists = true;
+ end_size = m_src_image_ctx->layout.object_size;
+ clone_end_snap_id = end_src_snap_id;
+ } else if (!exists) {
+ end_size = 0;
+ if (hide_parent && end_src_snap_id == m_snap_map.begin()->first &&
+ m_snap_set.clones.empty()) {
+ ldout(m_cct, 20) << "no clones for existing object" << dendl;
+ exists = true;
+ diff.insert(0, m_src_image_ctx->layout.object_size);
+ clone_end_snap_id = end_src_snap_id;
+ }
+ }
+
+ ldout(m_cct, 20) << "start_src_snap_id=" << start_src_snap_id << ", "
+ << "end_src_snap_id=" << end_src_snap_id << ", "
+ << "clone_end_snap_id=" << clone_end_snap_id << ", "
+ << "end_dst_snap_id=" << end_dst_snap_id << ", "
+ << "diff=" << diff << ", "
+ << "end_size=" << end_size << ", "
+ << "exists=" << exists << dendl;
+
+ // make sure every snapshot has a (possibly empty) zero interval entry
+ m_zero_interval[end_src_snap_id] = {};
+
+ if (exists || prev_exists) {
+ // clip diff to size of object (in case it was truncated)
+ if (end_size < prev_end_size) {
+ interval_set<uint64_t> trunc;
+ // NOTE(review): the second argument here is prev_end_size, while the
+ // zero interval built further down passes (prev_end_size - end_size)
+ // in the same position -- confirm which length is intended
+ trunc.insert(end_size, prev_end_size);
+ trunc.intersection_of(diff);
+ diff.subtract(trunc);
+ ldout(m_cct, 20) << "clearing truncate diff: " << trunc << dendl;
+ }
+
+ if (exists) {
+ // reads should be issued against the newest (existing) snapshot within
+ // the associated snapshot object clone. writes should be issued
+ // against the oldest snapshot in the snap_map.
+ ceph_assert(clone_end_snap_id >= end_src_snap_id);
+ if (clone_end_snap_id > src_copy_point_snap_id) {
+ // do not read past the copy point snapshot
+ clone_end_snap_id = src_copy_point_snap_id;
+ }
+ }
+
+ for (auto &it : m_src_object_extents) {
+ auto dst_object_offset = it.first;
+ auto &e = it.second;
+
+ // only extents backed by the current source object are of interest
+ if (e.object_no != m_src_ono) {
+ continue;
+ }
+
+ interval_set<uint64_t> read_interval;
+ read_interval.insert(e.offset, e.length);
+
+ if (end_size < prev_end_size) {
+ interval_set<uint64_t> zero_interval;
+ zero_interval.insert(end_size, prev_end_size - end_size);
+ zero_interval.intersection_of(read_interval);
+ if (!zero_interval.empty()) {
+ // NOTE: this 'it' shadows the loop variable of the enclosing for
+ auto it = zero_interval.begin();
+ auto offset = it.get_start() - e.offset;
+ m_zero_interval[end_src_snap_id].insert(dst_object_offset + offset,
+ it.get_len());
+ ldout(m_cct, 20) << "extent " << e.offset << "~" << e.length
+ << " intersects truncation " << end_size << "~"
+ << prev_end_size - end_size << ", inserting zero "
+ << dst_object_offset + offset << "~"
+ << it.get_len() << dendl;
+ }
+ }
+
+ // limit read interval to diff
+ read_interval.intersection_of(diff);
+
+ ldout(m_cct, 20) << "src_object_extent: " << e.offset << "~" << e.length
+ << ", dst_object_offset=" << dst_object_offset
+ << ", read: " << read_interval << dendl;
+
+ ceph_assert(exists || read_interval.empty());
+
+ for (auto it = read_interval.begin(); it != read_interval.end();
+ it++) {
+ ceph_assert(it.get_start() >= e.offset);
+ auto offset = it.get_start() - e.offset;
+ ldout(m_cct, 20) << "read/write op: " << it.get_start() << "~"
+ << it.get_len() << " dst: "
+ << dst_object_offset + offset << dendl;
+ m_read_ops[{end_src_snap_id, clone_end_snap_id}]
+ .emplace_back(COPY_OP_TYPE_WRITE, it.get_start(),
+ dst_object_offset + offset, it.get_len());
+ }
+ }
+ }
+
+ // carry the state of this snapshot over to the next iteration
+ prev_end_size = end_size;
+ prev_exists = exists;
+ if (hide_parent && prev_exists && prev_end_size == 0) {
+ // hide parent
+ prev_end_size = m_src_image_ctx->layout.object_size;
+ }
+ start_src_snap_id = end_src_snap_id;
+ }
+
+ // the keys of m_read_ops define the order in which snapshots are read
+ for (auto &it : m_read_ops) {
+ m_read_snaps.push_back(it.first);
+ }
+}
+
+// Works out which source extents have to be read from the parent image.
+//
+// Resets m_read_ops and m_zero_interval, clears *parent_image_extents and
+// then, for every source extent flagged 'noent' that overlaps the parent,
+// queues a COPY_OP_TYPE_WRITE op (keyed by {first source snap id, 0}) and
+// appends the matching image extent to *parent_image_extents. Nothing is
+// queued when there is no parent, no extent is flagged, every extent is
+// flagged but flatten was not requested, or there is no parent overlap.
+// If anything was queued, the destination object state for the first source
+// snapshot is set to OBJECT_EXISTS.
+//
+// The caller must hold the source image's snap_lock and parent_lock.
+template <typename I>
+void ObjectCopyRequest<I>::compute_read_from_parent_ops(
+    io::Extents *parent_image_extents) {
+  // ceph_assert (as used throughout this file) rather than plain assert, so
+  // the lock preconditions are checked with the same macro as everything else
+  ceph_assert(m_src_image_ctx->snap_lock.is_locked());
+  ceph_assert(m_src_image_ctx->parent_lock.is_locked());
+
+  m_read_ops = {};
+  m_zero_interval = {};
+  parent_image_extents->clear();
+
+  if (m_src_image_ctx->parent == nullptr) {
+    ldout(m_cct, 20) << "no parent" << dendl;
+    return;
+  }
+
+  // count the extents whose source object does not exist
+  size_t noent_count = 0;
+  for (auto &it : m_src_object_extents) {
+    if (it.second.noent) {
+      noent_count++;
+    }
+  }
+
+  if (noent_count == 0) {
+    ldout(m_cct, 20) << "no extents need read from parent" << dendl;
+    return;
+  }
+
+  if (noent_count == m_src_object_extents.size() && !m_flatten) {
+    ldout(m_cct, 20) << "reading all extents skipped when no flatten"
+                     << dendl;
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  auto src_snap_seq = m_snap_map.begin()->first;
+
+  uint64_t parent_overlap;
+  int r = m_src_image_ctx->get_parent_overlap(src_snap_seq, &parent_overlap);
+  if (r < 0) {
+    ldout(m_cct, 5) << "failed getting parent overlap for snap_id: "
+                    << src_snap_seq << ": " << cpp_strerror(r) << dendl;
+    return;
+  }
+  if (parent_overlap == 0) {
+    ldout(m_cct, 20) << "no parent overlap" << dendl;
+    return;
+  }
+
+  for (auto &it : m_src_object_extents) {
+    auto dst_object_offset = it.first;
+    auto &e = it.second;
+
+    if (!e.noent) {
+      continue;
+    }
+
+    std::vector<std::pair<uint64_t, uint64_t>> image_extents;
+    Striper::extent_to_file(m_cct, &m_src_image_ctx->layout, e.object_no,
+                            e.offset, e.length, image_extents);
+
+    // clip the extent to the part that is actually covered by the parent
+    uint64_t overlap = m_src_image_ctx->prune_parent_extents(image_extents,
+                                                             parent_overlap);
+    if (overlap == 0) {
+      ldout(m_cct, 20) << "no parent overlap for object_no " << e.object_no
+                       << " extent " << e.offset << "~" << e.length << dendl;
+      continue;
+    }
+
+    ldout(m_cct, 20) << "object_no " << e.object_no << " extent " << e.offset
+                     << "~" << e.length << " overlap " << parent_overlap
+                     << " parent extents " << image_extents << dendl;
+
+    ceph_assert(image_extents.size() == 1);
+
+    auto src_image_offset = image_extents.begin()->first;
+    auto length = image_extents.begin()->second;
+
+    // look the op list up once; the new op covers exactly one source extent
+    auto &parent_copy_ops = m_read_ops[{src_snap_seq, 0}];
+    parent_copy_ops.emplace_back(COPY_OP_TYPE_WRITE, e.offset,
+                                 dst_object_offset, length);
+    parent_copy_ops.rbegin()->src_extent_map[e.offset] = length;
+    parent_image_extents->emplace_back(src_image_offset, length);
+  }
+
+  if (!parent_image_extents->empty()) {
+    m_dst_object_state[src_snap_seq] = OBJECT_EXISTS;
+  }
+}
+
+// Folds the outcome of the latest read pass into the destination-side state.
+//
+// m_zero_interval is merged into m_dst_zero_interval. For every read op the
+// sparse source extent map is translated into dst_extent_map; gaps before,
+// between and after the returned extents are recorded as zero intervals for
+// the op's source snapshot. Finally each op is moved into m_write_ops.
+template <typename I>
+void ObjectCopyRequest<I>::merge_write_ops() {
+ ldout(m_cct, 20) << dendl;
+
+ for (auto &it : m_zero_interval) {
+ m_dst_zero_interval[it.first].insert(it.second);
+ }
+
+ for (auto &it : m_read_ops) {
+ auto src_snap_seq = it.first.first;
+ auto &copy_ops = it.second;
+ for (auto &copy_op : copy_ops) {
+ // cursors that advance in lock-step through source and destination
+ uint64_t src_offset = copy_op.src_offset;
+ uint64_t dst_offset = copy_op.dst_offset;
+ for (auto &e : copy_op.src_extent_map) {
+ // hole between the cursor and the next extent that holds data
+ uint64_t zero_len = e.first - src_offset;
+ if (zero_len > 0) {
+ ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq
+ << ", inserting zero " << dst_offset << "~"
+ << zero_len << dendl;
+ m_dst_zero_interval[src_snap_seq].insert(dst_offset, zero_len);
+ src_offset += zero_len;
+ dst_offset += zero_len;
+ }
+ copy_op.dst_extent_map[dst_offset] = e.second;
+ src_offset += e.second;
+ dst_offset += e.second;
+ }
+ // trailing hole after the last extent that holds data
+ if (dst_offset < copy_op.dst_offset + copy_op.length) {
+ uint64_t zero_len = copy_op.dst_offset + copy_op.length - dst_offset;
+ ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq
+ << ", inserting zero " << dst_offset << "~"
+ << zero_len << dendl;
+ m_dst_zero_interval[src_snap_seq].insert(dst_offset, zero_len);
+ } else {
+ ceph_assert(dst_offset == copy_op.dst_offset + copy_op.length);
+ }
+ // copy_op is moved-from after this line and must not be used again
+ m_write_ops[src_snap_seq].emplace_back(std::move(copy_op));
+ }
+ }
+}
+
+// Turns the accumulated per-snapshot zero intervals (m_dst_zero_interval)
+// into remove / truncate / zero copy ops appended to m_write_ops, and records
+// the resulting object-map state per source snapshot in m_dst_object_state.
+//
+// prev_end_size tracks the destination object's size after the previous
+// snapshot; a zero interval reaching the current end of the object becomes a
+// truncate or remove, any other interval becomes a zero op.
+template <typename I>
+void ObjectCopyRequest<I>::compute_zero_ops() {
+ ldout(m_cct, 20) << dendl;
+
+ bool fast_diff = m_dst_image_ctx->test_features(RBD_FEATURE_FAST_DIFF);
+ uint64_t prev_end_size = 0;
+
+ m_src_image_ctx->parent_lock.get_read();
+ bool hide_parent = (m_src_image_ctx->parent != nullptr);
+ m_src_image_ctx->parent_lock.put_read();
+
+ for (auto &it : m_dst_zero_interval) {
+ auto src_snap_seq = it.first;
+ auto &zero_interval = it.second;
+
+ auto snap_map_it = m_snap_map.find(src_snap_seq);
+ ceph_assert(snap_map_it != m_snap_map.end());
+ auto dst_snap_seq = snap_map_it->second.front();
+
+ auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_seq);
+ ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end());
+ if (!dst_may_exist_it->second && prev_end_size > 0) {
+ // the object cannot exist at this snapshot but held data before: remove
+ ldout(m_cct, 5) << "object DNE for snap_id: " << dst_snap_seq << dendl;
+ m_write_ops[src_snap_seq].emplace_back(COPY_OP_TYPE_REMOVE, 0, 0, 0);
+ prev_end_size = 0;
+ continue;
+ }
+
+ if (hide_parent) {
+ RWLock::RLocker snap_locker(m_dst_image_ctx->snap_lock);
+ RWLock::RLocker parent_locker(m_dst_image_ctx->parent_lock);
+ uint64_t parent_overlap = 0;
+ int r = m_dst_image_ctx->get_parent_overlap(dst_snap_seq, &parent_overlap);
+ if (r < 0) {
+ // only logged; parent_overlap keeps its initial 0 unless the call
+ // stored a value, which leads into the "no parent overlap" branch
+ ldout(m_cct, 5) << "failed getting parent overlap for snap_id: "
+ << dst_snap_seq << ": " << cpp_strerror(r) << dendl;
+ }
+ if (parent_overlap == 0) {
+ ldout(m_cct, 20) << "no parent overlap" << dendl;
+ hide_parent = false;
+ } else {
+ std::vector<std::pair<uint64_t, uint64_t>> image_extents;
+ Striper::extent_to_file(m_cct, &m_dst_image_ctx->layout,
+ m_dst_object_number, 0,
+ m_dst_image_ctx->layout.object_size,
+ image_extents);
+ uint64_t overlap = m_dst_image_ctx->prune_parent_extents(image_extents,
+ parent_overlap);
+ if (overlap == 0) {
+ ldout(m_cct, 20) << "no parent overlap" << dendl;
+ hide_parent = false;
+ } else if (src_snap_seq == m_dst_zero_interval.begin()->first) {
+ // first snapshot: start from the size covered by the parent overlap
+ for (auto e : image_extents) {
+ prev_end_size += e.second;
+ }
+ ceph_assert(prev_end_size <= m_dst_image_ctx->layout.object_size);
+ }
+ }
+ }
+
+ uint64_t end_size = prev_end_size;
+
+ // update end_size if there are writes into higher offsets
+ auto iter = m_write_ops.find(src_snap_seq);
+ if (iter != m_write_ops.end()) {
+ for (auto &copy_op : iter->second) {
+ for (auto &e : copy_op.dst_extent_map) {
+ end_size = std::max(end_size, e.first + e.second);
+ }
+ }
+ }
+
+ for (auto z = zero_interval.begin(); z != zero_interval.end(); z++) {
+ if (z.get_start() + z.get_len() >= end_size) {
+ // zero interval at the object end
+ if (z.get_start() == 0 && hide_parent) {
+ m_write_ops[src_snap_seq]
+ .emplace_back(COPY_OP_TYPE_REMOVE_TRUNC, 0, 0, 0);
+ ldout(m_cct, 20) << "COPY_OP_TYPE_REMOVE_TRUNC" << dendl;
+ } else if (z.get_start() < prev_end_size) {
+ if (z.get_start() == 0) {
+ m_write_ops[src_snap_seq]
+ .emplace_back(COPY_OP_TYPE_REMOVE, 0, 0, 0);
+ ldout(m_cct, 20) << "COPY_OP_TYPE_REMOVE" << dendl;
+ } else {
+ m_write_ops[src_snap_seq]
+ .emplace_back(COPY_OP_TYPE_TRUNC, 0, z.get_start(), 0);
+ ldout(m_cct, 20) << "COPY_OP_TYPE_TRUNC " << z.get_start() << dendl;
+ }
+ }
+ end_size = std::min(end_size, z.get_start());
+ } else {
+ // zero interval inside the object
+ m_write_ops[src_snap_seq]
+ .emplace_back(COPY_OP_TYPE_ZERO, 0, z.get_start(), z.get_len());
+ ldout(m_cct, 20) << "COPY_OP_TYPE_ZERO " << z.get_start() << "~"
+ << z.get_len() << dendl;
+ }
+ }
+ ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", end_size="
+ << end_size << dendl;
+ if (end_size > 0 || hide_parent) {
+ m_dst_object_state[src_snap_seq] = OBJECT_EXISTS;
+ // unchanged size and no ops for this snapshot: mark as clean when
+ // fast-diff is enabled (note that operator[] creates an empty entry)
+ if (fast_diff && end_size == prev_end_size &&
+ m_write_ops[src_snap_seq].empty()) {
+ m_dst_object_state[src_snap_seq] = OBJECT_EXISTS_CLEAN;
+ }
+ }
+ prev_end_size = end_size;
+ }
+}
+
+// Terminal state: ends the source async operation, destroys this request and
+// only then reports the result. The callback pointer is copied to a local
+// first because the members are gone once 'delete this' has run.
+template <typename I>
+void ObjectCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ // ensure IoCtxs are closed prior to proceeding
+ auto on_finish = m_on_finish;
+
+ m_src_async_op->finish_op();
+ delete m_src_async_op;
+ delete this;
+
+ on_finish->complete(r);
+}
+
+// Records, for every destination snapshot and for the image HEAD
+// (CEPH_NOSNAP), whether the destination object number lies within the
+// image's object count at that point. Evaluated under the destination
+// snap_lock.
+template <typename I>
+void ObjectCopyRequest<I>::compute_dst_object_may_exist() {
+  RWLock::RLocker snap_locker(m_dst_image_ctx->snap_lock);
+
+  // all snapshots plus the HEAD revision
+  auto dst_snap_ids = m_dst_image_ctx->snaps;
+  dst_snap_ids.push_back(CEPH_NOSNAP);
+
+  for (auto id_it = dst_snap_ids.begin(); id_it != dst_snap_ids.end();
+       ++id_it) {
+    const bool within_image =
+      (m_dst_object_number < m_dst_image_ctx->get_object_count(*id_it));
+    m_dst_object_may_exist[*id_it] = within_image;
+  }
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::ObjectCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/ObjectCopyRequest.h b/src/librbd/deep_copy/ObjectCopyRequest.h
new file mode 100644
index 00000000..a0445f3b
--- /dev/null
+++ b/src/librbd/deep_copy/ObjectCopyRequest.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/interval_set.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/deep_copy/Types.h"
+#include "librbd/io/Types.h"
+#include <list>
+#include <map>
+#include <string>
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+namespace io { class AsyncOperation; }
+
+namespace deep_copy {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ObjectCopyRequest { // request object driven through the states drawn in the private section
+public:
+ static ObjectCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t dst_snap_id_start,
+ const SnapMap &snap_map,
+ uint64_t object_number, bool flatten,
+ Context *on_finish) {
+ return new ObjectCopyRequest(src_image_ctx, dst_image_ctx,
+ src_snap_id_start, dst_snap_id_start, snap_map,
+ object_number, flatten, on_finish);
+ }
+ // create() heap-allocates and forwards its arguments unchanged; finish() in the .cc ends with `delete this`
+ ObjectCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t dst_snap_id_start, const SnapMap &snap_map,
+ uint64_t object_number, bool flatten, Context *on_finish);
+ // entry point of the request; defined out of line
+ void send();
+
+ // testing support
+ inline librados::IoCtx &get_src_io_ctx() {
+ return m_src_io_ctx;
+ }
+ inline librados::IoCtx &get_dst_io_ctx() {
+ return m_dst_io_ctx;
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | /----------------------\
+ * | | |
+ * v v | (repeat for each src object)
+ * LIST_SNAPS < * * * |
+ * | * (-ENOENT and snap set stale)
+ * | * * * * * * |
+ * | * /-----------\ |
+ * | * | | (repeat for each snapshot)
+ * v * v | |
+ * READ_OBJECT ---------/ |
+ * | | |
+ * | \----------------------/
+ * v
+ * READ_FROM_PARENT (skip if not needed)
+ * |
+ * | /-----------\
+ * | | | (repeat for each snapshot)
+ * v v |
+ * WRITE_OBJECT --------/
+ * |
+ * | /-----------\
+ * | | | (repeat for each snapshot)
+ * v v |
+ * UPDATE_OBJECT_MAP ---/ (skip if object
+ * | map disabled)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ // one extent of a source object; noent defaults to false and is not set by either constructor
+ struct SrcObjectExtent {
+ uint64_t object_no = 0;
+ uint64_t offset = 0;
+ uint64_t length = 0;
+ bool noent = false;
+
+ SrcObjectExtent() {
+ }
+ SrcObjectExtent(uint64_t object_no, uint64_t offset, uint64_t length)
+ : object_no(object_no), offset(offset), length(length) {
+ }
+ };
+
+ typedef std::map<uint64_t, SrcObjectExtent> SrcObjectExtents;
+ // kinds of operation a CopyOp can describe
+ enum CopyOpType {
+ COPY_OP_TYPE_WRITE,
+ COPY_OP_TYPE_ZERO,
+ COPY_OP_TYPE_TRUNC,
+ COPY_OP_TYPE_REMOVE,
+ COPY_OP_TYPE_REMOVE_TRUNC,
+ };
+
+ typedef std::map<uint64_t, uint64_t> ExtentMap;
+ // a single operation: its type plus (src_offset, dst_offset, length); the extent maps and out_bl start empty
+ struct CopyOp {
+ CopyOp(CopyOpType type, uint64_t src_offset, uint64_t dst_offset,
+ uint64_t length)
+ : type(type), src_offset(src_offset), dst_offset(dst_offset),
+ length(length) {
+ }
+
+ CopyOpType type;
+ uint64_t src_offset;
+ uint64_t dst_offset;
+ uint64_t length;
+
+ ExtentMap src_extent_map;
+ ExtentMap dst_extent_map;
+ bufferlist out_bl;
+ };
+
+ typedef std::list<CopyOp> CopyOps;
+ typedef std::pair<librados::snap_t, librados::snap_t> WriteReadSnapIds;
+ typedef std::map<librados::snap_t, uint8_t> SnapObjectStates;
+ typedef std::map<librados::snap_t, std::map<uint64_t, uint64_t>> SnapObjectSizes;
+ // constructor arguments (m_cct has no matching constructor parameter)
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ CephContext *m_cct;
+ librados::snap_t m_src_snap_id_start;
+ librados::snap_t m_dst_snap_id_start;
+ SnapMap m_snap_map;
+ uint64_t m_dst_object_number;
+ bool m_flatten;
+ Context *m_on_finish;
+ // IoCtx members exposed through get_src_io_ctx() / get_dst_io_ctx(); destroyed together with the request
+ decltype(m_src_image_ctx->data_ctx) m_src_io_ctx;
+ decltype(m_dst_image_ctx->data_ctx) m_dst_io_ctx;
+ std::string m_dst_oid;
+
+ std::set<uint64_t> m_src_objects; // NOTE(review): <set> is not included directly by this header
+ uint64_t m_src_ono; // no in-class initializer -- NOTE(review): confirm it is assigned before first use
+ std::string m_src_oid;
+ SrcObjectExtents m_src_object_extents;
+ librados::snap_set_t m_snap_set;
+ int m_snap_ret = 0;
+ bool m_retry_missing_read = false;
+ librados::snap_set_t m_retry_snap_set;
+ bool m_read_whole_object = false;
+
+ std::map<WriteReadSnapIds, CopyOps> m_read_ops;
+ std::list<WriteReadSnapIds> m_read_snaps;
+ std::map<librados::snap_t, CopyOps> m_write_ops; // indexed by source snap seq in the .cc
+ std::map<librados::snap_t, interval_set<uint64_t>> m_zero_interval;
+ std::map<librados::snap_t, interval_set<uint64_t>> m_dst_zero_interval;
+ std::map<librados::snap_t, uint8_t> m_dst_object_state; // per source snap seq; receives OBJECT_EXISTS / OBJECT_EXISTS_CLEAN in the .cc
+ std::map<librados::snap_t, bool> m_dst_object_may_exist; // filled by compute_dst_object_may_exist(): object number < object count for that snap id
+ bufferlist m_read_from_parent_data;
+
+ io::AsyncOperation* m_src_async_op = nullptr; // finished and deleted in finish()
+ // send_*/handle_* pairs, named after the states in the diagram above
+ void send_list_snaps();
+ void handle_list_snaps(int r);
+
+ void send_read_object();
+ void handle_read_object(int r);
+
+ void send_read_from_parent();
+ void handle_read_from_parent(int r);
+
+ void send_write_object();
+ void handle_write_object(int r);
+
+ void send_update_object_map();
+ void handle_update_object_map(int r);
+
+ Context *start_lock_op(RWLock &owner_lock, int* r);
+
+ uint64_t src_to_dst_object_offset(uint64_t objectno, uint64_t offset);
+
+ void compute_src_object_extents();
+ void compute_read_ops();
+ void compute_read_from_parent_ops(io::Extents *image_extents);
+ void merge_write_ops();
+ void compute_zero_ops();
+
+ void compute_dst_object_may_exist();
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::ObjectCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/SetHeadRequest.cc b/src/librbd/deep_copy/SetHeadRequest.cc
new file mode 100644
index 00000000..5670ba07
--- /dev/null
+++ b/src/librbd/deep_copy/SetHeadRequest.cc
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SetHeadRequest.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/Utils.h"
+#include "librbd/image/AttachParentRequest.h"
+#include "librbd/image/DetachParentRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::SetHeadRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+// Captures the target head state (size, parent spec, parent overlap) and the completion callback; the overlap must not exceed the size.
+template <typename I>
+SetHeadRequest<I>::SetHeadRequest(I *image_ctx, uint64_t size,
+ const cls::rbd::ParentImageSpec &spec,
+ uint64_t parent_overlap,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_size(size), m_parent_spec(spec),
+ m_parent_overlap(parent_overlap), m_on_finish(on_finish),
+ m_cct(image_ctx->cct) {
+ ceph_assert(m_parent_overlap <= m_size); // aborts on an overlap larger than the image size
+}
+// Entry point: starts the chain with the set-size step.
+template <typename I>
+void SetHeadRequest<I>::send() {
+ send_set_size();
+}
+
+template <typename I>
+void SetHeadRequest<I>::send_set_size() {
+  // skip straight to the parent handling when the in-memory size already
+  // equals the requested one
+  bool size_matches;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx->snap_lock);
+    size_matches = (m_image_ctx->size == m_size);
+  }
+  if (size_matches) {
+    send_detach_parent();
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  // Change the image size on disk so that the snapshot picks up
+  // the expected size. We can do this because the last snapshot
+  // we process is the sync snapshot which was created to match the
+  // image size. We also don't need to worry about trimming because
+  // we track the highest possible object number within the sync record
+  librados::ObjectWriteOperation op;
+  librbd::cls_client::set_size(&op, m_size);
+
+  int r;
+  Context *finish_op_ctx = start_lock_op(&r);
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    finish(r);
+    return;
+  }
+
+  // the handler runs first, then the lock op is released
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_set_size(r);
+      finish_op_ctx->complete(0);
+    });
+  librados::AioCompletion *comp = create_rados_callback(ctx);
+  r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+// Completion of the on-disk size update: on success mirrors the new size (and a clipped parent overlap) in memory, then moves on to the detach-parent step.
+template <typename I>
+void SetHeadRequest<I>::handle_set_size(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to update image size: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ // adjust in-memory image size now that it's updated on disk
+ RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+ if (m_image_ctx->size > m_size) { // shrinking: parent_lock is taken only on this path, nested inside snap_lock
+ RWLock::WLocker parent_locker(m_image_ctx->parent_lock);
+ if (m_image_ctx->parent_md.spec.pool_id != -1 &&
+ m_image_ctx->parent_md.overlap > m_size) {
+ m_image_ctx->parent_md.overlap = m_size; // overlap is clipped to the new size
+ }
+ }
+ m_image_ctx->size = m_size;
+ }
+
+ send_detach_parent();
+}
+
+template <typename I>
+void SetHeadRequest<I>::send_detach_parent() {
+  // no detach is needed when there is no parent at all, or when the current
+  // parent already matches the requested spec and overlap
+  bool skip_detach;
+  {
+    RWLock::RLocker parent_locker(m_image_ctx->parent_lock);
+    skip_detach =
+      (m_image_ctx->parent_md.spec.pool_id == -1 ||
+       (m_image_ctx->parent_md.spec == m_parent_spec &&
+        m_image_ctx->parent_md.overlap == m_parent_overlap));
+  }
+  if (skip_detach) {
+    send_attach_parent();
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  int r;
+  Context *finish_op_ctx = start_lock_op(&r);
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    finish(r);
+    return;
+  }
+
+  // the handler runs first, then the lock op is released
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_detach_parent(r);
+      finish_op_ctx->complete(0);
+    });
+  auto req = image::DetachParentRequest<I>::create(*m_image_ctx, ctx);
+  req->send();
+}
+// Completion of the detach request: on success clears the in-memory parent spec and overlap, then proceeds to the attach-parent step.
+template <typename I>
+void SetHeadRequest<I>::handle_detach_parent(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove parent: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ // adjust in-memory parent now that it's updated on disk
+ RWLock::WLocker parent_locker(m_image_ctx->parent_lock);
+ m_image_ctx->parent_md.spec = {};
+ m_image_ctx->parent_md.overlap = 0;
+ }
+
+ send_attach_parent();
+}
+
+template <typename I>
+void SetHeadRequest<I>::send_attach_parent() {
+  // finished already if the in-memory parent equals the requested one
+  bool parent_matches;
+  {
+    RWLock::RLocker parent_locker(m_image_ctx->parent_lock);
+    parent_matches =
+      (m_image_ctx->parent_md.spec == m_parent_spec &&
+       m_image_ctx->parent_md.overlap == m_parent_overlap);
+  }
+  if (parent_matches) {
+    finish(0);
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  int r;
+  Context *finish_op_ctx = start_lock_op(&r);
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    finish(r);
+    return;
+  }
+
+  // the handler runs first, then the lock op is released
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_attach_parent(r);
+      finish_op_ctx->complete(0);
+    });
+  auto req = image::AttachParentRequest<I>::create(
+    *m_image_ctx, m_parent_spec, m_parent_overlap, false, ctx);
+  req->send();
+}
+// Completion of the attach request: on success records the new parent spec and overlap in memory and finishes with 0.
+template <typename I>
+void SetHeadRequest<I>::handle_attach_parent(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ // adjust in-memory parent now that it's updated on disk
+ RWLock::WLocker parent_locker(m_image_ctx->parent_lock);
+ m_image_ctx->parent_md.spec = m_parent_spec;
+ m_image_ctx->parent_md.overlap = m_parent_overlap;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+Context *SetHeadRequest<I>::start_lock_op(int* r) {
+  // Returns a context the caller completes once its operation is done.
+  // With an exclusive lock present the lock's start_op() supplies it (and
+  // may return nullptr, setting *r); otherwise a no-op context is returned
+  // and *r is left untouched.
+  RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+  if (m_image_ctx->exclusive_lock != nullptr) {
+    return m_image_ctx->exclusive_lock->start_op(r);
+  }
+  return new FunctionContext([](int r) {});
+}
+// Terminal step: logs r, completes the caller's callback with r and destroys this request.
+template <typename I>
+void SetHeadRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this; // nothing may touch members after this line
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::SetHeadRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/SetHeadRequest.h b/src/librbd/deep_copy/SetHeadRequest.h
new file mode 100644
index 00000000..9a17c9fd
--- /dev/null
+++ b/src/librbd/deep_copy/SetHeadRequest.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <map>
+#include <set>
+#include <string>
+#include <tuple>
+
+class Context;
+
+namespace librbd {
+namespace deep_copy {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SetHeadRequest { // brings an image's size and parent (spec + overlap) in line with the requested values; steps are drawn below
+public:
+ static SetHeadRequest* create(ImageCtxT *image_ctx, uint64_t size,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ uint64_t parent_overlap,
+ Context *on_finish) {
+ return new SetHeadRequest(image_ctx, size, parent_spec, parent_overlap,
+ on_finish);
+ }
+ // create() heap-allocates; the request deletes itself in finish()
+ SetHeadRequest(ImageCtxT *image_ctx, uint64_t size,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ uint64_t parent_overlap, Context *on_finish);
+ // starts with the SET_SIZE step
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (skip if not needed)
+ * SET_SIZE
+ * |
+ * v (skip if not needed)
+ * DETACH_PARENT
+ * |
+ * v (skip if not needed)
+ * ATTACH_PARENT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ // constructor arguments
+ ImageCtxT *m_image_ctx;
+ uint64_t m_size;
+ cls::rbd::ParentImageSpec m_parent_spec;
+ uint64_t m_parent_overlap;
+ Context *m_on_finish;
+
+ CephContext *m_cct; // taken from image_ctx->cct in the constructor
+
+ void send_set_size();
+ void handle_set_size(int r);
+
+ void send_detach_parent();
+ void handle_detach_parent(int r);
+
+ void send_attach_parent();
+ void handle_attach_parent(int r);
+ // returns a context to complete when the guarded operation is done, or nullptr with *r set
+ Context *start_lock_op(int* r);
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::SetHeadRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H
diff --git a/src/librbd/deep_copy/SnapshotCopyRequest.cc b/src/librbd/deep_copy/SnapshotCopyRequest.cc
new file mode 100644
index 00000000..d1375a42
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCopyRequest.cc
@@ -0,0 +1,730 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SnapshotCopyRequest.h"
+#include "SetHeadRequest.h"
+#include "SnapshotCreateRequest.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::SnapshotCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+namespace {
+
+// Returns the name of the snapshot whose id is snap_id by scanning
+// image_ctx->snap_ids, whose entries pair a (namespace, name) key with a
+// snap id. Asserts that the id is present.
+template <typename I>
+const std::string &get_snapshot_name(I *image_ctx, librados::snap_t snap_id) {
+  // `const auto &` binds straight to the container element. A hand-spelled
+  // std::pair type that omits the const on the key does not match a map's
+  // value_type and would be materialised as a temporary copy (including the
+  // name string) for every element visited.
+  auto snap_it = std::find_if(image_ctx->snap_ids.begin(),
+                              image_ctx->snap_ids.end(),
+                              [snap_id](const auto &entry) {
+      return entry.second == snap_id;
+    });
+  ceph_assert(snap_it != image_ctx->snap_ids.end());
+  return snap_it->first.second;
+}
+
+} // anonymous namespace
+
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+
+template <typename I>
+SnapshotCopyRequest<I>::SnapshotCopyRequest(I *src_image_ctx,
+                                            I *dst_image_ctx,
+                                            librados::snap_t src_snap_id_start,
+                                            librados::snap_t src_snap_id_end,
+                                            librados::snap_t dst_snap_id_start,
+                                            bool flatten, ContextWQ *work_queue,
+                                            SnapSeqs *snap_seqs,
+                                            Context *on_finish)
+  : RefCountedObject(dst_image_ctx->cct, 1), m_src_image_ctx(src_image_ctx),
+    m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start),
+    m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start),
+    m_flatten(flatten), m_work_queue(work_queue), m_snap_seqs_result(snap_seqs),
+    m_snap_seqs(*snap_seqs), m_on_finish(on_finish), m_cct(dst_image_ctx->cct),
+    m_lock(unique_lock_name("SnapshotCopyRequest::m_lock", this)) {
+  // the start ids must be either both zero or both non-zero
+  ceph_assert((m_src_snap_id_start == 0 && m_dst_snap_id_start == 0) ||
+              (m_src_snap_id_start > 0 && m_dst_snap_id_start > 0));
+
+  // snap ids ordered from oldest to newest
+  {
+    RWLock::RLocker src_snap_locker(m_src_image_ctx->snap_lock);
+    m_src_snap_ids.insert(src_image_ctx->snaps.begin(),
+                          src_image_ctx->snaps.end());
+  }
+  {
+    RWLock::RLocker dst_snap_locker(m_dst_image_ctx->snap_lock);
+    m_dst_snap_ids.insert(dst_image_ctx->snaps.begin(),
+                          dst_image_ctx->snaps.end());
+  }
+
+  if (m_src_snap_id_end == CEPH_NOSNAP) {
+    return;
+  }
+
+  // drop every source snapshot newer than the requested end snapshot
+  m_src_snap_ids.erase(m_src_snap_ids.upper_bound(m_src_snap_id_end),
+                       m_src_snap_ids.end());
+}
+// Entry point: validates the parent specs of the source and destination images, then starts the unprotect pass.
+template <typename I>
+void SnapshotCopyRequest<I>::send() {
+ cls::rbd::ParentImageSpec src_parent_spec; // filled by validate_parent() but not read afterwards in this function
+ int r = validate_parent(m_src_image_ctx, &src_parent_spec);
+ if (r < 0) {
+ lderr(m_cct) << "source image parent spec mismatch" << dendl;
+ error(r); // error() completes through the work queue instead of inline
+ return;
+ }
+
+ r = validate_parent(m_dst_image_ctx, &m_dst_parent_spec);
+ if (r < 0) {
+ lderr(m_cct) << "destination image parent spec mismatch" << dendl;
+ error(r);
+ return;
+ }
+
+ send_snap_unprotect();
+}
+// Sets the cancel flag under m_lock; the flag is read by handle_cancellation().
+template <typename I>
+void SnapshotCopyRequest<I>::cancel() {
+ Mutex::Locker locker(m_lock);
+
+ ldout(m_cct, 20) << dendl;
+ m_canceled = true;
+}
+// Unprotect pass: finds the next protected destination snapshot that is unmapped or whose source snapshot is unprotected/missing and unprotects it; moves to the remove pass when none is left.
+template <typename I>
+void SnapshotCopyRequest<I>::send_snap_unprotect() {
+ // resume after the snapshot handled last time, or after m_dst_snap_id_start on the first pass
+ SnapIdSet::iterator snap_id_it = m_dst_snap_ids.begin();
+ if (m_prev_snap_id != CEPH_NOSNAP) {
+ snap_id_it = m_dst_snap_ids.upper_bound(m_prev_snap_id);
+ } else if (m_dst_snap_id_start > 0) {
+ snap_id_it = m_dst_snap_ids.upper_bound(m_dst_snap_id_start);
+ }
+
+ for (; snap_id_it != m_dst_snap_ids.end(); ++snap_id_it) {
+ librados::snap_t dst_snap_id = *snap_id_it;
+
+ m_dst_image_ctx->snap_lock.get_read();
+
+ bool dst_unprotected;
+ int r = m_dst_image_ctx->is_snap_unprotected(dst_snap_id, &dst_unprotected);
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve destination snap unprotect status: "
+ << cpp_strerror(r) << dendl;
+ m_dst_image_ctx->snap_lock.put_read();
+ finish(r);
+ return;
+ }
+ m_dst_image_ctx->snap_lock.put_read();
+
+ if (dst_unprotected) {
+ // snap is already unprotected -- check next snap
+ continue;
+ }
+
+ // if destination snapshot is protected and (1) it isn't in our mapping
+ // table, or (2) the source snapshot isn't protected, unprotect it
+ auto snap_seq_it = std::find_if(
+ m_snap_seqs.begin(), m_snap_seqs.end(),
+ [dst_snap_id](const SnapSeqs::value_type& pair) {
+ return pair.second == dst_snap_id;
+ });
+ // m_snap_seqs maps source id -> destination id, hence the search by value above
+ if (snap_seq_it != m_snap_seqs.end()) {
+ m_src_image_ctx->snap_lock.get_read();
+ bool src_unprotected;
+ r = m_src_image_ctx->is_snap_unprotected(snap_seq_it->first,
+ &src_unprotected);
+ ldout(m_cct, 20) << "m_src_image_ctx->is_snap_unprotected("
+ << snap_seq_it->first << "): r=" << r
+ << ", src_unprotected=" << src_unprotected << dendl;
+ if (r == -ENOENT) { // a missing source snapshot is treated as unprotected
+ src_unprotected = true;
+ r = 0;
+ }
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve source snap unprotect status: "
+ << cpp_strerror(r) << dendl;
+ m_src_image_ctx->snap_lock.put_read();
+ finish(r);
+ return;
+ }
+ m_src_image_ctx->snap_lock.put_read();
+
+ if (src_unprotected) {
+ // source is unprotected -- unprotect destination snap
+ break;
+ }
+ } else {
+ // source snapshot doesn't exist -- unprotect destination snap
+ break;
+ }
+ }
+
+ if (snap_id_it == m_dst_snap_ids.end()) {
+ // no destination snapshots to unprotect
+ m_prev_snap_id = CEPH_NOSNAP;
+ send_snap_remove();
+ return;
+ }
+ // remember where to resume once the asynchronous unprotect completes
+ m_prev_snap_id = *snap_id_it;
+ m_snap_name = get_snapshot_name(m_dst_image_ctx, m_prev_snap_id);
+
+ ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", "
+ << "snap_id=" << m_prev_snap_id << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+ // the handler runs first, then the lock op is released
+ auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+ handle_snap_unprotect(r);
+ finish_op_ctx->complete(0);
+ });
+ RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+ m_dst_image_ctx->operations->execute_snap_unprotect(
+ cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx);
+}
+
+// Completion of the unprotect operation issued for m_prev_snap_id.
+// On failure the request finishes with r. On success the cached protection
+// status of the destination snapshot is updated in memory, cancellation is
+// honoured, and the unprotect pass continues with the next snapshot.
+template <typename I>
+void SnapshotCopyRequest<I>::handle_snap_unprotect(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to unprotect snapshot '" << m_snap_name << "': "
+                 << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  {
+    // avoid the need to refresh to delete the newly unprotected snapshot.
+    // snap_info is modified below, so snap_lock is held exclusively rather
+    // than shared: a shared hold would let concurrent readers observe the
+    // write in progress.
+    RWLock::WLocker snap_locker(m_dst_image_ctx->snap_lock);
+    auto snap_info_it = m_dst_image_ctx->snap_info.find(m_prev_snap_id);
+    if (snap_info_it != m_dst_image_ctx->snap_info.end()) {
+      snap_info_it->second.protection_status =
+        RBD_PROTECTION_STATUS_UNPROTECTED;
+    }
+  }
+
+  if (handle_cancellation()) {
+    return;
+  }
+
+  send_snap_unprotect();
+}
+
+// Remove pass: finds the next destination user snapshot that is not the
+// target of any entry in m_snap_seqs and removes it. When no such snapshot
+// is left, resets m_prev_snap_id and moves on to the create pass.
+template <typename I>
+void SnapshotCopyRequest<I>::send_snap_remove() {
+  // resume after the snapshot handled last time, or after
+  // m_dst_snap_id_start on the first pass
+  SnapIdSet::iterator snap_id_it = m_dst_snap_ids.begin();
+  if (m_prev_snap_id != CEPH_NOSNAP) {
+    snap_id_it = m_dst_snap_ids.upper_bound(m_prev_snap_id);
+  } else if (m_dst_snap_id_start > 0) {
+    snap_id_it = m_dst_snap_ids.upper_bound(m_dst_snap_id_start);
+  }
+
+  for (; snap_id_it != m_dst_snap_ids.end(); ++snap_id_it) {
+    librados::snap_t dst_snap_id = *snap_id_it;
+
+    cls::rbd::SnapshotNamespace snap_namespace;
+    m_dst_image_ctx->snap_lock.get_read();
+    int r = m_dst_image_ctx->get_snap_namespace(dst_snap_id, &snap_namespace);
+    m_dst_image_ctx->snap_lock.put_read();
+    if (r < 0) {
+      // m_snap_name has not been set for this snapshot yet (it still holds
+      // whatever an earlier step stored), so report the id being examined
+      // together with the error code
+      lderr(m_cct) << "failed to retrieve destination snap namespace: "
+                   << "snap_id=" << dst_snap_id << ": " << cpp_strerror(r)
+                   << dendl;
+      finish(r);
+      return;
+    }
+
+    // only user snapshots are candidates for removal
+    if (boost::get<cls::rbd::UserSnapshotNamespace>(&snap_namespace) ==
+          nullptr) {
+      continue;
+    }
+
+    // if the destination snapshot isn't in our mapping table, remove it
+    auto snap_seq_it = std::find_if(
+      m_snap_seqs.begin(), m_snap_seqs.end(),
+      [dst_snap_id](const SnapSeqs::value_type& pair) {
+        return pair.second == dst_snap_id;
+      });
+
+    if (snap_seq_it == m_snap_seqs.end()) {
+      break;
+    }
+  }
+
+  if (snap_id_it == m_dst_snap_ids.end()) {
+    // no destination snapshots to delete
+    m_prev_snap_id = CEPH_NOSNAP;
+    send_snap_create();
+    return;
+  }
+
+  // remember where to resume once the asynchronous removal completes
+  m_prev_snap_id = *snap_id_it;
+  m_snap_name = get_snapshot_name(m_dst_image_ctx, m_prev_snap_id);
+
+  ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", "
+                   << "snap_id=" << m_prev_snap_id << dendl;
+
+  int r;
+  auto finish_op_ctx = start_lock_op(&r);
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    finish(r);
+    return;
+  }
+
+  // the handler runs first, then the lock op is released
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_snap_remove(r);
+      finish_op_ctx->complete(0);
+    });
+  RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+  m_dst_image_ctx->operations->execute_snap_remove(
+    cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx);
+}
+// Completion of a snapshot removal: fails the request on error, honours cancellation, otherwise continues the remove pass.
+template <typename I>
+void SnapshotCopyRequest<I>::handle_snap_remove(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ if (handle_cancellation()) { // finishes with -ECANCELED when cancel() was called
+ return;
+ }
+
+ send_snap_remove();
+}
+
+// Create pass: finds the next source snapshot that has no entry in
+// m_snap_seqs and is a user snapshot, and creates a matching snapshot on
+// the destination. An unmapped non-user snapshot equal to m_src_snap_id_end
+// is mapped to the destination HEAD instead. When nothing is left to
+// create, resets m_prev_snap_id and moves on to the protect pass.
+template <typename I>
+void SnapshotCopyRequest<I>::send_snap_create() {
+  // resume after the snapshot handled last time, or after
+  // m_src_snap_id_start on the first pass
+  SnapIdSet::iterator snap_id_it = m_src_snap_ids.begin();
+  if (m_prev_snap_id != CEPH_NOSNAP) {
+    snap_id_it = m_src_snap_ids.upper_bound(m_prev_snap_id);
+  } else if (m_src_snap_id_start > 0) {
+    snap_id_it = m_src_snap_ids.upper_bound(m_src_snap_id_start);
+  }
+
+  for (; snap_id_it != m_src_snap_ids.end(); ++snap_id_it) {
+    librados::snap_t src_snap_id = *snap_id_it;
+
+    cls::rbd::SnapshotNamespace snap_namespace;
+    m_src_image_ctx->snap_lock.get_read();
+    int r = m_src_image_ctx->get_snap_namespace(src_snap_id, &snap_namespace);
+    m_src_image_ctx->snap_lock.put_read();
+    if (r < 0) {
+      // m_snap_name has not been set for this snapshot yet (it still holds
+      // whatever an earlier step stored), so report the id being examined
+      // together with the error code
+      lderr(m_cct) << "failed to retrieve source snap namespace: "
+                   << "snap_id=" << src_snap_id << ": " << cpp_strerror(r)
+                   << dendl;
+      finish(r);
+      return;
+    }
+
+    if (m_snap_seqs.find(src_snap_id) == m_snap_seqs.end()) {
+      // the source snapshot is not in our mapping table, ...
+      if (boost::get<cls::rbd::UserSnapshotNamespace>(&snap_namespace) !=
+            nullptr) {
+        // ... create it since it's a user snapshot
+        break;
+      } else if (src_snap_id == m_src_snap_id_end) {
+        // ... map it to destination HEAD since it's not a user snapshot that we
+        // will create (e.g. MirrorPrimarySnapshotNamespace)
+        m_snap_seqs[src_snap_id] = CEPH_NOSNAP;
+      }
+    }
+  }
+
+  if (snap_id_it == m_src_snap_ids.end()) {
+    // no source snapshots to create
+    m_prev_snap_id = CEPH_NOSNAP;
+    send_snap_protect();
+    return;
+  }
+
+  // remember where to resume once the asynchronous creation completes
+  m_prev_snap_id = *snap_id_it;
+  m_snap_name = get_snapshot_name(m_src_image_ctx, m_prev_snap_id);
+
+  m_src_image_ctx->snap_lock.get_read();
+  auto snap_info_it = m_src_image_ctx->snap_info.find(m_prev_snap_id);
+  if (snap_info_it == m_src_image_ctx->snap_info.end()) {
+    m_src_image_ctx->snap_lock.put_read();
+    lderr(m_cct) << "failed to retrieve source snap info: " << m_snap_name
+                 << dendl;
+    finish(-ENOENT);
+    return;
+  }
+
+  uint64_t size = snap_info_it->second.size;
+  m_snap_namespace = snap_info_it->second.snap_namespace;
+  cls::rbd::ParentImageSpec parent_spec;
+  uint64_t parent_overlap = 0;
+  if (!m_flatten && snap_info_it->second.parent.spec.pool_id != -1) {
+    // the destination's own parent spec is used together with the source
+    // snapshot's overlap
+    parent_spec = m_dst_parent_spec;
+    parent_overlap = snap_info_it->second.parent.overlap;
+  }
+  m_src_image_ctx->snap_lock.put_read();
+
+  ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", "
+                   << "snap_id=" << m_prev_snap_id << ", "
+                   << "size=" << size << ", "
+                   << "parent_info=["
+                   << "pool_id=" << parent_spec.pool_id << ", "
+                   << "image_id=" << parent_spec.image_id << ", "
+                   << "snap_id=" << parent_spec.snap_id << ", "
+                   << "overlap=" << parent_overlap << "]" << dendl;
+
+  int r;
+  Context *finish_op_ctx = start_lock_op(&r);
+  if (finish_op_ctx == nullptr) {
+    lderr(m_cct) << "lost exclusive lock" << dendl;
+    finish(r);
+    return;
+  }
+
+  // the handler runs first, then the lock op is released
+  auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+      handle_snap_create(r);
+      finish_op_ctx->complete(0);
+    });
+  SnapshotCreateRequest<I> *req = SnapshotCreateRequest<I>::create(
+    m_dst_image_ctx, m_snap_name, m_snap_namespace, size, parent_spec,
+    parent_overlap, ctx);
+  req->send();
+}
+// Completion of a snapshot creation: records the source -> destination snap id mapping in m_snap_seqs and continues the create pass.
+template <typename I>
+void SnapshotCopyRequest<I>::handle_snap_create(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to create snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ if (handle_cancellation()) {
+ return;
+ }
+
+ ceph_assert(m_prev_snap_id != CEPH_NOSNAP);
+ // NOTE(review): snap_ids is read here without taking snap_lock -- confirm this is safe in the calling context
+ auto snap_it = m_dst_image_ctx->snap_ids.find(
+ {cls::rbd::UserSnapshotNamespace(), m_snap_name});
+ ceph_assert(snap_it != m_dst_image_ctx->snap_ids.end()); // the snapshot just created must be visible by name
+ librados::snap_t dst_snap_id = snap_it->second;
+
+ ldout(m_cct, 20) << "mapping source snap id " << m_prev_snap_id << " to "
+ << dst_snap_id << dendl;
+ m_snap_seqs[m_prev_snap_id] = dst_snap_id;
+
+ send_snap_create();
+}
+// Protect pass: finds the next protected source snapshot whose mapped destination snapshot is not yet protected and protects it; moves to the set-head step when none is left.
+template <typename I>
+void SnapshotCopyRequest<I>::send_snap_protect() {
+ SnapIdSet::iterator snap_id_it = m_src_snap_ids.begin();
+ if (m_prev_snap_id != CEPH_NOSNAP) {
+ snap_id_it = m_src_snap_ids.upper_bound(m_prev_snap_id);
+ } else if (m_src_snap_id_start > 0) {
+ snap_id_it = m_src_snap_ids.upper_bound(m_src_snap_id_start);
+ }
+ // the iterator now points past the last snapshot handled (or past m_src_snap_id_start on the first pass)
+ for (; snap_id_it != m_src_snap_ids.end(); ++snap_id_it) {
+ librados::snap_t src_snap_id = *snap_id_it;
+
+ m_src_image_ctx->snap_lock.get_read();
+
+ bool src_protected;
+ int r = m_src_image_ctx->is_snap_protected(src_snap_id, &src_protected);
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve source snap protect status: "
+ << cpp_strerror(r) << dendl;
+ m_src_image_ctx->snap_lock.put_read();
+ finish(r);
+ return;
+ }
+ m_src_image_ctx->snap_lock.put_read();
+
+ if (!src_protected) {
+ // snap is not protected -- check next snap
+ continue;
+ }
+
+ // if destination snapshot is not protected, protect it
+ auto snap_seq_it = m_snap_seqs.find(src_snap_id);
+ ceph_assert(snap_seq_it != m_snap_seqs.end()); // every protected source snapshot is expected to be mapped by now
+ if (snap_seq_it->second == CEPH_NOSNAP) {
+ // implies src end snapshot is mapped to a non-copyable snapshot
+ ceph_assert(src_snap_id == m_src_snap_id_end);
+ break; // NOTE(review): leaving the loop here proceeds to the protect call below for this snapshot -- confirm that is intended
+ }
+
+ m_dst_image_ctx->snap_lock.get_read();
+ bool dst_protected;
+ r = m_dst_image_ctx->is_snap_protected(snap_seq_it->second, &dst_protected);
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve destination snap protect status: "
+ << cpp_strerror(r) << dendl;
+ m_dst_image_ctx->snap_lock.put_read();
+ finish(r);
+ return;
+ }
+ m_dst_image_ctx->snap_lock.put_read();
+
+ if (!dst_protected) {
+ break;
+ }
+ }
+
+ if (snap_id_it == m_src_snap_ids.end()) {
+ // no destination snapshots to protect
+ m_prev_snap_id = CEPH_NOSNAP;
+ send_set_head();
+ return;
+ }
+ // remember where to resume; the name is looked up on the source image
+ m_prev_snap_id = *snap_id_it;
+ m_snap_name = get_snapshot_name(m_src_image_ctx, m_prev_snap_id);
+
+ ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", "
+ << "snap_id=" << m_prev_snap_id << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+ // the handler runs first, then the lock op is released
+ auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+ handle_snap_protect(r);
+ finish_op_ctx->complete(0);
+ });
+ RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+ m_dst_image_ctx->operations->execute_snap_protect(
+ cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx);
+}
+// Completion of a snapshot protect: fails the request on error, honours cancellation, otherwise continues the protect pass.
+template <typename I>
+void SnapshotCopyRequest<I>::handle_snap_protect(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to protect snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ if (handle_cancellation()) { // finishes with -ECANCELED when cancel() was called
+ return;
+ }
+
+ send_snap_protect();
+}
+// Set-head step: when the copy targets the destination HEAD, issues a SetHeadRequest carrying the source size and (unless flattening) its parent spec and overlap.
+template <typename I>
+void SnapshotCopyRequest<I>::send_set_head() {
+ auto snap_seq_it = m_snap_seqs.find(m_src_snap_id_end);
+ if (m_src_snap_id_end != CEPH_NOSNAP &&
+ (snap_seq_it == m_snap_seqs.end() ||
+ snap_seq_it->second != CEPH_NOSNAP)) {
+ // not copying to src nor dst HEAD revision
+ finish(0);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ uint64_t size; // assigned on both branches below
+ cls::rbd::ParentImageSpec parent_spec;
+ uint64_t parent_overlap = 0;
+ {
+ RWLock::RLocker src_locker(m_src_image_ctx->snap_lock);
+ auto snap_info_it = m_src_image_ctx->snap_info.find(m_src_snap_id_end);
+ if (snap_info_it != m_src_image_ctx->snap_info.end()) { // end snapshot found: use its recorded size and parent
+ auto& snap_info = snap_info_it->second;
+ size = snap_info.size;
+ if (!m_flatten) {
+ parent_spec = snap_info.parent.spec;
+ parent_overlap = snap_info.parent.overlap;
+ }
+ } else { // no such snapshot: fall back to the image's current size and parent
+ size = m_src_image_ctx->size;
+ if (!m_flatten) {
+ parent_spec = m_src_image_ctx->parent_md.spec;
+ parent_overlap = m_src_image_ctx->parent_md.overlap;
+ }
+ }
+ }
+
+ auto ctx = create_context_callback<
+ SnapshotCopyRequest<I>, &SnapshotCopyRequest<I>::handle_set_head>(this);
+ auto req = SetHeadRequest<I>::create(m_dst_image_ctx, size, parent_spec,
+ parent_overlap, ctx);
+ req->send();
+}
+// Completion of the SetHeadRequest: fails the request on error, honours cancellation, otherwise moves to the object map resize step.
+template <typename I>
+void SnapshotCopyRequest<I>::handle_set_head(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to set head: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (handle_cancellation()) {
+ return;
+ }
+
+ send_resize_object_map();
+}
+// Resize step: if the destination has an object map whose size differs from the object count implied by the image size, resizes it; otherwise finishes.
+template <typename I>
+void SnapshotCopyRequest<I>::send_resize_object_map() {
+ int r = 0; // stays 0 unless start_lock_op() reports a failure
+
+ if (m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+ RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+ RWLock::RLocker snap_locker(m_dst_image_ctx->snap_lock);
+
+ if (m_dst_image_ctx->object_map != nullptr &&
+ Striper::get_num_objects(m_dst_image_ctx->layout,
+ m_dst_image_ctx->size) !=
+ m_dst_image_ctx->object_map->size()) {
+
+ ldout(m_cct, 20) << dendl;
+ // owner_lock is already held here, so the overload that expects a locked owner_lock is used
+ auto finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r);
+ if (finish_op_ctx != nullptr) {
+ auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+ handle_resize_object_map(r);
+ finish_op_ctx->complete(0);
+ });
+
+ m_dst_image_ctx->object_map->aio_resize(m_dst_image_ctx->size,
+ OBJECT_NONEXISTENT, ctx);
+ return;
+ }
+
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ }
+ }
+ // reached with both locks released: either nothing to resize (r == 0) or the lock op could not be started
+ finish(r);
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::handle_resize_object_map(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  // only failures are propagated; any non-negative result completes with 0
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(m_cct) << "failed to resize object map: " << cpp_strerror(r)
+                 << dendl;
+  }
+  finish(failed ? r : 0);
+}
+
+template <typename I>
+bool SnapshotCopyRequest<I>::handle_cancellation() {
+  // Reads the cancel flag under m_lock. Returns false when the request
+  // should keep going; otherwise finishes it with -ECANCELED (outside the
+  // lock) and returns true.
+  bool canceled;
+  {
+    Mutex::Locker locker(m_lock);
+    canceled = m_canceled;
+  }
+  if (!canceled) {
+    return false;
+  }
+
+  ldout(m_cct, 10) << "snapshot copy canceled" << dendl;
+  finish(-ECANCELED);
+  return true;
+}
+// Queues finish(r) on the work queue rather than calling it inline.
+template <typename I>
+void SnapshotCopyRequest<I>::error(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+ // the queued context's own argument (r1) is ignored; the captured r is what gets reported
+ m_work_queue->queue(new FunctionContext([this, r](int r1) { finish(r); }));
+}
+
+template <typename I>
+int SnapshotCopyRequest<I>::validate_parent(I *image_ctx,
+                                            cls::rbd::ParentImageSpec *spec) {
+  RWLock::RLocker owner_locker(image_ctx->owner_lock);
+  RWLock::RLocker snap_locker(image_ctx->snap_lock);
+
+  // Ensure the image's parent specs are still consistent: start from the
+  // HEAD parent spec and require every snapshot that has a parent to agree
+  // with the first parent spec seen. *spec receives that spec.
+  *spec = image_ctx->parent_md.spec;
+  for (auto &snap_entry : image_ctx->snap_info) {
+    auto &snap_parent_spec = snap_entry.second.parent.spec;
+
+    if (snap_parent_spec.pool_id == -1) {
+      // snapshot without a parent -- nothing to compare
+      continue;
+    }
+    if (spec->pool_id == -1) {
+      // first parent spec encountered becomes the reference
+      *spec = snap_parent_spec;
+      continue;
+    }
+    if (*spec != snap_parent_spec) {
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+// Convenience overload: takes the destination owner_lock for reading, then defers to the overload that expects it to be held.
+template <typename I>
+Context *SnapshotCopyRequest<I>::start_lock_op(int* r) {
+ RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+ return start_lock_op(m_dst_image_ctx->owner_lock, r);
+}
+
+template <typename I>
+Context *SnapshotCopyRequest<I>::start_lock_op(RWLock &owner_lock, int* r) {
+  // The caller must already hold the destination image's owner_lock.
+  // With an exclusive lock present its start_op() supplies the context (and
+  // may return nullptr, setting *r); otherwise a no-op context is returned
+  // and *r is left untouched.
+  ceph_assert(m_dst_image_ctx->owner_lock.is_locked());
+  if (m_dst_image_ctx->exclusive_lock != nullptr) {
+    return m_dst_image_ctx->exclusive_lock->start_op(r);
+  }
+  return new FunctionContext([](int r) {});
+}
+// Terminal step: on success publishes the snap id mapping to the caller, then completes the callback and drops this request's reference.
+template <typename I>
+void SnapshotCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r == 0) {
+ *m_snap_seqs_result = m_snap_seqs; // the caller's map is left untouched on failure
+ }
+
+ m_on_finish->complete(r);
+ put(); // releases a reference; the constructor initialised RefCountedObject with a count of 1
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::SnapshotCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/SnapshotCopyRequest.h b/src/librbd/deep_copy/SnapshotCopyRequest.h
new file mode 100644
index 00000000..12068c9a
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCopyRequest.h
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/RefCountedObj.h"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <map>
+#include <set>
+#include <string>
+#include <tuple>
+
+class Context;
+
+namespace librbd {
+namespace deep_copy {
+// Ref-counted deep-copy request; its steps are shown in the diagram below.
+template <typename ImageCtxT = librbd::ImageCtx>
+class SnapshotCopyRequest : public RefCountedObject {
+public:
+ static SnapshotCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten, ContextWQ *work_queue,
+ SnapSeqs *snap_seqs, Context *on_finish) {
+ return new SnapshotCopyRequest(src_image_ctx, dst_image_ctx,
+ src_snap_id_start, src_snap_id_end,
+ dst_snap_id_start, flatten, work_queue,
+ snap_seqs, on_finish);
+ }
+
+ SnapshotCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten, ContextWQ *work_queue, SnapSeqs *snap_seqs,
+ Context *on_finish);
+
+ void send();
+ void cancel();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | /-----------\
+ * | | |
+ * v v | (repeat as needed)
+ * UNPROTECT_SNAP ----/
+ * |
+ * | /-----------\
+ * | | |
+ * v v | (repeat as needed)
+ * REMOVE_SNAP -------/
+ * |
+ * | /-----------\
+ * | | |
+ * v v | (repeat as needed)
+ * CREATE_SNAP -------/
+ * |
+ * | /-----------\
+ * | | |
+ * v v | (repeat as needed)
+ * PROTECT_SNAP ------/
+ * |
+ * v
+ * SET_HEAD (skip if not needed)
+ * |
+ * v
+ * RESIZE_OBJECT_MAP (skip if not needed)
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ typedef std::set<librados::snap_t> SnapIdSet;
+
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ librados::snap_t m_src_snap_id_start;
+ librados::snap_t m_src_snap_id_end;
+ librados::snap_t m_dst_snap_id_start;
+ bool m_flatten;
+ ContextWQ *m_work_queue; // error() queues the finish() call here
+ SnapSeqs *m_snap_seqs_result; // receives m_snap_seqs when finish(0) runs
+ SnapSeqs m_snap_seqs;
+ Context *m_on_finish; // completed by finish()
+
+ CephContext *m_cct;
+ SnapIdSet m_src_snap_ids;
+ SnapIdSet m_dst_snap_ids;
+ librados::snap_t m_prev_snap_id = CEPH_NOSNAP;
+
+ std::string m_snap_name;
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+
+ cls::rbd::ParentImageSpec m_dst_parent_spec;
+
+ Mutex m_lock;
+ bool m_canceled = false;
+
+ void send_snap_unprotect();
+ void handle_snap_unprotect(int r);
+
+ void send_snap_remove();
+ void handle_snap_remove(int r);
+
+ void send_snap_create();
+ void handle_snap_create(int r);
+
+ void send_snap_protect();
+ void handle_snap_protect(int r);
+
+ void send_set_head();
+ void handle_set_head(int r);
+
+ void send_resize_object_map();
+ void handle_resize_object_map(int r);
+
+ bool handle_cancellation();
+
+ void error(int r);
+
+ int validate_parent(ImageCtxT *image_ctx, cls::rbd::ParentImageSpec *spec);
+
+ // the one-argument overload takes owner_lock itself before delegating
+ Context *start_lock_op(int* r);
+ Context *start_lock_op(RWLock &owner_lock, int* r);
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::SnapshotCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/SnapshotCreateRequest.cc b/src/librbd/deep_copy/SnapshotCreateRequest.cc
new file mode 100644
index 00000000..0ff9c832
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCreateRequest.cc
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SetHeadRequest.h"
+#include "SnapshotCreateRequest.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::SnapshotCreateRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+// Stores the request parameters; m_cct is taken from the destination image.
+template <typename I>
+SnapshotCreateRequest<I>::SnapshotCreateRequest(
+ I *dst_image_ctx, const std::string &snap_name,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ uint64_t size, const cls::rbd::ParentImageSpec &spec,
+ uint64_t parent_overlap, Context *on_finish)
+ : m_dst_image_ctx(dst_image_ctx), m_snap_name(snap_name),
+ m_snap_namespace(snap_namespace), m_size(size),
+ m_parent_spec(spec), m_parent_overlap(parent_overlap),
+ m_on_finish(on_finish), m_cct(dst_image_ctx->cct) {
+}
+// Entry point: starts the request with the SET_HEAD step.
+template <typename I>
+void SnapshotCreateRequest<I>::send() {
+ send_set_head();
+}
+// SET_HEAD step: issues a SetHeadRequest against the destination image.
+template <typename I>
+void SnapshotCreateRequest<I>::send_set_head() {
+ ldout(m_cct, 20) << dendl;
+ // the request's result is delivered to handle_set_head()
+ auto ctx = create_context_callback<
+ SnapshotCreateRequest<I>, &SnapshotCreateRequest<I>::handle_set_head>(this);
+ auto req = SetHeadRequest<I>::create(m_dst_image_ctx, m_size, m_parent_spec,
+ m_parent_overlap, ctx);
+ req->send();
+}
+// SET_HEAD completion: aborts on error, otherwise moves on to CREATE_SNAP.
+template <typename I>
+void SnapshotCreateRequest<I>::handle_set_head(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to set head: " << cpp_strerror(r) << dendl;
+ finish(r); // finish() deletes this request, so return immediately
+ return;
+ }
+
+ send_create_snap();
+}
+// CREATE_SNAP step: creates the snapshot on the destination image.
+template <typename I>
+void SnapshotCreateRequest<I>::send_create_snap() {
+ ldout(m_cct, 20) << "snap_name=" << m_snap_name << dendl;
+ // abort if an exclusive-lock op cannot be started
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+ // finish_op_ctx is completed only after the snap-create result is handled
+ auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+ handle_create_snap(r);
+ finish_op_ctx->complete(0);
+ });
+ RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+ m_dst_image_ctx->operations->execute_snap_create(m_snap_namespace,
+ m_snap_name.c_str(),
+ ctx,
+ 0U, true);
+}
+// CREATE_SNAP completion: aborts on error, else continues to CREATE_OBJECT_MAP.
+template <typename I>
+void SnapshotCreateRequest<I>::handle_create_snap(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to create snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r); // finish() deletes this request, so return immediately
+ return;
+ }
+
+ send_create_object_map();
+}
+template <typename I>
+void SnapshotCreateRequest<I>::send_create_object_map() {
+ // CREATE_OBJECT_MAP step: skipped when the object map feature is disabled
+ if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+ finish(0);
+ return;
+ }
+ // look up the snap id under snap_lock, releasing it before any finish() call
+ m_dst_image_ctx->snap_lock.get_read();
+ auto snap_it = m_dst_image_ctx->snap_ids.find(
+ {cls::rbd::UserSnapshotNamespace(), m_snap_name}); // NOTE(review): not m_snap_namespace -- confirm intended
+ if (snap_it == m_dst_image_ctx->snap_ids.end()) {
+ lderr(m_cct) << "failed to locate snap: " << m_snap_name << dendl;
+ m_dst_image_ctx->snap_lock.put_read();
+ finish(-ENOENT);
+ return;
+ }
+ librados::snap_t local_snap_id = snap_it->second;
+ m_dst_image_ctx->snap_lock.put_read();
+
+ std::string object_map_oid(librbd::ObjectMap<>::object_map_name(
+ m_dst_image_ctx->id, local_snap_id));
+ uint64_t object_count = Striper::get_num_objects(m_dst_image_ctx->layout,
+ m_size);
+ ldout(m_cct, 20) << "object_map_oid=" << object_map_oid << ", "
+ << "object_count=" << object_count << dendl;
+
+ // initialize an empty object map of the correct size (object sync
+ // will populate the object map)
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::object_map_resize(&op, object_count, OBJECT_NONEXISTENT);
+ // abort if an exclusive-lock op cannot be started
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+ // finish_op_ctx is completed only after the write result is handled
+ auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+ handle_create_object_map(r);
+ finish_op_ctx->complete(0);
+ });
+ librados::AioCompletion *comp = create_rados_callback(ctx);
+ r = m_dst_image_ctx->md_ctx.aio_operate(object_map_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+// Final step: reports the object map creation result to the caller.
+template <typename I>
+void SnapshotCreateRequest<I>::handle_create_object_map(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ // any non-negative result is reported as plain success
+ const int result = (r < 0 ? r : 0);
+ if (result < 0) {
+ lderr(m_cct) << "failed to create object map: " << cpp_strerror(result)
+ << dendl;
+ }
+
+ finish(result);
+}
+// Starts an exclusive-lock op while holding the destination owner_lock.
+template <typename I>
+Context *SnapshotCreateRequest<I>::start_lock_op(int* r) {
+ RWLock::RLocker owner_locker(m_dst_image_ctx->owner_lock);
+ if (m_dst_image_ctx->exclusive_lock == nullptr) {
+ return new FunctionContext([](int r) {}); // no exclusive lock: no-op context
+ }
+ return m_dst_image_ctx->exclusive_lock->start_op(r);
+}
+// Completes the caller's context with r and destroys this request.
+template <typename I>
+void SnapshotCreateRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this; // no member may be touched after this point
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::SnapshotCreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/SnapshotCreateRequest.h b/src/librbd/deep_copy/SnapshotCreateRequest.h
new file mode 100644
index 00000000..8c228063
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCreateRequest.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <map>
+#include <set>
+#include <string>
+#include <tuple>
+
+class Context;
+
+namespace librbd {
+namespace deep_copy {
+// Self-deleting request that runs the steps shown in the diagram below.
+template <typename ImageCtxT = librbd::ImageCtx>
+class SnapshotCreateRequest {
+public:
+ static SnapshotCreateRequest* create(ImageCtxT *dst_image_ctx,
+ const std::string &snap_name,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ uint64_t size,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ uint64_t parent_overlap,
+ Context *on_finish) {
+ return new SnapshotCreateRequest(dst_image_ctx, snap_name, snap_namespace, size,
+ parent_spec, parent_overlap, on_finish);
+ }
+
+ SnapshotCreateRequest(ImageCtxT *dst_image_ctx,
+ const std::string &snap_name,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ uint64_t size,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ uint64_t parent_overlap, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * SET_HEAD
+ * |
+ * v
+ * CREATE_SNAP
+ * |
+ * v (skip if not needed)
+ * CREATE_OBJECT_MAP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_dst_image_ctx;
+ std::string m_snap_name;
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ uint64_t m_size;
+ cls::rbd::ParentImageSpec m_parent_spec;
+ uint64_t m_parent_overlap;
+ Context *m_on_finish; // completed by finish()
+
+ CephContext *m_cct;
+
+ void send_set_head();
+ void handle_set_head(int r);
+
+ void send_create_snap();
+ void handle_create_snap(int r);
+
+ void send_create_object_map();
+ void handle_create_object_map(int r);
+
+ Context *start_lock_op(int* r);
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::SnapshotCreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H
diff --git a/src/librbd/deep_copy/Types.h b/src/librbd/deep_copy/Types.h
new file mode 100644
index 00000000..10d3c7c1
--- /dev/null
+++ b/src/librbd/deep_copy/Types.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_TYPES_H
+#define CEPH_LIBRBD_DEEP_COPY_TYPES_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include <boost/optional.hpp>
+
+namespace librbd {
+namespace deep_copy {
+// snapshot id containers and an optional object number used by deep copy
+using SnapIds = std::vector<librados::snap_t>;
+using SnapMap = std::map<librados::snap_t, SnapIds>;
+
+using ObjectNumber = boost::optional<uint64_t>;
+
+} // namespace deep_copy
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_DEEP_COPY_TYPES_H
diff --git a/src/librbd/deep_copy/Utils.cc b/src/librbd/deep_copy/Utils.cc
new file mode 100644
index 00000000..c2dd2502
--- /dev/null
+++ b/src/librbd/deep_copy/Utils.cc
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "Utils.h"
+#include <set>
+
+namespace librbd {
+namespace deep_copy {
+namespace util {
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::util::" << __func__ << ": "
+// Fills snap_map (src snap id -> list of dst snap ids) from snap_seqs.
+void compute_snap_map(CephContext* cct,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ const SnapIds& dst_snap_ids,
+ const SnapSeqs &snap_seqs,
+ SnapMap *snap_map) {
+ std::set<librados::snap_t> ordered_dst_snap_ids{
+ dst_snap_ids.begin(), dst_snap_ids.end()};
+ auto dst_snap_id_it = ordered_dst_snap_ids.begin();
+ // running list; each newly visited id is inserted at the front
+ SnapIds snap_ids;
+ for (auto &it : snap_seqs) {
+ // ensure all dst snap ids are included in the mapping table since
+ // deep copy will skip non-user snapshots
+ while (dst_snap_id_it != ordered_dst_snap_ids.end()) {
+ if (*dst_snap_id_it < it.second) {
+ snap_ids.insert(snap_ids.begin(), *dst_snap_id_it);
+ } else if (*dst_snap_id_it > it.second) {
+ break;
+ }
+ ++dst_snap_id_it;
+ }
+
+ // we should only have the HEAD revision in the last snap seq
+ ceph_assert(snap_ids.empty() || snap_ids[0] != CEPH_NOSNAP);
+ snap_ids.insert(snap_ids.begin(), it.second);
+ // only src snap ids within [src_snap_id_start, src_snap_id_end] are recorded
+ if (it.first < src_snap_id_start) {
+ continue;
+ } else if (it.first > src_snap_id_end) {
+ break;
+ }
+
+ (*snap_map)[it.first] = snap_ids;
+ }
+
+ ldout(cct, 10) << "src_snap_id_start=" << src_snap_id_start << ", "
+ << "src_snap_id_end=" << src_snap_id_end << ", "
+ << "dst_snap_ids=" << dst_snap_ids << ", "
+ << "snap_seqs=" << snap_seqs << ", "
+ << "snap_map=" << *snap_map << dendl;
+}
+
+} // namespace util
+} // namespace deep_copy
+} // namespace librbd
diff --git a/src/librbd/deep_copy/Utils.h b/src/librbd/deep_copy/Utils.h
new file mode 100644
index 00000000..0681548d
--- /dev/null
+++ b/src/librbd/deep_copy/Utils.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_UTILS_H
+#define CEPH_LIBRBD_DEEP_COPY_UTILS_H
+
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+#include "librbd/deep_copy/Types.h"
+
+#include <boost/optional.hpp>
+
+struct CephContext;
+
+namespace librbd {
+namespace deep_copy {
+namespace util {
+// Fills snap_map (src snap id -> dst snap ids) for the given src snap range.
+void compute_snap_map(CephContext* cct,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ const SnapIds& dst_snap_ids,
+ const SnapSeqs &snap_seqs,
+ SnapMap *snap_map);
+
+} // namespace util
+} // namespace deep_copy
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_DEEP_COPY_UTILS_H
diff --git a/src/librbd/exclusive_lock/AutomaticPolicy.cc b/src/librbd/exclusive_lock/AutomaticPolicy.cc
new file mode 100644
index 00000000..4d5f48b1
--- /dev/null
+++ b/src/librbd/exclusive_lock/AutomaticPolicy.cc
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/AutomaticPolicy.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ExclusiveLock.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ExclusiveLock::AutomaticPolicy "
+
+namespace librbd {
+namespace exclusive_lock {
+// Releases the exclusive lock whenever it is requested; always returns 0.
+int AutomaticPolicy::lock_requested(bool force) {
+ ceph_assert(m_image_ctx->owner_lock.is_locked());
+ ceph_assert(m_image_ctx->exclusive_lock != nullptr);
+ // 'force' is only logged; it does not change the behavior below
+ ldout(m_image_ctx->cct, 20) << this << " " << __func__ << ": force=" << force
+ << dendl;
+
+ // release the lock upon request (ignore forced requests)
+ m_image_ctx->exclusive_lock->release_lock(nullptr);
+ return 0;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
diff --git a/src/librbd/exclusive_lock/AutomaticPolicy.h b/src/librbd/exclusive_lock/AutomaticPolicy.h
new file mode 100644
index 00000000..12ba9b6c
--- /dev/null
+++ b/src/librbd/exclusive_lock/AutomaticPolicy.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H
+
+#include "librbd/exclusive_lock/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace exclusive_lock {
+// Policy that permits automatic lock requests (may_auto_request_lock is true).
+class AutomaticPolicy : public Policy {
+public:
+ AutomaticPolicy(ImageCtx *image_ctx) : m_image_ctx(image_ctx) {
+ }
+
+ bool may_auto_request_lock() override {
+ return true;
+ }
+
+ int lock_requested(bool force) override; // defined in AutomaticPolicy.cc
+
+private:
+ ImageCtx *m_image_ctx;
+
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H
diff --git a/src/librbd/exclusive_lock/Policy.h b/src/librbd/exclusive_lock/Policy.h
new file mode 100644
index 00000000..8dcbf444
--- /dev/null
+++ b/src/librbd/exclusive_lock/Policy.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
+
+namespace librbd {
+namespace exclusive_lock {
+// request categories passed to Policy::accept_blocked_request()
+enum OperationRequestType {
+ OPERATION_REQUEST_TYPE_GENERAL = 0,
+ OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE = 1,
+};
+// Abstract exclusive-lock policy interface implemented by concrete policies.
+struct Policy {
+ virtual ~Policy() {
+ }
+
+ virtual bool may_auto_request_lock() = 0;
+ virtual int lock_requested(bool force) = 0;
+ // default: no blocked request type is accepted
+ virtual bool accept_blocked_request(OperationRequestType) {
+ return false;
+ }
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
diff --git a/src/librbd/exclusive_lock/PostAcquireRequest.cc b/src/librbd/exclusive_lock/PostAcquireRequest.cc
new file mode 100644
index 00000000..7e67e41c
--- /dev/null
+++ b/src/librbd/exclusive_lock/PostAcquireRequest.cc
@@ -0,0 +1,308 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/PostAcquireRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/journal/Policy.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::PostAcquireRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+// Factory: heap-allocates a request (the constructor is private).
+template <typename I>
+PostAcquireRequest<I>* PostAcquireRequest<I>::create(I &image_ctx,
+ Context *on_acquire,
+ Context *on_finish) {
+ return new PostAcquireRequest(image_ctx, on_acquire, on_finish);
+}
+// Wraps on_finish in an async context callback; other state starts empty.
+template <typename I>
+PostAcquireRequest<I>::PostAcquireRequest(I &image_ctx, Context *on_acquire,
+ Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_on_acquire(on_acquire),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_object_map(nullptr), m_journal(nullptr), m_error_result(0) {
+}
+// Signals prepare-lock completion if apply() never did; frees m_on_acquire.
+template <typename I>
+PostAcquireRequest<I>::~PostAcquireRequest() {
+ if (!m_prepare_lock_completed) {
+ m_image_ctx.state->handle_prepare_lock_complete();
+ }
+ delete m_on_acquire; // nullptr if send_open_journal() already completed it
+}
+// Entry point: starts the request with the REFRESH step.
+template <typename I>
+void PostAcquireRequest<I>::send() {
+ send_refresh();
+}
+// REFRESH step: skipped when the image state reports no refresh is required.
+template <typename I>
+void PostAcquireRequest<I>::send_refresh() {
+ if (!m_image_ctx.state->is_refresh_required()) {
+ send_open_object_map();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+ // handle_refresh() is wrapped in an async context callback
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_async_context_callback(
+ m_image_ctx, create_context_callback<klass, &klass::handle_refresh>(this));
+
+ // ImageState is blocked waiting for lock to complete -- safe to directly
+ // refresh
+ image::RefreshRequest<I> *req = image::RefreshRequest<I>::create(
+ m_image_ctx, true, false, ctx);
+ req->send();
+}
+// REFRESH completion: -ERESTART is tolerated; other errors abort the request.
+template <typename I>
+void PostAcquireRequest<I>::handle_refresh(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r == -ERESTART) {
+ // next issued IO or op will (re)-refresh the image and shut down lock
+ ldout(cct, 5) << "exclusive lock dynamically disabled" << dendl;
+ r = 0;
+ } else if (r < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl;
+ save_result(r); // must precede revert(), which asserts an error is recorded
+ revert();
+ finish();
+ return;
+ }
+
+ send_open_object_map();
+}
+// OPEN_JOURNAL step: completes m_on_acquire, then opens the journal if enabled.
+template <typename I>
+void PostAcquireRequest<I>::send_open_journal() {
+ // alert caller that we now own the exclusive lock
+ m_on_acquire->complete(0);
+ m_on_acquire = nullptr;
+
+ bool journal_enabled;
+ {
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ journal_enabled = (m_image_ctx.test_features(RBD_FEATURE_JOURNALING,
+ m_image_ctx.snap_lock) &&
+ !m_image_ctx.get_journal_policy()->journal_disabled());
+ }
+ if (!journal_enabled) {
+ apply(); // publishes the object map (if any) with a null journal
+ finish();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_open_journal>(
+ this);
+ m_journal = m_image_ctx.create_journal();
+
+ // journal playback requires object map (if enabled) and itself
+ apply();
+
+ m_journal->open(ctx);
+}
+// OPEN_JOURNAL completion: on failure close the journal, else allocate a tag.
+template <typename I>
+void PostAcquireRequest<I>::handle_open_journal(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl;
+ send_close_journal();
+ return;
+ }
+
+ send_allocate_journal_tag();
+}
+// ALLOCATE_JOURNAL_TAG step: asks the journal policy to allocate a tag.
+template <typename I>
+void PostAcquireRequest<I>::send_allocate_journal_tag() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+ // snap_lock is read-held while the journal policy is accessed
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_allocate_journal_tag>(this);
+ m_image_ctx.get_journal_policy()->allocate_tag_on_lock(ctx);
+}
+// ALLOCATE_JOURNAL_TAG completion: on failure close the journal, else finish.
+template <typename I>
+void PostAcquireRequest<I>::handle_allocate_journal_tag(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to allocate journal tag: " << cpp_strerror(r)
+ << dendl;
+ send_close_journal();
+ return;
+ }
+
+ finish();
+}
+// CLOSE_JOURNAL step (error path): closes the journal created by this request.
+template <typename I>
+void PostAcquireRequest<I>::send_close_journal() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_close_journal>(
+ this);
+ m_journal->close(ctx);
+}
+// CLOSE_JOURNAL completion: a close failure is logged; cleanup continues.
+template <typename I>
+void PostAcquireRequest<I>::handle_close_journal(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ save_result(r); // save_result() keeps only the first recorded error
+ if (r < 0) {
+ lderr(cct) << "failed to close journal: " << cpp_strerror(r) << dendl;
+ }
+
+ send_close_object_map();
+}
+// OPEN_OBJECT_MAP step: skipped when the object map feature is disabled.
+template <typename I>
+void PostAcquireRequest<I>::send_open_object_map() {
+ if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+ send_open_journal();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_open_object_map>(
+ this);
+ // object map for the HEAD revision (CEPH_NOSNAP)
+ m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP);
+ m_object_map->open(ctx);
+}
+// OPEN_OBJECT_MAP completion: -EFBIG continues without a map; others abort.
+template <typename I>
+void PostAcquireRequest<I>::handle_open_object_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to open object map: " << cpp_strerror(r) << dendl;
+ delete m_object_map;
+ m_object_map = nullptr;
+ // -EFBIG is not recorded as an error; fall through to OPEN_JOURNAL
+ if (r != -EFBIG) {
+ save_result(r);
+ revert();
+ finish();
+ return;
+ }
+ }
+
+ send_open_journal();
+}
+// CLOSE_OBJECT_MAP step (error path): with no object map, revert and finish.
+template <typename I>
+void PostAcquireRequest<I>::send_close_object_map() {
+ if (m_object_map == nullptr) {
+ revert();
+ finish();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_close_object_map>(this);
+ m_object_map->close(ctx);
+}
+// CLOSE_OBJECT_MAP completion: a close failure is only logged, not recorded.
+template <typename I>
+void PostAcquireRequest<I>::handle_close_object_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to close object map: " << cpp_strerror(r) << dendl;
+ }
+
+ revert();
+ finish();
+}
+// Installs m_object_map and m_journal on the image, then signals lock prepared.
+template <typename I>
+void PostAcquireRequest<I>::apply() {
+ {
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ ceph_assert(m_image_ctx.object_map == nullptr);
+ m_image_ctx.object_map = m_object_map;
+
+ ceph_assert(m_image_ctx.journal == nullptr);
+ m_image_ctx.journal = m_journal;
+ }
+ // the flag stops the destructor from signalling completion a second time
+ m_prepare_lock_completed = true;
+ m_image_ctx.state->handle_prepare_lock_complete();
+}
+// Error path: clears the image's object map/journal and deletes this request's.
+template <typename I>
+void PostAcquireRequest<I>::revert() {
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ m_image_ctx.object_map = nullptr;
+ m_image_ctx.journal = nullptr;
+
+ delete m_object_map;
+ delete m_journal;
+ // callers must have recorded an error via save_result() beforehand
+ ceph_assert(m_error_result < 0);
+}
+// Completes m_on_finish with the recorded result and destroys this request.
+template <typename I>
+void PostAcquireRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::PostAcquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/PostAcquireRequest.h b/src/librbd/exclusive_lock/PostAcquireRequest.h
new file mode 100644
index 00000000..06fdce39
--- /dev/null
+++ b/src/librbd/exclusive_lock/PostAcquireRequest.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "librbd/ImageCtx.h"
+#include "msg/msg_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+namespace exclusive_lock {
+// Self-deleting request that runs the steps shown in the diagram below.
+template <typename ImageCtxT = ImageCtx>
+class PostAcquireRequest {
+public:
+ static PostAcquireRequest* create(ImageCtxT &image_ctx, Context *on_acquire,
+ Context *on_finish);
+
+ ~PostAcquireRequest();
+ void send();
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |
+ * v
+ * REFRESH (skip if not
+ * | needed)
+ * v
+ * OPEN_OBJECT_MAP (skip if
+ * | disabled)
+ * v
+ * OPEN_JOURNAL (skip if
+ * | * disabled)
+ * | *
+ * | * * * * * * * *
+ * v *
+ * ALLOCATE_JOURNAL_TAG *
+ * | * *
+ * | * *
+ * | v v
+ * | CLOSE_JOURNAL
+ * | |
+ * | v
+ * | CLOSE_OBJECT_MAP
+ * | |
+ * v |
+ * <finish> <----------/
+ *
+ * @endverbatim
+ */
+
+ PostAcquireRequest(ImageCtxT &image_ctx, Context *on_acquire,
+ Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_acquire; // completed in send_open_journal(), else freed in dtor
+ Context *m_on_finish;
+
+ decltype(m_image_ctx.object_map) m_object_map;
+ decltype(m_image_ctx.journal) m_journal;
+
+ bool m_prepare_lock_completed = false; // set by apply()
+ int m_error_result; // first negative result passed to save_result(), else 0
+
+ void send_refresh();
+ void handle_refresh(int r);
+
+ void send_open_journal();
+ void handle_open_journal(int r);
+
+ void send_allocate_journal_tag();
+ void handle_allocate_journal_tag(int r);
+
+ void send_open_object_map();
+ void handle_open_object_map(int r);
+
+ void send_close_journal();
+ void handle_close_journal(int r);
+
+ void send_close_object_map();
+ void handle_close_object_map(int r);
+
+ void apply();
+ void revert();
+
+ void finish();
+ // records only the first negative result; later errors are ignored
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::PostAcquireRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H
diff --git a/src/librbd/exclusive_lock/PreAcquireRequest.cc b/src/librbd/exclusive_lock/PreAcquireRequest.cc
new file mode 100644
index 00000000..ba3da1a2
--- /dev/null
+++ b/src/librbd/exclusive_lock/PreAcquireRequest.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/PreAcquireRequest.h"
+#include "librbd/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ImageState.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::PreAcquireRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+// Factory: heap-allocates a request (the constructor is private).
+template <typename I>
+PreAcquireRequest<I>* PreAcquireRequest<I>::create(I &image_ctx,
+ Context *on_finish) {
+ return new PreAcquireRequest(image_ctx, on_finish);
+}
+// Wraps on_finish in an async context callback; the result starts at 0.
+template <typename I>
+PreAcquireRequest<I>::PreAcquireRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_error_result(0) {
+}
+// Nothing to release: the destructor body is empty.
+template <typename I>
+PreAcquireRequest<I>::~PreAcquireRequest() {
+}
+// Entry point: starts the request with the PREPARE_LOCK step.
+template <typename I>
+void PreAcquireRequest<I>::send() {
+ send_prepare_lock();
+}
+// PREPARE_LOCK step: asks the image state to prepare for lock acquisition.
+template <typename I>
+void PreAcquireRequest<I>::send_prepare_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ // acquire the lock if the image is not busy performing other actions
+ Context *ctx = create_context_callback<
+ PreAcquireRequest<I>, &PreAcquireRequest<I>::handle_prepare_lock>(this);
+ m_image_ctx.state->prepare_lock(ctx);
+}
+// PREPARE_LOCK completion: r is only logged; always continues to the flush.
+template <typename I>
+void PreAcquireRequest<I>::handle_prepare_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ send_flush_notifies();
+}
+// FLUSH_NOTIFIES step: flushes the image watcher before the lock is acquired.
+template <typename I>
+void PreAcquireRequest<I>::send_flush_notifies() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PreAcquireRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_flush_notifies>(
+ this);
+ m_image_ctx.image_watcher->flush(ctx);
+}
+// FLUSH_NOTIFIES completion: the flush is asserted to have succeeded.
+template <typename I>
+void PreAcquireRequest<I>::handle_flush_notifies(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+ // log the result first so a failed assertion below can be diagnosed
+ ceph_assert(r == 0);
+ finish();
+}
+// Completes m_on_finish with m_error_result and destroys this request.
+template <typename I>
+void PreAcquireRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::PreAcquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/PreAcquireRequest.h b/src/librbd/exclusive_lock/PreAcquireRequest.h
new file mode 100644
index 00000000..15d4b2c1
--- /dev/null
+++ b/src/librbd/exclusive_lock/PreAcquireRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "librbd/ImageCtx.h"
+#include "msg/msg_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+namespace exclusive_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class PreAcquireRequest {
+public:
+ static PreAcquireRequest* create(ImageCtxT &image_ctx, Context *on_finish);
+ // send() starts the state machine drawn below; the destructor does nothing
+ ~PreAcquireRequest();
+ void send();
+
+private:
+ // each state X is a send_X()/handle_X(int r) pair; finish() ends the request
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * PREPARE_LOCK
+ * |
+ * v
+ * FLUSH_NOTIFIES
+ * |
+ * |
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ PreAcquireRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+ // result that finish() passes to m_on_finish; the constructor sets it to 0
+ int m_error_result;
+
+ void send_prepare_lock();
+ void handle_prepare_lock(int r);
+
+ void send_flush_notifies();
+ void handle_flush_notifies(int r);
+
+ void finish();
+ // keeps only the first negative result; later calls cannot overwrite it
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::PreAcquireRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H
diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.cc b/src/librbd/exclusive_lock/PreReleaseRequest.cc
new file mode 100644
index 00000000..7dbae6c5
--- /dev/null
+++ b/src/librbd/exclusive_lock/PreReleaseRequest.cc
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/PreReleaseRequest.h"
+#include "common/AsyncOpTracker.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectDispatcher.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::PreReleaseRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+// Factory for a self-owning request; finish() ends with `delete this`.
+template <typename I> PreReleaseRequest<I>* PreReleaseRequest<I>::create(
+    I &image_ctx, bool shutting_down, AsyncOpTracker &async_op_tracker,
+    Context *on_finish) {
+  return new PreReleaseRequest<I>(
+    image_ctx, shutting_down, async_op_tracker, on_finish);
+}
+
+template <typename I>
+PreReleaseRequest<I>::PreReleaseRequest(
+    I &image_ctx, bool shutting_down, AsyncOpTracker &async_op_tracker,
+    Context *on_finish)
+  // m_on_finish holds the result of create_async_context_callback(), not the
+  // raw on_finish
+  : m_image_ctx(image_ctx), m_shutting_down(shutting_down),
+    m_async_op_tracker(async_op_tracker), m_on_finish(
+      create_async_context_callback(image_ctx, on_finish)) {}
+
+template <typename I>
+PreReleaseRequest<I>::~PreReleaseRequest() {
+  // mirrors send_prepare_lock(): prepare_lock() is skipped when shutting down
+  if (m_shutting_down) { return; }
+  m_image_ctx.state->handle_prepare_lock_complete();
+}
+
+// Entry point: start the state machine at its first step (PREPARE_LOCK).
+template <typename I> void PreReleaseRequest<I>::send() {
+  this->send_prepare_lock();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_prepare_lock() {
+  // PREPARE_LOCK is bypassed entirely (no log line either) on shut down
+  if (m_shutting_down) {
+    send_cancel_op_requests();
+    return;
+  }
+
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+  // only release the lock once the image is not busy with other actions
+  using klass = PreReleaseRequest<I>;
+  m_image_ctx.state->prepare_lock(
+    create_context_callback<klass, &klass::handle_prepare_lock>(this));
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_prepare_lock(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+  // r is only logged; the request always advances to CANCEL_OP_REQUESTS
+  this->send_cancel_op_requests();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_cancel_op_requests() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+  // CANCEL_OP_REQUESTS: completion arrives in handle_cancel_op_requests()
+  Context *on_cancelled = create_context_callback<
+    PreReleaseRequest<I>,
+    &PreReleaseRequest<I>::handle_cancel_op_requests>(this);
+  m_image_ctx.cancel_async_requests(on_cancelled);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_cancel_op_requests(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  // cancellation must succeed: any non-zero result trips the assertion
+  ceph_assert(r == 0);
+  this->send_block_writes();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_block_writes() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  // BLOCK_WRITES: completion is delivered to handle_block_writes()
+  Context *on_blocked = create_context_callback<
+    PreReleaseRequest<I>, &PreReleaseRequest<I>::handle_block_writes>(this);
+
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    // setting the lock as required will automatically cause the IO
+    // queue to re-request the lock if any IO is queued
+    const bool both_directions =
+      (m_image_ctx.clone_copy_on_read ||
+       m_image_ctx.test_features(RBD_FEATURE_JOURNALING));
+    m_image_ctx.io_work_queue->set_require_lock(
+      both_directions ? io::DIRECTION_BOTH : io::DIRECTION_WRITE, true);
+
+    m_image_ctx.io_work_queue->block_writes(on_blocked);
+  }
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_block_writes(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+  if (r < 0 && r != -EBLACKLISTED) {
+    // undo the write block and end the request with this error recorded
+    lderr(cct) << "failed to block writes: " << cpp_strerror(r) << dendl;
+    m_image_ctx.io_work_queue->unblock_writes();
+    save_result(r);
+    finish();
+    return;
+  }
+  if (r == -EBLACKLISTED) {
+    // allow clean shut down if blacklisted
+    lderr(cct) << "failed to block writes because client is blacklisted"
+               << dendl;
+  }
+  send_wait_for_ops();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_wait_for_ops() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+  // WAIT_FOR_OPS: the op tracker calls back into handle_wait_for_ops()
+  using klass = PreReleaseRequest<I>;
+  m_async_op_tracker.wait_for_ops(
+    create_context_callback<klass, &klass::handle_wait_for_ops>(this));
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_wait_for_ops(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  // r is not acted upon here, so at least record it like the other handlers
+  ldout(cct, 10) << "r=" << r << dendl;
+  send_invalidate_cache();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_invalidate_cache() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+  // INVALIDATE_CACHE: issued while owner_lock is held through an RLocker
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  using klass = PreReleaseRequest<I>;
+  Context *on_invalidated =
+    create_context_callback<klass, &klass::handle_invalidate_cache>(this);
+  m_image_ctx.io_object_dispatcher->invalidate_cache(on_invalidated);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_invalidate_cache(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  // -EBLACKLISTED and -EBUSY do not stop the release; other errors do
+  const bool tolerated = (r >= 0 || r == -EBLACKLISTED || r == -EBUSY);
+  if (!tolerated) {
+    lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r) << dendl;
+    m_image_ctx.io_work_queue->unblock_writes();
+    save_result(r);
+    finish();
+    return;
+  }
+  send_flush_notifies();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_flush_notifies() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+  // FLUSH_NOTIFIES: flush the image watcher; its completion re-enters this
+  // request through handle_flush_notifies()
+  Context *on_flush = create_context_callback<
+    PreReleaseRequest<I>, &PreReleaseRequest<I>::handle_flush_notifies>(this);
+  m_image_ctx.image_watcher->flush(on_flush);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_flush_notifies(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  // log r like the other handlers so a failure shows up before the assert
+  ldout(cct, 10) << "r=" << r << dendl;
+  ceph_assert(r == 0);
+  send_close_journal();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_close_journal() {
+  // swap the image's journal pointer with m_journal (nullptr by default)
+  // while snap_lock is held through a WLocker
+  {
+    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+    std::swap(m_journal, m_image_ctx.journal);
+  }
+  if (m_journal == nullptr) {
+    // no journal was attached: CLOSE_JOURNAL is skipped
+    send_close_object_map();
+    return;
+  }
+
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+  Context *on_closed = create_context_callback<
+    PreReleaseRequest<I>, &PreReleaseRequest<I>::handle_close_journal>(this);
+  m_journal->close(on_closed);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_close_journal(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  // error implies some journal events were not flushed -- continue
+  if (r < 0) {
+    lderr(cct) << "failed to close journal: " << cpp_strerror(r) << dendl;
+  }
+  // the journal was taken over in send_close_journal(); whether or not the
+  // close succeeded, it is destroyed here
+  delete m_journal;
+  send_close_object_map();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_close_object_map() {
+  // swap the image's object map pointer with m_object_map (nullptr by
+  // default) while snap_lock is held through a WLocker
+  {
+    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+    std::swap(m_object_map, m_image_ctx.object_map);
+  }
+  if (m_object_map == nullptr) {
+    // no object map was attached: CLOSE_OBJECT_MAP is skipped
+    send_unlock();
+    return;
+  }
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+  using klass = PreReleaseRequest<I>;
+  Context *on_closed =
+    create_context_callback<klass, &klass::handle_close_object_map>(this);
+  m_object_map->close(on_closed);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_close_object_map(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  // a failed close is reported but does not stop the release
+  if (r < 0) {
+    lderr(cct) << "failed to close object map: " << cpp_strerror(r) << dendl;
+  }
+  delete m_object_map;
+  this->send_unlock();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_unlock() {
+  // despite the name nothing is unlocked here: log, then finish the request
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+  this->finish();
+}
+
+// Completes the caller's context with m_error_result, then self-destructs.
+template <typename I> void PreReleaseRequest<I>::finish() {
+  m_on_finish->complete(m_error_result);
+  delete this;  // must stay last: no member may be touched afterwards
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::PreReleaseRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.h b/src/librbd/exclusive_lock/PreReleaseRequest.h
new file mode 100644
index 00000000..e5b85a88
--- /dev/null
+++ b/src/librbd/exclusive_lock/PreReleaseRequest.h
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H
+
+#include "librbd/ImageCtx.h"
+#include <string>
+
+class AsyncOpTracker;
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace exclusive_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class PreReleaseRequest {
+public:
+ static PreReleaseRequest* create(ImageCtxT &image_ctx, bool shutting_down,
+ AsyncOpTracker &async_op_tracker,
+ Context *on_finish);
+ // send() starts the state machine drawn below
+ ~PreReleaseRequest();
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * PREPARE_LOCK
+ * |
+ * v
+ * CANCEL_OP_REQUESTS
+ * |
+ * v
+ * BLOCK_WRITES
+ * |
+ * v
+ * WAIT_FOR_OPS
+ * |
+ * v
+ * INVALIDATE_CACHE
+ * |
+ * v
+ * FLUSH_NOTIFIES . . . . . . . . . . . . . .
+ * | .
+ * v .
+ * CLOSE_JOURNAL .
+ * | (journal disabled, .
+ * v object map enabled) .
+ * CLOSE_OBJECT_MAP < . . . . . . . . . . . .
+ * | .
+ * v (object map disabled) .
+ * <finish> < . . . . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+ // private: instances come from create() and delete themselves in finish()
+ PreReleaseRequest(ImageCtxT &image_ctx, bool shutting_down,
+ AsyncOpTracker &async_op_tracker, Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ bool m_shutting_down; // when true, PREPARE_LOCK is skipped
+ AsyncOpTracker &m_async_op_tracker;
+ Context *m_on_finish;
+ // first negative result given to save_result(); finish() reports it
+ int m_error_result = 0;
+ // swapped with the image's pointers by send_close_*(), deleted by handle_close_*()
+ decltype(m_image_ctx.object_map) m_object_map = nullptr;
+ decltype(m_image_ctx.journal) m_journal = nullptr;
+
+ void send_prepare_lock();
+ void handle_prepare_lock(int r);
+
+ void send_cancel_op_requests();
+ void handle_cancel_op_requests(int r);
+
+ void send_block_writes();
+ void handle_block_writes(int r);
+
+ void send_wait_for_ops();
+ void handle_wait_for_ops(int r);
+
+ void send_invalidate_cache();
+ void handle_invalidate_cache(int r);
+
+ void send_flush_notifies();
+ void handle_flush_notifies(int r);
+
+ void send_close_journal();
+ void handle_close_journal(int r);
+
+ void send_close_object_map();
+ void handle_close_object_map(int r);
+
+ void send_unlock();
+
+ void finish();
+ // keeps only the first negative result; later calls cannot overwrite it
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H
diff --git a/src/librbd/exclusive_lock/StandardPolicy.cc b/src/librbd/exclusive_lock/StandardPolicy.cc
new file mode 100644
index 00000000..6bdb313b
--- /dev/null
+++ b/src/librbd/exclusive_lock/StandardPolicy.cc
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/StandardPolicy.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ExclusiveLock.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ExclusiveLock::StandardPolicy "
+
+namespace librbd {
+namespace exclusive_lock {
+
+int StandardPolicy::lock_requested(bool force) {
+  // preconditions: owner_lock is held and an exclusive lock object exists
+  ceph_assert(m_image_ctx->owner_lock.is_locked());
+  ceph_assert(m_image_ctx->exclusive_lock != nullptr);
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << this << " " << __func__ << ": force=" << force << dendl;
+  // the answer is always -EROFS, whatever `force` says
+  return -EROFS;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
diff --git a/src/librbd/exclusive_lock/StandardPolicy.h b/src/librbd/exclusive_lock/StandardPolicy.h
new file mode 100644
index 00000000..c756db4f
--- /dev/null
+++ b/src/librbd/exclusive_lock/StandardPolicy.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
+
+#include "librbd/exclusive_lock/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace exclusive_lock {
+
+class StandardPolicy : public Policy {
+public:
+  // explicit: an ImageCtx* must never convert to a policy implicitly
+  explicit StandardPolicy(ImageCtx *image_ctx) : m_image_ctx(image_ctx) {}
+
+  // always false for this policy
+  bool may_auto_request_lock() override { return false; }
+  // always answers -EROFS (see StandardPolicy.cc); asserts owner_lock is held
+  int lock_requested(bool force) override;
+
+private:
+  // stored as given; this class declares no destructor that would free it
+  ImageCtx *m_image_ctx;
+
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
diff --git a/src/librbd/image/AttachChildRequest.cc b/src/librbd/image/AttachChildRequest.cc
new file mode 100644
index 00000000..f3631bf3
--- /dev/null
+++ b/src/librbd/image/AttachChildRequest.cc
@@ -0,0 +1,261 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/AttachChildRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RefreshRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::AttachChildRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+AttachChildRequest<I>::AttachChildRequest(
+    I *image_ctx, I *parent_image_ctx, const librados::snap_t &parent_snap_id,
+    I *old_parent_image_ctx, const librados::snap_t &old_parent_snap_id,
+    uint32_t clone_format, Context* on_finish)
+  // m_cct reads m_image_ctx, which is declared (hence initialized) first
+  : m_image_ctx(image_ctx), m_parent_image_ctx(parent_image_ctx),
+    m_parent_snap_id(parent_snap_id),
+    m_old_parent_image_ctx(old_parent_image_ctx),
+    m_old_parent_snap_id(old_parent_snap_id), m_clone_format(clone_format),
+    m_on_finish(on_finish), m_cct(m_image_ctx->cct) {}
+
+// Entry point: clone format 1 takes the V1 branch, anything else the V2 one.
+template <typename I> void AttachChildRequest<I>::send() {
+  if (m_clone_format != 1) {
+    v2_set_op_feature();
+    return;
+  }
+  v1_add_child();
+}
+
+template <typename I>
+void AttachChildRequest<I>::v1_add_child() {
+  ldout(m_cct, 15) << dendl;
+
+  // V1 ADD CHILD: written to RBD_CHILDREN through m_image_ctx's md_ctx
+  librados::ObjectWriteOperation op;
+  cls_client::add_child(
+    &op, {m_parent_image_ctx->md_ctx.get_id(), "", m_parent_image_ctx->id,
+          m_parent_snap_id}, m_image_ctx->id);
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    AttachChildRequest<I>, &AttachChildRequest<I>::handle_v1_add_child>(this);
+  int r = m_image_ctx->md_ctx.aio_operate(RBD_CHILDREN, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v1_add_child(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  // -EEXIST is tolerated only when an old parent was supplied
+  const bool already_attached =
+    (r == -EEXIST && m_old_parent_image_ctx != nullptr);
+  if (already_attached) {
+    ldout(m_cct, 5) << "child already exists" << dendl;
+  } else if (r < 0) {
+    lderr(m_cct) << "couldn't add child: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+  v1_refresh();
+}
+
+template <typename I>
+void AttachChildRequest<I>::v1_refresh() {
+  ldout(m_cct, 15) << dendl;
+  // refresh the parent image; handle_v1_refresh() then checks protection
+  Context *on_refreshed = create_context_callback<
+    AttachChildRequest<I>, &AttachChildRequest<I>::handle_v1_refresh>(this);
+  RefreshRequest<I> *refresh_req = RefreshRequest<I>::create(
+    *m_parent_image_ctx, false, false, on_refreshed);
+  refresh_req->send();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v1_refresh(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+  // V1 VALIDATE PROTECTED: the snapshot is only queried after a clean refresh
+  bool is_protected = false;
+  if (r == 0) {
+    RWLock::RLocker snap_locker(m_parent_image_ctx->snap_lock);
+    r = m_parent_image_ctx->is_snap_protected(m_parent_snap_id, &is_protected);
+  }
+
+  // every failure (refresh, lookup or unprotected) is reported as -EINVAL
+  const bool validated = (r >= 0 && is_protected);
+  if (!validated) {
+    lderr(m_cct) << "validate protected failed" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+  v1_remove_child_from_old_parent();
+}
+
+template <typename I>
+void AttachChildRequest<I>::v1_remove_child_from_old_parent() {
+  // nothing to detach from when no old parent was given: finish successfully
+  if (m_old_parent_image_ctx == nullptr) {
+    finish(0);
+    return;
+  }
+  ldout(m_cct, 15) << dendl;
+
+  auto &old_parent = *m_old_parent_image_ctx;
+  librados::ObjectWriteOperation op;
+  cls_client::remove_child(
+    &op, {old_parent.md_ctx.get_id(), old_parent.md_ctx.get_namespace(),
+          old_parent.id, m_old_parent_snap_id}, m_image_ctx->id);
+
+  using klass = AttachChildRequest<I>;
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    klass, &klass::handle_v1_remove_child_from_old_parent>(this);
+  int r = m_image_ctx->md_ctx.aio_operate(RBD_CHILDREN, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v1_remove_child_from_old_parent(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+  // -ENOENT is folded into success, as is any non-negative result
+  const bool success = (r >= 0 || r == -ENOENT);
+  if (success) {
+    finish(0);
+    return;
+  }
+  lderr(m_cct) << "couldn't remove child: " << cpp_strerror(r) << dendl;
+  finish(r);
+}
+
+template <typename I>
+void AttachChildRequest<I>::v2_set_op_feature() {
+  ldout(m_cct, 15) << dendl;
+
+  // V2 SET CLONE: set the CLONE_CHILD op feature on m_image_ctx's header
+  librados::ObjectWriteOperation op;
+  cls_client::op_features_set(&op, RBD_OPERATION_FEATURE_CLONE_CHILD,
+                              RBD_OPERATION_FEATURE_CLONE_CHILD);
+
+  auto comp = create_rados_callback<
+    AttachChildRequest<I>,
+    &AttachChildRequest<I>::handle_v2_set_op_feature>(this);
+  int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v2_set_op_feature(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  // no error is tolerated at this step; success moves on to V2 ATTACH CHILD
+  if (r >= 0) {
+    v2_child_attach();
+    return;
+  }
+  lderr(m_cct) << "failed to enable clone v2: " << cpp_strerror(r) << dendl;
+  finish(r);
+}
+
+template <typename I>
+void AttachChildRequest<I>::v2_child_attach() {
+  ldout(m_cct, 15) << dendl;
+  // V2 ATTACH CHILD: executed against the parent image's header object
+  auto &parent = *m_parent_image_ctx;
+  librados::ObjectWriteOperation op;
+  cls_client::child_attach(
+    &op, m_parent_snap_id,
+    {m_image_ctx->md_ctx.get_id(), m_image_ctx->md_ctx.get_namespace(),
+     m_image_ctx->id});
+
+  using klass = AttachChildRequest<I>;
+  auto comp =
+    create_rados_callback<klass, &klass::handle_v2_child_attach>(this);
+  int r = parent.md_ctx.aio_operate(parent.header_oid, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v2_child_attach(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  // -EEXIST is tolerated only when an old parent was supplied
+  const bool already_attached =
+    (r == -EEXIST && m_old_parent_image_ctx != nullptr);
+  if (already_attached) {
+    ldout(m_cct, 5) << "child already exists" << dendl;
+  } else if (r < 0) {
+    lderr(m_cct) << "failed to attach child image: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+  v2_child_detach_from_old_parent();
+}
+
+template <typename I>
+void AttachChildRequest<I>::v2_child_detach_from_old_parent() {
+  // nothing to detach from when no old parent was given: finish successfully
+  if (m_old_parent_image_ctx == nullptr) {
+    finish(0);
+    return;
+  }
+  ldout(m_cct, 15) << dendl;
+  // V2 DETACH CHILD FROM OLD PARENT: runs against the old parent's header
+  auto &old_parent = *m_old_parent_image_ctx;
+  librados::ObjectWriteOperation op;
+  cls_client::child_detach(
+    &op, m_old_parent_snap_id,
+    {m_image_ctx->md_ctx.get_id(), m_image_ctx->md_ctx.get_namespace(),
+     m_image_ctx->id});
+
+  using klass = AttachChildRequest<I>;
+  auto comp = create_rados_callback<
+    klass, &klass::handle_v2_child_detach_from_old_parent>(this);
+  int r = old_parent.md_ctx.aio_operate(old_parent.header_oid, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v2_child_detach_from_old_parent(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+  // -ENOENT is folded into success, as is any non-negative result
+  const bool success = (r >= 0 || r == -ENOENT);
+  if (success) {
+    finish(0);
+    return;
+  }
+  lderr(m_cct) << "failed to detach child image: " << cpp_strerror(r)
+               << dendl;
+  finish(r);
+}
+
+template <typename I>
+void AttachChildRequest<I>::finish(int r) {
+  ldout(m_cct, 5) << "r=" << r << dendl;
+  // hand r to the caller's context, then self-destruct (must stay last)
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::AttachChildRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/AttachChildRequest.h b/src/librbd/image/AttachChildRequest.h
new file mode 100644
index 00000000..d1450e8d
--- /dev/null
+++ b/src/librbd/image/AttachChildRequest.h
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+
+class CephContext;
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;  // same tag as the other librbd forward declarations
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class AttachChildRequest {
+public:
+ static AttachChildRequest* create(ImageCtxT *image_ctx,
+ ImageCtxT *parent_image_ctx,
+ const librados::snap_t &parent_snap_id,
+ ImageCtxT *old_parent_image_ctx,
+ const librados::snap_t &old_parent_snap_id,
+ uint32_t clone_format,
+ Context* on_finish) {
+ return new AttachChildRequest(image_ctx, parent_image_ctx, parent_snap_id,
+ old_parent_image_ctx, old_parent_snap_id,
+ clone_format, on_finish);
+ }
+ // old_parent_image_ctx may be nullptr: the "old parent" steps are then skipped
+ AttachChildRequest(ImageCtxT *image_ctx,
+ ImageCtxT *parent_image_ctx,
+ const librados::snap_t &parent_snap_id,
+ ImageCtxT *old_parent_image_ctx,
+ const librados::snap_t &old_parent_snap_id,
+ uint32_t clone_format,
+ Context* on_finish);
+ // clone_format == 1 runs the V1 branch of the diagram; any other value runs V2
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * (clone v1) | (clone v2)
+ * /----------------/ \---------------\
+ * | |
+ * v v
+ * V1 ADD CHILD V2 SET CLONE
+ * | |
+ * v v
+ * V1 VALIDATE PROTECTED V2 ATTACH CHILD
+ * | |
+ * | v
+ * V1 REMOVE CHILD FROM OLD PARENT V2 DETACH CHILD FROM OLD PARENT
+ * | |
+ * \----------------\ /---------------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ ImageCtxT *m_parent_image_ctx;
+ librados::snap_t m_parent_snap_id;
+ ImageCtxT *m_old_parent_image_ctx;
+ librados::snap_t m_old_parent_snap_id;
+ uint32_t m_clone_format;
+ Context* m_on_finish;
+ // copied from m_image_ctx->cct by the constructor; used for logging
+ CephContext *m_cct;
+
+ void v1_add_child();
+ void handle_v1_add_child(int r);
+ // V1 VALIDATE PROTECTED in the diagram is v1_refresh()/handle_v1_refresh()
+ void v1_refresh();
+ void handle_v1_refresh(int r);
+
+ void v1_remove_child_from_old_parent();
+ void handle_v1_remove_child_from_old_parent(int r);
+
+ void v2_set_op_feature();
+ void handle_v2_set_op_feature(int r);
+
+ void v2_child_attach();
+ void handle_v2_child_attach(int r);
+
+ void v2_child_detach_from_old_parent();
+ void handle_v2_child_detach_from_old_parent(int r);
+
+ void finish(int r); // completes m_on_finish with r, then deletes this
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::AttachChildRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H
diff --git a/src/librbd/image/AttachParentRequest.cc b/src/librbd/image/AttachParentRequest.cc
new file mode 100644
index 00000000..19e69594
--- /dev/null
+++ b/src/librbd/image/AttachParentRequest.cc
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/AttachParentRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::AttachParentRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_rados_callback;
+
+// Entry point: goes straight to the ATTACH_PARENT step.
+template <typename I> void AttachParentRequest<I>::send() {
+  this->attach_parent();
+}
+
+template <typename I>
+void AttachParentRequest<I>::attach_parent() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << dendl;
+  // legacy path is only taken after handle_attach_parent() saw -EOPNOTSUPP
+  librados::ObjectWriteOperation op;
+  if (m_legacy_parent) {
+    librbd::cls_client::set_parent(&op, m_parent_image_spec, m_parent_overlap);
+  } else {
+    librbd::cls_client::parent_attach(&op, m_parent_image_spec,
+                                      m_parent_overlap, m_reattach);
+  }
+
+  using klass = AttachParentRequest<I>;
+  auto comp =
+    create_rados_callback<klass, &klass::handle_attach_parent>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void AttachParentRequest<I>::handle_attach_parent(int r) {
+  auto cct = m_image_ctx.cct;
+  // r drives the retry/fail/succeed decision below, so record it in the log
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (!m_legacy_parent && r == -EOPNOTSUPP && !m_reattach) {
+    // fall back to set_parent when the parent shares our namespace or has none
+    if (m_parent_image_spec.pool_namespace ==
+          m_image_ctx.md_ctx.get_namespace()) {
+      m_parent_image_spec.pool_namespace = "";
+    }
+    if (m_parent_image_spec.pool_namespace.empty()) {
+      ldout(cct, 10) << "retrying using legacy parent method" << dendl;
+      m_legacy_parent = true;
+      attach_parent();
+      return;
+    }
+    // namespaces require newer OSDs
+    r = -EXDEV;
+  }
+
+  if (r < 0) {
+    lderr(cct) << "attach parent encountered an error: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+  finish(0);
+}
+
+template <typename I>
+void AttachParentRequest<I>::finish(int r) {
+  ldout(m_image_ctx.cct, 5) << "r=" << r << dendl;
+
+  // hand r to the caller's context, then self-destruct (must stay last)
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::AttachParentRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/AttachParentRequest.h b/src/librbd/image/AttachParentRequest.h
new file mode 100644
index 00000000..482e0327
--- /dev/null
+++ b/src/librbd/image/AttachParentRequest.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;  // same tag as the other librbd forward declarations
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class AttachParentRequest {
+public:
+ static AttachParentRequest* create(ImageCtxT& image_ctx,
+ const cls::rbd::ParentImageSpec& pspec,
+ uint64_t parent_overlap,
+ bool reattach,
+ Context* on_finish) {
+ return new AttachParentRequest(image_ctx, pspec, parent_overlap, reattach,
+ on_finish);
+ }
+ // public constructor; create() above forwards its arguments to it via new
+ AttachParentRequest(ImageCtxT& image_ctx,
+ const cls::rbd::ParentImageSpec& pspec,
+ uint64_t parent_overlap, bool reattach,
+ Context* on_finish)
+ : m_image_ctx(image_ctx), m_parent_image_spec(pspec),
+ m_parent_overlap(parent_overlap), m_reattach(reattach),
+ m_on_finish(on_finish) {
+ }
+ // issues ATTACH_PARENT; finish() later completes on_finish and deletes this
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | * * * * * *
+ * | * * -EOPNOTSUPP
+ * v v *
+ * ATTACH_PARENT * * *
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT& m_image_ctx;
+ cls::rbd::ParentImageSpec m_parent_image_spec; // own copy: the retry path may clear pool_namespace
+ uint64_t m_parent_overlap;
+ bool m_reattach;
+ Context* m_on_finish;
+ // set by handle_attach_parent() when it retries with set_parent after -EOPNOTSUPP
+ bool m_legacy_parent = false;
+
+ void attach_parent();
+ void handle_attach_parent(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::AttachParentRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H
diff --git a/src/librbd/image/CloneRequest.cc b/src/librbd/image/CloneRequest.cc
new file mode 100644
index 00000000..b7b07410
--- /dev/null
+++ b/src/librbd/image/CloneRequest.cc
@@ -0,0 +1,647 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/image/AttachChildRequest.h"
+#include "librbd/image/AttachParentRequest.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/CreateRequest.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CloneRequest: " << this << " " \
+ << __func__ << ": "
+
+#define MAX_KEYS 64
+
+namespace librbd {
+namespace image {
+
+using util::create_rados_callback;
+using util::create_context_callback;
+using util::create_async_context_callback;
+
+// Captures the clone parameters, resolves the CephContext from the child
+// IoCtx, defaults the image format option to 2 when the caller did not set
+// it, and logs the request parameters. No I/O is started here.
+template <typename I>
+CloneRequest<I>::CloneRequest(ConfigProxy& config,
+                              IoCtx& parent_io_ctx,
+                              const std::string& parent_image_id,
+                              const std::string& parent_snap_name,
+                              uint64_t parent_snap_id,
+                              IoCtx &c_ioctx,
+                              const std::string &c_name,
+                              const std::string &c_id,
+                              ImageOptions c_options,
+                              const std::string &non_primary_global_image_id,
+                              const std::string &primary_mirror_uuid,
+                              ContextWQ *op_work_queue, Context *on_finish)
+  : m_config(config),
+    m_parent_io_ctx(parent_io_ctx),
+    m_parent_image_id(parent_image_id),
+    m_parent_snap_name(parent_snap_name),
+    m_parent_snap_id(parent_snap_id),
+    m_ioctx(c_ioctx),
+    m_name(c_name),
+    m_id(c_id),
+    m_opts(c_options),
+    m_non_primary_global_image_id(non_primary_global_image_id),
+    m_primary_mirror_uuid(primary_mirror_uuid),
+    m_op_work_queue(op_work_queue),
+    m_on_finish(on_finish),
+    m_use_p_features(true) {
+
+  m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+
+  // fall back to image format 2 unless a format was explicitly requested
+  bool format_specified = false;
+  m_opts.is_set(RBD_IMAGE_OPTION_FORMAT, &format_specified);
+  if (!format_specified) {
+    const uint64_t default_format = 2;
+    m_opts.set(RBD_IMAGE_OPTION_FORMAT, default_format);
+  }
+
+  ldout(m_cct, 20) << "parent_pool_id=" << m_parent_io_ctx.get_id() << ", "
+                   << "parent_image_id=" << m_parent_image_id << ", "
+                   << "parent_snap=" << m_parent_snap_name << "/"
+                   << m_parent_snap_id << " clone to "
+                   << "pool_id=" << m_ioctx.get_id() << ", "
+                   << "name=" << m_name << ", "
+                   << "opts=" << m_opts << dendl;
+}
+
+// Entry point of the clone state machine: logs and starts with option
+// validation.
+template <typename I>
+void CloneRequest<I>::send() {
+ ldout(m_cct, 20) << dendl;
+ validate_options();
+}
+
+// Validates the caller-supplied image options before any image is opened:
+//  * the image format must be >= 2 (-EINVAL otherwise)
+//  * explicitly requested features must be known to librbd (-ENOSYS)
+//  * the clone format comes from the options, else from the
+//    "rbd_default_clone_format" config value ("1", or "auto" which selects
+//    v1 when the cluster's min compatible client is older than mimic)
+//  * clone v1 cannot cross namespaces (-EXDEV)
+// On success continues with open_parent(); on failure completes the request.
+template <typename I>
+void CloneRequest<I>::validate_options() {
+  ldout(m_cct, 20) << dendl;
+
+  uint64_t image_format = 0;
+  m_opts.get(RBD_IMAGE_OPTION_FORMAT, &image_format);
+  if (image_format < 2) {
+    lderr(m_cct) << "format 2 or later required for clone" << dendl;
+    complete(-EINVAL);
+    return;
+  }
+
+  const bool features_specified =
+    (m_opts.get(RBD_IMAGE_OPTION_FEATURES, &m_features) == 0);
+  if (features_specified) {
+    if ((m_features & ~RBD_FEATURES_ALL) != 0) {
+      lderr(m_cct) << "librbd does not support requested features" << dendl;
+      complete(-ENOSYS);
+      return;
+    }
+    // explicit features win over the ones inherited from the parent
+    m_use_p_features = false;
+  }
+
+  const bool clone_format_missing =
+    (m_opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &m_clone_format) < 0);
+  if (clone_format_missing) {
+    const std::string configured_format = m_config.get_val<std::string>(
+      "rbd_default_clone_format");
+    if (configured_format == "1") {
+      m_clone_format = 1;
+    } else if (configured_format == "auto") {
+      librados::Rados rados(m_ioctx);
+      int8_t min_compat_client;
+      int8_t require_min_compat_client;
+      const int r = rados.get_min_compatible_client(
+        &min_compat_client, &require_min_compat_client);
+      if (r < 0) {
+        complete(r);
+        return;
+      }
+
+      const int8_t newest_client_floor = std::max(min_compat_client,
+                                                  require_min_compat_client);
+      if (newest_client_floor < CEPH_RELEASE_MIMIC) {
+        m_clone_format = 1;
+      }
+    }
+  }
+
+  const bool cross_namespace =
+    (m_parent_io_ctx.get_namespace() != m_ioctx.get_namespace());
+  if (m_clone_format == 1 && cross_namespace) {
+    ldout(m_cct, 1) << "clone v2 required for cross-namespace clones" << dendl;
+    complete(-EXDEV);
+    return;
+  }
+
+  open_parent();
+}
+
+// Creates the parent image context (addressed either by snapshot id or by
+// snapshot name -- exactly one of the two must have been supplied) and opens
+// it asynchronously; handle_open_parent() receives the result.
+template <typename I>
+void CloneRequest<I>::open_parent() {
+  ldout(m_cct, 20) << dendl;
+  ceph_assert(m_parent_snap_name.empty() ^ (m_parent_snap_id == CEPH_NOSNAP));
+
+  if (m_parent_snap_id == CEPH_NOSNAP) {
+    // no snapshot id supplied: look the snapshot up by name
+    m_parent_image_ctx = I::create("", m_parent_image_id,
+                                   m_parent_snap_name.c_str(), m_parent_io_ctx,
+                                   true);
+  } else {
+    m_parent_image_ctx = I::create("", m_parent_image_id, m_parent_snap_id,
+                                   m_parent_io_ctx, true);
+  }
+
+  using klass = CloneRequest<I>;
+  Context *on_open = create_context_callback<
+    klass, &klass::handle_open_parent>(this);
+  m_parent_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, on_open);
+}
+
+// Completion of the parent open. On success records the resolved snapshot id
+// and builds the parent spec, then validates the parent. On failure the
+// never-opened context is destroyed and the request completes with the error.
+template <typename I>
+void CloneRequest<I>::handle_open_parent(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    m_parent_snap_id = m_parent_image_ctx->snap_id;
+    m_pspec = {m_parent_io_ctx.get_id(), m_parent_io_ctx.get_namespace(),
+               m_parent_image_id, m_parent_snap_id};
+    validate_parent();
+    return;
+  }
+
+  // the open failed, so there is nothing to close -- just release the ctx
+  m_parent_image_ctx->destroy();
+  m_parent_image_ctx = nullptr;
+
+  lderr(m_cct) << "failed to open parent image: " << cpp_strerror(r) << dendl;
+  complete(r);
+}
+
+// Checks that the opened parent can be cloned: operations must be enabled,
+// a snapshot must be selected, the image must be new-format with layering,
+// and (for clone v1) the snapshot must be protected. Also captures the
+// snapshot size and, unless features were given explicitly, derives the
+// child's features from the parent. Any failure is recorded in m_r_saved and
+// the parent is closed; otherwise continues with validate_child().
+template <typename I>
+void CloneRequest<I>::validate_parent() {
+  ldout(m_cct, 20) << dendl;
+
+  // record the error and unwind through the parent close path
+  auto abort_clone = [this](int result) {
+    m_r_saved = result;
+    close_parent();
+  };
+
+  if (m_parent_image_ctx->operations_disabled) {
+    lderr(m_cct) << "image operations disabled due to unsupported op features"
+                 << dendl;
+    abort_clone(-EROFS);
+    return;
+  }
+
+  if (m_parent_image_ctx->snap_id == CEPH_NOSNAP) {
+    lderr(m_cct) << "image to be cloned must be a snapshot" << dendl;
+    abort_clone(-EINVAL);
+    return;
+  }
+
+  if (m_parent_image_ctx->old_format) {
+    lderr(m_cct) << "parent image must be in new format" << dendl;
+    abort_clone(-EINVAL);
+    return;
+  }
+
+  uint64_t parent_features;
+  bool snap_protected;
+  int r;
+  {
+    // features, size and protection status are read under the snap lock
+    RWLock::RLocker snap_locker(m_parent_image_ctx->snap_lock);
+    parent_features = m_parent_image_ctx->features;
+    m_size = m_parent_image_ctx->get_image_size(m_parent_image_ctx->snap_id);
+    r = m_parent_image_ctx->is_snap_protected(m_parent_image_ctx->snap_id,
+                                              &snap_protected);
+  }
+
+  const bool parent_has_layering =
+    (parent_features & RBD_FEATURE_LAYERING) == RBD_FEATURE_LAYERING;
+  if (!parent_has_layering) {
+    lderr(m_cct) << "parent image must support layering" << dendl;
+    abort_clone(-ENOSYS);
+    return;
+  }
+  if (m_use_p_features) {
+    m_features = (parent_features & ~RBD_FEATURES_IMPLICIT_ENABLE);
+  }
+
+  if (r < 0) {
+    lderr(m_cct) << "unable to locate parent's snapshot" << dendl;
+    abort_clone(r);
+    return;
+  }
+
+  if (m_clone_format == 1 && !snap_protected) {
+    lderr(m_cct) << "parent snapshot must be protected" << dendl;
+    abort_clone(-EINVAL);
+    return;
+  }
+
+  validate_child();
+}
+
+// Verifies the child's feature set includes layering and then issues an
+// asynchronous stat on the old-format header object named after the child;
+// handle_validate_child() interprets the result.
+template <typename I>
+void CloneRequest<I>::validate_child() {
+  ldout(m_cct, 15) << dendl;
+
+  const bool child_has_layering =
+    (m_features & RBD_FEATURE_LAYERING) == RBD_FEATURE_LAYERING;
+  if (!child_has_layering) {
+    lderr(m_cct) << "cloning image must support layering" << dendl;
+    m_r_saved = -ENOSYS;
+    close_parent();
+    return;
+  }
+
+  librados::ObjectReadOperation stat_op;
+  stat_op.stat(NULL, NULL, NULL);
+
+  using klass = CloneRequest<I>;
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    klass, &klass::handle_validate_child>(this);
+
+  const int r = m_ioctx.aio_operate(util::old_header_name(m_name), aio_comp,
+                                    &stat_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the header stat issued by validate_child(). Only -ENOENT
+// (no such object) allows the clone to proceed. Any other result is treated
+// as "destination already exists" and reported as -EEXIST.
+//
+// Note: the stat result must NOT be propagated verbatim -- a successful stat
+// yields r == 0, which would make the whole clone request complete with
+// success even though no child image was created.
+template <typename I>
+void CloneRequest<I>::handle_validate_child(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r != -ENOENT) {
+    lderr(m_cct) << "rbd image " << m_name << " already exists" << dendl;
+    m_r_saved = -EEXIST;
+    close_parent();
+    return;
+  }
+
+  create_child();
+}
+
+// Fills in the remaining creation options (object order defaults to the
+// parent's order when the caller did not specify one; features are the ones
+// computed during validation) and launches a CreateRequest for the child.
+template <typename I>
+void CloneRequest<I>::create_child() {
+  ldout(m_cct, 15) << dendl;
+
+  uint64_t child_order = m_parent_image_ctx->order;
+  const bool order_specified =
+    (m_opts.get(RBD_IMAGE_OPTION_ORDER, &child_order) == 0);
+  if (!order_specified) {
+    m_opts.set(RBD_IMAGE_OPTION_ORDER, child_order);
+  }
+  m_opts.set(RBD_IMAGE_OPTION_FEATURES, m_features);
+
+  using klass = CloneRequest<I>;
+  Context *on_created = create_context_callback<
+    klass, &klass::handle_create_child>(this);
+
+  // the parent's snap lock is held (shared) while the request is dispatched
+  RWLock::RLocker snap_locker(m_parent_image_ctx->snap_lock);
+  auto *create_req = CreateRequest<I>::create(
+    m_config, m_ioctx, m_name, m_id, m_size, m_opts,
+    m_non_primary_global_image_id, m_primary_mirror_uuid, true,
+    m_op_work_queue, on_created);
+  create_req->send();
+}
+
+// Completion of the child CreateRequest. On success the new image is opened.
+// Every failure -- including -EBADF, which indicates the requested image id
+// is already in use -- is recorded in m_r_saved and unwound through
+// close_parent(), because the parent image context is open at this point and
+// must be closed and destroyed before the request completes. (Completing
+// directly on -EBADF would leak the open parent context.) The caller still
+// receives -EBADF as the final result.
+template <typename I>
+void CloneRequest<I>::handle_create_child(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r == -EBADF) {
+    // expected, retryable condition: keep it out of the error log
+    ldout(m_cct, 5) << "image id already in-use" << dendl;
+    m_r_saved = r;
+    close_parent();
+    return;
+  } else if (r < 0) {
+    lderr(m_cct) << "error creating child: " << cpp_strerror(r) << dendl;
+    m_r_saved = r;
+    close_parent();
+    return;
+  }
+  open_child();
+}
+
+// Creates an image context for the freshly created child (by name) and opens
+// it without opening a parent. If the child carries the migrating feature the
+// open is additionally told to ignore the migrating state.
+template <typename I>
+void CloneRequest<I>::open_child() {
+  ldout(m_cct, 15) << dendl;
+
+  m_imctx = I::create(m_name, "", nullptr, m_ioctx, false);
+
+  uint64_t open_flags = OPEN_FLAG_SKIP_OPEN_PARENT;
+  const bool child_migrating = (m_features & RBD_FEATURE_MIGRATING) != 0;
+  if (child_migrating) {
+    open_flags |= OPEN_FLAG_IGNORE_MIGRATING;
+  }
+
+  using klass = CloneRequest<I>;
+  Context *on_open = create_context_callback<
+    klass, &klass::handle_open_child>(this);
+  m_imctx->state->open(open_flags, on_open);
+}
+
+// Completion of the child open. On success continues with attaching the
+// parent. On failure the unopened context is destroyed, the error is saved
+// and the just-created child image is removed again.
+template <typename I>
+void CloneRequest<I>::handle_open_child(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    attach_parent();
+    return;
+  }
+
+  // open failed: nothing to close, release the context directly
+  m_imctx->destroy();
+  m_imctx = nullptr;
+
+  lderr(m_cct) << "Error opening new image: " << cpp_strerror(r) << dendl;
+  m_r_saved = r;
+  remove_child();
+}
+
+// Dispatches an AttachParentRequest that records the parent spec and the
+// overlap (the parent snapshot size) on the child image; reattach is false.
+template <typename I>
+void CloneRequest<I>::attach_parent() {
+  ldout(m_cct, 15) << dendl;
+
+  using klass = CloneRequest<I>;
+  Context *on_attached = create_context_callback<
+    klass, &klass::handle_attach_parent>(this);
+
+  auto attach_req = AttachParentRequest<I>::create(
+    *m_imctx, m_pspec, m_size, false, on_attached);
+  attach_req->send();
+}
+
+// Completion of the parent attach. Success moves on to attaching the child;
+// failure saves the error and unwinds by closing the child.
+template <typename I>
+void CloneRequest<I>::handle_attach_parent(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    attach_child();
+    return;
+  }
+
+  lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl;
+  m_r_saved = r;
+  close_child();
+}
+
+// Dispatches an AttachChildRequest linking the child image to the parent
+// snapshot using the selected clone format.
+template <typename I>
+void CloneRequest<I>::attach_child() {
+  ldout(m_cct, 15) << dendl;
+
+  using klass = CloneRequest<I>;
+  Context *on_attached = create_context_callback<
+    klass, &klass::handle_attach_child>(this);
+
+  auto attach_req = AttachChildRequest<I>::create(
+    m_imctx, m_parent_image_ctx, m_parent_image_ctx->snap_id, nullptr, 0,
+    m_clone_format, on_attached);
+  attach_req->send();
+}
+
+// Completion of the child attach. Success continues with copying the parent's
+// metadata; failure saves the error and unwinds by closing the child.
+template <typename I>
+void CloneRequest<I>::handle_attach_child(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r < 0) {
+    // this step attaches the *child*; keep the message distinguishable from
+    // the one emitted by handle_attach_parent()
+    lderr(m_cct) << "failed to attach child: " << cpp_strerror(r) << dendl;
+    m_r_saved = r;
+    close_child();
+    return;
+  }
+
+  metadata_list();
+}
+
+// Requests the next page of key/value metadata from the parent's header
+// object, starting after m_last_metadata_key. The page size is requested
+// explicitly as MAX_KEYS because handle_metadata_list() decides whether to
+// fetch another page by comparing the returned count against MAX_KEYS;
+// relying on the server-side default (max_return == 0) would make that
+// comparison depend on a value this code does not control.
+template <typename I>
+void CloneRequest<I>::metadata_list() {
+  ldout(m_cct, 15) << "start_key=" << m_last_metadata_key << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::metadata_list_start(&op, m_last_metadata_key, MAX_KEYS);
+
+  using klass = CloneRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_callback<klass, &klass::handle_metadata_list>(this);
+  // drop any payload left over from a previous read before reusing the buffer
+  m_out_bl.clear();
+  m_parent_image_ctx->md_ctx.aio_operate(m_parent_image_ctx->header_oid,
+                                         comp, &op, &m_out_bl);
+  comp->release();
+}
+
+// Completion of one metadata page read. Decodes the page, accumulates it in
+// m_pairs and either requests the next page (a full page of MAX_KEYS entries
+// was returned) or proceeds to write the collected pairs to the child.
+// -EOPNOTSUPP / -EIO are treated as "metadata not supported" and skip
+// straight to the mirror-mode step; other errors abort via close_child().
+template <typename I>
+void CloneRequest<I>::handle_metadata_list(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  map<string, bufferlist> page;
+  if (r == 0) {
+    auto iter = m_out_bl.cbegin();
+    r = cls_client::metadata_list_finish(&iter, &page);
+  }
+
+  if (r == -EOPNOTSUPP || r == -EIO) {
+    ldout(m_cct, 10) << "config metadata not supported by OSD" << dendl;
+    get_mirror_mode();
+    return;
+  }
+  if (r < 0) {
+    lderr(m_cct) << "couldn't list metadata: " << cpp_strerror(r) << dendl;
+    m_r_saved = r;
+    close_child();
+    return;
+  }
+
+  if (!page.empty()) {
+    m_pairs.insert(page.begin(), page.end());
+    // resume the next listing after the greatest key collected so far
+    m_last_metadata_key = m_pairs.rbegin()->first;
+  }
+
+  if (page.size() != MAX_KEYS) {
+    metadata_set();
+    return;
+  }
+  metadata_list();
+}
+
+// Writes the metadata pairs collected from the parent to the child's header
+// object. When nothing was collected the step is skipped and the state
+// machine continues with the mirror-mode lookup.
+template <typename I>
+void CloneRequest<I>::metadata_set() {
+  const bool nothing_to_copy = m_pairs.empty();
+  if (nothing_to_copy) {
+    get_mirror_mode();
+    return;
+  }
+
+  ldout(m_cct, 15) << dendl;
+
+  librados::ObjectWriteOperation write_op;
+  cls_client::metadata_set(&write_op, m_pairs);
+
+  using klass = CloneRequest<I>;
+  librados::AioCompletion *aio_comp =
+    create_rados_callback<klass, &klass::handle_metadata_set>(this);
+  const int r = m_ioctx.aio_operate(m_imctx->header_oid, aio_comp, &write_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the metadata write on the child. Success continues with the
+// mirror-mode lookup; failure saves the error and closes the child.
+template <typename I>
+void CloneRequest<I>::handle_metadata_set(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    get_mirror_mode();
+    return;
+  }
+
+  lderr(m_cct) << "couldn't set metadata: " << cpp_strerror(r) << dendl;
+  m_r_saved = r;
+  close_child();
+}
+
+// Reads the pool's mirror mode from the RBD_MIRRORING object. The step is
+// skipped (the child is closed right away) when the child does not have the
+// journaling feature enabled.
+template <typename I>
+void CloneRequest<I>::get_mirror_mode() {
+  ldout(m_cct, 15) << dendl;
+
+  const bool journaling = m_imctx->test_features(RBD_FEATURE_JOURNALING);
+  if (!journaling) {
+    close_child();
+    return;
+  }
+
+  librados::ObjectReadOperation read_op;
+  cls_client::mirror_mode_get_start(&read_op);
+
+  using klass = CloneRequest<I>;
+  librados::AioCompletion *aio_comp =
+    create_rados_callback<klass, &klass::handle_get_mirror_mode>(this);
+  // reuse the shared output buffer for this read
+  m_out_bl.clear();
+  m_imctx->md_ctx.aio_operate(RBD_MIRRORING, aio_comp, &read_op, &m_out_bl);
+  aio_comp->release();
+}
+
+// Completion of the mirror-mode read. -ENOENT is tolerated (m_mirror_mode
+// keeps its current value). Mirroring is enabled on the child when the pool
+// is in pool mirror mode or a non-primary global image id was supplied;
+// otherwise the child is closed. Other errors are saved and close the child.
+template <typename I>
+void CloneRequest<I>::handle_get_mirror_mode(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r == 0) {
+    auto iter = m_out_bl.cbegin();
+    r = cls_client::mirror_mode_get_finish(&iter, &m_mirror_mode);
+  }
+
+  const bool lookup_failed = (r < 0 && r != -ENOENT);
+  if (lookup_failed) {
+    lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+                 << dendl;
+
+    m_r_saved = r;
+    close_child();
+    return;
+  }
+
+  const bool mirroring_required =
+    (m_mirror_mode == cls::rbd::MIRROR_MODE_POOL) ||
+    !m_non_primary_global_image_id.empty();
+  if (mirroring_required) {
+    enable_mirror();
+  } else {
+    close_child();
+  }
+}
+
+// Dispatches a mirror::EnableRequest for the child image using the child's
+// metadata IoCtx and work queue.
+template <typename I>
+void CloneRequest<I>::enable_mirror() {
+  ldout(m_cct, 15) << dendl;
+
+  using klass = CloneRequest<I>;
+  Context *on_enabled = create_context_callback<
+    klass, &klass::handle_enable_mirror>(this);
+
+  auto *enable_req = mirror::EnableRequest<I>::create(
+    m_imctx->md_ctx, m_id, m_non_primary_global_image_id,
+    m_imctx->op_work_queue, on_enabled);
+  enable_req->send();
+}
+
+// Completion of the mirroring enable step. A failure is logged and saved;
+// in either case the child is closed next.
+template <typename I>
+void CloneRequest<I>::handle_enable_mirror(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  const bool enable_failed = (r < 0);
+  if (enable_failed) {
+    lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r)
+                 << dendl;
+    m_r_saved = r;
+  }
+
+  close_child();
+}
+
+// Closes the child image. The completion is wrapped in an async context
+// callback bound to the child image before being handed to close().
+template <typename I>
+void CloneRequest<I>::close_child() {
+  ldout(m_cct, 15) << dendl;
+
+  ceph_assert(m_imctx != nullptr);
+
+  using klass = CloneRequest<I>;
+  Context *on_closed = create_context_callback<
+    klass, &klass::handle_close_child>(this);
+  Context *async_on_closed = create_async_context_callback(*m_imctx,
+                                                           on_closed);
+  m_imctx->state->close(async_on_closed);
+}
+
+// Completion of the child close. The child context is destroyed. A close
+// error is saved only if no earlier error was recorded. If any error is
+// pending the child image is removed; otherwise the parent is closed.
+template <typename I>
+void CloneRequest<I>::handle_close_child(int r) {
+  ldout(m_cct, 15) << dendl;
+
+  m_imctx->destroy();
+  m_imctx = nullptr;
+
+  if (r < 0) {
+    lderr(m_cct) << "couldn't close image: " << cpp_strerror(r) << dendl;
+    // the first recorded failure wins
+    if (m_r_saved == 0) {
+      m_r_saved = r;
+    }
+  }
+
+  if (m_r_saved >= 0) {
+    close_parent();
+    return;
+  }
+
+  remove_child();
+}
+
+// Rolls back a failed clone by dispatching a RemoveRequest for the child
+// image (identified by name and id), using a no-op progress context.
+template <typename I>
+void CloneRequest<I>::remove_child() {
+  ldout(m_cct, 15) << dendl;
+
+  using klass = CloneRequest<I>;
+  Context *on_removed = create_context_callback<
+    klass, &klass::handle_remove_child>(this);
+
+  auto remove_req = librbd::image::RemoveRequest<I>::create(
+    m_ioctx, m_name, m_id, false, false, m_no_op, m_op_work_queue,
+    on_removed);
+  remove_req->send();
+}
+
+// Completion of the rollback removal. A removal failure is only logged (the
+// originally saved error is what gets reported); the parent is closed next.
+template <typename I>
+void CloneRequest<I>::handle_remove_child(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  const bool removal_failed = (r < 0);
+  if (removal_failed) {
+    lderr(m_cct) << "Error removing failed clone: "
+                 << cpp_strerror(r) << dendl;
+  }
+
+  close_parent();
+}
+
+// Closes the parent image. The completion is wrapped in an async context
+// callback bound to the parent image before being handed to close().
+template <typename I>
+void CloneRequest<I>::close_parent() {
+  ldout(m_cct, 20) << dendl;
+  ceph_assert(m_parent_image_ctx != nullptr);
+
+  using klass = CloneRequest<I>;
+  Context *on_closed = create_context_callback<
+    klass, &klass::handle_close_parent>(this);
+  Context *async_on_closed = create_async_context_callback(
+    *m_parent_image_ctx, on_closed);
+  m_parent_image_ctx->state->close(async_on_closed);
+}
+
+// Completion of the parent close. The parent context is destroyed, a close
+// error is saved only if no earlier error exists, and the request completes
+// with the saved result.
+template <typename I>
+void CloneRequest<I>::handle_close_parent(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  m_parent_image_ctx->destroy();
+  m_parent_image_ctx = nullptr;
+
+  const bool close_failed = (r < 0);
+  if (close_failed) {
+    lderr(m_cct) << "failed to close parent image: "
+                 << cpp_strerror(r) << dendl;
+    // never overwrite an earlier failure with the close error
+    if (m_r_saved == 0) {
+      m_r_saved = r;
+    }
+  }
+
+  complete(m_r_saved);
+}
+
+// Final step: reports the result to the caller-supplied completion context
+// and then destroys this request object.
+template <typename I>
+void CloneRequest<I>::complete(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ // the request owns itself; no member may be touched after this line
+ delete this;
+}
+
+} //namespace image
+} //namespace librbd
+
+template class librbd::image::CloneRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/CloneRequest.h b/src/librbd/image/CloneRequest.h
new file mode 100644
index 00000000..640de9fe
--- /dev/null
+++ b/src/librbd/image/CloneRequest.h
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H
+
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/internal.h"
+
+class ConfigProxy;
+class Context;
+
+using librados::IoCtx;
+
+namespace librbd {
+namespace image {
+
+// Asynchronous state machine that clones a parent image snapshot into a new
+// child image. Construct via create(), then call send(); the final result is
+// delivered through the on_finish context.
+//
+// Pointer and scalar members that are only assigned later in the state
+// machine carry in-class initializers so that the nullptr assertions in the
+// implementation are meaningful and no member is ever read indeterminate.
+template <typename ImageCtxT = ImageCtx>
+class CloneRequest {
+public:
+  // Factory helper: heap-allocates a request with the given arguments.
+  static CloneRequest *create(ConfigProxy& config, IoCtx& parent_io_ctx,
+                              const std::string& parent_image_id,
+                              const std::string& parent_snap_name,
+                              uint64_t parent_snap_id,
+                              IoCtx &c_ioctx, const std::string &c_name,
+                              const std::string &c_id, ImageOptions c_options,
+                              const std::string &non_primary_global_image_id,
+                              const std::string &primary_mirror_uuid,
+                              ContextWQ *op_work_queue, Context *on_finish) {
+    return new CloneRequest(config, parent_io_ctx, parent_image_id,
+                            parent_snap_name, parent_snap_id, c_ioctx, c_name,
+                            c_id, c_options, non_primary_global_image_id,
+                            primary_mirror_uuid, op_work_queue, on_finish);
+  }
+
+  // The parent snapshot is identified by parent_snap_name or parent_snap_id;
+  // "c_"-prefixed arguments describe the child image to be created.
+  CloneRequest(ConfigProxy& config, IoCtx& parent_io_ctx,
+               const std::string& parent_image_id,
+               const std::string& parent_snap_name,
+               uint64_t parent_snap_id,
+               IoCtx &c_ioctx, const std::string &c_name,
+               const std::string &c_id, ImageOptions c_options,
+               const std::string &non_primary_global_image_id,
+               const std::string &primary_mirror_uuid,
+               ContextWQ *op_work_queue, Context *on_finish);
+
+  // Starts the state machine.
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * OPEN PARENT
+   *    |
+   *    v
+   * VALIDATE CHILD                    <finish>
+   *    |                                 ^
+   *    v                                 |
+   * CREATE CHILD * * * * * * * * * > CLOSE PARENT
+   *    |                                 ^
+   *    v                                 |
+   * OPEN CHILD * * * * * * * * * * > REMOVE CHILD
+   *    |                                 ^
+   *    v                                 |
+   * ATTACH PARENT  * * * * * * * * > CLOSE CHILD
+   *    |                               ^
+   *    v                               *
+   * ATTACH CHILD * * * * * * * * * * * *
+   *    |                               *
+   *    v                               *
+   * GET PARENT META  * * * * * * * * * ^
+   *    |                               *
+   *    v (skip if not needed)          *
+   * SET CHILD META * * * * * * * * * * ^
+   *    |                               *
+   *    v (skip if not needed)          *
+   * GET MIRROR MODE  * * * * * * * * * ^
+   *    |                               *
+   *    v (skip if not needed)          *
+   * SET MIRROR ENABLED * * * * * * * * *
+   *    |
+   *    v
+   * CLOSE CHILD
+   *    |
+   *    v
+   * CLOSE PARENT
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  // --- parent description ---
+  ConfigProxy& m_config;
+  IoCtx &m_parent_io_ctx;
+  std::string m_parent_image_id;
+  std::string m_parent_snap_name;
+  uint64_t m_parent_snap_id;
+  ImageCtxT *m_parent_image_ctx = nullptr;  // set by open_parent()
+
+  // --- child description ---
+  IoCtx &m_ioctx;
+  std::string m_name;
+  std::string m_id;
+  ImageOptions m_opts;
+  cls::rbd::ParentImageSpec m_pspec;
+  ImageCtxT *m_imctx = nullptr;             // set by open_child()
+  cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+  const std::string m_non_primary_global_image_id;
+  const std::string m_primary_mirror_uuid;
+  NoOpProgressContext m_no_op;
+  ContextWQ *m_op_work_queue;
+  Context *m_on_finish;
+
+  // --- working state ---
+  CephContext *m_cct = nullptr;             // resolved in the constructor body
+  uint64_t m_clone_format = 2;
+  bool m_use_p_features;                    // true: inherit parent features
+  uint64_t m_features = 0;
+  map<string, bufferlist> m_pairs;          // metadata copied from the parent
+  std::string m_last_metadata_key;          // pagination cursor
+  bufferlist m_out_bl;                      // shared read buffer
+  uint64_t m_size = 0;                      // parent snapshot size
+  int m_r_saved = 0;                        // first error seen, 0 if none
+
+  void validate_options();
+
+  void open_parent();
+  void handle_open_parent(int r);
+
+  void validate_parent();
+
+  void validate_child();
+  void handle_validate_child(int r);
+
+  void create_child();
+  void handle_create_child(int r);
+
+  void open_child();
+  void handle_open_child(int r);
+
+  void attach_parent();
+  void handle_attach_parent(int r);
+
+  void attach_child();
+  void handle_attach_child(int r);
+
+  void metadata_list();
+  void handle_metadata_list(int r);
+
+  void metadata_set();
+  void handle_metadata_set(int r);
+
+  void get_mirror_mode();
+  void handle_get_mirror_mode(int r);
+
+  void enable_mirror();
+  void handle_enable_mirror(int r);
+
+  void close_child();
+  void handle_close_child(int r);
+
+  void remove_child();
+  void handle_remove_child(int r);
+
+  void close_parent();
+  void handle_close_parent(int r);
+
+  void complete(int r);
+};
+
+} //namespace image
+} //namespace librbd
+
+extern template class librbd::image::CloneRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H
diff --git a/src/librbd/image/CloseRequest.cc b/src/librbd/image/CloseRequest.cc
new file mode 100644
index 00000000..c43e1833
--- /dev/null
+++ b/src/librbd/image/CloseRequest.cc
@@ -0,0 +1,342 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/CloseRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ConfigWatcher.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectDispatcher.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CloseRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+// Stores the image context and completion context; the error result starts
+// at 0 and no exclusive lock is captured yet. The image context must not be
+// null.
+template <typename I>
+CloseRequest<I>::CloseRequest(I *image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_on_finish(on_finish),
+    m_error_result(0),
+    m_exclusive_lock(nullptr) {
+  ceph_assert(image_ctx != nullptr);
+}
+
+// Entry point of the close state machine: shuts down and frees the config
+// watcher (if one exists), then starts blocking the image watcher.
+template <typename I>
+void CloseRequest<I>::send() {
+  auto &config_watcher = m_image_ctx->config_watcher;
+  if (config_watcher != nullptr) {
+    config_watcher->shut_down();
+
+    delete config_watcher;
+    config_watcher = nullptr;
+  }
+
+  send_block_image_watcher();
+}
+
+// Asks the image watcher to block notifications coming from peers. Skipped
+// when the image has no watcher.
+template <typename I>
+void CloseRequest<I>::send_block_image_watcher() {
+  auto *watcher = m_image_ctx->image_watcher;
+  if (watcher == nullptr) {
+    send_shut_down_update_watchers();
+    return;
+  }
+
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << dendl;
+
+  // prevent incoming requests from our peers
+  using klass = CloseRequest<I>;
+  Context *on_blocked = create_context_callback<
+    klass, &klass::handle_block_image_watcher>(this);
+  watcher->block_notifies(on_blocked);
+}
+
+// Completion of the notify block; the result is only logged.
+template <typename I>
+void CloseRequest<I>::handle_block_image_watcher(int r) {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << ": r=" << r
+                              << dendl;
+
+  send_shut_down_update_watchers();
+}
+
+// Shuts down the image state's update watchers; the completion is wrapped in
+// an async context callback bound to the image.
+template <typename I>
+void CloseRequest<I>::send_shut_down_update_watchers() {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_shut_down = create_context_callback<
+    klass, &klass::handle_shut_down_update_watchers>(this);
+  Context *async_on_shut_down = create_async_context_callback(*m_image_ctx,
+                                                              on_shut_down);
+  m_image_ctx->state->shut_down_update_watchers(async_on_shut_down);
+}
+
+// Completion of the update watcher shutdown. The result is saved, a failure
+// is logged, and the close sequence continues regardless.
+template <typename I>
+void CloseRequest<I>::handle_shut_down_update_watchers(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  save_result(r);
+  const bool shut_down_failed = (r < 0);
+  if (shut_down_failed) {
+    lderr(cct) << "failed to shut down update watchers: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  send_shut_down_io_queue();
+}
+
+// Shuts down the image's IO work queue while holding the owner lock (shared).
+template <typename I>
+void CloseRequest<I>::send_shut_down_io_queue() {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_shut_down = create_context_callback<
+    klass, &klass::handle_shut_down_io_queue>(this);
+
+  RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+  m_image_ctx->io_work_queue->shut_down(on_shut_down);
+}
+
+// Completion of the IO queue shutdown; the result is only logged.
+template <typename I>
+void CloseRequest<I>::handle_shut_down_io_queue(int r) {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << ": r=" << r
+                              << dendl;
+
+  send_shut_down_exclusive_lock();
+}
+
+// Captures the image's exclusive lock pointer and shuts the lock down. When
+// the image has no exclusive lock, the object map (if any) is freed here and
+// the sequence continues with an explicit flush instead.
+template <typename I>
+void CloseRequest<I>::send_shut_down_exclusive_lock() {
+ {
+ // owner lock is taken first, then the snap lock; both are released at the
+ // end of this scope, before any further step is dispatched
+ RWLock::WLocker owner_locker(m_image_ctx->owner_lock);
+ m_exclusive_lock = m_image_ctx->exclusive_lock;
+
+ // if reading a snapshot -- possible object map is open
+ RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+ if (m_exclusive_lock == nullptr) {
+ delete m_image_ctx->object_map;
+ m_image_ctx->object_map = nullptr;
+ }
+ }
+
+ // no exclusive lock: flush explicitly instead of relying on lock shutdown
+ if (m_exclusive_lock == nullptr) {
+ send_flush();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // in-flight IO will be flushed and in-flight requests will be canceled
+ // before releasing lock
+ m_exclusive_lock->shut_down(create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_shut_down_exclusive_lock>(this));
+}
+
+// Completion of the exclusive lock shutdown. Asserts that the image no longer
+// references the lock, journal or object map, frees the lock object captured
+// earlier, saves the result and continues with unregistering the watcher.
+template <typename I>
+void CloseRequest<I>::handle_shut_down_exclusive_lock(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ {
+ RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+ ceph_assert(m_image_ctx->exclusive_lock == nullptr);
+
+ // object map and journal closed during exclusive lock shutdown
+ RWLock::RLocker snap_locker(m_image_ctx->snap_lock);
+ ceph_assert(m_image_ctx->journal == nullptr);
+ ceph_assert(m_image_ctx->object_map == nullptr);
+ }
+
+ // the image context no longer points at the lock (asserted above); this
+ // request holds the remaining pointer and releases it here
+ delete m_exclusive_lock;
+ m_exclusive_lock = nullptr;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to shut down exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ send_unregister_image_watcher();
+}
+
+// Issues an internal flush request against the image (used on the path where
+// no exclusive lock exists); handle_flush() receives the result.
+template <typename I>
+void CloseRequest<I>::send_flush() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // the owner lock is held (shared) while the flush is dispatched
+ RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+ auto ctx = create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_flush>(this);
+ auto aio_comp = io::AioCompletion::create_and_start(ctx, m_image_ctx,
+ io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec<I>::create_flush_request(
+ *m_image_ctx, aio_comp, io::FLUSH_SOURCE_INTERNAL, {});
+ req->send();
+ // the dispatch spec is freed by this function once it has been sent
+ delete req;
+}
+
+// Completion of the flush. A failure is logged but not saved as the request
+// result; the sequence continues with unregistering the image watcher.
+template <typename I>
+void CloseRequest<I>::handle_flush(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  const bool flush_failed = (r < 0);
+  if (flush_failed) {
+    lderr(cct) << "failed to flush IO: " << cpp_strerror(r) << dendl;
+  }
+
+  send_unregister_image_watcher();
+}
+
+// Unregisters the image watcher's watch. Skipped when the image has no
+// watcher.
+template <typename I>
+void CloseRequest<I>::send_unregister_image_watcher() {
+  auto *watcher = m_image_ctx->image_watcher;
+  if (watcher == nullptr) {
+    send_flush_readahead();
+    return;
+  }
+
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_unregistered = create_context_callback<
+    klass, &klass::handle_unregister_image_watcher>(this);
+  watcher->unregister_watch(on_unregistered);
+}
+
+// Completion of the watch unregistration. The result is saved, a failure is
+// logged, and the sequence continues with the readahead flush.
+template <typename I>
+void CloseRequest<I>::handle_unregister_image_watcher(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  save_result(r);
+  const bool unregister_failed = (r < 0);
+  if (unregister_failed) {
+    lderr(cct) << "failed to unregister image watcher: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  send_flush_readahead();
+}
+
+// Waits for pending readahead operations; the completion is wrapped in an
+// async context callback bound to the image.
+template <typename I>
+void CloseRequest<I>::send_flush_readahead() {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_flushed = create_context_callback<
+    klass, &klass::handle_flush_readahead>(this);
+  Context *async_on_flushed = create_async_context_callback(*m_image_ctx,
+                                                            on_flushed);
+  m_image_ctx->readahead.wait_for_pending(async_on_flushed);
+}
+
+// Completion of the readahead wait; the result is only logged.
+template <typename I>
+void CloseRequest<I>::handle_flush_readahead(int r) {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << ": r=" << r
+                              << dendl;
+
+  send_shut_down_object_dispatcher();
+}
+
+// Shuts down the image's object dispatcher.
+template <typename I>
+void CloseRequest<I>::send_shut_down_object_dispatcher() {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_shut_down = create_context_callback<
+    klass, &klass::handle_shut_down_object_dispatcher>(this);
+  m_image_ctx->io_object_dispatcher->shut_down(on_shut_down);
+}
+
+// Completion of the object dispatcher shutdown. The result is saved, a
+// failure is logged, and the sequence continues with the op work queue flush.
+template <typename I>
+void CloseRequest<I>::handle_shut_down_object_dispatcher(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  save_result(r);
+  const bool shut_down_failed = (r < 0);
+  if (shut_down_failed) {
+    lderr(cct) << "failed to shut down object dispatcher: "
+               << cpp_strerror(r) << dendl;
+  }
+
+  send_flush_op_work_queue();
+}
+
+// Queues this request's next step on the image's op work queue with a result
+// of 0.
+// NOTE(review): presumably this drains work queued ahead of it, as the
+// function name suggests -- confirm the queue's ordering guarantees.
+template <typename I>
+void CloseRequest<I>::send_flush_op_work_queue() {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_flushed = create_context_callback<
+    klass, &klass::handle_flush_op_work_queue>(this);
+  m_image_ctx->op_work_queue->queue(on_flushed, 0);
+}
+
+// Runs once the queued context executes; the result is only logged.
+template <typename I>
+void CloseRequest<I>::handle_flush_op_work_queue(int r) {
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << ": r=" << r
+                              << dendl;
+
+  send_close_parent();
+}
+
+// Closes the parent image, if one is attached. The completion is wrapped in
+// an async context callback bound to this (child) image.
+template <typename I>
+void CloseRequest<I>::send_close_parent() {
+  const bool has_parent = (m_image_ctx->parent != nullptr);
+  if (!has_parent) {
+    send_flush_image_watcher();
+    return;
+  }
+
+  ldout(m_image_ctx->cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_closed = create_context_callback<
+    klass, &klass::handle_close_parent>(this);
+  Context *async_on_closed = create_async_context_callback(*m_image_ctx,
+                                                           on_closed);
+  m_image_ctx->parent->state->close(async_on_closed);
+}
+
+// Completion of the parent close. The parent context is deleted and the
+// pointer cleared, the result is saved, a failure is logged, and the sequence
+// continues with the image watcher flush.
+template <typename I>
+void CloseRequest<I>::handle_close_parent(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  auto &parent = m_image_ctx->parent;
+  delete parent;
+  parent = nullptr;
+
+  save_result(r);
+  const bool close_failed = (r < 0);
+  if (close_failed) {
+    lderr(cct) << "error closing parent image: " << cpp_strerror(r) << dendl;
+  }
+
+  send_flush_image_watcher();
+}
+
+// Flushes the image watcher; when the image has no watcher the request
+// finishes immediately.
+template <typename I>
+void CloseRequest<I>::send_flush_image_watcher() {
+  auto *watcher = m_image_ctx->image_watcher;
+  if (watcher == nullptr) {
+    finish();
+    return;
+  }
+
+  using klass = CloseRequest<I>;
+  Context *on_flushed = create_context_callback<
+    klass, &klass::handle_flush_image_watcher>(this);
+  watcher->flush(on_flushed);
+}
+
+// Completion of the image watcher flush. A failure is logged, the result is
+// saved, and the request finishes.
+template <typename I>
+void CloseRequest<I>::handle_flush_image_watcher(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  const bool flush_failed = (r < 0);
+  if (flush_failed) {
+    lderr(cct) << "error flushing image watcher: " << cpp_strerror(r) << dendl;
+  }
+
+  save_result(r);
+  finish();
+}
+
+// Final step: shuts the image context down, reports the accumulated error
+// result to the completion context and destroys this request.
+template <typename I>
+void CloseRequest<I>::finish() {
+ m_image_ctx->shutdown();
+ m_on_finish->complete(m_error_result);
+ // the request owns itself; no member may be touched after this line
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::CloseRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/CloseRequest.h b/src/librbd/image/CloseRequest.h
new file mode 100644
index 00000000..fd101bee
--- /dev/null
+++ b/src/librbd/image/CloseRequest.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
+
+#include "librbd/ImageCtx.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class CloseRequest { // asynchronous state machine that closes an image
+public:
+ static CloseRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+ return new CloseRequest(image_ctx, on_finish); // heap-allocated; finish() ends with `delete this`
+ }
+// Entry point of the state machine (defined in CloseRequest.cc).
+ void send();
+
+private:
+ /**
+ * States are visited top to bottom; each send_X() below has a matching
+ * handle_X(int r) completion. finish() completes on_finish with the first
+ * negative result recorded through save_result() (0 if none).
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * BLOCK_IMAGE_WATCHER (skip if R/O)
+ * |
+ * v
+ * SHUT_DOWN_UPDATE_WATCHERS
+ * |
+ * v
+ * SHUT_DOWN_AIO_WORK_QUEUE . . .
+ * | . (exclusive lock disabled)
+ * v v
+ * SHUT_DOWN_EXCLUSIVE_LOCK FLUSH
+ * | .
+ * | . . . . . . . . . . .
+ * | .
+ * v v
+ * UNREGISTER_IMAGE_WATCHER (skip if R/O)
+ * |
+ * v
+ * FLUSH_READAHEAD
+ * |
+ * v
+ * SHUT_DOWN_OBJECT_DISPATCHER
+ * |
+ * v
+ * FLUSH_OP_WORK_QUEUE
+ * |
+ * v (skip if no parent)
+ * CLOSE_PARENT
+ * |
+ * v
+ * FLUSH_IMAGE_WATCHER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ CloseRequest(ImageCtxT *image_ctx, Context *on_finish);
+
+ ImageCtxT *m_image_ctx; // image being closed
+ Context *m_on_finish; // completed by finish() with m_error_result
+
+ int m_error_result; // first negative result seen, see save_result()
+
+ decltype(m_image_ctx->exclusive_lock) m_exclusive_lock; // same type as the image's exclusive_lock member
+
+ void send_block_image_watcher();
+ void handle_block_image_watcher(int r);
+
+ void send_shut_down_update_watchers();
+ void handle_shut_down_update_watchers(int r);
+
+ void send_shut_down_io_queue();
+ void handle_shut_down_io_queue(int r);
+
+ void send_shut_down_exclusive_lock();
+ void handle_shut_down_exclusive_lock(int r);
+
+ void send_flush();
+ void handle_flush(int r);
+
+ void send_unregister_image_watcher();
+ void handle_unregister_image_watcher(int r);
+
+ void send_flush_readahead();
+ void handle_flush_readahead(int r);
+
+ void send_shut_down_object_dispatcher();
+ void handle_shut_down_object_dispatcher(int r);
+
+ void send_flush_op_work_queue();
+ void handle_flush_op_work_queue(int r);
+
+ void send_close_parent();
+ void handle_close_parent(int r);
+
+ void send_flush_image_watcher();
+ void handle_flush_image_watcher(int r);
+
+ void finish();
+// Remember only the first failure; later errors and successes are ignored.
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::CloseRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
diff --git a/src/librbd/image/CreateRequest.cc b/src/librbd/image/CreateRequest.cc
new file mode 100644
index 00000000..2dfdc6b3
--- /dev/null
+++ b/src/librbd/image/CreateRequest.cc
@@ -0,0 +1,830 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/CreateRequest.h"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/ceph_context.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "osdc/Striper.h"
+#include "librbd/Features.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/ValidatePoolRequest.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "journal/Journaler.h"
+
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CreateRequest: " << __func__ \
+ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_rados_callback;
+using util::create_context_callback;
+
+namespace {
+
+// Sanity-check a requested feature mask.  Returns 0 when the combination is
+// acceptable, -ENOSYS for bits outside RBD_FEATURES_ALL and -EINVAL for
+// internally controlled bits or unmet feature dependencies.  Aborts when a
+// non-primary image is requested without journaling.
+int validate_features(CephContext *cct, uint64_t features,
+                      bool force_non_primary) {
+  const bool fast_diff = (features & RBD_FEATURE_FAST_DIFF) != 0;
+  const bool object_map = (features & RBD_FEATURE_OBJECT_MAP) != 0;
+  const bool exclusive_lock = (features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0;
+  const bool journaling = (features & RBD_FEATURE_JOURNALING) != 0;
+
+  if ((features & ~RBD_FEATURES_ALL) != 0) {
+    lderr(cct) << "librbd does not support requested features." << dendl;
+    return -ENOSYS;
+  }
+
+  if ((features & RBD_FEATURES_INTERNAL) != 0) {
+    lderr(cct) << "cannot use internally controlled features" << dendl;
+    return -EINVAL;
+  }
+
+  // dependency chain: fast-diff -> object-map -> exclusive-lock
+  if (fast_diff && !object_map) {
+    lderr(cct) << "cannot use fast diff without object map" << dendl;
+    return -EINVAL;
+  }
+
+  if (object_map && !exclusive_lock) {
+    lderr(cct) << "cannot use object map without exclusive lock" << dendl;
+    return -EINVAL;
+  }
+
+  if (journaling && !exclusive_lock) {
+    lderr(cct) << "cannot use journaling without exclusive lock" << dendl;
+    return -EINVAL;
+  }
+
+  if (!journaling && force_non_primary) {
+    ceph_abort();
+  }
+
+  return 0;
+}
+
+// Check a stripe unit/count pair against the object size (1 << order).
+// Both values must be given together (or both left 0), and a given stripe
+// unit must evenly divide the object size.  Returns 0 or -EINVAL.
+int validate_striping(CephContext *cct, uint8_t order, uint64_t stripe_unit,
+                      uint64_t stripe_count) {
+  const bool unit_given = (stripe_unit != 0);
+  const bool count_given = (stripe_count != 0);
+
+  if (unit_given != count_given) {
+    lderr(cct) << "must specify both (or neither) of stripe-unit and "
+               << "stripe-count" << dendl;
+    return -EINVAL;
+  }
+
+  if (!unit_given) {
+    // neither specified: nothing to verify
+    return 0;
+  }
+
+  const uint64_t object_size = 1ull << order;
+  if (object_size % stripe_unit || stripe_unit > object_size) {
+    lderr(cct) << "stripe unit is not a factor of the object size" << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+// Returns true when ObjectMap<>::is_compatible() accepts the given layout and
+// image size; logs an error and returns false otherwise.
+bool validate_layout(CephContext *cct, uint64_t size, file_layout_t &layout) {
+  const bool compatible = librbd::ObjectMap<>::is_compatible(layout, size);
+  if (compatible) {
+    return true;
+  }
+
+  lderr(cct) << "image size not compatible with object map" << dendl;
+  return false;
+}
+
+// Fetch a 64-bit image option and store it into an 8-bit destination.
+// Returns the (negative) result of ImageOptions::get() on failure, 0 on
+// success.  The value is narrowed with a plain cast, i.e. only the low 8
+// bits are kept.
+int get_image_option(const ImageOptions &image_options, int option,
+                     uint8_t *value) {
+  uint64_t wide_value;
+  const int r = image_options.get(option, &wide_value);
+  if (r >= 0) {
+    *value = static_cast<uint8_t>(wide_value);
+    return 0;
+  }
+  return r;
+}
+
+} // anonymous namespace
+
+// Accepts object orders in [12, 25]; anything else yields -EDOM.
+template<typename I>
+int CreateRequest<I>::validate_order(CephContext *cct, uint8_t order) {
+  const bool in_range = (order >= 12 && order <= 25);
+  if (in_range) {
+    return 0;
+  }
+
+  lderr(cct) << "order must be in the range [12, 25]" << dendl;
+  return -EDOM;
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CreateRequest: " << this << " " \
+ << __func__ << ": "
+
+template<typename I>
+CreateRequest<I>::CreateRequest(const ConfigProxy& config, IoCtx &ioctx,
+ const std::string &image_name,
+ const std::string &image_id, uint64_t size,
+ const ImageOptions &image_options,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ bool skip_mirror_enable,
+ ContextWQ *op_work_queue, Context *on_finish)
+ : m_config(config), m_image_name(image_name), m_image_id(image_id),
+ m_size(size), m_non_primary_global_image_id(non_primary_global_image_id),
+ m_primary_mirror_uuid(primary_mirror_uuid),
+ m_skip_mirror_enable(skip_mirror_enable),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+// Resolve every creation parameter up front; no I/O is issued until send().
+ m_io_ctx.dup(ioctx);
+ m_cct = reinterpret_cast<CephContext *>(m_io_ctx.cct());
+// Names of the objects touched by the state machine.
+ m_id_obj = util::id_obj_name(m_image_name);
+ m_header_obj = util::header_name(m_image_id);
+ m_objmap_name = ObjectMap<>::object_map_name(m_image_id, CEPH_NOSNAP);
+ m_force_non_primary = !non_primary_global_image_id.empty();
+// No explicit feature option: start from rbd_default_features and flag them for negotiation.
+ if (image_options.get(RBD_IMAGE_OPTION_FEATURES, &m_features) != 0) {
+ m_features = librbd::rbd_features_from_string(
+ m_config.get_val<std::string>("rbd_default_features"), nullptr);
+ m_negotiate_features = true; // consumed by negotiate_features()
+ }
+// Apply FEATURES_SET / FEATURES_CLEAR; a bit present in both masks is ignored.
+ uint64_t features_clear = 0;
+ uint64_t features_set = 0;
+ image_options.get(RBD_IMAGE_OPTION_FEATURES_CLEAR, &features_clear);
+ image_options.get(RBD_IMAGE_OPTION_FEATURES_SET, &features_set);
+
+ uint64_t features_conflict = features_clear & features_set;
+ features_clear &= ~features_conflict;
+ features_set &= ~features_conflict;
+ m_features |= features_set;
+ m_features &= ~features_clear;
+// Strip RBD_FEATURES_IMPLICIT_ENABLE bits, then turn fast-diff on whenever object-map is set.
+ m_features &= ~RBD_FEATURES_IMPLICIT_ENABLE;
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) == RBD_FEATURE_OBJECT_MAP) {
+ m_features |= RBD_FEATURE_FAST_DIFF;
+ }
+// Remaining options fall back to config values when unset (stripe unit/count and order also when 0).
+ if (image_options.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &m_stripe_unit) != 0 ||
+ m_stripe_unit == 0) {
+ m_stripe_unit = m_config.get_val<Option::size_t>("rbd_default_stripe_unit");
+ }
+ if (image_options.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &m_stripe_count) != 0 ||
+ m_stripe_count == 0) {
+ m_stripe_count = m_config.get_val<uint64_t>("rbd_default_stripe_count");
+ }
+ if (get_image_option(image_options, RBD_IMAGE_OPTION_ORDER, &m_order) != 0 ||
+ m_order == 0) {
+ m_order = config.get_val<uint64_t>("rbd_default_order"); // `config` is the same object m_config refers to
+ }
+ if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_ORDER,
+ &m_journal_order) != 0) {
+ m_journal_order = m_config.get_val<uint64_t>("rbd_journal_order");
+ }
+ if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH,
+ &m_journal_splay_width) != 0) {
+ m_journal_splay_width = m_config.get_val<uint64_t>(
+ "rbd_journal_splay_width");
+ }
+ if (image_options.get(RBD_IMAGE_OPTION_JOURNAL_POOL, &m_journal_pool) != 0) {
+ m_journal_pool = m_config.get_val<std::string>("rbd_journal_pool");
+ }
+ if (image_options.get(RBD_IMAGE_OPTION_DATA_POOL, &m_data_pool) != 0) {
+ m_data_pool = m_config.get_val<std::string>("rbd_default_data_pool");
+ }
+// Layout later passed to validate_layout() and Striper::get_num_objects().
+ m_layout.object_size = 1ull << m_order;
+ if (m_stripe_unit == 0 || m_stripe_count == 0) {
+ m_layout.stripe_unit = m_layout.object_size;
+ m_layout.stripe_count = 1;
+ } else {
+ m_layout.stripe_unit = m_stripe_unit;
+ m_layout.stripe_count = m_stripe_count;
+ }
+// A data pool that differs from the pool of `ioctx` turns on the data-pool feature; otherwise it is dropped.
+ if (!m_data_pool.empty() && m_data_pool != ioctx.get_pool_name()) {
+ m_features |= RBD_FEATURE_DATA_POOL;
+ } else {
+ m_data_pool.clear();
+ }
+// Any stripe unit other than the object size, or a stripe count other than 1, requires STRIPINGV2.
+ if ((m_stripe_unit != 0 && m_stripe_unit != (1ULL << m_order)) ||
+ (m_stripe_count != 0 && m_stripe_count != 1)) {
+ m_features |= RBD_FEATURE_STRIPINGV2;
+ }
+// Log the fully resolved parameters.
+ ldout(m_cct, 10) << "name=" << m_image_name << ", "
+ << "id=" << m_image_id << ", "
+ << "size=" << m_size << ", "
+ << "features=" << m_features << ", "
+ << "order=" << (uint64_t)m_order << ", "
+ << "stripe_unit=" << m_stripe_unit << ", "
+ << "stripe_count=" << m_stripe_count << ", "
+ << "journal_order=" << (uint64_t)m_journal_order << ", "
+ << "journal_splay_width="
+ << (uint64_t)m_journal_splay_width << ", "
+ << "journal_pool=" << m_journal_pool << ", "
+ << "data_pool=" << m_data_pool << dendl;
+}
+
+// Entry point: run the synchronous parameter checks (features, order,
+// striping, object-map layout) and, if they all pass, start the asynchronous
+// state machine with the data pool validation.
+template<typename I>
+void CreateRequest<I>::send() {
+  ldout(m_cct, 20) << dendl;
+
+  // each check only runs when the previous one succeeded
+  int r = validate_features(m_cct, m_features, m_force_non_primary);
+  if (r >= 0) {
+    r = validate_order(m_cct, m_order);
+  }
+  if (r >= 0) {
+    r = validate_striping(m_cct, m_order, m_stripe_unit, m_stripe_count);
+  }
+  if (r < 0) {
+    complete(r);
+    return;
+  }
+
+  const bool object_map_enabled = (m_features & RBD_FEATURE_OBJECT_MAP) != 0;
+  if (object_map_enabled && !validate_layout(m_cct, m_size, m_layout)) {
+    complete(-EINVAL);
+    return;
+  }
+
+  validate_data_pool();
+}
+
+// Open the separate data pool when the data-pool feature is set (otherwise
+// the metadata IoCtx doubles as the data IoCtx) and, unless
+// rbd_validate_pool is disabled, verify that pool via ValidatePoolRequest.
+template <typename I>
+void CreateRequest<I>::validate_data_pool() {
+  m_data_io_ctx = m_io_ctx;
+
+  const bool separate_data_pool = (m_features & RBD_FEATURE_DATA_POOL) != 0;
+  if (separate_data_pool) {
+    librados::Rados rados(m_io_ctx);
+    int r = rados.ioctx_create(m_data_pool.c_str(), m_data_io_ctx);
+    if (r < 0) {
+      lderr(m_cct) << "data pool " << m_data_pool << " does not exist" << dendl;
+      complete(r);
+      return;
+    }
+    m_data_pool_id = m_data_io_ctx.get_id();
+    m_data_io_ctx.set_namespace(m_io_ctx.get_namespace());
+  }
+
+  if (m_config.get_val<bool>("rbd_validate_pool")) {
+    ldout(m_cct, 15) << dendl;
+
+    using self_t = CreateRequest<I>;
+    auto on_validated =
+      create_context_callback<self_t, &self_t::handle_validate_data_pool>(this);
+    auto validate_req = ValidatePoolRequest<I>::create(
+      m_data_io_ctx, m_op_work_queue, on_validated);
+    validate_req->send();
+    return;
+  }
+
+  // pool validation disabled
+  add_image_to_directory();
+}
+
+// Completion of the pool validation; any failure ends the request since
+// nothing has been created yet.
+template <typename I>
+void CreateRequest<I>::handle_validate_data_pool(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -EINVAL) {
+      lderr(m_cct) << "pool does not support RBD images" << dendl;
+    } else {
+      lderr(m_cct) << "failed to validate pool: " << cpp_strerror(r) << dendl;
+    }
+    complete(r);
+    return;
+  }
+
+  add_image_to_directory();
+}
+
+// Register name -> id in RBD_DIRECTORY.  Inside a non-default namespace the
+// write additionally asserts that the directory is in the READY state.
+template<typename I>
+void CreateRequest<I>::add_image_to_directory() {
+  ldout(m_cct, 15) << dendl;
+
+  const bool in_namespace = !m_io_ctx.get_namespace().empty();
+
+  librados::ObjectWriteOperation dir_op;
+  if (in_namespace) {
+    cls_client::dir_state_assert(&dir_op, cls::rbd::DIRECTORY_STATE_READY);
+  }
+  cls_client::dir_add_image(&dir_op, m_image_name, m_image_id);
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_add_image_to_directory>(this);
+  int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &dir_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the directory update.  A failure here needs no rollback;
+// the request simply completes with the error.
+template<typename I>
+void CreateRequest<I>::handle_add_image_to_directory(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    create_id_object();
+    return;
+  }
+
+  if (r == -EEXIST) {
+    ldout(m_cct, 5) << "directory entry for image " << m_image_name
+                    << " already exists" << dendl;
+  } else if (!m_io_ctx.get_namespace().empty() && r == -ENOENT) {
+    ldout(m_cct, 5) << "namespace " << m_io_ctx.get_namespace()
+                    << " does not exist" << dendl;
+  } else {
+    lderr(m_cct) << "error adding image to directory: " << cpp_strerror(r)
+                 << dendl;
+  }
+  complete(r);
+}
+
+// Exclusively create the id object and store the image id in it.
+template<typename I>
+void CreateRequest<I>::create_id_object() {
+  ldout(m_cct, 15) << dendl;
+
+  librados::ObjectWriteOperation id_op;
+  id_op.create(true);
+  cls_client::set_id(&id_op, m_image_id);
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_create_id_object>(this);
+  int r = m_io_ctx.aio_operate(m_id_obj, aio_comp, &id_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_create_id_object(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r == -EEXIST) {
+ ldout(m_cct, 5) << "id object for " << m_image_name << " already exists"
+ << dendl;
+ m_r_saved = r;
+ remove_from_dir();
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "error creating RBD id object: " << cpp_strerror(r)
+ << dendl;
+ m_r_saved = r;
+ remove_from_dir();
+ return;
+ }
+
+ negotiate_features();
+}
+
+// When the feature set came from the defaults, ask the server which
+// features it supports; explicitly requested features skip this step.
+template<typename I>
+void CreateRequest<I>::negotiate_features() {
+  if (m_negotiate_features) {
+    ldout(m_cct, 15) << dendl;
+
+    librados::ObjectReadOperation features_op;
+    cls_client::get_all_features_start(&features_op);
+
+    auto *aio_comp = create_rados_callback<
+      CreateRequest<I>, &CreateRequest<I>::handle_negotiate_features>(this);
+
+    m_outbl.clear();
+    int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &features_op,
+                                 &m_outbl);
+    ceph_assert(r == 0);
+    aio_comp->release();
+    return;
+  }
+
+  create_image();
+}
+
+// Completion of the feature query.  Failure is non-fatal: the default
+// features are kept as-is.  Otherwise they are limited to what the server
+// reported.
+template<typename I>
+void CreateRequest<I>::handle_negotiate_features(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  uint64_t server_features = 0;
+  if (r >= 0) {
+    auto reply_it = m_outbl.cbegin();
+    r = cls_client::get_all_features_finish(&reply_it, &server_features);
+  }
+
+  if (r < 0) {
+    ldout(m_cct, 10) << "error retrieving server supported features set: "
+                     << cpp_strerror(r) << dendl;
+  } else {
+    const uint64_t negotiated = m_features & server_features;
+    if (negotiated != m_features) {
+      m_features = negotiated;
+      ldout(m_cct, 10) << "limiting default features set to server supported: "
+                       << m_features << dendl;
+    }
+  }
+
+  create_image();
+}
+
+// Build the data object prefix and exclusively create the image header.
+// An over-long prefix fails with -EINVAL after rolling back the id object.
+template<typename I>
+void CreateRequest<I>::create_image() {
+  ldout(m_cct, 15) << dendl;
+  ceph_assert(m_data_pool.empty() || m_data_pool_id != -1);
+
+  // prefix layout: RBD_DATA_PREFIX [<metadata pool id> "."] <image id>
+  std::ostringstream prefix_stream;
+  prefix_stream << RBD_DATA_PREFIX;
+  if (m_data_pool_id != -1) {
+    prefix_stream << stringify(m_io_ctx.get_id()) << ".";
+  }
+  prefix_stream << m_image_id;
+
+  const std::string object_prefix = prefix_stream.str();
+  if (object_prefix.length() > RBD_MAX_BLOCK_NAME_PREFIX_LENGTH) {
+    lderr(m_cct) << "object prefix '" << object_prefix << "' too large" << dendl;
+    m_r_saved = -EINVAL;
+    remove_id_object();
+    return;
+  }
+
+  librados::ObjectWriteOperation header_op;
+  header_op.create(true);
+  cls_client::create_image(&header_op, m_size, m_order, m_features,
+                           object_prefix, m_data_pool_id);
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_create_image>(this);
+  int r = m_io_ctx.aio_operate(m_header_obj, aio_comp, &header_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the header creation.  An already existing header is reported
+// as -EBADF; other errors roll back the id object and directory entry.
+template<typename I>
+void CreateRequest<I>::handle_create_image(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r == -EEXIST) {
+    // NOTE(review): this path completes directly, without removing the id
+    // object or directory entry created earlier -- confirm that is intended
+    ldout(m_cct, 5) << "image id already in-use" << dendl;
+    complete(-EBADF);
+    return;
+  }
+
+  if (r < 0) {
+    lderr(m_cct) << "error writing header: " << cpp_strerror(r) << dendl;
+    m_r_saved = r;
+    remove_id_object();
+    return;
+  }
+
+  set_stripe_unit_count();
+}
+
+// Persist custom striping parameters in the header.  Skipped when striping
+// is unset or equals the default (one stripe of exactly one object).
+template<typename I>
+void CreateRequest<I>::set_stripe_unit_count() {
+  const bool striping_unset = (m_stripe_unit == 0 && m_stripe_count == 0);
+  const bool striping_default =
+    (m_stripe_count == 1) && (m_stripe_unit == (1ull << m_order));
+  if (striping_unset || striping_default) {
+    object_map_resize();
+    return;
+  }
+
+  ldout(m_cct, 15) << dendl;
+
+  librados::ObjectWriteOperation stripe_op;
+  cls_client::set_stripe_unit_count(&stripe_op, m_stripe_unit, m_stripe_count);
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_set_stripe_unit_count>(this);
+  int r = m_io_ctx.aio_operate(m_header_obj, aio_comp, &stripe_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the striping update; failure starts the rollback at the
+// header object.
+template<typename I>
+void CreateRequest<I>::handle_set_stripe_unit_count(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    object_map_resize();
+    return;
+  }
+
+  lderr(m_cct) << "error setting stripe unit/count: "
+               << cpp_strerror(r) << dendl;
+  m_r_saved = r;
+  remove_header_object();
+}
+
+// Create the initial object map sized for the image, with every object
+// marked OBJECT_NONEXISTENT.  Skipped when the object-map feature is off.
+template<typename I>
+void CreateRequest<I>::object_map_resize() {
+  const bool object_map_enabled = (m_features & RBD_FEATURE_OBJECT_MAP) != 0;
+  if (!object_map_enabled) {
+    fetch_mirror_mode();
+    return;
+  }
+
+  ldout(m_cct, 15) << dendl;
+
+  const auto object_count = Striper::get_num_objects(m_layout, m_size);
+  librados::ObjectWriteOperation resize_op;
+  cls_client::object_map_resize(&resize_op, object_count, OBJECT_NONEXISTENT);
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_object_map_resize>(this);
+  int r = m_io_ctx.aio_operate(m_objmap_name, aio_comp, &resize_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the object map creation; failure starts the rollback at the
+// header object.
+template<typename I>
+void CreateRequest<I>::handle_object_map_resize(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    fetch_mirror_mode();
+    return;
+  }
+
+  lderr(m_cct) << "error creating initial object map: "
+               << cpp_strerror(r) << dendl;
+
+  m_r_saved = r;
+  remove_header_object();
+}
+
+// Read the pool's mirror mode from RBD_MIRRORING.  Without journaling there
+// is nothing further to set up and the request completes successfully.
+template<typename I>
+void CreateRequest<I>::fetch_mirror_mode() {
+  const bool journaling_enabled = (m_features & RBD_FEATURE_JOURNALING) != 0;
+  if (!journaling_enabled) {
+    complete(0);
+    return;
+  }
+
+  ldout(m_cct, 15) << dendl;
+
+  librados::ObjectReadOperation mode_op;
+  cls_client::mirror_mode_get_start(&mode_op);
+
+  m_outbl.clear();
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_fetch_mirror_mode>(this);
+  int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &mode_op, &m_outbl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the mirror mode read.
+//
+// -ENOENT is tolerated and leaves the mode at MIRROR_MODE_DISABLED.  Every
+// other failure (read error, decode error, unrecognised mode value) records
+// the error in m_r_saved and starts the rollback at the object map; the
+// saved value is what the cleanup chain finally passes to complete().
+template<typename I>
+void CreateRequest<I>::handle_fetch_mirror_mode(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if ((r < 0) && (r != -ENOENT)) {
+    lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+                 << dendl;
+
+    m_r_saved = r;
+    remove_object_map();
+    return;
+  }
+
+  cls::rbd::MirrorMode mirror_mode_internal = cls::rbd::MIRROR_MODE_DISABLED;
+  if (r == 0) {
+    auto it = m_outbl.cbegin();
+    r = cls_client::mirror_mode_get_finish(&it, &mirror_mode_internal);
+    if (r < 0) {
+      lderr(m_cct) << "Failed to retrieve mirror mode" << dendl;
+
+      m_r_saved = r;
+      remove_object_map();
+      return;
+    }
+  }
+
+  // TODO: remove redundant code...
+  switch (mirror_mode_internal) {
+  case cls::rbd::MIRROR_MODE_DISABLED:
+  case cls::rbd::MIRROR_MODE_IMAGE:
+  case cls::rbd::MIRROR_MODE_POOL:
+    m_mirror_mode = static_cast<rbd_mirror_mode_t>(mirror_mode_internal);
+    break;
+  default:
+    lderr(m_cct) << "Unknown mirror mode ("
+                 << static_cast<uint32_t>(mirror_mode_internal) << ")" << dendl;
+    // the error must go into m_r_saved: assigning the local r would be lost
+    // and the rollback would end by reporting success (0) to the caller
+    m_r_saved = -EINVAL;
+    remove_object_map();
+    return;
+  }
+
+  journal_create();
+}
+
+// Create the image journal.  The initial tag carries the primary's mirror
+// uuid for forced non-primary images and the local mirror uuid otherwise.
+template<typename I>
+void CreateRequest<I>::journal_create() {
+  ldout(m_cct, 15) << dendl;
+
+  librbd::journal::TagData tag_data;
+  if (m_force_non_primary) {
+    tag_data.mirror_uuid = m_primary_mirror_uuid;
+  } else {
+    tag_data.mirror_uuid = librbd::Journal<I>::LOCAL_MIRROR_UUID;
+  }
+
+  using self_t = CreateRequest<I>;
+  Context *on_created =
+    create_context_callback<self_t, &self_t::handle_journal_create>(this);
+
+  auto *journal_req = librbd::journal::CreateRequest<I>::create(
+    m_io_ctx, m_image_id, m_journal_order, m_journal_splay_width,
+    m_journal_pool, cls::journal::Tag::TAG_CLASS_NEW, tag_data,
+    librbd::Journal<I>::IMAGE_CLIENT_ID, m_op_work_queue, on_created);
+  journal_req->send();
+}
+
+// Completion of the journal creation; failure starts the rollback at the
+// object map.
+template<typename I>
+void CreateRequest<I>::handle_journal_create(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    mirror_image_enable();
+    return;
+  }
+
+  lderr(m_cct) << "error creating journal: " << cpp_strerror(r)
+               << dendl;
+
+  m_r_saved = r;
+  remove_object_map();
+}
+
+// Enable mirroring for the new image when the pool is in pool mirror mode or
+// a non-primary image is being forced, unless the caller asked to skip it.
+template<typename I>
+void CreateRequest<I>::mirror_image_enable() {
+  const bool mirroring_wanted =
+    (m_mirror_mode == RBD_MIRROR_MODE_POOL) || m_force_non_primary;
+  if (!mirroring_wanted || m_skip_mirror_enable) {
+    complete(0);
+    return;
+  }
+
+  ldout(m_cct, 15) << dendl;
+
+  using self_t = CreateRequest<I>;
+  auto on_enabled =
+    create_context_callback<self_t, &self_t::handle_mirror_image_enable>(this);
+  auto enable_req = mirror::EnableRequest<I>::create(
+    m_io_ctx, m_image_id, m_non_primary_global_image_id, m_op_work_queue,
+    on_enabled);
+  enable_req->send();
+}
+
+// Completion of the mirroring enable; failure starts the rollback at the
+// journal.
+template<typename I>
+void CreateRequest<I>::handle_mirror_image_enable(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    complete(0);
+    return;
+  }
+
+  lderr(m_cct) << "cannot enable mirroring: " << cpp_strerror(r)
+               << dendl;
+
+  m_r_saved = r;
+  journal_remove();
+}
+
+// Terminal step: close the data IoCtx, destroy this request and only then
+// invoke the caller's completion with r.
+template<typename I>
+void CreateRequest<I>::complete(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  m_data_io_ctx.close();
+
+  // copy the pointer out first: members are gone once the request is deleted
+  Context *const caller_ctx = m_on_finish;
+  delete this;
+  caller_ctx->complete(r);
+}
+
+// cleanup
+// Rollback: remove the journal (if journaling is enabled), then continue
+// with the object map.
+template<typename I>
+void CreateRequest<I>::journal_remove() {
+  const bool journaling_enabled = (m_features & RBD_FEATURE_JOURNALING) != 0;
+  if (!journaling_enabled) {
+    remove_object_map();
+    return;
+  }
+
+  ldout(m_cct, 15) << dendl;
+
+  using self_t = CreateRequest<I>;
+  Context *on_removed =
+    create_context_callback<self_t, &self_t::handle_journal_remove>(this);
+
+  auto *remove_req = librbd::journal::RemoveRequest<I>::create(
+    m_io_ctx, m_image_id, librbd::Journal<I>::IMAGE_CLIENT_ID, m_op_work_queue,
+    on_removed);
+  remove_req->send();
+}
+
+// Rollback continues even when the journal removal failed; the failure is
+// only logged.
+template<typename I>
+void CreateRequest<I>::handle_journal_remove(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  const bool cleanup_failed = (r < 0);
+  if (cleanup_failed) {
+    lderr(m_cct) << "error cleaning up journal after creation failed: "
+                 << cpp_strerror(r) << dendl;
+  }
+
+  remove_object_map();
+}
+
+// Rollback: remove the object map (if that feature is enabled), then
+// continue with the header object.
+template<typename I>
+void CreateRequest<I>::remove_object_map() {
+  const bool object_map_enabled = (m_features & RBD_FEATURE_OBJECT_MAP) != 0;
+  if (!object_map_enabled) {
+    remove_header_object();
+    return;
+  }
+
+  ldout(m_cct, 15) << dendl;
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_remove_object_map>(this);
+  int r = m_io_ctx.aio_remove(m_objmap_name, aio_comp);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Rollback continues even when the object map removal failed; the failure is
+// only logged.
+template<typename I>
+void CreateRequest<I>::handle_remove_object_map(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  const bool cleanup_failed = (r < 0);
+  if (cleanup_failed) {
+    lderr(m_cct) << "error cleaning up object map after creation failed: "
+                 << cpp_strerror(r) << dendl;
+  }
+
+  remove_header_object();
+}
+
+// Rollback: remove the image header object.
+template<typename I>
+void CreateRequest<I>::remove_header_object() {
+  ldout(m_cct, 15) << dendl;
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_remove_header_object>(this);
+  int r = m_io_ctx.aio_remove(m_header_obj, aio_comp);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Rollback continues even when the header removal failed; the failure is
+// only logged.
+template<typename I>
+void CreateRequest<I>::handle_remove_header_object(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  const bool cleanup_failed = (r < 0);
+  if (cleanup_failed) {
+    lderr(m_cct) << "error cleaning up image header after creation failed: "
+                 << cpp_strerror(r) << dendl;
+  }
+
+  remove_id_object();
+}
+
+// Rollback: remove the id object.
+template<typename I>
+void CreateRequest<I>::remove_id_object() {
+  ldout(m_cct, 15) << dendl;
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_remove_id_object>(this);
+  int r = m_io_ctx.aio_remove(m_id_obj, aio_comp);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Rollback continues even when the id object removal failed; the failure is
+// only logged.
+template<typename I>
+void CreateRequest<I>::handle_remove_id_object(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+  const bool cleanup_failed = (r < 0);
+  if (cleanup_failed) {
+    lderr(m_cct) << "error cleaning up id object after creation failed: "
+                 << cpp_strerror(r) << dendl;
+  }
+
+  remove_from_dir();
+}
+
+// Rollback: drop the name -> id entry from RBD_DIRECTORY.
+template<typename I>
+void CreateRequest<I>::remove_from_dir() {
+  ldout(m_cct, 15) << dendl;
+
+  librados::ObjectWriteOperation dir_op;
+  cls_client::dir_remove_image(&dir_op, m_image_name, m_image_id);
+
+  auto *aio_comp = create_rados_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_remove_from_dir>(this);
+  int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &dir_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_remove_from_dir(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error cleaning up image from rbd_directory object "
+ << "after creation failed: " << cpp_strerror(r) << dendl;
+ }
+
+ complete(m_r_saved);
+}
+
+} //namespace image
+} //namespace librbd
+
+template class librbd::image::CreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/CreateRequest.h b/src/librbd/image/CreateRequest.h
new file mode 100644
index 00000000..6d5b641d
--- /dev/null
+++ b/src/librbd/image/CreateRequest.h
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+
+class ConfigProxy;
+class Context;
+class ContextWQ;
+
+using librados::IoCtx;
+
+namespace journal { class Journaler; }
+
+namespace librbd {
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class CreateRequest { // asynchronous state machine that creates an image and rolls back on failure
+public:
+ static CreateRequest *create(const ConfigProxy& config, IoCtx &ioctx,
+ const std::string &image_name,
+ const std::string &image_id, uint64_t size,
+ const ImageOptions &image_options,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ bool skip_mirror_enable,
+ ContextWQ *op_work_queue, Context *on_finish) {
+ return new CreateRequest(config, ioctx, image_name, image_id, size,
+ image_options, non_primary_global_image_id,
+ primary_mirror_uuid, skip_mirror_enable,
+ op_work_queue, on_finish); // heap-allocated; complete() deletes the request
+ }
+// Returns 0 for orders in [12, 25], -EDOM otherwise.
+ static int validate_order(CephContext *cct, uint8_t order);
+// Validates the resolved parameters, then starts the state machine below.
+ void send();
+
+private:
+ /**
+ * Forward states run down the right-hand column; when one of them fails
+ * the error is stored in m_r_saved and the cleanup states on the left undo
+ * the earlier steps bottom-up before on_finish is completed.
+ *
+ * @verbatim
+ *
+ * <start> . . . . > . . . . .
+ * | .
+ * v .
+ * VALIDATE DATA POOL v (pool validation
+ * | . disabled)
+ * v .
+ * (error: bottom up) ADD IMAGE TO DIRECTORY < . . . .
+ * _______<_______ |
+ * | | v
+ * | | CREATE ID OBJECT
+ * | | / |
+ * | REMOVE FROM DIR <-------/ v
+ * | | NEGOTIATE FEATURES (when using default features)
+ * | | |
+ * | | v (stripingv2 disabled)
+ * | | CREATE IMAGE. . . . > . . . .
+ * v | / | .
+ * | REMOVE ID OBJ <---------/ v .
+ * | | SET STRIPE UNIT COUNT .
+ * | | / | \ . . . . . > . . . .
+ * | REMOVE HEADER OBJ<------/ v /. (object-map
+ * | |\ OBJECT MAP RESIZE . . < . . * v disabled)
+ * | | \ / | \ . . . . . > . . . .
+ * | | *<-----------/ v /. (journaling
+ * | | FETCH MIRROR MODE. . < . . * v disabled)
+ * | | / | .
+ * | REMOVE OBJECT MAP<--------/ v .
+ * | |\ JOURNAL CREATE .
+ * | | \ / | .
+ * v | *<------------/ v .
+ * | | MIRROR IMAGE ENABLE .
+ * | | / | .
+ * | JOURNAL REMOVE*<-------/ | .
+ * | v .
+ * |_____________>___________________<finish> . . . . < . . . .
+ *
+ * @endverbatim
+ */
+
+ CreateRequest(const ConfigProxy& config, IoCtx &ioctx,
+ const std::string &image_name,
+ const std::string &image_id, uint64_t size,
+ const ImageOptions &image_options,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ bool skip_mirror_enable,
+ ContextWQ *op_work_queue, Context *on_finish);
+// Parameters resolved by the constructor from image_options and config.
+ const ConfigProxy& m_config;
+ IoCtx m_io_ctx; // dup() of the caller's IoCtx; used for all metadata objects
+ IoCtx m_data_io_ctx; // equals m_io_ctx unless a separate data pool is opened
+ std::string m_image_name;
+ std::string m_image_id;
+ uint64_t m_size;
+ uint8_t m_order = 0;
+ uint64_t m_features = 0;
+ uint64_t m_stripe_unit = 0;
+ uint64_t m_stripe_count = 0;
+ uint8_t m_journal_order = 0;
+ uint8_t m_journal_splay_width = 0;
+ std::string m_journal_pool;
+ std::string m_data_pool; // cleared when it names the same pool as the metadata IoCtx
+ int64_t m_data_pool_id = -1; // stays -1 unless a separate data pool is opened
+ const std::string m_non_primary_global_image_id;
+ const std::string m_primary_mirror_uuid;
+ bool m_skip_mirror_enable;
+ bool m_negotiate_features = false; // true when features came from rbd_default_features
+
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ int m_r_saved = 0; // used to return actual error after cleanup
+ bool m_force_non_primary; // true when a non-primary global image id was supplied
+ file_layout_t m_layout;
+ std::string m_id_obj, m_header_obj, m_objmap_name;
+
+ bufferlist m_outbl; // reply buffer reused by the read operations
+ rbd_mirror_mode_t m_mirror_mode = RBD_MIRROR_MODE_DISABLED;
+ cls::rbd::MirrorImage m_mirror_image_internal; // NOTE(review): not referenced in CreateRequest.cc -- confirm unused
+// Forward states: each X() issues the operation, handle_X(r) receives its result.
+ void validate_data_pool();
+ void handle_validate_data_pool(int r);
+
+ void add_image_to_directory();
+ void handle_add_image_to_directory(int r);
+
+ void create_id_object();
+ void handle_create_id_object(int r);
+
+ void negotiate_features();
+ void handle_negotiate_features(int r);
+
+ void create_image();
+ void handle_create_image(int r);
+
+ void set_stripe_unit_count();
+ void handle_set_stripe_unit_count(int r);
+
+ void object_map_resize();
+ void handle_object_map_resize(int r);
+
+ void fetch_mirror_mode();
+ void handle_fetch_mirror_mode(int r);
+
+ void journal_create();
+ void handle_journal_create(int r);
+
+ void mirror_image_enable();
+ void handle_mirror_image_enable(int r);
+// Closes the data IoCtx, deletes the request and completes on_finish with r.
+ void complete(int r);
+
+ // cleanup
+ void journal_remove();
+ void handle_journal_remove(int r);
+
+ void remove_object_map();
+ void handle_remove_object_map(int r);
+
+ void remove_header_object();
+ void handle_remove_header_object(int r);
+
+ void remove_id_object();
+ void handle_remove_id_object(int r);
+
+ void remove_from_dir();
+ void handle_remove_from_dir(int r); // ends the rollback with complete(m_r_saved)
+
+};
+
+} //namespace image
+} //namespace librbd
+
+extern template class librbd::image::CreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H
diff --git a/src/librbd/image/DetachChildRequest.cc b/src/librbd/image/DetachChildRequest.cc
new file mode 100644
index 00000000..86cd4a88
--- /dev/null
+++ b/src/librbd/image/DetachChildRequest.cc
@@ -0,0 +1,303 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/DetachChildRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/DisabledPolicy.h"
+#include <string>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::DetachChildRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Destructor: asserts that no parent image context is still held by the
+// request (the handlers reset m_parent_image_ctx to nullptr after destroying
+// it).
+template <typename I>
+DetachChildRequest<I>::~DetachChildRequest() {
+ ceph_assert(m_parent_image_ctx == nullptr);
+}
+
+// Entry point: captures the parent spec while holding the snap and parent
+// locks, then dispatches to the no-parent, clone v1 or clone v2 path.
+template <typename I>
+void DetachChildRequest<I>::send() {
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+
+    // use oldest snapshot or HEAD for parent spec
+    if (m_image_ctx.snap_info.empty()) {
+      m_parent_spec = m_image_ctx.parent_md.spec;
+    } else {
+      m_parent_spec = m_image_ctx.snap_info.begin()->second.parent.spec;
+    }
+  }
+
+  if (m_parent_spec.pool_id == -1) {
+    // ignore potential race with parent disappearing; finish(0) is invoked
+    // from the op work queue rather than inline
+    auto ctx = create_context_callback<
+      DetachChildRequest<I>,
+      &DetachChildRequest<I>::finish>(this);
+    m_image_ctx.op_work_queue->queue(ctx, 0);
+    return;
+  }
+
+  if (m_image_ctx.test_op_features(RBD_OPERATION_FEATURE_CLONE_CHILD)) {
+    clone_v2_child_detach();
+  } else {
+    clone_v1_remove_child();
+  }
+}
+
+// Clone v2: issues a child_detach against the parent image header object,
+// identifying this image by its pool id, namespace and image id.
+template <typename I>
+void DetachChildRequest<I>::clone_v2_child_detach() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  // set up an IoCtx for the parent's pool / namespace first; nothing else is
+  // needed if that fails
+  int r = util::create_ioctx(m_image_ctx.md_ctx, "parent image",
+                             m_parent_spec.pool_id,
+                             m_parent_spec.pool_namespace, &m_parent_io_ctx);
+  if (r < 0) {
+    finish(r);
+    return;
+  }
+  m_parent_header_name = util::header_name(m_parent_spec.image_id);
+
+  librados::ObjectWriteOperation op;
+  cls_client::child_detach(&op, m_parent_spec.snap_id,
+                           {m_image_ctx.md_ctx.get_id(),
+                            m_image_ctx.md_ctx.get_namespace(),
+                            m_image_ctx.id});
+
+  using klass = DetachChildRequest<I>;
+  auto aio_comp = create_rados_callback<
+    klass, &klass::handle_clone_v2_child_detach>(this);
+  r = m_parent_io_ctx.aio_operate(m_parent_header_name, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void DetachChildRequest<I>::handle_clone_v2_child_detach(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error detaching child from parent: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ clone_v2_get_snapshot();
+}
+
+// Clone v2: reads the parent snapshot record from the parent header object
+// into m_out_bl.
+template <typename I>
+void DetachChildRequest<I>::clone_v2_get_snapshot() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  // reset the reply buffer before it is reused for this read
+  m_out_bl.clear();
+
+  librados::ObjectReadOperation op;
+  cls_client::snapshot_get_start(&op, m_parent_spec.snap_id);
+
+  using klass = DetachChildRequest<I>;
+  auto aio_comp = create_rados_callback<
+    klass, &klass::handle_clone_v2_get_snapshot>(this);
+  int r = m_parent_io_ctx.aio_operate(m_parent_header_name, aio_comp, &op,
+                                      &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the snapshot read. Failures here are only logged: the request
+// still finishes with 0. The parent is opened for snapshot removal only when
+// the snapshot lives in the trash namespace and has no remaining children.
+template <typename I>
+void DetachChildRequest<I>::handle_clone_v2_get_snapshot(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r == 0) {
+    cls::rbd::SnapshotInfo snap_info;
+    auto it = m_out_bl.cbegin();
+    r = cls_client::snapshot_get_finish(&it, &snap_info);
+    if (r == 0) {
+      m_parent_snap_namespace = snap_info.snapshot_namespace;
+      m_parent_snap_name = snap_info.name;
+
+      const bool in_trash =
+        (cls::rbd::get_snap_namespace_type(m_parent_snap_namespace) ==
+           cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH);
+      if (in_trash && snap_info.child_count == 0) {
+        // snapshot is in trash w/ zero children, so remove it
+        clone_v2_open_parent();
+        return;
+      }
+    }
+  }
+
+  if (r < 0) {
+    ldout(cct, 5) << "failed to retrieve snapshot: " << cpp_strerror(r)
+                  << dendl;
+  }
+
+  finish(0);
+}
+
+// Clone v2: creates an image context for the parent (by image id) and opens
+// it with OPEN_FLAG_SKIP_OPEN_PARENT.
+template<typename I>
+void DetachChildRequest<I>::clone_v2_open_parent() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  m_parent_image_ctx = I::create("", m_parent_spec.image_id, nullptr,
+                                 m_parent_io_ctx, false);
+
+  using klass = DetachChildRequest<I>;
+  auto ctx = create_context_callback<
+    klass, &klass::handle_clone_v2_open_parent>(this);
+  m_parent_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx);
+}
+
+// Completion of the parent open. An open failure is logged and the request
+// finishes with 0; otherwise the parent is prepared and the snapshot removal
+// is started.
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v2_open_parent(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 5) << "failed to open parent for read/write: "
+                  << cpp_strerror(r) << dendl;
+    m_parent_image_ctx->destroy();
+    m_parent_image_ctx = nullptr;
+    finish(0);
+    return;
+  }
+
+  auto *parent_ictx = m_parent_image_ctx;
+
+  // do not attempt to open the parent journal when removing the trash
+  // snapshot, because the parent might not be promoted
+  if (parent_ictx->test_features(RBD_FEATURE_JOURNALING)) {
+    RWLock::WLocker snap_locker(parent_ictx->snap_lock);
+    parent_ictx->set_journal_policy(new journal::DisabledPolicy());
+  }
+
+  // disallow any proxied maintenance operations
+  {
+    RWLock::RLocker owner_lock(parent_ictx->owner_lock);
+    auto *parent_lock = parent_ictx->exclusive_lock;
+    if (parent_lock != nullptr) {
+      parent_lock->block_requests(0);
+    }
+  }
+
+  clone_v2_remove_snapshot();
+}
+
+// Clone v2: asks the opened parent image to remove the snapshot identified
+// by the namespace / name captured from the snapshot record.
+template<typename I>
+void DetachChildRequest<I>::clone_v2_remove_snapshot() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  using klass = DetachChildRequest<I>;
+  auto ctx = create_context_callback<
+    klass, &klass::handle_clone_v2_remove_snapshot>(this);
+  m_parent_image_ctx->operations->snap_remove(m_parent_snap_namespace,
+                                              m_parent_snap_name, ctx);
+}
+
+// Completion of the snapshot removal. Errors are logged only; the parent is
+// closed regardless of the outcome.
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v2_remove_snapshot(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    ldout(cct, 5) << "failed to remove trashed clone snapshot: "
+                  << cpp_strerror(r) << dendl;
+  }
+
+  clone_v2_close_parent();
+}
+
+// Clone v2: starts closing the parent image context opened earlier.
+template<typename I>
+void DetachChildRequest<I>::clone_v2_close_parent() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  using klass = DetachChildRequest<I>;
+  auto ctx = create_context_callback<
+    klass, &klass::handle_clone_v2_close_parent>(this);
+  m_parent_image_ctx->state->close(ctx);
+}
+
+// Completion of the parent close. A close failure is logged only; the parent
+// image context is destroyed and the request finishes with 0 either way.
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v2_close_parent(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    // note the space after the colon so the error text does not run into the
+    // message
+    ldout(cct, 5) << "failed to close parent image: " << cpp_strerror(r)
+                  << dendl;
+  }
+
+  // the destructor asserts that the pointer has been reset
+  m_parent_image_ctx->destroy();
+  m_parent_image_ctx = nullptr;
+  finish(0);
+}
+
+// Clone v1: removes this image from the RBD_CHILDREN object in the child's
+// own pool. The parent spec's pool namespace is blanked before the call.
+template<typename I>
+void DetachChildRequest<I>::clone_v1_remove_child() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  m_parent_spec.pool_namespace = "";
+
+  librados::ObjectWriteOperation op;
+  librbd::cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id);
+
+  using klass = DetachChildRequest<I>;
+  auto aio_comp = create_rados_callback<
+    klass, &klass::handle_clone_v1_remove_child>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the v1 child removal: -ENOENT counts as success, any other
+// error is reported to the caller.
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v1_remove_child(int r) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to remove child from children list: "
+               << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  finish(0);
+}
+
+// Terminal state: completes the caller's context with r and then deletes
+// this request, so no member may be touched after this call returns.
+template <typename I>
+void DetachChildRequest<I>::finish(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::DetachChildRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/DetachChildRequest.h b/src/librbd/image/DetachChildRequest.h
new file mode 100644
index 00000000..bb0c81d2
--- /dev/null
+++ b/src/librbd/image/DetachChildRequest.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+// Asynchronous request that detaches a cloned (child) image from its parent.
+// Depending on the image's op features it either removes the child from the
+// clone v1 children list or runs the clone v2 child-detach sequence shown in
+// the state diagram below. The request deletes itself when it finishes.
+template <typename ImageCtxT = ImageCtx>
+class DetachChildRequest {
+public:
+ // Heap-allocates a request for image_ctx; on_finish is completed with the
+ // final result.
+ static DetachChildRequest* create(ImageCtxT& image_ctx, Context* on_finish) {
+ return new DetachChildRequest(image_ctx, on_finish);
+ }
+
+ DetachChildRequest(ImageCtxT& image_ctx, Context* on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+ }
+ ~DetachChildRequest();
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * (v1) | (v2)
+ * /--------------/ \--------------\
+ * | |
+ * v v
+ * REMOVE_CHILD CHILD_DETACH
+ * | |
+ * | v
+ * | GET_SNAPSHOT
+ * | (snapshot in-use) . |
+ * |/. . . . . . . . . . . . . . . |
+ * | v
+ * | OPEN_PARENT
+ * | |
+ * | v
+ * | REMOVE_SNAPSHOT
+ * | |
+ * | v
+ * | CLOSE_PARENT
+ * | |
+ * |/------------------------------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT& m_image_ctx; // the child image being detached
+ Context* m_on_finish; // completed with the final result
+
+ librados::IoCtx m_parent_io_ctx; // IoCtx for the parent's pool/namespace
+ cls::rbd::ParentImageSpec m_parent_spec; // parent captured in send()
+ std::string m_parent_header_name; // parent header object name
+
+ // namespace / name of the parent snapshot, read in GET_SNAPSHOT
+ cls::rbd::SnapshotNamespace m_parent_snap_namespace;
+ std::string m_parent_snap_name;
+
+ // parent image context; non-null only between OPEN_PARENT and CLOSE_PARENT
+ ImageCtxT* m_parent_image_ctx = nullptr;
+
+ ceph::bufferlist m_out_bl; // reply buffer for the snapshot read
+
+ void clone_v2_child_detach();
+ void handle_clone_v2_child_detach(int r);
+
+ void clone_v2_get_snapshot();
+ void handle_clone_v2_get_snapshot(int r);
+
+ void clone_v2_open_parent();
+ void handle_clone_v2_open_parent(int r);
+
+ void clone_v2_remove_snapshot();
+ void handle_clone_v2_remove_snapshot(int r);
+
+ void clone_v2_close_parent();
+ void handle_clone_v2_close_parent(int r);
+
+ void clone_v1_remove_child();
+ void handle_clone_v1_remove_child(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::DetachChildRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H
diff --git a/src/librbd/image/DetachParentRequest.cc b/src/librbd/image/DetachParentRequest.cc
new file mode 100644
index 00000000..40ff44b7
--- /dev/null
+++ b/src/librbd/image/DetachParentRequest.cc
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/DetachParentRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::DetachParentRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Entry point: the request has a single state, so this just starts it.
+template <typename I>
+void DetachParentRequest<I>::send() {
+ detach_parent();
+}
+
+// Issues the parent removal against the image header object, using
+// parent_detach unless the legacy fallback (remove_parent) has been selected.
+template <typename I>
+void DetachParentRequest<I>::detach_parent() {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  librados::ObjectWriteOperation op;
+  if (m_legacy_parent) {
+    librbd::cls_client::remove_parent(&op);
+  } else {
+    librbd::cls_client::parent_detach(&op);
+  }
+
+  using klass = DetachParentRequest<I>;
+  auto aio_comp = create_rados_callback<
+    klass, &klass::handle_detach_parent>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the detach op.
+//  * -EOPNOTSUPP on the first attempt retries once using the legacy method.
+//  * -ENOENT is treated as success.
+//  * any other error is reported to the caller.
+template <typename I>
+void DetachParentRequest<I>::handle_detach_parent(int r) {
+  auto cct = m_image_ctx.cct;
+  // log the result code, as the other completion handlers do
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (!m_legacy_parent && r == -EOPNOTSUPP) {
+    ldout(cct, 10) << "retrying using legacy parent method" << dendl;
+    m_legacy_parent = true;
+    detach_parent();
+    return;
+  }
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "detach parent encountered an error: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+
+  finish(0);
+}
+
+// Terminal state: completes the caller's context with r and then deletes
+// this request, so no member may be touched after this call returns.
+template <typename I>
+void DetachParentRequest<I>::finish(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::DetachParentRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/DetachParentRequest.h b/src/librbd/image/DetachParentRequest.h
new file mode 100644
index 00000000..17c86aaa
--- /dev/null
+++ b/src/librbd/image/DetachParentRequest.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+// Asynchronous request that removes the parent link from an image's header
+// object, retrying with the legacy call when the first attempt returns
+// -EOPNOTSUPP. The request deletes itself when it finishes.
+template <typename ImageCtxT = ImageCtx>
+class DetachParentRequest {
+public:
+ // Heap-allocates a request for image_ctx; on_finish is completed with the
+ // final result.
+ static DetachParentRequest* create(ImageCtxT& image_ctx, Context* on_finish) {
+ return new DetachParentRequest(image_ctx, on_finish);
+ }
+
+ DetachParentRequest(ImageCtxT& image_ctx, Context* on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+ }
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | * * * * * *
+ * | * * -EOPNOTSUPP
+ * v v *
+ * DETACH_PARENT * * *
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT& m_image_ctx; // image whose parent link is removed
+ Context* m_on_finish; // completed with the final result
+
+ // set once the non-legacy call has returned -EOPNOTSUPP
+ bool m_legacy_parent = false;
+
+ void detach_parent();
+ void handle_detach_parent(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::DetachParentRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H
diff --git a/src/librbd/image/ListWatchersRequest.cc b/src/librbd/image/ListWatchersRequest.cc
new file mode 100644
index 00000000..e93fffc1
--- /dev/null
+++ b/src/librbd/image/ListWatchersRequest.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ListWatchersRequest.h"
+#include "common/RWLock.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Utils.h"
+
+#include <algorithm>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::ListWatchersRequest: " << this \
+ << " " << __func__ << ": "
+
+// Streams a watcher as "{addr=..., watcher_id=..., cookie=...}" for logging.
+static std::ostream& operator<<(std::ostream& os, const obj_watch_t& watch) {
+  os << "{addr=" << watch.addr << ", ";
+  os << "watcher_id=" << watch.watcher_id << ", ";
+  os << "cookie=" << watch.cookie << "}";
+  return os;
+}
+
+namespace librbd {
+namespace image {
+
+using librados::IoCtx;
+using util::create_rados_callback;
+
+// Stores the request parameters; `flags` is a bitmask of the
+// LIST_WATCHERS_FILTER_OUT_* values and `watchers` receives the filtered
+// result when the request finishes successfully.
+template<typename I>
+ListWatchersRequest<I>::ListWatchersRequest(I &image_ctx, int flags,
+ std::list<obj_watch_t> *watchers,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_flags(flags), m_watchers(watchers),
+ m_on_finish(on_finish), m_cct(m_image_ctx.cct) {
+}
+
+// Entry point: starts by listing the watchers of the image header object.
+template<typename I>
+void ListWatchersRequest<I>::send() {
+ ldout(m_cct, 20) << dendl;
+
+ list_image_watchers();
+}
+
+// Asynchronously lists the watchers of the image header object into
+// m_object_watchers; the per-op result lands in m_ret_val.
+template<typename I>
+void ListWatchersRequest<I>::list_image_watchers() {
+  ldout(m_cct, 20) << dendl;
+
+  librados::ObjectReadOperation op;
+  op.list_watchers(&m_object_watchers, &m_ret_val);
+
+  auto *aio_comp = create_rados_callback<
+    ListWatchersRequest<I>,
+    &ListWatchersRequest<I>::handle_list_image_watchers>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the image watcher listing: any failure (of the operation or
+// of the list_watchers op itself) ends the request with that error.
+template<typename I>
+void ListWatchersRequest<I>::handle_list_image_watchers(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  // fold the per-op result into the overall result
+  const int ret = (r == 0 && m_ret_val < 0) ? m_ret_val : r;
+  if (ret < 0) {
+    lderr(m_cct) << "error listing image watchers: " << cpp_strerror(ret)
+                 << dendl;
+    finish(ret);
+    return;
+  }
+
+  ldout(m_cct, 20) << "object_watchers=" << m_object_watchers << dendl;
+  list_mirror_watchers();
+}
+
+// Lists the watchers of the RBD_MIRRORING object into m_mirror_watchers.
+// Skipped (finishing with 0) when there are no image watchers, mirror
+// filtering was not requested, or the image lacks the journaling feature.
+template<typename I>
+void ListWatchersRequest<I>::list_mirror_watchers() {
+  const bool filter_mirror =
+    (m_flags & LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES) != 0;
+  const bool journaling =
+    (m_image_ctx.features & RBD_FEATURE_JOURNALING) != 0;
+  if (m_object_watchers.empty() || !filter_mirror || !journaling) {
+    finish(0);
+    return;
+  }
+
+  ldout(m_cct, 20) << dendl;
+
+  librados::ObjectReadOperation op;
+  op.list_watchers(&m_mirror_watchers, &m_ret_val);
+
+  auto *aio_comp = create_rados_callback<
+    ListWatchersRequest<I>,
+    &ListWatchersRequest<I>::handle_list_mirror_watchers>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+                                         &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the mirror watcher listing. Errors are logged (except
+// -ENOENT) but never fail the request.
+template<typename I>
+void ListWatchersRequest<I>::handle_list_mirror_watchers(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  // fold the per-op result into the overall result
+  const int ret = (r == 0 && m_ret_val < 0) ? m_ret_val : r;
+  if (ret < 0 && ret != -ENOENT) {
+    ldout(m_cct, 1) << "error listing mirror watchers: " << cpp_strerror(ret)
+                    << dendl;
+  }
+
+  ldout(m_cct, 20) << "mirror_watchers=" << m_mirror_watchers << dendl;
+  finish(0);
+}
+
+// Terminal state. On success the caller's list is rebuilt from the image
+// watchers, dropping
+//  * this client's own watch (matched by cookie against the image watcher's
+//    watch handle) when LIST_WATCHERS_FILTER_OUT_MY_INSTANCE is set, and
+//  * watchers whose address also appears among the mirror watchers when
+//    LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES is set.
+// The caller's context is then completed with r and the request deletes
+// itself.
+template<typename I>
+void ListWatchersRequest<I>::finish(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r == 0) {
+    m_watchers->clear();
+
+    if (!m_object_watchers.empty()) {
+      RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+      uint64_t watch_handle = m_image_ctx.image_watcher != nullptr ?
+        m_image_ctx.image_watcher->get_watch_handle() : 0;
+
+      for (auto &w : m_object_watchers) {
+        if ((m_flags & LIST_WATCHERS_FILTER_OUT_MY_INSTANCE) != 0) {
+          if (w.cookie == watch_handle) {
+            ldout(m_cct, 20) << "filtering out my instance: " << w << dendl;
+            continue;
+          }
+        }
+        if ((m_flags & LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES) != 0) {
+          // capture by reference: the predicate only lives for this call, so
+          // there is no need to copy the watcher record for every lookup
+          auto it = std::find_if(m_mirror_watchers.begin(),
+                                 m_mirror_watchers.end(),
+                                 [&w] (const obj_watch_t &watcher) {
+                                   return (strncmp(w.addr, watcher.addr,
+                                                   sizeof(w.addr)) == 0);
+                                 });
+          if (it != m_mirror_watchers.end()) {
+            ldout(m_cct, 20) << "filtering out mirror instance: " << w << dendl;
+            continue;
+          }
+        }
+        m_watchers->push_back(w);
+      }
+    }
+  }
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::ListWatchersRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/ListWatchersRequest.h b/src/librbd/image/ListWatchersRequest.h
new file mode 100644
index 00000000..941bf728
--- /dev/null
+++ b/src/librbd/image/ListWatchersRequest.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H
+
+#include "include/rados/rados_types.hpp"
+
+#include <list>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+// Bit flags accepted by ListWatchersRequest to drop entries from the result.
+enum {
+ LIST_WATCHERS_FILTER_OUT_MY_INSTANCE = 1 << 0, // drop this client's own watch
+ LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES = 1 << 1, // drop watchers that also watch the mirroring object
+};
+
+// Asynchronous request that lists the watchers of an image's header object,
+// optionally filtering out this client's own watch and mirror watchers (see
+// the LIST_WATCHERS_FILTER_OUT_* flags). The request deletes itself when it
+// finishes.
+template<typename ImageCtxT = ImageCtx>
+class ListWatchersRequest {
+public:
+ // Heap-allocates a request; `watchers` is overwritten with the filtered
+ // result on success and on_finish is completed with the final result.
+ static ListWatchersRequest *create(ImageCtxT &image_ctx, int flags,
+ std::list<obj_watch_t> *watchers,
+ Context *on_finish) {
+ return new ListWatchersRequest(image_ctx, flags, watchers, on_finish);
+ }
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * LIST_IMAGE_WATCHERS
+ * |
+ * v
+ * LIST_MIRROR_WATCHERS (skip if not needed)
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ // private: instances are created through create()
+ ListWatchersRequest(ImageCtxT &image_ctx, int flags, std::list<obj_watch_t> *watchers,
+ Context *on_finish);
+
+ ImageCtxT& m_image_ctx;
+ int m_flags; // bitmask of LIST_WATCHERS_FILTER_OUT_* values
+ std::list<obj_watch_t> *m_watchers; // caller-owned output list
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ int m_ret_val; // per-op result of the most recent list_watchers op
+ bufferlist m_out_bl;
+ std::list<obj_watch_t> m_object_watchers; // watchers of the image header
+ std::list<obj_watch_t> m_mirror_watchers; // watchers of the mirroring object
+
+ void list_image_watchers();
+ void handle_list_image_watchers(int r);
+
+ void list_mirror_watchers();
+ void handle_list_mirror_watchers(int r);
+
+ void finish(int r);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::ListWatchersRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H
diff --git a/src/librbd/image/OpenRequest.cc b/src/librbd/image/OpenRequest.cc
new file mode 100644
index 00000000..6f5f3083
--- /dev/null
+++ b/src/librbd/image/OpenRequest.cc
@@ -0,0 +1,662 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/OpenRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ConfigWatcher.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/ObjectCacherObjectDispatch.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/image/SetSnapRequest.h"
+#include <boost/algorithm/string/predicate.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::OpenRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Records the open parameters. OPEN_FLAG_SKIP_OPEN_PARENT is kept on the
+// request itself, while OPEN_FLAG_OLD_FORMAT and OPEN_FLAG_IGNORE_MIGRATING
+// are applied directly to the image context.
+template <typename I>
+OpenRequest<I>::OpenRequest(I *image_ctx, uint64_t flags,
+ Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_skip_open_parent_image(flags & OPEN_FLAG_SKIP_OPEN_PARENT),
+ m_on_finish(on_finish), m_error_result(0) {
+ if ((flags & OPEN_FLAG_OLD_FORMAT) != 0) {
+ m_image_ctx->old_format = true;
+ }
+ if ((flags & OPEN_FLAG_IGNORE_MIGRATING) != 0) {
+ m_image_ctx->ignore_migrating = true;
+ }
+}
+
+// Entry point: starts with the format 2 header probe unless the image
+// context is already flagged as old format.
+template <typename I>
+void OpenRequest<I>::send() {
+  if (!m_image_ctx->old_format) {
+    send_v2_detect_header();
+    return;
+  }
+  send_v1_detect_header();
+}
+
+// Stats the format 1 header object (named after the image) to find out
+// whether it exists; none of the stat outputs are requested.
+template <typename I>
+void OpenRequest<I>::send_v1_detect_header() {
+  librados::ObjectReadOperation op;
+  op.stat(nullptr, nullptr, nullptr);
+
+  auto *comp = create_rados_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_v1_detect_header>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(util::old_header_name(m_image_ctx->name),
+                                  comp, &op, &m_out_bl);
+  comp->release();
+}
+
+// Completion of the format 1 probe. Failure closes the image (-ENOENT is not
+// logged as an error); success marks the image as old format and refreshes.
+template <typename I>
+Context *OpenRequest<I>::handle_v1_detect_header(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    if (*result != -ENOENT) {
+      lderr(cct) << "failed to stat image header: " << cpp_strerror(*result)
+                 << dendl;
+    }
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  ldout(cct, 1) << "RBD image format 1 is deprecated. "
+                << "Please copy this image to image format 2." << dendl;
+
+  m_image_ctx->old_format = true;
+  m_image_ctx->header_oid = util::old_header_name(m_image_ctx->name);
+  m_image_ctx->apply_metadata({}, true);
+
+  send_refresh();
+  return nullptr;
+}
+
+// When the image id is already known, goes straight to the name lookup;
+// otherwise stats the format 2 id object derived from the image name.
+template <typename I>
+void OpenRequest<I>::send_v2_detect_header() {
+  if (!m_image_ctx->id.empty()) {
+    send_v2_get_name();
+    return;
+  }
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  op.stat(nullptr, nullptr, nullptr);
+
+  auto *comp = create_rados_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_v2_detect_header>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name),
+                                  comp, &op, &m_out_bl);
+  comp->release();
+}
+
+// Completion of the format 2 probe: a missing id object falls back to the
+// format 1 probe, other errors close the image, success fetches the image id.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_detect_header(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result == -ENOENT) {
+    send_v1_detect_header();
+    return nullptr;
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to stat v2 image header: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  m_image_ctx->old_format = false;
+  send_v2_get_id();
+  return nullptr;
+}
+
+// Reads the image id from the format 2 id object.
+template <typename I>
+void OpenRequest<I>::send_v2_get_id() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::get_id_start(&op);
+
+  auto *comp = create_rados_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_v2_get_id>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name),
+                                  comp, &op, &m_out_bl);
+  comp->release();
+}
+
+// Completion of the id read: decodes the id into the image context and moves
+// on to the initial metadata, or closes the image on any failure.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_id(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    *result = cls_client::get_id_finish(&it, &m_image_ctx->id);
+  }
+
+  if (*result >= 0) {
+    send_v2_get_initial_metadata();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to retrieve image id: " << cpp_strerror(*result)
+             << dendl;
+  send_close_image(*result);
+  return nullptr;
+}
+
+// Looks up the image name for the known image id in the RBD directory.
+template <typename I>
+void OpenRequest<I>::send_v2_get_name() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::dir_get_name_start(&op, m_image_ctx->id);
+
+  auto *comp = create_rados_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_v2_get_name>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_out_bl);
+  comp->release();
+}
+
+// Completion of the directory lookup: -ENOENT falls back to the trash
+// lookup, other errors close the image, success continues with the initial
+// metadata.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_name(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    *result = cls_client::dir_get_name_finish(&it, &m_image_ctx->name);
+  }
+
+  if (*result == -ENOENT) {
+    // image does not exist in directory, look in the trash bin
+    ldout(cct, 10) << "image id " << m_image_ctx->id << " does not exist in "
+                   << "rbd directory, searching in rbd trash..." << dendl;
+    send_v2_get_name_from_trash();
+  } else if (*result < 0) {
+    lderr(cct) << "failed to retrieve name: "
+               << cpp_strerror(*result) << dendl;
+    send_close_image(*result);
+  } else {
+    send_v2_get_initial_metadata();
+  }
+  return nullptr;
+}
+
+// Looks up the trash entry for the known image id in the RBD trash object.
+template <typename I>
+void OpenRequest<I>::send_v2_get_name_from_trash() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::trash_get_start(&op, m_image_ctx->id);
+
+  auto *comp = create_rados_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_v2_get_name_from_trash>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(RBD_TRASH, comp, &op, &m_out_bl);
+  comp->release();
+}
+
+// Completion of the trash lookup. On success the image name is taken from
+// the trash entry and the initial metadata is fetched. On failure the image
+// is closed; -EOPNOTSUPP is reported as -ENOENT, and -ENOENT is logged at a
+// lower severity than other errors.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_name_from_trash(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  cls::rbd::TrashImageSpec trash_spec;
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    *result = cls_client::trash_get_finish(&it, &trash_spec);
+    if (*result == 0) {
+      // only adopt the name once the trash entry decoded cleanly, so a
+      // failed decode does not overwrite the existing name
+      m_image_ctx->name = trash_spec.name;
+    }
+  }
+  if (*result < 0) {
+    if (*result == -EOPNOTSUPP) {
+      *result = -ENOENT;
+    }
+    if (*result == -ENOENT) {
+      ldout(cct, 5) << "failed to retrieve name for image id "
+                    << m_image_ctx->id << dendl;
+    } else {
+      lderr(cct) << "failed to retrieve name from trash: "
+                 << cpp_strerror(*result) << dendl;
+    }
+    send_close_image(*result);
+  } else {
+    send_v2_get_initial_metadata();
+  }
+
+  return nullptr;
+}
+
+// Marks the image as format 2, derives its header object name from the id
+// and reads size, object prefix and features in a single operation.
+template <typename I>
+void OpenRequest<I>::send_v2_get_initial_metadata() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_image_ctx->old_format = false;
+  m_image_ctx->header_oid = util::header_name(m_image_ctx->id);
+
+  // the completion handler decodes the replies in this same order
+  librados::ObjectReadOperation op;
+  cls_client::get_size_start(&op, CEPH_NOSNAP);
+  cls_client::get_object_prefix_start(&op);
+  cls_client::get_features_start(&op, true);
+
+  auto *comp = create_rados_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_v2_get_initial_metadata>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+                                  &m_out_bl);
+  comp->release();
+}
+
+// Completion of the initial metadata read: decodes size/order, object prefix
+// and features in sequence, stopping at the first failure. Any failure
+// closes the image; otherwise the striping parameters are fetched when the
+// striping v2 feature is set, else the create timestamp.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_initial_metadata(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  auto it = m_out_bl.cbegin();
+  if (*result >= 0) {
+    // the size itself is not stored; only the order is kept
+    uint64_t size;
+    *result = cls_client::get_size_finish(&it, &size, &m_image_ctx->order);
+
+    if (*result >= 0) {
+      *result = cls_client::get_object_prefix_finish(
+        &it, &m_image_ctx->object_prefix);
+    }
+
+    if (*result >= 0) {
+      uint64_t incompatible_features;
+      *result = cls_client::get_features_finish(&it, &m_image_ctx->features,
+                                                &incompatible_features);
+    }
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve initial metadata: "
+               << cpp_strerror(*result) << dendl;
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  if (!m_image_ctx->test_features(RBD_FEATURE_STRIPINGV2)) {
+    send_v2_get_create_timestamp();
+  } else {
+    send_v2_get_stripe_unit_count();
+  }
+
+  return nullptr;
+}
+
+// Reads the stripe unit / stripe count from the image header object.
+template <typename I>
+void OpenRequest<I>::send_v2_get_stripe_unit_count() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::get_stripe_unit_count_start(&op);
+
+  auto *comp = create_rados_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_v2_get_stripe_unit_count>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+                                  &m_out_bl);
+  comp->release();
+}
+
+// Completion of the striping read: -ENOEXEC and -EINVAL are treated as
+// success, other errors close the image; then the create timestamp is
+// fetched.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_stripe_unit_count(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    *result = cls_client::get_stripe_unit_count_finish(
+      &it, &m_image_ctx->stripe_unit, &m_image_ctx->stripe_count);
+  }
+
+  switch (*result) {
+  case -ENOEXEC:
+  case -EINVAL:
+    *result = 0;
+    break;
+  default:
+    break;
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to read striping metadata: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  send_v2_get_create_timestamp();
+  return nullptr;
+}
+
+// Reads the image create timestamp from the image header object.
+template <typename I>
+void OpenRequest<I>::send_v2_get_create_timestamp() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::get_create_timestamp_start(&op);
+
+  auto *comp = create_rados_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_v2_get_create_timestamp>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+                                  &m_out_bl);
+  comp->release();
+}
+
+// Completion of the create timestamp read: -EOPNOTSUPP is tolerated, other
+// errors close the image; then the access/modify timestamps are fetched.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_create_timestamp(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    *result = cls_client::get_create_timestamp_finish(
+      &it, &m_image_ctx->create_timestamp);
+  }
+
+  const bool fatal = (*result < 0 && *result != -EOPNOTSUPP);
+  if (fatal) {
+    lderr(cct) << "failed to retrieve create_timestamp: "
+               << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  send_v2_get_access_modify_timestamp();
+  return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_access_modify_timestamp() {
+  // Fetch the access and modify timestamps with one compound read of the
+  // header object; both replies are appended to m_out_bl in that order.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_out_bl.clear();
+
+  librados::ObjectReadOperation read_op;
+  cls_client::get_access_timestamp_start(&read_op);
+  cls_client::get_modify_timestamp_start(&read_op);
+  //TODO: merge w/ create timestamp query after luminous EOLed
+
+  using self_t = OpenRequest<I>;
+  auto aio_comp = create_rados_callback<
+    self_t, &self_t::handle_v2_get_access_modify_timestamp>(this);
+  m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, &read_op,
+                                  &m_out_bl);
+  aio_comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_access_modify_timestamp(int *result) {
+  // Decode the access timestamp followed by the modify timestamp (same
+  // iterator, same order as they were requested), then query the data pool.
+  // -EOPNOTSUPP is not treated as a failure.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    *result = cls_client::get_access_timestamp_finish(
+      &bl_it, &m_image_ctx->access_timestamp);
+    if (*result == 0) {
+      *result = cls_client::get_modify_timestamp_finish(
+        &bl_it, &m_image_ctx->modify_timestamp);
+    }
+  }
+
+  const bool fatal = (*result < 0) && (*result != -EOPNOTSUPP);
+  if (!fatal) {
+    send_v2_get_data_pool();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to retrieve access/modify_timestamp: "
+             << cpp_strerror(*result)
+             << dendl;
+  send_close_image(*result);
+  return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_data_pool() {
+  // Asynchronously ask the header object which pool holds the image data;
+  // handle_v2_get_data_pool() decodes the reply.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_out_bl.clear();
+
+  librados::ObjectReadOperation read_op;
+  cls_client::get_data_pool_start(&read_op);
+
+  using self_t = OpenRequest<I>;
+  auto aio_comp = create_rados_callback<
+    self_t, &self_t::handle_v2_get_data_pool>(this);
+  m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, &read_op,
+                                  &m_out_bl);
+  aio_comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_data_pool(int *result) {
+  // Resolve the pool used for data objects, set up the data IoCtx and the
+  // file layout, then move on to the refresh state.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  int64_t data_pool_id = -1;
+  switch (*result) {
+  case 0:
+    {
+      auto bl_it = m_out_bl.cbegin();
+      *result = cls_client::get_data_pool_finish(&bl_it, &data_pool_id);
+    }
+    break;
+  case -EOPNOTSUPP:
+    // not an error: data_pool_id stays -1 and the metadata pool is used
+    *result = 0;
+    break;
+  default:
+    break;
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to read data pool: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  if (data_pool_id == -1) {
+    // no separate data pool recorded: data lives in the metadata pool
+    data_pool_id = m_image_ctx->md_ctx.get_id();
+  } else {
+    *result = util::create_ioctx(m_image_ctx->md_ctx, "data pool", data_pool_id,
+                                 {}, &m_image_ctx->data_ctx);
+    if (*result >= 0) {
+      m_image_ctx->data_ctx.set_namespace(m_image_ctx->md_ctx.get_namespace());
+    } else if (*result == -ENOENT) {
+      // tolerated: continue opening with a closed data IoCtx
+      m_image_ctx->data_ctx.close();
+    } else {
+      send_close_image(*result);
+      return nullptr;
+    }
+  }
+
+  m_image_ctx->init_layout(data_pool_id);
+  send_refresh();
+  return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_refresh() {
+  // Initialize the image context, start its config watcher and kick off a
+  // RefreshRequest whose completion is routed to handle_refresh().
+  m_image_ctx->init();
+
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_image_ctx->config_watcher = ConfigWatcher<I>::create(*m_image_ctx);
+  m_image_ctx->config_watcher->init();
+
+  using self_t = OpenRequest<I>;
+  Context *on_refresh =
+    create_context_callback<self_t, &self_t::handle_refresh>(this);
+  auto refresh_req = RefreshRequest<I>::create(
+    *m_image_ctx, false, m_skip_open_parent_image, on_refresh);
+  refresh_req->send();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_refresh(int *result) {
+  // Completion of the RefreshRequest.  On failure the image is closed and
+  // nullptr is returned; otherwise the result of send_init_cache() (the next
+  // state) is returned to the caller.
+  CephContext *cct = m_image_ctx->cct;
+  // log the request pointer like the sibling handlers do
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to refresh image: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  return send_init_cache(result);
+}
+
+template <typename I>
+Context *OpenRequest<I>::send_init_cache(int *result) {
+  // The object cache is only set up when caching is enabled, this is not a
+  // parent image context (no child attached) and the data IoCtx is usable.
+  const bool use_cache = m_image_ctx->cache &&
+                         m_image_ctx->child == nullptr &&
+                         m_image_ctx->data_ctx.is_valid();
+  if (use_cache) {
+    auto cct = m_image_ctx->cct;
+    ldout(cct, 10) << this << " " << __func__ << dendl;
+
+    auto object_cache =
+      cache::ObjectCacherObjectDispatch<I>::create(m_image_ctx);
+    object_cache->init();
+
+    // readahead requires the cache
+    const auto trigger_requests = m_image_ctx->config.template get_val<uint64_t>(
+      "rbd_readahead_trigger_requests");
+    m_image_ctx->readahead.set_trigger_requests(trigger_requests);
+
+    const auto max_readahead = m_image_ctx->config.template get_val<Option::size_t>(
+      "rbd_readahead_max_bytes");
+    m_image_ctx->readahead.set_max_readahead_size(max_readahead);
+  }
+
+  return send_register_watch(result);
+}
+
+template <typename I>
+Context *OpenRequest<I>::send_register_watch(int *result) {
+  // Read-only images skip the watch and go straight to snapshot selection.
+  if (!m_image_ctx->read_only) {
+    auto cct = m_image_ctx->cct;
+    ldout(cct, 10) << this << " " << __func__ << dendl;
+
+    using self_t = OpenRequest<I>;
+    Context *on_registered = create_context_callback<
+      self_t, &self_t::handle_register_watch>(this);
+    m_image_ctx->register_watch(on_registered);
+    return nullptr;
+  }
+
+  return send_set_snap(result);
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_register_watch(int *result) {
+  // Completion of the header watch registration.  Any failure closes the
+  // image; -EPERM is only logged at debug level instead of as an error.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    return send_set_snap(result);
+  }
+
+  if (*result == -EPERM) {
+    ldout(cct, 5) << "user does not have write permission" << dendl;
+  } else {
+    lderr(cct) << "failed to register watch: " << cpp_strerror(*result)
+               << dendl;
+  }
+  send_close_image(*result);
+  return nullptr;
+}
+
+template <typename I>
+Context *OpenRequest<I>::send_set_snap(int *result) {
+  // Nothing to do when neither a snapshot name nor a snapshot id was
+  // requested: the open completes successfully right here.
+  const bool no_snap_requested = m_image_ctx->snap_name.empty() &&
+                                 m_image_ctx->open_snap_id == CEPH_NOSNAP;
+  if (no_snap_requested) {
+    *result = 0;
+    return m_on_finish;
+  }
+
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  // take the requested id and reset open_snap_id to CEPH_NOSNAP
+  uint64_t target_snap_id = CEPH_NOSNAP;
+  std::swap(m_image_ctx->open_snap_id, target_snap_id);
+
+  if (target_snap_id == CEPH_NOSNAP) {
+    // no explicit id: resolve the snapshot by namespace + name
+    RWLock::RLocker snap_locker(m_image_ctx->snap_lock);
+    target_snap_id = m_image_ctx->get_snap_id(m_image_ctx->snap_namespace,
+                                              m_image_ctx->snap_name);
+  }
+
+  if (target_snap_id == CEPH_NOSNAP) {
+    lderr(cct) << "failed to find snapshot " << m_image_ctx->snap_name << dendl;
+    send_close_image(-ENOENT);
+    return nullptr;
+  }
+
+  using self_t = OpenRequest<I>;
+  Context *on_set = create_context_callback<
+    self_t, &self_t::handle_set_snap>(this);
+  auto set_snap_req = SetSnapRequest<I>::create(*m_image_ctx, target_snap_id,
+                                                on_set);
+  set_snap_req->send();
+  return nullptr;
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_set_snap(int *result) {
+  // Completion of the SetSnapRequest.  On success the open is finished and
+  // m_on_finish is returned; on failure the image is closed first.
+  CephContext *cct = m_image_ctx->cct;
+  // log the request pointer like the sibling handlers do
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to set image snapshot: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  return m_on_finish;
+}
+
+template <typename I>
+void OpenRequest<I>::send_close_image(int error_result) {
+  // Error path: remember the original failure, then close the partially
+  // opened image; handle_close_image() reports the saved error.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_error_result = error_result;
+
+  using self_t = OpenRequest<I>;
+  Context *on_closed =
+    create_context_callback<self_t, &self_t::handle_close_image>(this);
+  auto close_req = CloseRequest<I>::create(m_image_ctx, on_closed);
+  close_req->send();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_close_image(int *result) {
+  // Completion of the error-path close.  A close failure is only logged; the
+  // error saved by send_close_image() takes precedence in *result.
+  CephContext *cct = m_image_ctx->cct;
+  // log the request pointer like the sibling handlers do
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to close image: " << cpp_strerror(*result) << dendl;
+  }
+  if (m_error_result < 0) {
+    *result = m_error_result;
+  }
+  return m_on_finish;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::OpenRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/OpenRequest.h b/src/librbd/image/OpenRequest.h
new file mode 100644
index 00000000..3476832a
--- /dev/null
+++ b/src/librbd/image/OpenRequest.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
+
+#include "include/buffer.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+/**
+ * Asynchronous state machine that opens an image: it reads the image
+ * metadata step by step, refreshes the image, optionally initializes the
+ * cache, registers the header watch and selects the requested snapshot.
+ * Instances are heap-allocated via create() and started with send().
+ */
+template <typename ImageCtxT = ImageCtx>
+class OpenRequest {
+public:
+ // Allocate a new request; on_finish is the context handed back by the
+ // final state once the open (or the error-path close) has finished.
+ static OpenRequest *create(ImageCtxT *image_ctx, uint64_t flags,
+ Context *on_finish) {
+ return new OpenRequest(image_ctx, flags, on_finish);
+ }
+
+ // Start the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | (v1)
+ * |-----> V1_DETECT_HEADER
+ * | |
+ * | \-------------------------------\
+ * | (v2) |
+ * \-----> V2_DETECT_HEADER |
+ * | |
+ * v |
+ * V2_GET_ID|NAME |
+ * | |
+ * v (skip if have name) |
+ * V2_GET_NAME_FROM_TRASH |
+ * | |
+ * v |
+ * V2_GET_INITIAL_METADATA |
+ * | |
+ * v |
+ * V2_GET_STRIPE_UNIT_COUNT (skip if |
+ * | disabled) |
+ * v |
+ * V2_GET_CREATE_TIMESTAMP |
+ * | |
+ * v |
+ * V2_GET_ACCESS_MODIFY_TIMESTAMP |
+ * | |
+ * v |
+ * V2_GET_DATA_POOL --------------> REFRESH
+ * |
+ * v
+ * INIT_CACHE
+ * |
+ * v
+ * REGISTER_WATCH (skip if
+ * | read-only)
+ * v
+ * SET_SNAP (skip if no snap)
+ * |
+ * v
+ * <finish>
+ * ^
+ * (on error) |
+ * * * * * * * > CLOSE ------------------------/
+ *
+ * @endverbatim
+ */
+
+ OpenRequest(ImageCtxT *image_ctx, uint64_t flags, Context *on_finish);
+
+ // image being opened
+ ImageCtxT *m_image_ctx;
+ // forwarded to the RefreshRequest created by send_refresh()
+ bool m_skip_open_parent_image;
+ // completion returned by the terminal states
+ Context *m_on_finish;
+
+ // scratch buffer receiving the reply of the in-flight header read
+ bufferlist m_out_bl;
+ // first error, saved by send_close_image() and reported after the close
+ int m_error_result;
+
+ // Each send_*() issues one asynchronous step; the matching handle_*()
+ // receives its result and returns either nullptr (another step was
+ // started) or the context the caller should complete.
+ void send_v1_detect_header();
+ Context *handle_v1_detect_header(int *result);
+
+ void send_v2_detect_header();
+ Context *handle_v2_detect_header(int *result);
+
+ void send_v2_get_id();
+ Context *handle_v2_get_id(int *result);
+
+ void send_v2_get_name();
+ Context *handle_v2_get_name(int *result);
+
+ void send_v2_get_name_from_trash();
+ Context *handle_v2_get_name_from_trash(int *result);
+
+ void send_v2_get_initial_metadata();
+ Context *handle_v2_get_initial_metadata(int *result);
+
+ void send_v2_get_stripe_unit_count();
+ Context *handle_v2_get_stripe_unit_count(int *result);
+
+ void send_v2_get_create_timestamp();
+ Context *handle_v2_get_create_timestamp(int *result);
+
+ void send_v2_get_access_modify_timestamp();
+ Context *handle_v2_get_access_modify_timestamp(int *result);
+
+ void send_v2_get_data_pool();
+ Context *handle_v2_get_data_pool(int *result);
+
+ void send_refresh();
+ Context *handle_refresh(int *result);
+
+ Context *send_init_cache(int *result);
+
+ Context *send_register_watch(int *result);
+ Context *handle_register_watch(int *result);
+
+ Context *send_set_snap(int *result);
+ Context *handle_set_snap(int *result);
+
+ // error path: close the image, then finish with the saved error
+ void send_close_image(int error_result);
+ Context *handle_close_image(int *result);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::OpenRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
diff --git a/src/librbd/image/PreRemoveRequest.cc b/src/librbd/image/PreRemoveRequest.cc
new file mode 100644
index 00000000..3326c143
--- /dev/null
+++ b/src/librbd/image/PreRemoveRequest.cc
@@ -0,0 +1,321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/PreRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/Utils.h"
+#include "librbd/image/ListWatchersRequest.h"
+#include "librbd/journal/DisabledPolicy.h"
+#include "librbd/operation/SnapshotRemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::PreRemoveRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+namespace {
+
+// Returns true when the snapshot may be pruned automatically as part of the
+// image removal; only snapshots in the trash namespace qualify.
+bool auto_delete_snapshot(const SnapInfo& snap_info) {
+  const auto ns_type = cls::rbd::get_snap_namespace_type(
+    snap_info.snap_namespace);
+  return ns_type == cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH;
+}
+
+} // anonymous namespace
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+void PreRemoveRequest<I>::send() {
+  // Entry point: refuse with -EROFS when image operations are disabled,
+  // otherwise start by trying to take the exclusive lock.
+  if (!m_image_ctx->operations_disabled) {
+    acquire_exclusive_lock();
+    return;
+  }
+
+  auto cct = m_image_ctx->cct;
+  lderr(cct) << "image operations disabled due to unsupported op features"
+             << dendl;
+  finish(-EROFS);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::acquire_exclusive_lock() {
+  // Try to take the exclusive lock if the image has one; the result is
+  // delivered to handle_exclusive_lock().  Without an exclusive lock the
+  // request proceeds directly to the removal validation.
+  {
+    RWLock::RLocker owner_lock(m_image_ctx->owner_lock);
+    if (m_image_ctx->exclusive_lock != nullptr) {
+      auto cct = m_image_ctx->cct;
+      ldout(cct, 5) << dendl;
+
+      // do not attempt to open the journal when removing the image in case
+      // it's corrupt
+      if (m_image_ctx->test_features(RBD_FEATURE_JOURNALING)) {
+        RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+        m_image_ctx->set_journal_policy(new journal::DisabledPolicy());
+      }
+
+      auto ctx = create_context_callback<
+        PreRemoveRequest<I>, &PreRemoveRequest<I>::handle_exclusive_lock>(this);
+
+      m_image_ctx->exclusive_lock->try_acquire_lock(ctx);
+      return;
+    }
+  }
+
+  // owner_lock is released before validating: validate_image_removal() can
+  // reach finish() synchronously, which completes the caller's context and
+  // deletes this request -- neither should happen while the lock is held
+  validate_image_removal();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_exclusive_lock(int r) {
+  // Result of the lock attempt.  Without the lock the removal is refused
+  // with -EBUSY unless the force flag is set, in which case the exclusive
+  // lock is shut down and the removal proceeds anyway.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool lock_owned =
+    (r >= 0) && m_image_ctx->exclusive_lock->is_lock_owner();
+  if (lock_owned) {
+    validate_image_removal();
+    return;
+  }
+
+  if (m_force) {
+    ldout(cct, 5) << "cannot obtain exclusive lock - "
+                  << "proceeding due to force flag set" << dendl;
+    shut_down_exclusive_lock();
+    return;
+  }
+
+  lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl;
+  finish(-EBUSY);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::shut_down_exclusive_lock() {
+  // Forced removal: shut down the exclusive lock (if it still exists) and
+  // continue in handle_shut_down_exclusive_lock(), which deletes the lock
+  // object saved in m_exclusive_lock.
+  {
+    RWLock::RLocker owner_lock(m_image_ctx->owner_lock);
+    if (m_image_ctx->exclusive_lock != nullptr) {
+      auto cct = m_image_ctx->cct;
+      ldout(cct, 5) << dendl;
+
+      auto ctx = create_context_callback<
+        PreRemoveRequest<I>,
+        &PreRemoveRequest<I>::handle_shut_down_exclusive_lock>(this);
+
+      m_exclusive_lock = m_image_ctx->exclusive_lock;
+      m_exclusive_lock->shut_down(ctx);
+      return;
+    }
+  }
+
+  // owner_lock is released before validating: validate_image_removal() can
+  // reach finish() synchronously, which completes the caller's context and
+  // deletes this request -- neither should happen while the lock is held
+  validate_image_removal();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_shut_down_exclusive_lock(int r) {
+  // The lock object saved by shut_down_exclusive_lock() is released here
+  // regardless of the shut down result.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  delete m_exclusive_lock;
+  m_exclusive_lock = nullptr;
+
+  if (r >= 0) {
+    // a successful shut down must have detached the lock from the image
+    ceph_assert(m_image_ctx->exclusive_lock == nullptr);
+    validate_image_removal();
+    return;
+  }
+
+  lderr(cct) << "error shutting down exclusive lock: " << cpp_strerror(r)
+             << dendl;
+  finish(r);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::validate_image_removal() {
+  // An image that is being migrated cannot be removed unless the image
+  // context explicitly ignores the migrating state.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  const bool migrating = !m_image_ctx->ignore_migrating &&
+                         m_image_ctx->test_features(RBD_FEATURE_MIGRATING);
+  if (!migrating) {
+    check_image_snaps();
+    return;
+  }
+
+  lderr(cct) << "image in migration state - not removing" << dendl;
+  finish(-EBUSY);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::check_image_snaps() {
+  // Collect the snapshots that may be pruned automatically.  Any other
+  // snapshot blocks the removal with -ENOTEMPTY.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  bool has_blocking_snap = false;
+  {
+    // scoped locker: snap_lock is released on every exit path and before
+    // finish() can delete this request
+    RWLock::RLocker snap_locker(m_image_ctx->snap_lock);
+    for (const auto& snap_info : m_image_ctx->snap_info) {
+      if (!auto_delete_snapshot(snap_info.second)) {
+        has_blocking_snap = true;
+        break;
+      }
+      m_snap_infos.insert(snap_info);
+    }
+  }
+
+  if (has_blocking_snap) {
+    lderr(cct) << "image has snapshots - not removing" << dendl;
+    finish(-ENOTEMPTY);
+    return;
+  }
+
+  list_image_watchers();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::list_image_watchers() {
+  // List the image's watchers into m_watchers, filtering out this instance
+  // and mirror instances.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  using self_t = PreRemoveRequest<I>;
+  auto on_listed = create_context_callback<
+    self_t, &self_t::handle_list_image_watchers>(this);
+
+  int filter_flags = LIST_WATCHERS_FILTER_OUT_MY_INSTANCE |
+                     LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES;
+  auto list_req = ListWatchersRequest<I>::create(*m_image_ctx, filter_flags,
+                                                 &m_watchers, on_listed);
+  list_req->send();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_list_image_watchers(int r) {
+  // Completion of the watcher listing; failures abort the removal.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    check_image_watchers();
+    return;
+  }
+
+  lderr(cct) << "error listing image watchers: " << cpp_strerror(r) << dendl;
+  finish(r);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::check_image_watchers() {
+  // Any watcher left after filtering means the image is in use: -EBUSY.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  if (m_watchers.empty()) {
+    check_group();
+    return;
+  }
+
+  lderr(cct) << "image has watchers - not removing" << dendl;
+  finish(-EBUSY);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::check_group() {
+  // Old-format images skip the group lookup and finish successfully here.
+  if (m_image_ctx->old_format) {
+    finish(0);
+    return;
+  }
+
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  m_out_bl.clear();
+
+  librados::ObjectReadOperation read_op;
+  librbd::cls_client::image_group_get_start(&read_op);
+
+  using self_t = PreRemoveRequest<I>;
+  auto aio_comp = create_rados_callback<
+    self_t, &self_t::handle_check_group>(this);
+  int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp,
+                                          &read_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_check_group(int r) {
+  // Decode the group membership.  -EOPNOTSUPP is tolerated (the spec then
+  // stays default-constructed); membership in a group blocks the removal
+  // with -EMLINK.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  cls::rbd::GroupSpec group_spec;
+  if (r == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    r = librbd::cls_client::image_group_get_finish(&bl_it, &group_spec);
+  }
+
+  const bool fatal = (r < 0) && (r != -EOPNOTSUPP);
+  if (fatal) {
+    lderr(cct) << "error fetching group for image: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+
+  if (!group_spec.is_valid()) {
+    remove_snapshot();
+    return;
+  }
+
+  lderr(cct) << "image is in a group - not removing" << dendl;
+  finish(-EMLINK);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::remove_snapshot() {
+  // Remove the first pending auto-prunable snapshot; the handler erases it
+  // from m_snap_infos and calls back here until the set is empty.
+  if (m_snap_infos.empty()) {
+    finish(0);
+    return;
+  }
+
+  auto cct = m_image_ctx->cct;
+  auto head = m_snap_infos.begin();
+  auto snap_id = head->first;
+  auto& snap_info = head->second;
+  ldout(cct, 20) << "snap_id=" << snap_id << ", "
+                 << "snap_name=" << snap_info.name << dendl;
+
+  RWLock::RLocker owner_lock(m_image_ctx->owner_lock);
+  using self_t = PreRemoveRequest<I>;
+  auto on_removed = create_context_callback<
+    self_t, &self_t::handle_remove_snapshot>(this);
+  auto remove_req = librbd::operation::SnapshotRemoveRequest<I>::create(
+    *m_image_ctx, snap_info.snap_namespace, snap_info.name,
+    snap_id, on_removed);
+  remove_req->send();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_remove_snapshot(int r) {
+  // Completion of one snapshot removal.  -ENOENT counts as success (the
+  // snapshot is already gone); any other error aborts the image removal.
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  // the snapshot just processed is the head of the pending set; check the
+  // invariant before either path below dereferences begin()
+  ceph_assert(!m_snap_infos.empty());
+
+  if (r < 0 && r != -ENOENT) {
+    auto snap_id = m_snap_infos.begin()->first;
+    lderr(cct) << "failed to auto-prune snapshot " << snap_id << ": "
+               << cpp_strerror(r) << dendl;
+
+    // report a busy snapshot the same way as a non-prunable one
+    if (r == -EBUSY) {
+      r = -ENOTEMPTY;
+    }
+    finish(r);
+    return;
+  }
+
+  m_snap_infos.erase(m_snap_infos.begin());
+
+  remove_snapshot();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::finish(int r) {
+  // Terminal state: hand the result to the caller's context, then destroy
+  // this request.  No member may be touched after this call returns.
+  {
+    auto cct = m_image_ctx->cct;
+    ldout(cct, 5) << "r=" << r << dendl;
+  }
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::PreRemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/PreRemoveRequest.h b/src/librbd/image/PreRemoveRequest.h
new file mode 100644
index 00000000..30f16b36
--- /dev/null
+++ b/src/librbd/image/PreRemoveRequest.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+#include "librbd/ImageCtx.h"
+#include <list>
+#include <map>
+
+class Context;
+
+namespace librbd {
+namespace image {
+
+/**
+ * Asynchronous state machine that verifies an image may be removed and
+ * prunes its auto-deletable snapshots. The request deletes itself once
+ * the on_finish context has been completed.
+ */
+template <typename ImageCtxT>
+class PreRemoveRequest {
+public:
+
+ // Allocate a new request. When force is set, failing to obtain the
+ // exclusive lock does not abort the removal.
+ static PreRemoveRequest *create(ImageCtxT *image_ctx, bool force,
+ Context *on_finish) {
+ return new PreRemoveRequest(image_ctx, force, on_finish);
+ }
+
+ PreRemoveRequest(ImageCtxT *image_ctx, bool force, Context *on_finish)
+ : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish) {
+ }
+
+ // Start the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | (skip if
+ * v not needed) (error)
+ * ACQUIRE EXCLUSIVE LOCK * * * * * * > SHUT DOWN EXCLUSIVE LOCK
+ * | |
+ * v |
+ * CHECK IMAGE WATCHERS <------------------/
+ * |
+ * v
+ * CHECK GROUP
+ * |
+ * | /------\
+ * | | |
+ * v v |
+ * REMOVE SNAPS ----/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ // NOTE: validate_image_removal() (migration state) and
+ // check_image_snaps() (non-prunable snapshots) run before the watchers
+ // are listed; they are not shown in the diagram above.
+
+ ImageCtxT* m_image_ctx;
+ // proceed even if the exclusive lock cannot be obtained
+ bool m_force;
+ Context* m_on_finish;
+
+ // lock object being shut down on the forced path; deleted by
+ // handle_shut_down_exclusive_lock()
+ decltype(m_image_ctx->exclusive_lock) m_exclusive_lock = nullptr;
+
+ // reply buffer for the group lookup
+ bufferlist m_out_bl;
+ // watchers reported by ListWatchersRequest
+ std::list<obj_watch_t> m_watchers;
+
+ // snapshots pending automatic removal, keyed by snapshot id
+ std::map<uint64_t, SnapInfo> m_snap_infos;
+
+ void acquire_exclusive_lock();
+ void handle_exclusive_lock(int r);
+
+ void shut_down_exclusive_lock();
+ void handle_shut_down_exclusive_lock(int r);
+
+ void validate_image_removal();
+ void check_image_snaps();
+
+ void list_image_watchers();
+ void handle_list_image_watchers(int r);
+
+ void check_image_watchers();
+
+ void check_group();
+ void handle_check_group(int r);
+
+ void remove_snapshot();
+ void handle_remove_snapshot(int r);
+
+ // complete m_on_finish with r and delete this request
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::PreRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H
diff --git a/src/librbd/image/RefreshParentRequest.cc b/src/librbd/image/RefreshParentRequest.cc
new file mode 100644
index 00000000..668ad56b
--- /dev/null
+++ b/src/librbd/image/RefreshParentRequest.cc
@@ -0,0 +1,245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/RefreshParentRequest.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/OpenRequest.h"
+#include "librbd/io/ObjectDispatcher.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::RefreshParentRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+template <typename I>
+RefreshParentRequest<I>::RefreshParentRequest(
+    I &child_image_ctx, const ParentImageInfo &parent_md,
+    const MigrationInfo &migration_info, Context *on_finish)
+  // copies of the parent metadata and migration info are kept; no parent
+  // image is open and no error has been recorded yet
+  : m_child_image_ctx(child_image_ctx),
+    m_parent_md(parent_md),
+    m_migration_info(migration_info),
+    m_on_finish(on_finish),
+    m_parent_image_ctx(nullptr),
+    m_parent_snap_id(CEPH_NOSNAP),
+    m_error_result(0) {
+}
+
+template <typename I>
+bool RefreshParentRequest<I>::is_refresh_required(
+    I &child_image_ctx, const ParentImageInfo &parent_md,
+    const MigrationInfo &migration_info) {
+  // A refresh is needed when the parent must be (re)opened or closed.
+  // The caller must hold both snap_lock and parent_lock.
+  ceph_assert(child_image_ctx.snap_lock.is_locked());
+  ceph_assert(child_image_ctx.parent_lock.is_locked());
+
+  if (is_open_required(child_image_ctx, parent_md, migration_info)) {
+    return true;
+  }
+  return is_close_required(child_image_ctx, parent_md, migration_info);
+}
+
+template <typename I>
+bool RefreshParentRequest<I>::is_close_required(
+    I &child_image_ctx, const ParentImageInfo &parent_md,
+    const MigrationInfo &migration_info) {
+  // A currently attached parent has to be closed once the metadata no
+  // longer describes a parent.
+  if (child_image_ctx.parent == nullptr) {
+    return false;
+  }
+  return !does_parent_exist(child_image_ctx, parent_md, migration_info);
+}
+
+template <typename I>
+bool RefreshParentRequest<I>::is_open_required(
+    I &child_image_ctx, const ParentImageInfo &parent_md,
+    const MigrationInfo &migration_info) {
+  // The parent must be opened when one should exist and either none is
+  // attached or the attached one differs from the spec in pool, namespace,
+  // image id or snapshot id.
+  if (!does_parent_exist(child_image_ctx, parent_md, migration_info)) {
+    return false;
+  }
+
+  auto parent = child_image_ctx.parent;
+  if (parent == nullptr) {
+    return true;
+  }
+
+  const auto &spec = parent_md.spec;
+  return parent->md_ctx.get_id() != spec.pool_id ||
+         parent->md_ctx.get_namespace() != spec.pool_namespace ||
+         parent->id != spec.image_id ||
+         parent->snap_id != spec.snap_id;
+}
+
+template <typename I>
+bool RefreshParentRequest<I>::does_parent_exist(
+    I &child_image_ctx, const ParentImageInfo &parent_md,
+    const MigrationInfo &migration_info) {
+  // intermediate, non-migrating images should only open their parent if they
+  // overlap
+  const bool intermediate_without_overlap =
+    child_image_ctx.child != nullptr &&
+    child_image_ctx.child->migration_info.empty() &&
+    parent_md.overlap == 0;
+  if (intermediate_without_overlap) {
+    return false;
+  }
+
+  // either a valid parent spec with a non-zero overlap, or a migration
+  if (parent_md.spec.pool_id > -1 && parent_md.overlap > 0) {
+    return true;
+  }
+  return !migration_info.empty();
+}
+
+template <typename I>
+void RefreshParentRequest<I>::send() {
+  // Open the new parent when required; otherwise complete immediately.
+  if (!is_open_required(m_child_image_ctx, m_parent_md, m_migration_info)) {
+    // parent will be closed (if necessary) during finalize
+    send_complete(0);
+    return;
+  }
+
+  send_open_parent();
+}
+
+template <typename I>
+void RefreshParentRequest<I>::apply() {
+  // Exchange the child's current parent with the one held by this request;
+  // afterwards m_parent_image_ctx is the old parent (possibly nullptr), which
+  // finalize() closes.  Both locks must be held for write.
+  auto &child = m_child_image_ctx;
+  ceph_assert(child.snap_lock.is_wlocked());
+  ceph_assert(child.parent_lock.is_wlocked());
+  std::swap(child.parent, m_parent_image_ctx);
+}
+
+template <typename I>
+void RefreshParentRequest<I>::finalize(Context *on_finish) {
+  // Close the image context still held by this request (if any), then
+  // complete on_finish, which replaces the original completion.
+  auto cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_on_finish = on_finish;
+  if (m_parent_image_ctx == nullptr) {
+    send_complete(0);
+    return;
+  }
+
+  send_close_parent();
+}
+
+template <typename I>
+void RefreshParentRequest<I>::send_open_parent() {
+  // Build an image context for the parent described by m_parent_md and open
+  // it asynchronously; the result arrives in handle_open_parent().
+  ceph_assert(m_parent_md.spec.pool_id >= 0);
+
+  auto cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::IoCtx parent_io_ctx;
+  int r = util::create_ioctx(m_child_image_ctx.md_ctx, "parent image",
+                             m_parent_md.spec.pool_id,
+                             m_parent_md.spec.pool_namespace, &parent_io_ctx);
+  if (r < 0) {
+    send_complete(r);
+    return;
+  }
+
+  // a migration source that is known by name is opened as an old-format
+  // image
+  std::string image_name;
+  uint64_t open_flags = 0;
+  const bool named_migration_source =
+    !m_migration_info.empty() && !m_migration_info.image_name.empty();
+  if (named_migration_source) {
+    image_name = m_migration_info.image_name;
+    open_flags |= OPEN_FLAG_OLD_FORMAT;
+  }
+
+  m_parent_image_ctx = new I(image_name, m_parent_md.spec.image_id,
+                             m_parent_md.spec.snap_id, parent_io_ctx, true);
+  m_parent_image_ctx->child = &m_child_image_ctx;
+
+  // set rados flags for reading the parent image
+  if (m_child_image_ctx.config.template get_val<bool>("rbd_balance_parent_reads")) {
+    m_parent_image_ctx->set_read_flag(librados::OPERATION_BALANCE_READS);
+  } else if (m_child_image_ctx.config.template get_val<bool>("rbd_localize_parent_reads")) {
+    m_parent_image_ctx->set_read_flag(librados::OPERATION_LOCALIZE_READS);
+  }
+
+  using self_t = RefreshParentRequest<I>;
+  Context *on_open = create_async_context_callback(
+    m_child_image_ctx, create_context_callback<
+      self_t, &self_t::handle_open_parent, false>(this));
+  auto open_req = OpenRequest<I>::create(m_parent_image_ctx, open_flags,
+                                         on_open);
+  open_req->send();
+}
+
+template <typename I>
+Context *RefreshParentRequest<I>::handle_open_parent(int *result) {
+  // Completion of the parent open.  The first error is recorded and the
+  // failed image context is freed; m_on_finish is returned either way.
+  auto cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+  save_result(result);
+
+  const bool open_failed = (*result < 0);
+  if (open_failed) {
+    lderr(cct) << "failed to open parent image: " << cpp_strerror(*result)
+               << dendl;
+
+    // image already closed by open state machine
+    delete m_parent_image_ctx;
+    m_parent_image_ctx = nullptr;
+  }
+
+  return m_on_finish;
+}
+
+template <typename I>
+void RefreshParentRequest<I>::send_close_parent() {
+  // Asynchronously close the image context held in m_parent_image_ctx.
+  ceph_assert(m_parent_image_ctx != nullptr);
+
+  auto cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using self_t = RefreshParentRequest<I>;
+  Context *on_closed = create_async_context_callback(
+    m_child_image_ctx, create_context_callback<
+      self_t, &self_t::handle_close_parent, false>(this));
+  auto close_req = CloseRequest<I>::create(m_parent_image_ctx, on_closed);
+  close_req->send();
+}
+
+template <typename I>
+Context *RefreshParentRequest<I>::handle_close_parent(int *result) {
+  // Completion of the parent close.  The image context is freed whatever the
+  // outcome; a close failure is only logged before the existence cache is
+  // reset.
+  auto cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+  delete m_parent_image_ctx;
+  m_parent_image_ctx = nullptr;
+
+  const bool close_failed = (*result < 0);
+  if (close_failed) {
+    lderr(cct) << "failed to close parent image: " << cpp_strerror(*result)
+               << dendl;
+  }
+
+  send_reset_existence_cache();
+  return nullptr;
+}
+
+template <typename I>
+void RefreshParentRequest<I>::send_reset_existence_cache() {
+  // Ask the child's object dispatcher to reset its existence cache; the
+  // result arrives in handle_reset_existence_cache().
+  auto cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using self_t = RefreshParentRequest<I>;
+  Context *on_reset = create_async_context_callback(
+    m_child_image_ctx, create_context_callback<
+      self_t, &self_t::handle_reset_existence_cache, false>(this));
+  m_child_image_ctx.io_object_dispatcher->reset_existence_cache(on_reset);
+}
+
+template <typename I>
+Context *RefreshParentRequest<I>::handle_reset_existence_cache(int *result) {
+  // Final state of finalize().  A reset failure is only logged: the reported
+  // result is the saved open error, or 0 when there was none.
+  auto cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to reset object existence cache: "
+               << cpp_strerror(*result) << dendl;
+  }
+
+  // propagate errors from opening the image
+  *result = (m_error_result < 0) ? m_error_result : 0;
+  return m_on_finish;
+}
+
+template <typename I>
+void RefreshParentRequest<I>::send_complete(int r) {
+  // Complete the current m_on_finish context directly with r.
+  {
+    auto cct = m_child_image_ctx.cct;
+    ldout(cct, 10) << this << " " << __func__ << dendl;
+  }
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RefreshParentRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RefreshParentRequest.h b/src/librbd/image/RefreshParentRequest.h
new file mode 100644
index 00000000..086d8ec1
--- /dev/null
+++ b/src/librbd/image/RefreshParentRequest.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/Types.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+/**
+ * State machine that brings a child image's parent image context in line
+ * with freshly read parent metadata. It is driven in three phases:
+ * send() opens the new parent if required, apply() swaps it into the
+ * child, and finalize() closes whichever parent context is left over.
+ */
+template <typename ImageCtxT = ImageCtx>
+class RefreshParentRequest {
+public:
+ static RefreshParentRequest *create(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info,
+ Context *on_finish) {
+ return new RefreshParentRequest(child_image_ctx, parent_md, migration_info,
+ on_finish);
+ }
+
+ // True when the parent needs to be opened or closed; the caller must
+ // hold the child's snap_lock and parent_lock.
+ static bool is_refresh_required(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info);
+
+ // Open the parent (if required) and complete the on_finish context
+ // passed at construction.
+ void send();
+ // Swap the request's parent context with the child's; requires the
+ // child's snap_lock and parent_lock to be write-locked.
+ void apply();
+ // Close the parent context still held by the request (if any), then
+ // complete on_finish.
+ void finalize(Context *on_finish);
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | (open required)
+ * |----------------> OPEN_PARENT * * * * * * * * * * * * * * *
+ * | | *
+ * | v (on error) *
+ * \----------------> <apply> *
+ * | *
+ * | (close required) *
+ * |-----------------> CLOSE_PARENT *
+ * | | *
+ * | v *
+ * | RESET_EXISTENCE *
+ * | | *
+ * | v *
+ * \-----------------> <finish> < * * * *
+ *
+ * @endverbatim
+ */
+
+ RefreshParentRequest(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info, Context *on_finish);
+
+ ImageCtxT &m_child_image_ctx;
+ // copies of the metadata the request was created with
+ ParentImageInfo m_parent_md;
+ MigrationInfo m_migration_info;
+ // current completion; replaced by finalize()
+ Context *m_on_finish;
+
+ // newly opened parent before apply(), previous parent afterwards
+ ImageCtxT *m_parent_image_ctx;
+ // NOTE(review): only initialized (CEPH_NOSNAP) in the code visible here;
+ // appears unused -- confirm before removing
+ uint64_t m_parent_snap_id;
+
+ // first error recorded via save_result()
+ int m_error_result;
+
+ static bool is_close_required(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info);
+ static bool is_open_required(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info);
+ static bool does_parent_exist(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info);
+
+ void send_open_parent();
+ Context *handle_open_parent(int *result);
+
+ void send_close_parent();
+ Context *handle_close_parent(int *result);
+
+ void send_reset_existence_cache();
+ Context *handle_reset_existence_cache(int *result);
+
+ void send_complete(int r);
+
+ // Remember the first negative result; later errors do not overwrite it.
+ void save_result(int *result) {
+ if (m_error_result == 0 && *result < 0) {
+ m_error_result = *result;
+ }
+ }
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::RefreshParentRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
diff --git a/src/librbd/image/RefreshRequest.cc b/src/librbd/image/RefreshRequest.cc
new file mode 100644
index 00000000..3cc1b79e
--- /dev/null
+++ b/src/librbd/image/RefreshRequest.cc
@@ -0,0 +1,1551 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string/predicate.hpp>
+#include "include/ceph_assert.h"
+
+#include "librbd/image/RefreshRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/Utils.h"
+#include "librbd/image/RefreshParentRequest.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/journal/Policy.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::RefreshRequest: "
+
+namespace librbd {
+namespace image {
+
+namespace {
+
+// Value passed as the last argument of cls_client::metadata_list_start()
+// by send_v2_get_metadata() and send_v2_get_pool_metadata(); presumably the
+// maximum number of key/value pairs returned per request -- TODO confirm.
+const uint64_t MAX_METADATA_ITEMS = 128;
+
+}
+
+using util::create_rados_callback;
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+// Build a refresh state machine for image_ctx.
+//
+// acquiring_lock   - stored in m_acquiring_lock; consulted by
+//                    handle_v2_get_mutable_metadata() to re-assert a
+//                    dynamically disabled exclusive-lock feature bit
+// skip_open_parent - stored in m_skip_open_parent_image; when set,
+//                    send_v2_refresh_parent() never creates a
+//                    RefreshParentRequest
+// on_finish        - completion context; wrapped with
+//                    create_async_context_callback() before being stored
+template <typename I>
+RefreshRequest<I>::RefreshRequest(I &image_ctx, bool acquiring_lock,
+ bool skip_open_parent, Context *on_finish)
+ : m_image_ctx(image_ctx), m_acquiring_lock(acquiring_lock),
+ m_skip_open_parent_image(skip_open_parent),
+ m_on_finish(create_async_context_callback(m_image_ctx, on_finish)),
+ m_error_result(0), m_flush_aio(false), m_exclusive_lock(nullptr),
+ m_object_map(nullptr), m_journal(nullptr), m_refresh_parent(nullptr) {
+ // pool-level metadata is read through a private duplicate of the image's
+ // metadata IoCtx whose namespace is reset to the empty string
+ m_pool_metadata_io_ctx.dup(image_ctx.md_ctx);
+ m_pool_metadata_io_ctx.set_namespace("");
+}
+
+// Asserts that no transient state created during the refresh (exclusive
+// lock, object map, journal, parent refresh request, blocked-writes flag)
+// is still held by the request when it is destroyed.
+template <typename I>
+RefreshRequest<I>::~RefreshRequest() {
+ // these require state machine to close
+ ceph_assert(m_exclusive_lock == nullptr);
+ ceph_assert(m_object_map == nullptr);
+ ceph_assert(m_journal == nullptr);
+ ceph_assert(m_refresh_parent == nullptr);
+ ceph_assert(!m_blocked_writes);
+}
+
+// Entry point of the state machine: the first request issued depends on
+// whether the image uses the old (v1) or the v2 on-disk format.
+template <typename I>
+void RefreshRequest<I>::send() {
+  if (!m_image_ctx.old_format) {
+    send_v2_get_mutable_metadata();
+    return;
+  }
+  send_v1_read_header();
+}
+
+// Issue an asynchronous read of the migration header stored on the image
+// header object; the reply is handled by handle_get_migration_header().
+// When the image context is flagged to ignore migrations, the read is
+// skipped and the state machine advances directly to the next
+// format-specific step.
+template <typename I>
+void RefreshRequest<I>::send_get_migration_header() {
+  if (m_image_ctx.ignore_migrating) {
+    if (m_image_ctx.old_format) {
+      send_v1_get_snapshots();
+    } else {
+      send_v2_get_metadata();
+    }
+    return;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::migration_get_start(&op);
+
+  using klass = RefreshRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_callback<klass, &klass::handle_get_migration_header>(this);
+  m_out_bl.clear();
+  // a failed submission would presumably never invoke the callback and
+  // would stall the state machine, so treat it as fatal exactly like the
+  // other send_* helpers in this file do
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+                                         &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion handler for send_get_migration_header().
+//
+// Decodes the reply into m_migration_spec, validates the header type and
+// migration state, and then continues with the next format-specific step.
+// Returns m_on_finish (with *result set to the error) to terminate the
+// state machine, or nullptr when another request has been issued.
+template <typename I>
+Context *RefreshRequest<I>::handle_get_migration_header(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::migration_get_finish(&it, &m_migration_spec);
+ } else if (*result == -ENOENT) {
+ // header not present: restart the whole refresh from send()
+ ldout(cct, 5) << this << " " << __func__ << ": no migration header found"
+ << ", retrying" << dendl;
+ send();
+ return nullptr;
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve migration header: "
+ << cpp_strerror(*result) << dendl;
+ return m_on_finish;
+ }
+
+ switch(m_migration_spec.header_type) {
+ case cls::rbd::MIGRATION_HEADER_TYPE_SRC:
+ // a migration source may only be opened read-only
+ if (!m_image_ctx.read_only) {
+ lderr(cct) << "image being migrated" << dendl;
+ *result = -EROFS;
+ return m_on_finish;
+ }
+ ldout(cct, 1) << this << " " << __func__ << ": migrating to: "
+ << m_migration_spec << dendl;
+ break;
+ case cls::rbd::MIGRATION_HEADER_TYPE_DST:
+ ldout(cct, 1) << this << " " << __func__ << ": migrating from: "
+ << m_migration_spec << dendl;
+ switch (m_migration_spec.state) {
+ case cls::rbd::MIGRATION_STATE_PREPARING:
+ // destination not ready yet: restart the whole refresh from send()
+ ldout(cct, 5) << this << " " << __func__ << ": current migration state: "
+ << m_migration_spec.state << ", retrying" << dendl;
+ send();
+ return nullptr;
+ case cls::rbd::MIGRATION_STATE_PREPARED:
+ case cls::rbd::MIGRATION_STATE_EXECUTING:
+ case cls::rbd::MIGRATION_STATE_EXECUTED:
+ break;
+ case cls::rbd::MIGRATION_STATE_ABORTING:
+ // an aborting migration is only tolerated for read-only opens
+ if (!m_image_ctx.read_only) {
+ lderr(cct) << this << " " << __func__ << ": migration is being aborted"
+ << dendl;
+ *result = -EROFS;
+ return m_on_finish;
+ }
+ break;
+ default:
+ lderr(cct) << this << " " << __func__ << ": migration is in an "
+ << "unexpected state" << dendl;
+ *result = -EINVAL;
+ return m_on_finish;
+ }
+ break;
+ default:
+ // unrecognized header type: fail the refresh with -EBADMSG
+ ldout(cct, 1) << this << " " << __func__ << ": migration type "
+ << m_migration_spec.header_type << dendl;
+ *result = -EBADMSG;
+ return m_on_finish;
+ }
+
+ // header accepted: resume the format-specific refresh sequence
+ if (m_image_ctx.old_format) {
+ send_v1_get_snapshots();
+ } else {
+ send_v2_get_metadata();
+ }
+ return nullptr;
+}
+
+// Asynchronously read the v1 image header object into m_out_bl; the reply
+// is processed by handle_v1_read_header().
+template <typename I>
+void RefreshRequest<I>::send_v1_read_header() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation read_op;
+  read_op.read(0, 0, nullptr, nullptr);
+
+  m_out_bl.clear();
+  auto *aio_comp = create_rados_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v1_read_header>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &read_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for send_v1_read_header().
+//
+// Validates the size and leading text of the header read into m_out_bl,
+// copies it into a local rbd_obj_header_ondisk and extracts the order,
+// image size and object prefix.  Continues with the migration header read
+// when the migration header text was detected, otherwise with the v1
+// snapshot listing.  Returns m_on_finish on error, nullptr otherwise.
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_read_header(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl;
+
+ rbd_obj_header_ondisk v1_header;
+ bool migrating = false;
+ if (*result < 0) {
+ // read failure is propagated as-is (no additional error is logged here)
+ return m_on_finish;
+ } else if (m_out_bl.length() < sizeof(v1_header)) {
+ lderr(cct) << "v1 header too small" << dendl;
+ *result = -EIO;
+ return m_on_finish;
+ } else if (memcmp(RBD_HEADER_TEXT, m_out_bl.c_str(),
+ sizeof(RBD_HEADER_TEXT)) != 0) {
+ // not a regular v1 header -- check for the migration variant
+ if (memcmp(RBD_MIGRATE_HEADER_TEXT, m_out_bl.c_str(),
+ sizeof(RBD_MIGRATE_HEADER_TEXT)) == 0) {
+ ldout(cct, 1) << this << " " << __func__ << ": migration v1 header detected"
+ << dendl;
+ migrating = true;
+ } else {
+ lderr(cct) << "unrecognized v1 header" << dendl;
+ *result = -ENXIO;
+ return m_on_finish;
+ }
+ }
+
+ // length was verified above, so the full struct can be copied out
+ memcpy(&v1_header, m_out_bl.c_str(), sizeof(v1_header));
+ m_order = v1_header.options.order;
+ m_size = v1_header.image_size;
+ m_object_prefix = v1_header.block_name;
+ if (migrating) {
+ send_get_migration_header();
+ } else {
+ send_v1_get_snapshots();
+ }
+ return nullptr;
+}
+
+// Asynchronously request the v1 snapshot list from the image header
+// object; the reply is processed by handle_v1_get_snapshots().
+template <typename I>
+void RefreshRequest<I>::send_v1_get_snapshots() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation list_op;
+  cls_client::old_snapshot_list_start(&list_op);
+
+  m_out_bl.clear();
+  auto *aio_comp = create_rados_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v1_get_snapshots>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &list_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for send_v1_get_snapshots().
+//
+// Decodes the snapshot names, sizes and snap context, rebuilds
+// m_snap_infos from them and continues with the v1 lock query.  Returns
+// m_on_finish on failure, nullptr once the next request has been issued.
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_get_snapshots(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  std::vector<std::string> names;
+  std::vector<uint64_t> sizes;
+  if (*result == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    *result = cls_client::old_snapshot_list_finish(&bl_it, &names, &sizes,
+                                                   &m_snapc);
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve v1 snapshots: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  if (!m_snapc.is_valid()) {
+    lderr(cct) << "v1 image snap context is invalid" << dendl;
+    *result = -EIO;
+    return m_on_finish;
+  }
+
+  // one entry per snapshot id, all in the user snapshot namespace
+  m_snap_infos.clear();
+  const size_t snap_count = m_snapc.snaps.size();
+  for (size_t idx = 0; idx != snap_count; ++idx) {
+    m_snap_infos.push_back({m_snapc.snaps[idx],
+                            {cls::rbd::UserSnapshotNamespace{}},
+                            names[idx], sizes[idx], {}, 0});
+  }
+
+  send_v1_get_locks();
+  return nullptr;
+}
+
+// Asynchronously query the RBD_LOCK_NAME lock state on the image header
+// object; the reply is processed by handle_v1_get_locks().
+template <typename I>
+void RefreshRequest<I>::send_v1_get_locks() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation lock_op;
+  rados::cls::lock::get_lock_info_start(&lock_op, RBD_LOCK_NAME);
+
+  m_out_bl.clear();
+  auto *aio_comp = create_rados_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v1_get_locks>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &lock_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for send_v1_get_locks().
+//
+// Decodes the lockers, lock type and lock tag; records whether the lock is
+// exclusive and then schedules the v1 apply step.  Returns m_on_finish on
+// failure, nullptr once the next step has been queued.
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_get_locks(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    // initialized (as in handle_v2_get_mutable_metadata) so the value is
+    // well-defined even if the decoder leaves it untouched
+    ClsLockType lock_type = LOCK_NONE;
+    *result = rados::cls::lock::get_lock_info_finish(&it, &m_lockers,
+                                                     &lock_type, &m_lock_tag);
+    if (*result == 0) {
+      m_exclusive_locked = (lock_type == LOCK_EXCLUSIVE);
+    }
+  }
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve locks: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  send_v1_apply();
+  return nullptr;
+}
+
+// Queue handle_v1_apply() on the image's op work queue so that the
+// refreshed state is not applied from within a rados callback.
+template <typename I>
+void RefreshRequest<I>::send_v1_apply() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  Context *apply_ctx = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v1_apply>(this);
+  m_image_ctx.op_work_queue->queue(apply_ctx, 0);
+}
+
+// Work-queue callback for send_v1_apply(): applies the collected state and
+// hands control to the AIO flush step, returning whatever context that
+// step yields.
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_apply(int *result) {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  apply();
+  Context *next_ctx = send_flush_aio();
+  return next_ctx;
+}
+
+// Asynchronously fetch the v2 image's size, features, flags, snap context
+// and lock state in a single compound read of the header object; the reply
+// is processed by handle_v2_get_mutable_metadata().
+template <typename I>
+void RefreshRequest<I>::send_v2_get_mutable_metadata() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  // sample the currently selected snapshot while holding snap_lock
+  const uint64_t current_snap_id = [this]() -> uint64_t {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    return m_image_ctx.snap_id;
+  }();
+
+  // features are requested as read-only when the image is read-only or a
+  // snapshot is selected
+  const bool read_only = (m_image_ctx.read_only ||
+                          current_snap_id != CEPH_NOSNAP);
+
+  // NOTE: the handler decodes the replies in exactly this order
+  librados::ObjectReadOperation metadata_op;
+  cls_client::get_size_start(&metadata_op, CEPH_NOSNAP);
+  cls_client::get_features_start(&metadata_op, read_only);
+  cls_client::get_flags_start(&metadata_op, CEPH_NOSNAP);
+  cls_client::get_snapcontext_start(&metadata_op);
+  rados::cls::lock::get_lock_info_start(&metadata_op, RBD_LOCK_NAME);
+
+  m_out_bl.clear();
+  auto *aio_comp = create_rados_callback<
+    RefreshRequest<I>,
+    &RefreshRequest<I>::handle_v2_get_mutable_metadata>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &metadata_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for send_v2_get_mutable_metadata().
+//
+// Decodes, in the same order the sub-operations were appended, the image
+// size, features, flags, snap context and lock info.  Rejects images with
+// unsupported incompatible features (-ENOSYS) or an invalid snap context
+// (-EIO), then continues with the parent lookup.  Returns m_on_finish on
+// failure, nullptr once the next request has been issued.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_mutable_metadata(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "r=" << *result << dendl;
+
+ auto it = m_out_bl.cbegin();
+ if (*result >= 0) {
+ // the decoded order is discarded here; only the size is kept
+ uint8_t order;
+ *result = cls_client::get_size_finish(&it, &m_size, &order);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_features_finish(&it, &m_features,
+ &m_incompatible_features);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_flags_finish(&it, &m_flags);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_snapcontext_finish(&it, &m_snapc);
+ }
+
+ if (*result >= 0) {
+ ClsLockType lock_type = LOCK_NONE;
+ *result = rados::cls::lock::get_lock_info_finish(&it, &m_lockers,
+ &lock_type, &m_lock_tag);
+ if (*result == 0) {
+ m_exclusive_locked = (lock_type == LOCK_EXCLUSIVE);
+ }
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve mutable metadata: "
+ << cpp_strerror(*result) << dendl;
+ return m_on_finish;
+ }
+
+ // any incompatible feature bit outside RBD_FEATURES_ALL is unsupported
+ uint64_t unsupported = m_incompatible_features & ~RBD_FEATURES_ALL;
+ if (unsupported != 0ULL) {
+ lderr(cct) << "Image uses unsupported features: " << unsupported << dendl;
+ *result = -ENOSYS;
+ return m_on_finish;
+ }
+
+ if (!m_snapc.is_valid()) {
+ lderr(cct) << "image snap context is invalid!" << dendl;
+ *result = -EIO;
+ return m_on_finish;
+ }
+
+ // while a lock acquisition is in progress, keep treating the exclusive
+ // lock feature as enabled and flag the refresh as incomplete
+ if (m_acquiring_lock && (m_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+ ldout(cct, 5) << "ignoring dynamically disabled exclusive lock" << dendl;
+ m_features |= RBD_FEATURE_EXCLUSIVE_LOCK;
+ m_incomplete_update = true;
+ }
+
+ send_v2_get_parent();
+ return nullptr;
+}
+
+// Asynchronously fetch the HEAD parent spec and overlap from the image
+// header object; the reply is processed by handle_v2_get_parent().  The
+// newer parent_get/parent_overlap_get pair is used unless m_legacy_parent
+// is set, in which case the single legacy get_parent method is issued.
+template <typename I>
+void RefreshRequest<I>::send_v2_get_parent() {
+  // NOTE: remove support when Mimic is EOLed
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": legacy=" << m_legacy_parent
+                 << dendl;
+
+  librados::ObjectReadOperation op;
+  if (!m_legacy_parent) {
+    cls_client::parent_get_start(&op);
+    cls_client::parent_overlap_get_start(&op, CEPH_NOSNAP);
+  } else {
+    cls_client::get_parent_start(&op, CEPH_NOSNAP);
+  }
+
+  auto aio_comp = create_rados_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_parent>(this);
+  m_out_bl.clear();
+  // a failed submission would presumably never invoke the callback and
+  // would stall the state machine, so treat it as fatal exactly like the
+  // other send_* helpers in this file do
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for send_v2_get_parent().
+//
+// Decodes the parent spec and (optional) HEAD overlap into m_parent_md.
+// If the newer methods are rejected with -EOPNOTSUPP the request is
+// re-issued once using the legacy method.  On success continues with the
+// migration header read when the migrating feature is set, otherwise with
+// the image metadata listing.  Returns m_on_finish on failure, nullptr
+// once another request has been issued.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_parent(int *result) {
+  // NOTE: remove support when Mimic is EOLed
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  auto it = m_out_bl.cbegin();
+  if (!m_legacy_parent) {
+    if (*result == 0) {
+      *result = cls_client::parent_get_finish(&it, &m_parent_md.spec);
+    }
+
+    std::optional<uint64_t> parent_overlap;
+    if (*result == 0) {
+      *result = cls_client::parent_overlap_get_finish(&it, &parent_overlap);
+    }
+
+    // the overlap is only recorded when the image actually reports one
+    if (*result == 0 && parent_overlap) {
+      m_parent_md.overlap = *parent_overlap;
+      m_head_parent_overlap = true;
+    }
+  } else if (*result == 0) {
+    *result = cls_client::get_parent_finish(&it, &m_parent_md.spec,
+                                            &m_parent_md.overlap);
+    m_head_parent_overlap = true;
+  }
+
+  if (*result == -EOPNOTSUPP && !m_legacy_parent) {
+    ldout(cct, 10) << "retrying using legacy parent method" << dendl;
+    m_legacy_parent = true;
+    send_v2_get_parent();
+    return nullptr;
+  } else if (*result < 0) {
+    // explicit else-if: the original "} if" on one line read like a
+    // missing else even though the preceding branch always returns
+    lderr(cct) << "failed to retrieve parent: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  if ((m_features & RBD_FEATURE_MIGRATING) != 0) {
+    ldout(cct, 1) << "migrating feature set" << dendl;
+    send_get_migration_header();
+    return nullptr;
+  }
+
+  send_v2_get_metadata();
+  return nullptr;
+}
+
+// Asynchronously list image metadata from the header object, starting
+// after m_last_metadata_key and limited to MAX_METADATA_ITEMS entries; the
+// reply is processed by handle_v2_get_metadata().
+template <typename I>
+void RefreshRequest<I>::send_v2_get_metadata() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "start_key=" << m_last_metadata_key << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::metadata_list_start(&op, m_last_metadata_key, MAX_METADATA_ITEMS);
+
+  using klass = RefreshRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_callback<klass, &klass::handle_v2_get_metadata>(this);
+  m_out_bl.clear();
+  // a failed submission would presumably never invoke the callback and
+  // would stall the state machine, so treat it as fatal exactly like the
+  // other send_* helpers in this file do
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+                                         &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion handler for send_v2_get_metadata().
+//
+// Merges the returned page into m_metadata.  While the last returned key
+// still carries the configuration prefix another page is requested;
+// otherwise the paging cursor is reset and the pool metadata listing is
+// started.  Returns m_on_finish on failure, nullptr otherwise.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_metadata(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  std::map<std::string, bufferlist> page;
+  if (*result == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    *result = cls_client::metadata_list_finish(&bl_it, &page);
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve metadata: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  bool more_conf_keys = false;
+  if (!page.empty()) {
+    m_metadata.insert(page.begin(), page.end());
+    m_last_metadata_key = page.rbegin()->first;
+    more_conf_keys = boost::starts_with(m_last_metadata_key,
+                                        ImageCtx::METADATA_CONF_PREFIX);
+  }
+
+  if (more_conf_keys) {
+    send_v2_get_metadata();
+    return nullptr;
+  }
+
+  m_last_metadata_key.clear();
+  send_v2_get_pool_metadata();
+  return nullptr;
+}
+
+// Asynchronously list pool-level metadata from the RBD_INFO object through
+// m_pool_metadata_io_ctx, starting after m_last_metadata_key and limited
+// to MAX_METADATA_ITEMS entries; the reply is processed by
+// handle_v2_get_pool_metadata().
+template <typename I>
+void RefreshRequest<I>::send_v2_get_pool_metadata() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "start_key=" << m_last_metadata_key << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::metadata_list_start(&op, m_last_metadata_key, MAX_METADATA_ITEMS);
+
+  using klass = RefreshRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_callback<klass, &klass::handle_v2_get_pool_metadata>(this);
+  m_out_bl.clear();
+  // a failed submission would presumably never invoke the callback and
+  // would stall the state machine, so treat it as fatal exactly like the
+  // other send_* helpers in this file do
+  int r = m_pool_metadata_io_ctx.aio_operate(RBD_INFO, comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion handler for send_v2_get_pool_metadata().
+//
+// -EOPNOTSUPP and -ENOENT are tolerated (pool metadata unavailable); any
+// other error ends the refresh.  Returned entries are merged into
+// m_metadata and further pages are requested while the last key carries
+// the configuration prefix.  Finally the merged metadata is applied to the
+// image context and the op-features step is started.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_pool_metadata(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  std::map<std::string, bufferlist> page;
+  if (*result == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    *result = cls_client::metadata_list_finish(&bl_it, &page);
+  }
+
+  const bool unavailable = (*result == -EOPNOTSUPP || *result == -ENOENT);
+  if (unavailable) {
+    ldout(cct, 10) << "pool metadata not supported by OSD" << dendl;
+  } else if (*result < 0) {
+    lderr(cct) << "failed to retrieve pool metadata: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  bool more_conf_keys = false;
+  if (!page.empty()) {
+    m_metadata.insert(page.begin(), page.end());
+    m_last_metadata_key = page.rbegin()->first;
+    more_conf_keys = boost::starts_with(m_last_metadata_key,
+                                        ImageCtx::METADATA_CONF_PREFIX);
+  }
+
+  if (more_conf_keys) {
+    send_v2_get_pool_metadata();
+    return nullptr;
+  }
+
+  const bool thread_safe = m_image_ctx.image_watcher->is_unregistered();
+  m_image_ctx.apply_metadata(m_metadata, thread_safe);
+
+  send_v2_get_op_features();
+  return nullptr;
+}
+
+// Asynchronously fetch the image's op features when the operations
+// feature bit is set; otherwise skip directly to the group lookup.  The
+// reply is processed by handle_v2_get_op_features().
+template <typename I>
+void RefreshRequest<I>::send_v2_get_op_features() {
+  const bool operations_enabled =
+    ((m_features & RBD_FEATURE_OPERATIONS) != 0LL);
+  if (!operations_enabled) {
+    send_v2_get_group();
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation features_op;
+  cls_client::op_features_get_start(&features_op);
+
+  m_out_bl.clear();
+  using klass = RefreshRequest<I>;
+  auto *aio_comp = create_rados_callback<
+    klass, &klass::handle_v2_get_op_features>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &features_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for send_v2_get_op_features().
+//
+// Decodes the op features into m_op_features and continues with the group
+// lookup.  Both a failed request and a failed decode end the refresh.
+// Returns m_on_finish on failure, nullptr once the next request has been
+// issued.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_op_features(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "r=" << *result << dendl;
+
+  // -EOPNOTSUPP handler not required since feature bit implies OSD
+  // supports the method
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    // propagate decode failures instead of silently continuing with a
+    // stale m_op_features value
+    *result = cls_client::op_features_get_finish(&it, &m_op_features);
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve op features: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  send_v2_get_group();
+  return nullptr;
+}
+
+// Asynchronously fetch the image's group membership from the header
+// object; the reply is processed by handle_v2_get_group().
+template <typename I>
+void RefreshRequest<I>::send_v2_get_group() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation group_op;
+  cls_client::image_group_get_start(&group_op);
+
+  m_out_bl.clear();
+  auto *aio_comp = create_rados_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_group>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &group_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for send_v2_get_group().
+//
+// Decodes the group spec into m_group_spec and continues with the
+// snapshot listing.  -EOPNOTSUPP is tolerated; any other request or decode
+// failure ends the refresh.  Returns m_on_finish on failure, nullptr once
+// the next request has been issued.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_group(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    // propagate decode failures instead of silently continuing with a
+    // stale m_group_spec value
+    *result = cls_client::image_group_get_finish(&it, &m_group_spec);
+  }
+  if (*result < 0 && *result != -EOPNOTSUPP) {
+    lderr(cct) << "failed to retrieve group: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  send_v2_get_snapshots();
+  return nullptr;
+}
+
+// Size the per-snapshot result vectors to the snap context and, when there
+// is at least one snapshot, asynchronously fetch every snapshot's info,
+// parent overlap, flags and protection status in one compound read.  With
+// no snapshots the parent refresh step is started directly.  The reply is
+// processed by handle_v2_get_snapshots().
+template <typename I>
+void RefreshRequest<I>::send_v2_get_snapshots() {
+  const size_t snap_count = m_snapc.snaps.size();
+  m_snap_infos.resize(snap_count);
+  m_snap_flags.resize(snap_count);
+  m_snap_parents.resize(snap_count);
+  m_snap_protection.resize(snap_count);
+
+  if (snap_count == 0) {
+    send_v2_refresh_parent();
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  // NOTE: the handler decodes the replies in exactly this order
+  librados::ObjectReadOperation snap_op;
+  for (auto snap_id : m_snapc.snaps) {
+    if (m_legacy_snapshot == LEGACY_SNAPSHOT_DISABLED) {
+      cls_client::snapshot_get_start(&snap_op, snap_id);
+    } else {
+      /// NOTE: remove after Luminous is retired
+      cls_client::get_snapshot_name_start(&snap_op, snap_id);
+      cls_client::get_size_start(&snap_op, snap_id);
+      if (m_legacy_snapshot != LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP) {
+        cls_client::get_snapshot_timestamp_start(&snap_op, snap_id);
+      }
+    }
+
+    if (!m_legacy_parent) {
+      cls_client::parent_overlap_get_start(&snap_op, snap_id);
+    } else {
+      cls_client::get_parent_start(&snap_op, snap_id);
+    }
+
+    cls_client::get_flags_start(&snap_op, snap_id);
+    cls_client::get_protection_status_start(&snap_op, snap_id);
+  }
+
+  m_out_bl.clear();
+  auto *aio_comp = create_rados_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_snapshots>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp,
+                                         &snap_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for send_v2_get_snapshots().
+//
+// Walks the reply once per snapshot id, decoding in the same order the
+// sub-operations were appended: snapshot info (new or legacy form), parent
+// overlap, flags and protection status.  Decoding stops at the first
+// error.  Afterwards:
+//   -ENOENT      -> restart from send_v2_get_mutable_metadata()
+//   -EOPNOTSUPP  -> step down to the next legacy snapshot mode and retry
+//   other error  -> finish the request with that error
+//   success      -> continue with send_v2_refresh_parent()
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_snapshots(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl;
+
+ auto it = m_out_bl.cbegin();
+ for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+ if (m_legacy_snapshot != LEGACY_SNAPSHOT_DISABLED) {
+ /// NOTE: remove after Luminous is retired
+ std::string snap_name;
+ if (*result >= 0) {
+ *result = cls_client::get_snapshot_name_finish(&it, &snap_name);
+ }
+
+ uint64_t snap_size;
+ if (*result >= 0) {
+ // the decoded order is discarded; only the size is kept
+ uint8_t order;
+ *result = cls_client::get_size_finish(&it, &snap_size, &order);
+ }
+
+ utime_t snap_timestamp;
+ if (*result >= 0 &&
+ m_legacy_snapshot != LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP) {
+ /// NOTE: remove after Jewel is retired
+ *result = cls_client::get_snapshot_timestamp_finish(&it,
+ &snap_timestamp);
+ }
+
+ // legacy replies are assembled into a user-namespace snapshot entry
+ if (*result >= 0) {
+ m_snap_infos[i] = {m_snapc.snaps[i],
+ {cls::rbd::UserSnapshotNamespace{}},
+ snap_name, snap_size, snap_timestamp, 0};
+ }
+ } else if (*result >= 0) {
+ *result = cls_client::snapshot_get_finish(&it, &m_snap_infos[i]);
+ }
+
+ if (*result == 0) {
+ if (m_legacy_parent) {
+ *result = cls_client::get_parent_finish(&it, &m_snap_parents[i].spec,
+ &m_snap_parents[i].overlap);
+ } else {
+ // the non-legacy reply only carries the overlap; the parent spec is
+ // taken from the HEAD parent when one is set (pool_id > -1)
+ std::optional<uint64_t> parent_overlap;
+ *result = cls_client::parent_overlap_get_finish(&it, &parent_overlap);
+ if (*result == 0 && parent_overlap && m_parent_md.spec.pool_id > -1) {
+ m_snap_parents[i].spec = m_parent_md.spec;
+ m_snap_parents[i].overlap = *parent_overlap;
+ }
+ }
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_flags_finish(&it, &m_snap_flags[i]);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_protection_status_finish(
+ &it, &m_snap_protection[i]);
+ }
+
+ if (*result < 0) {
+ break;
+ }
+ }
+
+ if (*result == -ENOENT) {
+ // snap context no longer matches the image: start over
+ ldout(cct, 10) << "out-of-sync snapshot state detected" << dendl;
+ send_v2_get_mutable_metadata();
+ return nullptr;
+ } else if (m_legacy_snapshot == LEGACY_SNAPSHOT_DISABLED &&
+ *result == -EOPNOTSUPP) {
+ ldout(cct, 10) << "retrying using legacy snapshot methods" << dendl;
+ m_legacy_snapshot = LEGACY_SNAPSHOT_ENABLED;
+ send_v2_get_snapshots();
+ return nullptr;
+ } else if (m_legacy_snapshot == LEGACY_SNAPSHOT_ENABLED &&
+ *result == -EOPNOTSUPP) {
+ ldout(cct, 10) << "retrying using legacy snapshot methods (jewel)" << dendl;
+ m_legacy_snapshot = LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP;
+ send_v2_get_snapshots();
+ return nullptr;
+ } else if (*result < 0) {
+ lderr(cct) << "failed to retrieve snapshots: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ send_v2_refresh_parent();
+ return nullptr;
+}
+
+// Decide whether the parent image needs to be refreshed and, if so, create
+// and start a RefreshParentRequest whose completion is routed to
+// handle_v2_refresh_parent().  Otherwise continue with the exclusive lock
+// initialization step.
+template <typename I>
+void RefreshRequest<I>::send_v2_refresh_parent() {
+ {
+ // the decision is made while holding snap_lock and parent_lock for read;
+ // both are released before any request is sent
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+
+ ParentImageInfo parent_md;
+ MigrationInfo migration_info;
+ int r = get_parent_info(m_image_ctx.snap_id, &parent_md, &migration_info);
+ // a refresh is requested when the parent lookup failed or when
+ // RefreshParentRequest reports one is required -- unless the caller
+ // asked to skip opening the parent
+ if (!m_skip_open_parent_image && (r < 0 ||
+ RefreshParentRequest<I>::is_refresh_required(m_image_ctx, parent_md,
+ migration_info))) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_refresh_parent>(this);
+ m_refresh_parent = RefreshParentRequest<I>::create(
+ m_image_ctx, parent_md, migration_info, ctx);
+ }
+ }
+
+ if (m_refresh_parent != nullptr) {
+ m_refresh_parent->send();
+ } else {
+ send_v2_init_exclusive_lock();
+ }
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_refresh_parent(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to refresh parent image: " << cpp_strerror(*result)
+ << dendl;
+ save_result(result);
+ send_v2_apply();
+ return nullptr;
+ }
+
+ send_v2_init_exclusive_lock();
+ return nullptr;
+}
+
+// Create and initialize an exclusive lock object when the feature is
+// enabled, the image is writable, no snapshot is selected and the image
+// context does not already have one.  In every other case the state
+// machine continues with the object map step.  Completion is routed to
+// handle_v2_init_exclusive_lock().
+template <typename I>
+void RefreshRequest<I>::send_v2_init_exclusive_lock() {
+ if ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0 ||
+ m_image_ctx.read_only || !m_image_ctx.snap_name.empty() ||
+ m_image_ctx.exclusive_lock != nullptr) {
+ send_v2_open_object_map();
+ return;
+ }
+
+ // implies exclusive lock dynamically enabled or image open in-progress
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // TODO need safe shut down
+ m_exclusive_lock = m_image_ctx.create_exclusive_lock();
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_init_exclusive_lock>(this);
+
+ // init() is invoked while holding owner_lock for read
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ m_exclusive_lock->init(m_features, ctx);
+}
+
+// Completion handler for the exclusive lock initialization.  A failure is
+// logged and recorded, but the state machine proceeds to the apply step
+// either way.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_init_exclusive_lock(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool init_failed = (*result < 0);
+  if (init_failed) {
+    lderr(cct) << "failed to initialize exclusive lock: "
+               << cpp_strerror(*result) << dendl;
+    save_result(result);
+  }
+
+  // the object map and journal are opened once the exclusive lock is
+  // acquired (if those features are enabled)
+  send_v2_apply();
+  return nullptr;
+}
+
+// Open a journal when journaling is enabled, the image is a writable HEAD
+// without an attached journal, this client owns the exclusive lock and the
+// journal policy does not disable journaling.  In every other case the
+// state machine continues with send_v2_block_writes().  Completion of the
+// open is routed to handle_v2_open_journal().
+template <typename I>
+void RefreshRequest<I>::send_v2_open_journal() {
+ bool journal_disabled = (
+ (m_features & RBD_FEATURE_JOURNALING) == 0 ||
+ m_image_ctx.read_only ||
+ !m_image_ctx.snap_name.empty() ||
+ m_image_ctx.journal != nullptr ||
+ m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+ bool journal_disabled_by_policy;
+ {
+ // the policy is only consulted (under snap_lock) when the journal is
+ // not already ruled out by the checks above
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ journal_disabled_by_policy = (
+ !journal_disabled &&
+ m_image_ctx.get_journal_policy()->journal_disabled());
+ }
+
+ if (journal_disabled || journal_disabled_by_policy) {
+ // journal dynamically enabled -- doesn't own exclusive lock
+ if ((m_features & RBD_FEATURE_JOURNALING) != 0 &&
+ !journal_disabled_by_policy &&
+ m_image_ctx.exclusive_lock != nullptr &&
+ m_image_ctx.journal == nullptr) {
+ // require the lock for IO in both directions until a journal exists
+ m_image_ctx.io_work_queue->set_require_lock(librbd::io::DIRECTION_BOTH,
+ true);
+ }
+ send_v2_block_writes();
+ return;
+ }
+
+ // implies journal dynamically enabled since ExclusiveLock will init
+ // the journal upon acquiring the lock
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_open_journal>(this);
+
+ // TODO need safe close
+ m_journal = m_image_ctx.create_journal();
+ m_journal->open(ctx);
+}
+
+// Completion handler for the journal open.  A failure is logged and
+// recorded; the state machine continues with send_v2_block_writes() in
+// either case.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_open_journal(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool open_failed = (*result < 0);
+  if (open_failed) {
+    lderr(cct) << "failed to initialize journal: " << cpp_strerror(*result)
+               << dendl;
+    save_result(result);
+  }
+
+  send_v2_block_writes();
+  return nullptr;
+}
+
+// Block writes on the IO work queue when journaling was disabled while the
+// image context still holds a journal; otherwise continue directly with
+// the apply step.  Completion is routed to handle_v2_block_writes().
+template <typename I>
+void RefreshRequest<I>::send_v2_block_writes() {
+ bool disabled_journaling = false;
+ {
+ // exclusive lock still enabled, journaling feature now cleared, but a
+ // journal object is still attached to the image context
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ disabled_journaling = ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 &&
+ (m_features & RBD_FEATURE_JOURNALING) == 0 &&
+ m_image_ctx.journal != nullptr);
+ }
+
+ if (!disabled_journaling) {
+ send_v2_apply();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // we need to block writes temporarily to avoid in-flight journal
+ // writes
+ // NOTE(review): the destructor asserts m_blocked_writes is false again;
+ // the matching unblock is not visible in this function
+ m_blocked_writes = true;
+ Context *ctx = create_context_callback<
+ RefreshRequest<I>, &RefreshRequest<I>::handle_v2_block_writes>(this);
+
+ // block_writes() is invoked while holding owner_lock for read
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ m_image_ctx.io_work_queue->block_writes(ctx);
+}
+
+// Completion handler for the write block.  A failure is logged and
+// recorded; the state machine continues with the apply step in either
+// case.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_block_writes(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool block_failed = (*result < 0);
+  if (block_failed) {
+    lderr(cct) << "failed to block writes: " << cpp_strerror(*result)
+               << dendl;
+    save_result(result);
+  }
+
+  send_v2_apply();
+  return nullptr;
+}
+
+// Create and open an object map for the current HEAD or selected snapshot
+// when the object-map feature is enabled and none is attached yet.  For
+// HEAD this additionally requires a writable image whose exclusive lock is
+// owned by this client.  In every other case (including a snapshot name
+// that cannot be located) the state machine continues with
+// send_v2_open_journal().  Completion of the open is routed to
+// handle_v2_open_object_map().
+template <typename I>
+void RefreshRequest<I>::send_v2_open_object_map() {
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0 ||
+ m_image_ctx.object_map != nullptr ||
+ (m_image_ctx.snap_name.empty() &&
+ (m_image_ctx.read_only ||
+ m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->is_lock_owner()))) {
+ send_v2_open_journal();
+ return;
+ }
+
+ // implies object map dynamically enabled or image open in-progress
+ // since SetSnapRequest loads the object map for a snapshot and
+ // ExclusiveLock loads the object map for HEAD
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ if (m_image_ctx.snap_name.empty()) {
+ m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP);
+ } else {
+ // resolve the selected snapshot name to its id via the freshly
+ // retrieved snapshot infos (same index as in the snap context)
+ for (size_t snap_idx = 0; snap_idx < m_snap_infos.size(); ++snap_idx) {
+ if (m_snap_infos[snap_idx].name == m_image_ctx.snap_name) {
+ m_object_map = m_image_ctx.create_object_map(
+ m_snapc.snaps[snap_idx].val);
+ break;
+ }
+ }
+
+ if (m_object_map == nullptr) {
+ // logged but not recorded as an error; the refresh simply proceeds
+ lderr(cct) << "failed to locate snapshot: " << m_image_ctx.snap_name
+ << dendl;
+ send_v2_open_journal();
+ return;
+ }
+ }
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_open_object_map>(this);
+ m_object_map->open(ctx);
+}
+
+// Completion handler for the object map open.  On failure the object map
+// is discarded and the error is recorded unless it is -EFBIG; the state
+// machine continues with send_v2_open_journal() in either case.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_open_object_map(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool open_failed = (*result < 0);
+  if (open_failed) {
+    lderr(cct) << "failed to open object map: " << cpp_strerror(*result)
+               << dendl;
+
+    // drop the unusable object map
+    delete m_object_map;
+    m_object_map = nullptr;
+
+    const bool ignorable = (*result == -EFBIG);
+    if (!ignorable) {
+      save_result(result);
+    }
+  }
+
+  send_v2_open_journal();
+  return nullptr;
+}
+
+// Queue handle_v2_apply() on the image's op work queue so that the
+// refreshed state is not applied from within a rados callback.
+template <typename I>
+void RefreshRequest<I>::send_v2_apply() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  Context *apply_ctx = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v2_apply>(this);
+  m_image_ctx.op_work_queue->queue(apply_ctx, 0);
+}
+
+// Work queue callback: applies the refreshed state to the image context and
+// then continues with the parent finalization step. The incoming result is
+// not inspected.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_apply(int *result) {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  apply();
+
+  Context *next_ctx = send_v2_finalize_refresh_parent();
+  return next_ctx;
+}
+
+// Finalizes the pending parent refresh, if there is one. Returns nullptr
+// while the asynchronous finalize is outstanding; without a parent refresh
+// it falls straight through to the exclusive lock shut down step.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_finalize_refresh_parent() {
+  if (m_refresh_parent != nullptr) {
+    ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+    auto *finalize_ctx = create_context_callback<
+      RefreshRequest<I>,
+      &RefreshRequest<I>::handle_v2_finalize_refresh_parent>(this);
+    m_refresh_parent->finalize(finalize_ctx);
+    return nullptr;
+  }
+
+  return send_v2_shut_down_exclusive_lock();
+}
+
+// Completion of the parent refresh finalization: releases the parent
+// refresh request and proceeds to the exclusive lock shut down step. The
+// result is logged but not otherwise acted upon.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_finalize_refresh_parent(int *result) {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": r=" << *result
+                             << dendl;
+
+  ceph_assert(m_refresh_parent != nullptr);
+  delete m_refresh_parent;
+  m_refresh_parent = nullptr;
+
+  Context *next_ctx = send_v2_shut_down_exclusive_lock();
+  return next_ctx;
+}
+
+// Shuts down the exclusive lock instance held by this request, if any.
+// Returns nullptr while the shut down is in flight; with no lock to shut
+// down it continues directly with the journal close step.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_shut_down_exclusive_lock() {
+  if (m_exclusive_lock != nullptr) {
+    ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+    // exclusive lock feature was dynamically disabled. in-flight IO will be
+    // flushed and in-flight requests will be canceled before releasing lock
+    auto *shut_down_ctx = create_context_callback<
+      RefreshRequest<I>,
+      &RefreshRequest<I>::handle_v2_shut_down_exclusive_lock>(this);
+    m_exclusive_lock->shut_down(shut_down_ctx);
+    return nullptr;
+  }
+
+  return send_v2_close_journal();
+}
+
+// Completion of the exclusive lock shut down. A failure is logged and saved
+// as the request error; the lock instance is destroyed either way and the
+// state machine moves on to closing the journal.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_shut_down_exclusive_lock(int *result) {
+  auto *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool shut_down_failed = (*result < 0);
+  if (shut_down_failed) {
+    lderr(cct) << "failed to shut down exclusive lock: "
+               << cpp_strerror(*result) << dendl;
+    save_result(result);
+  }
+
+  {
+    // the image context must no longer reference an exclusive lock
+    RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
+    ceph_assert(m_image_ctx.exclusive_lock == nullptr);
+  }
+
+  ceph_assert(m_exclusive_lock != nullptr);
+  delete m_exclusive_lock;
+  m_exclusive_lock = nullptr;
+
+  Context *next_ctx = send_v2_close_journal();
+  return next_ctx;
+}
+
+// Closes the journal instance held by this request, if any. Returns nullptr
+// while the close is in flight; with no journal to close it continues
+// directly with the object map close step.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_close_journal() {
+  if (m_journal != nullptr) {
+    ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+    // journal feature was dynamically disabled
+    auto *close_ctx = create_context_callback<
+      RefreshRequest<I>, &RefreshRequest<I>::handle_v2_close_journal>(this);
+    m_journal->close(close_ctx);
+    return nullptr;
+  }
+
+  return send_v2_close_object_map();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_close_journal(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ save_result(result);
+ lderr(cct) << "failed to close journal: " << cpp_strerror(*result)
+ << dendl;
+ }
+
+ ceph_assert(m_journal != nullptr);
+ delete m_journal;
+ m_journal = nullptr;
+
+ ceph_assert(m_blocked_writes);
+ m_blocked_writes = false;
+
+ m_image_ctx.io_work_queue->unblock_writes();
+ return send_v2_close_object_map();
+}
+
+// Closes the object map instance held by this request, if any. Returns
+// nullptr while the close is in flight; with no object map to close it
+// continues directly with the flush step.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_close_object_map() {
+  if (m_object_map != nullptr) {
+    ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+    // object map was dynamically disabled
+    auto *close_ctx = create_context_callback<
+      RefreshRequest<I>,
+      &RefreshRequest<I>::handle_v2_close_object_map>(this);
+    m_object_map->close(close_ctx);
+    return nullptr;
+  }
+
+  return send_flush_aio();
+}
+
+// Completion of the object map close. A failure is only logged (it is not
+// saved as the request error); the object map instance is destroyed and the
+// state machine proceeds to the flush step.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_close_object_map(int *result) {
+  auto *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool close_failed = (*result < 0);
+  if (close_failed) {
+    lderr(cct) << "failed to close object map: " << cpp_strerror(*result)
+               << dendl;
+  }
+
+  ceph_assert(m_object_map != nullptr);
+  delete m_object_map;
+  m_object_map = nullptr;
+
+  Context *next_ctx = send_flush_aio();
+  return next_ctx;
+}
+
+// Final step of the refresh state machine.
+//
+// - A partial refresh with no other error is reported as -ERESTART.
+// - When m_flush_aio is set, an internal flush request is dispatched and
+//   nullptr is returned; handle_flush_aio() completes the request.
+// - Otherwise a saved error is delivered through handle_error() from the op
+//   work queue (nullptr returned), or m_on_finish is returned directly when
+//   there is no error.
+template <typename I>
+Context *RefreshRequest<I>::send_flush_aio() {
+  if (m_incomplete_update && m_error_result == 0) {
+    // if this was a partial refresh, notify ImageState
+    m_error_result = -ERESTART;
+  }
+
+  if (!m_flush_aio) {
+    if (m_error_result >= 0) {
+      return m_on_finish;
+    }
+
+    // propagate saved error back to caller
+    auto *error_ctx = create_context_callback<
+      RefreshRequest<I>, &RefreshRequest<I>::handle_error>(this);
+    m_image_ctx.op_work_queue->queue(error_ctx, 0);
+    return nullptr;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  auto flush_ctx = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_flush_aio>(this);
+  auto flush_comp = io::AioCompletion::create_and_start(
+    flush_ctx, util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH);
+  auto flush_req = io::ImageDispatchSpec<I>::create_flush_request(
+    m_image_ctx, flush_comp, io::FLUSH_SOURCE_INTERNAL, {});
+  flush_req->send();
+  delete flush_req;
+  return nullptr;
+}
+
+// Completion of the internal flush. A flush failure is logged and the final
+// result is then resolved by handle_error().
+template <typename I>
+Context *RefreshRequest<I>::handle_flush_aio(int *result) {
+  auto *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool flush_failed = (*result < 0);
+  if (flush_failed) {
+    lderr(cct) << "failed to flush pending AIO: " << cpp_strerror(*result)
+               << dendl;
+  }
+
+  Context *finish_ctx = handle_error(result);
+  return finish_ctx;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_error(int *result) {
+ if (m_error_result < 0) {
+ *result = m_error_result;
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+ }
+ return m_on_finish;
+}
+
+// Publishes the state collected by this request into m_image_ctx.
+//
+// Runs with the image's owner, md, snap and parent locks held for write.
+// It copies size/lock/feature metadata, rebuilds the snapshot tables,
+// updates the parent/migration info and finally exchanges the exclusive
+// lock, journal and object map instances between the request and the image
+// context so that dynamically enabled instances are installed and
+// dynamically disabled ones are handed back to the request for tear down.
+// Sets m_flush_aio when a snapshot id is seen that the image context did
+// not previously know about.
+template <typename I>
+void RefreshRequest<I>::apply() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker md_locker(m_image_ctx.md_lock);
+
+  {
+    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::WLocker parent_locker(m_image_ctx.parent_lock);
+
+    m_image_ctx.size = m_size;
+    m_image_ctx.lockers = m_lockers;
+    m_image_ctx.lock_tag = m_lock_tag;
+    m_image_ctx.exclusive_locked = m_exclusive_locked;
+
+    // first destination snap id of each migration snap_map entry -> the
+    // entry's key; only populated while a migration is in effect
+    std::map<uint64_t, uint64_t> migration_reverse_snap_seq;
+
+    if (m_image_ctx.old_format) {
+      m_image_ctx.order = m_order;
+      m_image_ctx.features = 0;
+      m_image_ctx.flags = 0;
+      m_image_ctx.op_features = 0;
+      m_image_ctx.operations_disabled = false;
+      m_image_ctx.object_prefix = std::move(m_object_prefix);
+      m_image_ctx.init_layout(m_image_ctx.md_ctx.get_id());
+    } else {
+      // HEAD revision doesn't have a defined overlap so it's only
+      // applicable to snapshots
+      if (!m_head_parent_overlap) {
+        m_parent_md = {};
+      }
+
+      m_image_ctx.features = m_features;
+      m_image_ctx.flags = m_flags;
+      m_image_ctx.op_features = m_op_features;
+      m_image_ctx.operations_disabled = (
+        (m_op_features & ~RBD_OPERATION_FEATURES_ALL) != 0ULL);
+      m_image_ctx.group_spec = m_group_spec;
+
+      bool migration_info_valid;
+      int r = get_migration_info(&m_image_ctx.parent_md,
+                                 &m_image_ctx.migration_info,
+                                 &migration_info_valid);
+      ceph_assert(r == 0); // validated in refresh parent step
+
+      if (migration_info_valid) {
+        // bind by reference: each entry carries a snapshot id sequence that
+        // would otherwise be copied on every iteration
+        for (const auto& snap_entry : m_image_ctx.migration_info.snap_map) {
+          migration_reverse_snap_seq[snap_entry.second.front()] =
+            snap_entry.first;
+        }
+      } else {
+        m_image_ctx.parent_md = m_parent_md;
+        m_image_ctx.migration_info = {};
+      }
+    }
+
+    // any snapshot id not yet known to the image context requires a flush
+    for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+      std::vector<librados::snap_t>::const_iterator it = std::find(
+        m_image_ctx.snaps.begin(), m_image_ctx.snaps.end(),
+        m_snapc.snaps[i].val);
+      if (it == m_image_ctx.snaps.end()) {
+        m_flush_aio = true;
+        ldout(cct, 20) << "new snapshot id=" << m_snapc.snaps[i].val
+                       << " name=" << m_snap_infos[i].name
+                       << " size=" << m_snap_infos[i].image_size
+                       << dendl;
+      }
+    }
+
+    // rebuild the snapshot tables from scratch
+    m_image_ctx.snaps.clear();
+    m_image_ctx.snap_info.clear();
+    m_image_ctx.snap_ids.clear();
+    auto overlap = m_image_ctx.parent_md.overlap;
+    for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+      uint64_t flags = m_image_ctx.old_format ? 0 : m_snap_flags[i];
+      uint8_t protection_status = m_image_ctx.old_format ?
+        static_cast<uint8_t>(RBD_PROTECTION_STATUS_UNPROTECTED) :
+        m_snap_protection[i];
+      ParentImageInfo parent;
+      if (!m_image_ctx.old_format) {
+        if (!m_image_ctx.migration_info.empty()) {
+          parent = m_image_ctx.parent_md;
+          auto it = migration_reverse_snap_seq.find(m_snapc.snaps[i].val);
+          if (it != migration_reverse_snap_seq.end()) {
+            // snapshot has a reverse-map entry: use the mapped snap id and
+            // the snapshot's full size as overlap
+            parent.spec.snap_id = it->second;
+            parent.overlap = m_snap_infos[i].image_size;
+          } else {
+            // no reverse-map entry: clamp the running overlap to this
+            // snapshot's size
+            overlap = std::min(overlap, m_snap_infos[i].image_size);
+            parent.overlap = overlap;
+          }
+        } else {
+          parent = m_snap_parents[i];
+        }
+      }
+      m_image_ctx.add_snap(m_snap_infos[i].snapshot_namespace,
+                           m_snap_infos[i].name, m_snapc.snaps[i].val,
+                           m_snap_infos[i].image_size, parent,
+                           protection_status, flags,
+                           m_snap_infos[i].timestamp);
+    }
+    m_image_ctx.parent_md.overlap = std::min(overlap, m_image_ctx.size);
+    m_image_ctx.snapc = m_snapc;
+
+    if (m_image_ctx.snap_id != CEPH_NOSNAP &&
+        m_image_ctx.get_snap_id(m_image_ctx.snap_namespace,
+                                m_image_ctx.snap_name) != m_image_ctx.snap_id) {
+      lderr(cct) << "tried to read from a snapshot that no longer exists: "
+                 << m_image_ctx.snap_name << dendl;
+      m_image_ctx.snap_exists = false;
+    }
+
+    if (m_refresh_parent != nullptr) {
+      m_refresh_parent->apply();
+    }
+    if (m_image_ctx.data_ctx.is_valid()) {
+      m_image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(m_image_ctx.snapc.seq,
+                                                          m_image_ctx.snaps);
+    }
+
+    // handle dynamically enabled / disabled features
+    if (m_image_ctx.exclusive_lock != nullptr &&
+        !m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK,
+                                   m_image_ctx.snap_lock)) {
+      // disabling exclusive lock will automatically handle closing
+      // object map and journaling
+      ceph_assert(m_exclusive_lock == nullptr);
+      m_exclusive_lock = m_image_ctx.exclusive_lock;
+    } else {
+      if (m_exclusive_lock != nullptr) {
+        ceph_assert(m_image_ctx.exclusive_lock == nullptr);
+        std::swap(m_exclusive_lock, m_image_ctx.exclusive_lock);
+      }
+      if (!m_image_ctx.test_features(RBD_FEATURE_JOURNALING,
+                                     m_image_ctx.snap_lock)) {
+        if (!m_image_ctx.clone_copy_on_read && m_image_ctx.journal != nullptr) {
+          m_image_ctx.io_work_queue->set_require_lock(io::DIRECTION_READ,
+                                                      false);
+        }
+        std::swap(m_journal, m_image_ctx.journal);
+      } else if (m_journal != nullptr) {
+        std::swap(m_journal, m_image_ctx.journal);
+      }
+      if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                     m_image_ctx.snap_lock) ||
+          m_object_map != nullptr) {
+        std::swap(m_object_map, m_image_ctx.object_map);
+      }
+    }
+  }
+}
+
+// Resolves the parent/migration info that applies to snap_id.
+//
+// When migration info is in effect it takes precedence and the outputs are
+// whatever get_migration_info() filled in. Otherwise the HEAD parent is
+// used for CEPH_NOSNAP and the per-snapshot parent for a known snapshot id;
+// *migration_info is reset in both cases.
+//
+// Returns 0 on success, a negative error from get_migration_info(), or
+// -ENOENT when snap_id is not part of the refreshed snapshot context.
+template <typename I>
+int RefreshRequest<I>::get_parent_info(uint64_t snap_id,
+                                       ParentImageInfo *parent_md,
+                                       MigrationInfo *migration_info) {
+  bool have_migration_info = false;
+  int r = get_migration_info(parent_md, migration_info, &have_migration_info);
+  if (r < 0) {
+    return r;
+  }
+  if (have_migration_info) {
+    return 0;
+  }
+
+  if (snap_id == CEPH_NOSNAP) {
+    *parent_md = m_parent_md;
+    *migration_info = {};
+    return 0;
+  }
+
+  const size_t snap_count = m_snapc.snaps.size();
+  for (size_t idx = 0; idx < snap_count; ++idx) {
+    if (m_snapc.snaps[idx].val != snap_id) {
+      continue;
+    }
+    *parent_md = m_snap_parents[idx];
+    *migration_info = {};
+    return 0;
+  }
+  return -ENOENT;
+}
+
+// Derives the parent spec and migration info from m_migration_spec.
+//
+// Migration info is only produced when the spec is a destination header in
+// the PREPARED, EXECUTING or ABORTING state. In every other case
+// *migration_info_valid is set to false and the other outputs are left
+// untouched, unless the spec is judged invalid, in which case -EINVAL is
+// returned without writing *migration_info_valid.
+//
+// Returns 0 on success or -EINVAL for an invalid migration spec.
+template <typename I>
+int RefreshRequest<I>::get_migration_info(ParentImageInfo *parent_md,
+ MigrationInfo *migration_info,
+ bool* migration_info_valid) {
+ CephContext *cct = m_image_ctx.cct;
+ if (m_migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST ||
+ (m_migration_spec.state != cls::rbd::MIGRATION_STATE_PREPARED &&
+ m_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTING &&
+ m_migration_spec.state != cls::rbd::MIGRATION_STATE_ABORTING)) {
+ // not an active destination: a spec that is neither a source header, nor
+ // has pool_id == -1, nor is in the EXECUTED state is rejected
+ if (m_migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_SRC &&
+ m_migration_spec.pool_id != -1 &&
+ m_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTED) {
+ lderr(cct) << this << " " << __func__ << ": invalid migration spec"
+ << dendl;
+ return -EINVAL;
+ }
+
+ *migration_info_valid = false;
+ return 0;
+ }
+
+ // the migration source image acts as the parent of the destination
+ parent_md->spec.pool_id = m_migration_spec.pool_id;
+ parent_md->spec.pool_namespace = m_migration_spec.pool_namespace;
+ parent_md->spec.image_id = m_migration_spec.image_id;
+ parent_md->spec.snap_id = CEPH_NOSNAP;
+ parent_md->overlap = std::min(m_size, m_migration_spec.overlap);
+
+ auto snap_seqs = m_migration_spec.snap_seqs;
+ // If new snapshots have been created on destination image after
+ // migration started, map the source CEPH_NOSNAP to the earliest of
+ // these snapshots.
+ snapid_t snap_id = snap_seqs.empty() ? 0 : snap_seqs.rbegin()->second;
+ auto it = std::upper_bound(m_snapc.snaps.rbegin(), m_snapc.snaps.rend(),
+ snap_id);
+ if (it != m_snapc.snaps.rend()) {
+ snap_seqs[CEPH_NOSNAP] = *it;
+ } else {
+ snap_seqs[CEPH_NOSNAP] = CEPH_NOSNAP;
+ }
+
+ // collect every destination snap id referenced by the mapping
+ std::set<uint64_t> snap_ids;
+ for (auto& it : snap_seqs) {
+ snap_ids.insert(it.second);
+ }
+ // overlap is the largest image size among the mapped snapshots; the HEAD
+ // overlap only counts when CEPH_NOSNAP itself is a mapping target
+ uint64_t overlap = snap_ids.find(CEPH_NOSNAP) != snap_ids.end() ?
+ parent_md->overlap : 0;
+ for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+ if (snap_ids.find(m_snapc.snaps[i].val) != snap_ids.end()) {
+ overlap = std::max(overlap, m_snap_infos[i].image_size);
+ }
+ }
+
+ *migration_info = {m_migration_spec.pool_id, m_migration_spec.pool_namespace,
+ m_migration_spec.image_name, m_migration_spec.image_id, {},
+ overlap, m_migration_spec.flatten};
+ *migration_info_valid = true;
+
+ // fill in the snap_map member that was left empty in the initializer above
+ deep_copy::util::compute_snap_map(m_image_ctx.cct, 0, CEPH_NOSNAP, {},
+ snap_seqs, &migration_info->snap_map);
+ return 0;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RefreshRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RefreshRequest.h b/src/librbd/image/RefreshRequest.h
new file mode 100644
index 00000000..0a0f1e8d
--- /dev/null
+++ b/src/librbd/image/RefreshRequest.h
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/utime.h"
+#include "common/snap_types.h"
+#include "cls/lock/cls_lock_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <string>
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template<typename> class RefreshParentRequest;
+
+template<typename ImageCtxT = ImageCtx>
+class RefreshRequest {
+public:
+ static RefreshRequest *create(ImageCtxT &image_ctx, bool acquiring_lock,
+ bool skip_open_parent, Context *on_finish) {
+ return new RefreshRequest(image_ctx, acquiring_lock, skip_open_parent,
+ on_finish);
+ }
+
+ RefreshRequest(ImageCtxT &image_ctx, bool acquiring_lock,
+ bool skip_open_parent, Context *on_finish);
+ ~RefreshRequest();
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> < * * * * * * * * * * * * * * * * * * * * * * * * * * (ENOENT)
+ * ^ | *
+ * * | (v1) *
+ * * |-----> V1_READ_HEADER -------------> GET_MIGRATION_HEADER (skip if not
+ * * | | migrating)
+ * * | (v2) v
+ * * \-----> V2_GET_MUTABLE_METADATA V1_GET_SNAPSHOTS
+ * * | |
+ * * | -EOPNOTSUPP v
+ * * | * * * V1_GET_LOCKS
+ * * | * * |
+ * * v v * v
+ * * V2_GET_PARENT <apply>
+ * * | |
+ * * v |
+ * * * * * * GET_MIGRATION_HEADER (skip if not |
+ * (ENOENT) | migrating) |
+ * v |
+ * V2_GET_METADATA |
+ * | |
+ * v |
+ * V2_GET_POOL_METADATA |
+ * | |
+ * v (skip if not enabled) |
+ * V2_GET_OP_FEATURES |
+ * | |
+ * v |
+ * V2_GET_GROUP |
+ * | |
+ * | -EOPNOTSUPP |
+ * | * * * * |
+ * | * * |
+ * v v * |
+ * V2_GET_SNAPSHOTS (skip if no snaps) |
+ * | |
+ * v |
+ * V2_REFRESH_PARENT (skip if no parent or |
+ * | refresh not needed) |
+ * v |
+ * V2_INIT_EXCLUSIVE_LOCK (skip if lock |
+ * | active or disabled) |
+ * v |
+ * V2_OPEN_OBJECT_MAP (skip if map |
+ * | active or disabled) |
+ * v |
+ * V2_OPEN_JOURNAL (skip if journal |
+ * | active or disabled) |
+ * v |
+ * V2_BLOCK_WRITES (skip if journal not |
+ * | disabled) |
+ * v |
+ * <apply> |
+ * | |
+ * v |
+ * V2_FINALIZE_REFRESH_PARENT (skip if refresh |
+ * | not needed) |
+ * (error) v |
+ * * * * * > V2_SHUT_DOWN_EXCLUSIVE_LOCK (skip if lock |
+ * | active or enabled) |
+ * v |
+ * V2_CLOSE_JOURNAL (skip if journal inactive |
+ * | or enabled) |
+ * v |
+ * V2_CLOSE_OBJECT_MAP (skip if map inactive |
+ * | or enabled) |
+ * | |
+ * \-------------------\/--------------------/
+ * |
+ * v
+ * FLUSH (skip if no new
+ * | snapshots)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ enum LegacySnapshot {
+ LEGACY_SNAPSHOT_DISABLED,
+ LEGACY_SNAPSHOT_ENABLED,
+ LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP
+ };
+
+ ImageCtxT &m_image_ctx;
+ bool m_acquiring_lock;
+ bool m_skip_open_parent_image;
+ Context *m_on_finish;
+
+ cls::rbd::MigrationSpec m_migration_spec;
+ int m_error_result;
+ bool m_flush_aio;
+ decltype(m_image_ctx.exclusive_lock) m_exclusive_lock;
+ decltype(m_image_ctx.object_map) m_object_map;
+ decltype(m_image_ctx.journal) m_journal;
+ RefreshParentRequest<ImageCtxT> *m_refresh_parent;
+
+ bufferlist m_out_bl;
+
+ bool m_legacy_parent = false;
+ LegacySnapshot m_legacy_snapshot = LEGACY_SNAPSHOT_DISABLED;
+
+ uint8_t m_order = 0;
+ uint64_t m_size = 0;
+ uint64_t m_features = 0;
+ uint64_t m_incompatible_features = 0;
+ uint64_t m_flags = 0;
+ uint64_t m_op_features = 0;
+
+ librados::IoCtx m_pool_metadata_io_ctx;
+ std::string m_last_metadata_key;
+ std::map<std::string, bufferlist> m_metadata;
+
+ std::string m_object_prefix;
+ ParentImageInfo m_parent_md;
+ bool m_head_parent_overlap = false;
+ cls::rbd::GroupSpec m_group_spec;
+
+ ::SnapContext m_snapc;
+ std::vector<cls::rbd::SnapshotInfo> m_snap_infos;
+ std::vector<ParentImageInfo> m_snap_parents;
+ std::vector<uint8_t> m_snap_protection;
+ std::vector<uint64_t> m_snap_flags;
+
+ std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> m_lockers;
+ std::string m_lock_tag;
+ bool m_exclusive_locked = false;
+
+ bool m_blocked_writes = false;
+ bool m_incomplete_update = false;
+
+ void send_get_migration_header();
+ Context *handle_get_migration_header(int *result);
+
+ void send_v1_read_header();
+ Context *handle_v1_read_header(int *result);
+
+ void send_v1_get_snapshots();
+ Context *handle_v1_get_snapshots(int *result);
+
+ void send_v1_get_locks();
+ Context *handle_v1_get_locks(int *result);
+
+ void send_v1_apply();
+ Context *handle_v1_apply(int *result);
+
+ void send_v2_get_mutable_metadata();
+ Context *handle_v2_get_mutable_metadata(int *result);
+
+ void send_v2_get_parent();
+ Context *handle_v2_get_parent(int *result);
+
+ void send_v2_get_metadata();
+ Context *handle_v2_get_metadata(int *result);
+
+ void send_v2_get_pool_metadata();
+ Context *handle_v2_get_pool_metadata(int *result);
+
+ void send_v2_get_op_features();
+ Context *handle_v2_get_op_features(int *result);
+
+ void send_v2_get_group();
+ Context *handle_v2_get_group(int *result);
+
+ void send_v2_get_snapshots();
+ Context *handle_v2_get_snapshots(int *result);
+
+ void send_v2_get_snapshots_legacy();
+ Context *handle_v2_get_snapshots_legacy(int *result);
+
+ void send_v2_refresh_parent();
+ Context *handle_v2_refresh_parent(int *result);
+
+ void send_v2_init_exclusive_lock();
+ Context *handle_v2_init_exclusive_lock(int *result);
+
+ void send_v2_open_journal();
+ Context *handle_v2_open_journal(int *result);
+
+ void send_v2_block_writes();
+ Context *handle_v2_block_writes(int *result);
+
+ void send_v2_open_object_map();
+ Context *handle_v2_open_object_map(int *result);
+
+ void send_v2_apply();
+ Context *handle_v2_apply(int *result);
+
+ Context *send_v2_finalize_refresh_parent();
+ Context *handle_v2_finalize_refresh_parent(int *result);
+
+ Context *send_v2_shut_down_exclusive_lock();
+ Context *handle_v2_shut_down_exclusive_lock(int *result);
+
+ Context *send_v2_close_journal();
+ Context *handle_v2_close_journal(int *result);
+
+ Context *send_v2_close_object_map();
+ Context *handle_v2_close_object_map(int *result);
+
+ Context *send_flush_aio();
+ Context *handle_flush_aio(int *result);
+
+ Context *handle_error(int *result);
+
+ void save_result(int *result) {
+ if (m_error_result == 0 && *result < 0) {
+ m_error_result = *result;
+ }
+ }
+
+ void apply();
+ int get_parent_info(uint64_t snap_id, ParentImageInfo *parent_md,
+ MigrationInfo *migration_info);
+ int get_migration_info(ParentImageInfo *parent_md,
+ MigrationInfo *migration_info,
+ bool* migration_info_valid);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::RefreshRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
diff --git a/src/librbd/image/RemoveRequest.cc b/src/librbd/image/RemoveRequest.cc
new file mode 100644
index 00000000..8e029f8b
--- /dev/null
+++ b/src/librbd/image/RemoveRequest.cc
@@ -0,0 +1,605 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/RemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/internal.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/image/DetachChildRequest.h"
+#include "librbd/image/PreRemoveRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/operation/TrimRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::RemoveRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using librados::IoCtx;
+using util::create_context_callback;
+using util::create_async_context_callback;
+using util::create_rados_callback;
+
+// Constructs a removal request for an image identified by name and/or id.
+// The image is opened by the request itself (see open_image()).
+template<typename I>
+RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, const std::string &image_name,
+                                const std::string &image_id, bool force,
+                                bool from_trash_remove,
+                                ProgressContext &prog_ctx,
+                                ContextWQ *op_work_queue, Context *on_finish)
+  : m_ioctx(ioctx),
+    m_image_name(image_name),
+    m_image_id(image_id),
+    m_force(force),
+    m_from_trash_remove(from_trash_remove),
+    m_prog_ctx(prog_ctx),
+    m_op_work_queue(op_work_queue),
+    m_on_finish(on_finish),
+    m_cct(reinterpret_cast<CephContext *>(ioctx.cct())) {
+}
+
+// Constructs a removal request for an already opened image. Name, id,
+// header object name and format are taken from the supplied image context.
+template<typename I>
+RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, I *image_ctx, bool force,
+                                bool from_trash_remove,
+                                ProgressContext &prog_ctx,
+                                ContextWQ *op_work_queue, Context *on_finish)
+  : m_ioctx(ioctx),
+    m_image_name(image_ctx->name),
+    m_image_id(image_ctx->id),
+    m_image_ctx(image_ctx),
+    m_force(force),
+    m_from_trash_remove(from_trash_remove),
+    m_prog_ctx(prog_ctx),
+    m_op_work_queue(op_work_queue),
+    m_on_finish(on_finish),
+    m_cct(image_ctx->cct),
+    m_header_oid(image_ctx->header_oid),
+    m_old_format(image_ctx->old_format),
+    m_unknown_format(false) {
+}
+
+// Entry point of the removal state machine: starts with opening the image.
+template<typename I>
+void RemoveRequest<I>::send() {
+  ldout(m_cct, 20) << dendl;
+  open_image();
+}
+
+// Opens the image unless the caller already supplied an image context, in
+// which case the pre-remove step runs immediately. The image is opened by
+// id when one is known and by name otherwise, without opening its parent.
+template<typename I>
+void RemoveRequest<I>::open_image() {
+  if (m_image_ctx == nullptr) {
+    // only pass the name when there is no id to open by
+    std::string open_name;
+    if (m_image_id.empty()) {
+      open_name = m_image_name;
+    }
+    m_image_ctx = I::create(open_name, m_image_id, nullptr, m_ioctx, false);
+
+    ldout(m_cct, 20) << dendl;
+
+    auto *open_ctx = create_context_callback<
+      RemoveRequest<I>, &RemoveRequest<I>::handle_open_image>(this);
+    m_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, open_ctx);
+    return;
+  }
+
+  pre_remove_image();
+}
+
+// Completion of the image open.
+//
+// On success the identifying fields are copied from the image context and
+// the pre-remove step runs. On failure the image context is destroyed;
+// -ENOENT falls back to remove_image(), any other error finishes the
+// request.
+template<typename I>
+void RemoveRequest<I>::handle_open_image(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    m_image_id = m_image_ctx->id;
+    m_image_name = m_image_ctx->name;
+    m_header_oid = m_image_ctx->header_oid;
+    m_old_format = m_image_ctx->old_format;
+    m_unknown_format = false;
+
+    pre_remove_image();
+    return;
+  }
+
+  m_image_ctx->destroy();
+  m_image_ctx = nullptr;
+
+  if (r == -ENOENT) {
+    remove_image();
+    return;
+  }
+
+  lderr(m_cct) << "error opening image: " << cpp_strerror(r) << dendl;
+  finish(r);
+}
+
+// Runs a PreRemoveRequest against the opened image, forwarding the force
+// flag.
+template<typename I>
+void RemoveRequest<I>::pre_remove_image() {
+  ldout(m_cct, 5) << dendl;
+
+  using klass = RemoveRequest<I>;
+  Context *on_pre_remove = create_context_callback<
+    klass, &klass::handle_pre_remove_image>(this);
+  PreRemoveRequest<I>::create(m_image_ctx, m_force, on_pre_remove)->send();
+}
+
+// Completion of the pre-remove step. A failure closes the image with the
+// error. Otherwise the image data is trimmed, or, when the image has no
+// valid data IoCtx, trimming is skipped and the child is detached.
+template<typename I>
+void RemoveRequest<I>::handle_pre_remove_image(int r) {
+  ldout(m_cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    send_close_image(r);
+  } else if (!m_image_ctx->data_ctx.is_valid()) {
+    detach_child();
+  } else {
+    trim_image();
+  }
+}
+
+// Trims the image from its current size down to zero, reporting progress
+// through m_prog_ctx. The completion is wrapped in an async context
+// callback.
+template<typename I>
+void RemoveRequest<I>::trim_image() {
+  ldout(m_cct, 20) << dendl;
+
+  Context *on_trim = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_trim_image>(this);
+  Context *async_on_trim = create_async_context_callback(*m_image_ctx,
+                                                         on_trim);
+
+  RWLock::RLocker owner_lock(m_image_ctx->owner_lock);
+  auto trim_req = librbd::operation::TrimRequest<I>::create(
+    *m_image_ctx, async_on_trim, m_image_ctx->size, 0, m_prog_ctx);
+  trim_req->send();
+}
+
+// Completion of the trim. A failure, or an old-format image, goes straight
+// to closing the image; otherwise the child is detached next.
+template<typename I>
+void RemoveRequest<I>::handle_trim_image(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool trim_failed = (r < 0);
+  if (trim_failed) {
+    lderr(m_cct) << "failed to remove some object(s): "
+                 << cpp_strerror(r) << dendl;
+  }
+
+  if (trim_failed || m_old_format) {
+    send_close_image(r);
+    return;
+  }
+
+  detach_child();
+}
+
+// Runs a DetachChildRequest against the opened image.
+template<typename I>
+void RemoveRequest<I>::detach_child() {
+  ldout(m_cct, 20) << dendl;
+
+  using klass = RemoveRequest<I>;
+  Context *on_detach = create_context_callback<
+    klass, &klass::handle_detach_child>(this);
+  DetachChildRequest<I>::create(*m_image_ctx, on_detach)->send();
+}
+
+// Completion of the child detach. A failure closes the image with the
+// error; success continues with disabling mirroring.
+template<typename I>
+void RemoveRequest<I>::handle_detach_child(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    send_disable_mirror();
+    return;
+  }
+
+  lderr(m_cct) << "failed to detach child from parent: "
+               << cpp_strerror(r) << dendl;
+  send_close_image(r);
+}
+
+// Runs a mirror::DisableRequest for the image. The request receives
+// m_force and its negation as its two boolean arguments.
+template<typename I>
+void RemoveRequest<I>::send_disable_mirror() {
+  ldout(m_cct, 20) << dendl;
+
+  auto *on_disable = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_disable_mirror>(this);
+
+  auto *disable_req = mirror::DisableRequest<I>::create(
+    m_image_ctx, m_force, !m_force, on_disable);
+  disable_req->send();
+}
+
+// Completion of the mirror disable. -EOPNOTSUPP is treated as success; any
+// other error is logged and carried into the image close step.
+template<typename I>
+void RemoveRequest<I>::handle_disable_mirror(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool unsupported = (r == -EOPNOTSUPP);
+  if (!unsupported && r < 0) {
+    lderr(m_cct) << "error disabling image mirroring: "
+                 << cpp_strerror(r) << dendl;
+  }
+
+  send_close_image(unsupported ? 0 : r);
+}
+
+// Remembers r as the pending result of the request and closes the image.
+template<typename I>
+void RemoveRequest<I>::send_close_image(int r) {
+  ldout(m_cct, 20) << dendl;
+
+  // evaluated again once the close completes
+  m_ret_val = r;
+
+  auto *on_close = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_send_close_image>(this);
+  m_image_ctx->state->close(on_close);
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_send_close_image(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error encountered while closing image: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ m_image_ctx->destroy();
+ m_image_ctx = nullptr;
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ finish(r);
+ return;
+ }
+
+ remove_header();
+}
+
+// Issues an asynchronous removal of the image header object.
+template<typename I>
+void RemoveRequest<I>::remove_header() {
+  ldout(m_cct, 20) << dendl;
+
+  auto *remove_comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_remove_header>(this);
+  int r = m_ioctx.aio_remove(m_header_oid, remove_comp);
+  ceph_assert(r == 0);
+  remove_comp->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_remove_header(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error removing header: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ }
+
+ remove_image();
+}
+
+// Issues an asynchronous removal of the v2 header object, deriving the
+// object name from the image id when it is not known yet.
+template<typename I>
+void RemoveRequest<I>::remove_header_v2() {
+  ldout(m_cct, 20) << dendl;
+
+  if (m_header_oid.empty()) {
+    m_header_oid = util::header_name(m_image_id);
+  }
+
+  auto *remove_comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_remove_header_v2>(this);
+  int r = m_ioctx.aio_remove(m_header_oid, remove_comp);
+  ceph_assert(r == 0);
+  remove_comp->release();
+}
+
+// Completion of the v2 header removal. -ENOENT is tolerated; any other
+// error finishes the request. Success continues with the journal removal.
+template<typename I>
+void RemoveRequest<I>::handle_remove_header_v2(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool fatal = (r < 0 && r != -ENOENT);
+  if (!fatal) {
+    send_journal_remove();
+    return;
+  }
+
+  lderr(m_cct) << "error removing header: " << cpp_strerror(r) << dendl;
+  finish(r);
+}
+
+// Runs a journal::RemoveRequest for the image journal, registered under the
+// image client id, on the op work queue.
+template<typename I>
+void RemoveRequest<I>::send_journal_remove() {
+  ldout(m_cct, 20) << dendl;
+
+  auto *on_journal_removed = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_journal_remove>(this);
+
+  auto *journal_req = journal::RemoveRequest<I>::create(
+    m_ioctx, m_image_id, Journal<>::IMAGE_CLIENT_ID, m_op_work_queue,
+    on_journal_removed);
+  journal_req->send();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_journal_remove(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to remove image journal: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ } else {
+ r = 0;
+ }
+
+ send_object_map_remove();
+}
+
+// Issues an asynchronous removal of the image's object map.
+template<typename I>
+void RemoveRequest<I>::send_object_map_remove() {
+  ldout(m_cct, 20) << dendl;
+
+  auto *remove_comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_object_map_remove>(this);
+
+  int r = ObjectMap<>::aio_remove(m_ioctx, m_image_id, remove_comp);
+  ceph_assert(r == 0);
+  remove_comp->release();
+}
+
+// Completion of the object map removal. -ENOENT is tolerated; any other
+// error is logged and finishes the request. Success continues with removing
+// the mirror image record.
+template<typename I>
+void RemoveRequest<I>::handle_object_map_remove(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    // report the step that actually failed (this message previously named
+    // the image journal)
+    lderr(m_cct) << "failed to remove object map: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  // r is not consulted past this point, so there is nothing to reset
+  mirror_image_remove();
+}
+
+// Issues an asynchronous write operation against the RBD_MIRRORING object
+// that removes the mirror image record for this image id.
+template<typename I>
+void RemoveRequest<I>::mirror_image_remove() {
+  ldout(m_cct, 20) << dendl;
+
+  librados::ObjectWriteOperation remove_op;
+  cls_client::mirror_image_remove(&remove_op, m_image_id);
+
+  auto *remove_comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_mirror_image_remove>(this);
+  int r = m_ioctx.aio_operate(RBD_MIRRORING, remove_comp, &remove_op);
+  ceph_assert(r == 0);
+  remove_comp->release();
+}
+
+// Completion of the mirror image record removal. -ENOENT and -EOPNOTSUPP
+// are tolerated; any other error finishes the request. A removal that
+// originates from the trash finishes successfully here; otherwise the id
+// object is removed next.
+template<typename I>
+void RemoveRequest<I>::handle_mirror_image_remove(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool tolerated = (r == -ENOENT || r == -EOPNOTSUPP);
+  if (r < 0 && !tolerated) {
+    lderr(m_cct) << "failed to remove mirror image state: "
+                 << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  if (!m_from_trash_remove) {
+    remove_id_object();
+    return;
+  }
+
+  // both the id object and the directory entry have been removed in
+  // a previous call to trash_move.
+  finish(0);
+}
+
+// Dispatches to the v1 or v2 removal path. The v1 path is also used while
+// the image format is still unknown.
+template<typename I>
+void RemoveRequest<I>::remove_image() {
+  ldout(m_cct, 20) << dendl;
+
+  const bool try_v1 = (m_old_format || m_unknown_format);
+  if (!try_v1) {
+    remove_v2_image();
+    return;
+  }
+  remove_v1_image();
+}
+
+// Queues the tmap_rm() call for the image name on the op work queue and
+// feeds its result to handle_remove_v1_image().
+template<typename I>
+void RemoveRequest<I>::remove_v1_image() {
+  ldout(m_cct, 20) << dendl;
+
+  // the value delivered by the work queue is ignored
+  auto *remove_ctx = new FunctionContext([this](int) {
+    int rm_result = tmap_rm(m_ioctx, m_image_name);
+    handle_remove_v1_image(rm_result);
+  });
+
+  m_op_work_queue->queue(remove_ctx, 0);
+}
+
+// Result of the v1 directory removal.
+//
+// The request completes here when the entry was removed (r == 0), or when
+// the removal failed for an image that was already known to be v1. In all
+// other cases the image is handled as a v2 image.
+template<typename I>
+void RemoveRequest<I>::handle_remove_v1_image(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  m_old_format = (r == 0);
+  if (r == 0 || (r < 0 && !m_unknown_format)) {
+    if (r < 0 && r != -ENOENT) {
+      lderr(m_cct) << "error removing image from v1 directory: "
+                   << cpp_strerror(r) << dendl;
+    }
+
+    m_on_finish->complete(r);
+    delete this;
+    return;
+  }
+
+  // r != 0 on this path, so m_old_format was just set to false: the image
+  // is not a v1 image and the v2 path is always taken
+  remove_v2_image();
+}
+
+// Start the v2 removal path. If the image id or the image name is still
+// empty it is looked up in the directory first; once both are known the v2
+// header removal starts.
+template<typename I>
+void RemoveRequest<I>::remove_v2_image() {
+  ldout(m_cct, 20) << dendl;
+
+  if (m_image_id.empty()) {
+    dir_get_image_id();
+  } else if (m_image_name.empty()) {
+    dir_get_image_name();
+  } else {
+    remove_header_v2();
+  }
+}
+
+// Submit an asynchronous directory read that resolves m_image_name to an
+// image id; the reply is collected in m_out_bl and handled by
+// handle_dir_get_image_id().
+template<typename I>
+void RemoveRequest<I>::dir_get_image_id() {
+  ldout(m_cct, 20) << dendl;
+
+  librados::ObjectReadOperation read_op;
+  librbd::cls_client::dir_get_id_start(&read_op, m_image_name);
+
+  librados::AioCompletion *comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_dir_get_image_id>(this);
+  // reset the shared reply buffer before reusing it for this read
+  m_out_bl.clear();
+  int r = m_ioctx.aio_operate(RBD_DIRECTORY, comp, &read_op, &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion handler for the image id lookup.
+//
+// @param r result of the directory read. -ENOENT is tolerated and leaves
+// m_image_id untouched; on success the id is decoded from m_out_bl. Any
+// other failure, or a decode failure, ends the request with that error.
+template<typename I>
+void RemoveRequest<I>::handle_dir_get_image_id(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool lookup_failed = (r < 0 && r != -ENOENT);
+  if (lookup_failed) {
+    lderr(m_cct) << "error fetching image id: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  if (r == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    const int decode_r =
+      librbd::cls_client::dir_get_id_finish(&bl_it, &m_image_id);
+    if (decode_r < 0) {
+      finish(decode_r);
+      return;
+    }
+  }
+
+  remove_header_v2();
+}
+
+// Submit an asynchronous directory read that resolves m_image_id to an image
+// name; the reply is collected in m_out_bl and handled by
+// handle_dir_get_image_name().
+template<typename I>
+void RemoveRequest<I>::dir_get_image_name() {
+  ldout(m_cct, 20) << dendl;
+
+  librados::ObjectReadOperation read_op;
+  librbd::cls_client::dir_get_name_start(&read_op, m_image_id);
+
+  librados::AioCompletion *comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_dir_get_image_name>(this);
+  // reset the shared reply buffer before reusing it for this read
+  m_out_bl.clear();
+  int r = m_ioctx.aio_operate(RBD_DIRECTORY, comp, &read_op, &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion handler for the image name lookup.
+//
+// @param r result of the directory read. -ENOENT is tolerated and leaves
+// m_image_name untouched; on success the name is decoded from m_out_bl. Any
+// other failure, or a decode failure, ends the request with that error.
+template<typename I>
+void RemoveRequest<I>::handle_dir_get_image_name(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool lookup_failed = (r < 0 && r != -ENOENT);
+  if (lookup_failed) {
+    lderr(m_cct) << "error fetching image name: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  if (r == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    const int decode_r =
+      librbd::cls_client::dir_get_name_finish(&bl_it, &m_image_name);
+    if (decode_r < 0) {
+      finish(decode_r);
+      return;
+    }
+  }
+
+  remove_header_v2();
+}
+
+// Asynchronously remove the id object derived from m_image_name;
+// handle_remove_id_object() is registered as the completion callback.
+template<typename I>
+void RemoveRequest<I>::remove_id_object() {
+  ldout(m_cct, 20) << dendl;
+
+  librados::AioCompletion *comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_remove_id_object>(this);
+  int r = m_ioctx.aio_remove(util::id_obj_name(m_image_name), comp);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion handler for the id object removal.
+//
+// @param r result of the removal; -ENOENT is tolerated. Any other failure
+// ends the request, otherwise the directory entry is removed next.
+template<typename I>
+void RemoveRequest<I>::handle_remove_id_object(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool already_gone = (r == -ENOENT);
+  if (r < 0 && !already_gone) {
+    lderr(m_cct) << "error removing id object: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  dir_remove_image();
+}
+
+// Submit an asynchronous cls dir_remove_image operation for
+// (m_image_name, m_image_id) against the RBD_DIRECTORY object;
+// handle_dir_remove_image() is registered as the completion callback.
+template<typename I>
+void RemoveRequest<I>::dir_remove_image() {
+  ldout(m_cct, 20) << dendl;
+
+  librados::ObjectWriteOperation dir_op;
+  librbd::cls_client::dir_remove_image(&dir_op, m_image_name, m_image_id);
+
+  librados::AioCompletion *comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_dir_remove_image>(this);
+  int r = m_ioctx.aio_operate(RBD_DIRECTORY, comp, &dir_op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion handler for the v2 directory entry removal, the last step of
+// the request.
+//
+// @param r result of the operation. It is forwarded to finish() unchanged,
+// -ENOENT included; -ENOENT is just not logged as an error.
+template<typename I>
+void RemoveRequest<I>::handle_dir_remove_image(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  const bool log_error = (r < 0 && r != -ENOENT);
+  if (log_error) {
+    lderr(m_cct) << "error removing image from v2 directory: "
+                 << cpp_strerror(r) << dendl;
+  }
+
+  finish(r);
+}
+
+// Complete the caller's context with r and destroy this request. The object
+// must not be touched after this call returns.
+template<typename I>
+void RemoveRequest<I>::finish(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RemoveRequest.h b/src/librbd/image/RemoveRequest.h
new file mode 100644
index 00000000..98d59764
--- /dev/null
+++ b/src/librbd/image/RemoveRequest.h
@@ -0,0 +1,198 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/image/TypeTraits.h"
+
+#include <list>
+
+class Context;
+class ContextWQ;
+class SafeTimer;
+
+namespace librbd {
+
+class ProgressContext;
+
+namespace image {
+
+// Asynchronous state machine that removes an RBD image.
+//
+// Instances are heap-allocated through the create() factories and destroy
+// themselves after completing the on_finish context (see finish()). Each
+// step is a void method paired with a handle_*() completion method.
+template<typename ImageCtxT = ImageCtx>
+class RemoveRequest {
+private:
+ // mock unit testing support
+ typedef ::librbd::image::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::ContextWQ ContextWQ;
+public:
+ // Create a request for an image identified by name and/or id.
+ static RemoveRequest *create(librados::IoCtx &ioctx,
+ const std::string &image_name,
+ const std::string &image_id,
+ bool force, bool from_trash_remove,
+ ProgressContext &prog_ctx,
+ ContextWQ *op_work_queue,
+ Context *on_finish) {
+ return new RemoveRequest(ioctx, image_name, image_id, force,
+ from_trash_remove, prog_ctx, op_work_queue,
+ on_finish);
+ }
+
+ // Create a request for an image context supplied by the caller.
+ static RemoveRequest *create(librados::IoCtx &ioctx, ImageCtxT *image_ctx,
+ bool force, bool from_trash_remove,
+ ProgressContext &prog_ctx,
+ ContextWQ *op_work_queue,
+ Context *on_finish) {
+ return new RemoveRequest(ioctx, image_ctx, force, from_trash_remove,
+ prog_ctx, op_work_queue, on_finish);
+ }
+
+ // Start the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * (skip if already opened) OPEN IMAGE------------------\
+ * | |
+ * v |
+ * PRE REMOVE IMAGE * * * |
+ * | * |
+ * v * |
+ * (skip if invalid data pool) TRIM IMAGE * * * * * |
+ * | * |
+ * v * |
+ * DETACH CHILD * |
+ * | * |
+ * v * v
+ * CLOSE IMAGE < * * * * |
+ * | |
+ * error v |
+ * /------<--------\ REMOVE HEADER<--------------/
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * | | REMOVE JOURNAL
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * v ^ REMOVE OBJECTMAP
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * | | REMOVE MIRROR IMAGE
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * | | REMOVE ID OBJECT
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * | | REMOVE IMAGE
+ * | | / |
+ * | \-------<-------/ |
+ * | v
+ * \------------------>------------<finish>
+ *
+ * @endverbatim
+ */
+
+ RemoveRequest(librados::IoCtx &ioctx, const std::string &image_name,
+ const std::string &image_id, bool force, bool from_trash_remove,
+ ProgressContext &prog_ctx, ContextWQ *op_work_queue,
+ Context *on_finish);
+
+ RemoveRequest(librados::IoCtx &ioctx, ImageCtxT *image_ctx, bool force,
+ bool from_trash_remove, ProgressContext &prog_ctx,
+ ContextWQ *op_work_queue, Context *on_finish);
+
+ // held by reference: the caller's IoCtx must outlive the request
+ librados::IoCtx &m_ioctx;
+ std::string m_image_name;
+ std::string m_image_id;
+ ImageCtxT *m_image_ctx = nullptr;
+ bool m_force;
+ // when set, the id object and directory entry removal steps are skipped
+ bool m_from_trash_remove;
+ ProgressContext &m_prog_ctx;
+ // work queue used to run the v1 directory removal
+ ContextWQ *m_op_work_queue;
+ // completed exactly once, right before the request deletes itself
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ std::string m_header_oid;
+ // format probing state: set from the result of the v1 directory removal
+ bool m_old_format = false;
+ bool m_unknown_format = true;
+
+ librados::IoCtx m_parent_io_ctx;
+
+ decltype(m_image_ctx->exclusive_lock) m_exclusive_lock = nullptr;
+
+ int m_ret_val = 0;
+ // reply buffer shared by the directory lookup steps
+ bufferlist m_out_bl;
+ std::list<obj_watch_t> m_watchers;
+
+ std::map<uint64_t, SnapInfo> m_snap_infos;
+
+ void open_image();
+ void handle_open_image(int r);
+
+ void send_journal_remove();
+ void handle_journal_remove(int r);
+
+ void send_object_map_remove();
+ void handle_object_map_remove(int r);
+
+ void mirror_image_remove();
+ void handle_mirror_image_remove(int r);
+
+ void pre_remove_image();
+ void handle_pre_remove_image(int r);
+
+ void trim_image();
+ void handle_trim_image(int r);
+
+ void detach_child();
+ void handle_detach_child(int r);
+
+ void send_disable_mirror();
+ void handle_disable_mirror(int r);
+
+ void send_close_image(int r);
+ void handle_send_close_image(int r);
+
+ void remove_header();
+ void handle_remove_header(int r);
+
+ void remove_header_v2();
+ void handle_remove_header_v2(int r);
+
+ void remove_image();
+
+ void remove_v1_image();
+ void handle_remove_v1_image(int r);
+
+ void remove_v2_image();
+
+ void dir_get_image_id();
+ void handle_dir_get_image_id(int r);
+
+ void dir_get_image_name();
+ void handle_dir_get_image_name(int r);
+
+ void remove_id_object();
+ void handle_remove_id_object(int r);
+
+ void dir_remove_image();
+ void handle_dir_remove_image(int r);
+
+ // completes m_on_finish with r and deletes this request
+ void finish(int r);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H
diff --git a/src/librbd/image/SetFlagsRequest.cc b/src/librbd/image/SetFlagsRequest.cc
new file mode 100644
index 00000000..a18c7499
--- /dev/null
+++ b/src/librbd/image/SetFlagsRequest.cc
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/SetFlagsRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::SetFlagsRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Store the request parameters; no I/O is issued until send().
+//
+// @param image_ctx image whose header flags are updated
+// @param flags flag values passed to the cls set_flags call
+// @param mask mask passed to the cls set_flags call
+// @param on_finish context returned by the completion handler
+template <typename I>
+SetFlagsRequest<I>::SetFlagsRequest(I *image_ctx, uint64_t flags,
+                                    uint64_t mask, Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_flags(flags),
+    m_mask(mask),
+    m_on_finish(on_finish) {
+}
+
+// Entry point of the request: issues the set-flags operations.
+template <typename I>
+void SetFlagsRequest<I>::send() {
+ send_set_flags();
+}
+
+// Issue one asynchronous cls set_flags operation against the image header
+// for the HEAD revision (CEPH_NOSNAP) and for every snapshot listed in
+// snap_info. All operations are gathered; handle_set_flags() is the callback
+// of the gather context.
+template <typename I>
+void SetFlagsRequest<I>::send_set_flags() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+  std::vector<uint64_t> snap_ids;
+  snap_ids.reserve(m_image_ctx->snap_info.size() + 1);
+  snap_ids.push_back(CEPH_NOSNAP);
+  // bind by const reference: iterating by value copied every snap_info
+  // entry only to read its key
+  for (const auto &snap : m_image_ctx->snap_info) {
+    snap_ids.push_back(snap.first);
+  }
+
+  Context *ctx = create_context_callback<
+    SetFlagsRequest<I>, &SetFlagsRequest<I>::handle_set_flags>(this);
+  C_Gather *gather_ctx = new C_Gather(cct, ctx);
+
+  for (auto snap_id : snap_ids) {
+    librados::ObjectWriteOperation op;
+    cls_client::set_flags(&op, snap_id, m_flags, m_mask);
+
+    librados::AioCompletion *comp =
+      create_rados_callback(gather_ctx->new_sub());
+    int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op);
+    ceph_assert(r == 0);
+    comp->release();
+  }
+  // activate only after every sub-context has been created
+  gather_ctx->activate();
+}
+
+// Completion handler for the gathered set_flags operations.
+//
+// @param result in/out result code; a failure is logged and left unchanged
+// @return the caller's on_finish context
+template <typename I>
+Context *SetFlagsRequest<I>::handle_set_flags(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  const int r = *result;
+  ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "set_flags failed: " << cpp_strerror(r)
+               << dendl;
+  }
+  return m_on_finish;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::SetFlagsRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/SetFlagsRequest.h b/src/librbd/image/SetFlagsRequest.h
new file mode 100644
index 00000000..36905e62
--- /dev/null
+++ b/src/librbd/image/SetFlagsRequest.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H
+
+#include "include/buffer.h"
+#include "common/Mutex.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+// Asynchronous request that applies a flags/mask pair to the image header
+// for the HEAD revision and every snapshot of an image.
+template <typename ImageCtxT = ImageCtx>
+class SetFlagsRequest {
+public:
+ // Heap-allocate a request; send() starts it.
+ static SetFlagsRequest *create(ImageCtxT *image_ctx, uint64_t flags,
+ uint64_t mask, Context *on_finish) {
+ return new SetFlagsRequest(image_ctx, flags, mask, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . .
+ * v v .
+ * SET_FLAGS . (for every snapshot)
+ * | . .
+ * v . . .
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ SetFlagsRequest(ImageCtxT *image_ctx, uint64_t flags, uint64_t mask,
+ Context *on_finish);
+
+ ImageCtxT *m_image_ctx;
+ // values forwarded unchanged to the cls set_flags call
+ uint64_t m_flags;
+ uint64_t m_mask;
+ // returned from handle_set_flags() once all operations have finished
+ Context *m_on_finish;
+
+ void send_set_flags();
+ Context *handle_set_flags(int *result);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::SetFlagsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H
diff --git a/src/librbd/image/SetSnapRequest.cc b/src/librbd/image/SetSnapRequest.cc
new file mode 100644
index 00000000..f342147f
--- /dev/null
+++ b/src/librbd/image/SetSnapRequest.cc
@@ -0,0 +1,367 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/SetSnapRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RefreshParentRequest.h"
+#include "librbd/io/ImageRequestWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::SetSnapRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+
+// Store the request parameters; all helper pointers start out null and no
+// writes are blocked yet.
+//
+// @param image_ctx image to switch
+// @param snap_id target snapshot id, or CEPH_NOSNAP for the HEAD revision
+// @param on_finish context completed when the request finishes
+template <typename I>
+SetSnapRequest<I>::SetSnapRequest(I &image_ctx, uint64_t snap_id,
+                                  Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_snap_id(snap_id),
+    m_on_finish(on_finish),
+    m_exclusive_lock(nullptr),
+    m_object_map(nullptr),
+    m_refresh_parent(nullptr),
+    m_writes_blocked(false) {
+}
+
+// Release whatever helper objects the request still owns. apply() swaps
+// m_exclusive_lock and m_object_map with the image context's pointers, so
+// after a successful apply these members hold the image's previous objects.
+template <typename I>
+SetSnapRequest<I>::~SetSnapRequest() {
+ // finalize() must have unblocked writes before destruction
+ ceph_assert(!m_writes_blocked);
+ delete m_refresh_parent;
+ delete m_object_map;
+ delete m_exclusive_lock;
+}
+
+// Entry point: switching to a snapshot starts by blocking writes, switching
+// back to HEAD (CEPH_NOSNAP) starts by initializing the exclusive lock.
+template <typename I>
+void SetSnapRequest<I>::send() {
+  if (m_snap_id != CEPH_NOSNAP) {
+    send_block_writes();
+    return;
+  }
+
+  send_init_exclusive_lock();
+}
+
+// First step when switching back to the HEAD revision.
+//
+// - If the image already has an exclusive lock object, there is nothing to
+//   do and the request completes with 0.
+// - If the image is read-only or lacks the exclusive-lock feature, the lock
+//   step is skipped and the parent refresh runs directly.
+// - Otherwise a new exclusive lock is created and initialized; the state
+//   machine continues in handle_init_exclusive_lock().
+template <typename I>
+void SetSnapRequest<I>::send_init_exclusive_lock() {
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    if (m_image_ctx.exclusive_lock != nullptr) {
+      ceph_assert(m_image_ctx.snap_id == CEPH_NOSNAP);
+      send_complete();
+      return;
+    }
+  }
+
+  if (m_image_ctx.read_only ||
+      !m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    int r = 0;
+    Context *on_finish = send_refresh_parent(&r);
+    if (on_finish != nullptr) {
+      // the refresh step finished synchronously: report the result it
+      // stored in r instead of unconditionally completing with 0
+      finalize();
+      on_finish->complete(r);
+      delete this;
+    }
+    return;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  m_exclusive_lock = ExclusiveLock<I>::create(m_image_ctx);
+
+  using klass = SetSnapRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_init_exclusive_lock>(this);
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  m_exclusive_lock->init(m_image_ctx.features, ctx);
+}
+
+// Completion handler for the exclusive lock initialization.
+//
+// @param result in/out result code
+// @return on_finish on failure, otherwise whatever the parent refresh step
+// returns (nullptr while an asynchronous step is pending)
+template <typename I>
+Context *SetSnapRequest<I>::handle_init_exclusive_lock(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    return send_refresh_parent(result);
+  }
+
+  lderr(cct) << "failed to initialize exclusive lock: "
+             << cpp_strerror(*result) << dendl;
+  finalize();
+  return m_on_finish;
+}
+
+// First step when switching to a snapshot: ask the I/O work queue to block
+// writes. m_writes_blocked is raised up front so that finalize() unblocks
+// them again on every exit path.
+template <typename I>
+void SetSnapRequest<I>::send_block_writes() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  m_writes_blocked = true;
+
+  Context *on_blocked = create_context_callback<
+    SetSnapRequest<I>, &SetSnapRequest<I>::handle_block_writes>(this);
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  m_image_ctx.io_work_queue->block_writes(on_blocked);
+}
+
+// Completion handler for the block-writes step.
+//
+// @param result in/out result code; set to -ENOENT when the requested
+// snapshot is not present in snap_info
+// @return on_finish on failure, otherwise the result of the exclusive lock
+// shut down step
+template <typename I>
+Context *SetSnapRequest<I>::handle_block_writes(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to block writes: " << cpp_strerror(*result)
+               << dendl;
+    finalize();
+    return m_on_finish;
+  }
+
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    const bool snap_known =
+      (m_image_ctx.snap_info.find(m_snap_id) != m_image_ctx.snap_info.end());
+    if (!snap_known) {
+      ldout(cct, 5) << "failed to locate snapshot '" << m_snap_id << "'"
+                    << dendl;
+
+      *result = -ENOENT;
+      finalize();
+      return m_on_finish;
+    }
+  }
+
+  return send_shut_down_exclusive_lock(result);
+}
+
+// Shut down the image's exclusive lock, if it has one, before switching to a
+// snapshot. The image's lock pointer is copied into m_exclusive_lock under
+// snap_lock; with no lock present the parent refresh step runs directly.
+//
+// @return nullptr while the shut down is pending, otherwise the result of
+// send_refresh_parent()
+template <typename I>
+Context *SetSnapRequest<I>::send_shut_down_exclusive_lock(int *result) {
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    m_exclusive_lock = m_image_ctx.exclusive_lock;
+  }
+
+  if (m_exclusive_lock != nullptr) {
+    CephContext *cct = m_image_ctx.cct;
+    ldout(cct, 10) << __func__ << dendl;
+
+    Context *on_shut_down = create_context_callback<
+      SetSnapRequest<I>,
+      &SetSnapRequest<I>::handle_shut_down_exclusive_lock>(this);
+    m_exclusive_lock->shut_down(on_shut_down);
+    return nullptr;
+  }
+
+  return send_refresh_parent(result);
+}
+
+// Completion handler for the exclusive lock shut down.
+//
+// @param result in/out result code
+// @return on_finish on failure, otherwise the result of the parent refresh
+// step
+template <typename I>
+Context *SetSnapRequest<I>::handle_shut_down_exclusive_lock(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    return send_refresh_parent(result);
+  }
+
+  lderr(cct) << "failed to shut down exclusive lock: "
+             << cpp_strerror(*result) << dendl;
+  finalize();
+  return m_on_finish;
+}
+
+// Decide whether the parent image must be refreshed for the target snapshot
+// and, if so, start the refresh.
+//
+// @param result in/out result code; set to -ENOENT when no parent info is
+// available for m_snap_id, or to the return value of apply() when the
+// request can finish synchronously
+// @return nullptr while the parent refresh is pending; on_finish when the
+// request finished here; otherwise the result of send_open_object_map()
+template <typename I>
+Context *SetSnapRequest<I>::send_refresh_parent(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ParentImageInfo parent_md;
+ bool refresh_parent;
+ {
+ // parent info is read and copied while holding both locks
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+
+ const auto parent_info = m_image_ctx.get_parent_info(m_snap_id);
+ if (parent_info == nullptr) {
+ *result = -ENOENT;
+ lderr(cct) << "failed to retrieve snapshot parent info" << dendl;
+ finalize();
+ return m_on_finish;
+ }
+
+ parent_md = *parent_info;
+ refresh_parent = RefreshParentRequest<I>::is_refresh_required(
+ m_image_ctx, parent_md, m_image_ctx.migration_info);
+ }
+
+ if (!refresh_parent) {
+ if (m_snap_id == CEPH_NOSNAP) {
+ // object map is loaded when exclusive lock is acquired
+ *result = apply();
+ finalize();
+ return m_on_finish;
+ } else {
+ // load snapshot object map
+ return send_open_object_map(result);
+ }
+ }
+
+ ldout(cct, 10) << __func__ << dendl;
+
+ using klass = SetSnapRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_refresh_parent>(this);
+ // m_refresh_parent is kept so apply() and the finalize step can use it
+ m_refresh_parent = RefreshParentRequest<I>::create(m_image_ctx, parent_md,
+ m_image_ctx.migration_info,
+ ctx);
+ m_refresh_parent->send();
+ return nullptr;
+}
+
+// Completion handler for the parent refresh.
+//
+// @param result in/out result code; overwritten with the return value of
+// apply() on the HEAD path
+// @return on_finish on failure; for a snapshot target the result of
+// send_open_object_map(); for HEAD the result of
+// send_finalize_refresh_parent()
+template <typename I>
+Context *SetSnapRequest<I>::handle_refresh_parent(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to refresh snapshot parent: " << cpp_strerror(*result)
+               << dendl;
+    finalize();
+    return m_on_finish;
+  }
+
+  if (m_snap_id != CEPH_NOSNAP) {
+    // load snapshot object map
+    return send_open_object_map(result);
+  }
+
+  // object map is loaded when exclusive lock is acquired
+  *result = apply();
+  if (*result < 0) {
+    finalize();
+    return m_on_finish;
+  }
+
+  return send_finalize_refresh_parent(result);
+}
+
+// Open the object map of the target snapshot when the object-map feature is
+// enabled; without the feature the new state is applied right away.
+//
+// @param result in/out result code; overwritten with the return value of
+// apply() when the feature is disabled
+// @return nullptr while the open is pending, on_finish if apply() failed,
+// otherwise the result of send_finalize_refresh_parent()
+template <typename I>
+Context *SetSnapRequest<I>::send_open_object_map(int *result) {
+  if (m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+    CephContext *cct = m_image_ctx.cct;
+    ldout(cct, 10) << __func__ << dendl;
+
+    Context *on_open = create_context_callback<
+      SetSnapRequest<I>, &SetSnapRequest<I>::handle_open_object_map>(this);
+    m_object_map = ObjectMap<I>::create(m_image_ctx, m_snap_id);
+    m_object_map->open(on_open);
+    return nullptr;
+  }
+
+  *result = apply();
+  if (*result < 0) {
+    finalize();
+    return m_on_finish;
+  }
+
+  return send_finalize_refresh_parent(result);
+}
+
+// Completion handler for the object map open.
+//
+// A failed open is logged and the object map is dropped, but the request
+// carries on: *result is overwritten with the return value of apply().
+//
+// @return on_finish if apply() failed, otherwise the result of
+// send_finalize_refresh_parent()
+template <typename I>
+Context *SetSnapRequest<I>::handle_open_object_map(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  const bool open_failed = (*result < 0);
+  if (open_failed) {
+    lderr(cct) << "failed to open object map: " << cpp_strerror(*result)
+               << dendl;
+    delete m_object_map;
+    m_object_map = nullptr;
+  }
+
+  *result = apply();
+  if (*result >= 0) {
+    return send_finalize_refresh_parent(result);
+  }
+
+  finalize();
+  return m_on_finish;
+}
+
+// Finalize the parent refresh request, if one was started; without one the
+// request finishes immediately.
+//
+// @return nullptr while the finalize step is pending, otherwise on_finish
+template <typename I>
+Context *SetSnapRequest<I>::send_finalize_refresh_parent(int *result) {
+  if (m_refresh_parent != nullptr) {
+    CephContext *cct = m_image_ctx.cct;
+    ldout(cct, 10) << this << " " << __func__ << dendl;
+
+    Context *on_finalized = create_context_callback<
+      SetSnapRequest<I>,
+      &SetSnapRequest<I>::handle_finalize_refresh_parent>(this);
+    m_refresh_parent->finalize(on_finalized);
+    return nullptr;
+  }
+
+  finalize();
+  return m_on_finish;
+}
+
+// Completion handler for the parent refresh finalization, the last step of
+// the request. A failure is logged and left in *result.
+//
+// @return the caller's on_finish context
+template <typename I>
+Context *SetSnapRequest<I>::handle_finalize_refresh_parent(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to close parent image: " << cpp_strerror(r)
+               << dendl;
+  }
+  finalize();
+  return m_on_finish;
+}
+
+// Install the new snapshot state on the image context.
+//
+// Runs with owner_lock, snap_lock and parent_lock write-locked, taken in
+// that order. For a snapshot target the image is pointed at m_snap_id; for
+// HEAD the prepared exclusive lock is swapped in and the snapshot is unset.
+// The refreshed parent (if any) is applied and the object map pointers are
+// swapped, leaving the image's previous objects in this request's members.
+//
+// @return 0 on success, or the negative error returned by snap_set()
+template <typename I>
+int SetSnapRequest<I>::apply() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << dendl;
+
+ RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ RWLock::WLocker parent_locker(m_image_ctx.parent_lock);
+ if (m_snap_id != CEPH_NOSNAP) {
+ // a snapshot view must not have an exclusive lock attached
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr);
+ int r = m_image_ctx.snap_set(m_snap_id);
+ if (r < 0) {
+ // nothing has been swapped yet, so the image context is unchanged
+ return r;
+ }
+ } else {
+ std::swap(m_image_ctx.exclusive_lock, m_exclusive_lock);
+ m_image_ctx.snap_unset();
+ }
+
+ if (m_refresh_parent != nullptr) {
+ m_refresh_parent->apply();
+ }
+
+ std::swap(m_object_map, m_image_ctx.object_map);
+ return 0;
+}
+
+// Unblock writes if this request blocked them. Safe to call repeatedly: the
+// flag is cleared after the first unblock.
+template <typename I>
+void SetSnapRequest<I>::finalize() {
+  if (!m_writes_blocked) {
+    return;
+  }
+
+  m_image_ctx.io_work_queue->unblock_writes();
+  m_writes_blocked = false;
+}
+
+// Finish the request successfully from a synchronous path: unblock writes,
+// complete on_finish with 0 and destroy this request. The object must not
+// be used after this call.
+template <typename I>
+void SetSnapRequest<I>::send_complete() {
+ finalize();
+ m_on_finish->complete(0);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::SetSnapRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/SetSnapRequest.h b/src/librbd/image/SetSnapRequest.h
new file mode 100644
index 00000000..c12ea9f2
--- /dev/null
+++ b/src/librbd/image/SetSnapRequest.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
+
+#include "cls/rbd/cls_rbd_client.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+template <typename> class ExclusiveLock;
+class ImageCtx;
+template <typename> class ObjectMap;
+
+namespace image {
+
+template <typename> class RefreshParentRequest;
+
+// Asynchronous state machine that switches an image context to a snapshot
+// or back to the HEAD revision (CEPH_NOSNAP).
+//
+// Steps declared as "Context *name(int *result)" return nullptr while an
+// asynchronous operation is pending, or the on_finish context once the
+// request is done, with the final code stored in *result.
+template <typename ImageCtxT = ImageCtx>
+class SetSnapRequest {
+public:
+ // Heap-allocate a request; send() starts it.
+ static SetSnapRequest *create(ImageCtxT &image_ctx, uint64_t snap_id,
+ Context *on_finish) {
+ return new SetSnapRequest(image_ctx, snap_id, on_finish);
+ }
+
+ ~SetSnapRequest();
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | (set snap)
+ * |-----------> BLOCK_WRITES
+ * | |
+ * | v
+ * | SHUTDOWN_EXCLUSIVE_LOCK (skip if lock inactive
+ * | | or disabled)
+ * | v
+ * | REFRESH_PARENT (skip if no parent
+ * | | or refresh not needed)
+ * | v
+ * | OPEN_OBJECT_MAP (skip if map disabled)
+ * | |
+ * | v
+ * | <apply>
+ * | |
+ * | v
+ * | FINALIZE_REFRESH_PARENT (skip if no parent
+ * | | or refresh not needed)
+ * | v
+ * | <finish>
+ * |
+ * \-----------> INIT_EXCLUSIVE_LOCK (skip if active or
+ * | disabled)
+ * v
+ * REFRESH_PARENT (skip if no parent
+ * | or refresh not needed)
+ * v
+ * <apply>
+ * |
+ * v
+ * FINALIZE_REFRESH_PARENT (skip if no parent
+ * | or refresh not needed)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ SetSnapRequest(ImageCtxT &image_ctx, uint64_t snap_id, Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ // target snapshot id, or CEPH_NOSNAP for the HEAD revision
+ uint64_t m_snap_id;
+ Context *m_on_finish;
+
+ // helper objects; apply() swaps the first two with the image context's
+ // pointers and the destructor deletes whatever is left here
+ ExclusiveLock<ImageCtxT> *m_exclusive_lock;
+ ObjectMap<ImageCtxT> *m_object_map;
+ RefreshParentRequest<ImageCtxT> *m_refresh_parent;
+
+ // true between send_block_writes() and the finalize() that unblocks
+ bool m_writes_blocked;
+
+ void send_block_writes();
+ Context *handle_block_writes(int *result);
+
+ void send_init_exclusive_lock();
+ Context *handle_init_exclusive_lock(int *result);
+
+ Context *send_shut_down_exclusive_lock(int *result);
+ Context *handle_shut_down_exclusive_lock(int *result);
+
+ Context *send_refresh_parent(int *result);
+ Context *handle_refresh_parent(int *result);
+
+ Context *send_open_object_map(int *result);
+ Context *handle_open_object_map(int *result);
+
+ Context *send_finalize_refresh_parent(int *result);
+ Context *handle_finalize_refresh_parent(int *result);
+
+ int apply();
+ void finalize();
+ void send_complete();
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::SetSnapRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
diff --git a/src/librbd/image/TypeTraits.h b/src/librbd/image/TypeTraits.h
new file mode 100644
index 00000000..17d84fc8
--- /dev/null
+++ b/src/librbd/image/TypeTraits.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
+#define CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
+
+class ContextWQ;
+
+namespace librbd {
+namespace image {
+
+// Maps an image context type to the collaborator types used with it. The
+// primary template selects the global ContextWQ; the RemoveRequest header
+// refers to this indirection as "mock unit testing support".
+template <typename ImageCtxT>
+struct TypeTraits {
+  using ContextWQ = ::ContextWQ;
+};
+
+} // namespace image
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
diff --git a/src/librbd/image/ValidatePoolRequest.cc b/src/librbd/image/ValidatePoolRequest.cc
new file mode 100644
index 00000000..2214f81b
--- /dev/null
+++ b/src/librbd/image/ValidatePoolRequest.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/ValidatePoolRequest.h"
+#include "include/rados/librados.hpp"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::ValidatePoolRequest: " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+namespace {
+
+// Marker contents of the RBD_INFO object.
+// OVERWRITE_VALIDATED: written by the final step; a pool whose RBD_INFO holds
+// this value is treated as already validated.
+const std::string OVERWRITE_VALIDATED("overwrite validated");
+// VALIDATE: written after the validation snapshot was created; reading it
+// back lets the request skip straight to the overwrite step.
+const std::string VALIDATE("validate");
+
+} // anonymous namespace
+
+using util::create_rados_callback;
+using util::create_context_callback;
+using util::create_async_context_callback;
+
+// Store the request parameters and take a private copy of the IoCtx.
+//
+// @param io_ctx pool to validate; duplicated, so the caller's context is
+// not modified
+// @param op_work_queue queue used to run the snapshot create/remove calls
+// @param on_finish context completed when the request finishes
+template <typename I>
+ValidatePoolRequest<I>::ValidatePoolRequest(librados::IoCtx& io_ctx,
+ ContextWQ *op_work_queue,
+ Context *on_finish)
+ : m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+ // validation should occur in default namespace
+ m_io_ctx.dup(io_ctx);
+ m_io_ctx.set_namespace("");
+ }
+
+// Entry point of the request: starts by reading the RBD_INFO object.
+template <typename I>
+void ValidatePoolRequest<I>::send() {
+ read_rbd_info();
+}
+
+// Asynchronously read the RBD_INFO object into m_out_bl;
+// handle_read_rbd_info() is registered as the completion callback.
+template <typename I>
+void ValidatePoolRequest<I>::read_rbd_info() {
+  ldout(m_cct, 5) << dendl;
+
+  using klass = ValidatePoolRequest<I>;
+  auto aio_comp =
+    create_rados_callback<klass, &klass::handle_read_rbd_info>(this);
+
+  librados::ObjectReadOperation read_op;
+  read_op.read(0, 0, nullptr, nullptr);
+
+  // reset the reply buffer before it receives the object contents
+  m_out_bl.clear();
+  int r = m_io_ctx.aio_operate(RBD_INFO, aio_comp, &read_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for the RBD_INFO read.
+//
+// @param r result of the read. -ENOENT, or contents matching neither
+// marker, starts the full validation with a snapshot creation. Any other
+// read failure ends the request with that error.
+template <typename I>
+void ValidatePoolRequest<I>::handle_read_rbd_info(int r) {
+  ldout(m_cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r != -ENOENT) {
+      lderr(m_cct) << "failed to read RBD info: " << cpp_strerror(r) << dendl;
+      finish(r);
+      return;
+    }
+  } else {
+    bufferlist validated_bl;
+    validated_bl.append(OVERWRITE_VALIDATED);
+    if (m_out_bl.contents_equal(validated_bl)) {
+      // already validated pool
+      finish(0);
+      return;
+    }
+
+    bufferlist validate_bl;
+    validate_bl.append(VALIDATE);
+    if (m_out_bl.contents_equal(validate_bl)) {
+      // implies snapshot was already successfully created
+      overwrite_rbd_info();
+      return;
+    }
+  }
+
+  create_snapshot();
+}
+
+// Queue a work item that allocates a self-managed snapshot id into m_snap_id
+// and passes the return code to handle_create_snapshot().
+template <typename I>
+void ValidatePoolRequest<I>::create_snapshot() {
+  ldout(m_cct, 5) << dendl;
+
+  // allocate a self-managed snapshot id if this a new pool to force
+  // self-managed snapshot mode
+  auto create_ctx = new FunctionContext([this](int) {
+      // the value supplied by the queue is ignored
+      handle_create_snapshot(m_io_ctx.selfmanaged_snap_create(&m_snap_id));
+    });
+  m_op_work_queue->queue(create_ctx, 0);
+}
+
+// Handle the result of the snapshot id allocation.
+//
+// @param r return code of selfmanaged_snap_create(). Any failure ends the
+// request with r; -EINVAL gets its own log message. On success the VALIDATE
+// marker is written next.
+template <typename I>
+void ValidatePoolRequest<I>::handle_create_snapshot(int r) {
+  ldout(m_cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -EINVAL) {
+      lderr(m_cct) << "pool not configured for self-managed RBD snapshot support"
+                   << dendl;
+    } else {
+      lderr(m_cct) << "failed to allocate self-managed snapshot: "
+                   << cpp_strerror(r) << dendl;
+    }
+    finish(r);
+    return;
+  }
+
+  write_rbd_info();
+}
+
+// Asynchronously create the RBD_INFO object (create(true)) and write the
+// VALIDATE marker at offset 0; handle_write_rbd_info() is the completion
+// callback.
+template <typename I>
+void ValidatePoolRequest<I>::write_rbd_info() {
+  ldout(m_cct, 5) << dendl;
+
+  bufferlist marker_bl;
+  marker_bl.append(VALIDATE);
+
+  librados::ObjectWriteOperation write_op;
+  write_op.create(true);
+  write_op.write(0, marker_bl);
+
+  using klass = ValidatePoolRequest<I>;
+  auto aio_comp =
+    create_rados_callback<klass, &klass::handle_write_rbd_info>(this);
+  int r = m_io_ctx.aio_operate(RBD_INFO, aio_comp, &write_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for the VALIDATE marker write.
+//
+// @param r result of the write. -EEXIST is tolerated. -EOPNOTSUPP is
+// recorded in m_ret_val as -EINVAL, any other failure as r. The validation
+// snapshot is removed in every case.
+template <typename I>
+void ValidatePoolRequest<I>::handle_write_rbd_info(int r) {
+  ldout(m_cct, 5) << "r=" << r << dendl;
+
+  if (r < 0 && r != -EEXIST) {
+    if (r == -EOPNOTSUPP) {
+      lderr(m_cct) << "pool missing required overwrite support" << dendl;
+      m_ret_val = -EINVAL;
+    } else {
+      lderr(m_cct) << "failed to write RBD info: " << cpp_strerror(r) << dendl;
+      m_ret_val = r;
+    }
+  }
+
+  remove_snapshot();
+}
+
+// Queue a work item that removes the self-managed snapshot m_snap_id and
+// passes the return code to handle_remove_snapshot().
+template <typename I>
+void ValidatePoolRequest<I>::remove_snapshot() {
+  ldout(m_cct, 5) << dendl;
+
+  auto remove_ctx = new FunctionContext([this](int) {
+      // the value supplied by the queue is ignored
+      handle_remove_snapshot(m_io_ctx.selfmanaged_snap_remove(m_snap_id));
+    });
+  m_op_work_queue->queue(remove_ctx, 0);
+}
+
+// Handle the result of the validation snapshot removal.
+//
+// @param r return code of selfmanaged_snap_remove(); a failure is only
+// logged. An error recorded earlier in m_ret_val ends the request here,
+// otherwise the final marker is written.
+template <typename I>
+void ValidatePoolRequest<I>::handle_remove_snapshot(int r) {
+  ldout(m_cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    // not a fatal error
+    lderr(m_cct) << "failed to remove validation snapshot: " << cpp_strerror(r)
+                 << dendl;
+  }
+
+  if (m_ret_val >= 0) {
+    overwrite_rbd_info();
+    return;
+  }
+
+  finish(m_ret_val);
+}
+
+// Asynchronously overwrite the RBD_INFO object with the OVERWRITE_VALIDATED
+// marker at offset 0; handle_overwrite_rbd_info() is the completion
+// callback.
+template <typename I>
+void ValidatePoolRequest<I>::overwrite_rbd_info() {
+  ldout(m_cct, 5) << dendl;
+
+  bufferlist marker_bl;
+  marker_bl.append(OVERWRITE_VALIDATED);
+
+  librados::ObjectWriteOperation write_op;
+  write_op.write(0, marker_bl);
+
+  using klass = ValidatePoolRequest<I>;
+  auto aio_comp =
+    create_rados_callback<klass, &klass::handle_overwrite_rbd_info>(this);
+  int r = m_io_ctx.aio_operate(RBD_INFO, aio_comp, &write_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for the final marker overwrite.
+//
+// @param r result of the write. -EOPNOTSUPP is reported to the caller as
+// -EINVAL, any other failure as r; otherwise the request finishes with 0.
+template <typename I>
+void ValidatePoolRequest<I>::handle_overwrite_rbd_info(int r) {
+  ldout(m_cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -EOPNOTSUPP) {
+      lderr(m_cct) << "pool missing required overwrite support" << dendl;
+      finish(-EINVAL);
+    } else {
+      lderr(m_cct) << "failed to validate overwrite support: " << cpp_strerror(r)
+                   << dendl;
+      finish(r);
+    }
+    return;
+  }
+
+  finish(0);
+}
+
+// Complete the caller's context with r and destroy this request. The object
+// must not be touched after this call returns.
+template <typename I>
+void ValidatePoolRequest<I>::finish(int r) {
+  ldout(m_cct, 5) << "r=" << r << dendl;
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::ValidatePoolRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/ValidatePoolRequest.h b/src/librbd/image/ValidatePoolRequest.h
new file mode 100644
index 00000000..8ea6e4a0
--- /dev/null
+++ b/src/librbd/image/ValidatePoolRequest.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+
+class CephContext;
+class Context;
+class ContextWQ;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace image {
+
+// Asynchronous request that validates a pool by stepping through the states
+// shown in the diagram below.  The final result is delivered through the
+// on_finish context passed at construction.
+template <typename ImageCtxT>
+class ValidatePoolRequest {
+public:
+ // Heap-allocates a new request; the caller starts it with send().
+ static ValidatePoolRequest* create(librados::IoCtx& io_ctx,
+ ContextWQ *op_work_queue,
+ Context *on_finish) {
+ return new ValidatePoolRequest(io_ctx, op_work_queue, on_finish);
+ }
+
+ ValidatePoolRequest(librados::IoCtx& io_ctx, ContextWQ *op_work_queue,
+ Context *on_finish);
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (overwrites validated)
+ * READ RBD INFO . . . . . . . . .
+ * | . .
+ * | . (snapshots validated) .
+ * | . . . . . . . . . . .
+ * v . .
+ * CREATE SNAPSHOT . .
+ * | . .
+ * v . .
+ * WRITE RBD INFO . .
+ * | . .
+ * v . .
+ * REMOVE SNAPSHOT . .
+ * | . .
+ * v . .
+ * OVERWRITE RBD INFO < . . . .
+ * | .
+ * v .
+ * <finish> < . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ // IoCtx is held by value (a copy of the one passed to the constructor)
+ librados::IoCtx m_io_ctx;
+ CephContext* m_cct;
+ ContextWQ* m_op_work_queue;
+ Context* m_on_finish;
+
+ // first error recorded by an earlier step; reported after snapshot cleanup
+ int m_ret_val = 0;
+ bufferlist m_out_bl;
+ // id of the self-managed snapshot used for validation
+ uint64_t m_snap_id = 0;
+
+ void read_rbd_info();
+ void handle_read_rbd_info(int r);
+
+ void create_snapshot();
+ void handle_create_snapshot(int r);
+
+ void write_rbd_info();
+ void handle_write_rbd_info(int r);
+
+ void remove_snapshot();
+ void handle_remove_snapshot(int r);
+
+ void overwrite_rbd_info();
+ void handle_overwrite_rbd_info(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::ValidatePoolRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H
diff --git a/src/librbd/image_watcher/NotifyLockOwner.cc b/src/librbd/image_watcher/NotifyLockOwner.cc
new file mode 100644
index 00000000..ead5f214
--- /dev/null
+++ b/src/librbd/image_watcher/NotifyLockOwner.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image_watcher/NotifyLockOwner.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/WatchNotifyTypes.h"
+#include "librbd/watcher/Notifier.h"
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image_watcher::NotifyLockOwner: " \
+ << this << " " << __func__
+
+namespace librbd {
+
+namespace image_watcher {
+
+using namespace watch_notify;
+using util::create_context_callback;
+
+// Stores references to the image context and notifier, takes ownership of
+// the notification payload by moving it into m_bl, and remembers the
+// completion context.
+NotifyLockOwner::NotifyLockOwner(ImageCtx &image_ctx,
+ watcher::Notifier &notifier,
+ bufferlist &&bl, Context *on_finish)
+ : m_image_ctx(image_ctx), m_notifier(notifier), m_bl(std::move(bl)),
+ m_on_finish(on_finish) {
+}
+
+// Entry point of the request: sends the notification.
+void NotifyLockOwner::send() {
+ send_notify();
+}
+
+// Sends the payload through the notifier; responses are collected in
+// m_notify_response and completion is delivered to handle_notify().
+void NotifyLockOwner::send_notify() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ // the caller is required to hold the image's owner_lock
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ m_notifier.notify(m_bl, &m_notify_response, create_context_callback<
+ NotifyLockOwner, &NotifyLockOwner::handle_notify>(this));
+}
+
+// Processes the notification result.  Finishes the request with:
+//  - the notify error, for any failure other than -ETIMEDOUT
+//  - -EINVAL if more than one ack carries a payload, or if the payload
+//    cannot be decoded
+//  - -ETIMEDOUT if no ack carries a payload
+//  - otherwise the result field of the decoded ResponseMessage
+void NotifyLockOwner::handle_notify(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": r=" << r << dendl;
+
+ // -ETIMEDOUT is not fatal here: the collected acks are still inspected
+ if (r < 0 && r != -ETIMEDOUT) {
+ lderr(cct) << ": lock owner notification failed: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ // an ack with a non-empty payload is treated as the lock owner's response;
+ // exactly one such ack is expected
+ bufferlist response;
+ bool lock_owner_responded = false;
+ for (auto &it : m_notify_response.acks) {
+ if (it.second.length() > 0) {
+ if (lock_owner_responded) {
+ lderr(cct) << ": duplicate lock owners detected" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+ lock_owner_responded = true;
+ response.claim(it.second);
+ }
+ }
+
+ if (!lock_owner_responded) {
+ ldout(cct, 1) << ": no lock owners detected" << dendl;
+ finish(-ETIMEDOUT);
+ return;
+ }
+
+ try {
+ auto iter = response.cbegin();
+
+ ResponseMessage response_message;
+ using ceph::decode;
+ decode(response_message, iter);
+
+ r = response_message.result;
+ } catch (const buffer::error &err) {
+ // malformed response payload
+ r = -EINVAL;
+ }
+ finish(r);
+}
+
+// Completes the caller's context with the result and destroys this request;
+// the object must not be used after this call.
+void NotifyLockOwner::finish(int r) {
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_watcher
+} // namespace librbd
diff --git a/src/librbd/image_watcher/NotifyLockOwner.h b/src/librbd/image_watcher/NotifyLockOwner.h
new file mode 100644
index 00000000..6249bc12
--- /dev/null
+++ b/src/librbd/image_watcher/NotifyLockOwner.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H
+#define CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H
+
+#include "include/buffer.h"
+#include "librbd/watcher/Types.h"
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace watcher { class Notifier; }
+
+namespace image_watcher {
+
+// One-shot request that sends a notification payload through a
+// watcher::Notifier and reports the outcome through on_finish.
+class NotifyLockOwner {
+public:
+ // Heap-allocates a new request; the payload bufferlist is moved into it.
+ static NotifyLockOwner *create(ImageCtx &image_ctx,
+ watcher::Notifier &notifier,
+ bufferlist &&bl, Context *on_finish) {
+ return new NotifyLockOwner(image_ctx, notifier, std::move(bl), on_finish);
+ }
+
+ NotifyLockOwner(ImageCtx &image_ctx, watcher::Notifier &notifier,
+ bufferlist &&bl, Context *on_finish);
+
+ // Starts the request.
+ void send();
+
+private:
+ // held by reference: both must outlive the request
+ ImageCtx &m_image_ctx;
+ watcher::Notifier &m_notifier;
+
+ // notification payload
+ bufferlist m_bl;
+ // responses collected by the notifier
+ watcher::NotifyResponse m_notify_response;
+ Context *m_on_finish;
+
+ void send_notify();
+ void handle_notify(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
new file mode 100644
index 00000000..ebf2603f
--- /dev/null
+++ b/src/librbd/internal.cc
@@ -0,0 +1,2021 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "include/int_types.h"
+
+#include <errno.h>
+#include <limits.h>
+
+#include "include/types.h"
+#include "include/uuid.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "common/event_socket.h"
+#include "common/perf_counters.h"
+#include "osdc/Striper.h"
+#include "include/stringify.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rbd/cls_rbd.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/journal/cls_journal_client.h"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Operations.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Image.h"
+#include "librbd/exclusive_lock/AutomaticPolicy.h"
+#include "librbd/exclusive_lock/StandardPolicy.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/CreateRequest.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/journal/Types.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "librbd/operation/TrimRequest.h"
+
+#include "journal/Journaler.h"
+
+#include <boost/scope_exit.hpp>
+#include <boost/variant.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd: "
+
+#define rbd_howmany(x, y) (((x) + (y) - 1) / (y))
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+// list binds to list() here, so std::list is explicitly used below
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+using librados::Rados;
+
+namespace librbd {
+
+namespace {
+
+// Validates that a pool can be used for RBD images.  Validation is skipped
+// when "rbd_validate_pool" is disabled or when the RBD directory object
+// already exists.  Otherwise a self-managed snapshot id is allocated and
+// released again; a failure to allocate is returned to the caller while a
+// failure to release is only logged.
+int validate_pool(IoCtx &io_ctx, CephContext *cct) {
+  const bool validation_enabled =
+    cct->_conf.get_val<bool>("rbd_validate_pool");
+  if (!validation_enabled) {
+    return 0;
+  }
+
+  int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL);
+  if (r == 0) {
+    return 0;
+  }
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // allocate a self-managed snapshot id if this a new pool to force
+  // self-managed snapshot mode
+  uint64_t snap_id;
+  r = io_ctx.selfmanaged_snap_create(&snap_id);
+  if (r < 0) {
+    if (r == -EINVAL) {
+      lderr(cct) << "pool not configured for self-managed RBD snapshot support"
+                 << dendl;
+    } else {
+      lderr(cct) << "failed to allocate self-managed snapshot: "
+                 << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+
+  // best effort: a failed release does not fail the validation
+  int release_r = io_ctx.selfmanaged_snap_remove(snap_id);
+  if (release_r < 0) {
+    lderr(cct) << "failed to release self-managed snapshot " << snap_id
+               << ": " << cpp_strerror(release_r) << dendl;
+  }
+  return 0;
+}
+
+} // anonymous namespace
+
+ // Determines whether the named image exists in the old or new format by
+ // stat'ing the old-format header object first and, if that is missing, the
+ // new-format id object.  *old_format (optional) receives the detected
+ // format; *size (optional) is passed to the stat calls.  Returns 0 on
+ // success or the negative stat error.
+ int detect_format(IoCtx &io_ctx, const string &name,
+                   bool *old_format, uint64_t *size)
+ {
+   CephContext *cct = (CephContext *)io_ctx.cct();
+   auto record_format = [old_format](bool is_old) {
+     if (old_format) {
+       *old_format = is_old;
+     }
+   };
+
+   record_format(true);
+   int r = io_ctx.stat(util::old_header_name(name), size, NULL);
+   if (r == -ENOENT) {
+     record_format(false);
+     r = io_ctx.stat(util::id_obj_name(name), size, NULL);
+   }
+   if (r < 0) {
+     return r;
+   }
+
+   ldout(cct, 20) << "detect format of " << name << " : "
+                  << (old_format ? (*old_format ? "old" : "new") :
+                      "don't care") << dendl;
+   return 0;
+ }
+
+ // Returns true when a parent pool is set (id != -1) and the offset does
+ // not exceed the parent overlap.
+ bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap)
+ {
+   if (parent_pool_id == -1) {
+     return false;
+   }
+   return off <= overlap;
+ }
+
+ // Fills in an on-disk image header: the structure is zeroed, the header
+ // text/signature/version constants are copied in, and the block name is
+ // built as "rb.<hi>.<lo>.<extra>" from the two 32-bit halves of bid plus a
+ // rand()-derived value.  Encryption and compression are set to "none" and
+ // all snapshot bookkeeping starts at zero.
+ void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
+ uint64_t size, int order, uint64_t bid)
+ {
+ uint32_t hi = bid >> 32;
+ uint32_t lo = bid & 0xFFFFFFFF;
+ uint32_t extra = rand() % 0xFFFFFFFF;
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&ondisk, 0, sizeof(ondisk));
+
+ memcpy(&ondisk.text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT));
+ memcpy(&ondisk.signature, RBD_HEADER_SIGNATURE,
+ sizeof(RBD_HEADER_SIGNATURE));
+ memcpy(&ondisk.version, RBD_HEADER_VERSION, sizeof(RBD_HEADER_VERSION));
+
+ snprintf(ondisk.block_name, sizeof(ondisk.block_name), "rb.%x.%x.%x",
+ hi, lo, extra);
+
+ ondisk.image_size = size;
+ ondisk.options.order = order;
+ ondisk.options.crypt_type = RBD_CRYPT_NONE;
+ ondisk.options.comp_type = RBD_COMP_NONE;
+ ondisk.snap_seq = 0;
+ ondisk.snap_count = 0;
+ ondisk.reserved = 0;
+ ondisk.snap_names_len = 0;
+ }
+
+ // Populates the public image_info_t structure from the image context.
+ // The image size is read under snap_lock; the deprecated parent fields
+ // are cleared.  infosize is not used.
+ void image_info(ImageCtx *ictx, image_info_t& info, size_t infosize)
+ {
+   const int obj_order = ictx->order;
+   {
+     // hold the snapshot lock only while reading the size
+     RWLock::RLocker snap_locker(ictx->snap_lock);
+     info.size = ictx->get_image_size(ictx->snap_id);
+   }
+
+   info.order = obj_order;
+   info.obj_size = 1ULL << obj_order;
+   info.num_objs = Striper::get_num_objects(ictx->layout, info.size);
+
+   // strncpy does not guarantee termination, so terminate explicitly
+   strncpy(info.block_name_prefix, ictx->object_prefix.c_str(),
+           RBD_MAX_BLOCK_NAME_SIZE);
+   info.block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE - 1] = '\0';
+
+   // clear deprecated fields
+   info.parent_pool = -1L;
+   info.parent_name[0] = '\0';
+ }
+
+ // Extracts the object number from an object name of the form
+ // "<object_prefix><separator><hex number>".
+ //
+ // @param oid            full object name
+ // @param object_prefix  prefix shared by all objects of the image
+ // @return the parsed hexadecimal object number, or 0 when the name holds
+ //         no parsable number (e.g. it is shorter than the prefix)
+ uint64_t oid_to_object_no(const std::string& oid,
+                           const std::string& object_prefix)
+ {
+   std::istringstream iss(oid);
+   // skip object prefix and separator
+   iss.ignore(object_prefix.length() + 1);
+   // must be initialized: if the stream is already exhausted the extraction
+   // below stores nothing and the value would otherwise be indeterminate
+   uint64_t num = 0;
+   iss >> std::hex >> num;
+   return num;
+ }
+
+ // Synchronously trims the image from its current size (ictx->size) down to
+ // newsize using a TrimRequest.  The caller must hold owner_lock and, when
+ // exclusive locking is in use, must be the lock owner.  A failure is only
+ // logged as a warning; nothing is returned to the caller.
+ void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx)
+ {
+ ceph_assert(ictx->owner_lock.is_locked());
+ ceph_assert(ictx->exclusive_lock == nullptr ||
+ ictx->exclusive_lock->is_lock_owner());
+
+ C_SaferCond ctx;
+ // snap_lock is held only while the request is constructed (ictx->size read)
+ ictx->snap_lock.get_read();
+ operation::TrimRequest<> *req = operation::TrimRequest<>::create(
+ *ictx, &ctx, ictx->size, newsize, prog_ctx);
+ ictx->snap_lock.put_read();
+ req->send();
+
+ int r = ctx.wait();
+ if (r < 0) {
+ lderr(ictx->cct) << "warning: failed to remove some object(s): "
+ << cpp_strerror(r) << dendl;
+ }
+ }
+
+ // Reads the complete header object into `header` in READ_SIZE chunks and
+ // verifies that it starts with either RBD_HEADER_TEXT or
+ // RBD_MIGRATE_HEADER_TEXT.
+ //
+ // Returns 0 on success, a negative read error, or -ENXIO if the header text
+ // is not recognized.  If ver is non-null it receives
+ // io_ctx.get_last_version().
+ int read_header_bl(IoCtx& io_ctx, const string& header_oid,
+ bufferlist& header, uint64_t *ver)
+ {
+ int r;
+ uint64_t off = 0;
+#define READ_SIZE 4096
+ // keep reading until a chunk comes back shorter than READ_SIZE
+ do {
+ bufferlist bl;
+ r = io_ctx.read(header_oid, bl, READ_SIZE, off);
+ if (r < 0)
+ return r;
+ header.claim_append(bl);
+ off += r;
+ } while (r == READ_SIZE);
+
+ static_assert(sizeof(RBD_HEADER_TEXT) == sizeof(RBD_MIGRATE_HEADER_TEXT),
+ "length of rbd headers must be the same");
+
+ if (header.length() < sizeof(RBD_HEADER_TEXT) ||
+ (memcmp(RBD_HEADER_TEXT, header.c_str(),
+ sizeof(RBD_HEADER_TEXT)) != 0 &&
+ memcmp(RBD_MIGRATE_HEADER_TEXT, header.c_str(),
+ sizeof(RBD_MIGRATE_HEADER_TEXT)) != 0)) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ lderr(cct) << "unrecognized header format" << dendl;
+ return -ENXIO;
+ }
+
+ if (ver)
+ *ver = io_ctx.get_last_version();
+
+ return 0;
+ }
+
+ // Reads the header object via read_header_bl() and copies its leading bytes
+ // into *header.  Returns -EIO if fewer bytes than the on-disk structure
+ // were read; otherwise propagates read_header_bl()'s result.
+ int read_header(IoCtx& io_ctx, const string& header_oid,
+ struct rbd_obj_header_ondisk *header, uint64_t *ver)
+ {
+ bufferlist header_bl;
+ int r = read_header_bl(io_ctx, header_oid, header_bl, ver);
+ if (r < 0)
+ return r;
+ if (header_bl.length() < (int)sizeof(*header))
+ return -EIO;
+ memcpy(header, header_bl.c_str(), sizeof(*header));
+
+ return 0;
+ }
+
+ // Sends a CEPH_OSD_TMAP_SET update for imgname (with an empty value) to the
+ // RBD_DIRECTORY object and returns the result of tmap_update().
+ int tmap_set(IoCtx& io_ctx, const string& imgname)
+ {
+ bufferlist cmdbl, emptybl;
+ __u8 c = CEPH_OSD_TMAP_SET;
+ encode(c, cmdbl);
+ encode(imgname, cmdbl);
+ encode(emptybl, cmdbl);
+ return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
+ }
+
+ // Sends a CEPH_OSD_TMAP_RM update for imgname to the RBD_DIRECTORY object
+ // and returns the result of tmap_update().
+ int tmap_rm(IoCtx& io_ctx, const string& imgname)
+ {
+ bufferlist cmdbl;
+ __u8 c = CEPH_OSD_TMAP_RM;
+ encode(c, cmdbl);
+ encode(imgname, cmdbl);
+ return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
+ }
+
+ // An option value is either a string or a 64-bit unsigned integer; options
+ // are stored in a map keyed by option id, and the opaque
+ // rbd_image_options_t handle points at a shared_ptr to such a map.
+ typedef boost::variant<std::string,uint64_t> image_option_value_t;
+ typedef std::map<int,image_option_value_t> image_options_t;
+ typedef std::shared_ptr<image_options_t> image_options_ref;
+
+ // value type expected for an option
+ enum image_option_type_t {
+ STR,
+ UINT64,
+ };
+
+ // every supported option id and the value type it must be set/read with;
+ // ids missing from this table are rejected by the accessors
+ const std::map<int, image_option_type_t> IMAGE_OPTIONS_TYPE_MAPPING = {
+ {RBD_IMAGE_OPTION_FORMAT, UINT64},
+ {RBD_IMAGE_OPTION_FEATURES, UINT64},
+ {RBD_IMAGE_OPTION_ORDER, UINT64},
+ {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64},
+ {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64},
+ {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64},
+ {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64},
+ {RBD_IMAGE_OPTION_JOURNAL_POOL, STR},
+ {RBD_IMAGE_OPTION_FEATURES_SET, UINT64},
+ {RBD_IMAGE_OPTION_FEATURES_CLEAR, UINT64},
+ {RBD_IMAGE_OPTION_DATA_POOL, STR},
+ {RBD_IMAGE_OPTION_FLATTEN, UINT64},
+ {RBD_IMAGE_OPTION_CLONE_FORMAT, UINT64},
+ };
+
+ // Returns the human-readable name of an image option id, or
+ // "unknown (<id>)" for ids that are not recognized.
+ std::string image_option_name(int optname) {
+ switch (optname) {
+ case RBD_IMAGE_OPTION_FORMAT:
+ return "format";
+ case RBD_IMAGE_OPTION_FEATURES:
+ return "features";
+ case RBD_IMAGE_OPTION_ORDER:
+ return "order";
+ case RBD_IMAGE_OPTION_STRIPE_UNIT:
+ return "stripe_unit";
+ case RBD_IMAGE_OPTION_STRIPE_COUNT:
+ return "stripe_count";
+ case RBD_IMAGE_OPTION_JOURNAL_ORDER:
+ return "journal_order";
+ case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH:
+ return "journal_splay_width";
+ case RBD_IMAGE_OPTION_JOURNAL_POOL:
+ return "journal_pool";
+ case RBD_IMAGE_OPTION_FEATURES_SET:
+ return "features_set";
+ case RBD_IMAGE_OPTION_FEATURES_CLEAR:
+ return "features_clear";
+ case RBD_IMAGE_OPTION_DATA_POOL:
+ return "data_pool";
+ case RBD_IMAGE_OPTION_FLATTEN:
+ return "flatten";
+ case RBD_IMAGE_OPTION_CLONE_FORMAT:
+ return "clone_format";
+ default:
+ return "unknown (" + stringify(optname) + ")";
+ }
+ }
+
+ // Allocates a new, empty option set and stores its opaque handle in *opts.
+ // The handle must later be released with image_options_destroy().
+ void image_options_create(rbd_image_options_t* opts)
+ {
+   auto *new_ref = new image_options_ref(new image_options_t());
+   *opts = static_cast<rbd_image_options_t>(new_ref);
+ }
+
+ // Creates a new handle that shares the option set referenced by orig (the
+ // shared_ptr is copied, not the underlying map) and stores it in *opts.
+ void image_options_create_ref(rbd_image_options_t* opts,
+                               rbd_image_options_t orig)
+ {
+   const auto *source_ref = static_cast<image_options_ref*>(orig);
+   auto *shared_ref = new image_options_ref(*source_ref);
+   *opts = static_cast<rbd_image_options_t>(shared_ref);
+ }
+
+ // Creates a new, independent option set in *opts and copies every option
+ // that is present in orig, using the value type registered for the option
+ // in IMAGE_OPTIONS_TYPE_MAPPING.
+ void image_options_copy(rbd_image_options_t* opts,
+                         const ImageOptions &orig)
+ {
+   *opts = static_cast<rbd_image_options_t>(
+     new image_options_ref(new image_options_t()));
+
+   std::string str_val;
+   uint64_t uint64_val;
+   for (auto &mapping : IMAGE_OPTIONS_TYPE_MAPPING) {
+     const int optname = mapping.first;
+     if (mapping.second == STR) {
+       if (orig.get(optname, &str_val) == 0) {
+         image_options_set(*opts, optname, str_val);
+       }
+     } else if (mapping.second == UINT64) {
+       if (orig.get(optname, &uint64_val) == 0) {
+         image_options_set(*opts, optname, uint64_val);
+       }
+     }
+   }
+ }
+
+ // Releases a handle obtained from one of the image_options_create*/copy
+ // functions by deleting the shared_ptr it points to.
+ void image_options_destroy(rbd_image_options_t opts)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ delete opts_;
+ }
+
+ // Sets a string-valued option.  Returns -EINVAL if optname is unknown or
+ // is not registered as a string option, 0 otherwise.
+ int image_options_set(rbd_image_options_t opts, int optname,
+                       const std::string& optval)
+ {
+   auto type_it = IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+   if (type_it == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
+     return -EINVAL;
+   }
+   if (type_it->second != STR) {
+     return -EINVAL;
+   }
+
+   image_options_t &options = **static_cast<image_options_ref*>(opts);
+   options[optname] = optval;
+   return 0;
+ }
+
+ // Sets an integer-valued option.  Returns -EINVAL if optname is unknown or
+ // is not registered as a uint64 option, 0 otherwise.
+ int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval)
+ {
+   auto type_it = IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+   if (type_it == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
+     return -EINVAL;
+   }
+   if (type_it->second != UINT64) {
+     return -EINVAL;
+   }
+
+   image_options_t &options = **static_cast<image_options_ref*>(opts);
+   options[optname] = optval;
+   return 0;
+ }
+
+ // Reads a string-valued option into *optval.  Returns -EINVAL if optname
+ // is unknown or not a string option, -ENOENT if the option has not been
+ // set, and 0 on success.
+ int image_options_get(rbd_image_options_t opts, int optname,
+                       std::string* optval)
+ {
+   auto type_it = IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+   if (type_it == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
+     return -EINVAL;
+   }
+   if (type_it->second != STR) {
+     return -EINVAL;
+   }
+
+   const image_options_t &options = **static_cast<image_options_ref*>(opts);
+   auto value_it = options.find(optname);
+   if (value_it == options.end()) {
+     return -ENOENT;
+   }
+
+   *optval = boost::get<std::string>(value_it->second);
+   return 0;
+ }
+
+ // Reads an integer-valued option into *optval.  Returns -EINVAL if optname
+ // is unknown or not a uint64 option, -ENOENT if the option has not been
+ // set, and 0 on success.
+ int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval)
+ {
+   auto type_it = IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+   if (type_it == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
+     return -EINVAL;
+   }
+   if (type_it->second != UINT64) {
+     return -EINVAL;
+   }
+
+   const image_options_t &options = **static_cast<image_options_ref*>(opts);
+   auto value_it = options.find(optname);
+   if (value_it == options.end()) {
+     return -ENOENT;
+   }
+
+   *optval = boost::get<uint64_t>(value_it->second);
+   return 0;
+ }
+
+ // Reports through *is_set whether the option currently has a value.
+ // Returns -EINVAL for an unknown option id, 0 otherwise.
+ int image_options_is_set(rbd_image_options_t opts, int optname,
+                          bool* is_set)
+ {
+   const bool known_option = IMAGE_OPTIONS_TYPE_MAPPING.count(optname) != 0;
+   if (!known_option) {
+     return -EINVAL;
+   }
+
+   const image_options_t &options = **static_cast<image_options_ref*>(opts);
+   *is_set = (options.count(optname) != 0);
+   return 0;
+ }
+
+ // Removes an option from the set.  Returns -EINVAL for an unknown option
+ // id, -ENOENT if the option was not set, and 0 once it has been erased.
+ int image_options_unset(rbd_image_options_t opts, int optname)
+ {
+   image_options_t &options = **static_cast<image_options_ref*>(opts);
+
+   auto type_it = IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+   if (type_it == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
+     // an unknown option can never have been stored by the setters
+     ceph_assert(options.find(optname) == options.end());
+     return -EINVAL;
+   }
+
+   auto value_it = options.find(optname);
+   if (value_it == options.end()) {
+     return -ENOENT;
+   }
+
+   options.erase(value_it);
+   return 0;
+ }
+
+ // Removes all options from the set referenced by the handle.
+ void image_options_clear(rbd_image_options_t opts)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ (*opts_)->clear();
+ }
+
+ // Returns true when no option is set in the referenced option set.
+ bool image_options_is_empty(rbd_image_options_t opts)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ return (*opts_)->empty();
+ }
+
+ /**
+  * Flattens every child image cloned from the given snapshot of ictx.
+  *
+  * Each child is opened, flattened and closed in turn; progress is reported
+  * to pctx after every child.  ictx->snap_lock is held for read from the
+  * snapshot lookup until the function returns.
+  *
+  * @param ictx       parent image
+  * @param snap_name  name of the parent snapshot (user snapshot namespace)
+  * @param pctx       receives (children done, total children) updates
+  * @return 0 on success (including "no children"); -EBUSY if a child has
+  *         snapshots but lacks the deep-flatten feature; otherwise the first
+  *         negative error encountered
+  */
+ int flatten_children(ImageCtx *ictx, const char* snap_name,
+                      ProgressContext& pctx)
+ {
+   CephContext *cct = ictx->cct;
+   ldout(cct, 20) << "children flatten " << ictx->name << dendl;
+
+   int r = ictx->state->refresh_if_required();
+   if (r < 0) {
+     return r;
+   }
+
+   RWLock::RLocker l(ictx->snap_lock);
+   snap_t snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(),
+                                      snap_name);
+
+   cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(),
+                                         ictx->md_ctx.get_namespace(),
+                                         ictx->id, snap_id};
+   std::vector<librbd::linked_image_spec_t> child_images;
+   r = api::Image<>::list_children(ictx, parent_spec, &child_images);
+   if (r < 0) {
+     return r;
+   }
+
+   size_t size = child_images.size();
+   if (size == 0) {
+     return 0;
+   }
+
+   librados::IoCtx child_io_ctx;
+   int64_t child_pool_id = -1;
+   size_t i = 0;
+   for (auto &child_image : child_images){
+     const std::string &pool = child_image.pool_name;
+     // only (re)create the IoCtx when the pool or namespace changes
+     if (child_pool_id == -1 ||
+         child_pool_id != child_image.pool_id ||
+         child_io_ctx.get_namespace() != child_image.pool_namespace) {
+       r = util::create_ioctx(ictx->md_ctx, "child image",
+                              child_image.pool_id, child_image.pool_namespace,
+                              &child_io_ctx);
+       if (r < 0) {
+         return r;
+       }
+
+       child_pool_id = child_image.pool_id;
+     }
+
+     ImageCtx *imctx = new ImageCtx("", child_image.image_id, nullptr,
+                                    child_io_ctx, false);
+     r = imctx->state->open(0);
+     if (r < 0) {
+       // NOTE(review): imctx is not released explicitly here -- presumably
+       // ImageState::open() disposes of it on failure; confirm
+       lderr(cct) << "error opening image: " << cpp_strerror(r) << dendl;
+       return r;
+     }
+
+     if ((imctx->features & RBD_FEATURE_DEEP_FLATTEN) == 0 &&
+         !imctx->snaps.empty()) {
+       lderr(cct) << "snapshot in-use by " << pool << "/" << imctx->name
+                  << dendl;
+       imctx->state->close();
+       return -EBUSY;
+     }
+
+     librbd::NoOpProgressContext prog_ctx;
+     r = imctx->operations->flatten(prog_ctx);
+     if (r < 0) {
+       // report the child as "<pool>/[<namespace>/]<image>: <error>"
+       lderr(cct) << "error flattening image: " << pool << "/"
+                  << (child_image.pool_namespace.empty() ?
+                       "" : child_image.pool_namespace + "/")
+                  << child_image.image_name << ": " << cpp_strerror(r)
+                  << dendl;
+       imctx->state->close();
+       return r;
+     }
+
+     r = imctx->state->close();
+     if (r < 0) {
+       lderr(cct) << "failed to close image: " << cpp_strerror(r) << dendl;
+       return r;
+     }
+
+     pctx.update_progress(++i, size);
+     ceph_assert(i <= size);
+   }
+
+   return 0;
+ }
+
+ // Looks up the snapshot named snap_name and stores its namespace in
+ // *snap_namespace.  Note that *snap_namespace is also used as input: its
+ // incoming value selects the namespace searched by get_snap_id().
+ // Returns -ENOENT if no such snapshot exists, or a refresh/lookup error.
+ int get_snap_namespace(ImageCtx *ictx,
+ const char *snap_name,
+ cls::rbd::SnapshotNamespace *snap_namespace) {
+ ldout(ictx->cct, 20) << "get_snap_namespace " << ictx << " " << snap_name
+ << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+ RWLock::RLocker l(ictx->snap_lock);
+ snap_t snap_id = ictx->get_snap_id(*snap_namespace, snap_name);
+ if (snap_id == CEPH_NOSNAP)
+ return -ENOENT;
+ r = ictx->get_snap_namespace(snap_id, snap_namespace);
+ return r;
+ }
+
+ /**
+  * Reports whether a user snapshot is protected.
+  *
+  * @param ictx          image context
+  * @param snap_name     snapshot name (user snapshot namespace)
+  * @param is_protected  set to true unless the snapshot is unprotected;
+  *                      only written when the call succeeds
+  * @return 0 on success, -ENOENT if the snapshot does not exist, or a
+  *         negative error from the refresh / protection-status lookup
+  */
+ int snap_is_protected(ImageCtx *ictx, const char *snap_name, bool *is_protected)
+ {
+   ldout(ictx->cct, 20) << "snap_is_protected " << ictx << " " << snap_name
+                        << dendl;
+
+   int r = ictx->state->refresh_if_required();
+   if (r < 0)
+     return r;
+
+   RWLock::RLocker l(ictx->snap_lock);
+   snap_t snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(), snap_name);
+   if (snap_id == CEPH_NOSNAP)
+     return -ENOENT;
+
+   bool is_unprotected = false;
+   r = ictx->is_snap_unprotected(snap_id, &is_unprotected);
+   if (r < 0) {
+     // the status was not retrieved: do not derive a result from it
+     return r;
+   }
+
+   // consider both PROTECTED or UNPROTECTING to be 'protected',
+   // since in either state they can't be deleted
+   *is_protected = !is_unprotected;
+   return r;
+ }
+
+ // Creates a format 1 image: validates the pool, registers the image name
+ // in the RBD directory and writes the on-disk header object.  Format 1
+ // images cannot be created inside a namespace (-EINVAL).  If writing the
+ // header fails, the directory entry is removed again (best effort) and the
+ // write error is returned.
+ int create_v1(IoCtx& io_ctx, const char *imgname, uint64_t size, int order)
+ {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+
+ ldout(cct, 20) << __func__ << " " << &io_ctx << " name = " << imgname
+ << " size = " << size << " order = " << order << dendl;
+ int r = validate_pool(io_ctx, cct);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!io_ctx.get_namespace().empty()) {
+ lderr(cct) << "attempting to add v1 image to namespace" << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 2) << "adding rbd image to directory..." << dendl;
+ r = tmap_set(io_ctx, imgname);
+ if (r < 0) {
+ lderr(cct) << "error adding image to directory: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // the client instance id is used as the block-name id of the new image
+ Rados rados(io_ctx);
+ uint64_t bid = rados.get_instance_id();
+
+ ldout(cct, 2) << "creating rbd image..." << dendl;
+ struct rbd_obj_header_ondisk header;
+ init_rbd_header(header, size, order, bid);
+
+ bufferlist bl;
+ bl.append((const char *)&header, sizeof(header));
+
+ string header_oid = util::old_header_name(imgname);
+ r = io_ctx.write(header_oid, bl, bl.length(), 0);
+ if (r < 0) {
+ lderr(cct) << "Error writing image header: " << cpp_strerror(r)
+ << dendl;
+ // roll back the directory entry added above
+ int remove_r = tmap_rm(io_ctx, imgname);
+ if (remove_r < 0) {
+ lderr(cct) << "Could not remove image from directory after "
+ << "header creation failed: "
+ << cpp_strerror(remove_r) << dendl;
+ }
+ return r;
+ }
+
+ ldout(cct, 2) << "done." << dendl;
+ return 0;
+ }
+
+ /**
+  * Creates an image using default options apart from the object order.
+  *
+  * @param io_ctx   pool (and namespace) to create the image in
+  * @param imgname  name of the new image
+  * @param size     image size in bytes
+  * @param order    in: requested order; out: the order stored back in the
+  *                 options by the underlying create call.  Must not be null.
+  * @return 0 on success, -EINVAL if order is null, otherwise the error
+  *         returned by the option-based create()
+  */
+ int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
+            int *order)
+ {
+   // order is dereferenced for both input and output
+   if (!order)
+     return -EINVAL;
+
+   uint64_t order_ = *order;
+   ImageOptions opts;
+
+   int r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
+   ceph_assert(r == 0);
+
+   r = create(io_ctx, imgname, "", size, opts, "", "", false);
+
+   // report the order back to the caller even if creation failed
+   int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
+   ceph_assert(r1 == 0);
+   *order = order_;
+
+   return r;
+ }
+
+ // Creates an image from explicit format/feature/striping parameters by
+ // packing them into ImageOptions and delegating to the option-based
+ // create().  *order is read as the requested order and overwritten with the
+ // order stored in the options afterwards (even if creation failed).
+ // Returns -EINVAL if order is null.
+ int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
+ bool old_format, uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count)
+ {
+ if (!order)
+ return -EINVAL;
+
+ uint64_t order_ = *order;
+ // old_format selects image format 1, otherwise format 2
+ uint64_t format = old_format ? 1 : 2;
+ ImageOptions opts;
+ int r;
+
+ r = opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+ ceph_assert(r == 0);
+ r = opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+ ceph_assert(r == 0);
+ r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
+ ceph_assert(r == 0);
+ r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ ceph_assert(r == 0);
+ r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ ceph_assert(r == 0);
+
+ r = create(io_ctx, imgname, "", size, opts, "", "", false);
+
+ int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
+ ceph_assert(r1 == 0);
+ *order = order_;
+
+ return r;
+ }
+
+ // Option-based image creation.
+ //
+ // - an empty image_id is replaced by a generated id
+ // - the 'flatten' and 'clone_format' options are rejected with -EINVAL
+ // - the format defaults to "rbd_default_format" and the order to
+ //   "rbd_default_order" (also used when the order option is 0)
+ // - returns -EEXIST if an image of that name exists in either format
+ // - format 1 creation is refused unless the RBD_FORCE_ALLOW_V1 environment
+ //   variable is set; format 2 images are created via image::CreateRequest
+ //   and waited for synchronously
+ //
+ // The effective order is written back to opts before returning, whatever
+ // the outcome of the creation itself.
+ int create(IoCtx& io_ctx, const std::string &image_name,
+ const std::string &image_id, uint64_t size,
+ ImageOptions& opts,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ bool skip_mirror_enable)
+ {
+ std::string id(image_id);
+ if (id.empty()) {
+ id = util::generate_image_id(io_ctx);
+ }
+
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ uint64_t option;
+ if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
+ lderr(cct) << "create does not support 'flatten' image option" << dendl;
+ return -EINVAL;
+ }
+ if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
+ lderr(cct) << "create does not support 'clone_format' image option"
+ << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 10) << __func__ << " name=" << image_name << ", "
+ << "id= " << id << ", "
+ << "size=" << size << ", opts=" << opts << dendl;
+
+ uint64_t format;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0)
+ format = cct->_conf.get_val<uint64_t>("rbd_default_format");
+ bool old_format = format == 1;
+
+ // make sure it doesn't already exist, in either format
+ int r = detect_format(io_ctx, image_name, NULL, NULL);
+ if (r != -ENOENT) {
+ if (r) {
+ lderr(cct) << "Could not tell if " << image_name << " already exists"
+ << dendl;
+ return r;
+ }
+ lderr(cct) << "rbd image " << image_name << " already exists" << dendl;
+ return -EEXIST;
+ }
+
+ uint64_t order = 0;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
+ order = cct->_conf.get_val<uint64_t>("rbd_default_order");
+ }
+ r = image::CreateRequest<>::validate_order(cct, order);
+ if (r < 0) {
+ return r;
+ }
+
+ if (old_format) {
+ if ( !getenv("RBD_FORCE_ALLOW_V1") ) {
+ lderr(cct) << "Format 1 image creation unsupported. " << dendl;
+ return -EINVAL;
+ }
+ lderr(cct) << "Forced V1 image creation. " << dendl;
+ r = create_v1(io_ctx, image_name.c_str(), size, order);
+ } else {
+ ThreadPool *thread_pool;
+ ContextWQ *op_work_queue;
+ ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+ // start from the global configuration and apply per-pool overrides
+ ConfigProxy config{cct->_conf};
+ api::Config<>::apply_pool_overrides(io_ctx, &config);
+
+ C_SaferCond cond;
+ image::CreateRequest<> *req = image::CreateRequest<>::create(
+ config, io_ctx, image_name, id, size, opts, non_primary_global_image_id,
+ primary_mirror_uuid, skip_mirror_enable, op_work_queue, &cond);
+ req->send();
+
+ r = cond.wait();
+ }
+
+ // publish the order that was actually used back to the caller's options
+ int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ ceph_assert(r1 == 0);
+
+ return r;
+ }
+
+ /*
+  * Parent may be in different pool, hence different IoCtx
+  *
+  * Clones the parent snapshot p_name@p_snap_name into the child image
+  * c_name using explicit feature/striping parameters, which are packed into
+  * ImageOptions and passed to the option-based clone().
+  *
+  * c_order is read as the requested order and overwritten with the order
+  * stored in the options afterwards.  Returns -EINVAL if c_order is null,
+  * otherwise the result of the option-based clone().
+  */
+ int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+           IoCtx& c_ioctx, const char *c_name,
+           uint64_t features, int *c_order,
+           uint64_t stripe_unit, int stripe_count)
+ {
+   // c_order is dereferenced for both input and output
+   if (!c_order)
+     return -EINVAL;
+
+   uint64_t order = *c_order;
+
+   ImageOptions opts;
+   opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+   opts.set(RBD_IMAGE_OPTION_ORDER, order);
+   opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+   opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+
+   int r = clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, nullptr,
+                 c_name, opts, "", "");
+   opts.get(RBD_IMAGE_OPTION_ORDER, &order);
+   *c_order = order;
+   return r;
+ }
+
+ // Option-based clone.  Exactly one of p_id / p_name must be given (this is
+ // asserted); when only the name is given, the parent id is looked up in the
+ // RBD directory.  A null c_id causes a new image id to be generated.
+ //
+ // Returns -EINVAL if no parent snapshot name is given or if the 'flatten'
+ // option is set, the directory lookup error if the parent cannot be
+ // resolved, otherwise the result of image::CloneRequest (0 on success).
+ int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name,
+ const char *p_snap_name, IoCtx& c_ioctx, const char *c_id,
+ const char *c_name, ImageOptions& c_opts,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid)
+ {
+ ceph_assert((p_id == nullptr) ^ (p_name == nullptr));
+
+ CephContext *cct = (CephContext *)p_ioctx.cct();
+ if (p_snap_name == nullptr) {
+ lderr(cct) << "image to be cloned must be a snapshot" << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t flatten;
+ if (c_opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
+ lderr(cct) << "clone does not support 'flatten' image option" << dendl;
+ return -EINVAL;
+ }
+
+ int r;
+ std::string parent_id;
+ if (p_id == nullptr) {
+ r = cls_client::dir_get_id(&p_ioctx, RBD_DIRECTORY, p_name,
+ &parent_id);
+ if (r < 0) {
+ // a missing parent (-ENOENT) is returned without logging
+ if (r != -ENOENT) {
+ lderr(cct) << "failed to retrieve parent image id: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ } else {
+ parent_id = p_id;
+ }
+
+ std::string clone_id;
+ if (c_id == nullptr) {
+ clone_id = util::generate_image_id(c_ioctx);
+ } else {
+ clone_id = c_id;
+ }
+
+ ldout(cct, 10) << __func__ << " "
+ << "c_name=" << c_name << ", "
+ << "c_id= " << clone_id << ", "
+ << "c_opts=" << c_opts << dendl;
+
+ // configuration is taken from the child's context plus its pool overrides
+ ConfigProxy config{reinterpret_cast<CephContext *>(c_ioctx.cct())->_conf};
+ api::Config<>::apply_pool_overrides(c_ioctx, &config);
+
+ ThreadPool *thread_pool;
+ ContextWQ *op_work_queue;
+ ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
+
+ C_SaferCond cond;
+ auto *req = image::CloneRequest<>::create(
+ config, p_ioctx, parent_id, p_snap_name, CEPH_NOSNAP, c_ioctx, c_name,
+ clone_id, c_opts, non_primary_global_image_id, primary_mirror_uuid,
+ op_work_queue, &cond);
+ req->send();
+
+ r = cond.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+ }
+
+ // Renames an image: opens the source image, invokes the rename operation
+ // and closes the image again when the scope is left.  Returns the open
+ // error if the source cannot be opened, otherwise the rename result.
+ int rename(IoCtx& io_ctx, const char *srcname, const char *dstname)
+ {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> "
+ << dstname << dendl;
+
+ ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
+ int r = ictx->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ // close the image on every exit path below
+ BOOST_SCOPE_EXIT((ictx)) {
+ ictx->state->close();
+ } BOOST_SCOPE_EXIT_END
+
+ return ictx->operations->rename(dstname);
+ }
+
+ // Refreshes the image if required and fills `info` via image_info().
+ // Returns the refresh error on failure, 0 otherwise.
+ int info(ImageCtx *ictx, image_info_t& info, size_t infosize)
+ {
+ ldout(ictx->cct, 20) << "info " << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ image_info(ictx, info, infosize);
+ return 0;
+ }
+
+ // Store the image's old_format flag in `*old` after refreshing the image
+ // if required. Returns 0, or the negative refresh error.
+ int get_old_format(ImageCtx *ictx, uint8_t *old)
+ {
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret < 0) {
+     return refresh_ret;
+   }
+
+   *old = ictx->old_format;
+   return 0;
+ }
+
+ // Store the size of the image at its current snap_id in `*size`.
+ // Returns 0, or the negative error from refreshing the image.
+ int get_size(ImageCtx *ictx, uint64_t *size)
+ {
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret < 0) {
+     return refresh_ret;
+   }
+
+   // the size is read under the snap_lock read lock
+   RWLock::RLocker snap_locker(ictx->snap_lock);
+   *size = ictx->get_image_size(ictx->snap_id);
+   return 0;
+ }
+
+ // Store the image's feature bits in `*features`.
+ // Returns 0, or the negative error from refreshing the image.
+ int get_features(ImageCtx *ictx, uint64_t *features)
+ {
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret < 0) {
+     return refresh_ret;
+   }
+
+   // features are read under the snap_lock read lock
+   RWLock::RLocker snap_locker(ictx->snap_lock);
+   *features = ictx->features;
+   return 0;
+ }
+
+ // Query the parent overlap for the image's current snap_id.
+ // Returns the refresh error if the refresh fails, otherwise whatever
+ // ImageCtx::get_parent_overlap() returns.
+ int get_overlap(ImageCtx *ictx, uint64_t *overlap)
+ {
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret < 0) {
+     return refresh_ret;
+   }
+
+   // lock order: snap_lock first, then parent_lock
+   RWLock::RLocker snap_locker(ictx->snap_lock);
+   RWLock::RLocker parent_locker(ictx->parent_lock);
+   const int overlap_ret = ictx->get_parent_overlap(ictx->snap_id, overlap);
+   return overlap_ret;
+ }
+
+ // Fetch the flags for the image's current snap_id into `*flags`.
+ // Returns the refresh error if the refresh fails, otherwise whatever
+ // ImageCtx::get_flags() returns.
+ int get_flags(ImageCtx *ictx, uint64_t *flags)
+ {
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret >= 0) {
+     RWLock::RLocker snap_locker(ictx->snap_lock);
+     return ictx->get_flags(ictx->snap_id, flags);
+   }
+   return refresh_ret;
+ }
+
+ // Register `fd` (of kind `type`) as the image's event socket.
+ // Returns the refresh error if refreshing the image fails, -EINVAL if an
+ // event socket is already valid for this image, otherwise the result of
+ // event_socket.init().
+ int set_image_notification(ImageCtx *ictx, int fd, int type)
+ {
+   CephContext *cct = ictx->cct;
+   // log fix: "type" is now followed by a space so the label and its value
+   // no longer run together
+   ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type " << type << dendl;
+
+   int r = ictx->state->refresh_if_required();
+   if (r < 0) {
+     return r;
+   }
+
+   // only one notification socket may be registered per image
+   if (ictx->event_socket.is_valid()) {
+     return -EINVAL;
+   }
+   return ictx->event_socket.init(fd, type);
+ }
+
+ // Report in `*is_owner` whether this client currently owns the exclusive
+ // lock. `*is_owner` is false when no exclusive lock object exists or when
+ // the header-lock assertion yields -EBUSY / -ENOENT; any other negative
+ // result is returned as an error.
+ int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
+ {
+   CephContext *cct = ictx->cct;
+   ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
+   *is_owner = false;
+
+   RWLock::RLocker owner_locker(ictx->owner_lock);
+   if (ictx->exclusive_lock == nullptr) {
+     return 0;
+   }
+
+   // might have been blacklisted by peer -- ensure we still own
+   // the lock by pinging the OSD
+   const int assert_ret = ictx->exclusive_lock->assert_header_locked();
+   const bool not_owner = (assert_ret == -EBUSY || assert_ret == -ENOENT);
+   if (not_owner) {
+     return 0;
+   }
+   if (assert_ret < 0) {
+     return assert_ret;
+   }
+
+   *is_owner = true;
+   return 0;
+ }
+
+ // Acquire the image's exclusive lock and wait for the request to finish.
+ // Only RBD_LOCK_MODE_EXCLUSIVE is accepted (-EOPNOTSUPP otherwise).
+ // Returns -EINVAL when ictx->exclusive_lock is null, 0 when the lock is
+ // (already) owned, the request's error if it fails, or
+ // get_unlocked_op_error() when the request completed without ownership.
+ int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
+ << "lock_mode=" << lock_mode << dendl;
+
+ if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
+ return -EOPNOTSUPP;
+ }
+
+ C_SaferCond lock_ctx;
+ {
+ RWLock::WLocker l(ictx->owner_lock);
+
+ if (ictx->exclusive_lock == nullptr) {
+ lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ // swap in a StandardPolicy when the current policy may auto-request the
+ // lock. NOTE(review): presumably so an explicitly acquired lock is not
+ // handed over automatically -- confirm against the policy classes.
+ if (ictx->get_exclusive_lock_policy()->may_auto_request_lock()) {
+ ictx->set_exclusive_lock_policy(
+ new exclusive_lock::StandardPolicy(ictx));
+ }
+
+ // nothing to do if the lock is already owned
+ if (ictx->exclusive_lock->is_lock_owner()) {
+ return 0;
+ }
+
+ ictx->exclusive_lock->acquire_lock(&lock_ctx);
+ }
+
+ // owner_lock is dropped before blocking on the request
+ int r = lock_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // re-validate ownership under owner_lock now that the request completed
+ RWLock::RLocker l(ictx->owner_lock);
+ if (ictx->exclusive_lock == nullptr) {
+ return -EINVAL;
+ } else if (!ictx->exclusive_lock->is_lock_owner()) {
+ lderr(cct) << "failed to acquire exclusive lock" << dendl;
+ return ictx->exclusive_lock->get_unlocked_op_error();
+ }
+
+ return 0;
+ }
+
+ // Release the exclusive lock held by this client and wait for completion.
+ // Returns -EINVAL when there is no exclusive lock object or this client is
+ // not the owner, the negative error of the release request, or 0.
+ int lock_release(ImageCtx *ictx)
+ {
+   CephContext *cct = ictx->cct;
+   ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
+
+   C_SaferCond release_ctx;
+   {
+     RWLock::WLocker owner_locker(ictx->owner_lock);
+
+     const bool owns_lock = (ictx->exclusive_lock != nullptr &&
+                             ictx->exclusive_lock->is_lock_owner());
+     if (!owns_lock) {
+       lderr(cct) << "not exclusive lock owner" << dendl;
+       return -EINVAL;
+     }
+
+     ictx->exclusive_lock->release_lock(&release_ctx);
+   }
+
+   // owner_lock is no longer held while waiting for the release
+   const int release_ret = release_ctx.wait();
+   if (release_ret >= 0) {
+     return 0;
+   }
+
+   lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(release_ret)
+              << dendl;
+   return release_ret;
+ }
+
+ // Look up the current exclusive-lock owner of the image.
+ // On success sets *lock_mode to RBD_LOCK_MODE_EXCLUSIVE and replaces the
+ // contents of *lock_owners with the single locker address.
+ // Returns -EINVAL when the exclusive-lock feature is not enabled, -ENOENT
+ // (not logged) or another negative error from the locker lookup.
+ int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode,
+ std::list<std::string> *lock_owners)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
+
+ if (!ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+ lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ managed_lock::Locker locker;
+ C_SaferCond get_owner_ctx;
+ // NOTE(review): get_locker() is invoked on a temporary ExclusiveLock<>
+ // that is destroyed at the end of this statement, before the wait below --
+ // confirm the request does not rely on that object staying alive.
+ ExclusiveLock<>(*ictx).get_locker(&locker, &get_owner_ctx);
+ int r = get_owner_ctx.wait();
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to determine current lock owner: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ *lock_mode = RBD_LOCK_MODE_EXCLUSIVE;
+ lock_owners->clear();
+ lock_owners->emplace_back(locker.address);
+ return 0;
+ }
+
+ // Break the exclusive lock held by `lock_owner`.
+ // Steps: look up the current locker, verify its address equals
+ // `lock_owner` (-EBUSY otherwise), then issue break_lock() for it.
+ // Returns -EOPNOTSUPP for any mode other than RBD_LOCK_MODE_EXCLUSIVE,
+ // -EROFS for a read-only image, -EINVAL when ictx->exclusive_lock is null,
+ // -ENOENT (not logged) or another negative error from either request.
+ int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode,
+ const std::string &lock_owner) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
+ << "lock_mode=" << lock_mode << ", "
+ << "lock_owner=" << lock_owner << dendl;
+
+ if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
+ return -EOPNOTSUPP;
+ }
+
+ if (ictx->read_only) {
+ return -EROFS;
+ }
+
+ managed_lock::Locker locker;
+ C_SaferCond get_owner_ctx;
+ {
+ // owner_lock is only held while the request is issued, not while waiting
+ RWLock::RLocker l(ictx->owner_lock);
+
+ if (ictx->exclusive_lock == nullptr) {
+ lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
+ }
+ int r = get_owner_ctx.wait();
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to determine current lock owner: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // refuse to break a lock that is held by someone other than `lock_owner`
+ if (locker.address != lock_owner) {
+ return -EBUSY;
+ }
+
+ C_SaferCond break_ctx;
+ {
+ // exclusive_lock is re-checked because owner_lock was dropped in between
+ RWLock::RLocker l(ictx->owner_lock);
+
+ if (ictx->exclusive_lock == nullptr) {
+ lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ ictx->exclusive_lock->break_lock(locker, true, &break_ctx);
+ }
+ r = break_ctx.wait();
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+ }
+
+ // Append one snap_info_t (id, name, size) to `snaps` for every entry in
+ // ictx->snap_info. `snaps` is not cleared first.
+ // Returns 0, or the negative error from refreshing the image.
+ int snap_list(ImageCtx *ictx, vector<snap_info_t>& snaps)
+ {
+   ldout(ictx->cct, 20) << "snap_list " << ictx << dendl;
+
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret < 0) {
+     return refresh_ret;
+   }
+
+   // snap_info is walked under the snap_lock read lock
+   RWLock::RLocker snap_locker(ictx->snap_lock);
+   for (const auto &entry : ictx->snap_info) {
+     snap_info_t snap;
+     snap.name = entry.second.name;
+     snap.id = entry.first;
+     snap.size = entry.second.size;
+     snaps.push_back(snap);
+   }
+
+   return 0;
+ }
+
+ // Set `*exists` to whether a snapshot named `snap_name` in
+ // `snap_namespace` resolves to a snap id other than CEPH_NOSNAP.
+ // Returns 0, or the negative error from refreshing the image.
+ int snap_exists(ImageCtx *ictx, const cls::rbd::SnapshotNamespace& snap_namespace,
+                 const char *snap_name, bool *exists)
+ {
+   ldout(ictx->cct, 20) << "snap_exists " << ictx << " " << snap_name << dendl;
+
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret < 0) {
+     return refresh_ret;
+   }
+
+   RWLock::RLocker snap_locker(ictx->snap_lock);
+   const auto found_id = ictx->get_snap_id(snap_namespace, snap_name);
+   *exists = (found_id != CEPH_NOSNAP);
+   return 0;
+ }
+
+ // Remove the user snapshot `snap_name`.
+ // `flags` may request extra steps before the removal:
+ //  - RBD_SNAP_REMOVE_FLATTEN: call flatten_children() first
+ //  - RBD_SNAP_REMOVE_UNPROTECT: unprotect the snapshot if it is protected
+ // Returns the first negative error encountered, otherwise the result of
+ // the snap_remove operation.
+ int snap_remove(ImageCtx *ictx, const char *snap_name, uint32_t flags,
+ ProgressContext& pctx)
+ {
+ ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_name << " flags: " << flags << dendl;
+
+ int r = 0;
+
+ r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ if (flags & RBD_SNAP_REMOVE_FLATTEN) {
+ r = flatten_children(ictx, snap_name, pctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ bool is_protected;
+ r = snap_is_protected(ictx, snap_name, &is_protected);
+ if (r < 0) {
+ return r;
+ }
+
+ // `&` binds tighter than `&&`: unprotect only when the flag is set
+ if (is_protected && flags & RBD_SNAP_REMOVE_UNPROTECT) {
+ r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name);
+ if (r < 0) {
+ lderr(ictx->cct) << "failed to unprotect snapshot: " << snap_name << dendl;
+ return r;
+ }
+
+ // a snapshot still protected after a successful unprotect is treated as
+ // a fatal inconsistency
+ r = snap_is_protected(ictx, snap_name, &is_protected);
+ if (r < 0) {
+ return r;
+ }
+ if (is_protected) {
+ lderr(ictx->cct) << "snapshot is still protected after unprotection" << dendl;
+ ceph_abort();
+ }
+ }
+
+ C_SaferCond ctx;
+ ictx->operations->snap_remove(cls::rbd::UserSnapshotNamespace(), snap_name, &ctx);
+
+ r = ctx.wait();
+ return r;
+ }
+
+ // Convert the creation timestamp recorded for `snap_id` into `*timestamp`.
+ // Returns 0 on success, or -ENOENT when `snap_id` is not present in
+ // ictx->snap_info (previously this aborted the process via ceph_assert).
+ // The lookup is done under the snap_lock read lock, as snap_list() does
+ // when it walks the same map.
+ // NOTE(review): assumes callers do not already hold snap_lock -- confirm.
+ int snap_get_timestamp(ImageCtx *ictx, uint64_t snap_id, struct timespec *timestamp)
+ {
+   RWLock::RLocker snap_locker(ictx->snap_lock);
+
+   auto snap_it = ictx->snap_info.find(snap_id);
+   if (snap_it == ictx->snap_info.end()) {
+     // an unknown snap id is a caller error, not a reason to crash
+     return -ENOENT;
+   }
+
+   utime_t time = snap_it->second.timestamp;
+   time.to_timespec(timestamp);
+   return 0;
+ }
+
+ // Read the snapshot limit stored on the image header into `*limit`.
+ // An -EOPNOTSUPP reply is mapped to success with `*limit` set to
+ // UINT64_MAX; any other result is passed through unchanged.
+ int snap_get_limit(ImageCtx *ictx, uint64_t *limit)
+ {
+   const int limit_ret = cls_client::snapshot_get_limit(&ictx->md_ctx,
+                                                        ictx->header_oid,
+                                                        limit);
+   if (limit_ret != -EOPNOTSUPP) {
+     return limit_ret;
+   }
+
+   *limit = UINT64_MAX;
+   return 0;
+ }
+
+ // Set the image's snapshot limit; forwards to
+ // ictx->operations->snap_set_limit() and returns its result.
+ int snap_set_limit(ImageCtx *ictx, uint64_t limit)
+ {
+ return ictx->operations->snap_set_limit(limit);
+ }
+
+ // Bundle of state for a copy: the destination image context (initially
+ // null), the source size (initially 0) and the caller's progress callback.
+ struct CopyProgressCtx {
+   explicit CopyProgressCtx(ProgressContext &p)
+     : destictx(nullptr),
+       src_size(0),
+       prog_ctx(p) {
+   }
+
+   ImageCtx *destictx;
+   uint64_t src_size;
+   ProgressContext &prog_ctx;
+ };
+
+ // Copy image `src` into a newly created image `destname` in `dest_md_ctx`.
+ // Options not set in `opts` (format, stripe unit/count, order, features)
+ // are filled in from the source image (format defaults to 2) before the
+ // destination is created; `opts` is modified as a result.
+ // Returns -EINVAL if the 'flatten' or 'clone_format' option is set, -ENOSYS
+ // for features outside RBD_FEATURES_ALL, or the first error from creating,
+ // opening, copying to or closing the destination.
+ int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size)
+ {
+ CephContext *cct = (CephContext *)dest_md_ctx.cct();
+ uint64_t option;
+ if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
+ lderr(cct) << "copy does not support 'flatten' image option" << dendl;
+ return -EINVAL;
+ }
+ if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
+ lderr(cct) << "copy does not support 'clone_format' image option"
+ << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 20) << "copy " << src->name
+ << (src->snap_name.length() ? "@" + src->snap_name : "")
+ << " -> " << destname << " opts = " << opts << dendl;
+
+ // sample features and size of the source under the snap_lock read lock
+ src->snap_lock.get_read();
+ uint64_t features = src->features;
+ uint64_t src_size = src->get_image_size(src->snap_id);
+ src->snap_lock.put_read();
+ // for each option: keep the caller's value if set, else store the default
+ uint64_t format = 2;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+ }
+ uint64_t stripe_unit = src->stripe_unit;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ }
+ uint64_t stripe_count = src->stripe_count;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ }
+ uint64_t order = src->order;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+ opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ }
+ if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+ }
+ if (features & ~RBD_FEATURES_ALL) {
+ lderr(cct) << "librbd does not support requested features" << dendl;
+ return -ENOSYS;
+ }
+
+ int r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false);
+ if (r < 0) {
+ lderr(cct) << "header creation failed" << dendl;
+ return r;
+ }
+ // NOTE(review): the order captured above is written back after create() --
+ // presumably because create() may alter the option; confirm.
+ opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
+
+ ImageCtx *dest = new librbd::ImageCtx(destname, "", nullptr, dest_md_ctx,
+ false);
+ r = dest->state->open(0);
+ if (r < 0) {
+ // NOTE(review): dest is not released here -- presumably open() disposes
+ // of the context on failure; confirm.
+ lderr(cct) << "failed to read newly created header" << dendl;
+ return r;
+ }
+
+ r = copy(src, dest, prog_ctx, sparse_size);
+
+ // a close failure is only reported when the copy itself succeeded
+ int close_r = dest->state->close();
+ if (r == 0 && close_r < 0) {
+ r = close_r;
+ }
+ return r;
+ }
+
+ // Completion for one destination write issued during a copy: frees the
+ // heap-allocated bufferlist it was given, then completes the wrapped
+ // context with the same result code.
+ class C_CopyWrite : public Context {
+ public:
+   C_CopyWrite(bufferlist *bl, Context* ctx)
+     : m_data(bl),
+       m_on_finish(ctx) {
+   }
+
+   void finish(int r) override {
+     delete m_data;
+     m_on_finish->complete(r);
+   }
+
+ private:
+   bufferlist *m_data;
+   Context *m_on_finish;
+ };
+
+ // Completion for one source read issued during a copy.
+ // The constructor registers an op with the throttle; finish() ends that op
+ // directly on a read error or when the data read is all zeroes, and
+ // otherwise queues one destination write per extent reported by
+ // util::calc_sparse_extent() and ends the op once all writes completed.
+ // The read bufferlist `bl` is owned (deleted) by this object.
+ class C_CopyRead : public Context {
+ public:
+ C_CopyRead(SimpleThrottle *throttle, ImageCtx *dest, uint64_t offset,
+ bufferlist *bl, size_t sparse_size)
+ : m_throttle(throttle), m_dest(dest), m_offset(offset), m_bl(bl),
+ m_sparse_size(sparse_size) {
+ m_throttle->start_op();
+ }
+ void finish(int r) override {
+ if (r < 0) {
+ lderr(m_dest->cct) << "error reading from source image at offset "
+ << m_offset << ": " << cpp_strerror(r) << dendl;
+ delete m_bl;
+ m_throttle->end_op(r);
+ return;
+ }
+ ceph_assert(m_bl->length() == (size_t)r);
+
+ // nothing to write when the whole extent is zero
+ if (m_bl->is_zero()) {
+ delete m_bl;
+ m_throttle->end_op(r);
+ return;
+ }
+
+ // a sparse size of 0 falls back to the destination object size
+ if (!m_sparse_size) {
+ m_sparse_size = (1 << m_dest->order);
+ }
+
+ // the throttle op ends when every sub-write of the gather has completed
+ auto *throttle = m_throttle;
+ auto *end_op_ctx = new FunctionContext([throttle](int r) {
+ throttle->end_op(r);
+ });
+ auto gather_ctx = new C_Gather(m_dest->cct, end_op_ctx);
+
+ // rebuild into a single buffer so front() covers the whole read
+ m_bl->rebuild(buffer::ptr_node::create(m_bl->length()));
+ size_t write_offset = 0;
+ size_t write_length = 0;
+ size_t offset = 0;
+ size_t length = m_bl->length();
+ const auto& m_ptr = m_bl->front();
+ while (offset < length) {
+ if (util::calc_sparse_extent(m_ptr,
+ m_sparse_size,
+ length,
+ &write_offset,
+ &write_length,
+ &offset)) {
+ bufferlist *write_bl = new bufferlist();
+ write_bl->push_back(
+ buffer::ptr_node::create(m_ptr, write_offset, write_length));
+ Context *ctx = new C_CopyWrite(write_bl, gather_ctx->new_sub());
+ auto comp = io::AioCompletion::create(ctx);
+
+ // coordinate through AIO WQ to ensure lock is acquired if needed
+ // NOTE(review): read_trace is moved on every iteration, so writes after
+ // the first receive a moved-from trace -- confirm this is intended.
+ m_dest->io_work_queue->aio_write(comp, m_offset + write_offset,
+ write_length,
+ std::move(*write_bl),
+ LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
+ std::move(read_trace));
+ write_offset = offset;
+ write_length = 0;
+ }
+ }
+ delete m_bl;
+ // non-zero data must have produced at least one write
+ ceph_assert(gather_ctx->get_sub_created_count() > 0);
+ gather_ctx->activate();
+ }
+
+ // trace handed to the destination writes; set by the caller after
+ // constructing the read request
+ ZTracer::Trace read_trace;
+
+ private:
+ SimpleThrottle *m_throttle;
+ ImageCtx *m_dest;
+ uint64_t m_offset;
+ bufferlist *m_bl;
+ size_t m_sparse_size;
+ };
+
+ // Copy the metadata key/value pairs and the data of `src` into the
+ // already-open image `dest`.
+ // Data is read one stripe period at a time; each read is completed by a
+ // C_CopyRead that writes the non-sparse parts to `dest`. In-flight reads
+ // are bounded by a SimpleThrottle sized from "rbd_concurrent_management_ops".
+ // Returns -EINVAL if `dest` is smaller than `src`, a metadata error, or
+ // the throttle's aggregated result.
+ int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size)
+ {
+ src->snap_lock.get_read();
+ uint64_t src_size = src->get_image_size(src->snap_id);
+ src->snap_lock.put_read();
+
+ dest->snap_lock.get_read();
+ uint64_t dest_size = dest->get_image_size(dest->snap_id);
+ dest->snap_lock.put_read();
+
+ CephContext *cct = src->cct;
+ if (dest_size < src_size) {
+ lderr(cct) << " src size " << src_size << " > dest size "
+ << dest_size << dendl;
+ return -EINVAL;
+ }
+ int r;
+ const uint32_t MAX_KEYS = 64;
+ map<string, bufferlist> pairs;
+ std::string last_key = "";
+ bool more_results = true;
+
+ // page through the source metadata, resuming after the last key seen
+ while (more_results) {
+ // NOTE(review): the page size passed is 0 while the loop assumes pages of
+ // MAX_KEYS entries -- confirm the callee maps 0 to that page size.
+ r = cls_client::metadata_list(&src->md_ctx, src->header_oid, last_key, 0, &pairs);
+ // -EOPNOTSUPP and -EIO are tolerated: the loop ends with no pairs copied
+ if (r < 0 && r != -EOPNOTSUPP && r != -EIO) {
+ lderr(cct) << "couldn't list metadata: " << cpp_strerror(r) << dendl;
+ return r;
+ } else if (r == 0 && !pairs.empty()) {
+ r = cls_client::metadata_set(&dest->md_ctx, dest->header_oid, pairs);
+ if (r < 0) {
+ lderr(cct) << "couldn't set metadata: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ last_key = pairs.rbegin()->first;
+ }
+
+ // a full page means there may be more keys to fetch
+ more_results = (pairs.size() == MAX_KEYS);
+ pairs.clear();
+ }
+
+ ZTracer::Trace trace;
+ if (src->blkin_trace_all) {
+ trace.init("copy", &src->trace_endpoint);
+ }
+
+ RWLock::RLocker owner_lock(src->owner_lock);
+ SimpleThrottle throttle(src->config.get_val<uint64_t>("rbd_concurrent_management_ops"), false);
+ uint64_t period = src->get_stripe_period();
+ unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ uint64_t object_id = 0;
+ for (uint64_t offset = 0; offset < src_size; offset += period) {
+ // stop issuing reads as soon as any earlier op has failed
+ if (throttle.pending_error()) {
+ return throttle.wait_for_ret();
+ }
+
+ {
+ RWLock::RLocker snap_locker(src->snap_lock);
+ if (src->object_map != nullptr) {
+ bool skip = true;
+ // each period is related to src->stripe_count objects, check them all
+ for (uint64_t i=0; i < src->stripe_count; i++) {
+ if (object_id < src->object_map->size() &&
+ src->object_map->object_may_exist(object_id)) {
+ skip = false;
+ }
+ ++object_id;
+ }
+
+ // none of the period's objects may exist: nothing to copy here
+ if (skip) continue;
+ } else {
+ object_id += src->stripe_count;
+ }
+ }
+
+ uint64_t len = min(period, src_size - offset);
+ // bl is handed to C_CopyRead, which deletes it when the read completes
+ bufferlist *bl = new bufferlist();
+ auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size);
+ auto comp = io::AioCompletion::create_and_start<Context>(
+ ctx, src, io::AIO_TYPE_READ);
+
+ // NOTE(review): trace is moved on every iteration, so reads after the
+ // first are built from a moved-from trace -- confirm this is intended.
+ io::ImageReadRequest<> req(*src, comp, {{offset, len}},
+ io::ReadResult{bl}, fadvise_flags,
+ std::move(trace));
+ ctx->read_trace = req.get_trace();
+
+ req.send();
+ prog_ctx.update_progress(offset, src_size);
+ }
+
+ r = throttle.wait_for_ret();
+ if (r >= 0)
+ prog_ctx.update_progress(src_size, src_size);
+ return r;
+ }
+
+ // Report the image's cached cooperative-lock state.
+ // Each output pointer is optional: `exclusive` receives
+ // ictx->exclusive_locked, `tag` receives ictx->lock_tag and `lockers` is
+ // cleared and refilled with one locker_t per entry of ictx->lockers.
+ // Returns 0, or the negative error from refreshing the image.
+ int list_lockers(ImageCtx *ictx,
+                  std::list<locker_t> *lockers,
+                  bool *exclusive,
+                  string *tag)
+ {
+   ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl;
+
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret < 0) {
+     return refresh_ret;
+   }
+
+   RWLock::RLocker md_locker(ictx->md_lock);
+   if (exclusive != nullptr) {
+     *exclusive = ictx->exclusive_locked;
+   }
+   if (tag != nullptr) {
+     *tag = ictx->lock_tag;
+   }
+   if (lockers == nullptr) {
+     return 0;
+   }
+
+   lockers->clear();
+   for (const auto &entry : ictx->lockers) {
+     locker_t info;
+     info.client = stringify(entry.first.locker);
+     info.cookie = entry.first.cookie;
+     info.address = entry.second.addr.get_legacy_str();
+     lockers->push_back(info);
+   }
+
+   return 0;
+ }
+
+ // Take a cooperative lock (exclusive or shared) named RBD_LOCK_NAME on the
+ // image header with the given cookie and tag, then send an update
+ // notification. Returns 0, or the negative error from the refresh or from
+ // the lock call.
+ int lock(ImageCtx *ictx, bool exclusive, const string& cookie,
+          const string& tag)
+ {
+   ldout(ictx->cct, 20) << "lock image " << ictx << " exclusive=" << exclusive
+                        << " cookie='" << cookie << "' tag='" << tag << "'"
+                        << dendl;
+
+   int ret = ictx->state->refresh_if_required();
+   if (ret < 0) {
+     return ret;
+   }
+
+   /**
+    * If we wanted we could do something more intelligent, like local
+    * checks that we think we will succeed. But for now, let's not
+    * duplicate that code.
+    */
+   const ClsLockType lock_type = exclusive ? LOCK_EXCLUSIVE : LOCK_SHARED;
+   {
+     RWLock::RLocker md_locker(ictx->md_lock);
+     ret = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME,
+                                  lock_type, cookie, tag, "", utime_t(), 0);
+   }
+   if (ret < 0) {
+     return ret;
+   }
+
+   ictx->notify_update();
+   return 0;
+ }
+
+ // Drop the cooperative lock identified by `cookie` from the image header
+ // and send an update notification. Returns 0, or the negative error from
+ // the refresh or from the unlock call.
+ int unlock(ImageCtx *ictx, const string& cookie)
+ {
+   ldout(ictx->cct, 20) << "unlock image " << ictx
+                        << " cookie='" << cookie << "'" << dendl;
+
+   int ret = ictx->state->refresh_if_required();
+   if (ret < 0) {
+     return ret;
+   }
+
+   {
+     RWLock::RLocker md_locker(ictx->md_lock);
+     ret = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid,
+                                    RBD_LOCK_NAME, cookie);
+   }
+   if (ret < 0) {
+     return ret;
+   }
+
+   ictx->notify_update();
+   return 0;
+ }
+
+ // Break the cooperative lock held by `client` with `cookie`.
+ // When "rbd_blacklist_on_break_lock" is set, the client's address is looked
+ // up among the current lockers and blacklisted first (-ENOENT if the
+ // client holds no lock). On success an update notification is sent.
+ // Returns -EINVAL if `client` cannot be parsed, or the first negative
+ // error from the refresh, lock-info, blacklist or break_lock calls.
+ int break_lock(ImageCtx *ictx, const string& client,
+ const string& cookie)
+ {
+ ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client
+ << "' cookie='" << cookie << "'" << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ entity_name_t lock_client;
+ if (!lock_client.parse(client)) {
+ lderr(ictx->cct) << "Unable to parse client '" << client
+ << "'" << dendl;
+ return -EINVAL;
+ }
+
+ if (ictx->config.get_val<bool>("rbd_blacklist_on_break_lock")) {
+ typedef std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> Lockers;
+ Lockers lockers;
+ ClsLockType lock_type;
+ std::string lock_tag;
+ r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid,
+ RBD_LOCK_NAME, &lockers, &lock_type,
+ &lock_tag);
+ if (r < 0) {
+ lderr(ictx->cct) << "unable to retrieve lock info: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // find the address of the first locker entry that belongs to the client
+ std::string client_address;
+ for (Lockers::iterator it = lockers.begin();
+ it != lockers.end(); ++it) {
+ if (it->first.locker == lock_client) {
+ client_address = it->second.addr.get_legacy_str();
+ break;
+ }
+ }
+ if (client_address.empty()) {
+ return -ENOENT;
+ }
+
+ RWLock::RLocker locker(ictx->md_lock);
+ librados::Rados rados(ictx->md_ctx);
+ r = rados.blacklist_add(
+ client_address,
+ ictx->config.get_val<uint64_t>("rbd_blacklist_expire_seconds"));
+ if (r < 0) {
+ lderr(ictx->cct) << "unable to blacklist client: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ r = rados::cls::lock::break_lock(&ictx->md_ctx, ictx->header_oid,
+ RBD_LOCK_NAME, cookie, lock_client);
+ if (r < 0)
+ return r;
+ ictx->notify_update();
+ return 0;
+ }
+
+ // AIO completion callback adapter: `cb` is the io::AioCompletion that
+ // finished and `arg` is the Context to notify. The context is completed
+ // with the completion's return value, then the completion is released.
+ void rbd_ctx_cb(completion_t cb, void *arg)
+ {
+   auto *aio_comp = reinterpret_cast<io::AioCompletion *>(cb);
+   auto *on_finish = reinterpret_cast<Context *>(arg);
+
+   on_finish->complete(aio_comp->get_return_value());
+   aio_comp->release();
+ }
+
+ // Read [off, off+len) synchronously in pieces that do not cross a stripe
+ // period boundary, passing each piece to `cb(offset_in_request, length,
+ // data, arg)`. The range is first clipped to the image size via clip_io().
+ // Returns the total number of bytes read, or the first negative error from
+ // the refresh, clip_io(), a read, or the callback. Perf counters are only
+ // updated when the whole range was read.
+ int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg)
+ {
+ coarse_mono_time start_time;
+ ceph::timespan elapsed;
+
+ ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off
+ << " len = " << len << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ uint64_t mylen = len;
+ ictx->snap_lock.get_read();
+ r = clip_io(ictx, off, &mylen);
+ ictx->snap_lock.put_read();
+ if (r < 0)
+ return r;
+
+ int64_t total_read = 0;
+ uint64_t period = ictx->get_stripe_period();
+ uint64_t left = mylen;
+
+ ZTracer::Trace trace;
+ if (ictx->blkin_trace_all) {
+ trace.init("read_iterate", &ictx->trace_endpoint);
+ }
+
+ RWLock::RLocker owner_locker(ictx->owner_lock);
+ start_time = coarse_mono_clock::now();
+ while (left > 0) {
+ // read up to the end of the stripe period containing `off`
+ uint64_t period_off = off - (off % period);
+ uint64_t read_len = min(period_off + period - off, left);
+
+ bufferlist bl;
+
+ C_SaferCond ctx;
+ auto c = io::AioCompletion::create_and_start(&ctx, ictx,
+ io::AIO_TYPE_READ);
+ // NOTE(review): trace is moved on every iteration, so reads after the
+ // first receive a moved-from trace -- confirm this is intended.
+ io::ImageRequest<>::aio_read(ictx, c, {{off, read_len}},
+ io::ReadResult{&bl}, 0, std::move(trace));
+
+ int ret = ctx.wait();
+ if (ret < 0) {
+ return ret;
+ }
+
+ r = cb(total_read, ret, bl.c_str(), arg);
+ if (r < 0) {
+ return r;
+ }
+
+ // NOTE(review): the loop advances by `ret`; a read returning 0 would not
+ // make progress -- confirm reads always return read_len here.
+ total_read += ret;
+ left -= ret;
+ off += ret;
+ }
+
+ elapsed = coarse_mono_clock::now() - start_time;
+ ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed);
+ ictx->perfcounter->inc(l_librbd_rd);
+ ictx->perfcounter->inc(l_librbd_rd_bytes, mylen);
+ return total_read;
+ }
+
+ // validate extent against image size; clip to image size if necessary
+ //
+ // The caller must hold ictx->snap_lock (asserted).
+ // Returns -ENOENT when ictx->snap_exists is false, -EINVAL when a non-empty
+ // request starts at or past the end of the image, and 0 otherwise. On
+ // success `*len` is reduced, if needed, so that [off, off + *len) ends at
+ // the image size.
+ int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len)
+ {
+   ceph_assert(ictx->snap_lock.is_locked());
+   uint64_t image_size = ictx->get_image_size(ictx->snap_id);
+   bool snap_exists = ictx->snap_exists;
+
+   if (!snap_exists)
+     return -ENOENT;
+
+   // special-case "len == 0" requests: always valid
+   if (*len == 0)
+     return 0;
+
+   // can't start past end
+   if (off >= image_size)
+     return -EINVAL;
+
+   // clip requests that extend past end to just end.
+   // off < image_size here, so the subtraction cannot underflow; comparing
+   // against the remaining bytes (instead of computing off + *len) keeps a
+   // huge *len from wrapping around uint64_t and escaping the clip.
+   const uint64_t remaining = image_size - off;
+   if (*len > remaining)
+     *len = remaining;
+
+   return 0;
+ }
+
+ // Ask the object dispatcher to invalidate its cache and wait for the
+ // request to complete. Returns the refresh error if the refresh fails,
+ // otherwise the result of the invalidation; the invalidate_cache perf
+ // counter is bumped whenever the request was issued.
+ int invalidate_cache(ImageCtx *ictx)
+ {
+   CephContext *cct = ictx->cct;
+   ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
+
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret < 0) {
+     return refresh_ret;
+   }
+
+   C_SaferCond invalidate_ctx;
+   {
+     // owner_lock is held only while the request is dispatched
+     RWLock::RLocker owner_locker(ictx->owner_lock);
+     ictx->io_object_dispatcher->invalidate_cache(&invalidate_ctx);
+   }
+
+   const int invalidate_ret = invalidate_ctx.wait();
+   ictx->perfcounter->inc(l_librbd_invalidate_cache);
+   return invalidate_ret;
+ }
+
+ // Move up to `numcomp` entries from the front of ictx->completed_reqs
+ // into `comps`. Returns the number of completions stored, or -EINVAL when
+ // `numcomp` is not positive.
+ int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp)
+ {
+   if (numcomp <= 0) {
+     return -EINVAL;
+   }
+
+   CephContext *cct = ictx->cct;
+   ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp
+                  << dendl;
+
+   Mutex::Locker completed_locker(ictx->completed_reqs_lock);
+   const int available = (int)ictx->completed_reqs.size();
+   const int count = std::min(numcomp, available);
+   for (int idx = 0; idx < count; ++idx) {
+     comps[idx] = ictx->completed_reqs.front();
+     ictx->completed_reqs.pop_front();
+   }
+   return count;
+ }
+
+ // Fetch the image metadata value stored under `key` from the header
+ // object. Returns the refresh error if the refresh fails, otherwise the
+ // result of cls_client::metadata_get().
+ int metadata_get(ImageCtx *ictx, const string &key, string *value)
+ {
+   CephContext *cct = ictx->cct;
+   ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl;
+
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret >= 0) {
+     return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value);
+   }
+   return refresh_ret;
+ }
+
+ // List image metadata pairs from the header object, passing `start` and
+ // `max` through to cls_client::metadata_list(). Returns the refresh error
+ // if the refresh fails, otherwise the result of that call.
+ int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map<string, bufferlist> *pairs)
+ {
+   CephContext *cct = ictx->cct;
+   ldout(cct, 20) << "metadata_list " << ictx << dendl;
+
+   const int refresh_ret = ictx->state->refresh_if_required();
+   if (refresh_ret >= 0) {
+     return cls_client::metadata_list(&ictx->md_ctx, ictx->header_oid, start, max, pairs);
+   }
+   return refresh_ret;
+ }
+
+ // Completion for a single readahead object read. It owns the buffers the
+ // read fills (read_data / extent_map) and keeps the image's pending
+ // readahead count balanced: incremented on construction, decremented when
+ // the read finishes. The read's result code is not inspected.
+ struct C_RBD_Readahead : public Context {
+   ImageCtx *ictx;
+   object_t oid;
+   uint64_t offset;
+   uint64_t length;
+
+   bufferlist read_data;
+   io::ExtentMap extent_map;
+
+   C_RBD_Readahead(ImageCtx *image_ctx, object_t object, uint64_t object_off,
+                   uint64_t object_len)
+     : ictx(image_ctx),
+       oid(object),
+       offset(object_off),
+       length(object_len) {
+     ictx->readahead.inc_pending();
+   }
+
+   void finish(int r) override {
+     ldout(ictx->cct, 20) << "C_RBD_Readahead on " << oid << ": "
+                          << offset << "~" << length << dendl;
+     ictx->readahead.dec_pending();
+   }
+ };
+
+ // Issue readahead for the given image extents.
+ // Adds the extents' total length to ictx->total_bytes_read, asks
+ // ictx->readahead for the extent to prefetch and, if it is non-empty, maps
+ // it to object extents and sends one object read per extent (completed by
+ // C_RBD_Readahead). Does nothing once total_bytes_read has exceeded a
+ // non-zero readahead_disable_after_bytes.
+ void readahead(ImageCtx *ictx,
+ const vector<pair<uint64_t,uint64_t> >& image_extents)
+ {
+ uint64_t total_bytes = 0;
+ for (vector<pair<uint64_t,uint64_t> >::const_iterator p = image_extents.begin();
+ p != image_extents.end();
+ ++p) {
+ total_bytes += p->second;
+ }
+
+ ictx->md_lock.get_write();
+ bool abort = ictx->readahead_disable_after_bytes != 0 &&
+ ictx->total_bytes_read > ictx->readahead_disable_after_bytes;
+ if (abort) {
+ ictx->md_lock.put_write();
+ return;
+ }
+ ictx->total_bytes_read += total_bytes;
+ // snapshot the image size and snap id while both locks are held
+ ictx->snap_lock.get_read();
+ uint64_t image_size = ictx->get_image_size(ictx->snap_id);
+ auto snap_id = ictx->snap_id;
+ ictx->snap_lock.put_read();
+ ictx->md_lock.put_write();
+
+ pair<uint64_t, uint64_t> readahead_extent = ictx->readahead.update(image_extents, image_size);
+ uint64_t readahead_offset = readahead_extent.first;
+ uint64_t readahead_length = readahead_extent.second;
+
+ if (readahead_length > 0) {
+ ldout(ictx->cct, 20) << "(readahead logical) " << readahead_offset << "~" << readahead_length << dendl;
+ map<object_t,vector<ObjectExtent> > readahead_object_extents;
+ Striper::file_to_extents(ictx->cct, ictx->format_string, &ictx->layout,
+ readahead_offset, readahead_length, 0, readahead_object_extents);
+ for (map<object_t,vector<ObjectExtent> >::iterator p = readahead_object_extents.begin(); p != readahead_object_extents.end(); ++p) {
+ for (vector<ObjectExtent>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
+ ldout(ictx->cct, 20) << "(readahead) oid " << q->oid << " " << q->offset << "~" << q->length << dendl;
+
+ // req_comp owns the buffers the read fills
+ auto req_comp = new C_RBD_Readahead(ictx, q->oid, q->offset,
+ q->length);
+ auto req = io::ObjectDispatchSpec::create_read(
+ ictx, io::OBJECT_DISPATCH_LAYER_NONE, q->oid.name, q->objectno,
+ q->offset, q->length, snap_id, 0, {}, &req_comp->read_data,
+ &req_comp->extent_map, req_comp);
+ req->send();
+ }
+ }
+ ictx->perfcounter->inc(l_librbd_readahead);
+ ictx->perfcounter->inc(l_librbd_readahead_bytes, readahead_length);
+ }
+ }
+
+ // Append one image_watcher_t (addr, id, cookie) to `watchers` for every
+ // watcher registered on the image's header object. The header object name
+ // is derived from the image name (old format) or image id (new format).
+ // Returns 0, or the negative error from listing the watchers.
+ int list_watchers(ImageCtx *ictx,
+                   std::list<librbd::image_watcher_t> &watchers)
+ {
+   const std::string header_oid = ictx->old_format
+     ? util::old_header_name(ictx->name)
+     : util::header_name(ictx->id);
+
+   std::list<obj_watch_t> obj_watchers;
+   const int list_ret = ictx->md_ctx.list_watchers(header_oid, &obj_watchers);
+   if (list_ret < 0) {
+     return list_ret;
+   }
+
+   for (const auto &obj_watcher : obj_watchers) {
+     librbd::image_watcher_t entry;
+     entry.addr = obj_watcher.addr;
+     entry.id = obj_watcher.watcher_id;
+     entry.cookie = obj_watcher.cookie;
+
+     watchers.push_back(entry);
+   }
+
+   return 0;
+ }
+
+}
+
+// Print the options that are set in `opts` as "[name=value, name=value]".
+// Options are visited in the order of IMAGE_OPTIONS_TYPE_MAPPING; options
+// that are unset, or whose type is neither STR nor UINT64, are skipped.
+std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts) {
+  os << "[";
+
+  const char *separator = "";
+  for (auto &mapping : librbd::IMAGE_OPTIONS_TYPE_MAPPING) {
+    if (mapping.second == librbd::STR) {
+      std::string str_val;
+      if (opts.get(mapping.first, &str_val) != 0) {
+        continue;
+      }
+      os << separator << librbd::image_option_name(mapping.first) << "=" << str_val;
+      separator = ", ";
+    } else if (mapping.second == librbd::UINT64) {
+      uint64_t int_val;
+      if (opts.get(mapping.first, &int_val) != 0) {
+        continue;
+      }
+      os << separator << librbd::image_option_name(mapping.first) << "=" << int_val;
+      separator = ", ";
+    }
+  }
+
+  os << "]";
+
+  return os;
+}
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
new file mode 100644
index 00000000..1e1864f3
--- /dev/null
+++ b/src/librbd/internal.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_INTERNAL_H
+#define CEPH_LIBRBD_INTERNAL_H
+
+#include "include/int_types.h"
+
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "include/buffer_fwd.h"
+#include "include/rbd/librbd.hpp"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/WorkQueue.h"
+#include "common/ceph_time.h"
+#include "librbd/Types.h"
+
+// Declarations of librbd's internal, mostly synchronous image helpers.
+// Functions returning int use 0 / positive values for success and negative
+// errno values for failure, as their definitions do.
+namespace librbd {
+
+  struct ImageCtx;
+  namespace io { struct AioCompletion; }
+
+  // ProgressContext that ignores every progress update.
+  class NoOpProgressContext : public ProgressContext
+  {
+  public:
+    NoOpProgressContext()
+    {
+    }
+    int update_progress(uint64_t offset, uint64_t src_size) override
+    {
+      return 0;
+    }
+  };
+
+  int detect_format(librados::IoCtx &io_ctx, const std::string &name,
+                    bool *old_format, uint64_t *size);
+
+  bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap);
+
+  /* image options */
+  std::string image_option_name(int optname);
+  void image_options_create(rbd_image_options_t* opts);
+  void image_options_create_ref(rbd_image_options_t* opts,
+                                rbd_image_options_t orig);
+  void image_options_copy(rbd_image_options_t *opts,
+                          const ImageOptions &orig);
+  void image_options_destroy(rbd_image_options_t opts);
+  int image_options_set(rbd_image_options_t opts, int optname,
+                        const std::string& optval);
+  int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval);
+  int image_options_get(rbd_image_options_t opts, int optname,
+                        std::string* optval);
+  int image_options_get(rbd_image_options_t opts, int optname,
+                        uint64_t* optval);
+  int image_options_is_set(rbd_image_options_t opts, int optname,
+                           bool* is_set);
+  int image_options_unset(rbd_image_options_t opts, int optname);
+  void image_options_clear(rbd_image_options_t opts);
+  bool image_options_is_empty(rbd_image_options_t opts);
+
+  /* image creation, cloning and basic queries */
+  int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
+             int *order);
+  int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
+             bool old_format, uint64_t features, int *order,
+             uint64_t stripe_unit, uint64_t stripe_count);
+  int create(IoCtx& io_ctx, const std::string &image_name,
+             const std::string &image_id, uint64_t size, ImageOptions& opts,
+             const std::string &non_primary_global_image_id,
+             const std::string &primary_mirror_uuid,
+             bool skip_mirror_enable);
+  int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+            IoCtx& c_ioctx, const char *c_name,
+            uint64_t features, int *c_order,
+            uint64_t stripe_unit, int stripe_count);
+  int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name,
+            const char *p_snap_name, IoCtx& c_ioctx, const char *c_id,
+            const char *c_name, ImageOptions& c_opts,
+            const std::string &non_primary_global_image_id,
+            const std::string &primary_mirror_uuid);
+  int rename(librados::IoCtx& io_ctx, const char *srcname, const char *dstname);
+  // parameter named as in the definition
+  int info(ImageCtx *ictx, image_info_t& info, size_t infosize);
+  int get_old_format(ImageCtx *ictx, uint8_t *old);
+  int get_size(ImageCtx *ictx, uint64_t *size);
+  int get_features(ImageCtx *ictx, uint64_t *features);
+  int get_overlap(ImageCtx *ictx, uint64_t *overlap);
+  int get_flags(ImageCtx *ictx, uint64_t *flags);
+  int set_image_notification(ImageCtx *ictx, int fd, int type);
+
+  /* exclusive lock */
+  int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner);
+  int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode);
+  int lock_release(ImageCtx *ictx);
+  int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode,
+                      std::list<std::string> *lock_owners);
+  int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode,
+                 const std::string &lock_owner);
+
+  /* snapshots and copy */
+  int snap_list(ImageCtx *ictx, std::vector<snap_info_t>& snaps);
+  int snap_exists(ImageCtx *ictx, const cls::rbd::SnapshotNamespace& snap_namespace,
+                  const char *snap_name, bool *exists);
+  int snap_get_limit(ImageCtx *ictx, uint64_t *limit);
+  int snap_set_limit(ImageCtx *ictx, uint64_t limit);
+  int snap_get_timestamp(ImageCtx *ictx, uint64_t snap_id, struct timespec *timestamp);
+  int snap_remove(ImageCtx *ictx, const char *snap_name, uint32_t flags, ProgressContext& pctx);
+  int snap_is_protected(ImageCtx *ictx, const char *snap_name,
+                        bool *is_protected);
+  int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname,
+           ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size);
+  int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size);
+
+  /* cooperative locking */
+  int list_lockers(ImageCtx *ictx,
+                   std::list<locker_t> *locks,
+                   bool *exclusive,
+                   std::string *tag);
+  int lock(ImageCtx *ictx, bool exclusive, const std::string& cookie,
+           const std::string& tag);
+  int lock_shared(ImageCtx *ictx, const std::string& cookie,
+                  const std::string& tag);
+  int unlock(ImageCtx *ictx, const std::string& cookie);
+  int break_lock(ImageCtx *ictx, const std::string& client,
+                 const std::string& cookie);
+
+  void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx);
+
+  /* header and low-level helpers */
+  int read_header_bl(librados::IoCtx& io_ctx, const std::string& md_oid,
+                     ceph::bufferlist& header, uint64_t *ver);
+  int read_header(librados::IoCtx& io_ctx, const std::string& md_oid,
+                  struct rbd_obj_header_ondisk *header, uint64_t *ver);
+  int tmap_set(librados::IoCtx& io_ctx, const std::string& imgname);
+  int tmap_rm(librados::IoCtx& io_ctx, const std::string& imgname);
+  void image_info(const ImageCtx *ictx, image_info_t& info, size_t info_size);
+  uint64_t oid_to_object_no(const std::string& oid,
+                            const std::string& object_prefix);
+  int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len);
+  void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
+                       uint64_t size, int order, uint64_t bid);
+
+  /* I/O helpers and metadata */
+  int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
+                       int (*cb)(uint64_t, size_t, const char *, void *),
+                       void *arg);
+  // standard-library and bufferlist names are fully qualified so this
+  // header does not depend on a using-directive leaking from another header
+  void readahead(ImageCtx *ictx,
+                 const std::vector<std::pair<uint64_t,uint64_t> >& image_extents);
+
+  int invalidate_cache(ImageCtx *ictx);
+  int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp);
+  int metadata_list(ImageCtx *ictx, const std::string &start, uint64_t max,
+                    std::map<std::string, ceph::bufferlist> *pairs);
+  int metadata_get(ImageCtx *ictx, const std::string &key, std::string *value);
+
+  int list_watchers(ImageCtx *ictx, std::list<librbd::image_watcher_t> &watchers);
+}
+
+std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts);
+
+#endif
diff --git a/src/librbd/io/AioCompletion.cc b/src/librbd/io/AioCompletion.cc
new file mode 100644
index 00000000..73c58167
--- /dev/null
+++ b/src/librbd/io/AioCompletion.cc
@@ -0,0 +1,216 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/AioCompletion.h"
+#include <errno.h>
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/WorkQueue.h"
+
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/Types.h"
+
+#ifdef WITH_LTTNG
+#include "tracing/librbd.h"
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::AioCompletion: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+int AioCompletion::wait_for_complete() {
+  tracepoint(librbd, aio_wait_for_complete_enter, this);
+  {
+    Mutex::Locker locker(lock);  // wait under the lock until complete() signals
+    while (state != AIO_STATE_COMPLETE) { cond.Wait(lock); }
+  }
+  tracepoint(librbd, aio_wait_for_complete_exit, 0);
+  return 0;
+}
+
+void AioCompletion::finalize(ssize_t rval) {
+  ceph_assert(lock.is_locked());
+  ceph_assert(ictx != nullptr);
+  CephContext *cct = ictx->cct;
+  ldout(cct, 20) << "r=" << rval << dendl;
+  // only a successful read needs its result assembled
+  const bool successful_read = (aio_type == AIO_TYPE_READ) && (rval >= 0);
+  if (successful_read) {
+    read_result.assemble_result(cct);
+  }
+}
+
+void AioCompletion::complete() {
+ ceph_assert(lock.is_locked());
+ ceph_assert(ictx != nullptr);
+ CephContext *cct = ictx->cct;
+ // caller must hold lock (asserted above); it is dropped only around the callback
+ tracepoint(librbd, aio_complete_enter, this, rval);
+ if (ictx->perfcounter != nullptr) {
+ ceph::timespan elapsed = coarse_mono_clock::now() - start_time; // since init_time()
+ switch (aio_type) {
+ case AIO_TYPE_GENERIC:
+ case AIO_TYPE_OPEN:
+ case AIO_TYPE_CLOSE:
+ break; // no latency counter is updated for these types
+ case AIO_TYPE_READ:
+ ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); break;
+ case AIO_TYPE_WRITE:
+ ictx->perfcounter->tinc(l_librbd_wr_latency, elapsed); break;
+ case AIO_TYPE_DISCARD:
+ ictx->perfcounter->tinc(l_librbd_discard_latency, elapsed); break;
+ case AIO_TYPE_FLUSH:
+ ictx->perfcounter->tinc(l_librbd_flush_latency, elapsed); break;
+ case AIO_TYPE_WRITESAME:
+ ictx->perfcounter->tinc(l_librbd_ws_latency, elapsed); break;
+ case AIO_TYPE_COMPARE_AND_WRITE:
+ ictx->perfcounter->tinc(l_librbd_cmp_latency, elapsed); break;
+ default:
+ lderr(cct) << "completed invalid aio_type: " << aio_type << dendl;
+ break;
+ }
+ }
+
+ if ((aio_type == AIO_TYPE_CLOSE) ||
+ (aio_type == AIO_TYPE_OPEN && rval < 0)) {
+ // must destroy ImageCtx prior to invoking callback
+ delete ictx;
+ ictx = nullptr;
+ }
+ // run the user callback (if any) without holding the lock
+ state = AIO_STATE_CALLBACK;
+ if (complete_cb) {
+ lock.Unlock();
+ complete_cb(rbd_comp, complete_arg);
+ lock.Lock();
+ }
+ // with event notification enabled, queue on completed_reqs and notify the socket
+ if (ictx != nullptr && event_notify && ictx->event_socket.is_valid()) {
+ ictx->completed_reqs_lock.Lock();
+ ictx->completed_reqs.push_back(&m_xlist_item);
+ ictx->completed_reqs_lock.Unlock();
+ ictx->event_socket.notify();
+ }
+ // mark complete and wake any wait_for_complete() callers
+ state = AIO_STATE_COMPLETE;
+ cond.Signal();
+
+ // note: possible for image to be closed after op marked finished
+ if (async_op.started()) {
+ async_op.finish_op();
+ }
+ tracepoint(librbd, aio_complete_exit);
+}
+
+// Binds the completion to an image and op type and records the start time.
+void AioCompletion::init_time(ImageCtx *i, aio_type_t t) {
+  Mutex::Locker locker(lock);
+  if (ictx != nullptr) { return; }  // already initialised: keep first values
+  ictx = i;
+  aio_type = t;
+  start_time = coarse_mono_clock::now();
+}
+
+// Registers this request with the image's async-op tracking.
+void AioCompletion::start_op() {
+  Mutex::Locker locker(lock);
+  ceph_assert(ictx != nullptr);
+  // no need to track async open/close operations
+  const bool untracked =
+    (aio_type == AIO_TYPE_OPEN || aio_type == AIO_TYPE_CLOSE);
+  if (!untracked) {
+    ceph_assert(!async_op.started());
+    async_op.start_op(*ictx);
+  }
+}
+
+void AioCompletion::fail(int r)
+{
+ lock.Lock();
+ ceph_assert(ictx != nullptr);
+ CephContext *cct = ictx->cct;
+ // fail() is only valid while no sub-requests are registered (asserted below)
+ lderr(cct) << cpp_strerror(r) << dendl;
+ ceph_assert(pending_count == 0);
+ rval = r;
+ complete();
+ put_unlock(); // drops one reference and releases the lock taken above
+}
+
+void AioCompletion::set_request_count(uint32_t count) {
+  lock.Lock();
+  ceph_assert(ictx != nullptr);
+  CephContext *cct = ictx->cct;
+
+  ldout(cct, 20) << "pending=" << count << dendl;
+  ceph_assert(pending_count == 0);
+
+  // a zero count is replaced by one placeholder request that is queued on
+  // the work queue below, so the completion still fires asynchronously
+  const bool needs_placeholder = (count == 0);
+  pending_count = needs_placeholder ? 1 : count;
+  lock.Unlock();
+
+  if (needs_placeholder) {
+    // ensure completion fires in clean lock context
+    ictx->op_work_queue->queue(new C_AioRequest(this), 0);
+  }
+}
+
+void AioCompletion::complete_request(ssize_t r) {
+  lock.Lock();
+  ceph_assert(ictx != nullptr);
+  CephContext *cct = ictx->cct;
+  // first real error wins (-EEXIST is ignored); otherwise sum positive results
+  if (rval >= 0) {
+    if (r < 0 && r != -EEXIST) {
+      rval = r;
+    } else if (r > 0) {
+      rval += r;
+    }
+  }
+
+  ceph_assert(pending_count);
+  const int remaining = --pending_count;
+  ldout(cct, 20) << "cb=" << complete_cb << ", "
+                 << "pending=" << pending_count << dendl;
+  if (remaining == 0) {
+    finalize(rval);
+    complete();
+  }
+  put_unlock();
+}
+
+// Non-blocking check of whether the completion reached AIO_STATE_COMPLETE.
+bool AioCompletion::is_complete() {
+  tracepoint(librbd, aio_is_complete_enter, this);
+  // sample the state under the lock; the tracepoints run outside of it
+  lock.Lock();
+  const bool done = (state == AIO_STATE_COMPLETE);
+  lock.Unlock();
+  tracepoint(librbd, aio_is_complete_exit, done);
+  return done;
+}
+
+// Returns the current result value, read under the lock.
+ssize_t AioCompletion::get_return_value() {
+  tracepoint(librbd, aio_get_return_value_enter, this);
+  ssize_t r;
+  { Mutex::Locker locker(lock); r = rval; }
+  tracepoint(librbd, aio_get_return_value_exit, r);
+  return r;
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/AioCompletion.h b/src/librbd/io/AioCompletion.h
new file mode 100644
index 00000000..f3551a02
--- /dev/null
+++ b/src/librbd/io/AioCompletion.h
@@ -0,0 +1,203 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_AIO_COMPLETION_H
+#define CEPH_LIBRBD_IO_AIO_COMPLETION_H
+
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/ceph_time.h"
+#include "include/Context.h"
+#include "include/utime.h"
+#include "include/rbd/librbd.hpp"
+
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+
+class CephContext;
+
+namespace librbd {
+namespace io {
+
+
+/**
+ * AioCompletion is the overall completion for a single
+ * rbd I/O request. It may be composed of many AioObjectRequests,
+ * which each go to a single object.
+ *
+ * The retrying of individual requests is handled at a lower level,
+ * so all AioCompletion cares about is the count of outstanding
+ * requests. The number of expected individual requests should be
+ * set initially using set_request_count() prior to issuing the
+ * requests. This ensures that the completion will not be completed
+ * within the caller's thread of execution (instead via a librados
+ * context or via a thread pool context for cache read hits).
+ */
+struct AioCompletion {
+ typedef enum {
+ AIO_STATE_PENDING = 0,
+ AIO_STATE_CALLBACK,
+ AIO_STATE_COMPLETE,
+ } aio_state_t;
+
+ mutable Mutex lock;
+ Cond cond;
+ aio_state_t state;
+ ssize_t rval;
+ callback_t complete_cb;
+ void *complete_arg;
+ rbd_completion_t rbd_comp;
+ uint32_t pending_count; ///< number of requests
+ int ref;
+ bool released;
+ ImageCtx *ictx;
+ coarse_mono_time start_time;
+ aio_type_t aio_type;
+
+ ReadResult read_result;
+
+ AsyncOperation async_op;
+
+ xlist<AioCompletion*>::item m_xlist_item;
+ bool event_notify;
+
+ template <typename T, void (T::*MF)(int)>
+ static void callback_adapter(completion_t cb, void *arg) {
+ AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb);
+ T *t = reinterpret_cast<T *>(arg);
+ (t->*MF)(comp->get_return_value());
+ comp->release();
+ }
+
+ static AioCompletion *create(void *cb_arg, callback_t cb_complete,
+ rbd_completion_t rbd_comp) {
+ AioCompletion *comp = new AioCompletion();
+ comp->set_complete_cb(cb_arg, cb_complete);
+ comp->rbd_comp = (rbd_comp != nullptr ? rbd_comp : comp);
+ return comp;
+ }
+
+ template <typename T, void (T::*MF)(int) = &T::complete>
+ static AioCompletion *create(T *obj) {
+ AioCompletion *comp = new AioCompletion();
+ comp->set_complete_cb(obj, &callback_adapter<T, MF>);
+ comp->rbd_comp = comp;
+ return comp;
+ }
+
+ template <typename T, void (T::*MF)(int) = &T::complete>
+ static AioCompletion *create_and_start(T *obj, ImageCtx *image_ctx,
+ aio_type_t type) {
+ AioCompletion *comp = create<T, MF>(obj);
+ comp->init_time(image_ctx, type);
+ comp->start_op();
+ return comp;
+ }
+
+  // Starts pending with a single reference and no image or callback attached.
+  AioCompletion() : lock("AioCompletion::lock", true, false),
+                    state(AIO_STATE_PENDING), rval(0), complete_cb(nullptr),
+                    complete_arg(nullptr), rbd_comp(nullptr),
+                    pending_count(0), ref(1), released(false), ictx(nullptr),
+                    aio_type(AIO_TYPE_NONE), m_xlist_item(this),
+                    event_notify(false) {}
+
+ ~AioCompletion() {
+ }
+
+ int wait_for_complete();
+
+ void finalize(ssize_t rval);
+
+ inline bool is_initialized(aio_type_t type) const {
+ Mutex::Locker locker(lock);
+ return ((ictx != nullptr) && (aio_type == type));
+ }
+ inline bool is_started() const {
+ Mutex::Locker locker(lock);
+ return async_op.started();
+ }
+
+ void init_time(ImageCtx *i, aio_type_t t);
+ void start_op();
+ void fail(int r);
+
+ void complete();
+
+ void set_complete_cb(void *cb_arg, callback_t cb) {
+ complete_cb = cb;
+ complete_arg = cb_arg;
+ }
+
+ void set_request_count(uint32_t num);
+ void add_request() {
+ lock.Lock();
+ ceph_assert(pending_count > 0);
+ lock.Unlock();
+ get();
+ }
+ void complete_request(ssize_t r);
+
+ bool is_complete();
+
+ ssize_t get_return_value();
+
+ void get() {
+ lock.Lock();
+ ceph_assert(ref > 0);
+ ref++;
+ lock.Unlock();
+ }
+ void release() {
+ lock.Lock();
+ ceph_assert(!released);
+ released = true;
+ put_unlock();
+ }
+ void put() {
+ lock.Lock();
+ put_unlock();
+ }
+ void put_unlock() {
+ ceph_assert(ref > 0);
+ int n = --ref;
+ lock.Unlock();
+ if (!n) {
+ if (ictx != nullptr && event_notify) {
+ ictx->completed_reqs_lock.Lock();
+ m_xlist_item.remove_myself();
+ ictx->completed_reqs_lock.Unlock();
+ }
+ delete this;
+ }
+ }
+
+ void set_event_notify(bool s) {
+ Mutex::Locker l(lock);
+ event_notify = s;
+ }
+
+ void *get_arg() {
+ return complete_arg;
+ }
+};
+
+// Context representing one sub-request of an AioCompletion: construction
+// registers the request, finish() reports its result to the completion.
+class C_AioRequest : public Context {
+public:
+  explicit C_AioRequest(AioCompletion *completion) : m_completion(completion) {
+    m_completion->add_request();  // takes a reference on the completion
+  }
+  ~C_AioRequest() override {}
+  void finish(int r) override { m_completion->complete_request(r); }
+protected:
+  AioCompletion *m_completion;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_AIO_COMPLETION_H
diff --git a/src/librbd/io/AsyncOperation.cc b/src/librbd/io/AsyncOperation.cc
new file mode 100644
index 00000000..c5a3bc93
--- /dev/null
+++ b/src/librbd/io/AsyncOperation.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/ImageCtx.h"
+#include "common/dout.h"
+#include "common/WorkQueue.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::AsyncOperation: "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+struct C_CompleteFlushes : public Context {
+ ImageCtx *image_ctx;
+ std::list<Context *> flush_contexts;
+ // takes over the given list of flush contexts (moved into the member)
+ explicit C_CompleteFlushes(ImageCtx *image_ctx,
+ std::list<Context *> &&flush_contexts)
+ : image_ctx(image_ctx), flush_contexts(std::move(flush_contexts)) {
+ }
+ void finish(int r) override {
+ RWLock::RLocker owner_locker(image_ctx->owner_lock);
+ while (!flush_contexts.empty()) {
+ Context *flush_ctx = flush_contexts.front();
+ flush_contexts.pop_front();
+ // every flush is completed with 0 (r is not forwarded), in list order
+ ldout(image_ctx->cct, 20) << "completed flush: " << flush_ctx << dendl;
+ flush_ctx->complete(0);
+ }
+ }
+};
+
+} // anonymous namespace
+
+void AsyncOperation::start_op(ImageCtx &image_ctx) {
+  ceph_assert(m_image_ctx == nullptr);  // an operation may only be started once
+  m_image_ctx = &image_ctx;
+  ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl;
+  // newest ops go to the front; the list is kept newest -> oldest
+  Mutex::Locker l(m_image_ctx->async_ops_lock);
+  m_image_ctx->async_ops.push_front(&m_xlist_item);
+}
+
+void AsyncOperation::finish_op() {
+  ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl;
+  {
+    Mutex::Locker l(m_image_ctx->async_ops_lock);
+    // remember the next-older op before unlinking ourselves from the list
+    xlist<AsyncOperation *>::iterator iter(&m_xlist_item);
+    ++iter;
+    ceph_assert(m_xlist_item.remove_myself());
+    // linked list stored newest -> oldest ops
+    if (!iter.end() && !m_flush_contexts.empty()) {
+      ldout(m_image_ctx->cct, 20) << "moving flush contexts to previous op: "
+                                  << *iter << dendl;
+      // hand the waiters over in O(1) instead of copying every list node
+      std::list<Context *> &older = (*iter)->m_flush_contexts;
+      older.splice(older.end(), m_flush_contexts);
+      return;
+    }
+  }
+
+  // no older op remains: complete the waiting flushes from the work queue
+  if (!m_flush_contexts.empty()) {
+    C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx,
+                                                   std::move(m_flush_contexts));
+    m_image_ctx->op_work_queue->queue(ctx);
+  }
+}
+
+void AsyncOperation::flush(Context* on_finish) {
+  {
+    Mutex::Locker locker(m_image_ctx->async_ops_lock);
+    xlist<AsyncOperation *>::iterator older(&m_xlist_item);
+    ++older;
+    // linked list stored newest -> oldest ops
+    if (!older.end()) {
+      // park the flush on the next-older op until that op finishes
+      (*older)->m_flush_contexts.push_back(on_finish);
+      return;
+    }
+  }
+  // nothing older is on the list: complete via the work queue
+  m_image_ctx->op_work_queue->queue(on_finish);
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/AsyncOperation.h b/src/librbd/io/AsyncOperation.h
new file mode 100644
index 00000000..b0a37c4b
--- /dev/null
+++ b/src/librbd/io/AsyncOperation.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_IO_ASYNC_OPERATION_H
+#define LIBRBD_IO_ASYNC_OPERATION_H
+
+#include "include/ceph_assert.h"
+#include "include/xlist.h"
+#include <list>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+// Tracks one in-flight operation on an image's async_ops list so that
+// flushes can be ordered behind operations that started earlier.
+class AsyncOperation {
+public:
+  AsyncOperation() : m_image_ctx(nullptr), m_xlist_item(this) {}
+
+  // an operation must be finished (off the list) before it is destroyed
+  ~AsyncOperation() {
+    ceph_assert(!m_xlist_item.is_on_list());
+  }
+
+  // true between start_op() and finish_op()
+  inline bool started() const {
+    return m_xlist_item.is_on_list();
+  }
+
+  // links this op at the front of image_ctx.async_ops
+  void start_op(ImageCtx &image_ctx);
+  // unlinks this op; waiting flushes move to the next-older op or are queued
+  void finish_op();
+
+  // completes on_finish once no older op remains on the list
+  void flush(Context *on_finish);
+
+private:
+  ImageCtx *m_image_ctx;                       // set by start_op()
+  xlist<AsyncOperation *>::item m_xlist_item;  // membership in async_ops
+  std::list<Context *> m_flush_contexts;       // flushes waiting on this op
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // LIBRBD_IO_ASYNC_OPERATION_H
diff --git a/src/librbd/io/CopyupRequest.cc b/src/librbd/io/CopyupRequest.cc
new file mode 100644
index 00000000..c2ebb10f
--- /dev/null
+++ b/src/librbd/io/CopyupRequest.cc
@@ -0,0 +1,625 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/CopyupRequest.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/WorkQueue.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/ObjectCopyRequest.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/io/ReadResult.h"
+
+#include <boost/bind.hpp>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::CopyupRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+template <typename I>
+class C_UpdateObjectMap : public C_AsyncObjectThrottle<I> {
+public:
+ C_UpdateObjectMap(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ uint64_t object_no, uint8_t head_object_map_state,
+ const std::vector<uint64_t> *snap_ids,
+ bool first_snap_is_clean, const ZTracer::Trace &trace,
+ size_t snap_id_idx)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no),
+ m_head_object_map_state(head_object_map_state), m_snap_ids(*snap_ids),
+ m_first_snap_is_clean(first_snap_is_clean), m_trace(trace),
+ m_snap_id_idx(snap_id_idx)
+ {
+ }
+
+ int send() override {
+ auto& image_ctx = this->m_image_ctx;
+ ceph_assert(image_ctx.owner_lock.is_locked());
+ if (image_ctx.exclusive_lock == nullptr) {
+ return 1;
+ }
+ ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+
+ RWLock::RLocker snap_locker(image_ctx.snap_lock);
+ if (image_ctx.object_map == nullptr) {
+ return 1;
+ }
+
+ uint64_t snap_id = m_snap_ids[m_snap_id_idx];
+ if (snap_id == CEPH_NOSNAP) {
+ return update_head();
+ } else {
+ return update_snapshot(snap_id);
+ }
+ }
+
+ int update_head() {
+ auto& image_ctx = this->m_image_ctx;
+ RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
+ bool sent = image_ctx.object_map->template aio_update<Context>(
+ CEPH_NOSNAP, m_object_no, m_head_object_map_state, {}, m_trace, false,
+ this);
+ return (sent ? 0 : 1);
+ }
+
+ int update_snapshot(uint64_t snap_id) {
+ auto& image_ctx = this->m_image_ctx;
+ uint8_t state = OBJECT_EXISTS;
+ if (image_ctx.test_features(RBD_FEATURE_FAST_DIFF, image_ctx.snap_lock) &&
+ (m_snap_id_idx > 0 || m_first_snap_is_clean)) {
+ // first snapshot should be exists+dirty since it contains
+ // the copyup data -- later snapshots inherit the data.
+ state = OBJECT_EXISTS_CLEAN;
+ }
+
+ RWLock::RLocker object_map_locker(image_ctx.object_map_lock);
+ bool sent = image_ctx.object_map->template aio_update<Context>(
+ snap_id, m_object_no, state, {}, m_trace, true, this);
+ ceph_assert(sent);
+ return 0;
+ }
+
+private:
+ uint64_t m_object_no;
+ uint8_t m_head_object_map_state;
+ const std::vector<uint64_t> &m_snap_ids;
+ bool m_first_snap_is_clean;
+ const ZTracer::Trace &m_trace;
+ size_t m_snap_id_idx;
+};
+
+} // anonymous namespace
+
+template <typename I>
+CopyupRequest<I>::CopyupRequest(I *ictx, const std::string &oid,
+                                uint64_t objectno, Extents &&image_extents,
+                                const ZTracer::Trace &parent_trace)
+  : m_image_ctx(ictx), m_oid(oid), m_object_no(objectno),
+    m_image_extents(std::move(image_extents)),  // named rvalue ref: move, don't copy
+    m_trace(util::create_trace(*m_image_ctx, "copy-up", parent_trace)),
+    m_lock("CopyupRequest", false, false) {
+  ceph_assert(m_image_ctx->data_ctx.is_valid());
+  // register as an in-flight op; the destructor calls finish_op()
+  m_async_op.start_op(*util::get_image_ctx(m_image_ctx));
+}
+
+template <typename I>
+CopyupRequest<I>::~CopyupRequest() {
+ ceph_assert(m_pending_requests.empty());
+ m_async_op.finish_op();
+}
+
+template <typename I>
+void CopyupRequest<I>::append_request(AbstractObjectWriteRequest<I> *req) {
+  Mutex::Locker locker(m_lock);
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "oid=" << m_oid << ", "
+                 << "object_request=" << req << ", "
+                 << "append=" << m_append_request_permitted << dendl;
+  // appends were disabled: park the write so it is restarted later
+  if (!m_append_request_permitted) {
+    m_restart_requests.push_back(req);
+    return;
+  }
+  m_pending_requests.push_back(req);
+}
+
+template <typename I>
+void CopyupRequest<I>::send() {
+ read_from_parent();
+}
+
+template <typename I>
+void CopyupRequest<I>::read_from_parent() {
+ auto cct = m_image_ctx->cct;
+ RWLock::RLocker snap_locker(m_image_ctx->snap_lock);
+ RWLock::RLocker parent_locker(m_image_ctx->parent_lock);
+ // without a parent there is nothing to read: report -ENOENT via the work queue
+ if (m_image_ctx->parent == nullptr) {
+ ldout(cct, 5) << "parent detached" << dendl;
+
+ m_image_ctx->op_work_queue->queue(
+ util::create_context_callback<
+ CopyupRequest<I>, &CopyupRequest<I>::handle_read_from_parent>(this),
+ -ENOENT);
+ return;
+ } else if (is_deep_copy()) {
+ deep_copy();
+ return;
+ }
+ // regular copyup: read m_image_extents from the parent into m_copyup_data
+ auto comp = AioCompletion::create_and_start<
+ CopyupRequest<I>,
+ &CopyupRequest<I>::handle_read_from_parent>(
+ this, util::get_image_ctx(m_image_ctx->parent), AIO_TYPE_READ);
+
+ ldout(cct, 20) << "oid=" << m_oid << ", "
+ << "completion=" << comp << ", "
+ << "extents=" << m_image_extents
+ << dendl;
+ ImageRequest<I>::aio_read(m_image_ctx->parent, comp,
+ std::move(m_image_extents),
+ ReadResult{&m_copyup_data}, 0, m_trace);
+}
+
+template <typename I>
+void CopyupRequest<I>::handle_read_from_parent(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "oid=" << m_oid << ", r=" << r << dendl;
+
+ m_image_ctx->snap_lock.get_read();
+ m_lock.Lock();
+ m_copyup_is_zero = m_copyup_data.is_zero();
+ m_copyup_required = is_copyup_required();
+ disable_append_requests();
+
+ if (r < 0 && r != -ENOENT) {
+ m_lock.Unlock();
+ m_image_ctx->snap_lock.put_read();
+
+ lderr(cct) << "error reading from parent: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (!m_copyup_required) {
+ m_lock.Unlock();
+ m_image_ctx->snap_lock.put_read();
+
+ ldout(cct, 20) << "no-op, skipping" << dendl;
+ finish(0);
+ return;
+ }
+
+ // copyup() will affect snapshots only if parent data is not all
+ // zeros.
+ if (!m_copyup_is_zero) {
+ m_snap_ids.insert(m_snap_ids.end(), m_image_ctx->snaps.rbegin(),
+ m_image_ctx->snaps.rend());
+ }
+
+ m_lock.Unlock();
+ m_image_ctx->snap_lock.put_read();
+
+ update_object_maps();
+}
+
+template <typename I>
+void CopyupRequest<I>::deep_copy() {
+ auto cct = m_image_ctx->cct;
+ ceph_assert(m_image_ctx->snap_lock.is_locked());
+ ceph_assert(m_image_ctx->parent_lock.is_locked());
+ ceph_assert(m_image_ctx->parent != nullptr);
+
+ m_lock.Lock();
+ m_flatten = is_copyup_required() ? true : m_image_ctx->migration_info.flatten;
+ m_lock.Unlock();
+
+ ldout(cct, 20) << "oid=" << m_oid << ", flatten=" << m_flatten << dendl;
+
+ auto ctx = util::create_context_callback<
+ CopyupRequest<I>, &CopyupRequest<I>::handle_deep_copy>(this);
+ auto req = deep_copy::ObjectCopyRequest<I>::create(
+ m_image_ctx->parent, m_image_ctx, 0, 0,
+ m_image_ctx->migration_info.snap_map, m_object_no, m_flatten, ctx);
+
+ req->send();
+}
+
+template <typename I>
+void CopyupRequest<I>::handle_deep_copy(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "oid=" << m_oid << ", r=" << r << dendl;
+
+ m_image_ctx->snap_lock.get_read();
+ m_lock.Lock();
+ m_copyup_required = is_copyup_required();
+ if (r == -ENOENT && !m_flatten && m_copyup_required) {
+ m_lock.Unlock();
+ m_image_ctx->snap_lock.put_read();
+
+ ldout(cct, 10) << "restart deep-copy with flatten" << dendl;
+ send();
+ return;
+ }
+
+ disable_append_requests();
+
+ if (r < 0 && r != -ENOENT) {
+ m_lock.Unlock();
+ m_image_ctx->snap_lock.put_read();
+
+ lderr(cct) << "error encountered during deep-copy: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (!m_copyup_required && !is_update_object_map_required(r)) {
+ m_lock.Unlock();
+ m_image_ctx->snap_lock.put_read();
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+
+ ldout(cct, 20) << "skipping" << dendl;
+ finish(r);
+ return;
+ }
+
+ // For deep-copy, copyup() will never affect snapshots. However,
+ // this state machine is responsible for updating object maps for
+ // snapshots that have been created on destination image after
+ // migration started.
+ if (r != -ENOENT) {
+ compute_deep_copy_snap_ids();
+ }
+
+ m_lock.Unlock();
+ m_image_ctx->snap_lock.put_read();
+
+ update_object_maps();
+}
+
+template <typename I>
+void CopyupRequest<I>::update_object_maps() {
+ RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+ RWLock::RLocker snap_locker(m_image_ctx->snap_lock);
+ if (m_image_ctx->object_map == nullptr) {
+ snap_locker.unlock();
+ owner_locker.unlock();
+
+ copyup();
+ return;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "oid=" << m_oid << dendl;
+
+ bool copy_on_read = m_pending_requests.empty();
+ uint8_t head_object_map_state = OBJECT_EXISTS;
+ if (copy_on_read && !m_snap_ids.empty() &&
+ m_image_ctx->test_features(RBD_FEATURE_FAST_DIFF,
+ m_image_ctx->snap_lock)) {
+ // HEAD is non-dirty since data is tied to first snapshot
+ head_object_map_state = OBJECT_EXISTS_CLEAN;
+ }
+
+ auto r_it = m_pending_requests.rbegin();
+ if (r_it != m_pending_requests.rend()) {
+ // last write-op determines the final object map state
+ head_object_map_state = (*r_it)->get_pre_write_object_map_state();
+ }
+
+ RWLock::WLocker object_map_locker(m_image_ctx->object_map_lock);
+ if ((*m_image_ctx->object_map)[m_object_no] != head_object_map_state) {
+ // (maybe) need to update the HEAD object map state
+ m_snap_ids.push_back(CEPH_NOSNAP);
+ }
+ object_map_locker.unlock();
+ snap_locker.unlock();
+
+ ceph_assert(m_image_ctx->exclusive_lock->is_lock_owner());
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_UpdateObjectMap<I>>(),
+ boost::lambda::_1, m_image_ctx, m_object_no, head_object_map_state,
+ &m_snap_ids, m_first_snap_is_clean, m_trace, boost::lambda::_2));
+ auto ctx = util::create_context_callback<
+ CopyupRequest<I>, &CopyupRequest<I>::handle_update_object_maps>(this);
+ auto throttle = new AsyncObjectThrottle<I>(
+ nullptr, *m_image_ctx, context_factory, ctx, nullptr, 0, m_snap_ids.size());
+ throttle->start_ops(
+ m_image_ctx->config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+void CopyupRequest<I>::handle_update_object_maps(int r) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "oid=" << m_oid << ", r=" << r << dendl;
+
+  // only proceed to the copyup once the object map update succeeded
+  const bool map_updated = (r >= 0);
+  if (map_updated) {
+    copyup();
+    return;
+  }
+
+  lderr(cct) << "failed to update object map: " << cpp_strerror(r) << dendl;
+  finish(r);
+}
+
+template <typename I>
+void CopyupRequest<I>::copyup() {
+ auto cct = m_image_ctx->cct;
+ m_image_ctx->snap_lock.get_read();
+ auto snapc = m_image_ctx->snapc;
+ m_image_ctx->snap_lock.put_read();
+
+ m_lock.Lock();
+ if (!m_copyup_required) {
+ m_lock.Unlock();
+
+ ldout(cct, 20) << "skipping copyup" << dendl;
+ finish(0);
+ return;
+ }
+
+ ldout(cct, 20) << "oid=" << m_oid << dendl;
+
+ bool copy_on_read = m_pending_requests.empty();
+ bool deep_copyup = !snapc.snaps.empty() && !m_copyup_is_zero;
+ if (m_copyup_is_zero) {
+ m_copyup_data.clear();
+ }
+
+ int r;
+ librados::ObjectWriteOperation copyup_op;
+ if (copy_on_read || deep_copyup) {
+ copyup_op.exec("rbd", "copyup", m_copyup_data);
+ ObjectRequest<I>::add_write_hint(*m_image_ctx, &copyup_op);
+ ++m_pending_copyups;
+ }
+
+ librados::ObjectWriteOperation write_op;
+ if (!copy_on_read) {
+ if (!deep_copyup) {
+ write_op.exec("rbd", "copyup", m_copyup_data);
+ ObjectRequest<I>::add_write_hint(*m_image_ctx, &write_op);
+ }
+
+ // merge all pending write ops into this single RADOS op
+ for (auto req : m_pending_requests) {
+ ldout(cct, 20) << "add_copyup_ops " << req << dendl;
+ req->add_copyup_ops(&write_op);
+ }
+
+ if (write_op.size() > 0) {
+ ++m_pending_copyups;
+ }
+ }
+ m_lock.Unlock();
+
+ // issue librados ops at the end to simplify test cases
+ std::vector<librados::snap_t> snaps;
+ if (copyup_op.size() > 0) {
+ // send only the copyup request with a blank snapshot context so that
+ // all snapshots are detected from the parent for this object. If
+ // this is a CoW request, a second request will be created for the
+ // actual modification.
+ ldout(cct, 20) << "copyup with empty snapshot context" << dendl;
+
+ auto comp = util::create_rados_callback<
+ CopyupRequest<I>, &CopyupRequest<I>::handle_copyup>(this);
+ r = m_image_ctx->data_ctx.aio_operate(
+ m_oid, comp, &copyup_op, 0, snaps,
+ (m_trace.valid() ? m_trace.get_info() : nullptr));
+ ceph_assert(r == 0);
+ comp->release();
+ }
+
+ if (write_op.size() > 0) {
+ // compare-and-write doesn't add any write ops (copyup+cmpext+write
+ // can't be executed in the same RADOS op because, unless the object
+ // was already present in the clone, cmpext wouldn't see it)
+ ldout(cct, 20) << (!deep_copyup && write_op.size() > 2 ?
+ "copyup + ops" : !deep_copyup ? "copyup" : "ops")
+ << " with current snapshot context" << dendl;
+
+ snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
+ auto comp = util::create_rados_callback<
+ CopyupRequest<I>, &CopyupRequest<I>::handle_copyup>(this);
+ r = m_image_ctx->data_ctx.aio_operate(
+ m_oid, comp, &write_op, snapc.seq, snaps,
+ (m_trace.valid() ? m_trace.get_info() : nullptr));
+ ceph_assert(r == 0);
+ comp->release();
+ }
+}
+
+template <typename I>
+void CopyupRequest<I>::handle_copyup(int r) {
+ auto cct = m_image_ctx->cct;
+ unsigned pending_copyups;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_pending_copyups > 0);
+ pending_copyups = --m_pending_copyups;
+ }
+
+ ldout(cct, 20) << "oid=" << m_oid << ", " << "r=" << r << ", "
+ << "pending=" << pending_copyups << dendl;
+ // -ENOENT is tolerated; any other error is handed to the waiting requests now
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to copyup object: " << cpp_strerror(r) << dendl;
+ complete_requests(false, r);
+ }
+ // last outstanding op: finish with 0 (errors were already delivered above)
+ if (pending_copyups == 0) {
+ finish(0);
+ }
+}
+
+template <typename I>
+void CopyupRequest<I>::finish(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "oid=" << m_oid << ", r=" << r << dendl;
+
+ complete_requests(true, r);
+ delete this;
+}
+
+template <typename I>
+void CopyupRequest<I>::complete_requests(bool override_restart_retval, int r) {
+ auto cct = m_image_ctx->cct;
+ remove_from_list();
+ // pending writes receive the result r as passed in
+ while (!m_pending_requests.empty()) {
+ auto it = m_pending_requests.begin();
+ auto req = *it;
+ ldout(cct, 20) << "completing request " << req << dendl;
+ req->handle_copyup(r);
+ m_pending_requests.erase(it);
+ }
+ // restart-queue writes get -ERESTART instead of r when the override is set
+ if (override_restart_retval) {
+ r = -ERESTART;
+ }
+
+ while (!m_restart_requests.empty()) {
+ auto it = m_restart_requests.begin();
+ auto req = *it;
+ ldout(cct, 20) << "restarting request " << req << dendl;
+ req->handle_copyup(r);
+ m_restart_requests.erase(it);
+ }
+}
+
+template <typename I>
+void CopyupRequest<I>::disable_append_requests() {
+ ceph_assert(m_lock.is_locked());
+ m_append_request_permitted = false;
+}
+
+template <typename I>
+void CopyupRequest<I>::remove_from_list() {
+  // drop this object's entry (if still present) from the image's copyup_list
+  Mutex::Locker copyup_list_locker(m_image_ctx->copyup_list_lock);
+  auto &copyups = m_image_ctx->copyup_list;
+  auto entry = copyups.find(m_object_no);
+  if (entry == copyups.end()) { return; }
+  copyups.erase(entry);
+}
+
+template <typename I>
+bool CopyupRequest<I>::is_copyup_required() {
+  ceph_assert(m_lock.is_locked());
+
+  // always force a copyup if CoR enabled (no pending writes => copy-on-read)
+  if (m_pending_requests.empty()) {
+    return true;
+  }
+
+  // parent data that is not all zeros always has to be copied up
+  if (!m_copyup_is_zero) {
+    return true;
+  }
+
+  // all-zero parent data: required only if some write op is not empty
+  return std::any_of(m_pending_requests.begin(), m_pending_requests.end(),
+                     [](auto *req) {
+                       return !req->is_empty_write_op();
+                     });
+
+}
+
+template <typename I>
+bool CopyupRequest<I>::is_deep_copy() const {
+ ceph_assert(m_image_ctx->snap_lock.is_locked());
+ return !m_image_ctx->migration_info.empty();
+}
+
+template <typename I>
+bool CopyupRequest<I>::is_update_object_map_required(int r) {
+ ceph_assert(m_image_ctx->snap_lock.is_locked());
+ // a negative result never triggers an object map update
+ if (r < 0) {
+ return false;
+ }
+ // no object map present: nothing to update
+ if (m_image_ctx->object_map == nullptr) {
+ return false;
+ }
+
+ if (m_image_ctx->migration_info.empty()) {
+ // migration might have completed while IO was in-flight,
+ // assume worst-case and perform an object map update
+ return true;
+ }
+ // otherwise decide from the first snap id that snap_map associates with HEAD
+ auto it = m_image_ctx->migration_info.snap_map.find(CEPH_NOSNAP);
+ ceph_assert(it != m_image_ctx->migration_info.snap_map.end());
+ return it->second[0] != CEPH_NOSNAP;
+}
+
+template <typename I>
+void CopyupRequest<I>::compute_deep_copy_snap_ids() {
+ ceph_assert(m_image_ctx->snap_lock.is_locked());
+
+ // don't copy ids for the snaps updated by object deep copy or
+ // that don't overlap
+ std::set<uint64_t> deep_copied;
+ for (auto &it : m_image_ctx->migration_info.snap_map) {
+ if (it.first != CEPH_NOSNAP) {
+ deep_copied.insert(it.second.front());
+ }
+ }
+
+ RWLock::RLocker parent_locker(m_image_ctx->parent_lock);
+ std::copy_if(m_image_ctx->snaps.rbegin(), m_image_ctx->snaps.rend(),
+ std::back_inserter(m_snap_ids),
+ [this, cct=m_image_ctx->cct, &deep_copied](uint64_t snap_id) {
+ if (deep_copied.count(snap_id)) {
+ m_first_snap_is_clean = true;
+ return false;
+ }
+
+ uint64_t parent_overlap = 0;
+ int r = m_image_ctx->get_parent_overlap(snap_id, &parent_overlap);
+ if (r < 0) {
+ ldout(cct, 5) << "failed getting parent overlap for snap_id: "
+ << snap_id << ": " << cpp_strerror(r) << dendl;
+ }
+ if (parent_overlap == 0) {
+ return false;
+ }
+ std::vector<std::pair<uint64_t, uint64_t>> extents;
+ Striper::extent_to_file(cct, &m_image_ctx->layout,
+ m_object_no, 0,
+ m_image_ctx->layout.object_size,
+ extents);
+ auto overlap = m_image_ctx->prune_parent_extents(
+ extents, parent_overlap);
+ return overlap > 0;
+ });
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::CopyupRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/CopyupRequest.h b/src/librbd/io/CopyupRequest.h
new file mode 100644
index 00000000..e4b3a2e7
--- /dev/null
+++ b/src/librbd/io/CopyupRequest.h
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_COPYUP_REQUEST_H
+#define CEPH_LIBRBD_IO_COPYUP_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+#include "common/Mutex.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/Types.h"
+
+#include <string>
+#include <vector>
+
+namespace ZTracer { struct Trace; }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+template <typename I> class AbstractObjectWriteRequest;
+
+// State machine that copies an object up from the parent image (or deep
+// copies it) on behalf of one or more pending object write requests.
+template <typename ImageCtxT = librbd::ImageCtx>
+class CopyupRequest {
+public:
+ // Heap-allocates a request; image_extents is moved into the new object.
+ static CopyupRequest* create(ImageCtxT *ictx, const std::string &oid,
+ uint64_t objectno, Extents &&image_extents,
+ const ZTracer::Trace &parent_trace) {
+ return new CopyupRequest(ictx, oid, objectno, std::move(image_extents),
+ parent_trace);
+ }
+
+ CopyupRequest(ImageCtxT *ictx, const std::string &oid, uint64_t objectno,
+ Extents &&image_extents, const ZTracer::Trace &parent_trace);
+ ~CopyupRequest();
+
+ // Attach a write request to this copyup.
+ // NOTE(review): presumably only honoured while m_append_request_permitted
+ // is true -- confirm against the implementation.
+ void append_request(AbstractObjectWriteRequest<ImageCtxT> *req);
+
+ // Start the state machine.
+ void send();
+
+private:
+ /**
+ * Copyup requests go through the following state machine to read from the
+ * parent image, update the object map, and copyup the object:
+ *
+ *
+ * @verbatim
+ *
+ *              <start>
+ *                 |
+ *      /---------/ \---------\
+ *      |                     |
+ *      v                     v
+ *  READ_FROM_PARENT      DEEP_COPY
+ *      |                     |
+ *      \---------\ /---------/
+ *                 |
+ *                 v (skip if not needed)
+ *         UPDATE_OBJECT_MAPS
+ *                 |
+ *                 v (skip if not needed)
+ *               COPYUP
+ *                 |
+ *                 v
+ *              <finish>
+ *
+ * @endverbatim
+ *
+ * The UPDATE_OBJECT_MAPS state is skipped if the object map isn't enabled
+ * or if an object map update isn't required. The COPYUP state is skipped if
+ * no data was read from the parent *and* there are no additional ops.
+ */
+
+ typedef std::vector<AbstractObjectWriteRequest<ImageCtxT> *> WriteRequests;
+
+ // identity of the object being copied up
+ ImageCtxT *m_image_ctx;
+ std::string m_oid;
+ uint64_t m_object_no;
+ Extents m_image_extents;
+ ZTracer::Trace m_trace;
+
+ bool m_flatten = false;
+ bool m_copyup_required = true;
+ bool m_copyup_is_zero = true;
+
+ ceph::bufferlist m_copyup_data;
+
+ AsyncOperation m_async_op;
+
+ // snapshot ids computed by compute_deep_copy_snap_ids()
+ std::vector<uint64_t> m_snap_ids;
+ bool m_first_snap_is_clean = false;
+
+ // NOTE(review): m_lock presumably guards the pending/restart request
+ // lists and counters below -- confirm in CopyupRequest.cc
+ Mutex m_lock;
+ WriteRequests m_pending_requests;
+ unsigned m_pending_copyups = 0;
+
+ WriteRequests m_restart_requests;
+ bool m_append_request_permitted = true;
+
+ // one send/handle pair per state in the diagram above
+ void read_from_parent();
+ void handle_read_from_parent(int r);
+
+ void deep_copy();
+ void handle_deep_copy(int r);
+
+ void update_object_maps();
+ void handle_update_object_maps(int r);
+
+ void copyup();
+ void handle_copyup(int r);
+
+ void finish(int r);
+ void complete_requests(bool override_restart_retval, int r);
+
+ void disable_append_requests();
+ void remove_from_list();
+
+ bool is_copyup_required();
+ bool is_update_object_map_required(int r);
+ bool is_deep_copy() const;
+
+ void compute_deep_copy_snap_ids();
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::CopyupRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_COPYUP_REQUEST_H
diff --git a/src/librbd/io/ImageDispatchSpec.cc b/src/librbd/io/ImageDispatchSpec.cc
new file mode 100644
index 00000000..2c405d74
--- /dev/null
+++ b/src/librbd/io/ImageDispatchSpec.cc
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include <boost/variant.hpp>
+
+namespace librbd {
+namespace io {
+
+// Visitor applied to the stored request variant by send(). Each overload
+// forwards to the matching static ImageRequest<I>::aio_* entry point,
+// moving the image extents (and any payload buffers) out of the spec, so a
+// spec's contents are consumed by the dispatch.
+template <typename I>
+struct ImageDispatchSpec<I>::SendVisitor
+ : public boost::static_visitor<void> {
+ ImageDispatchSpec* spec;
+
+ explicit SendVisitor(ImageDispatchSpec* spec)
+ : spec(spec) {
+ }
+
+ void operator()(Read& read) const {
+ ImageRequest<I>::aio_read(
+ &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents),
+ std::move(read.read_result), spec->m_op_flags, spec->m_parent_trace);
+ }
+
+ void operator()(Discard& discard) const {
+ ImageRequest<I>::aio_discard(
+ &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents),
+ discard.discard_granularity_bytes, spec->m_parent_trace);
+ }
+
+ void operator()(Write& write) const {
+ ImageRequest<I>::aio_write(
+ &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents),
+ std::move(write.bl), spec->m_op_flags, spec->m_parent_trace);
+ }
+
+ void operator()(WriteSame& write_same) const {
+ ImageRequest<I>::aio_writesame(
+ &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents),
+ std::move(write_same.bl), spec->m_op_flags, spec->m_parent_trace);
+ }
+
+ void operator()(CompareAndWrite& compare_and_write) const {
+ ImageRequest<I>::aio_compare_and_write(
+ &spec->m_image_ctx, spec->m_aio_comp, std::move(spec->m_image_extents),
+ std::move(compare_and_write.cmp_bl), std::move(compare_and_write.bl),
+ compare_and_write.mismatch_offset, spec->m_op_flags,
+ spec->m_parent_trace);
+ }
+
+ // flush carries no extents or op flags
+ void operator()(Flush& flush) const {
+ ImageRequest<I>::aio_flush(
+ &spec->m_image_ctx, spec->m_aio_comp, flush.flush_source,
+ spec->m_parent_trace);
+ }
+};
+
+// Visitor classifying the stored request: only Read reports false; every
+// other alternative (including Flush) falls into the generic overload and
+// reports true.
+template <typename I>
+struct ImageDispatchSpec<I>::IsWriteOpVisitor
+ : public boost::static_visitor<bool> {
+ bool operator()(const Read&) const {
+ return false;
+ }
+
+ template <typename T>
+ bool operator()(const T&) const {
+ return true;
+ }
+};
+
+// Visitor that decides whether the stored request is subject to the QoS
+// throttle identified by 'flag' and, if so, how many tokens it needs.
+//
+// Each overload writes the token count to *tokens and returns true when the
+// throttle applies to the request type, false otherwise:
+//  - Read:   not subject to write-only throttles
+//  - Flush:  always applicable but consumes zero tokens
+//  - others: not subject to read-only throttles
+// Byte-per-second throttles consume the total extent length; all other
+// throttles consume a single token.
+//
+// The result type is bool to match what every overload returns and what
+// ImageDispatchSpec::tokens_requested() hands back to its caller.
+template <typename I>
+struct ImageDispatchSpec<I>::TokenRequestedVisitor
+  : public boost::static_visitor<bool> {
+  ImageDispatchSpec* spec;
+  uint64_t flag;
+  uint64_t *tokens;
+
+  TokenRequestedVisitor(ImageDispatchSpec* spec, uint64_t _flag,
+                        uint64_t *tokens)
+    : spec(spec), flag(_flag), tokens(tokens) {
+  }
+
+  bool operator()(const Read&) const {
+    if (flag & RBD_QOS_WRITE_MASK) {
+      *tokens = 0;
+      return false;
+    }
+
+    *tokens = (flag & RBD_QOS_BPS_MASK) ? spec->extents_length() : 1;
+    return true;
+  }
+
+  bool operator()(const Flush&) const {
+    *tokens = 0;
+    return true;
+  }
+
+  template <typename T>
+  bool operator()(const T&) const {
+    if (flag & RBD_QOS_READ_MASK) {
+      *tokens = 0;
+      return false;
+    }
+
+    *tokens = (flag & RBD_QOS_BPS_MASK) ? spec->extents_length() : 1;
+    return true;
+  }
+};
+
+// Dispatch the stored request by applying SendVisitor to the variant.
+template <typename I>
+void ImageDispatchSpec<I>::send() {
+ boost::apply_visitor(SendVisitor{this}, m_request);
+}
+
+// Fail the completion with error code r without dispatching the request.
+template <typename I>
+void ImageDispatchSpec<I>::fail(int r) {
+ // NOTE(review): the extra reference is presumably consumed by fail() --
+ // confirm against AioCompletion
+ m_aio_comp->get();
+ m_aio_comp->fail(r);
+}
+
+// Returns the sum of the lengths of all image extents held by this spec.
+template <typename I>
+uint64_t ImageDispatchSpec<I>::extents_length() {
+  uint64_t total = 0;
+  for (auto it = this->m_image_extents.begin();
+       it != this->m_image_extents.end(); ++it) {
+    total += it->second;
+  }
+  return total;
+}
+
+// True for every request type except Read (see IsWriteOpVisitor).
+template <typename I>
+bool ImageDispatchSpec<I>::is_write_op() const {
+ return boost::apply_visitor(IsWriteOpVisitor(), m_request);
+}
+
+// Applies TokenRequestedVisitor to the stored request: writes the number of
+// tokens needed for the throttle 'flag' to *tokens and returns the
+// visitor's result.
+template <typename I>
+bool ImageDispatchSpec<I>::tokens_requested(uint64_t flag, uint64_t *tokens) {
+ return boost::apply_visitor(TokenRequestedVisitor{this, flag, tokens},
+ m_request);
+}
+
+// Forwards to AioCompletion::start_op() on the associated completion.
+template <typename I>
+void ImageDispatchSpec<I>::start_op() {
+ m_aio_comp->start_op();
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ImageDispatchSpec<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageDispatchSpec.h b/src/librbd/io/ImageDispatchSpec.h
new file mode 100644
index 00000000..93c53a0f
--- /dev/null
+++ b/src/librbd/io/ImageDispatchSpec.h
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ReadResult.h"
+#include <boost/variant/variant.hpp>
+#include <atomic>
+#include <utility>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+class AioCompletion;
+
+// Describes a single image-level IO request (read, discard, write,
+// write-same, compare-and-write or flush) so that it can be queued,
+// throttled and later dispatched via send() or rejected via fail().
+//
+// Instances are created through the static create_*_request() factories,
+// which heap-allocate the spec; the constructor is private.
+template <typename ImageCtxT = ImageCtx>
+class ImageDispatchSpec {
+public:
+  // Per-request payloads stored in the Request variant.
+  struct Read {
+    ReadResult read_result;
+
+    Read(ReadResult &&read_result) : read_result(std::move(read_result)) {
+    }
+  };
+
+  struct Discard {
+    uint32_t discard_granularity_bytes;
+
+    Discard(uint32_t discard_granularity_bytes)
+      : discard_granularity_bytes(discard_granularity_bytes) {
+    }
+  };
+
+  struct Write {
+    bufferlist bl;
+
+    Write(bufferlist&& bl) : bl(std::move(bl)) {
+    }
+  };
+
+  struct WriteSame {
+    bufferlist bl;
+
+    WriteSame(bufferlist&& bl) : bl(std::move(bl)) {
+    }
+  };
+
+  struct CompareAndWrite {
+    bufferlist cmp_bl;
+    bufferlist bl;
+    uint64_t *mismatch_offset;
+
+    CompareAndWrite(bufferlist&& cmp_bl, bufferlist&& bl,
+                    uint64_t *mismatch_offset)
+      : cmp_bl(std::move(cmp_bl)), bl(std::move(bl)),
+        mismatch_offset(mismatch_offset) {
+    }
+  };
+
+  struct Flush {
+    FlushSource flush_source;
+
+    Flush(FlushSource flush_source) : flush_source(flush_source) {
+    }
+  };
+
+  // Factories: each returns a newly allocated spec owning the moved-in
+  // extents and buffers. Discard and flush requests use op_flags of 0.
+  static ImageDispatchSpec* create_read_request(
+      ImageCtxT &image_ctx, AioCompletion *aio_comp, Extents &&image_extents,
+      ReadResult &&read_result, int op_flags,
+      const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx, aio_comp,
+                                 std::move(image_extents),
+                                 Read{std::move(read_result)},
+                                 op_flags, parent_trace);
+  }
+
+  static ImageDispatchSpec* create_discard_request(
+      ImageCtxT &image_ctx, AioCompletion *aio_comp, uint64_t off, uint64_t len,
+      uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx, aio_comp, {{off, len}},
+                                 Discard{discard_granularity_bytes},
+                                 0, parent_trace);
+  }
+
+  static ImageDispatchSpec* create_write_request(
+      ImageCtxT &image_ctx, AioCompletion *aio_comp, Extents &&image_extents,
+      bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx, aio_comp, std::move(image_extents),
+                                 Write{std::move(bl)}, op_flags, parent_trace);
+  }
+
+  static ImageDispatchSpec* create_write_same_request(
+      ImageCtxT &image_ctx, AioCompletion *aio_comp, uint64_t off, uint64_t len,
+      bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx, aio_comp, {{off, len}},
+                                 WriteSame{std::move(bl)}, op_flags,
+                                 parent_trace);
+  }
+
+  static ImageDispatchSpec* create_compare_and_write_request(
+      ImageCtxT &image_ctx, AioCompletion *aio_comp, Extents &&image_extents,
+      bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+      int op_flags, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx, aio_comp,
+                                 std::move(image_extents),
+                                 CompareAndWrite{std::move(cmp_bl),
+                                                 std::move(bl),
+                                                 mismatch_offset},
+                                 op_flags, parent_trace);
+  }
+
+  static ImageDispatchSpec* create_flush_request(
+      ImageCtxT &image_ctx, AioCompletion *aio_comp,
+      FlushSource flush_source, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx, aio_comp, {}, Flush{flush_source},
+                                 0, parent_trace);
+  }
+
+  // Dispatch the request / fail its completion with error r.
+  void send();
+  void fail(int r);
+
+  bool is_write_op() const;
+
+  void start_op();
+
+  // Computes the tokens needed for the QoS throttle 'flag'; see the
+  // implementation for the meaning of the return value.
+  bool tokens_requested(uint64_t flag, uint64_t *tokens);
+
+  // True if any of the throttle bits in 'flag' were recorded via
+  // set_throttled().
+  bool was_throttled(uint64_t flag) const {
+    return (m_throttled_flag & flag) != 0;
+  }
+
+  // Record that the throttle(s) in 'flag' have been applied.
+  void set_throttled(uint64_t flag) {
+    m_throttled_flag |= flag;
+  }
+
+  // True once every throttle bit in RBD_QOS_MASK has been recorded.
+  bool were_all_throttled() const {
+    return (m_throttled_flag & RBD_QOS_MASK) == RBD_QOS_MASK;
+  }
+
+private:
+  typedef boost::variant<Read,
+                         Discard,
+                         Write,
+                         WriteSame,
+                         CompareAndWrite,
+                         Flush> Request;
+
+  struct SendVisitor;
+  struct IsWriteOpVisitor;
+  struct TokenRequestedVisitor;
+
+  ImageDispatchSpec(ImageCtxT& image_ctx, AioCompletion* aio_comp,
+                    Extents&& image_extents, Request&& request,
+                    int op_flags, const ZTracer::Trace& parent_trace)
+    : m_image_ctx(image_ctx), m_aio_comp(aio_comp),
+      m_image_extents(std::move(image_extents)), m_request(std::move(request)),
+      m_op_flags(op_flags), m_parent_trace(parent_trace) {
+  }
+
+  ImageCtxT& m_image_ctx;
+  AioCompletion* m_aio_comp;
+  Extents m_image_extents;
+  Request m_request;
+  int m_op_flags;
+  ZTracer::Trace m_parent_trace;
+  // bitmask of throttles already applied; direct-initialised so the
+  // declaration does not rely on C++17 copy elision for std::atomic
+  std::atomic<uint64_t> m_throttled_flag{0};
+
+  uint64_t extents_length();
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ImageDispatchSpec<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc
new file mode 100644
index 00000000..d6eb29fe
--- /dev/null
+++ b/src/librbd/io/ImageRequest.cc
@@ -0,0 +1,824 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/ImageCache.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/io/Utils.h"
+#include "librbd/journal/Types.h"
+#include "include/rados/librados.hpp"
+#include "common/perf_counters.h"
+#include "common/WorkQueue.h"
+#include "osdc/Striper.h"
+#include <algorithm>
+#include <functional>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::get_image_ctx;
+
+namespace {
+
+// Context that asynchronously updates the image header's modify or access
+// timestamp. An AsyncOperation is started in the constructor and finished
+// in the destructor, so the update is tracked for the object's lifetime.
+template <typename I>
+struct C_UpdateTimestamp : public Context {
+public:
+ I& m_image_ctx;
+ bool m_modify; // if modify set to 'true', modify timestamp is updated,
+ // access timestamp otherwise
+ AsyncOperation m_async_op;
+
+ C_UpdateTimestamp(I& ictx, bool m) : m_image_ctx(ictx), m_modify(m) {
+ m_async_op.start_op(*get_image_ctx(&m_image_ctx));
+ }
+ ~C_UpdateTimestamp() override {
+ m_async_op.finish_op();
+ }
+
+ // Build the timestamp write op and submit it against the header object;
+ // this context is the rados callback.
+ void send() {
+ librados::ObjectWriteOperation op;
+ if (m_modify) {
+ cls_client::set_modify_timestamp(&op);
+ } else {
+ cls_client::set_access_timestamp(&op);
+ }
+
+ auto comp = librbd::util::create_rados_callback(this);
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+
+ void finish(int r) override {
+ // ignore errors updating timestamp
+ }
+};
+
+// Returns true when timestamp updates are enabled (non-zero interval) and
+// at least 'interval' seconds have elapsed since 'timestamp'.
+bool should_update_timestamp(const utime_t& now, const utime_t& timestamp,
+                             uint64_t interval) {
+  if (!interval) {
+    return false;
+  }
+  return static_cast<uint64_t>(now.sec()) >= interval + timestamp;
+}
+
+} // anonymous namespace
+
+// Convenience entry point: builds an ImageReadRequest on the stack from the
+// (moved) arguments and sends it immediately.
+template <typename I>
+void ImageRequest<I>::aio_read(I *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ ReadResult &&read_result, int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageReadRequest<I> req(*ictx, c, std::move(image_extents),
+ std::move(read_result), op_flags, parent_trace);
+ req.send();
+}
+
+// Convenience entry point: builds an ImageWriteRequest on the stack from
+// the (moved) arguments and sends it immediately.
+template <typename I>
+void ImageRequest<I>::aio_write(I *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&bl,
+ int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageWriteRequest<I> req(*ictx, c, std::move(image_extents), std::move(bl),
+ op_flags, parent_trace);
+ req.send();
+}
+
+// Convenience entry point: builds an ImageDiscardRequest on the stack from
+// the (moved) arguments and sends it immediately.
+template <typename I>
+void ImageRequest<I>::aio_discard(I *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ uint32_t discard_granularity_bytes,
+ const ZTracer::Trace &parent_trace) {
+ ImageDiscardRequest<I> req(*ictx, c, std::move(image_extents),
+ discard_granularity_bytes, parent_trace);
+ req.send();
+}
+
+// Convenience entry point: builds an ImageFlushRequest on the stack and
+// sends it immediately.
+template <typename I>
+void ImageRequest<I>::aio_flush(I *ictx, AioCompletion *c,
+ FlushSource flush_source,
+ const ZTracer::Trace &parent_trace) {
+ ImageFlushRequest<I> req(*ictx, c, flush_source, parent_trace);
+ req.send();
+}
+
+// Convenience entry point: builds an ImageWriteSameRequest on the stack
+// from the (moved) arguments and sends it immediately.
+template <typename I>
+void ImageRequest<I>::aio_writesame(I *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageWriteSameRequest<I> req(*ictx, c, std::move(image_extents),
+ std::move(bl), op_flags, parent_trace);
+ req.send();
+}
+
+// Convenience entry point: builds an ImageCompareAndWriteRequest on the
+// stack from the (moved) arguments and sends it immediately.
+// mismatch_offset is passed through to the request unchanged.
+template <typename I>
+void ImageRequest<I>::aio_compare_and_write(I *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ bufferlist &&cmp_bl,
+ bufferlist &&bl,
+ uint64_t *mismatch_offset,
+ int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageCompareAndWriteRequest<I> req(*ictx, c, std::move(image_extents),
+ std::move(cmp_bl), std::move(bl),
+ mismatch_offset, op_flags, parent_trace);
+ req.send();
+}
+
+
+// Common send path for all image requests: clips the extents, then either
+// dispatches the request directly or routes it through the image cache.
+// The completion must already be initialized for this request's AIO type
+// and started.
+template <typename I>
+void ImageRequest<I>::send() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(m_aio_comp->is_initialized(get_aio_type()));
+ ceph_assert(m_aio_comp->is_started());
+
+ CephContext *cct = image_ctx.cct;
+ AioCompletion *aio_comp = this->m_aio_comp;
+ ldout(cct, 20) << get_request_type() << ": ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << dendl;
+
+ // take a reference on the completion before any step that may fail it
+ aio_comp->get();
+ int r = clip_request();
+ if (r < 0) {
+ m_aio_comp->fail(r);
+ return;
+ }
+
+ // only the direct (non image cache) path updates the timestamp here
+ if (m_bypass_image_cache || m_image_ctx.image_cache == nullptr) {
+ update_timestamp();
+ send_request();
+ } else {
+ send_image_cache_request();
+ }
+}
+
+// Clips every image extent via clip_io() while holding snap_lock for read.
+// Returns 0 on success or the first negative error reported by clip_io();
+// extents preceding the failing one keep their clipped lengths.
+template <typename I>
+int ImageRequest<I>::clip_request() {
+  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+  for (auto it = m_image_extents.begin(); it != m_image_extents.end(); ++it) {
+    auto clipped_length = it->second;
+    int r = clip_io(get_image_ctx(&m_image_ctx), it->first, &clipped_length);
+    if (r < 0) {
+      return r;
+    }
+
+    it->second = clipped_length;
+  }
+  return 0;
+}
+
+// Updates the image's access (reads) or modify (all other request types)
+// timestamp if the configured update interval has elapsed, and fires an
+// asynchronous header update to persist it. A zero interval disables the
+// update entirely.
+template <typename I>
+void ImageRequest<I>::update_timestamp() {
+ bool modify = (get_aio_type() != AIO_TYPE_READ);
+ uint64_t update_interval;
+ if (modify) {
+ update_interval = m_image_ctx.mtime_update_interval;
+ } else {
+ update_interval = m_image_ctx.atime_update_interval;
+ }
+
+ if (update_interval == 0) {
+ return;
+ }
+
+ // select the getter/setter pair matching the timestamp being updated
+ utime_t (I::*get_timestamp_fn)() const;
+ void (I::*set_timestamp_fn)(utime_t);
+ if (modify) {
+ get_timestamp_fn = &I::get_modify_timestamp;
+ set_timestamp_fn = &I::set_modify_timestamp;
+ } else {
+ get_timestamp_fn = &I::get_access_timestamp;
+ set_timestamp_fn = &I::set_access_timestamp;
+ }
+
+ utime_t ts = ceph_clock_now();
+ {
+ // first check under the read lock so the common "nothing to do" case
+ // never takes the write lock
+ RWLock::RLocker timestamp_locker(m_image_ctx.timestamp_lock);
+ if(!should_update_timestamp(ts, std::invoke(get_timestamp_fn, m_image_ctx),
+ update_interval)) {
+ return;
+ }
+ }
+
+ {
+ // re-check under the write lock: the timestamp may have been updated
+ // between releasing the read lock and acquiring the write lock
+ RWLock::WLocker timestamp_locker(m_image_ctx.timestamp_lock);
+ bool update = should_update_timestamp(
+ ts, std::invoke(get_timestamp_fn, m_image_ctx), update_interval);
+ if (!update) {
+ return;
+ }
+
+ std::invoke(set_timestamp_fn, m_image_ctx, ts);
+ }
+
+ // TODO we fire and forget this outside the IO path to prevent
+ // potential race conditions with librbd client IO callbacks
+ // between different threads (e.g. librados and object cacher)
+ ldout(m_image_ctx.cct, 10) << get_request_type() << dendl;
+ auto req = new C_UpdateTimestamp<I>(m_image_ctx, modify);
+ req->send();
+}
+
+// Constructs a read request labelled "read" and moves the caller's
+// ReadResult into the completion so results are delivered through it.
+template <typename I>
+ImageReadRequest<I>::ImageReadRequest(I &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents,
+ ReadResult &&read_result, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents), "read",
+ parent_trace),
+ m_op_flags(op_flags) {
+ aio_comp->read_result = std::move(read_result);
+}
+
+// Clips the extents via the base class and then records the total clipped
+// length on the completion's read result. Returns the base class error
+// unchanged on failure.
+template <typename I>
+int ImageReadRequest<I>::clip_request() {
+  int r = ImageRequest<I>::clip_request();
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t clipped_total = 0;
+  for (auto it = this->m_image_extents.begin();
+       it != this->m_image_extents.end(); ++it) {
+    clipped_total += it->second;
+  }
+  this->m_aio_comp->read_result.set_clip_length(clipped_total);
+  return 0;
+}
+
+// Direct (non image cache) read path: optionally triggers readahead, maps
+// the image extents to per-object extents, and issues one object read per
+// object extent. The reference on the completion is dropped once all
+// object requests have been sent.
+template <typename I>
+void ImageReadRequest<I>::send_request() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ auto &image_extents = this->m_image_extents;
+ // readahead only when caching is on, readahead is configured and the
+ // caller did not hint random access
+ if (image_ctx.cache && image_ctx.readahead_max_bytes > 0 &&
+ !(m_op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) {
+ readahead(get_image_ctx(&image_ctx), image_extents);
+ }
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ librados::snap_t snap_id;
+ map<object_t,vector<ObjectExtent> > object_extents;
+ uint64_t buffer_ofs = 0;
+ {
+ // prevent image size from changing between computing clip and recording
+ // pending async operation
+ RWLock::RLocker snap_locker(image_ctx.snap_lock);
+ snap_id = image_ctx.snap_id;
+
+ // map image extents to object extents
+ for (auto &extent : image_extents) {
+ if (extent.second == 0) {
+ continue;
+ }
+
+ Striper::file_to_extents(cct, image_ctx.format_string, &image_ctx.layout,
+ extent.first, extent.second, 0, object_extents,
+ buffer_ofs);
+ // buffer_ofs tracks where this extent's data lands in the result buffer
+ buffer_ofs += extent.second;
+ }
+ }
+
+ // pre-calculate the expected number of read requests
+ uint32_t request_count = 0;
+ for (auto &object_extent : object_extents) {
+ request_count += object_extent.second.size();
+ }
+ aio_comp->set_request_count(request_count);
+
+ // issue the requests
+ for (auto &object_extent : object_extents) {
+ for (auto &extent : object_extent.second) {
+ ldout(cct, 20) << "oid " << extent.oid << " " << extent.offset << "~"
+ << extent.length << " from " << extent.buffer_extents
+ << dendl;
+
+ // buffer_extents is moved into the callback; it is logged above first
+ auto req_comp = new io::ReadResult::C_ObjectReadRequest(
+ aio_comp, extent.offset, extent.length,
+ std::move(extent.buffer_extents));
+ auto req = ObjectDispatchSpec::create_read(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, extent.oid.name,
+ extent.objectno, extent.offset, extent.length, snap_id, m_op_flags,
+ this->m_trace, &req_comp->bl, &req_comp->extent_map, req_comp);
+ req->send();
+ }
+ }
+
+ aio_comp->put();
+
+ // buffer_ofs now equals the total number of bytes requested
+ image_ctx.perfcounter->inc(l_librbd_rd);
+ image_ctx.perfcounter->inc(l_librbd_rd_bytes, buffer_ofs);
+}
+
+// Image cache read path: issues the whole read as a single request to the
+// image cache.
+template <typename I>
+void ImageReadRequest<I>::send_image_cache_request() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(image_ctx.image_cache != nullptr);
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ aio_comp->set_request_count(1);
+
+ // the callback is constructed from the extents before they are moved
+ // into the cache request below
+ auto *req_comp = new io::ReadResult::C_ImageReadRequest(
+ aio_comp, this->m_image_extents);
+ image_ctx.image_cache->aio_read(std::move(this->m_image_extents),
+ &req_comp->bl, m_op_flags,
+ req_comp);
+}
+
+// Direct (non image cache) path shared by all write-style requests.
+//
+// Under md_lock/snap_lock it rejects writes to snapshots or read-only
+// images with -EROFS, maps the image extents to object extents and captures
+// the snapshot context and journaling state. After the subclass has pruned
+// the object extents it optionally appends a journal event, issues one
+// object request per remaining extent, updates statistics and drops the
+// completion reference. Failures are reported through the completion.
+template <typename I>
+void AbstractImageWriteRequest<I>::send_request() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  RWLock::RLocker md_locker(image_ctx.md_lock);
+
+  bool journaling = false;
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  uint64_t clip_len = 0;
+  ObjectExtents object_extents;
+  ::SnapContext snapc;
+  {
+    // prevent image size from changing between computing clip and recording
+    // pending async operation
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    if (image_ctx.snap_id != CEPH_NOSNAP || image_ctx.read_only) {
+      aio_comp->fail(-EROFS);
+      return;
+    }
+
+    for (auto &extent : this->m_image_extents) {
+      if (extent.second == 0) {
+        continue;
+      }
+
+      // map to object extents
+      Striper::file_to_extents(cct, image_ctx.format_string, &image_ctx.layout,
+                               extent.first, extent.second, 0, object_extents);
+      clip_len += extent.second;
+    }
+
+    snapc = image_ctx.snapc;
+    journaling = (image_ctx.journal != nullptr &&
+                  image_ctx.journal->is_journal_appending());
+  }
+
+  // let the concrete request type drop extents it does not need to send
+  int ret = prune_object_extents(&object_extents);
+  if (ret < 0) {
+    aio_comp->fail(ret);
+    return;
+  }
+
+  if (!object_extents.empty()) {
+    uint64_t journal_tid = 0;
+    if (journaling) {
+      // in-flight ops are flushed prior to closing the journal
+      ceph_assert(image_ctx.journal != nullptr);
+      journal_tid = append_journal_event(m_synchronous);
+    }
+
+    aio_comp->set_request_count(object_extents.size());
+    send_object_requests(object_extents, snapc, journal_tid);
+  } else {
+    // no IO to perform -- fire completion
+    aio_comp->set_request_count(0);
+  }
+
+  // statistics use the pre-prune clipped length
+  update_stats(clip_len);
+  aio_comp->put();
+}
+
+// Creates and sends one object request per object extent. Each request gets
+// its own C_AioRequest callback tied to the shared completion.
+//
+// @param object_extents object extents to issue requests for
+// @param snapc          snapshot context to apply to each request
+// @param journal_tid    journal event tid associated with the IO (0 if none)
+template <typename I>
+void AbstractImageWriteRequest<I>::send_object_requests(
+    const ObjectExtents &object_extents, const ::SnapContext &snapc,
+    uint64_t journal_tid) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  for (const auto &object_extent : object_extents) {
+    ldout(cct, 20) << "oid " << object_extent.oid << " "
+                   << object_extent.offset << "~" << object_extent.length
+                   << " from " << object_extent.buffer_extents << dendl;
+    C_AioRequest *req_comp = new C_AioRequest(aio_comp);
+    auto request = create_object_request(object_extent, snapc, journal_tid,
+                                         req_comp);
+
+    // create_object_request() may hand back a null spec; only dispatch
+    // requests that were actually created
+    if (request != nullptr) {
+      request->send();
+    }
+  }
+}
+
+// Appends to *bl the slices of the write payload (m_bl) that belong to the
+// given object extent, in buffer_extents order.
+template <typename I>
+void ImageWriteRequest<I>::assemble_extent(const ObjectExtent &object_extent,
+                                           bufferlist *bl) {
+  for (const auto &buffer_extent : object_extent.buffer_extents) {
+    bufferlist slice;
+    slice.substr_of(m_bl, buffer_extent.first, buffer_extent.second);
+    bl->claim_append(slice);
+  }
+}
+
+// Appends one journal write event per image extent, each carrying the
+// matching slice of the payload, and returns the tid of the last event
+// appended. The extent list must not be empty.
+template <typename I>
+uint64_t ImageWriteRequest<I>::append_journal_event(bool synchronous) {
+ I &image_ctx = this->m_image_ctx;
+
+ uint64_t tid = 0;
+ uint64_t buffer_offset = 0;
+ ceph_assert(!this->m_image_extents.empty());
+ for (auto &extent : this->m_image_extents) {
+ // extents consume the payload buffer back to back
+ bufferlist sub_bl;
+ sub_bl.substr_of(m_bl, buffer_offset, extent.second);
+ buffer_offset += extent.second;
+
+ tid = image_ctx.journal->append_write_event(extent.first, extent.second,
+ sub_bl, synchronous);
+ }
+
+ return tid;
+}
+
+// Image cache write path: hands the extents and payload (both moved) to the
+// image cache as a single request.
+template <typename I>
+void ImageWriteRequest<I>::send_image_cache_request() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(image_ctx.image_cache != nullptr);
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ aio_comp->set_request_count(1);
+ C_AioRequest *req_comp = new C_AioRequest(aio_comp);
+ image_ctx.image_cache->aio_write(std::move(this->m_image_extents),
+ std::move(m_bl), m_op_flags, req_comp);
+}
+
+// Builds the object write request for one object extent: gathers the
+// extent's payload from the write buffer and wraps it in a write dispatch
+// spec that completes on_finish.
+template <typename I>
+ObjectDispatchSpec *ImageWriteRequest<I>::create_object_request(
+    const ObjectExtent &object_extent, const ::SnapContext &snapc,
+    uint64_t journal_tid, Context *on_finish) {
+  bufferlist extent_data;
+  assemble_extent(object_extent, &extent_data);
+
+  return ObjectDispatchSpec::create_write(
+    &this->m_image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name,
+    object_extent.objectno, object_extent.offset, std::move(extent_data),
+    snapc, m_op_flags, journal_tid, this->m_trace, on_finish);
+}
+
+// Bumps the write op counter by one and the written-bytes counter by
+// 'length'.
+template <typename I>
+void ImageWriteRequest<I>::update_stats(size_t length) {
+ I &image_ctx = this->m_image_ctx;
+ image_ctx.perfcounter->inc(l_librbd_wr);
+ image_ctx.perfcounter->inc(l_librbd_wr_bytes, length);
+}
+
+// Appends one AioDiscardEvent journal entry per image extent and returns
+// the tid of the last event appended. The extent list must not be empty.
+template <typename I>
+uint64_t ImageDiscardRequest<I>::append_journal_event(bool synchronous) {
+ I &image_ctx = this->m_image_ctx;
+
+ uint64_t tid = 0;
+ ceph_assert(!this->m_image_extents.empty());
+ for (auto &extent : this->m_image_extents) {
+ journal::EventEntry event_entry(
+ journal::AioDiscardEvent(extent.first,
+ extent.second,
+ this->m_discard_granularity_bytes));
+ tid = image_ctx.journal->append_io_event(std::move(event_entry),
+ extent.first, extent.second,
+ synchronous, 0);
+ }
+
+ return tid;
+}
+
+// Image cache discard path: issues one cache discard per image extent, each
+// with its own callback counted against the completion.
+template <typename I>
+void ImageDiscardRequest<I>::send_image_cache_request() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(image_ctx.image_cache != nullptr);
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ aio_comp->set_request_count(this->m_image_extents.size());
+ for (auto &extent : this->m_image_extents) {
+ C_AioRequest *req_comp = new C_AioRequest(aio_comp);
+ image_ctx.image_cache->aio_discard(extent.first, extent.second,
+ this->m_discard_granularity_bytes,
+ req_comp);
+ }
+}
+
+// Builds the object discard request for one object extent. The request is
+// always created with OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE and
+// completes on_finish.
+template <typename I>
+ObjectDispatchSpec *ImageDiscardRequest<I>::create_object_request(
+    const ObjectExtent &object_extent, const ::SnapContext &snapc,
+    uint64_t journal_tid, Context *on_finish) {
+  return ObjectDispatchSpec::create_discard(
+    &this->m_image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name,
+    object_extent.objectno, object_extent.offset, object_extent.length, snapc,
+    OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE, journal_tid, this->m_trace,
+    on_finish);
+}
+
+// Bumps the discard op counter by one and the discarded-bytes counter by
+// 'length'.
+template <typename I>
+void ImageDiscardRequest<I>::update_stats(size_t length) {
+ I &image_ctx = this->m_image_ctx;
+ image_ctx.perfcounter->inc(l_librbd_discard);
+ image_ctx.perfcounter->inc(l_librbd_discard_bytes, length);
+}
+
+// Shrinks each object extent to the discard granularity and drops extents
+// that become empty. A granularity of zero leaves the extents untouched.
+// Always returns 0.
+template <typename I>
+int ImageDiscardRequest<I>::prune_object_extents(
+    ObjectExtents* object_extents) const {
+  if (m_discard_granularity_bytes == 0) {
+    return 0;
+  }
+
+  // Align the range to discard_granularity_bytes boundary and skip
+  // any discards that are too small to free up any space.
+  //
+  // discard_granularity_bytes >= object_size && tail truncation
+  // is a special case for filestore
+  auto object_size = this->m_image_ctx.layout.object_size;
+  auto discard_granularity_bytes = std::min(m_discard_granularity_bytes,
+                                            object_size);
+
+  bool prune_required = false;
+  for (ObjectExtent& object_extent : *object_extents) {
+    auto& offset = object_extent.offset;
+    auto& length = object_extent.length;
+    auto next_offset = offset + length;
+
+    // whole-object granularity combined with a discard reaching the end of
+    // the object is left as-is
+    if ((discard_granularity_bytes >= object_size) &&
+        (next_offset >= object_size)) {
+      continue;
+    }
+
+    offset = p2roundup<uint64_t>(offset, discard_granularity_bytes);
+    next_offset = p2align<uint64_t>(next_offset, discard_granularity_bytes);
+    if (offset < next_offset) {
+      length = next_offset - offset;
+    } else {
+      // nothing left after alignment; mark the extent for removal
+      prune_required = true;
+      length = 0;
+    }
+  }
+
+  if (prune_required) {
+    // one or more object extents were skipped
+    auto is_empty =
+      [](const ObjectExtent& object_extent) {
+        return (object_extent.length == 0);
+      };
+    auto new_end = std::remove_if(object_extents->begin(),
+                                  object_extents->end(), is_empty);
+    object_extents->erase(new_end, object_extents->end());
+  }
+  return 0;
+}
+
+// Direct (non image cache) flush path. Builds a chain of contexts that runs
+// once in-flight async operations have been flushed: when journaling a
+// user flush, the journal event is flushed and committed; otherwise an
+// object-level flush is dispatched. The final completion is delivered
+// through an async context callback.
+template <typename I>
+void ImageFlushRequest<I>::send_request() {
+ I &image_ctx = this->m_image_ctx;
+
+ // only user-initiated flushes are journaled
+ bool journaling = false;
+ {
+ RWLock::RLocker snap_locker(image_ctx.snap_lock);
+ journaling = (m_flush_source == FLUSH_SOURCE_USER &&
+ image_ctx.journal != nullptr &&
+ image_ctx.journal->is_journal_appending());
+ }
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ aio_comp->set_request_count(1);
+
+ Context *ctx = new C_AioRequest(aio_comp);
+
+ // ensure no locks are held when flush is complete
+ ctx = librbd::util::create_async_context_callback(image_ctx, ctx);
+
+ // contexts are wrapped innermost-first: the last one assigned to ctx
+ // below is the first one to run
+ if (journaling) {
+ // in-flight ops are flushed prior to closing the journal
+ uint64_t journal_tid = image_ctx.journal->append_io_event(
+ journal::EventEntry(journal::AioFlushEvent()), 0, 0, false, 0);
+ image_ctx.journal->user_flushed();
+
+ // runs second: commit the journal event, then complete the request
+ ctx = new FunctionContext(
+ [&image_ctx, journal_tid, ctx](int r) {
+ image_ctx.journal->commit_io_event(journal_tid, r);
+ ctx->complete(r);
+ });
+ // runs first: flush the journal event
+ ctx = new FunctionContext(
+ [&image_ctx, journal_tid, ctx](int r) {
+ image_ctx.journal->flush_event(journal_tid, ctx);
+ });
+ } else {
+ // flush rbd cache only when journaling is not enabled
+ auto object_dispatch_spec = ObjectDispatchSpec::create_flush(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, m_flush_source, this->m_trace,
+ ctx);
+ ctx = new FunctionContext([object_dispatch_spec](int r) {
+ object_dispatch_spec->send();
+ });
+ }
+
+ // ensure all in-flight IOs are settled if non-user flush request
+ aio_comp->async_op.flush(ctx);
+ aio_comp->put();
+
+ // might be flushing during image shutdown
+ if (image_ctx.perfcounter != nullptr) {
+ image_ctx.perfcounter->inc(l_librbd_flush);
+ }
+}
+
+// Image cache flush path: forwards the flush to the image cache as a
+// single request.
+template <typename I>
+void ImageFlushRequest<I>::send_image_cache_request() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(image_ctx.image_cache != nullptr);
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ aio_comp->set_request_count(1);
+ C_AioRequest *req_comp = new C_AioRequest(aio_comp);
+ image_ctx.image_cache->aio_flush(req_comp);
+}
+
+// Appends one AioWriteSameEvent to the journal per image extent and returns
+// the tid handed back for the last extent. At least one extent is required.
+template <typename I>
+uint64_t ImageWriteSameRequest<I>::append_journal_event(bool synchronous) {
+  auto &ictx = this->m_image_ctx;
+  ceph_assert(!this->m_image_extents.empty());
+
+  uint64_t last_tid = 0;
+  for (auto &image_extent : this->m_image_extents) {
+    const auto offset = image_extent.first;
+    const auto length = image_extent.second;
+    last_tid = ictx.journal->append_io_event(
+      journal::EventEntry(
+        journal::AioWriteSameEvent(offset, length, m_data_bl)),
+      offset, length, synchronous, 0);
+  }
+  return last_tid;
+}
+
+// Forwards the write-same to the image cache, issuing one cache request (and
+// registering one AIO sub-request) per image extent. The image cache must be
+// present.
+template <typename I>
+void ImageWriteSameRequest<I>::send_image_cache_request() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.image_cache != nullptr);
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  aio_comp->set_request_count(this->m_image_extents.size());
+  for (auto &extent : this->m_image_extents) {
+    C_AioRequest *req_comp = new C_AioRequest(aio_comp);
+
+    // every extent needs the complete pattern: give the cache a private copy
+    // rather than moving m_data_bl inside the loop, which could leave the
+    // member in a moved-from state for every extent after the first
+    bufferlist data_bl(m_data_bl);
+    image_ctx.image_cache->aio_writesame(extent.first, extent.second,
+                                         std::move(data_bl), m_op_flags,
+                                         req_comp);
+  }
+}
+
+// Builds the object-level dispatch spec for one object extent. When
+// util::assemble_write_same_extent() returns true a write-same spec is
+// created; otherwise a plain write spec carrying the assembled buffer is used.
+template <typename I>
+ObjectDispatchSpec *ImageWriteSameRequest<I>::create_object_request(
+    const ObjectExtent &object_extent, const ::SnapContext &snapc,
+    uint64_t journal_tid, Context *on_finish) {
+  I &ictx = this->m_image_ctx;
+
+  bufferlist payload;
+  const bool use_write_same = util::assemble_write_same_extent(
+    object_extent, m_data_bl, &payload, false);
+  if (!use_write_same) {
+    return ObjectDispatchSpec::create_write(
+      &ictx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name,
+      object_extent.objectno, object_extent.offset, std::move(payload), snapc,
+      m_op_flags, journal_tid, this->m_trace, on_finish);
+  }
+
+  Extents buffer_extents{object_extent.buffer_extents};
+  return ObjectDispatchSpec::create_write_same(
+    &ictx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name,
+    object_extent.objectno, object_extent.offset, object_extent.length,
+    std::move(buffer_extents), std::move(payload), snapc, m_op_flags,
+    journal_tid, this->m_trace, on_finish);
+}
+
+// Bumps the write-same op counter and adds `length` to the byte counter.
+template <typename I>
+void ImageWriteSameRequest<I>::update_stats(size_t length) {
+  auto &counters = *this->m_image_ctx.perfcounter;
+  counters.inc(l_librbd_ws);
+  counters.inc(l_librbd_ws_bytes, length);
+}
+
+// Appends the single AioCompareAndWriteEvent for this request to the journal
+// and returns its tid. Exactly one image extent is expected; -EILSEQ is passed
+// as the last argument of append_io_event().
+template <typename I>
+uint64_t ImageCompareAndWriteRequest<I>::append_journal_event(
+    bool synchronous) {
+  ceph_assert(this->m_image_extents.size() == 1);
+
+  auto &image_extent = this->m_image_extents.front();
+  const auto offset = image_extent.first;
+  const auto length = image_extent.second;
+
+  return this->m_image_ctx.journal->append_io_event(
+    journal::EventEntry(
+      journal::AioCompareAndWriteEvent(offset, length, m_cmp_bl, m_bl)),
+    offset, length, synchronous, -EILSEQ);
+}
+
+// Appends to `bl` the slices of m_bl named by the object extent's buffer
+// extents (offset/length pairs), in order.
+template <typename I>
+void ImageCompareAndWriteRequest<I>::assemble_extent(
+    const ObjectExtent &object_extent, bufferlist *bl) {
+  for (const auto &buffer_extent : object_extent.buffer_extents) {
+    bufferlist piece;
+    piece.substr_of(m_bl, buffer_extent.first, buffer_extent.second);
+    bl->claim_append(piece);
+  }
+}
+
+// Forwards the compare-and-write to the image cache as a single sub-request,
+// giving up the request's extents and both buffers. The image cache must be
+// present.
+template <typename I>
+void ImageCompareAndWriteRequest<I>::send_image_cache_request() {
+  auto &ictx = this->m_image_ctx;
+  ceph_assert(ictx.image_cache != nullptr);
+
+  auto *completion = this->m_aio_comp;
+  completion->set_request_count(1);
+
+  C_AioRequest *cache_req_comp = new C_AioRequest(completion);
+  ictx.image_cache->aio_compare_and_write(
+    std::move(this->m_image_extents), std::move(m_cmp_bl), std::move(m_bl),
+    m_mismatch_offset, m_op_flags, cache_req_comp);
+}
+
+// Builds the object-level compare-and-write spec for the given object extent:
+// the write payload is assembled from m_bl and the compare buffer is moved out
+// of the request.
+template <typename I>
+ObjectDispatchSpec *ImageCompareAndWriteRequest<I>::create_object_request(
+    const ObjectExtent &object_extent,
+    const ::SnapContext &snapc,
+    uint64_t journal_tid, Context *on_finish) {
+  bufferlist write_bl;
+  assemble_extent(object_extent, &write_bl);
+
+  // NOTE: safe to move m_cmp_bl since we only support this op against
+  // a single object
+  return ObjectDispatchSpec::create_compare_and_write(
+    &this->m_image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.oid.name,
+    object_extent.objectno, object_extent.offset, std::move(m_cmp_bl),
+    std::move(write_bl), snapc, m_mismatch_offset, m_op_flags, journal_tid,
+    this->m_trace, on_finish);
+}
+
+// Bumps the compare-and-write op counter and adds `length` to the byte
+// counter.
+template <typename I>
+void ImageCompareAndWriteRequest<I>::update_stats(size_t length) {
+  auto &counters = *this->m_image_ctx.perfcounter;
+  counters.inc(l_librbd_cmp);
+  counters.inc(l_librbd_cmp_bytes, length);
+}
+
+// Validates the object extents of a compare-and-write.
+//
+// Returns -EINVAL when the request maps to more than one object extent, or
+// when the single extent crosses a 512-byte sector boundary or (for a
+// non-zero stripe unit) a stripe-unit boundary. Returns 0 otherwise,
+// including when there are no extents to validate. The extents themselves are
+// never modified.
+template <typename I>
+int ImageCompareAndWriteRequest<I>::prune_object_extents(
+    ObjectExtents* object_extents) const {
+  if (object_extents->size() > 1) {
+    return -EINVAL;
+  }
+  if (object_extents->empty()) {
+    // nothing to check; calling front() on an empty vector is undefined
+    return 0;
+  }
+
+  I &image_ctx = this->m_image_ctx;
+  const uint64_t sector_size = 512ULL;
+  const uint64_t su = image_ctx.layout.stripe_unit;
+
+  // the extent is only inspected, so bind by reference instead of copying
+  const ObjectExtent &object_extent = object_extents->front();
+  if (object_extent.offset % sector_size + object_extent.length > sector_size ||
+      (su != 0 && (object_extent.offset % su + object_extent.length > su))) {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+} // namespace io
+} // namespace librbd
+
+// Explicit instantiations of every request template for librbd::ImageCtx.
+template class librbd::io::ImageRequest<librbd::ImageCtx>;
+template class librbd::io::ImageReadRequest<librbd::ImageCtx>;
+template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ImageWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>;
+template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
+template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageRequest.h b/src/librbd/io/ImageRequest.h
new file mode 100644
index 00000000..d7d10019
--- /dev/null
+++ b/src/librbd/io/ImageRequest.h
@@ -0,0 +1,365 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_REQUEST_H
+#define CEPH_LIBRBD_IO_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+#include "osd/osd_types.h"
+#include "librbd/Utils.h"
+#include "librbd/io/Types.h"
+#include <list>
+#include <utility>
+#include <vector>
+
+namespace librbd {
+class ImageCtx;
+
+namespace io {
+
+class AioCompletion;
+class ObjectDispatchSpec;
+class ReadResult;
+
+// Abstract base for image-level I/O requests. Holds the image context, the
+// AIO completion, the image extents and a trace; concrete request types
+// supply the send hooks and type identification declared at the bottom.
+template <typename ImageCtxT = ImageCtx>
+class ImageRequest {
+public:
+ // list of (first, second) uint64_t pairs describing image extents
+ typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+
+ // records a "finish" event on the request's trace
+ virtual ~ImageRequest() {
+ m_trace.event("finish");
+ }
+
+ // Static entry points, one per I/O type (defined in ImageRequest.cc);
+ // presumably each builds the matching request object and sends it.
+ static void aio_read(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, ReadResult &&read_result,
+ int op_flags, const ZTracer::Trace &parent_trace);
+ static void aio_write(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace);
+ static void aio_discard(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ uint32_t discard_granularity_bytes,
+ const ZTracer::Trace &parent_trace);
+ static void aio_flush(ImageCtxT *ictx, AioCompletion *c,
+ FlushSource flush_source,
+ const ZTracer::Trace &parent_trace);
+ static void aio_writesame(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace);
+
+ static void aio_compare_and_write(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace);
+
+ void send();
+
+ // sets the flag that asks for the image cache to be bypassed
+ void set_bypass_image_cache() {
+ m_bypass_image_cache = true;
+ }
+
+ inline const ZTracer::Trace &get_trace() const {
+ return m_trace;
+ }
+
+protected:
+ typedef std::list<ObjectDispatchSpec*> ObjectRequests;
+
+ ImageCtxT &m_image_ctx;
+ AioCompletion *m_aio_comp;
+ Extents m_image_extents;
+ ZTracer::Trace m_trace;
+ bool m_bypass_image_cache = false;
+
+ // takes ownership of the extents, creates a trace named `trace_name` via
+ // util::create_trace() and records a "start" event on it
+ ImageRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, const char *trace_name,
+ const ZTracer::Trace &parent_trace)
+ : m_image_ctx(image_ctx), m_aio_comp(aio_comp),
+ m_image_extents(std::move(image_extents)),
+ m_trace(util::create_trace(image_ctx, trace_name, parent_trace)) {
+ m_trace.event("start");
+ }
+
+
+ // hooks with a default implementation in the base class
+ virtual int clip_request();
+ virtual void update_timestamp();
+ // hooks every concrete request must implement
+ virtual void send_request() = 0;
+ virtual void send_image_cache_request() = 0;
+
+ virtual aio_type_t get_aio_type() const = 0;
+ virtual const char *get_request_type() const = 0;
+};
+
+// Image read request; reports AIO_TYPE_READ / "aio_read" and overrides
+// clip_request() in addition to the two send hooks.
+template <typename ImageCtxT = ImageCtx>
+class ImageReadRequest : public ImageRequest<ImageCtxT> {
+public:
+ using typename ImageRequest<ImageCtxT>::Extents;
+
+ // defined out of line in ImageRequest.cc
+ ImageReadRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, ReadResult &&read_result,
+ int op_flags, const ZTracer::Trace &parent_trace);
+
+protected:
+ int clip_request() override;
+
+ void send_request() override;
+ void send_image_cache_request() override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_READ;
+ }
+ const char *get_request_type() const override {
+ return "aio_read";
+ }
+private:
+ // op flags supplied at construction
+ int m_op_flags;
+};
+
+// Common base for the modifying requests (write, discard, write-same,
+// compare-and-write). Implements send_request() and leaves object-request
+// creation, journal-event creation and statistics to the subclasses.
+template <typename ImageCtxT = ImageCtx>
+class AbstractImageWriteRequest : public ImageRequest<ImageCtxT> {
+public:
+ // marks the request as synchronous (m_synchronous starts out false)
+ inline void flag_synchronous() {
+ m_synchronous = true;
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+ using typename ImageRequest<ImageCtxT>::Extents;
+
+ typedef std::vector<ObjectExtent> ObjectExtents;
+
+ AbstractImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, const char *trace_name,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<ImageCtxT>(image_ctx, aio_comp, std::move(image_extents),
+ trace_name, parent_trace),
+ m_synchronous(false) {
+ }
+
+ void send_request() override;
+
+ // hook allowing a subclass to validate or trim the object extents; the
+ // default accepts them unchanged and returns 0
+ virtual int prune_object_extents(ObjectExtents* object_extents) const {
+ return 0;
+ }
+
+ void send_object_requests(const ObjectExtents &object_extents,
+ const ::SnapContext &snapc, uint64_t journal_tid);
+ // builds the object-level dispatch spec for a single object extent
+ virtual ObjectDispatchSpec *create_object_request(
+ const ObjectExtent &object_extent, const ::SnapContext &snapc,
+ uint64_t journal_tid, Context *on_finish) = 0;
+
+ // appends the journal event(s) for this request and returns a journal tid
+ virtual uint64_t append_journal_event(bool synchronous) = 0;
+ // updates the perf counters for an op of `length` bytes
+ virtual void update_stats(size_t length) = 0;
+
+private:
+ bool m_synchronous;
+};
+
+// Image write request; reports AIO_TYPE_WRITE / "aio_write" and uses the
+// trace name "write". Owns the data buffer to be written.
+template <typename ImageCtxT = ImageCtx>
+class ImageWriteRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ using typename ImageRequest<ImageCtxT>::Extents;
+
+ // moves both the extents and the data buffer into the request
+ ImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), "write", parent_trace),
+ m_bl(std::move(bl)), m_op_flags(op_flags) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+ using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_WRITE;
+ }
+ const char *get_request_type() const override {
+ return "aio_write";
+ }
+
+ // presumably gathers the part of m_bl belonging to one object extent into
+ // *bl -- defined in ImageRequest.cc, confirm there
+ void assemble_extent(const ObjectExtent &object_extent, bufferlist *bl);
+
+ void send_image_cache_request() override;
+
+
+ ObjectDispatchSpec *create_object_request(
+ const ObjectExtent &object_extent, const ::SnapContext &snapc,
+ uint64_t journal_tid, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+private:
+ bufferlist m_bl;
+ int m_op_flags;
+};
+
+// Image discard request; reports AIO_TYPE_DISCARD / "aio_discard" and uses
+// the trace name "discard". Carries the discard granularity and overrides
+// prune_object_extents().
+template <typename ImageCtxT = ImageCtx>
+class ImageDiscardRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ ImageDiscardRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents&& image_extents,
+ uint32_t discard_granularity_bytes,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), "discard", parent_trace),
+ m_discard_granularity_bytes(discard_granularity_bytes) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+ using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_DISCARD;
+ }
+ const char *get_request_type() const override {
+ return "aio_discard";
+ }
+
+ void send_image_cache_request() override;
+
+ ObjectDispatchSpec *create_object_request(
+ const ObjectExtent &object_extent, const ::SnapContext &snapc,
+ uint64_t journal_tid, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+ int prune_object_extents(ObjectExtents* object_extents) const override;
+
+private:
+ // granularity, in bytes, supplied by the caller
+ uint32_t m_discard_granularity_bytes;
+};
+
+// Image flush request; reports AIO_TYPE_FLUSH / "aio_flush" and uses the
+// trace name "flush". It carries no image extents, so clipping and the
+// timestamp update are overridden as no-ops.
+template <typename ImageCtxT = ImageCtx>
+class ImageFlushRequest : public ImageRequest<ImageCtxT> {
+public:
+ // the base class is given an empty extent list ({})
+ ImageFlushRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ FlushSource flush_source,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<ImageCtxT>(image_ctx, aio_comp, {}, "flush", parent_trace),
+ m_flush_source(flush_source) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ // nothing to clip: always succeeds
+ int clip_request() override {
+ return 0;
+ }
+ // intentionally empty
+ void update_timestamp() override {
+ }
+ void send_request() override;
+ void send_image_cache_request() override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_FLUSH;
+ }
+ const char *get_request_type() const override {
+ return "aio_flush";
+ }
+
+private:
+ // origin of the flush (e.g. FLUSH_SOURCE_USER), supplied at construction
+ FlushSource m_flush_source;
+
+};
+
+// Image write-same request; reports AIO_TYPE_WRITESAME / "aio_writesame" and
+// uses the trace name "writesame". Owns the data pattern buffer.
+template <typename ImageCtxT = ImageCtx>
+class ImageWriteSameRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ // moves both the extents and the pattern buffer into the request
+ ImageWriteSameRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents&& image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), "writesame",
+ parent_trace),
+ m_data_bl(std::move(bl)), m_op_flags(op_flags) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+ using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_WRITESAME;
+ }
+ const char *get_request_type() const override {
+ return "aio_writesame";
+ }
+
+ void send_image_cache_request() override;
+
+ ObjectDispatchSpec *create_object_request(
+ const ObjectExtent &object_extent, const ::SnapContext &snapc,
+ uint64_t journal_tid, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+private:
+ bufferlist m_data_bl;
+ int m_op_flags;
+};
+
+// Image compare-and-write request; reports AIO_TYPE_COMPARE_AND_WRITE /
+// "aio_compare_and_write" and uses the trace name "compare_and_write". Owns
+// the compare buffer and the write buffer and keeps the caller's
+// mismatch-offset pointer.
+template <typename ImageCtxT = ImageCtx>
+class ImageCompareAndWriteRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+ using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents;
+
+ // moves the extents and both buffers into the request; mismatch_offset is
+ // stored as given and not dereferenced here
+ ImageCompareAndWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), "compare_and_write", parent_trace),
+ m_cmp_bl(std::move(cmp_bl)), m_bl(std::move(bl)),
+ m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+ }
+
+protected:
+ void send_image_cache_request() override;
+
+ // appends the slices of m_bl named by the extent's buffer extents to *bl
+ void assemble_extent(const ObjectExtent &object_extent, bufferlist *bl);
+
+ ObjectDispatchSpec *create_object_request(
+ const ObjectExtent &object_extent, const ::SnapContext &snapc,
+ uint64_t journal_tid, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_COMPARE_AND_WRITE;
+ }
+ const char *get_request_type() const override {
+ return "aio_compare_and_write";
+ }
+
+ // rejects requests spanning several object extents or crossing a sector /
+ // stripe-unit boundary (see ImageRequest.cc)
+ int prune_object_extents(ObjectExtents* object_extents) const override;
+
+private:
+ bufferlist m_cmp_bl;
+ bufferlist m_bl;
+ uint64_t *m_mismatch_offset;
+ int m_op_flags;
+};
+
+} // namespace io
+} // namespace librbd
+
+// The librbd::ImageCtx specialisations are explicitly instantiated in
+// ImageRequest.cc; declaring them extern suppresses implicit instantiation in
+// translation units that include this header.
+extern template class librbd::io::ImageRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageReadRequest<librbd::ImageCtx>;
+extern template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_H
diff --git a/src/librbd/io/ImageRequestWQ.cc b/src/librbd/io/ImageRequestWQ.cc
new file mode 100644
index 00000000..7bdaf2f2
--- /dev/null
+++ b/src/librbd/io/ImageRequestWQ.cc
@@ -0,0 +1,1043 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageRequestWQ.h"
+#include "common/errno.h"
+#include "common/zipkin_trace.h"
+#include "common/Cond.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "common/EventTrace.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageRequestWQ: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+// Issues an internal (FLUSH_SOURCE_INTERNAL) flush against the image and
+// arranges for `on_finish` to be completed through the AIO completion. The
+// dispatch spec is deleted as soon as send() returns.
+template <typename I>
+void flush_image(I& image_ctx, Context* on_finish) {
+  auto completion = AioCompletion::create_and_start(
+    on_finish, util::get_image_ctx(&image_ctx), AIO_TYPE_FLUSH);
+
+  auto flush_spec = ImageDispatchSpec<I>::create_flush_request(
+    image_ctx, completion, FLUSH_SOURCE_INTERNAL, {});
+  flush_spec->send();
+  delete flush_spec;
+}
+
+} // anonymous namespace
+
+// Completion context that relays its result, together with the request it
+// was created for, to ImageRequestWQ::handle_acquire_lock().
+template <typename I>
+struct ImageRequestWQ<I>::C_AcquireLock : public Context {
+  ImageRequestWQ *work_queue;
+  ImageDispatchSpec<I> *image_request;
+
+  C_AcquireLock(ImageRequestWQ *wq, ImageDispatchSpec<I> *request)
+    : work_queue(wq),
+      image_request(request) {
+  }
+
+  void finish(int r) override {
+    work_queue->handle_acquire_lock(r, image_request);
+  }
+};
+
+// Completion context that relays its result to
+// ImageRequestWQ::handle_blocked_writes().
+template <typename I>
+struct ImageRequestWQ<I>::C_BlockedWrites : public Context {
+  ImageRequestWQ *work_queue;
+
+  explicit C_BlockedWrites(ImageRequestWQ *wq)
+    : work_queue(wq) {
+  }
+
+  void finish(int r) override {
+    work_queue->handle_blocked_writes(r);
+  }
+};
+
+// Completion context that relays its result, together with the request it
+// was created for, to ImageRequestWQ::handle_refreshed().
+template <typename I>
+struct ImageRequestWQ<I>::C_RefreshFinish : public Context {
+  ImageRequestWQ *work_queue;
+  ImageDispatchSpec<I> *image_request;
+
+  C_RefreshFinish(ImageRequestWQ *wq, ImageDispatchSpec<I> *request)
+    : work_queue(wq),
+      image_request(request) {
+  }
+
+  void finish(int r) override {
+    work_queue->handle_refreshed(r, image_request);
+  }
+};
+
+// Maps each RBD_QOS_* throttle flag to the name given to its
+// TokenBucketThrottle; the ImageRequestWQ constructor creates one throttle
+// per entry.
+static std::map<uint64_t, std::string> throttle_flags = {
+ { RBD_QOS_IOPS_THROTTLE, "rbd_qos_iops_throttle" },
+ { RBD_QOS_BPS_THROTTLE, "rbd_qos_bps_throttle" },
+ { RBD_QOS_READ_IOPS_THROTTLE, "rbd_qos_read_iops_throttle" },
+ { RBD_QOS_WRITE_IOPS_THROTTLE, "rbd_qos_write_iops_throttle" },
+ { RBD_QOS_READ_BPS_THROTTLE, "rbd_qos_read_bps_throttle" },
+ { RBD_QOS_WRITE_BPS_THROTTLE, "rbd_qos_write_bps_throttle" }
+};
+
+// Constructs the work queue for `image_ctx`: creates one TokenBucketThrottle
+// per entry of throttle_flags (all created with 0, 0 as their initial
+// numeric arguments and sharing the timer obtained from
+// ImageCtx::get_timer_instance()) and registers the queue with the thread
+// pool.
+//
+// @param image_ctx image this queue serves; must not be null
+// @param name      work-queue name passed to the PointerWQ base
+// @param ti        timeout passed to the PointerWQ base
+// @param tp        thread pool the queue is attached to
+template <typename I>
+ImageRequestWQ<I>::ImageRequestWQ(I *image_ctx, const string &name,
+                                  time_t ti, ThreadPool *tp)
+  : ThreadPool::PointerWQ<ImageDispatchSpec<I> >(name, ti, 0, tp),
+    m_image_ctx(*image_ctx),
+    m_lock(util::unique_lock_name("ImageRequestWQ<I>::m_lock", this)) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+
+  SafeTimer *timer;
+  Mutex *timer_lock;
+  ImageCtx::get_timer_instance(cct, &timer, &timer_lock);
+
+  // iterate by const reference: each entry holds a std::string, so iterating
+  // by value would copy the name on every pass
+  for (const auto &flag : throttle_flags) {
+    m_throttles.push_back(make_pair(
+      flag.first,
+      new TokenBucketThrottle(cct, flag.second, 0, 0, timer, timer_lock)));
+  }
+
+  this->register_work_queue();
+}
+
+// Releases the throttles allocated by the constructor.
+template <typename I>
+ImageRequestWQ<I>::~ImageRequestWQ() {
+  for (auto &throttle_entry : m_throttles) {
+    delete throttle_entry.second;
+  }
+}
+
+// Blocking read: issues aio_read() (native_async = false) and returns the
+// result the completion waits for.
+template <typename I>
+ssize_t ImageRequestWQ<I>::read(uint64_t off, uint64_t len,
+                                ReadResult &&read_result, int op_flags) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+
+  C_SaferCond waiter;
+  AioCompletion *aio_comp = AioCompletion::create(&waiter);
+  aio_read(aio_comp, off, len, std::move(read_result), op_flags, false);
+
+  return waiter.wait();
+}
+
+// Blocking write: validates the range with clip_io() under snap_lock, issues
+// aio_write() (native_async = false) and waits for it. Returns a negative
+// error code on failure, otherwise `len` as left by clip_io().
+template <typename I>
+ssize_t ImageRequestWQ<I>::write(uint64_t off, uint64_t len,
+                                 bufferlist &&bl, int op_flags) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+
+  int r;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
+  }
+  if (r < 0) {
+    lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  C_SaferCond waiter;
+  AioCompletion *aio_comp = AioCompletion::create(&waiter);
+  aio_write(aio_comp, off, len, std::move(bl), op_flags, false);
+
+  r = waiter.wait();
+  if (r < 0) {
+    return r;
+  }
+  return len;
+}
+
+// Blocking discard: validates the range with clip_io() under snap_lock,
+// issues aio_discard() (native_async = false) and waits for it. Returns a
+// negative error code on failure, otherwise `len` as left by clip_io().
+template <typename I>
+ssize_t ImageRequestWQ<I>::discard(uint64_t off, uint64_t len,
+                                   uint32_t discard_granularity_bytes) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+
+  int r;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
+  }
+  if (r < 0) {
+    lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  C_SaferCond waiter;
+  AioCompletion *aio_comp = AioCompletion::create(&waiter);
+  aio_discard(aio_comp, off, len, discard_granularity_bytes, false);
+
+  r = waiter.wait();
+  if (r < 0) {
+    return r;
+  }
+  return len;
+}
+
+// Blocking write-same: validates the range with clip_io() under snap_lock,
+// issues aio_writesame() (native_async = false) and waits for it. Returns a
+// negative error code on failure, otherwise `len` as left by clip_io().
+template <typename I>
+ssize_t ImageRequestWQ<I>::writesame(uint64_t off, uint64_t len,
+                                     bufferlist &&bl, int op_flags) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << ", data_len " << bl.length() << dendl;
+
+  int r;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
+  }
+  if (r < 0) {
+    lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  C_SaferCond waiter;
+  AioCompletion *aio_comp = AioCompletion::create(&waiter);
+  aio_writesame(aio_comp, off, len, std::move(bl), op_flags, false);
+
+  r = waiter.wait();
+  if (r < 0) {
+    return r;
+  }
+  return len;
+}
+
+// Blocking write-zeroes: validates the range with clip_io() under snap_lock,
+// issues aio_write_zeroes() (native_async = false) and waits for it. Returns
+// a negative error code on failure, otherwise `len` as left by clip_io().
+template <typename I>
+ssize_t ImageRequestWQ<I>::write_zeroes(uint64_t off, uint64_t len,
+                                        int zero_flags, int op_flags) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+
+  int r;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
+  }
+  if (r < 0) {
+    lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  C_SaferCond waiter;
+  auto completion = io::AioCompletion::create(&waiter);
+  aio_write_zeroes(completion, off, len, zero_flags, op_flags, false);
+
+  r = waiter.wait();
+  if (r < 0) {
+    return r;
+  }
+  return len;
+}
+
+// Blocking compare-and-write: validates the range with clip_io() under
+// snap_lock, issues aio_compare_and_write() (native_async = false) and waits
+// for it. Returns a negative error code on failure, otherwise `len` as left
+// by clip_io().
+template <typename I>
+ssize_t ImageRequestWQ<I>::compare_and_write(uint64_t off, uint64_t len,
+                                             bufferlist &&cmp_bl,
+                                             bufferlist &&bl,
+                                             uint64_t *mismatch_off,
+                                             int op_flags){
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "compare_and_write ictx=" << &m_image_ctx << ", off="
+                 << off << ", " << "len = " << len << dendl;
+
+  int r;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
+  }
+  if (r < 0) {
+    lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  C_SaferCond waiter;
+  AioCompletion *aio_comp = AioCompletion::create(&waiter);
+  aio_compare_and_write(aio_comp, off, len, std::move(cmp_bl), std::move(bl),
+                        mismatch_off, op_flags, false);
+
+  r = waiter.wait();
+  if (r < 0) {
+    return r;
+  }
+  return len;
+}
+
+// Blocking flush: issues aio_flush() (native_async = false) and waits for it.
+// Returns the negative error code on failure and 0 otherwise.
+template <typename I>
+int ImageRequestWQ<I>::flush() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << dendl;
+
+  C_SaferCond waiter;
+  AioCompletion *aio_comp = AioCompletion::create(&waiter);
+  aio_flush(aio_comp, false);
+
+  const int r = waiter.wait();
+  return (r < 0) ? r : 0;
+}
+
+// Asynchronous read. After start_in_flight_io() accepts the completion, the
+// read is queued on the work queue when non-blocking AIO is configured,
+// writes are blocked, writes are pending, or the lock is required for reads;
+// otherwise it is issued directly through ImageRequest<I>::aio_read().
+template <typename I>
+void ImageRequestWQ<I>::aio_read(AioCompletion *c, uint64_t off, uint64_t len,
+                                 ReadResult &&read_result, int op_flags,
+                                 bool native_async) {
+  CephContext *cct = m_image_ctx.cct;
+  FUNCTRACE(cct);
+
+  ZTracer::Trace trace;
+  if (m_image_ctx.blkin_trace_all) {
+    trace.init("wq: read", &m_image_ctx.trace_endpoint);
+    trace.event("start");
+  }
+
+  c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_READ);
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << ", off=" << off << ", "
+                 << "len=" << len << ", " << "flags=" << op_flags << dendl;
+
+  const bool notify_event_socket =
+    native_async && m_image_ctx.event_socket.is_valid();
+  if (notify_event_socket) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_io(c)) {
+    return;
+  }
+
+  // if journaling is enabled -- we need to replay the journal because
+  // it might contain an uncommitted write
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  const bool must_queue =
+    m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty() ||
+    require_lock_on_read();
+  if (!must_queue) {
+    c->start_op();
+    ImageRequest<I>::aio_read(&m_image_ctx, c, {{off, len}},
+                              std::move(read_result), op_flags, trace);
+    finish_in_flight_io();
+  } else {
+    queue(ImageDispatchSpec<I>::create_read_request(
+            m_image_ctx, c, {{off, len}}, std::move(read_result), op_flags,
+            trace));
+  }
+  trace.event("finish");
+}
+
+// Asynchronous write. After start_in_flight_io() accepts the completion, the
+// write is queued on the work queue when non-blocking AIO is configured or
+// writes are blocked; otherwise it is issued directly through
+// ImageRequest<I>::aio_write().
+template <typename I>
+void ImageRequestWQ<I>::aio_write(AioCompletion *c, uint64_t off, uint64_t len,
+                                  bufferlist &&bl, int op_flags,
+                                  bool native_async) {
+  CephContext *cct = m_image_ctx.cct;
+  FUNCTRACE(cct);
+
+  ZTracer::Trace trace;
+  if (m_image_ctx.blkin_trace_all) {
+    trace.init("wq: write", &m_image_ctx.trace_endpoint);
+    trace.event("init");
+  }
+
+  c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_WRITE);
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << ", off=" << off << ", "
+                 << "len=" << len << ", flags=" << op_flags << dendl;
+
+  const bool notify_event_socket =
+    native_async && m_image_ctx.event_socket.is_valid();
+  if (notify_event_socket) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_io(c)) {
+    return;
+  }
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  const bool must_queue = m_image_ctx.non_blocking_aio || writes_blocked();
+  if (!must_queue) {
+    c->start_op();
+    ImageRequest<I>::aio_write(&m_image_ctx, c, {{off, len}},
+                               std::move(bl), op_flags, trace);
+    finish_in_flight_io();
+  } else {
+    queue(ImageDispatchSpec<I>::create_write_request(
+            m_image_ctx, c, {{off, len}}, std::move(bl), op_flags, trace));
+  }
+  trace.event("finish");
+}
+
+// Asynchronous discard. After start_in_flight_io() accepts the completion,
+// the discard is queued on the work queue when non-blocking AIO is configured
+// or writes are blocked; otherwise it is issued directly through
+// ImageRequest<I>::aio_discard().
+template <typename I>
+void ImageRequestWQ<I>::aio_discard(AioCompletion *c, uint64_t off,
+                                    uint64_t len,
+                                    uint32_t discard_granularity_bytes,
+                                    bool native_async) {
+  CephContext *cct = m_image_ctx.cct;
+  FUNCTRACE(cct);
+
+  ZTracer::Trace trace;
+  if (m_image_ctx.blkin_trace_all) {
+    trace.init("wq: discard", &m_image_ctx.trace_endpoint);
+    trace.event("init");
+  }
+
+  c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_DISCARD);
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << ", off=" << off << ", len=" << len
+                 << dendl;
+
+  const bool notify_event_socket =
+    native_async && m_image_ctx.event_socket.is_valid();
+  if (notify_event_socket) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_io(c)) {
+    return;
+  }
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  const bool must_queue = m_image_ctx.non_blocking_aio || writes_blocked();
+  if (!must_queue) {
+    c->start_op();
+    ImageRequest<I>::aio_discard(&m_image_ctx, c, {{off, len}},
+                                 discard_granularity_bytes, trace);
+    finish_in_flight_io();
+  } else {
+    queue(ImageDispatchSpec<I>::create_discard_request(
+            m_image_ctx, c, off, len, discard_granularity_bytes, trace));
+  }
+  trace.event("finish");
+}
+
+// Asynchronous user flush (FLUSH_SOURCE_USER). After start_in_flight_io()
+// accepts the completion, the flush is queued on the work queue when
+// non-blocking AIO is configured, writes are blocked, or writes are pending;
+// otherwise it is issued directly through ImageRequest<I>::aio_flush().
+template <typename I>
+void ImageRequestWQ<I>::aio_flush(AioCompletion *c, bool native_async) {
+  CephContext *cct = m_image_ctx.cct;
+  FUNCTRACE(cct);
+
+  ZTracer::Trace trace;
+  if (m_image_ctx.blkin_trace_all) {
+    trace.init("wq: flush", &m_image_ctx.trace_endpoint);
+    trace.event("init");
+  }
+
+  c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_FLUSH);
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << dendl;
+
+  const bool notify_event_socket =
+    native_async && m_image_ctx.event_socket.is_valid();
+  if (notify_event_socket) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_io(c)) {
+    return;
+  }
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  const bool must_queue =
+    m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty();
+  if (!must_queue) {
+    c->start_op();
+    ImageRequest<I>::aio_flush(&m_image_ctx, c, FLUSH_SOURCE_USER, trace);
+    finish_in_flight_io();
+  } else {
+    queue(ImageDispatchSpec<I>::create_flush_request(
+            m_image_ctx, c, FLUSH_SOURCE_USER, trace));
+  }
+  trace.event("finish");
+}
+
+// Asynchronous write-same. After start_in_flight_io() accepts the completion,
+// the request is queued on the work queue when non-blocking AIO is configured
+// or writes are blocked; otherwise it is issued directly through
+// ImageRequest<I>::aio_writesame().
+template <typename I>
+void ImageRequestWQ<I>::aio_writesame(AioCompletion *c, uint64_t off,
+                                      uint64_t len, bufferlist &&bl,
+                                      int op_flags, bool native_async) {
+  CephContext *cct = m_image_ctx.cct;
+  FUNCTRACE(cct);
+
+  ZTracer::Trace trace;
+  if (m_image_ctx.blkin_trace_all) {
+    trace.init("wq: writesame", &m_image_ctx.trace_endpoint);
+    trace.event("init");
+  }
+
+  c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_WRITESAME);
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << ", off=" << off << ", "
+                 << "len=" << len << ", data_len = " << bl.length() << ", "
+                 << "flags=" << op_flags << dendl;
+
+  const bool notify_event_socket =
+    native_async && m_image_ctx.event_socket.is_valid();
+  if (notify_event_socket) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_io(c)) {
+    return;
+  }
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  const bool must_queue = m_image_ctx.non_blocking_aio || writes_blocked();
+  if (!must_queue) {
+    c->start_op();
+    ImageRequest<I>::aio_writesame(&m_image_ctx, c, {{off, len}}, std::move(bl),
+                                   op_flags, trace);
+    finish_in_flight_io();
+  } else {
+    queue(ImageDispatchSpec<I>::create_write_same_request(
+            m_image_ctx, c, off, len, std::move(bl), op_flags, trace));
+  }
+  trace.event("finish");
+}
+
+
+// Asynchronous write-zeroes, implemented as a discard with a granularity of
+// zero bytes. Any non-zero `zero_flags` fails the completion with -EINVAL
+// before the I/O is registered as in flight. Otherwise the request is queued
+// when non-blocking AIO is configured or writes are blocked, and issued
+// directly through ImageRequest<I>::aio_discard() in all other cases.
+template <typename I>
+void ImageRequestWQ<I>::aio_write_zeroes(io::AioCompletion *aio_comp,
+                                         uint64_t off, uint64_t len,
+                                         int zero_flags, int op_flags,
+                                         bool native_async) {
+  auto cct = m_image_ctx.cct;
+  FUNCTRACE(cct);
+
+  ZTracer::Trace trace;
+  if (m_image_ctx.blkin_trace_all) {
+    trace.init("io: write_zeroes", &m_image_ctx.trace_endpoint);
+    trace.event("init");
+  }
+
+  aio_comp->init_time(util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_DISCARD);
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+                 << "completion=" << aio_comp << ", off=" << off << ", "
+                 << "len=" << len << dendl;
+
+  const bool notify_event_socket =
+    native_async && m_image_ctx.event_socket.is_valid();
+  if (notify_event_socket) {
+    aio_comp->set_event_notify(true);
+  }
+
+  // validate the supported flags
+  if (zero_flags != 0U) {
+    aio_comp->fail(-EINVAL);
+    return;
+  }
+
+  if (!start_in_flight_io(aio_comp)) {
+    return;
+  }
+
+  // enable partial discard (zeroing) of objects
+  uint32_t granularity_bytes = 0;
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  const bool must_queue = m_image_ctx.non_blocking_aio || writes_blocked();
+  if (!must_queue) {
+    aio_comp->start_op();
+    ImageRequest<I>::aio_discard(&m_image_ctx, aio_comp, {{off, len}},
+                                 granularity_bytes, trace);
+    finish_in_flight_io();
+  } else {
+    queue(ImageDispatchSpec<I>::create_discard_request(
+            m_image_ctx, aio_comp, off, len, granularity_bytes, trace));
+  }
+  trace.event("finish");
+}
+
+template <typename I>  // async compare-and-write entry point: registers the IO, then queues it or dispatches it inline
+void ImageRequestWQ<I>::aio_compare_and_write(AioCompletion *c,
+ uint64_t off, uint64_t len,
+ bufferlist &&cmp_bl,
+ bufferlist &&bl,
+ uint64_t *mismatch_off,
+ int op_flags, bool native_async) {
+ CephContext *cct = m_image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (m_image_ctx.blkin_trace_all) {  // the trace is only initialised when blkin_trace_all is set
+ trace.init("wq: compare_and_write", &m_image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_COMPARE_AND_WRITE);
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+ << "completion=" << c << ", off=" << off << ", "
+ << "len=" << len << dendl;
+
+ if (native_async && m_image_ctx.event_socket.is_valid()) {
+ c->set_event_notify(true);
+ }
+
+ if (!start_in_flight_io(c)) {  // on failure start_in_flight_io() has already failed c
+ return;
+ }
+
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ if (m_image_ctx.non_blocking_aio || writes_blocked()) {  // defer to the work queue
+ queue(ImageDispatchSpec<I>::create_compare_and_write_request(
+ m_image_ctx, c, {{off, len}}, std::move(cmp_bl), std::move(bl),
+ mismatch_off, op_flags, trace));
+ } else {  // otherwise issue the request directly from the calling thread
+ c->start_op();
+ ImageRequest<I>::aio_compare_and_write(&m_image_ctx, c, {{off, len}},
+ std::move(cmp_bl), std::move(bl),
+ mismatch_off, op_flags, trace);
+ finish_in_flight_io();  // pairs with start_in_flight_io() above on the inline path
+ }
+ trace.event("finish");
+}
+
+template <typename I>  // marks the queue shut down; on_shutdown goes to flush_image() once no IO is in flight
+void ImageRequestWQ<I>::shut_down(Context *on_shutdown) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+ {
+ RWLock::WLocker locker(m_lock);
+ ceph_assert(!m_shutdown);  // shut_down() may only be called once
+ m_shutdown = true;
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << __func__ << ": in_flight=" << m_in_flight_ios.load()
+ << dendl;
+ if (m_in_flight_ios > 0) {  // defer: finish_in_flight_io() flushes with m_on_shutdown when the count reaches 0
+ m_on_shutdown = on_shutdown;
+ return;
+ }
+ }
+
+ // ensure that all in-flight IO is flushed
+ flush_image(m_image_ctx, on_shutdown);
+}
+
+template <typename I>  // waiting variant: hands a C_SaferCond to the async overload and returns its wait() result
+int ImageRequestWQ<I>::block_writes() {
+ C_SaferCond blocked_cond;
+ block_writes(&blocked_cond);
+ return blocked_cond.wait();
+}
+
+template <typename I>  // registers one more write blocker; on_blocked is passed to flush_image() now or parked for later
+void ImageRequestWQ<I>::block_writes(Context *on_blocked) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+ CephContext *cct = m_image_ctx.cct;
+
+ {
+ RWLock::WLocker locker(m_lock);
+ ++m_write_blockers;
+ ldout(cct, 5) << &m_image_ctx << ", " << "num="
+ << m_write_blockers << dendl;
+ if (!m_write_blocker_contexts.empty() || m_in_flight_writes > 0) {  // earlier waiters or writes still in flight
+ m_write_blocker_contexts.push_back(on_blocked);  // parked here; handle_blocked_writes() completes the list
+ return;
+ }
+ }
+
+ // ensure that all in-flight IO is flushed
+ flush_image(m_image_ctx, on_blocked);
+}
+
+template <typename I>  // drops one write blocker; when the last one goes, completes the waiters and signals the queue
+void ImageRequestWQ<I>::unblock_writes() {
+ CephContext *cct = m_image_ctx.cct;
+
+ bool wake_up = false;
+ Contexts waiter_contexts;
+ {
+ RWLock::WLocker locker(m_lock);
+ ceph_assert(m_write_blockers > 0);  // must pair with an earlier block_writes()
+ --m_write_blockers;
+
+ ldout(cct, 5) << &m_image_ctx << ", " << "num="
+ << m_write_blockers << dendl;
+ if (m_write_blockers == 0) {
+ wake_up = true;
+ std::swap(waiter_contexts, m_unblocked_write_waiter_contexts);  // take the waiters so they complete outside m_lock
+ }
+ }
+
+ if (wake_up) {
+ for (auto ctx : waiter_contexts) {  // contexts registered by wait_on_writes_unblocked()
+ ctx->complete(0);
+ }
+ this->signal();
+ }
+}
+
+template <typename I>  // completes on_unblocked with 0 immediately, or parks it until unblock_writes() drops the last blocker
+void ImageRequestWQ<I>::wait_on_writes_unblocked(Context *on_unblocked) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+ CephContext *cct = m_image_ctx.cct;
+
+ {
+ RWLock::WLocker locker(m_lock);
+ ldout(cct, 20) << &m_image_ctx << ", " << "write_blockers="
+ << m_write_blockers << dendl;
+ if (!m_unblocked_write_waiter_contexts.empty() || m_write_blockers > 0) {  // also queues behind existing waiters
+ m_unblocked_write_waiter_contexts.push_back(on_unblocked);
+ return;
+ }
+ }
+
+ on_unblocked->complete(0);  // nothing is blocking: complete outside m_lock
+}
+
+template <typename I>  // sets the require-lock flag(s) for the given direction and signals the queue if any flag changed
+void ImageRequestWQ<I>::set_require_lock(Direction direction, bool enabled) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ bool wake_up = false;
+ {
+ RWLock::WLocker locker(m_lock);
+ switch (direction) {  // no default case: other Direction values leave both flags untouched
+ case DIRECTION_READ:
+ wake_up = (enabled != m_require_lock_on_read);
+ m_require_lock_on_read = enabled;
+ break;
+ case DIRECTION_WRITE:
+ wake_up = (enabled != m_require_lock_on_write);
+ m_require_lock_on_write = enabled;
+ break;
+ case DIRECTION_BOTH:
+ wake_up = (enabled != m_require_lock_on_read ||
+ enabled != m_require_lock_on_write);  // true if either flag is about to change
+ m_require_lock_on_read = enabled;
+ m_require_lock_on_write = enabled;
+ break;
+ }
+ }
+
+ // wake up the thread pool whenever the state changes so that
+ // we can re-request the lock if required
+ if (wake_up) {
+ this->signal();
+ }
+}
+
+template <typename I>  // forwards the minimum schedule tick to every throttle in m_throttles
+void ImageRequestWQ<I>::apply_qos_schedule_tick_min(uint64_t tick) {
+ for (auto it = m_throttles.begin(); it != m_throttles.end(); ++it) {
+ it->second->set_schedule_tick_min(tick);
+ }
+}
+
+template <typename I>  // applies limit/burst to the throttle registered for flag and updates the enabled mask
+void ImageRequestWQ<I>::apply_qos_limit(const uint64_t flag,
+ uint64_t limit,
+ uint64_t burst) {
+ TokenBucketThrottle *throttle = nullptr;
+ for (auto it = m_throttles.begin(); it != m_throttles.end(); ++it) {
+ if (it->first == flag) {
+ throttle = it->second;
+ break;
+ }
+ }
+ ceph_assert(throttle != nullptr);
+
+ if (throttle->set_limit(limit, burst) < 0) {
+ CephContext *cct = m_image_ctx.cct;
+ lderr(cct) << throttle->get_name() << ": invalid qos parameter: "
+ << "burst(" << burst << ") is less than "
+ << "limit(" << limit << ")" << dendl;
+ // set_limit() refused the pair: retry with burst 0 so the limit still applies
+ throttle->set_limit(limit, 0);
+ }
+
+ if (limit != 0) {
+ m_qos_enabled_flag |= flag;
+ } else {
+ m_qos_enabled_flag &= ~flag;
+ }
+}
+
+template <typename I>  // throttle callback (registered in needs_throttle()): marks flag satisfied and requeues when all are
+void ImageRequestWQ<I>::handle_throttle_ready(int r, ImageDispatchSpec<I> *item,
+ uint64_t flag) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 15) << "r=" << r << ", " << "req=" << item << dendl;  // r is only logged here
+
+ std::lock_guard pool_locker{this->get_pool_lock()};
+ ceph_assert(m_io_throttled.load() > 0);  // _void_dequeue() incremented this when it parked the item
+ item->set_throttled(flag);
+ if (item->were_all_throttled()) {  // requeue only once every throttle flag has been set on the item
+ this->requeue_back(pool_locker, item);
+ --m_io_throttled;
+ this->signal(pool_locker);
+ }
+}
+
+template <typename I>  // returns true if at least one enabled throttle deferred the item (callback pending)
+bool ImageRequestWQ<I>::needs_throttle(ImageDispatchSpec<I> *item) {
+ uint64_t tokens = 0;
+ uint64_t flag = 0;
+ bool blocked = false;
+ TokenBucketThrottle* throttle = nullptr;
+
+ for (auto t : m_throttles) {  // t is a (flag, throttle) pair
+ flag = t.first;
+ if (item->was_throttled(flag))  // this flag is already marked on the item
+ continue;
+
+ if (!(m_qos_enabled_flag & flag)) {  // throttle disabled: mark the flag as done and move on
+ item->set_throttled(flag);
+ continue;
+ }
+
+ throttle = t.second;
+ if (item->tokens_requested(flag, &tokens) &&
+ throttle->get<ImageRequestWQ<I>, ImageDispatchSpec<I>,
+ &ImageRequestWQ<I>::handle_throttle_ready>(
+ tokens, this, item, flag)) {  // get() returning true is treated as "deferred"
+ blocked = true;
+ } else {
+ item->set_throttled(flag);
+ }
+ }
+ return blocked;
+}
+
+template <typename I>  // dequeue hook: returns the next request to process, or nullptr if nothing can run right now
+void *ImageRequestWQ<I>::_void_dequeue() {  // unlocks/relocks the pool lock below, so it is evidently held on entry
+ CephContext *cct = m_image_ctx.cct;
+ ImageDispatchSpec<I> *peek_item = this->front();
+
+ // no queued IO requests or all IO is blocked/stalled
+ if (peek_item == nullptr || m_io_blockers.load() > 0) {
+ return nullptr;
+ }
+
+ if (needs_throttle(peek_item)) {  // a throttle callback is pending; handle_throttle_ready() requeues the item
+ ldout(cct, 15) << "throttling IO " << peek_item << dendl;
+
+ ++m_io_throttled;  // keeps _empty() false while the item is parked off-queue
+ // dequeue the throttled item
+ ThreadPool::PointerWQ<ImageDispatchSpec<I> >::_void_dequeue();
+ return nullptr;
+ }
+
+ bool lock_required;
+ bool refresh_required = m_image_ctx.state->is_refresh_required();
+ {
+ RWLock::RLocker locker(m_lock);
+ bool write_op = peek_item->is_write_op();
+ lock_required = is_lock_required(write_op);
+ if (write_op) {
+ if (!lock_required && m_write_blockers > 0) {  // writes are blocked: leave the item at the front of the queue
+ // missing lock is not the write blocker
+ return nullptr;
+ }
+
+ if (!lock_required && !refresh_required) {  // only count writes that will be returned for processing below
+ // completed ops will requeue the IO -- don't count it as in-progress
+ m_in_flight_writes++;
+ }
+ }
+ }
+
+ auto item = reinterpret_cast<ImageDispatchSpec<I> *>(
+ ThreadPool::PointerWQ<ImageDispatchSpec<I> >::_void_dequeue());
+ ceph_assert(peek_item == item);  // the item inspected above must be the one actually removed
+
+ if (lock_required) {
+ this->get_pool_lock().unlock();  // pool lock is released while owner_lock is held
+ m_image_ctx.owner_lock.get_read();
+ if (m_image_ctx.exclusive_lock != nullptr) {
+ ldout(cct, 5) << "exclusive lock required: delaying IO " << item << dendl;
+ if (!m_image_ctx.get_exclusive_lock_policy()->may_auto_request_lock()) {
+ lderr(cct) << "op requires exclusive lock" << dendl;
+ fail_in_flight_io(m_image_ctx.exclusive_lock->get_unlocked_op_error(),
+ item);  // fail_in_flight_io() deletes item
+
+ // wake up the IO since we won't be returning a request to process
+ this->signal();
+ } else {
+ // stall IO until the acquire completes
+ ++m_io_blockers;  // decremented again in handle_acquire_lock()
+ m_image_ctx.exclusive_lock->acquire_lock(new C_AcquireLock(this, item));
+ }
+ } else {
+ // raced with the exclusive lock being disabled
+ lock_required = false;  // NOTE(review): a write reaching here was not added to m_in_flight_writes, yet process() calls finish_in_flight_write() -- confirm
+ }
+ m_image_ctx.owner_lock.put_read();
+ this->get_pool_lock().lock();  // re-take the pool lock before returning
+
+ if (lock_required) {  // the item was failed or handed to C_AcquireLock above
+ return nullptr;
+ }
+ }
+
+ if (refresh_required) {
+ ldout(cct, 5) << "image refresh required: delaying IO " << item << dendl;
+
+ // stall IO until the refresh completes
+ ++m_io_blockers;  // decremented again in handle_refreshed()
+
+ this->get_pool_lock().unlock();  // pool lock is released around the refresh() call
+ m_image_ctx.state->refresh(new C_RefreshFinish(this, item));
+ this->get_pool_lock().lock();
+ return nullptr;
+ }
+
+ item->start_op();
+ return item;
+}
+
+template <typename I>  // sends one dequeued request, settles the queue counters and deletes the request
+void ImageRequestWQ<I>::process(ImageDispatchSpec<I> *req) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+ << "req=" << req << dendl;
+
+ req->send();
+
+ finish_queued_io(req);  // undo the queued read/write count taken in queue()
+ if (req->is_write_op()) {
+ finish_in_flight_write();  // pairs with the m_in_flight_writes++ in _void_dequeue()
+ }
+ delete req;  // req must not be used past this point
+
+ finish_in_flight_io();  // pairs with start_in_flight_io() from the aio_* entry point
+}
+
+template <typename I>  // drops the queued read/write count that queue() took for this request
+void ImageRequestWQ<I>::finish_queued_io(ImageDispatchSpec<I> *req) {
+ RWLock::RLocker shared_guard(m_lock);
+ if (!req->is_write_op()) {
+ ceph_assert(m_queued_reads > 0);
+ --m_queued_reads;
+ } else {
+ ceph_assert(m_queued_writes > 0);
+ --m_queued_writes;
+ }
+}
+
+template <typename I>  // drops one in-flight write; the last one triggers a flush if block_writes() callers are waiting
+void ImageRequestWQ<I>::finish_in_flight_write() {
+ bool writes_blocked = false;  // local flag; shadows the writes_blocked() member function inside this scope
+ {
+ RWLock::RLocker locker(m_lock);  // read lock only: m_in_flight_writes is a std::atomic
+ ceph_assert(m_in_flight_writes > 0);
+ if (--m_in_flight_writes == 0 &&
+ !m_write_blocker_contexts.empty()) {  // last write done and block_writes() waiters are parked
+ writes_blocked = true;
+ }
+ }
+
+ if (writes_blocked) {
+ flush_image(m_image_ctx, new C_BlockedWrites(this));  // presumably ends in handle_blocked_writes() -- C_BlockedWrites is defined elsewhere
+ }
+}
+
+template <typename I>  // registers a new in-flight IO; on shutdown or an invalid data_ctx it fails c and returns false
+int ImageRequestWQ<I>::start_in_flight_io(AioCompletion *c) {  // NOTE(review): declared int but only ever returns true/false
+ RWLock::RLocker locker(m_lock);
+
+ if (m_shutdown) {
+ CephContext *cct = m_image_ctx.cct;
+ lderr(cct) << "IO received on closed image" << dendl;
+
+ c->get();  // reference is taken before fail(); presumably fail() releases one -- confirm in AioCompletion
+ c->fail(-ESHUTDOWN);
+ return false;
+ }
+
+ if (!m_image_ctx.data_ctx.is_valid()) {
+ CephContext *cct = m_image_ctx.cct;
+ lderr(cct) << "missing data pool" << dendl;
+
+ c->get();  // same get()-then-fail() pairing as the shutdown path above
+ c->fail(-ENODEV);
+ return false;
+ }
+
+ m_in_flight_ios++;  // balanced by finish_in_flight_io()
+ return true;
+}
+
+template <typename I>  // drops one in-flight IO; the last one after shut_down() flushes with the saved shutdown context
+void ImageRequestWQ<I>::finish_in_flight_io() {
+ Context *on_shutdown;
+ {
+ RWLock::RLocker locker(m_lock);
+ if (--m_in_flight_ios > 0 || !m_shutdown) {  // the decrement always runs; continue only for the last IO after shutdown
+ return;
+ }
+ on_shutdown = m_on_shutdown;  // stored by shut_down() when IO was still in flight
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "completing shut down" << dendl;
+
+ ceph_assert(on_shutdown != nullptr);
+ flush_image(m_image_ctx, on_shutdown);
+}
+
+template <typename I>  // fails a request that was dequeued but never sent, then settles counters and deletes it
+void ImageRequestWQ<I>::fail_in_flight_io(
+ int r, ImageDispatchSpec<I> *req) {
+ this->process_finish();  // presumably tells the base work queue this item is done -- defined outside this file
+ req->fail(r);
+ finish_queued_io(req);  // undo the queued read/write count taken in queue()
+ delete req;  // req must not be used past this point
+ finish_in_flight_io();  // no finish_in_flight_write(): the three call sites fail requests that were never counted
+}
+
+template <typename I>  // reports whether the given kind of op currently needs the exclusive lock; m_lock must be held
+bool ImageRequestWQ<I>::is_lock_required(bool write_op) const {
+ ceph_assert(m_lock.is_locked());
+ // writes consult the write flag, everything else consults the read flag
+ return write_op ? m_require_lock_on_write : m_require_lock_on_read;
+}
+
+template <typename I>  // counts the request as a queued read or write and hands it to the base work queue
+void ImageRequestWQ<I>::queue(ImageDispatchSpec<I> *req) {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+ << "req=" << req << dendl;
+
+ if (req->is_write_op()) {
+ m_queued_writes++;  // std::atomic counter, updated here without taking m_lock
+ } else {
+ m_queued_reads++;
+ }
+
+ ThreadPool::PointerWQ<ImageDispatchSpec<I> >::queue(req);
+}
+
+template <typename I>  // runs after the acquire_lock() issued in _void_dequeue(): fails or requeues req, then unstalls IO
+void ImageRequestWQ<I>::handle_acquire_lock(
+ int r, ImageDispatchSpec<I> *req) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << ", " << "req=" << req << dendl;
+
+ if (r < 0) {
+ fail_in_flight_io(r, req);  // deletes req
+ } else {
+ // since IO was stalled for acquire -- original IO order is preserved
+ // if we requeue this op for work queue processing
+ this->requeue_front(req);
+ }
+
+ ceph_assert(m_io_blockers.load() > 0);
+ --m_io_blockers;  // pairs with the ++m_io_blockers taken before acquire_lock()
+ this->signal();
+}
+
+template <typename I>  // runs after the refresh() issued in _void_dequeue(): fails or requeues req, then unstalls IO
+void ImageRequestWQ<I>::handle_refreshed(
+ int r, ImageDispatchSpec<I> *req) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "resuming IO after image refresh: r=" << r << ", "
+ << "req=" << req << dendl;
+ if (r < 0) {
+ fail_in_flight_io(r, req);  // deletes req
+ } else {
+ // since IO was stalled for refresh -- original IO order is preserved
+ // if we requeue this op for work queue processing
+ this->requeue_front(req);
+ }
+
+ ceph_assert(m_io_blockers.load() > 0);
+ --m_io_blockers;  // pairs with the ++m_io_blockers taken before refresh()
+ this->signal();
+}
+
+template <typename I>  // completes every context parked by block_writes() with 0; r itself is not propagated
+void ImageRequestWQ<I>::handle_blocked_writes(int r) {
+ // detach the waiter list under the write lock so the callbacks run unlocked
+ Contexts blocked_waiters;
+ {
+ RWLock::WLocker guard(m_lock);
+ std::swap(blocked_waiters, m_write_blocker_contexts);
+ }
+ for (auto it = blocked_waiters.begin(); it != blocked_waiters.end(); ++it) {
+ (*it)->complete(0);
+ }
+}
+
+template class librbd::io::ImageRequestWQ<librbd::ImageCtx>;
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/ImageRequestWQ.h b/src/librbd/io/ImageRequestWQ.h
new file mode 100644
index 00000000..23dcd48d
--- /dev/null
+++ b/src/librbd/io/ImageRequestWQ.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_REQUEST_WQ_H
+#define CEPH_LIBRBD_IO_IMAGE_REQUEST_WQ_H
+
+#include "include/Context.h"
+#include "common/RWLock.h"
+#include "common/Throttle.h"
+#include "common/WorkQueue.h"
+#include "librbd/io/Types.h"
+
+#include <list>
+#include <atomic>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+class AioCompletion;
+template <typename> class ImageDispatchSpec;
+class ReadResult;
+
+template <typename ImageCtxT = librbd::ImageCtx>  // per-image IO work queue built on ThreadPool::PointerWQ
+class ImageRequestWQ
+ : public ThreadPool::PointerWQ<ImageDispatchSpec<ImageCtxT> > {
+public:
+ ImageRequestWQ(ImageCtxT *image_ctx, const std::string &name, time_t ti,
+ ThreadPool *tp);
+ ~ImageRequestWQ();
+
+ ssize_t read(uint64_t off, uint64_t len, ReadResult &&read_result,
+ int op_flags);  // non-aio API; presumably the blocking counterparts of aio_* (defined in the .cc)
+ ssize_t write(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags);
+ ssize_t discard(uint64_t off, uint64_t len,
+ uint32_t discard_granularity_bytes);
+ ssize_t writesame(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags);
+ ssize_t write_zeroes(uint64_t off, uint64_t len, int zero_flags,
+ int op_flags);
+ ssize_t compare_and_write(uint64_t off, uint64_t len,
+ bufferlist &&cmp_bl, bufferlist &&bl,
+ uint64_t *mismatch_off, int op_flags);
+ int flush();
+
+ void aio_read(AioCompletion *c, uint64_t off, uint64_t len,
+ ReadResult &&read_result, int op_flags, bool native_async=true);  // aio_*: result is reported through c
+ void aio_write(AioCompletion *c, uint64_t off, uint64_t len,
+ bufferlist &&bl, int op_flags, bool native_async=true);
+ void aio_discard(AioCompletion *c, uint64_t off, uint64_t len,
+ uint32_t discard_granularity_bytes, bool native_async=true);
+ void aio_flush(AioCompletion *c, bool native_async=true);
+ void aio_writesame(AioCompletion *c, uint64_t off, uint64_t len,
+ bufferlist &&bl, int op_flags, bool native_async=true);
+ void aio_write_zeroes(AioCompletion *c, uint64_t off, uint64_t len,
+ int zero_flags, int op_flags, bool native_async=true);  // default added to match the sibling aio_* methods
+ void aio_compare_and_write(AioCompletion *c, uint64_t off,
+ uint64_t len, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_off,
+ int op_flags, bool native_async=true);
+
+ using ThreadPool::PointerWQ<ImageDispatchSpec<ImageCtxT> >::drain;  // re-export selected base-class members
+ using ThreadPool::PointerWQ<ImageDispatchSpec<ImageCtxT> >::empty;
+
+ void shut_down(Context *on_shutdown);
+
+ inline bool writes_blocked() const {  // true while at least one block_writes() is outstanding
+ RWLock::RLocker locker(m_lock);
+ return (m_write_blockers > 0);
+ }
+
+ int block_writes();  // waits on a C_SaferCond passed to the overload below
+ void block_writes(Context *on_blocked);
+ void unblock_writes();
+
+ void wait_on_writes_unblocked(Context *on_unblocked);
+
+ void set_require_lock(Direction direction, bool enabled);
+
+ void apply_qos_schedule_tick_min(uint64_t tick);
+
+ void apply_qos_limit(const uint64_t flag, uint64_t limit, uint64_t burst);
+protected:
+ void *_void_dequeue() override;
+ void process(ImageDispatchSpec<ImageCtxT> *req) override;
+ bool _empty() override {  // empty only when the base queue is empty AND no item is parked on a throttle
+ return (ThreadPool::PointerWQ<ImageDispatchSpec<ImageCtxT>>::_empty() &&
+ m_io_throttled.load() == 0);
+ }
+
+
+private:
+ typedef std::list<Context *> Contexts;
+
+ struct C_AcquireLock;  // callback contexts, defined in the .cc
+ struct C_BlockedWrites;
+ struct C_RefreshFinish;
+
+ ImageCtxT &m_image_ctx;
+ mutable RWLock m_lock;  // mutable so the const accessors below can take it
+ Contexts m_write_blocker_contexts;  // block_writes() callbacks waiting for in-flight writes to finish
+ uint32_t m_write_blockers = 0;  // number of outstanding block_writes() calls
+ Contexts m_unblocked_write_waiter_contexts;  // wait_on_writes_unblocked() callbacks
+ bool m_require_lock_on_read = false;
+ bool m_require_lock_on_write = false;
+ std::atomic<unsigned> m_queued_reads { 0 };
+ std::atomic<unsigned> m_queued_writes { 0 };
+ std::atomic<unsigned> m_in_flight_ios { 0 };  // IOs between start_in_flight_io() and finish_in_flight_io()
+ std::atomic<unsigned> m_in_flight_writes { 0 };  // writes returned by _void_dequeue() and not yet finished
+ std::atomic<unsigned> m_io_blockers { 0 };  // pending lock acquires/refreshes that stall all dequeues
+ std::atomic<unsigned> m_io_throttled { 0 };  // items removed from the queue while waiting on a throttle
+
+ std::list<std::pair<uint64_t, TokenBucketThrottle*> > m_throttles;  // (QoS flag, throttle) pairs
+ uint64_t m_qos_enabled_flag = 0;  // bitmask of flags whose limit is non-zero
+
+ bool m_shutdown = false;
+ Context *m_on_shutdown = nullptr;  // set by shut_down() when IO is still in flight
+
+ bool is_lock_required(bool write_op) const;  // requires m_lock to be held
+
+ inline bool require_lock_on_read() const {
+ RWLock::RLocker locker(m_lock);
+ return m_require_lock_on_read;
+ }
+ inline bool writes_empty() const {
+ RWLock::RLocker locker(m_lock);
+ return (m_queued_writes == 0);
+ }
+
+ bool needs_throttle(ImageDispatchSpec<ImageCtxT> *item);
+
+ void finish_queued_io(ImageDispatchSpec<ImageCtxT> *req);
+ void finish_in_flight_write();
+
+ int start_in_flight_io(AioCompletion *c);  // NOTE(review): used as a boolean; the definition returns true/false
+ void finish_in_flight_io();
+ void fail_in_flight_io(int r, ImageDispatchSpec<ImageCtxT> *req);
+
+ void queue(ImageDispatchSpec<ImageCtxT> *req);
+
+ void handle_acquire_lock(int r, ImageDispatchSpec<ImageCtxT> *req);
+ void handle_refreshed(int r, ImageDispatchSpec<ImageCtxT> *req);
+ void handle_blocked_writes(int r);
+
+ void handle_throttle_ready(int r, ImageDispatchSpec<ImageCtxT> *item, uint64_t flag);
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ImageRequestWQ<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_WQ_H
diff --git a/src/librbd/io/ObjectDispatch.cc b/src/librbd/io/ObjectDispatch.cc
new file mode 100644
index 00000000..85c8b034
--- /dev/null
+++ b/src/librbd/io/ObjectDispatch.cc
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatch.h"
+#include "common/dout.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/ObjectRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>  // stores the image context pointer; nothing in this class frees it
+ObjectDispatch<I>::ObjectDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx) {
+}
+
+template <typename I>  // this layer has no teardown work: it only queues on_finish on the op work queue
+void ObjectDispatch<I>::shut_down(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ m_image_ctx->op_work_queue->queue(on_finish, 0);  // presumably 0 is the result passed to on_finish -- confirm
+}
+
+template <typename I>  // always handles the read here by creating and sending an ObjectReadRequest
+bool ObjectDispatch<I>::read(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, librados::snap_t snap_id, int op_flags,
+ const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
+ ExtentMap* extent_map, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {  // object_dispatch_flags and on_finish are not used by this body
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectReadRequest<I>(m_image_ctx, oid, object_no, object_off,
+ object_len, snap_id, op_flags,
+ parent_trace, read_data, extent_map,
+ on_dispatched);  // on_dispatched is handed to the request as its completion context
+ req->send();  // req is not deleted here; presumably it frees itself on completion -- confirm
+ return true;
+}
+
+template <typename I>  // always handles the discard here by creating and sending an ObjectDiscardRequest
+bool ObjectDispatch<I>::discard(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {  // object_dispatch_flags, journal_tid and on_finish are unused here
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectDiscardRequest<I>(m_image_ctx, oid, object_no,
+ object_off, object_len, snapc,
+ discard_flags, parent_trace,
+ on_dispatched);  // on_dispatched is handed to the request as its completion context
+ req->send();  // req is not deleted here; presumably it frees itself on completion -- confirm
+ return true;
+}
+
+template <typename I>  // always handles the write here by creating and sending an ObjectWriteRequest
+bool ObjectDispatch<I>::write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {  // object_dispatch_flags, journal_tid and on_finish are unused here
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << data.length() << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectWriteRequest<I>(m_image_ctx, oid, object_no, object_off,
+ std::move(data), snapc, op_flags,
+ parent_trace, on_dispatched);  // data is moved into the request
+ req->send();  // req is not deleted here; presumably it frees itself on completion -- confirm
+ return true;
+}
+
+template <typename I>  // always handles the write-same here by creating and sending an ObjectWriteSameRequest
+bool ObjectDispatch<I>::write_same(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, Extents&& buffer_extents, ceph::bufferlist&& data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {  // buffer_extents, object_dispatch_flags, journal_tid, on_finish unused here
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectWriteSameRequest<I>(m_image_ctx, oid, object_no,
+ object_off, object_len,
+ std::move(data), snapc, op_flags,
+ parent_trace, on_dispatched);  // data is moved into the request
+ req->send();  // req is not deleted here; presumably it frees itself on completion -- confirm
+ return true;
+}
+
+template <typename I>  // always handles the op here by creating and sending an ObjectCompareAndWriteRequest
+bool ObjectDispatch<I>::compare_and_write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {  // object_dispatch_flags, journal_tid and on_finish are unused here
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << write_data.length()
+ << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectCompareAndWriteRequest<I>(m_image_ctx, oid, object_no,
+ object_off,
+ std::move(cmp_data),
+ std::move(write_data), snapc,
+ mismatch_offset, op_flags,
+ parent_trace, on_dispatched);  // both buffers are moved into the request
+ req->send();  // req is not deleted here; presumably it frees itself on completion -- confirm
+ return true;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatch.h b/src/librbd/io/ObjectDispatch.h
new file mode 100644
index 00000000..32e8272d
--- /dev/null
+++ b/src/librbd/io/ObjectDispatch.h
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+template <typename ImageCtxT = librbd::ImageCtx>  // ObjectDispatchInterface implementation for the CORE layer
+class ObjectDispatch : public ObjectDispatchInterface {
+public:
+ ObjectDispatch(ImageCtxT* image_ctx);
+
+ ObjectDispatchLayer get_object_dispatch_layer() const override {
+ return OBJECT_DISPATCH_LAYER_CORE;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, librados::snap_t snap_id, int op_flags,
+ const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
+ ExtentMap* extent_map, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;  // the .cc definitions of the five IO methods all return true
+
+ bool discard(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write_same(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, Extents&& buffer_extents, ceph::bufferlist&& data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool compare_and_write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool flush(
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;  // flush is not handled by this layer; no argument is touched
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;  // not handled here; on_finish is not touched
+ }
+ bool reset_existence_cache(Context* on_finish) override {
+ return false;  // not handled here; on_finish is not touched
+ }
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) override {
+ }  // intentionally empty: this layer ignores the notification
+
+private:
+ ImageCtxT* m_image_ctx;  // set by the constructor; never freed by this class
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
diff --git a/src/librbd/io/ObjectDispatchInterface.h b/src/librbd/io/ObjectDispatchInterface.h
new file mode 100644
index 00000000..fddf0db1
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchInterface.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+struct RWLock;
+
+namespace librbd {
+namespace io {
+
+struct AioCompletion;
+
+struct ObjectDispatchInterface {  // pure-virtual interface implemented by each object dispatch layer
+ virtual ~ObjectDispatchInterface() {
+ }
+
+ virtual ObjectDispatchLayer get_object_dispatch_layer() const = 0;  // identifies which layer this is
+
+ virtual void shut_down(Context* on_finish) = 0;
+
+ virtual bool read(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, librados::snap_t snap_id, int op_flags,
+ const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
+ ExtentMap* extent_map, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;  // bool result presumably means "handled by this layer" -- confirm against the dispatcher
+
+ virtual bool discard(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) = 0;
+
+ virtual bool write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) = 0;
+
+ virtual bool write_same(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, Extents&& buffer_extents, ceph::bufferlist&& data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) = 0;
+
+ virtual bool compare_and_write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool flush(
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool invalidate_cache(Context* on_finish) = 0;
+ virtual bool reset_existence_cache(Context* on_finish) = 0;
+
+ virtual void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) = 0;  // notification hook; returns nothing
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
diff --git a/src/librbd/io/ObjectDispatchSpec.cc b/src/librbd/io/ObjectDispatchSpec.cc
new file mode 100644
index 00000000..2b6dccc5
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchSpec.cc
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "include/Context.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include <boost/variant.hpp>
+
+namespace librbd {
+namespace io {
+
+void ObjectDispatchSpec::C_Dispatcher::complete(int r) {  // errors finish at once; otherwise act on the recorded dispatch_result
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ switch (object_dispatch_spec->dispatch_result) {
+ case DISPATCH_RESULT_CONTINUE:
+ object_dispatch_spec->send();  // hand the spec back to the dispatcher via send()
+ break;
+ case DISPATCH_RESULT_COMPLETE:
+ finish(r);
+ break;
+ case DISPATCH_RESULT_INVALID:
+ ceph_abort();  // a result must have been set before completion
+ break;
+ }
+}
+
+void ObjectDispatchSpec::C_Dispatcher::finish(int r) {  // completes the caller's context, then destroys the spec
+ on_finish->complete(r);
+ delete object_dispatch_spec;  // fail() reaches this context as a member (dispatcher_ctx), so presumably it is destroyed too
+}
+
+void ObjectDispatchSpec::send() {  // submits this spec to its object dispatcher
+ object_dispatcher->send(this);
+}
+
+void ObjectDispatchSpec::fail(int r) {  // completes the spec with a (strictly negative) error code
+ ceph_assert(r < 0);
+ dispatcher_ctx.complete(r);  // r < 0 makes C_Dispatcher::complete() call finish(), which deletes this spec
+}
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/ObjectDispatchSpec.h b/src/librbd/io/ObjectDispatchSpec.h
new file mode 100644
index 00000000..a26d89fe
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchSpec.h
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+#include <boost/variant/variant.hpp>
+
+namespace librbd {
+namespace io {
+
+struct ObjectDispatcherInterface;
+
+struct ObjectDispatchSpec { // one object-level IO request plus the state used to route it through the dispatch layers
+private:
+ // helper to avoid extra heap allocation per object IO
+ struct C_Dispatcher : public Context {
+ ObjectDispatchSpec* object_dispatch_spec; // the spec that embeds this context
+ Context* on_finish; // callback completed by finish(); SendVisitor passes its address to the layers
+
+ C_Dispatcher(ObjectDispatchSpec* object_dispatch_spec, Context* on_finish)
+ : object_dispatch_spec(object_dispatch_spec), on_finish(on_finish) {
+ }
+
+ void complete(int r) override; // continue, finish or abort depending on r and dispatch_result
+ void finish(int r) override; // completes on_finish and deletes the spec
+ };
+
+public:
+ struct RequestBase { // fields shared by every object-addressed request
+ std::string oid;
+ uint64_t object_no;
+ uint64_t object_off;
+
+ RequestBase(const std::string& oid, uint64_t object_no, uint64_t object_off)
+ : oid(oid), object_no(object_no), object_off(object_off) {
+ }
+ };
+
+ struct ReadRequest : public RequestBase {
+ uint64_t object_len;
+ librados::snap_t snap_id;
+ ceph::bufferlist* read_data; // caller-provided output pointer; stored, not copied
+ ExtentMap* extent_map; // caller-provided output pointer; stored, not copied
+
+ ReadRequest(const std::string& oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, librados::snap_t snap_id,
+ ceph::bufferlist* read_data, ExtentMap* extent_map)
+ : RequestBase(oid, object_no, object_off),
+ object_len(object_len), snap_id(snap_id), read_data(read_data),
+ extent_map(extent_map) {
+ }
+ };
+
+ struct WriteRequestBase : public RequestBase { // adds the snapshot context and journal tid used by all write-type requests
+ ::SnapContext snapc;
+ uint64_t journal_tid; // SendVisitor passes the address of this field to the layer
+
+ WriteRequestBase(const std::string& oid, uint64_t object_no,
+ uint64_t object_off, const ::SnapContext& snapc,
+ uint64_t journal_tid)
+ : RequestBase(oid, object_no, object_off), snapc(snapc),
+ journal_tid(journal_tid) {
+ }
+ };
+
+ struct DiscardRequest : public WriteRequestBase {
+ uint64_t object_len;
+ int discard_flags;
+
+ DiscardRequest(const std::string& oid, uint64_t object_no,
+ uint64_t object_off, uint64_t object_len,
+ int discard_flags, const ::SnapContext& snapc,
+ uint64_t journal_tid)
+ : WriteRequestBase(oid, object_no, object_off, snapc, journal_tid),
+ object_len(object_len), discard_flags(discard_flags) {
+ }
+ };
+
+ struct WriteRequest : public WriteRequestBase {
+ ceph::bufferlist data; // payload, moved in from the caller
+
+ WriteRequest(const std::string& oid, uint64_t object_no,
+ uint64_t object_off, ceph::bufferlist&& data,
+ const ::SnapContext& snapc, uint64_t journal_tid)
+ : WriteRequestBase(oid, object_no, object_off, snapc, journal_tid),
+ data(std::move(data)) {
+ }
+ };
+
+ struct WriteSameRequest : public WriteRequestBase {
+ uint64_t object_len;
+ Extents buffer_extents; // moved in from the caller
+ ceph::bufferlist data; // moved in from the caller
+
+ WriteSameRequest(const std::string& oid, uint64_t object_no,
+ uint64_t object_off, uint64_t object_len,
+ Extents&& buffer_extents, ceph::bufferlist&& data,
+ const ::SnapContext& snapc, uint64_t journal_tid)
+ : WriteRequestBase(oid, object_no, object_off, snapc, journal_tid),
+ object_len(object_len), buffer_extents(std::move(buffer_extents)),
+ data(std::move(data)) {
+ }
+ };
+
+ struct CompareAndWriteRequest : public WriteRequestBase {
+ ceph::bufferlist cmp_data; // moved in from the caller
+ ceph::bufferlist data; // moved in from the caller
+ uint64_t* mismatch_offset; // caller-provided output pointer; stored, not copied
+
+ CompareAndWriteRequest(const std::string& oid, uint64_t object_no,
+ uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& data, uint64_t* mismatch_offset,
+ const ::SnapContext& snapc, uint64_t journal_tid)
+ : WriteRequestBase(oid, object_no, object_off, snapc, journal_tid),
+ cmp_data(std::move(cmp_data)), data(std::move(data)),
+ mismatch_offset(mismatch_offset) {
+ }
+ };
+
+ struct FlushRequest { // not object-addressed: carries only the flush source
+ FlushSource flush_source;
+
+ FlushRequest(FlushSource flush_source) : flush_source(flush_source) {
+ }
+ };
+
+ typedef boost::variant<ReadRequest, // exactly one request kind is stored per spec
+ DiscardRequest,
+ WriteRequest,
+ WriteSameRequest,
+ CompareAndWriteRequest,
+ FlushRequest> Request;
+
+ C_Dispatcher dispatcher_ctx; // embedded completion context handed to each layer as on_dispatched
+
+ ObjectDispatcherInterface* object_dispatcher;
+ ObjectDispatchLayer object_dispatch_layer; // updated by ObjectDispatcher::send() to the layer being visited
+ int object_dispatch_flags = 0;
+ DispatchResult dispatch_result = DISPATCH_RESULT_INVALID; // written by layers / the dispatcher, read by C_Dispatcher::complete()
+
+ Request request;
+ int op_flags;
+ ZTracer::Trace parent_trace;
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_read( // heap-allocates a read spec; it is deleted by C_Dispatcher::finish()
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, librados::snap_t snap_id, int op_flags,
+ const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
+ ExtentMap* extent_map, Context* on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ ReadRequest{oid, object_no, object_off,
+ object_len, snap_id, read_data,
+ extent_map},
+ op_flags, parent_trace, on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_discard( // heap-allocates a discard spec
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
+ uint64_t journal_tid, const ZTracer::Trace &parent_trace,
+ Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ DiscardRequest{oid, object_no, object_off,
+ object_len, discard_flags,
+ snapc, journal_tid},
+ 0, parent_trace, on_finish); // discards take no op_flags; 0 is stored
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_write( // heap-allocates a write spec; data is moved into it
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+ uint64_t journal_tid, const ZTracer::Trace &parent_trace,
+ Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ WriteRequest{oid, object_no, object_off,
+ std::move(data), snapc,
+ journal_tid},
+ op_flags, parent_trace, on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_write_same( // heap-allocates a write-same spec; extents and data are moved into it
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, Extents&& buffer_extents,
+ ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+ uint64_t journal_tid, const ZTracer::Trace &parent_trace,
+ Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ WriteSameRequest{oid, object_no, object_off,
+ object_len,
+ std::move(buffer_extents),
+ std::move(data), snapc,
+ journal_tid},
+ op_flags, parent_trace, on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_compare_and_write( // heap-allocates a compare-and-write spec; both buffers are moved into it
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+ const ::SnapContext &snapc, uint64_t *mismatch_offset, int op_flags,
+ uint64_t journal_tid, const ZTracer::Trace &parent_trace,
+ Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ CompareAndWriteRequest{oid, object_no,
+ object_off,
+ std::move(cmp_data),
+ std::move(write_data),
+ mismatch_offset,
+ snapc, journal_tid},
+ op_flags, parent_trace, on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_flush( // heap-allocates a flush spec
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ FlushRequest{flush_source}, 0, // flushes take no op_flags; 0 is stored
+ parent_trace, on_finish);
+ }
+
+ void send(); // forwards this spec to object_dispatcher->send()
+ void fail(int r); // completes the request with a negative error code
+
+private:
+ template <typename> friend class ObjectDispatcher; // the dispatcher reads and updates the routing fields directly
+
+ ObjectDispatchSpec(ObjectDispatcherInterface* object_dispatcher, // private: instances are only created through the create_*() factories
+ ObjectDispatchLayer object_dispatch_layer,
+ Request&& request, int op_flags,
+ const ZTracer::Trace& parent_trace, Context* on_finish)
+ : dispatcher_ctx(this, on_finish), object_dispatcher(object_dispatcher),
+ object_dispatch_layer(object_dispatch_layer), request(std::move(request)),
+ op_flags(op_flags), parent_trace(parent_trace) {
+ }
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
diff --git a/src/librbd/io/ObjectDispatcher.cc b/src/librbd/io/ObjectDispatcher.cc
new file mode 100644
index 00000000..befc0750
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcher.cc
@@ -0,0 +1,354 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatcher.h"
+#include "include/Context.h"
+#include "common/AsyncOpTracker.h"
+#include "common/dout.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ObjectDispatch.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include <boost/variant.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectDispatcher: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+struct ObjectDispatcher<I>::C_LayerIterator : public Context {
+  ObjectDispatcher* object_dispatcher;
+  Context* on_finish;
+  // last layer visited; the next pass resumes strictly after it
+  ObjectDispatchLayer object_dispatch_layer = OBJECT_DISPATCH_LAYER_NONE;
+
+  C_LayerIterator(ObjectDispatcher* object_dispatcher,
+                  Context* on_finish)
+    : object_dispatcher(object_dispatcher), on_finish(on_finish) {
+  }
+  // visit each remaining layer in order until one takes over or none remain
+  void complete(int r) override {
+    while (true) {
+      object_dispatcher->m_lock.get_read();
+      auto it = object_dispatcher->m_object_dispatches.upper_bound(
+        object_dispatch_layer);
+      if (it == object_dispatcher->m_object_dispatches.end()) {
+        object_dispatcher->m_lock.put_read();
+        Context::complete(r);
+        return;
+      }
+
+      // copy both pointers while locked; the map entry may be erased later
+      auto object_dispatch = it->second.object_dispatch;
+      auto async_op_tracker = it->second.async_op_tracker;
+      // prevent recursive locking back into the dispatcher while handling IO
+      async_op_tracker->start_op();
+      object_dispatcher->m_lock.put_read();
+
+      // next loop should start after current layer
+      object_dispatch_layer = object_dispatch->get_object_dispatch_layer();
+
+      auto handled = execute(object_dispatch, this);
+      async_op_tracker->finish_op();  // local pointer: no map access unlocked
+
+      if (handled) {
+        break;  // the layer owns this context now and will re-invoke complete()
+      }
+    }
+  }
+
+  void finish(int r) override {
+    on_finish->complete(0);  // r is not forwarded; 0 is always reported
+  }
+
+  virtual bool execute(ObjectDispatchInterface* object_dispatch,
+                       Context* on_finish) = 0;
+};
+
+template <typename I>
+struct ObjectDispatcher<I>::C_InvalidateCache : public C_LayerIterator { // layer iterator whose per-layer action is invalidate_cache()
+ C_InvalidateCache(ObjectDispatcher* object_dispatcher, Context* on_finish)
+ : C_LayerIterator(object_dispatcher, on_finish) {
+ }
+
+ bool execute(ObjectDispatchInterface* object_dispatch,
+ Context* on_finish) override {
+ return object_dispatch->invalidate_cache(on_finish); // a true result stops the loop in C_LayerIterator::complete()
+ }
+};
+
+template <typename I>
+struct ObjectDispatcher<I>::C_ResetExistenceCache : public C_LayerIterator { // layer iterator whose per-layer action is reset_existence_cache()
+ C_ResetExistenceCache(ObjectDispatcher* object_dispatcher, Context* on_finish)
+ : C_LayerIterator(object_dispatcher, on_finish) {
+ }
+
+ bool execute(ObjectDispatchInterface* object_dispatch,
+ Context* on_finish) override {
+ return object_dispatch->reset_existence_cache(on_finish); // a true result stops the loop in C_LayerIterator::complete()
+ }
+};
+
+template <typename I>
+struct ObjectDispatcher<I>::SendVisitor : public boost::static_visitor<bool> { // maps each request alternative onto the matching layer method; the bool is that method's return value
+ ObjectDispatchInterface* object_dispatch; // layer the request is being offered to
+ ObjectDispatchSpec* object_dispatch_spec; // spec supplying the shared flags, trace and completion context
+
+ SendVisitor(ObjectDispatchInterface* object_dispatch,
+ ObjectDispatchSpec* object_dispatch_spec)
+ : object_dispatch(object_dispatch),
+ object_dispatch_spec(object_dispatch_spec) {
+ }
+
+ bool operator()(ObjectDispatchSpec::ReadRequest& read) const {
+ return object_dispatch->read(
+ read.oid, read.object_no, read.object_off, read.object_len, read.snap_id,
+ object_dispatch_spec->op_flags, object_dispatch_spec->parent_trace,
+ read.read_data, read.extent_map,
+ &object_dispatch_spec->object_dispatch_flags,
+ &object_dispatch_spec->dispatch_result, // the layer reports its verdict through this pointer
+ &object_dispatch_spec->dispatcher_ctx.on_finish, // passed by address so the layer can access the stored callback pointer
+ &object_dispatch_spec->dispatcher_ctx); // on_dispatched: the spec's embedded C_Dispatcher
+ }
+
+ bool operator()(ObjectDispatchSpec::DiscardRequest& discard) const {
+ return object_dispatch->discard(
+ discard.oid, discard.object_no, discard.object_off, discard.object_len,
+ discard.snapc, discard.discard_flags, object_dispatch_spec->parent_trace,
+ &object_dispatch_spec->object_dispatch_flags, &discard.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ObjectDispatchSpec::WriteRequest& write) const {
+ return object_dispatch->write(
+ write.oid, write.object_no, write.object_off, std::move(write.data), // NOTE(review): whether the buffer is consumed depends on the layer -- confirm before reusing write.data
+ write.snapc, object_dispatch_spec->op_flags,
+ object_dispatch_spec->parent_trace,
+ &object_dispatch_spec->object_dispatch_flags, &write.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ObjectDispatchSpec::WriteSameRequest& write_same) const {
+ return object_dispatch->write_same(
+ write_same.oid, write_same.object_no, write_same.object_off,
+ write_same.object_len, std::move(write_same.buffer_extents),
+ std::move(write_same.data), write_same.snapc,
+ object_dispatch_spec->op_flags, object_dispatch_spec->parent_trace,
+ &object_dispatch_spec->object_dispatch_flags, &write_same.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(
+ ObjectDispatchSpec::CompareAndWriteRequest& compare_and_write) const {
+ return object_dispatch->compare_and_write(
+ compare_and_write.oid, compare_and_write.object_no,
+ compare_and_write.object_off, std::move(compare_and_write.cmp_data),
+ std::move(compare_and_write.data), compare_and_write.snapc,
+ object_dispatch_spec->op_flags, object_dispatch_spec->parent_trace,
+ compare_and_write.mismatch_offset,
+ &object_dispatch_spec->object_dispatch_flags,
+ &compare_and_write.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ObjectDispatchSpec::FlushRequest& flush) const {
+ return object_dispatch->flush( // flush carries no op_flags, dispatch flags or journal tid
+ flush.flush_source, object_dispatch_spec->parent_trace,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+};
+
+template <typename I>
+ObjectDispatcher<I>::ObjectDispatcher(I* image_ctx) // builds the dispatcher with the core ObjectDispatch layer pre-registered
+ : m_image_ctx(image_ctx),
+ m_lock(librbd::util::unique_lock_name("librbd::io::ObjectDispatcher::lock",
+ this)) {
+ // configure the core object dispatch handler on startup
+ auto object_dispatch = new ObjectDispatch(image_ctx);
+ m_object_dispatches[object_dispatch->get_object_dispatch_layer()] =
+ {object_dispatch, new AsyncOpTracker()}; // both pointers are deleted by the shut down chain
+}
+
+template <typename I>
+ObjectDispatcher<I>::~ObjectDispatcher() {
+ ceph_assert(m_object_dispatches.empty()); // shut_down() must have removed every layer before destruction
+}
+
+template <typename I>
+void ObjectDispatcher<I>::shut_down(Context* on_finish) {
+  ldout(m_image_ctx->cct, 5) << dendl;
+  // move the registered layers out of the dispatcher under the write lock
+  decltype(m_object_dispatches) detached;
+  {
+    RWLock::WLocker locker(m_lock);
+    m_object_dispatches.swap(detached);
+  }
+
+  // wrap on_finish in one shut down chain per detached layer, then start it
+  for (auto& entry : detached) {
+    shut_down_object_dispatch(entry.second, &on_finish);
+  }
+  on_finish->complete(0);
+}
+
+template <typename I>
+void ObjectDispatcher<I>::register_object_dispatch( // adds a layer keyed by its own dispatch-layer id
+ ObjectDispatchInterface* object_dispatch) {
+ auto cct = m_image_ctx->cct;
+ auto type = object_dispatch->get_object_dispatch_layer();
+ ldout(cct, 5) << "object_dispatch_layer=" << type << dendl;
+
+ RWLock::WLocker locker(m_lock);
+ ceph_assert(type < OBJECT_DISPATCH_LAYER_LAST);
+
+ auto result = m_object_dispatches.insert(
+ {type, {object_dispatch, new AsyncOpTracker()}}); // each layer gets its own in-flight op tracker
+ ceph_assert(result.second); // registering the same layer id twice is a programming error
+}
+
+template <typename I>
+void ObjectDispatcher<I>::shut_down_object_dispatch( // removes one layer from the map and shuts it down asynchronously
+ ObjectDispatchLayer object_dispatch_layer, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "object_dispatch_layer=" << object_dispatch_layer << dendl;
+ ceph_assert(object_dispatch_layer + 1 < OBJECT_DISPATCH_LAYER_LAST);
+
+ ObjectDispatchMeta object_dispatch_meta;
+ {
+ RWLock::WLocker locker(m_lock);
+ auto it = m_object_dispatches.find(object_dispatch_layer);
+ ceph_assert(it != m_object_dispatches.end()); // the layer must currently be registered
+
+ object_dispatch_meta = it->second; // copy the two pointers out before erasing the entry
+ m_object_dispatches.erase(it);
+ }
+
+ shut_down_object_dispatch(object_dispatch_meta, &on_finish); // replaces on_finish with the head of the shut down chain
+ on_finish->complete(0); // starts the chain
+}
+
+template <typename I>
+void ObjectDispatcher<I>::shut_down_object_dispatch( // prepends a three-step shut down chain for one layer to *on_finish
+ ObjectDispatchMeta& object_dispatch_meta, Context** on_finish) {
+ auto object_dispatch = object_dispatch_meta.object_dispatch;
+ auto async_op_tracker = object_dispatch_meta.async_op_tracker;
+
+ Context* ctx = *on_finish; // the contexts are built last-to-first; execution order is the reverse of construction
+ ctx = new FunctionContext( // step 3: free the layer and its tracker, then run the caller's context
+ [object_dispatch, async_op_tracker, ctx](int r) {
+ delete object_dispatch;
+ delete async_op_tracker;
+
+ ctx->complete(r);
+ });
+ ctx = new FunctionContext([object_dispatch, ctx](int r) { // step 2: ask the layer to shut down
+ object_dispatch->shut_down(ctx);
+ });
+ *on_finish = new FunctionContext([async_op_tracker, ctx](int r) { // step 1: wait for the ops tracked for this layer
+ async_op_tracker->wait_for_ops(ctx);
+ });
+}
+
+template <typename I>
+void ObjectDispatcher<I>::invalidate_cache(Context* on_finish) { // runs invalidate_cache() across the registered layers
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ on_finish = util::create_async_context_callback(*m_image_ctx, on_finish); // wrap the caller's context before handing it to the iterator
+ auto ctx = new C_InvalidateCache(this, on_finish);
+ ctx->complete(0); // kicks off the iteration from the first layer
+}
+
+template <typename I>
+void ObjectDispatcher<I>::reset_existence_cache(Context* on_finish) { // runs reset_existence_cache() across the registered layers
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ on_finish = util::create_async_context_callback(*m_image_ctx, on_finish); // wrap the caller's context before handing it to the iterator
+ auto ctx = new C_ResetExistenceCache(this, on_finish);
+ ctx->complete(0); // kicks off the iteration from the first layer
+}
+
+template <typename I>
+void ObjectDispatcher<I>::extent_overwritten(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    uint64_t journal_tid, uint64_t new_journal_tid) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << object_no << " " << object_off << "~" << object_len
+                 << dendl;
+
+  // notify every registered layer; the read lock keeps the layer map stable
+  RWLock::RLocker locker(m_lock);
+  for (auto& it : m_object_dispatches) {
+    it.second.object_dispatch->extent_overwritten(
+      object_no, object_off, object_len, journal_tid, new_journal_tid);
+  }
+}
+
+template <typename I>
+void ObjectDispatcher<I>::send(ObjectDispatchSpec* object_dispatch_spec) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_dispatch_spec=" << object_dispatch_spec << dendl;
+  // routes the request to the first layer above the one recorded in the spec
+  auto object_dispatch_layer = object_dispatch_spec->object_dispatch_layer;
+  ceph_assert(object_dispatch_layer + 1 < OBJECT_DISPATCH_LAYER_LAST);
+
+  // apply the IO request to all layers -- this method will be re-invoked
+  // by the dispatch layer if continuing / restarting the IO
+  while (true) {
+    m_lock.get_read();
+    object_dispatch_layer = object_dispatch_spec->object_dispatch_layer;
+    auto it = m_object_dispatches.upper_bound(object_dispatch_layer);
+    if (it == m_object_dispatches.end()) {
+      // the request is complete if handled by all layers
+      object_dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE;
+      m_lock.put_read();
+      break;
+    }
+    // copy both pointers while locked; the map entry may be erased later
+    auto object_dispatch = it->second.object_dispatch;
+    auto async_op_tracker = it->second.async_op_tracker;
+    object_dispatch_spec->dispatch_result = DISPATCH_RESULT_INVALID;
+
+    // prevent recursive locking back into the dispatcher while handling IO
+    async_op_tracker->start_op();
+    m_lock.put_read();
+
+    // advance to next layer in case we skip or continue
+    object_dispatch_spec->object_dispatch_layer =
+      object_dispatch->get_object_dispatch_layer();
+
+    bool handled = boost::apply_visitor(
+      SendVisitor{object_dispatch, object_dispatch_spec},
+      object_dispatch_spec->request);
+    async_op_tracker->finish_op();
+    // once handled, the spec may already have been completed and freed
+    // handled ops will resume when the dispatch ctx is invoked
+    if (handled) {
+      return;
+    }
+  }
+
+  // skipped through to the last layer
+  object_dispatch_spec->dispatcher_ctx.complete(0);
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatcher.h b/src/librbd/io/ObjectDispatcher.h
new file mode 100644
index 00000000..0370d268
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcher.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
+
+#include "include/int_types.h"
+#include "common/RWLock.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+struct AsyncOpTracker;
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct ObjectDispatchInterface;
+struct ObjectDispatchSpec;
+
+struct ObjectDispatcherInterface { // type-erased dispatcher handle stored by ObjectDispatchSpec
+public:
+ virtual ~ObjectDispatcherInterface() {
+ }
+
+private:
+ friend class ObjectDispatchSpec; // only the spec may call the private send() below
+
+ virtual void send(ObjectDispatchSpec* object_dispatch_spec) = 0;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectDispatcher : public ObjectDispatcherInterface {
+public:
+  ObjectDispatcher(ImageCtxT* image_ctx);
+  ~ObjectDispatcher() override;  // asserts that every layer was shut down
+
+  void shut_down(Context* on_finish);  // shuts down and frees all layers
+
+  void register_object_dispatch(ObjectDispatchInterface* object_dispatch);
+  void shut_down_object_dispatch(ObjectDispatchLayer object_dispatch_layer,
+                                 Context* on_finish);
+
+  void invalidate_cache(Context* on_finish);
+  void reset_existence_cache(Context* on_finish);
+
+  void extent_overwritten(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      uint64_t journal_tid, uint64_t new_journal_tid);
+
+private:
+  struct ObjectDispatchMeta {  // a registered layer plus its in-flight op tracker
+    ObjectDispatchInterface* object_dispatch = nullptr;
+    AsyncOpTracker* async_op_tracker = nullptr;
+
+    ObjectDispatchMeta() {
+    }
+    ObjectDispatchMeta(ObjectDispatchInterface* object_dispatch,
+                       AsyncOpTracker* async_op_tracker)
+      : object_dispatch(object_dispatch), async_op_tracker(async_op_tracker) {
+    }
+  };
+
+  struct C_LayerIterator;
+  struct C_InvalidateCache;
+  struct C_ResetExistenceCache;
+  struct SendVisitor;
+
+  ImageCtxT* m_image_ctx;
+
+  RWLock m_lock;  // guards m_object_dispatches
+  std::map<ObjectDispatchLayer, ObjectDispatchMeta> m_object_dispatches;
+
+  void send(ObjectDispatchSpec* object_dispatch_spec) override;
+
+  void shut_down_object_dispatch(ObjectDispatchMeta& object_dispatch_meta,
+                                 Context** on_finish);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
diff --git a/src/librbd/io/ObjectRequest.cc b/src/librbd/io/ObjectRequest.cc
new file mode 100644
index 00000000..60f53df1
--- /dev/null
+++ b/src/librbd/io/ObjectRequest.cc
@@ -0,0 +1,729 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectRequest.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "include/Context.h"
+#include "include/err.h"
+#include "osd/osd_types.h"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/CopyupRequest.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ReadResult.h"
+
+#include <boost/bind.hpp>
+#include <boost/optional.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+template <typename I>
+inline bool is_copy_on_read(I *ictx, librados::snap_t snap_id) {
+  RWLock::RLocker snap_locker(ictx->snap_lock);
+  if (!ictx->clone_copy_on_read || ictx->read_only || snap_id != CEPH_NOSNAP) {
+    return false;  // option off, read-only image, or a snapshot read
+  }
+  return !ictx->exclusive_lock || ictx->exclusive_lock->is_lock_owner();
+}
+
+} // anonymous namespace
+
+template <typename I>
+ObjectRequest<I>* // factory: returns a heap-allocated ObjectWriteRequest as its base type
+ObjectRequest<I>::create_write(I *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
+ return new ObjectWriteRequest<I>(ictx, oid, object_no, object_off,
+ std::move(data), snapc, op_flags,
+ parent_trace, completion);
+}
+
+template <typename I>
+ObjectRequest<I>* // factory: returns a heap-allocated ObjectDiscardRequest as its base type
+ObjectRequest<I>::create_discard(I *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ uint64_t object_len,
+ const ::SnapContext &snapc,
+ int discard_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
+ return new ObjectDiscardRequest<I>(ictx, oid, object_no, object_off,
+ object_len, snapc, discard_flags,
+ parent_trace, completion);
+}
+
+template <typename I>
+ObjectRequest<I>* // factory: returns a heap-allocated ObjectWriteSameRequest as its base type
+ObjectRequest<I>::create_write_same(I *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ uint64_t object_len,
+ ceph::bufferlist&& data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
+ return new ObjectWriteSameRequest<I>(ictx, oid, object_no, object_off,
+ object_len, std::move(data), snapc,
+ op_flags, parent_trace, completion);
+}
+
+template <typename I>
+ObjectRequest<I>* // factory: returns a heap-allocated ObjectCompareAndWriteRequest as its base type
+ObjectRequest<I>::create_compare_and_write(I *ictx, const std::string &oid,
+ uint64_t object_no,
+ uint64_t object_off,
+ ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data,
+ const ::SnapContext &snapc,
+ uint64_t *mismatch_offset,
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
+ return new ObjectCompareAndWriteRequest<I>(ictx, oid, object_no, object_off,
+ std::move(cmp_data),
+ std::move(write_data), snapc,
+ mismatch_offset, op_flags,
+ parent_trace, completion);
+}
+
+template <typename I>
+ObjectRequest<I>::ObjectRequest(I *ictx, const std::string &oid, // stores the object coordinates and sets up the trace
+ uint64_t objectno, uint64_t off,
+ uint64_t len, librados::snap_t snap_id,
+ const char *trace_name,
+ const ZTracer::Trace &trace,
+ Context *completion)
+ : m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
+ m_object_len(len), m_snap_id(snap_id), m_completion(completion),
+ m_trace(util::create_trace(*ictx, "", trace)) { // created with an empty name; renamed below when valid
+ ceph_assert(m_ictx->data_ctx.is_valid());
+ if (m_trace.valid()) {
+ m_trace.copy_name(trace_name + std::string(" ") + oid); // trace name becomes "<trace_name> <oid>"
+ m_trace.event("start");
+ }
+}
+
+template <typename I>
+void ObjectRequest<I>::add_write_hint(I& image_ctx, // attaches an allocation hint to wr when the image configuration asks for one
+ librados::ObjectWriteOperation *wr) {
+ if (image_ctx.enable_alloc_hint) {
+ wr->set_alloc_hint2(image_ctx.get_object_size(), // the object size is passed for both size arguments
+ image_ctx.get_object_size(),
+ image_ctx.alloc_hint_flags);
+ } else if (image_ctx.alloc_hint_flags != 0U) {
+ wr->set_alloc_hint2(0, 0, image_ctx.alloc_hint_flags); // flags only, with zero sizes
+ }
+}
+
+template <typename I>
+bool ObjectRequest<I>::compute_parent_extents(Extents *parent_extents, // fills parent_extents with this object's overlap with the parent; returns true if any
+ bool read_request) {
+ ceph_assert(m_ictx->snap_lock.is_locked()); // caller must hold snap_lock
+ ceph_assert(m_ictx->parent_lock.is_locked()); // caller must hold parent_lock
+
+ m_has_parent = false; // reset first; only set again on the success path below
+ parent_extents->clear();
+
+ uint64_t parent_overlap;
+ int r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
+ if (r < 0) {
+ // NOTE: it's possible for a snapshot to be deleted while we are
+ // still reading from it
+ lderr(m_ictx->cct) << "failed to retrieve parent overlap: "
+ << cpp_strerror(r) << dendl;
+ return false;
+ }
+
+ if (!read_request && !m_ictx->migration_info.empty()) {
+ parent_overlap = m_ictx->migration_info.overlap; // non-read requests use the migration overlap when migration_info is set
+ }
+
+ if (parent_overlap == 0) {
+ return false;
+ }
+
+ Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no, 0, // map the whole object (offset 0, full object size) to image extents
+ m_ictx->layout.object_size, *parent_extents);
+ uint64_t object_overlap = m_ictx->prune_parent_extents(*parent_extents,
+ parent_overlap);
+ if (object_overlap > 0) {
+ ldout(m_ictx->cct, 20) << "overlap " << parent_overlap << " "
+ << "extents " << *parent_extents << dendl;
+ m_has_parent = !parent_extents->empty();
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+void ObjectRequest<I>::async_finish(int r) { // defers finish(r) to the image's op work queue
+ ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+ m_ictx->op_work_queue->queue(util::create_context_callback<
+ ObjectRequest<I>, &ObjectRequest<I>::finish>(this), r);
+}
+
+template <typename I>
+void ObjectRequest<I>::finish(int r) { // completes the caller's context with r and destroys this request
+ ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+ m_completion->complete(r);
+ delete this; // the request owns itself; no member may be touched after this line
+}
+
+/** read **/
+
+template <typename I>
+ObjectReadRequest<I>::ObjectReadRequest(I *ictx, const std::string &oid, // read request; the output pointers are stored, not copied
+ uint64_t objectno, uint64_t offset,
+ uint64_t len, librados::snap_t snap_id,
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ bufferlist* read_data,
+ ExtentMap* extent_map,
+ Context *completion)
+ : ObjectRequest<I>(ictx, oid, objectno, offset, len, snap_id, "read", // "read" is the trace name prefix
+ parent_trace, completion),
+ m_op_flags(op_flags), m_read_data(read_data), m_extent_map(extent_map) {
+}
+
+template <typename I>
+void ObjectReadRequest<I>::send() { // entry point: logs and starts the object read
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ read_object();
+}
+
+template <typename I>
+void ObjectReadRequest<I>::read_object() { // issues the async RADOS read, or goes to the parent if the object map rules the object out
+ I *image_ctx = this->m_ictx;
+ {
+ RWLock::RLocker snap_locker(image_ctx->snap_lock);
+ if (image_ctx->object_map != nullptr &&
+ !image_ctx->object_map->object_may_exist(this->m_object_no)) {
+ image_ctx->op_work_queue->queue(new FunctionContext([this](int r) { // read_parent() is queued, not called inline; snap_lock is still held here
+ read_parent();
+ }), 0);
+ return;
+ }
+ }
+
+ ldout(image_ctx->cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ if (this->m_object_len >= image_ctx->sparse_read_threshold_bytes) {
+ op.sparse_read(this->m_object_off, this->m_object_len, m_extent_map, // reads at or above the threshold use sparse_read and fill m_extent_map
+ m_read_data, nullptr);
+ } else {
+ op.read(this->m_object_off, this->m_object_len, m_read_data, nullptr);
+ }
+ op.set_op_flags2(m_op_flags);
+
+ librados::AioCompletion *rados_completion = util::create_rados_callback<
+ ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_object>(this); // handle_read_object() receives the result
+ int flags = image_ctx->get_read_flags(this->m_snap_id);
+ int r = image_ctx->data_ctx.aio_operate(
+ this->m_oid, rados_completion, &op, flags, nullptr,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+ ceph_assert(r == 0);
+
+ rados_completion->release(); // drop the local reference to the completion
+}
+
+template <typename I>
+void ObjectReadRequest<I>::handle_read_object(int r) {
+  auto cct = this->m_ictx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    // the object does not exist in this image -- try the parent instead
+    read_parent();
+    return;
+  }
+
+  if (r < 0) {
+    lderr(cct) << "failed to read from object: " << cpp_strerror(r) << dendl;
+  }
+  // errors are passed through; any non-negative result is reported as 0
+  this->finish(r < 0 ? r : 0);
+}
+
+template <typename I>
+void ObjectReadRequest<I>::read_parent() { // reads the requested range from the parent image, or finishes with -ENOENT if there is no overlap
+ I *image_ctx = this->m_ictx;
+
+ RWLock::RLocker snap_locker(image_ctx->snap_lock);
+ RWLock::RLocker parent_locker(image_ctx->parent_lock);
+
+ // calculate reverse mapping onto the image
+ Extents parent_extents;
+ Striper::extent_to_file(image_ctx->cct, &image_ctx->layout,
+ this->m_object_no, this->m_object_off,
+ this->m_object_len, parent_extents);
+
+ uint64_t parent_overlap = 0;
+ uint64_t object_overlap = 0;
+ int r = image_ctx->get_parent_overlap(this->m_snap_id, &parent_overlap);
+ if (r == 0) {
+ object_overlap = image_ctx->prune_parent_extents(parent_extents,
+ parent_overlap);
+ }
+
+ if (object_overlap == 0) { // also taken when get_parent_overlap() failed, since object_overlap stays 0
+ parent_locker.unlock(); // release both locks before finish(), which deletes this request
+ snap_locker.unlock();
+
+ this->finish(-ENOENT);
+ return;
+ }
+
+ ldout(image_ctx->cct, 20) << dendl;
+
+ auto parent_completion = AioCompletion::create_and_start<
+ ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_parent>( // handle_read_parent() receives the result
+ this, util::get_image_ctx(image_ctx->parent), AIO_TYPE_READ);
+ ImageRequest<I>::aio_read(image_ctx->parent, parent_completion,
+ std::move(parent_extents), ReadResult{m_read_data}, // parent data lands in the caller's read buffer
+ 0, this->m_trace);
+}
+
+// Completion callback for the parent image read.  Any negative result
+// completes the request with that result (-ENOENT is not logged as an
+// error); success continues to copyup().
+template <typename I>
+void ObjectReadRequest<I>::handle_read_parent(int r) {
+  I *ictx = this->m_ictx;
+  ldout(ictx->cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r != -ENOENT) {
+      lderr(ictx->cct) << "failed to read parent extents: "
+                       << cpp_strerror(r) << dendl;
+    }
+    this->finish(r);
+    return;
+  }
+
+  copyup();
+}
+
+// Final read step: optionally kicks off a CopyupRequest for this object
+// after a successful parent read.  The read request itself always finishes
+// with 0 from here; it does not wait for the copyup.
+template <typename I>
+void ObjectReadRequest<I>::copyup() {
+ I *image_ctx = this->m_ictx;
+ if (!is_copy_on_read(image_ctx, this->m_snap_id)) {
+ this->finish(0);
+ return;
+ }
+
+ // lock order: owner_lock -> snap_lock -> parent_lock (released in reverse)
+ image_ctx->owner_lock.get_read();
+ image_ctx->snap_lock.get_read();
+ image_ctx->parent_lock.get_read();
+ Extents parent_extents;
+ // skip the copyup if compute_parent_extents() reports nothing to copy, or
+ // if an exclusive lock exists and this client is not its owner
+ if (!this->compute_parent_extents(&parent_extents, true) ||
+ (image_ctx->exclusive_lock != nullptr &&
+ !image_ctx->exclusive_lock->is_lock_owner())) {
+ image_ctx->parent_lock.put_read();
+ image_ctx->snap_lock.put_read();
+ image_ctx->owner_lock.put_read();
+ this->finish(0);
+ return;
+ }
+
+ ldout(image_ctx->cct, 20) << dendl;
+
+ image_ctx->copyup_list_lock.Lock();
+ auto it = image_ctx->copyup_list.find(this->m_object_no);
+ if (it == image_ctx->copyup_list.end()) {
+ // create and kick off a CopyupRequest
+ auto new_req = CopyupRequest<I>::create(
+ image_ctx, this->m_oid, this->m_object_no, std::move(parent_extents),
+ this->m_trace);
+
+ image_ctx->copyup_list[this->m_object_no] = new_req;
+ // drop everything except owner_lock before sending the new request
+ image_ctx->copyup_list_lock.Unlock();
+ image_ctx->parent_lock.put_read();
+ image_ctx->snap_lock.put_read();
+ new_req->send();
+ } else {
+ // a copyup for this object is already registered; nothing more to do
+ image_ctx->copyup_list_lock.Unlock();
+ image_ctx->parent_lock.put_read();
+ image_ctx->snap_lock.put_read();
+ }
+
+ image_ctx->owner_lock.put_read();
+ this->finish(0);
+}
+
+/** write **/
+
+// Constructs a write-style object request.  The base ObjectRequest is always
+// given CEPH_NOSNAP as its snap id; the snapshot context to write with is
+// captured separately in m_snap_seq / m_snaps.
+template <typename I>
+AbstractObjectWriteRequest<I>::AbstractObjectWriteRequest(
+ I *ictx, const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t len, const ::SnapContext &snapc, const char *trace_name,
+ const ZTracer::Trace &parent_trace, Context *completion)
+ : ObjectRequest<I>(ictx, oid, object_no, object_off, len, CEPH_NOSNAP,
+ trace_name, parent_trace, completion),
+ m_snap_seq(snapc.seq.val)
+{
+ m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
+
+ // the request covers the full object when it starts at offset 0 and its
+ // length equals the image's object size
+ if (this->m_object_off == 0 &&
+ this->m_object_len == ictx->get_object_size()) {
+ m_full_object = true;
+ }
+
+ // NOTE(review): compute_parent_info() evaluates the virtual
+ // is_post_copyup_write_required(); since this runs inside the base-class
+ // constructor, subclass overrides are not in effect here -- confirm that
+ // is the intended behavior
+ compute_parent_info();
+
+ // writes are guarded differently (see write_object) while the image has
+ // non-empty migration info
+ ictx->snap_lock.get_read();
+ if (!ictx->migration_info.empty()) {
+ m_guarding_migration_write = true;
+ }
+ ictx->snap_lock.put_read();
+}
+
+// Recomputes m_parent_extents (under snap_lock and parent_lock) and disables
+// copyup when it cannot be needed.
+template <typename I>
+void AbstractObjectWriteRequest<I>::compute_parent_info() {
+ I *image_ctx = this->m_ictx;
+ RWLock::RLocker snap_locker(image_ctx->snap_lock);
+ RWLock::RLocker parent_locker(image_ctx->parent_lock);
+
+ this->compute_parent_extents(&m_parent_extents, false);
+
+ // copyup is disabled when there is no parent, or when the whole object is
+ // being written with no snapshots in the snap context and no post-copyup
+ // write is required
+ if (!this->has_parent() ||
+ (m_full_object && m_snaps.empty() && !is_post_copyup_write_required())) {
+ m_copyup_enabled = false;
+ }
+}
+
+// Adds the write hint to the operation (via the static
+// ObjectRequest<I>::add_write_hint) unless an object map is in use and the
+// object may already exist.  Evaluated under snap_lock.
+template <typename I>
+void AbstractObjectWriteRequest<I>::add_write_hint(
+    librados::ObjectWriteOperation *wr) {
+  I *ictx = this->m_ictx;
+  RWLock::RLocker snap_locker(ictx->snap_lock);
+
+  const bool skip_hint =
+    (ictx->object_map != nullptr && this->m_object_may_exist);
+  if (skip_hint) {
+    return;
+  }
+  ObjectRequest<I>::add_write_hint(*ictx, wr);
+}
+
+// Entry point of the write state machine: determines whether the object may
+// exist, short-circuits operations that are no-ops on a nonexistent object,
+// and otherwise proceeds to the pre-write object map update.
+template <typename I>
+void AbstractObjectWriteRequest<I>::send() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << this->get_op_type() << " " << this->m_oid << " "
+ << this->m_object_off << "~" << this->m_object_len
+ << dendl;
+ {
+ RWLock::RLocker snap_lock(image_ctx->snap_lock);
+ if (image_ctx->object_map == nullptr) {
+ // without an object map the object must be assumed to exist
+ m_object_may_exist = true;
+ } else {
+ // should have been flushed prior to releasing lock
+ ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+ m_object_may_exist = image_ctx->object_map->object_may_exist(
+ this->m_object_no);
+ }
+ }
+
+ if (!m_object_may_exist && is_no_op_for_nonexistent_object()) {
+ ldout(image_ctx->cct, 20) << "skipping no-op on nonexistent object"
+ << dendl;
+ // complete through async_finish() rather than finish() on this path
+ this->async_finish(0);
+ return;
+ }
+
+ pre_write_object_map_update();
+}
+
+// Updates the object map to the pre-write state before touching the object.
+// Skips straight to write_object() when no object map is in use or updates
+// are disabled, and straight to copyup() when the object is known not to
+// exist and copyup is enabled.
+template <typename I>
+void AbstractObjectWriteRequest<I>::pre_write_object_map_update() {
+ I *image_ctx = this->m_ictx;
+
+ // snap_lock is released on every exit path below
+ image_ctx->snap_lock.get_read();
+ if (image_ctx->object_map == nullptr || !is_object_map_update_enabled()) {
+ image_ctx->snap_lock.put_read();
+ write_object();
+ return;
+ }
+
+ if (!m_object_may_exist && m_copyup_enabled) {
+ // optimization: copyup required
+ image_ctx->snap_lock.put_read();
+ copyup();
+ return;
+ }
+
+ uint8_t new_state = this->get_pre_write_object_map_state();
+ ldout(image_ctx->cct, 20) << this->m_oid << " " << this->m_object_off
+ << "~" << this->m_object_len << dendl;
+
+ image_ctx->object_map_lock.get_write();
+ // NOTE(review): a true return from aio_update presumably means an update
+ // was queued and handle_pre_write_object_map_update will be invoked --
+ // confirm against ObjectMap::aio_update
+ if (image_ctx->object_map->template aio_update<
+ AbstractObjectWriteRequest<I>,
+ &AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update>(
+ CEPH_NOSNAP, this->m_object_no, new_state, {}, this->m_trace, false,
+ this)) {
+ image_ctx->object_map_lock.put_write();
+ image_ctx->snap_lock.put_read();
+ return;
+ }
+
+ // no update was started: continue with the write immediately
+ image_ctx->object_map_lock.put_write();
+ image_ctx->snap_lock.put_read();
+ write_object();
+}
+
+// Callback for the pre-write object map update: continues to write_object()
+// on success, otherwise logs and completes the request with the error.
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update(int r) {
+  I *ictx = this->m_ictx;
+  ldout(ictx->cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    write_object();
+    return;
+  }
+
+  lderr(ictx->cct) << "failed to update object map: "
+                   << cpp_strerror(r) << dendl;
+  this->finish(r);
+}
+
+// Builds the librados write operation (optional guard, write hint, and the
+// subclass-specific ops) and submits it asynchronously; completion is
+// delivered to handle_write_object().
+template <typename I>
+void AbstractObjectWriteRequest<I>::write_object() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ librados::ObjectWriteOperation write;
+ if (m_copyup_enabled) {
+ // guard the write so a required copyup is detected: either a snapc-seq
+ // assertion (migration) or a plain existence assertion
+ ldout(image_ctx->cct, 20) << "guarding write" << dendl;
+ if (m_guarding_migration_write) {
+ cls_client::assert_snapc_seq(
+ &write, m_snap_seq, cls::rbd::ASSERT_SNAPC_SEQ_LE_SNAPSET_SEQ);
+ } else {
+ write.assert_exists();
+ }
+ }
+
+ add_write_hint(&write);
+ add_write_ops(&write);
+ // subclasses are expected to have added at least one op
+ ceph_assert(write.size() != 0);
+
+ librados::AioCompletion *rados_completion = util::create_rados_callback<
+ AbstractObjectWriteRequest<I>,
+ &AbstractObjectWriteRequest<I>::handle_write_object>(this);
+ int r = image_ctx->data_ctx.aio_operate(
+ this->m_oid, rados_completion, &write, m_snap_seq, m_snaps,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+// Completion callback for the object write.  The raw result is first passed
+// through filter_write_result(); the filtered value then selects between
+// copyup, a restart of the write, failure, or the post-write object map
+// update.
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_write_object(int r) {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+ r = filter_write_result(r);
+ if (r == -ENOENT) {
+ if (m_copyup_enabled) {
+ copyup();
+ return;
+ }
+ // with copyup disabled, -ENOENT falls through to the post-write object
+ // map update below (it is not treated as a failure here)
+ } else if (r == -ERANGE && m_guarding_migration_write) {
+ // re-check whether the image still has migration info
+ image_ctx->snap_lock.get_read();
+ m_guarding_migration_write = !image_ctx->migration_info.empty();
+ image_ctx->snap_lock.put_read();
+
+ if (m_guarding_migration_write) {
+ copyup();
+ } else {
+ ldout(image_ctx->cct, 10) << "migration parent gone, restart io" << dendl;
+ compute_parent_info();
+ write_object();
+ }
+ return;
+ } else if (r == -EILSEQ) {
+ // logged at debug level 10 rather than as an error
+ ldout(image_ctx->cct, 10) << "failed to write object" << dendl;
+ this->finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(image_ctx->cct) << "failed to write object: " << cpp_strerror(r)
+ << dendl;
+ this->finish(r);
+ return;
+ }
+
+ post_write_object_map_update();
+}
+
+// Attaches this write to the CopyupRequest for its object, creating and
+// sending a new one if none is registered in image_ctx->copyup_list.
+// NOTE(review): handle_copyup() is presumably invoked by CopyupRequest for
+// every appended request -- confirm against CopyupRequest.
+template <typename I>
+void AbstractObjectWriteRequest<I>::copyup() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ // only one copyup may be outstanding per write request
+ ceph_assert(!m_copyup_in_progress);
+ m_copyup_in_progress = true;
+
+ image_ctx->copyup_list_lock.Lock();
+ auto it = image_ctx->copyup_list.find(this->m_object_no);
+ if (it == image_ctx->copyup_list.end()) {
+ auto new_req = CopyupRequest<I>::create(
+ image_ctx, this->m_oid, this->m_object_no,
+ std::move(this->m_parent_extents), this->m_trace);
+ // leave the moved-from member in a well-defined (empty) state
+ this->m_parent_extents.clear();
+
+ // make sure to wait on this CopyupRequest
+ new_req->append_request(this);
+ image_ctx->copyup_list[this->m_object_no] = new_req;
+
+ // send only after copyup_list_lock has been released
+ image_ctx->copyup_list_lock.Unlock();
+ new_req->send();
+ } else {
+ // join the copyup already registered for this object
+ it->second->append_request(this);
+ image_ctx->copyup_list_lock.Unlock();
+ }
+}
+
+// Invoked when the copyup this request was waiting on has completed.
+// Errors other than -ERESTART fail the request.  -ERESTART, or a subclass
+// that requires a post-copyup write, re-issues the write; otherwise the
+// request proceeds to the post-write object map update.
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_copyup(int r) {
+  I *ictx = this->m_ictx;
+  ldout(ictx->cct, 20) << "r=" << r << dendl;
+
+  ceph_assert(m_copyup_in_progress);
+  m_copyup_in_progress = false;
+
+  const bool restart = (r == -ERESTART);
+  if (r < 0 && !restart) {
+    lderr(ictx->cct) << "failed to copyup object: " << cpp_strerror(r)
+                     << dendl;
+    this->finish(r);
+    return;
+  }
+
+  if (restart || is_post_copyup_write_required()) {
+    write_object();
+  } else {
+    post_write_object_map_update();
+  }
+}
+
+// Optional final step: moves the object's map entry from OBJECT_PENDING to
+// OBJECT_NONEXISTENT.  Only performed when an object map is in use, updates
+// are enabled, and the subclass reports a non-existent post-write state;
+// otherwise the request finishes with 0 immediately.
+template <typename I>
+void AbstractObjectWriteRequest<I>::post_write_object_map_update() {
+ I *image_ctx = this->m_ictx;
+
+ // snap_lock is released on every exit path below
+ image_ctx->snap_lock.get_read();
+ if (image_ctx->object_map == nullptr || !is_object_map_update_enabled() ||
+ !is_non_existent_post_write_object_map_state()) {
+ image_ctx->snap_lock.put_read();
+ this->finish(0);
+ return;
+ }
+
+ ldout(image_ctx->cct, 20) << dendl;
+
+ // should have been flushed prior to releasing lock
+ ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+ image_ctx->object_map_lock.get_write();
+ // NOTE(review): a true return presumably means the async update was queued
+ // and handle_post_write_object_map_update will run -- confirm
+ if (image_ctx->object_map->template aio_update<
+ AbstractObjectWriteRequest<I>,
+ &AbstractObjectWriteRequest<I>::handle_post_write_object_map_update>(
+ CEPH_NOSNAP, this->m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING,
+ this->m_trace, false, this)) {
+ image_ctx->object_map_lock.put_write();
+ image_ctx->snap_lock.put_read();
+ return;
+ }
+
+ // no update was started: complete right away
+ image_ctx->object_map_lock.put_write();
+ image_ctx->snap_lock.put_read();
+ this->finish(0);
+}
+
+// Callback for the post-write object map update: completes the request with
+// 0 on success, or logs and completes with the error on failure.
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_post_write_object_map_update(int r) {
+  I *ictx = this->m_ictx;
+  ldout(ictx->cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(ictx->cct) << "failed to update object map: "
+                     << cpp_strerror(r) << dendl;
+  } else {
+    // non-negative results are normalized to 0
+    r = 0;
+  }
+  this->finish(r);
+}
+
+// Appends the data write to the operation: a partial write at m_object_off,
+// or write_full when the request covers the whole object.  The op flags are
+// applied afterwards.
+template <typename I>
+void ObjectWriteRequest<I>::add_write_ops(librados::ObjectWriteOperation *wr) {
+  if (!this->m_full_object) {
+    wr->write(this->m_object_off, m_write_data);
+  } else {
+    wr->write_full(m_write_data);
+  }
+
+  wr->set_op_flags2(m_op_flags);
+}
+
+// Appends a writesame of m_write_data over [m_object_off, +m_object_len)
+// and then applies the op flags.
+template <typename I>
+void ObjectWriteSameRequest<I>::add_write_ops(
+    librados::ObjectWriteOperation *wr) {
+  const uint64_t off = this->m_object_off;
+  const uint64_t len = this->m_object_len;
+
+  wr->writesame(off, len, m_write_data);
+  wr->set_op_flags2(m_op_flags);
+}
+
+// Appends the compare (cmpext against m_cmp_bl at m_object_off) followed by
+// the write of m_write_bl -- write_full for a whole-object request, a
+// positioned write otherwise -- and then applies the op flags.
+template <typename I>
+void ObjectCompareAndWriteRequest<I>::add_write_ops(
+    librados::ObjectWriteOperation *wr) {
+  // the comparison must be queued ahead of the write
+  wr->cmpext(this->m_object_off, m_cmp_bl, nullptr);
+
+  if (!this->m_full_object) {
+    wr->write(this->m_object_off, m_write_bl);
+  } else {
+    wr->write_full(m_write_bl);
+  }
+
+  wr->set_op_flags2(m_op_flags);
+}
+
+// Translates a compare-mismatch result (r <= -MAX_ERRNO) into -EILSEQ and,
+// when a mismatch-offset output pointer was supplied, stores the image
+// offset of the mismatch there.  All other results are returned unchanged.
+template <typename I>
+int ObjectCompareAndWriteRequest<I>::filter_write_result(int r) const {
+  if (r > -MAX_ERRNO) {
+    return r;
+  }
+
+  I *ictx = this->m_ictx;
+
+  // object extent compare mismatch
+  uint64_t offset = -MAX_ERRNO - r;
+  Extents image_extents;
+  Striper::extent_to_file(ictx->cct, &ictx->layout, this->m_object_no,
+                          offset, this->m_object_len, image_extents);
+  ceph_assert(image_extents.size() == 1);
+
+  if (m_mismatch_offset) {
+    *m_mismatch_offset = image_extents[0].first;
+  }
+  return -EILSEQ;
+}
+
+} // namespace io
+} // namespace librbd
+
+// Explicit instantiations of every request template for librbd::ImageCtx.
+template class librbd::io::ObjectRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
+template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectRequest.h b/src/librbd/io/ObjectRequest.h
new file mode 100644
index 00000000..9452ec43
--- /dev/null
+++ b/src/librbd/io/ObjectRequest.h
@@ -0,0 +1,478 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_REQUEST_H
+#define CEPH_LIBRBD_IO_OBJECT_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+class Context;
+class ObjectExtent;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> class CopyupRequest;
+
+/**
+ * This class represents an I/O operation to a single RBD data object.
+ * Its subclasses encapsulate logic for dealing with special cases
+ * for I/O due to layering.
+ *
+ * The static create_* helpers return a request of the matching write
+ * flavor as a base-class pointer; the caller starts it with send().
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectRequest {
+public:
+ // factory helpers, one per write-style operation
+ static ObjectRequest* create_write(
+ ImageCtxT *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, ceph::bufferlist&& data, const ::SnapContext &snapc,
+ int op_flags, const ZTracer::Trace &parent_trace, Context *completion);
+ static ObjectRequest* create_discard(
+ ImageCtxT *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, uint64_t object_len, const ::SnapContext &snapc,
+ int discard_flags, const ZTracer::Trace &parent_trace,
+ Context *completion);
+ static ObjectRequest* create_write_same(
+ ImageCtxT *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, uint64_t object_len, ceph::bufferlist&& data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *completion);
+ static ObjectRequest* create_compare_and_write(
+ ImageCtxT *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, const ::SnapContext &snapc,
+ uint64_t *mismatch_offset, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *completion);
+
+ ObjectRequest(ImageCtxT *ictx, const std::string &oid,
+ uint64_t objectno, uint64_t off, uint64_t len,
+ librados::snap_t snap_id, const char *trace_name,
+ const ZTracer::Trace &parent_trace, Context *completion);
+ // records a "finish" trace event when the request is destroyed
+ virtual ~ObjectRequest() {
+ m_trace.event("finish");
+ }
+
+ static void add_write_hint(ImageCtxT& image_ctx,
+ librados::ObjectWriteOperation *wr);
+
+ // starts the request's state machine
+ virtual void send() = 0;
+
+ bool has_parent() const {
+ return m_has_parent;
+ }
+
+ // short operation name used in log output
+ virtual const char *get_op_type() const = 0;
+
+protected:
+ bool compute_parent_extents(Extents *parent_extents, bool read_request);
+
+ ImageCtxT *m_ictx;
+ std::string m_oid;
+ uint64_t m_object_no, m_object_off, m_object_len;
+ librados::snap_t m_snap_id;
+ Context *m_completion;
+ ZTracer::Trace m_trace;
+
+ void async_finish(int r);
+ void finish(int r);
+
+private:
+ bool m_has_parent = false;
+};
+
+/**
+ * Read of a single data object.  Falls back to the parent image when the
+ * object is missing and may trigger a copyup afterwards (see the state
+ * diagram below).
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectReadRequest : public ObjectRequest<ImageCtxT> {
+public:
+ typedef std::map<uint64_t, uint64_t> ExtentMap;
+
+ // heap-allocates a request; arguments are forwarded to the constructor
+ static ObjectReadRequest* create(ImageCtxT *ictx, const std::string &oid,
+ uint64_t objectno, uint64_t offset,
+ uint64_t len, librados::snap_t snap_id,
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ ceph::bufferlist* read_data,
+ ExtentMap* extent_map, Context *completion) {
+ return new ObjectReadRequest(ictx, oid, objectno, offset, len,
+ snap_id, op_flags, parent_trace, read_data,
+ extent_map, completion);
+ }
+
+ ObjectReadRequest(ImageCtxT *ictx, const std::string &oid,
+ uint64_t objectno, uint64_t offset, uint64_t len,
+ librados::snap_t snap_id, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ ceph::bufferlist* read_data, ExtentMap* extent_map,
+ Context *completion);
+
+ void send() override;
+
+ const char *get_op_type() const override {
+ return "read";
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |
+ * v
+ * READ_OBJECT
+ * |
+ * v (skip if not needed)
+ * READ_PARENT
+ * |
+ * v (skip if not needed)
+ * COPYUP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ int m_op_flags;
+
+ // caller-provided output locations (stored as raw pointers)
+ ceph::bufferlist* m_read_data;
+ ExtentMap* m_extent_map;
+
+ void read_object();
+ void handle_read_object(int r);
+
+ void read_parent();
+ void handle_read_parent(int r);
+
+ void copyup();
+};
+
+/**
+ * Common state machine for all write-style object requests (write, discard,
+ * writesame, compare-and-write).  Subclasses provide the actual librados
+ * ops through add_write_ops() and tune the flow through the virtual
+ * is_*() / get_*() hooks.
+ */
+template <typename ImageCtxT = ImageCtx>
+class AbstractObjectWriteRequest : public ObjectRequest<ImageCtxT> {
+public:
+ AbstractObjectWriteRequest(ImageCtxT *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ uint64_t len, const ::SnapContext &snapc,
+ const char *trace_name,
+ const ZTracer::Trace &parent_trace,
+ Context *completion);
+
+ virtual bool is_empty_write_op() const {
+ return false;
+ }
+
+ // object map state to set before the write is issued
+ virtual uint8_t get_pre_write_object_map_state() const {
+ return OBJECT_EXISTS;
+ }
+
+ // ops to add to a copyup operation; defaults to the regular write ops
+ virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {
+ add_write_ops(wr);
+ }
+
+ void handle_copyup(int r);
+
+ void send() override;
+
+protected:
+ bool m_full_object = false;
+ bool m_copyup_enabled = true;
+
+ // hooks with conservative defaults; overridden by specific request types
+ virtual bool is_no_op_for_nonexistent_object() const {
+ return false;
+ }
+ virtual bool is_object_map_update_enabled() const {
+ return true;
+ }
+ virtual bool is_post_copyup_write_required() const {
+ return false;
+ }
+ virtual bool is_non_existent_post_write_object_map_state() const {
+ return false;
+ }
+
+ virtual void add_write_hint(librados::ObjectWriteOperation *wr);
+ virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
+
+ // lets a subclass translate the raw write result before it is handled
+ virtual int filter_write_result(int r) const {
+ return r;
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (no-op write request)
+ * DETECT_NO_OP . . . . . . . . . . . . . . . . . . .
+ * | .
+ * v (skip if not required/disabled) .
+ * PRE_UPDATE_OBJECT_MAP .
+ * | . .
+ * | . (child dne) .
+ * | . . . . . . . . . .
+ * | . .
+ * | (post-copyup write) . .
+ * | . . . . . . . . . . . . . .
+ * | . . . .
+ * v v . v .
+ * WRITE . . . . . . . . > COPYUP (if required) .
+ * | | .
+ * |/----------------------/ .
+ * | .
+ * v (skip if not required/disabled) .
+ * POST_UPDATE_OBJECT_MAP .
+ * | .
+ * v .
+ * <finish> < . . . . . . . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ // snapshot context captured at construction
+ uint64_t m_snap_seq;
+ std::vector<librados::snap_t> m_snaps;
+
+ Extents m_parent_extents;
+ bool m_object_may_exist = false;
+ bool m_copyup_in_progress = false;
+ bool m_guarding_migration_write = false;
+
+ void compute_parent_info();
+
+ void pre_write_object_map_update();
+ void handle_pre_write_object_map_update(int r);
+
+ void write_object();
+ void handle_write_object(int r);
+
+ void copyup();
+
+ void post_write_object_map_update();
+ void handle_post_write_object_map_update(int r);
+
+};
+
+/**
+ * Plain data write to a single object.  The request length is taken from
+ * the length of the supplied bufferlist, which is moved into the request.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+ ObjectWriteRequest(ImageCtxT *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, const ::SnapContext &snapc,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ Context *completion)
+ : AbstractObjectWriteRequest<ImageCtxT>(ictx, oid, object_no, object_off,
+ data.length(), snapc, "write",
+ parent_trace, completion),
+ m_write_data(std::move(data)), m_op_flags(op_flags) {
+ }
+
+ // true when there is no payload to write
+ bool is_empty_write_op() const override {
+ return (m_write_data.length() == 0);
+ }
+
+ const char *get_op_type() const override {
+ return "write";
+ }
+
+protected:
+ void add_write_ops(librados::ObjectWriteOperation *wr) override;
+
+private:
+ ceph::bufferlist m_write_data;
+ int m_op_flags;
+};
+
+/**
+ * Discard of (part of) a single object.  The constructor selects one of
+ * four concrete actions -- remove, create+truncate, truncate or zero --
+ * from the extent, the discard flags and the parent/copyup state.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectDiscardRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+ ObjectDiscardRequest(ImageCtxT *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, const ::SnapContext &snapc,
+ int discard_flags, const ZTracer::Trace &parent_trace,
+ Context *completion)
+ : AbstractObjectWriteRequest<ImageCtxT>(ictx, oid, object_no, object_off,
+ object_len, snapc, "discard",
+ parent_trace, completion),
+ m_discard_flags(discard_flags) {
+ if (this->m_full_object) {
+ // whole-object discard: remove, unless clone removal is disabled and a
+ // parent exists, in which case the object is truncated instead
+ if ((m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE) != 0 &&
+ this->has_parent()) {
+ if (!this->m_copyup_enabled) {
+ // need to hide the parent object instead of child object
+ m_discard_action = DISCARD_ACTION_REMOVE_TRUNCATE;
+ } else {
+ m_discard_action = DISCARD_ACTION_TRUNCATE;
+ }
+ this->m_object_len = 0;
+ } else {
+ m_discard_action = DISCARD_ACTION_REMOVE;
+ }
+ } else if (object_off + object_len == ictx->layout.object_size) {
+ // discard reaches the end of the object: truncate at object_off
+ m_discard_action = DISCARD_ACTION_TRUNCATE;
+ } else {
+ // interior range: zero it
+ m_discard_action = DISCARD_ACTION_ZERO;
+ }
+ }
+
+ const char* get_op_type() const override {
+ switch (m_discard_action) {
+ case DISCARD_ACTION_REMOVE:
+ return "remove";
+ case DISCARD_ACTION_REMOVE_TRUNCATE:
+ return "remove (create+truncate)";
+ case DISCARD_ACTION_TRUNCATE:
+ return "truncate";
+ case DISCARD_ACTION_ZERO:
+ return "zero";
+ }
+ // unreachable for valid enum values
+ ceph_abort();
+ return nullptr;
+ }
+
+ // a removal marks the object as pending before the op is issued
+ uint8_t get_pre_write_object_map_state() const override {
+ if (m_discard_action == DISCARD_ACTION_REMOVE) {
+ return OBJECT_PENDING;
+ }
+ return OBJECT_EXISTS;
+ }
+
+protected:
+ // discarding a nonexistent object only matters when a parent exists
+ bool is_no_op_for_nonexistent_object() const override {
+ return (!this->has_parent());
+ }
+ bool is_object_map_update_enabled() const override {
+ return (
+ (m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE) == 0);
+ }
+ bool is_non_existent_post_write_object_map_state() const override {
+ return (m_discard_action == DISCARD_ACTION_REMOVE);
+ }
+
+ void add_write_hint(librados::ObjectWriteOperation *wr) override {
+ // no hint for discard
+ }
+
+ void add_write_ops(librados::ObjectWriteOperation *wr) override {
+ switch (m_discard_action) {
+ case DISCARD_ACTION_REMOVE:
+ wr->remove();
+ break;
+ case DISCARD_ACTION_REMOVE_TRUNCATE:
+ // non-exclusive create, followed by the truncate below
+ wr->create(false);
+ // fall through
+ case DISCARD_ACTION_TRUNCATE:
+ wr->truncate(this->m_object_off);
+ break;
+ case DISCARD_ACTION_ZERO:
+ wr->zero(this->m_object_off, this->m_object_len);
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ }
+
+private:
+ enum DiscardAction {
+ DISCARD_ACTION_REMOVE,
+ DISCARD_ACTION_REMOVE_TRUNCATE,
+ DISCARD_ACTION_TRUNCATE,
+ DISCARD_ACTION_ZERO
+ };
+
+ // assigned on every path of the constructor body
+ DiscardAction m_discard_action;
+ int m_discard_flags;
+
+};
+
+/**
+ * writesame request for a single object: writes the supplied data over the
+ * extent [object_off, object_off + object_len) via the librados writesame
+ * op.  The data buffer is moved into the request.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectWriteSameRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+ ObjectWriteSameRequest(ImageCtxT *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, ceph::bufferlist&& data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion)
+ : AbstractObjectWriteRequest<ImageCtxT>(ictx, oid, object_no, object_off,
+ object_len, snapc, "writesame",
+ parent_trace, completion),
+ m_write_data(std::move(data)), m_op_flags(op_flags) {
+ }
+
+ const char *get_op_type() const override {
+ return "writesame";
+ }
+
+protected:
+ void add_write_ops(librados::ObjectWriteOperation *wr) override;
+
+private:
+ ceph::bufferlist m_write_data;
+ int m_op_flags;
+};
+
+/**
+ * Compare-and-write request for a single object: the object extent is
+ * compared against cmp_bl and, if it matches, overwritten with write_bl.
+ * The request length is the length of cmp_bl.  On a compare mismatch the
+ * result is mapped to -EILSEQ and, when mismatch_offset is non-null, the
+ * image offset of the mismatch is stored through it (see
+ * filter_write_result()).
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectCompareAndWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectCompareAndWriteRequest(ImageCtxT *ictx, const std::string &oid,
+                               uint64_t object_no, uint64_t object_off,
+                               ceph::bufferlist&& cmp_bl,
+                               ceph::bufferlist&& write_bl,
+                               const ::SnapContext &snapc,
+                               uint64_t *mismatch_offset, int op_flags,
+                               const ZTracer::Trace &parent_trace,
+                               Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, oid, object_no, object_off,
+                                            cmp_bl.length(), snapc,
+                                            "compare_and_write", parent_trace,
+                                            completion),
+      m_cmp_bl(std::move(cmp_bl)), m_write_bl(std::move(write_bl)),
+      m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+  }
+
+  const char *get_op_type() const override {
+    return "compare_and_write";
+  }
+
+  void add_copyup_ops(librados::ObjectWriteOperation *wr) override {
+    // no-op on copyup
+  }
+
+protected:
+  // the compare+write is always re-issued after a copyup completes;
+  // marked override (like the other overriders in this header) so a
+  // signature drift in the base class is caught at compile time
+  bool is_post_copyup_write_required() const override {
+    return true;
+  }
+
+  void add_write_ops(librados::ObjectWriteOperation *wr) override;
+
+  int filter_write_result(int r) const override;
+
+private:
+  ceph::bufferlist m_cmp_bl;
+  ceph::bufferlist m_write_bl;
+  uint64_t *m_mismatch_offset;  // optional output; may be null
+  int m_op_flags;
+};
+
+} // namespace io
+} // namespace librbd
+
+// Explicit instantiation declarations for the librbd::ImageCtx
+// specializations, so including translation units do not instantiate them
+// implicitly.
+extern template class librbd::io::ObjectRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
+extern template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_REQUEST_H
diff --git a/src/librbd/io/ReadResult.cc b/src/librbd/io/ReadResult.cc
new file mode 100644
index 00000000..c24d8b4a
--- /dev/null
+++ b/src/librbd/io/ReadResult.cc
@@ -0,0 +1,170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ReadResult.h"
+#include "include/buffer.h"
+#include "common/dout.h"
+#include "librbd/io/AioCompletion.h"
+#include <boost/variant/apply_visitor.hpp>
+#include <boost/variant/static_visitor.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ReadResult: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+// Variant visitor that shrinks a Linear destination buffer to `length`
+// bytes.  All other buffer alternatives are left untouched.
+struct ReadResult::SetClipLengthVisitor : public boost::static_visitor<void> {
+ size_t length;
+
+ explicit SetClipLengthVisitor(size_t length) : length(length) {
+ }
+
+ void operator()(Linear &linear) const {
+ // clipping may only reduce the usable length
+ ceph_assert(length <= linear.buf_len);
+ linear.buf_len = length;
+ }
+
+ // catch-all for the remaining alternatives: nothing to clip
+ template <typename T>
+ void operator()(T &t) const {
+ }
+};
+
+// Variant visitor that moves/copies the destriped read data into whichever
+// destination buffer type the ReadResult was constructed with.
+struct ReadResult::AssembleResultVisitor : public boost::static_visitor<void> {
+ CephContext *cct;
+ Striper::StripedReadResult &destriper;
+
+ AssembleResultVisitor(CephContext *cct, Striper::StripedReadResult &destriper)
+ : cct(cct), destriper(destriper) {
+ }
+
+ // no destination buffer: the data is discarded
+ void operator()(Empty &empty) const {
+ ldout(cct, 20) << "dropping read result" << dendl;
+ }
+
+ // flat user buffer: assemble directly into it
+ void operator()(Linear &linear) const {
+ ldout(cct, 20) << "copying resulting bytes to "
+ << reinterpret_cast<void*>(linear.buf) << dendl;
+ destriper.assemble_result(cct, linear.buf, linear.buf_len);
+ }
+
+ // iovec array: assemble into a temporary bufferlist, then scatter it
+ // across the iovec entries in order
+ void operator()(Vector &vector) const {
+ bufferlist bl;
+ destriper.assemble_result(cct, bl, true);
+
+ ldout(cct, 20) << "copying resulting " << bl.length() << " bytes to iovec "
+ << reinterpret_cast<const void*>(vector.iov) << dendl;
+
+ bufferlist::iterator it = bl.begin();
+ size_t length = bl.length();
+ size_t offset = 0;
+ int idx = 0;
+ for (; offset < length && idx < vector.iov_count; idx++) {
+ // copy at most what fits in this entry and what is left to copy
+ size_t len = std::min(vector.iov[idx].iov_len, length - offset);
+ it.copy(len, static_cast<char *>(vector.iov[idx].iov_base));
+ offset += len;
+ }
+ // the iovec array must have been large enough to hold everything
+ ceph_assert(offset == bl.length());
+ }
+
+ // caller-supplied bufferlist: cleared first, then filled
+ void operator()(Bufferlist &bufferlist) const {
+ bufferlist.bl->clear();
+ destriper.assemble_result(cct, *bufferlist.bl, true);
+
+ ldout(cct, 20) << "moved resulting " << bufferlist.bl->length() << " "
+ << "bytes to bl " << reinterpret_cast<void*>(bufferlist.bl)
+ << dendl;
+ }
+};
+
+// Stores a copy of the image extents and registers one pending request on
+// the AioCompletion.
+// NOTE(review): image_extents is taken by const value and then copied into
+// the member; a const reference would avoid one copy, but the matching
+// declaration in the header would have to change too.
+ReadResult::C_ImageReadRequest::C_ImageReadRequest(
+ AioCompletion *aio_completion, const Extents image_extents)
+ : aio_completion(aio_completion), image_extents(image_extents) {
+ aio_completion->add_request();
+}
+
+// Completion of an image-level read.  On success the received buffer is
+// handed to the AioCompletion's destriper (under the completion's lock) and
+// the request completes with the total extent length; on failure the error
+// is passed through unchanged.
+void ReadResult::C_ImageReadRequest::finish(int r) {
+  CephContext *cct = aio_completion->ictx->cct;
+  ldout(cct, 10) << "C_ImageReadRequest: r=" << r
+                 << dendl;
+
+  if (r < 0) {
+    aio_completion->complete_request(r);
+    return;
+  }
+
+  // total number of bytes covered by the requested image extents
+  size_t length = 0;
+  for (auto &extent : image_extents) {
+    length += extent.second;
+  }
+  ceph_assert(length == bl.length());
+
+  aio_completion->lock.Lock();
+  aio_completion->read_result.m_destriper.add_partial_result(
+    cct, bl, image_extents);
+  aio_completion->lock.Unlock();
+
+  r = length;
+  aio_completion->complete_request(r);
+}
+
+// Captures the object extent and takes ownership of the buffer extents,
+// then registers one pending request on the AioCompletion.
+ReadResult::C_ObjectReadRequest::C_ObjectReadRequest(
+ AioCompletion *aio_completion, uint64_t object_off, uint64_t object_len,
+ Extents&& buffer_extents)
+ : aio_completion(aio_completion), object_off(object_off),
+ object_len(object_len), buffer_extents(std::move(buffer_extents)) {
+ aio_completion->add_request();
+}
+
+// Completion of an object-level read.  -ENOENT is treated as success.  On
+// success the (possibly sparse) data is handed to the AioCompletion's
+// destriper and the request completes with object_len; errors are passed
+// through unchanged.
+void ReadResult::C_ObjectReadRequest::finish(int r) {
+ CephContext *cct = aio_completion->ictx->cct;
+ ldout(cct, 10) << "C_ObjectReadRequest: r=" << r
+ << dendl;
+
+ // a missing object is not an error for the caller
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r >= 0) {
+ ldout(cct, 10) << " got " << extent_map
+ << " for " << buffer_extents
+ << " bl " << bl.length() << dendl;
+ // handle the case where a sparse-read wasn't issued
+ if (extent_map.empty()) {
+ extent_map[object_off] = bl.length();
+ }
+
+ // the destriper is shared by all sub-requests of this completion
+ aio_completion->lock.Lock();
+ aio_completion->read_result.m_destriper.add_partial_sparse_result(
+ cct, bl, extent_map, object_off, buffer_extents);
+ aio_completion->lock.Unlock();
+
+ // report the requested length, not the number of bytes actually read
+ r = object_len;
+ }
+
+ aio_completion->complete_request(r);
+}
+
+// Constructors: each selects the destination buffer alternative stored in
+// m_buffer.
+
+// no destination (Empty alternative)
+ReadResult::ReadResult() : m_buffer(Empty()) {
+}
+
+// flat character buffer of buf_len bytes
+ReadResult::ReadResult(char *buf, size_t buf_len)
+ : m_buffer(Linear(buf, buf_len)) {
+}
+
+// array of iov_count iovec entries
+ReadResult::ReadResult(const struct iovec *iov, int iov_count)
+ : m_buffer(Vector(iov, iov_count)) {
+}
+
+// caller-supplied bufferlist
+ReadResult::ReadResult(ceph::bufferlist *bl)
+ : m_buffer(Bufferlist(bl)) {
+}
+
+// Applies SetClipLengthVisitor to the destination buffer; only a Linear
+// buffer is affected.
+void ReadResult::set_clip_length(size_t length) {
+ boost::apply_visitor(SetClipLengthVisitor(length), m_buffer);
+}
+
+// Applies AssembleResultVisitor to deliver the data collected in
+// m_destriper to the destination buffer.
+void ReadResult::assemble_result(CephContext *cct) {
+ boost::apply_visitor(AssembleResultVisitor(cct, m_destriper), m_buffer);
+}
+
+} // namespace io
+} // namespace librbd
+
diff --git a/src/librbd/io/ReadResult.h b/src/librbd/io/ReadResult.h
new file mode 100644
index 00000000..6fb5e4a4
--- /dev/null
+++ b/src/librbd/io/ReadResult.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_READ_RESULT_H
+#define CEPH_LIBRBD_IO_READ_RESULT_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "librbd/io/Types.h"
+#include "osdc/Striper.h"
+#include <sys/uio.h>
+#include <boost/variant/variant.hpp>
+
+struct CephContext;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> struct ObjectReadRequest;
+
+// Holds the destination of a read (one of several buffer kinds stored in a
+// boost::variant) together with the Striper::StripedReadResult used to
+// collect partial results.
+class ReadResult {
+public:
+ // Context for an image-level read; `bl` is the buffer for the data covering
+ // `image_extents` (NOTE(review): filled in by the issuer -- confirm).
+ struct C_ImageReadRequest : public Context {
+ AioCompletion *aio_completion;
+ Extents image_extents;
+ bufferlist bl;
+
+ C_ImageReadRequest(AioCompletion *aio_completion,
+ const Extents image_extents);
+
+ void finish(int r) override;
+ };
+
+ // Context for an object-level read of `object_len` bytes at `object_off`.
+ // `bl` and `extent_map` carry the read data and its extent map;
+ // `buffer_extents` are forwarded to the destriper on completion.
+ struct C_ObjectReadRequest : public Context {
+ AioCompletion *aio_completion;
+ uint64_t object_off;
+ uint64_t object_len;
+ Extents buffer_extents;
+
+ bufferlist bl;
+ ExtentMap extent_map;
+
+ C_ObjectReadRequest(AioCompletion *aio_completion, uint64_t object_off,
+ uint64_t object_len, Extents&& buffer_extents);
+
+ void finish(int r) override;
+ };
+
+ // One constructor per supported destination kind (none, linear buffer,
+ // iovec array, bufferlist).
+ ReadResult();
+ ReadResult(char *buf, size_t buf_len);
+ ReadResult(const struct iovec *iov, int iov_count);
+ ReadResult(ceph::bufferlist *bl);
+
+ void set_clip_length(size_t length);
+ void assemble_result(CephContext *cct);
+
+private:
+ // Variant alternative: no destination buffer.
+ struct Empty {
+ };
+
+ // Variant alternative: single character buffer and its length.
+ struct Linear {
+ char *buf;
+ size_t buf_len;
+
+ Linear(char *buf, size_t buf_len) : buf(buf), buf_len(buf_len) {
+ }
+ };
+
+ // Variant alternative: iovec array and its entry count.
+ struct Vector {
+ const struct iovec *iov;
+ int iov_count;
+
+ Vector(const struct iovec *iov, int iov_count)
+ : iov(iov), iov_count(iov_count) {
+ }
+ };
+
+ // Variant alternative: pointer to a bufferlist.
+ struct Bufferlist {
+ ceph::bufferlist *bl;
+
+ Bufferlist(ceph::bufferlist *bl) : bl(bl) {
+ }
+ };
+
+ typedef boost::variant<Empty,
+ Linear,
+ Vector,
+ Bufferlist> Buffer;
+ // Visitors applied to Buffer by set_clip_length() / assemble_result();
+ // declared here, defined elsewhere.
+ struct SetClipLengthVisitor;
+ struct AssembleResultVisitor;
+
+ Buffer m_buffer;
+ Striper::StripedReadResult m_destriper;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_READ_RESULT_H
+
diff --git a/src/librbd/io/Types.h b/src/librbd/io/Types.h
new file mode 100644
index 00000000..6bd42eac
--- /dev/null
+++ b/src/librbd/io/Types.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_TYPES_H
+#define CEPH_LIBRBD_IO_TYPES_H
+
+#include "include/int_types.h"
+#include <map>
+#include <vector>
+
+namespace librbd {
+namespace io {
+
+// QoS throttle flag bits. Each expansion is parenthesized so a flag keeps its
+// value when it appears next to operators that bind tighter than `<<`
+// (for example `FLAG * 2` or `FLAG + 1`).
+#define RBD_QOS_IOPS_THROTTLE           (1 << 0)
+#define RBD_QOS_BPS_THROTTLE            (1 << 1)
+#define RBD_QOS_READ_IOPS_THROTTLE      (1 << 2)
+#define RBD_QOS_WRITE_IOPS_THROTTLE     (1 << 3)
+#define RBD_QOS_READ_BPS_THROTTLE       (1 << 4)
+#define RBD_QOS_WRITE_BPS_THROTTLE      (1 << 5)
+
+// Masks grouping the flags above by unit (bytes vs. ops) and by direction.
+#define RBD_QOS_BPS_MASK (RBD_QOS_BPS_THROTTLE | RBD_QOS_READ_BPS_THROTTLE | RBD_QOS_WRITE_BPS_THROTTLE)
+#define RBD_QOS_IOPS_MASK (RBD_QOS_IOPS_THROTTLE | RBD_QOS_READ_IOPS_THROTTLE | RBD_QOS_WRITE_IOPS_THROTTLE)
+#define RBD_QOS_READ_MASK (RBD_QOS_READ_BPS_THROTTLE | RBD_QOS_READ_IOPS_THROTTLE)
+#define RBD_QOS_WRITE_MASK (RBD_QOS_WRITE_BPS_THROTTLE | RBD_QOS_WRITE_IOPS_THROTTLE)
+
+// Union of every QoS flag.
+#define RBD_QOS_MASK (RBD_QOS_BPS_MASK | RBD_QOS_IOPS_MASK)
+
+typedef enum {
+ AIO_TYPE_NONE = 0,
+ AIO_TYPE_GENERIC,
+ AIO_TYPE_OPEN,
+ AIO_TYPE_CLOSE,
+ AIO_TYPE_READ,
+ AIO_TYPE_WRITE,
+ AIO_TYPE_DISCARD,
+ AIO_TYPE_FLUSH,
+ AIO_TYPE_WRITESAME,
+ AIO_TYPE_COMPARE_AND_WRITE,
+} aio_type_t;
+
+enum FlushSource {
+ FLUSH_SOURCE_USER,
+ FLUSH_SOURCE_INTERNAL,
+ FLUSH_SOURCE_SHUTDOWN
+};
+
+enum Direction {
+ DIRECTION_READ,
+ DIRECTION_WRITE,
+ DIRECTION_BOTH
+};
+
+enum DispatchResult {
+ DISPATCH_RESULT_INVALID,
+ DISPATCH_RESULT_CONTINUE,
+ DISPATCH_RESULT_COMPLETE
+};
+
+enum ObjectDispatchLayer {
+ OBJECT_DISPATCH_LAYER_NONE = 0,
+ OBJECT_DISPATCH_LAYER_CACHE,
+ OBJECT_DISPATCH_LAYER_JOURNAL,
+ OBJECT_DISPATCH_LAYER_CORE,
+ OBJECT_DISPATCH_LAYER_LAST
+};
+
+enum {
+ OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE = 1UL << 0,
+ OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE = 1UL << 1
+};
+
+// Bit flags carried in the `object_dispatch_flags` argument of the object
+// dispatch calls. The journal dispatch layer flushes (rather than waits for)
+// the journal event when FLUSH is set, and skips committing a failed IO
+// extent when WILL_RETRY_ON_ERROR is set.
+enum {
+ OBJECT_DISPATCH_FLAG_FLUSH = 1UL << 0,
+ OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR = 1UL << 1
+};
+
+// Extents: list of uint64_t pairs -- presumably (offset, length); confirm
+// against users. ExtentMap: ordered uint64_t -> uint64_t map, used for
+// sparse-read results keyed by object offset.
+typedef std::vector<std::pair<uint64_t, uint64_t> > Extents;
+typedef std::map<uint64_t, uint64_t> ExtentMap;
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_TYPES_H
diff --git a/src/librbd/io/Utils.cc b/src/librbd/io/Utils.cc
new file mode 100644
index 00000000..1b50561a
--- /dev/null
+++ b/src/librbd/io/Utils.cc
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/Utils.h"
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+
+namespace librbd {
+namespace io {
+namespace util {
+
+// Fills `ws_data` with the payload for a write-same against one object
+// extent.
+//
+// When `force_write` is false and every buffer extent's offset and length is
+// a multiple of the pattern length, a single copy of `data` is appended and
+// true is returned. Otherwise the pattern is expanded extent by extent into
+// `ws_data` and false is returned.
+bool assemble_write_same_extent(
+    const ObjectExtent &object_extent, const ceph::bufferlist& data,
+    ceph::bufferlist *ws_data, bool force_write) {
+  size_t pattern_len = data.length();
+
+  if (!force_write) {
+    bool aligned = true;
+    for (auto& extent : object_extent.buffer_extents) {
+      if (extent.first % pattern_len != 0 ||
+          extent.second % pattern_len != 0) {
+        aligned = false;
+        break;
+      }
+    }
+
+    if (aligned) {
+      ws_data->append(data);
+      return true;
+    }
+  }
+
+  for (auto& extent : object_extent.buffer_extents) {
+    bufferlist piece;
+    // the first piece may begin mid-pattern; once it has been emitted the
+    // offset is reset so later pieces are whole copies of the pattern
+    uint64_t piece_off = extent.first % pattern_len;
+    uint64_t piece_len = pattern_len - piece_off;
+    uint64_t remaining = extent.second;
+    while (remaining >= piece_len) {
+      piece.substr_of(data, piece_off, piece_len);
+      ws_data->claim_append(piece);
+      remaining -= piece_len;
+      if (piece_off) {
+        piece_off = 0;
+        piece_len = pattern_len;
+      }
+    }
+    // trailing partial copy of the pattern, if any bytes are left
+    if (remaining) {
+      piece.substr_of(data, piece_off, remaining);
+      ws_data->claim_append(piece);
+    }
+  }
+  return false;
+}
+
+} // namespace util
+} // namespace io
+} // namespace librbd
+
diff --git a/src/librbd/io/Utils.h b/src/librbd/io/Utils.h
new file mode 100644
index 00000000..c1f373d4
--- /dev/null
+++ b/src/librbd/io/Utils.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_UTILS_H
+#define CEPH_LIBRBD_IO_UTILS_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include <map>
+
+class ObjectExtent;
+
+namespace librbd {
+namespace io {
+namespace util {
+
+bool assemble_write_same_extent(const ObjectExtent &object_extent,
+ const ceph::bufferlist& data,
+ ceph::bufferlist *ws_data,
+ bool force_write);
+
+} // namespace util
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_UTILS_H
diff --git a/src/librbd/journal/CreateRequest.cc b/src/librbd/journal/CreateRequest.cc
new file mode 100644
index 00000000..4f10ec65
--- /dev/null
+++ b/src/librbd/journal/CreateRequest.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "librbd/Utils.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "journal/Settings.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Journal::CreateRequest: "
+
+namespace librbd {
+
+using util::create_context_callback;
+
+namespace journal {
+
+template<typename I>
+CreateRequest<I>::CreateRequest(IoCtx &ioctx, const std::string &imageid,
+ uint8_t order, uint8_t splay_width,
+ const std::string &object_pool,
+ uint64_t tag_class, TagData &tag_data,
+ const std::string &client_id,
+ ContextWQ *op_work_queue,
+ Context *on_finish)
+ : m_ioctx(ioctx), m_image_id(imageid), m_order(order),
+ m_splay_width(splay_width), m_object_pool(object_pool),
+ m_tag_class(tag_class), m_tag_data(tag_data), m_image_client_id(client_id),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+ m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+}
+
+// Entry point of the state machine: validates the journal layout parameters
+// and, if they are acceptable, continues with get_pool_id(). Invalid
+// parameters complete the request immediately with -EDOM (order) or -EINVAL
+// (splay width).
+template<typename I>
+void CreateRequest<I>::send() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  if (m_order > 64 || m_order < 12) {
+    lderr(m_cct) << "order must be in the range [12, 64]" << dendl;
+    complete(-EDOM);
+    return;
+  }
+  if (m_splay_width == 0) {
+    // report the rejection like the order check does instead of failing
+    // silently
+    lderr(m_cct) << "splay width must be greater than zero" << dendl;
+    complete(-EINVAL);
+    return;
+  }
+
+  get_pool_id();
+}
+
+// Resolves the id of the journal object pool. When no pool name was given,
+// m_pool_id keeps its default and the journal is created right away;
+// otherwise the named pool is opened to read its id. A failure to open the
+// pool completes the request with that error.
+template<typename I>
+void CreateRequest<I>::get_pool_id() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ if (m_object_pool.empty()) {
+ create_journal();
+ return;
+ }
+
+ librados::Rados rados(m_ioctx);
+ IoCtx data_ioctx;
+ int r = rados.ioctx_create(m_object_pool.c_str(), data_ioctx);
+ if (r != 0) {
+ lderr(m_cct) << "failed to create journal: "
+ << "error opening journal object pool '" << m_object_pool
+ << "': " << cpp_strerror(r) << dendl;
+ complete(r);
+ return;
+ }
+ data_ioctx.set_namespace(m_ioctx.get_namespace());
+
+ // only the pool id is kept; data_ioctx itself goes out of scope on return
+ m_pool_id = data_ioctx.get_id();
+ create_journal();
+}
+
+// Instantiates the Journaler (using the timer obtained from ImageCtx) and
+// asks it to create the journal; handle_create_journal() receives the result.
+template<typename I>
+void CreateRequest<I>::create_journal() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  ImageCtx::get_timer_instance(m_cct, &m_timer, &m_timer_lock);
+  m_journaler = new Journaler(m_op_work_queue, m_timer, m_timer_lock,
+                              m_ioctx, m_image_id, m_image_client_id, {});
+
+  Context *on_created = create_context_callback<
+    CreateRequest<I>, &CreateRequest<I>::handle_create_journal>(this);
+  m_journaler->create(m_order, m_splay_width, m_pool_id, on_created);
+}
+
+// Result handler for the journal creation: on success continues with the tag
+// allocation, on failure shuts the journaler down while remembering the
+// error. Always returns nullptr.
+template<typename I>
+Context *CreateRequest<I>::handle_create_journal(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    allocate_journal_tag();
+    return nullptr;
+  }
+
+  lderr(m_cct) << "failed to create journal: " << cpp_strerror(*result) << dendl;
+  shut_down_journaler(*result);
+  return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::allocate_journal_tag() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_journal_tag>(this);
+
+ encode(m_tag_data, m_bl);
+ m_journaler->allocate_tag(m_tag_class, m_bl, &m_tag, ctx);
+}
+
+// Result handler for the tag allocation: on success continues with the
+// client registration, on failure shuts the journaler down while remembering
+// the error. Always returns nullptr.
+template<typename I>
+Context *CreateRequest<I>::handle_journal_tag(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    register_client();
+    return nullptr;
+  }
+
+  lderr(m_cct) << "failed to allocate tag: " << cpp_strerror(*result) << dendl;
+  shut_down_journaler(*result);
+  return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::register_client() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ m_bl.clear();
+ encode(ClientData{ImageClientMeta{m_tag.tag_class}}, m_bl);
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_register_client>(this);
+
+ m_journaler->register_client(m_bl, ctx);
+}
+
+template<typename I>
+Context *CreateRequest<I>::handle_register_client(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "failed to register client: " << cpp_strerror(*result) << dendl;
+ }
+
+ shut_down_journaler(*result);
+ return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::shut_down_journaler(int r) {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ m_r_saved = r;
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_journaler_shutdown>(this);
+
+ m_journaler->shut_down(ctx);
+}
+
+// Result handler for the journaler shut down. A shut down error is only
+// logged; the outcome of the request is decided by m_r_saved, the result
+// recorded when the shut down was requested. Always returns nullptr.
+template<typename I>
+Context *CreateRequest<I>::handle_journaler_shutdown(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(*result) << dendl;
+ }
+
+ delete m_journaler;
+
+ if (!m_r_saved) {
+ complete(0);
+ return nullptr;
+ }
+
+ // there was an error during journal creation, so roll back whatever
+ // was done. the easiest way to do this is to invoke the journal
+ // remove state machine. it is not the cleanest approach in terms of
+ // redundancy, but that is acceptable on the failure path.
+ remove_journal();
+ return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::remove_journal() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_journal>(this);
+
+ RemoveRequest<I> *req = RemoveRequest<I>::create(
+ m_ioctx, m_image_id, m_image_client_id, m_op_work_queue, ctx);
+ req->send();
+}
+
+template<typename I>
+Context *CreateRequest<I>::handle_remove_journal(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "error cleaning up journal after creation failed: "
+ << cpp_strerror(*result) << dendl;
+ }
+
+ complete(m_r_saved);
+ return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::complete(int r) {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ if (r == 0) {
+ ldout(m_cct, 20) << "done." << dendl;
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::CreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/CreateRequest.h b/src/librbd/journal/CreateRequest.h
new file mode 100644
index 00000000..c2b8cd9c
--- /dev/null
+++ b/src/librbd/journal/CreateRequest.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "common/Mutex.h"
+#include "librbd/ImageCtx.h"
+#include "journal/Journaler.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include "cls/journal/cls_journal_types.h"
+
+using librados::IoCtx;
+using journal::Journaler;
+
+class Context;
+class ContextWQ;
+class SafeTimer;
+
+namespace journal {
+ class Journaler;
+}
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace journal {
+
+/**
+ * Self-deleting state machine that creates an image journal.
+ *
+ * Sequence (see the .cc file): send() validates order / splay width, then
+ * get_pool_id -> create_journal -> allocate_journal_tag -> register_client
+ * -> shut_down_journaler. If any step after journaler creation failed, the
+ * saved error triggers remove_journal before completing. complete() invokes
+ * the on_finish context and deletes the request.
+ */
+template<typename ImageCtxT = ImageCtx>
+class CreateRequest {
+public:
+  // Allocates a new request; the request frees itself once it completes.
+  static CreateRequest *create(IoCtx &ioctx, const std::string &imageid,
+                               uint8_t order, uint8_t splay_width,
+                               const std::string &object_pool,
+                               uint64_t tag_class, TagData &tag_data,
+                               const std::string &client_id,
+                               ContextWQ *op_work_queue, Context *on_finish) {
+    return new CreateRequest(ioctx, imageid, order, splay_width, object_pool,
+                             tag_class, tag_data, client_id, op_work_queue,
+                             on_finish);
+  }
+
+  // Starts the state machine.
+  void send();
+
+private:
+  typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+
+  CreateRequest(IoCtx &ioctx, const std::string &imageid, uint8_t order,
+                uint8_t splay_width, const std::string &object_pool,
+                uint64_t tag_class, TagData &tag_data,
+                const std::string &client_id, ContextWQ *op_work_queue,
+                Context *on_finish);
+
+  // constructor arguments (m_ioctx is held by reference, so the caller's
+  // IoCtx must outlive the request)
+  IoCtx &m_ioctx;
+  std::string m_image_id;
+  uint8_t m_order;
+  uint8_t m_splay_width;
+  std::string m_object_pool;
+  uint64_t m_tag_class;
+  TagData m_tag_data;
+  std::string m_image_client_id;
+  ContextWQ *m_op_work_queue;
+  Context *m_on_finish;
+
+  // working state; explicitly initialized so no member is ever read
+  // indeterminate, matching m_pool_id below
+  CephContext *m_cct = nullptr;
+  cls::journal::Tag m_tag;
+  bufferlist m_bl;
+  Journaler *m_journaler = nullptr;
+  SafeTimer *m_timer = nullptr;
+  Mutex *m_timer_lock = nullptr;
+  int m_r_saved = 0;   // result recorded by shut_down_journaler()
+
+  int64_t m_pool_id = -1;
+
+  void get_pool_id();
+
+  void create_journal();
+  Context *handle_create_journal(int *result);
+
+  void allocate_journal_tag();
+  Context *handle_journal_tag(int *result);
+
+  void register_client();
+  Context *handle_register_client(int *result);
+
+  void shut_down_journaler(int r);
+  Context *handle_journaler_shutdown(int *result);
+
+  void remove_journal();
+  Context *handle_remove_journal(int *result);
+
+  void complete(int r);
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::CreateRequest<librbd::ImageCtx>;
+
+#endif /* CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H */
diff --git a/src/librbd/journal/DemoteRequest.cc b/src/librbd/journal/DemoteRequest.cc
new file mode 100644
index 00000000..66a1d19e
--- /dev/null
+++ b/src/librbd/journal/DemoteRequest.cc
@@ -0,0 +1,255 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/DemoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/OpenRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::DemoteRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+DemoteRequest<I>::DemoteRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish),
+ m_lock("DemoteRequest::m_lock") {
+}
+
+template <typename I>
+DemoteRequest<I>::~DemoteRequest() {
+ ceph_assert(m_journaler == nullptr);
+}
+
+template <typename I>
+void DemoteRequest<I>::send() {
+ open_journaler();
+}
+
+template <typename I>
+void DemoteRequest<I>::open_journaler() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ m_journaler = new Journaler(m_image_ctx.md_ctx, m_image_ctx.id,
+ Journal<>::IMAGE_CLIENT_ID, {});
+ auto ctx = create_async_context_callback(
+ m_image_ctx, create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_open_journaler>(this));
+ auto req = OpenRequest<I>::create(&m_image_ctx, m_journaler, &m_lock,
+ &m_client_meta, &m_tag_tid, &m_tag_data,
+ ctx);
+ req->send();
+}
+
+// Result handler for opening the journal. The demotion only proceeds when the
+// journal opened successfully and its current tag carries the local mirror
+// uuid; otherwise the error is recorded and the journaler is shut down.
+template <typename I>
+void DemoteRequest<I>::handle_open_journaler(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    m_ret_val = r;
+    lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl;
+    shut_down_journaler();
+    return;
+  }
+
+  if (m_tag_data.mirror_uuid != Journal<>::LOCAL_MIRROR_UUID) {
+    m_ret_val = -EINVAL;
+    lderr(cct) << "image is not currently the primary" << dendl;
+    shut_down_journaler();
+    return;
+  }
+
+  allocate_tag();
+}
+
+// Allocates a new journal tag owned by the orphan mirror uuid. Its
+// predecessor is the local mirror uuid and, when the image client has a
+// commit position, the first recorded object position.
+template <typename I>
+void DemoteRequest<I>::allocate_tag() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  cls::journal::Client image_client;
+  int r = m_journaler->get_cached_client(Journal<>::IMAGE_CLIENT_ID,
+                                         &image_client);
+  if (r < 0) {
+    m_ret_val = r;
+    lderr(cct) << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+    shut_down_journaler();
+    return;
+  }
+
+  TagData new_tag_data;
+  new_tag_data.mirror_uuid = Journal<>::ORPHAN_MIRROR_UUID;
+
+  // fill in the predecessor in place
+  TagPredecessor &pred = new_tag_data.predecessor;
+  pred.mirror_uuid = Journal<>::LOCAL_MIRROR_UUID;
+  const auto &object_positions =
+    image_client.commit_position.object_positions;
+  if (!object_positions.empty()) {
+    const auto &first_position = object_positions.front();
+    pred.commit_valid = true;
+    pred.tag_tid = first_position.tag_tid;
+    pred.entry_tid = first_position.entry_tid;
+  }
+
+  bufferlist encoded_tag;
+  encode(new_tag_data, encoded_tag);
+
+  auto on_allocated = create_context_callback<
+    DemoteRequest<I>, &DemoteRequest<I>::handle_allocate_tag>(this);
+  m_journaler->allocate_tag(m_client_meta.tag_class, encoded_tag, &m_tag,
+                            on_allocated);
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_allocate_tag(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl;
+ shut_down_journaler();
+ return;
+ }
+
+ m_tag_tid = m_tag.tid;
+ append_event();
+}
+
+template <typename I>
+void DemoteRequest<I>::append_event() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ EventEntry event_entry{DemotePromoteEvent{}, {}};
+ bufferlist event_entry_bl;
+ encode(event_entry, event_entry_bl);
+
+ m_journaler->start_append(0);
+ m_future = m_journaler->append(m_tag_tid, event_entry_bl);
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_append_event>(this);
+ m_future.flush(ctx);
+
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_append_event(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to append demotion journal event: " << cpp_strerror(r)
+ << dendl;
+ stop_append();
+ return;
+ }
+
+ commit_event();
+}
+
+template <typename I>
+void DemoteRequest<I>::commit_event() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ m_journaler->committed(m_future);
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_commit_event>(this);
+ m_journaler->flush_commit_position(ctx);
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_commit_event(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to flush demotion commit position: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ stop_append();
+}
+
+template <typename I>
+void DemoteRequest<I>::stop_append() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_stop_append>(this);
+ m_journaler->stop_append(ctx);
+}
+
+// Result handler for stopping the journal append. A failure is logged and
+// recorded only if no earlier error is stored; the journaler is shut down in
+// every case.
+template <typename I>
+void DemoteRequest<I>::handle_stop_append(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed && m_ret_val == 0) {
+    // keep the first error seen by the state machine
+    m_ret_val = r;
+  }
+  if (failed) {
+    lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl;
+  }
+
+  shut_down_journaler();
+}
+
+template <typename I>
+void DemoteRequest<I>::shut_down_journaler() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ Context *ctx = create_async_context_callback(
+ m_image_ctx, create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_shut_down_journaler>(this));
+ m_journaler->shut_down(ctx);
+}
+
+// Result handler for the journaler shut down: logs a failure, destroys the
+// journaler and finishes the request. The shut down result is passed to
+// finish(), which replaces it with m_ret_val when an earlier error was
+// recorded.
+template <typename I>
+void DemoteRequest<I>::handle_shut_down_journaler(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to shut down journal: " << cpp_strerror(r) << dendl;
+ }
+
+ delete m_journaler;
+ // the destructor asserts that the journaler pointer has been cleared
+ m_journaler = nullptr;
+ finish(r);
+}
+
+// Completes the request: an error recorded in m_ret_val takes precedence over
+// `r`. The on_finish context is invoked and the request deletes itself.
+template <typename I>
+void DemoteRequest<I>::finish(int r) {
+  const int final_result = (m_ret_val < 0) ? m_ret_val : r;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << final_result << dendl;
+
+  m_on_finish->complete(final_result);
+  delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::DemoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/DemoteRequest.h b/src/librbd/journal/DemoteRequest.h
new file mode 100644
index 00000000..5fea7f47
--- /dev/null
+++ b/src/librbd/journal/DemoteRequest.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H
+
+#include "common/Mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/Future.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class DemoteRequest {
+public:
+ static DemoteRequest *create(ImageCtxT &image_ctx, Context *on_finish) {
+ return new DemoteRequest(image_ctx, on_finish);
+ }
+
+ DemoteRequest(ImageCtxT &image_ctx, Context *on_finish);
+ ~DemoteRequest();
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_JOURNALER * * * * *
+ * | *
+ * v *
+ * ALLOCATE_TAG * * * * * *
+ * | *
+ * v *
+ * APPEND_EVENT * * * *
+ * | * *
+ * v * *
+ * COMMIT_EVENT * *
+ * | * *
+ * v * *
+ * STOP_APPEND <* * * *
+ * | *
+ * v *
+ * SHUT_DOWN_JOURNALER <* *
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+ typedef typename TypeTraits<ImageCtxT>::Future Future;
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ Journaler *m_journaler = nullptr;
+ int m_ret_val = 0;
+
+ Mutex m_lock;
+ ImageClientMeta m_client_meta;
+ uint64_t m_tag_tid = 0;
+ TagData m_tag_data;
+
+ cls::journal::Tag m_tag;
+ Future m_future;
+
+ void open_journaler();
+ void handle_open_journaler(int r);
+
+ void allocate_tag();
+ void handle_allocate_tag(int r);
+
+ void append_event();
+ void handle_append_event(int r);
+
+ void commit_event();
+ void handle_commit_event(int r);
+
+ void stop_append();
+ void handle_stop_append(int r);
+
+ void shut_down_journaler();
+ void handle_shut_down_journaler(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::DemoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H
diff --git a/src/librbd/journal/DisabledPolicy.h b/src/librbd/journal/DisabledPolicy.h
new file mode 100644
index 00000000..27d69a50
--- /dev/null
+++ b/src/librbd/journal/DisabledPolicy.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H
+#define CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H
+
+#include "librbd/journal/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+// Journal policy that reports both appending and journaling as disabled.
+class DisabledPolicy : public Policy {
+public:
+ bool append_disabled() const override {
+ return true;
+ }
+ bool journal_disabled() const override {
+ return true;
+ }
+ // Aborts the process if called: this policy does not support allocating a
+ // tag on lock acquisition.
+ void allocate_tag_on_lock(Context *on_finish) override {
+ ceph_abort();
+ }
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H
diff --git a/src/librbd/journal/ObjectDispatch.cc b/src/librbd/journal/ObjectDispatch.cc
new file mode 100644
index 00000000..737f5efe
--- /dev/null
+++ b/src/librbd/journal/ObjectDispatch.cc
@@ -0,0 +1,213 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/ObjectDispatch.h"
+#include "common/dout.h"
+#include "common/WorkQueue.h"
+#include "osdc/Striper.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::ObjectDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+namespace {
+
+// Context that, when an object IO finishes, commits the matching image
+// extents of journal event `journal_tid` and then forwards the result to the
+// wrapped `on_finish` context (if any).
+template <typename I>
+struct C_CommitIOEvent : public Context {
+  I* image_ctx;
+  Journal<I>* journal;
+  uint64_t object_no;
+  uint64_t object_off;
+  uint64_t object_len;
+  uint64_t journal_tid;
+  int object_dispatch_flags;
+  Context* on_finish;
+
+  C_CommitIOEvent(I* image_ctx, Journal<I>* journal, uint64_t object_no,
+                  uint64_t object_off, uint64_t object_len,
+                  uint64_t journal_tid, int object_dispatch_flags,
+                  Context* on_finish)
+    : image_ctx(image_ctx), journal(journal), object_no(object_no),
+      object_off(object_off), object_len(object_len),
+      journal_tid(journal_tid),
+      object_dispatch_flags(object_dispatch_flags), on_finish(on_finish) {
+  }
+
+  void finish(int r) override {
+    // don't commit the IO extent if a previous dispatch handler will just
+    // retry the failed IO
+    const bool will_retry =
+      (object_dispatch_flags &
+       io::OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR) != 0;
+    if (r >= 0 || !will_retry) {
+      // translate the object extent back into image (file) extents
+      io::Extents image_extents;
+      Striper::extent_to_file(image_ctx->cct, &image_ctx->layout, object_no,
+                              object_off, object_len, image_extents);
+      for (const auto& image_extent : image_extents) {
+        journal->commit_io_event_extent(journal_tid, image_extent.first,
+                                        image_extent.second, r);
+      }
+    }
+
+    if (on_finish != nullptr) {
+      on_finish->complete(r);
+    }
+  }
+};
+
+} // anonymous namespace
+
+template <typename I>
+ObjectDispatch<I>::ObjectDispatch(I* image_ctx, Journal<I>* journal)
+ : m_image_ctx(image_ctx), m_journal(journal) {
+}
+
+template <typename I>
+void ObjectDispatch<I>::shut_down(Context* on_finish) {
+ m_image_ctx->op_work_queue->queue(on_finish, 0);
+}
+
+// Journal-layer hook for object discards. IO without a journal tid is not
+// handled here (returns false). Otherwise the caller's on_finish context is
+// wrapped so the journal event extent is committed on completion, the
+// dispatch result is set to CONTINUE, and on_dispatched is handed to
+// wait_or_flush_event(); returns true.
+template <typename I>
+bool ObjectDispatch<I>::discard(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl;
+
+ // chain the commit handler in front of the caller's completion context
+ *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+ object_off, object_len, *journal_tid,
+ *object_dispatch_flags, *on_finish);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << data.length() << dendl;
+
+ *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+ object_off, data.length(), *journal_tid,
+ *object_dispatch_flags, *on_finish);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write_same(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, io::Extents&& buffer_extents, ceph::bufferlist&& data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << object_len << dendl;
+
+ *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+ object_off, object_len, *journal_tid,
+ *object_dispatch_flags, *on_finish);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::compare_and_write(
+ const std::string &oid, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << oid << " " << object_off << "~" << write_data.length()
+ << dendl;
+
+ *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+ object_off, write_data.length(),
+ *journal_tid, *object_dispatch_flags,
+ *on_finish);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched);
+ return true;
+}
+
+// Commits the extent of the older journal event `journal_tid` after it has
+// been overwritten. When a newer event (`new_journal_tid`) exists, the commit
+// is deferred until that event has been flushed; otherwise it happens
+// immediately.
+template <typename I>
+void ObjectDispatch<I>::extent_overwritten(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    uint64_t journal_tid, uint64_t new_journal_tid) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << object_no << " " << object_off << "~" << object_len
+                 << dendl;
+
+  // no dispatch flags apply here: pass an empty bitmask (0) rather than a
+  // bool literal for the int flags parameter
+  auto ctx = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+                                    object_off, object_len, journal_tid, 0,
+                                    nullptr);
+  if (new_journal_tid != 0) {
+    // ensure new journal event is safely committed to disk before
+    // committing old event
+    m_journal->flush_event(new_journal_tid, ctx);
+  } else {
+    ctx->complete(0);
+  }
+}
+
+// Hands `on_dispatched` to the journal for event `journal_tid`: via
+// flush_event() when the FLUSH dispatch flag is set, via wait_event()
+// otherwise.
+template <typename I>
+void ObjectDispatch<I>::wait_or_flush_event(
+    uint64_t journal_tid, int object_dispatch_flags, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "journal_tid=" << journal_tid << dendl;
+
+  const bool flush_requested =
+    (object_dispatch_flags & io::OBJECT_DISPATCH_FLAG_FLUSH) != 0;
+  if (!flush_requested) {
+    m_journal->wait_event(journal_tid, on_dispatched);
+    return;
+  }
+
+  m_journal->flush_event(journal_tid, on_dispatched);
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::ObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/journal/ObjectDispatch.h b/src/librbd/journal/ObjectDispatch.h
new file mode 100644
index 00000000..b5ae747e
--- /dev/null
+++ b/src/librbd/journal/ObjectDispatch.h
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+template <typename> class Journal;
+
+namespace journal {
+
+/**
+ * Object dispatch layer registered as io::OBJECT_DISPATCH_LAYER_JOURNAL.
+ *
+ * read(), flush(), invalidate_cache() and reset_existence_cache() are
+ * implemented inline and always return false; the write-style operations and
+ * extent_overwritten() are implemented in ObjectDispatch.cc.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class ObjectDispatch : public io::ObjectDispatchInterface {
+public:
+  /// Heap-allocates a new dispatch layer; the caller owns the result.
+  static ObjectDispatch* create(ImageCtxT* image_ctx,
+                                Journal<ImageCtxT>* journal) {
+    return new ObjectDispatch(image_ctx, journal);
+  }
+
+  ObjectDispatch(ImageCtxT* image_ctx, Journal<ImageCtxT>* journal);
+
+  io::ObjectDispatchLayer get_object_dispatch_layer() const override {
+    return io::OBJECT_DISPATCH_LAYER_JOURNAL;
+  }
+
+  void shut_down(Context* on_finish) override;
+
+  // Always returns false (presumably "not handled by this layer" -- confirm
+  // against io::ObjectDispatchInterface). Marked override like every other
+  // interface method so a signature drift is caught at compile time.
+  bool read(
+      const std::string &oid, uint64_t object_no, uint64_t object_off,
+      uint64_t object_len, librados::snap_t snap_id, int op_flags,
+      const ZTracer::Trace &parent_trace, ceph::bufferlist* read_data,
+      io::ExtentMap* extent_map, int* object_dispatch_flags,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+    return false;
+  }
+
+  bool discard(
+      const std::string &oid, uint64_t object_no, uint64_t object_off,
+      uint64_t object_len, const ::SnapContext &snapc, int discard_flags,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool write(
+      const std::string &oid, uint64_t object_no, uint64_t object_off,
+      ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool write_same(
+      const std::string &oid, uint64_t object_no, uint64_t object_off,
+      uint64_t object_len, io::Extents&& buffer_extents,
+      ceph::bufferlist&& data, const ::SnapContext &snapc, int op_flags,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool compare_and_write(
+      const std::string &oid, uint64_t object_no, uint64_t object_off,
+      ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+      const ::SnapContext &snapc, int op_flags,
+      const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+      int* object_dispatch_flags, uint64_t* journal_tid,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  // Always returns false.
+  bool flush(
+      io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+    return false;
+  }
+
+  // Always returns false.
+  bool invalidate_cache(Context* on_finish) override {
+    return false;
+  }
+  // Always returns false.
+  bool reset_existence_cache(Context* on_finish) override {
+    return false;
+  }
+
+  void extent_overwritten(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      uint64_t journal_tid, uint64_t new_journal_tid) override;
+
+private:
+  ImageCtxT* m_image_ctx;
+  Journal<ImageCtxT>* m_journal;
+
+  // Flushes or waits on the journal event depending on the dispatch flags.
+  void wait_or_flush_event(uint64_t journal_tid, int object_dispatch_flags,
+                           Context* on_dispatched);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::ObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H
diff --git a/src/librbd/journal/OpenRequest.cc b/src/librbd/journal/OpenRequest.cc
new file mode 100644
index 00000000..a5178de8
--- /dev/null
+++ b/src/librbd/journal/OpenRequest.cc
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/OpenRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::OpenRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using util::C_DecodeTags;
+
+/**
+ * Stores the supplied pointers for later use; no work is started until
+ * send() is called.
+ */
+template <typename I>
+OpenRequest<I>::OpenRequest(I *image_ctx, Journaler *journaler, Mutex *lock,
+                            journal::ImageClientMeta *client_meta,
+                            uint64_t *tag_tid, journal::TagData *tag_data,
+                            Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_journaler(journaler),
+    m_lock(lock),
+    m_client_meta(client_meta),
+    m_tag_tid(tag_tid),
+    m_tag_data(tag_data),
+    m_on_finish(on_finish) {
+}
+
+// Entry point of the request: kicks off the INIT step.
+template <typename I>
+void OpenRequest<I>::send() {
+  this->send_init();
+}
+
+// INIT step: asks the journaler to initialize; handle_init() is wired up as
+// the completion through an async context callback wrapper.
+template <typename I>
+void OpenRequest<I>::send_init() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  Context *on_init = create_context_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_init>(this);
+  on_init = create_async_context_callback(*m_image_ctx, on_init);
+  m_journaler->init(on_init);
+}
+
+/**
+ * Completion of the INIT step.
+ *
+ * On success, looks up the cached master image client, decodes its client
+ * data, extracts the ImageClientMeta, publishes it to the caller (under
+ * m_lock) and proceeds to GET_TAGS. Any failure finishes the request with a
+ * negative error code.
+ *
+ * @param r result of the journaler initialization
+ */
+template <typename I>
+void OpenRequest<I>::handle_init(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to initialize journal: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+
+  // locate the master image client record
+  cls::journal::Client client;
+  r = m_journaler->get_cached_client(Journal<ImageCtx>::IMAGE_CLIENT_ID,
+                                     &client);
+  if (r < 0) {
+    lderr(cct) << "failed to locate master image client" << dendl;
+    finish(r);
+    return;
+  }
+
+  librbd::journal::ClientData client_data;
+  auto bl = client.data.cbegin();
+  try {
+    decode(client_data, bl);
+  } catch (const buffer::error &err) {
+    lderr(cct) << "failed to decode client meta data: " << err.what()
+               << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  // the client record must carry the image-client variant of the meta data
+  journal::ImageClientMeta *image_client_meta =
+    boost::get<journal::ImageClientMeta>(&client_data.client_meta);
+  if (image_client_meta == nullptr) {
+    // NOTE: dout_prefix already emits "<this> <func>: " -- do not repeat it
+    lderr(cct) << "failed to extract client meta data" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  ldout(cct, 20) << "client: " << client << ", "
+                 << "image meta: " << *image_client_meta << dendl;
+
+  m_tag_class = image_client_meta->tag_class;
+  {
+    // the output structure is shared with the caller -- update under its lock
+    Mutex::Locker locker(*m_lock);
+    *m_client_meta = *image_client_meta;
+  }
+
+  send_get_tags();
+}
+
+// GET_TAGS step: fetches the tags of m_tag_class through a C_DecodeTags
+// context, which receives the caller's lock and tag output pointers and
+// chains to handle_get_tags().
+template <typename I>
+void OpenRequest<I>::send_get_tags() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  Context *on_tags = create_context_callback<
+    OpenRequest<I>, &OpenRequest<I>::handle_get_tags>(this);
+  on_tags = create_async_context_callback(*m_image_ctx, on_tags);
+
+  C_DecodeTags *decode_ctx = new C_DecodeTags(cct, m_lock, m_tag_tid,
+                                              m_tag_data, on_tags);
+  m_journaler->get_tags(m_tag_class, &decode_ctx->tags, decode_ctx);
+}
+
+/**
+ * Completion of the GET_TAGS step: logs a failure (if any) and finishes the
+ * request with the received result.
+ *
+ * @param r result of the tag retrieval/decoding
+ */
+template <typename I>
+void OpenRequest<I>::handle_get_tags(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    // NOTE: dout_prefix already emits "<this> <func>: " -- do not repeat it
+    lderr(cct) << "failed to decode journal tags: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  finish(r);
+}
+
+// Final step: completes the caller's context with r, then destroys this
+// request object -- no member may be touched after this returns.
+template <typename I>
+void OpenRequest<I>::finish(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::OpenRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/OpenRequest.h b/src/librbd/journal/OpenRequest.h
new file mode 100644
index 00000000..f71d8c53
--- /dev/null
+++ b/src/librbd/journal/OpenRequest.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/journal/TypeTraits.h"
+
+struct Context;
+struct Mutex;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+struct ImageClientMeta;
+struct TagData;
+
+/**
+ * Self-deleting asynchronous request that initializes a journaler and loads
+ * the image client meta data and tag state into caller-provided outputs.
+ * Create it with create(), start it with send(); on_finish receives the
+ * result.
+ */
+template <typename ImageCtxT = ImageCtx>
+class OpenRequest {
+public:
+  using Journaler = typename TypeTraits<ImageCtxT>::Journaler;
+
+  /// Heap-allocates a request; it deletes itself when it finishes.
+  static OpenRequest* create(ImageCtxT *image_ctx, Journaler *journaler,
+                             Mutex *lock, journal::ImageClientMeta *client_meta,
+                             uint64_t *tag_tid, journal::TagData *tag_data,
+                             Context *on_finish) {
+    return new OpenRequest(image_ctx, journaler, lock, client_meta, tag_tid,
+                           tag_data, on_finish);
+  }
+
+  OpenRequest(ImageCtxT *image_ctx, Journaler *journaler, Mutex *lock,
+              journal::ImageClientMeta *client_meta, uint64_t *tag_tid,
+              journal::TagData *tag_data, Context *on_finish);
+
+  /// Starts the state machine shown below.
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   *   INIT
+   *    |
+   *    v
+   * GET_TAGS
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  // inputs / caller-owned outputs
+  ImageCtxT *m_image_ctx;
+  Journaler *m_journaler;
+  Mutex *m_lock;                            // guards *m_client_meta updates
+  journal::ImageClientMeta *m_client_meta;  // out: decoded image client meta
+  uint64_t *m_tag_tid;                      // out: handed to C_DecodeTags
+  journal::TagData *m_tag_data;             // out: handed to C_DecodeTags
+  Context *m_on_finish;
+
+  // tag class read from the image client meta during INIT
+  uint64_t m_tag_class = 0;
+
+  void send_init();
+  void handle_init(int r);
+
+  void send_get_tags();
+  void handle_get_tags(int r);
+
+  void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::OpenRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H
diff --git a/src/librbd/journal/Policy.h b/src/librbd/journal/Policy.h
new file mode 100644
index 00000000..1ced3c53
--- /dev/null
+++ b/src/librbd/journal/Policy.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_POLICY_H
+#define CEPH_LIBRBD_JOURNAL_POLICY_H
+
+class Context;
+
+namespace librbd {
+
+namespace journal {
+
+/**
+ * Abstract journal policy interface; implementations supply the three
+ * pure-virtual hooks below.
+ */
+struct Policy {
+  virtual ~Policy() = default;
+
+  virtual bool append_disabled() const = 0;
+  virtual bool journal_disabled() const = 0;
+  // on_finish is the completion context for the tag allocation hook
+  virtual void allocate_tag_on_lock(Context *on_finish) = 0;
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_POLICY_H
diff --git a/src/librbd/journal/PromoteRequest.cc b/src/librbd/journal/PromoteRequest.cc
new file mode 100644
index 00000000..17f2957e
--- /dev/null
+++ b/src/librbd/journal/PromoteRequest.cc
@@ -0,0 +1,237 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/PromoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/OpenRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::PromoteRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+// Records the request parameters and names the internal lock; no work is
+// started until send() is called.
+template <typename I>
+PromoteRequest<I>::PromoteRequest(I *image_ctx, bool force, Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_force(force),
+    m_on_finish(on_finish),
+    m_lock("PromoteRequest::m_lock") {
+}
+
+// Entry point of the request: kicks off the OPEN step.
+template <typename I>
+void PromoteRequest<I>::send() {
+  this->send_open();
+}
+
+// OPEN step: allocates a journaler for the image and runs an OpenRequest
+// against it; the result is delivered to handle_open().
+template <typename I>
+void PromoteRequest<I>::send_open() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  m_journaler = new Journaler(m_image_ctx->md_ctx, m_image_ctx->id,
+                              Journal<>::IMAGE_CLIENT_ID, {});
+
+  Context *on_open = create_context_callback<
+    PromoteRequest<I>, &PromoteRequest<I>::handle_open>(this);
+  on_open = create_async_context_callback(*m_image_ctx, on_open);
+
+  auto open_req = OpenRequest<I>::create(m_image_ctx, m_journaler, &m_lock,
+                                         &m_client_meta, &m_tag_tid,
+                                         &m_tag_data, on_open);
+  open_req->send();
+}
+
+// Completion of OPEN: continue with ALLOCATE_TAG on success; on failure
+// remember the error and jump straight to SHUT_DOWN.
+template <typename I>
+void PromoteRequest<I>::handle_open(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    allocate_tag();
+    return;
+  }
+
+  m_ret_val = r;
+  lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl;
+  shut_down();
+}
+
+/**
+ * ALLOCATE_TAG step: encodes a TagData owned by the local mirror uuid whose
+ * predecessor depends on whether this is an orderly or a forced promotion,
+ * then asks the journaler to allocate the tag.
+ */
+template <typename I>
+void PromoteRequest<I>::allocate_tag() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  const bool orderly_promotion =
+    (!m_force && m_tag_data.mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID);
+
+  TagData tag_data;
+  tag_data.mirror_uuid = Journal<>::LOCAL_MIRROR_UUID;
+  if (orderly_promotion) {
+    // orderly promotion -- demotion epoch will have a single entry
+    // so link to our predecessor (demotion) epoch
+    tag_data.predecessor = TagPredecessor{Journal<>::ORPHAN_MIRROR_UUID, true,
+                                          m_tag_tid, 1};
+  } else {
+    // forced promotion -- create an epoch no peers can link against
+    tag_data.predecessor = TagPredecessor{Journal<>::LOCAL_MIRROR_UUID, true,
+                                          m_tag_tid, 0};
+  }
+
+  bufferlist encoded_tag;
+  encode(tag_data, encoded_tag);
+
+  Context *on_allocated = create_context_callback<
+    PromoteRequest<I>, &PromoteRequest<I>::handle_allocate_tag>(this);
+  m_journaler->allocate_tag(m_client_meta.tag_class, encoded_tag, &m_tag,
+                            on_allocated);
+}
+
+// Completion of ALLOCATE_TAG: adopt the new tag tid and append the promotion
+// event, or record the error and shut the journaler down.
+template <typename I>
+void PromoteRequest<I>::handle_allocate_tag(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    m_tag_tid = m_tag.tid;
+    append_event();
+    return;
+  }
+
+  m_ret_val = r;
+  lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl;
+  shut_down();
+}
+
+// APPEND_EVENT step: encodes a DemotePromoteEvent entry, appends it under
+// the current tag tid and flushes the resulting future.
+template <typename I>
+void PromoteRequest<I>::append_event() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  EventEntry promote_entry{DemotePromoteEvent{}, {}};
+  bufferlist encoded_entry;
+  encode(promote_entry, encoded_entry);
+
+  m_journaler->start_append(0);
+  m_future = m_journaler->append(m_tag_tid, encoded_entry);
+
+  m_future.flush(create_context_callback<
+    PromoteRequest<I>, &PromoteRequest<I>::handle_append_event>(this));
+}
+
+// Completion of APPEND_EVENT: commit the event on success; on failure record
+// the error and skip ahead to STOP_APPEND.
+template <typename I>
+void PromoteRequest<I>::handle_append_event(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    commit_event();
+    return;
+  }
+
+  m_ret_val = r;
+  lderr(cct) << "failed to append promotion journal event: "
+             << cpp_strerror(r) << dendl;
+  stop_append();
+}
+
+// COMMIT_EVENT step: marks the appended future as committed and flushes the
+// journal commit position.
+template <typename I>
+void PromoteRequest<I>::commit_event() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  m_journaler->committed(m_future);
+
+  Context *on_flushed = create_context_callback<
+    PromoteRequest<I>, &PromoteRequest<I>::handle_commit_event>(this);
+  m_journaler->flush_commit_position(on_flushed);
+}
+
+// Completion of COMMIT_EVENT: a failure is recorded and logged, and the
+// state machine always moves on to STOP_APPEND.
+template <typename I>
+void PromoteRequest<I>::handle_commit_event(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    m_ret_val = r;
+    lderr(cct) << "failed to flush promote commit position: "
+               << cpp_strerror(r) << dendl;
+  }
+
+  stop_append();
+}
+
+// STOP_APPEND step: asks the journaler to stop appending.
+template <typename I>
+void PromoteRequest<I>::stop_append() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  Context *on_stopped = create_context_callback<
+    PromoteRequest<I>, &PromoteRequest<I>::handle_stop_append>(this);
+  m_journaler->stop_append(on_stopped);
+}
+
+// Completion of STOP_APPEND: a failure is logged and kept as the request
+// result only if no earlier error was recorded; then SHUT_DOWN follows.
+template <typename I>
+void PromoteRequest<I>::handle_stop_append(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    // keep the first error encountered by the state machine
+    m_ret_val = (m_ret_val == 0 ? r : m_ret_val);
+    lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl;
+  }
+
+  shut_down();
+}
+
+// SHUT_DOWN step: shuts the journaler down; handle_shut_down() is wired up
+// through an async context callback wrapper.
+template <typename I>
+void PromoteRequest<I>::shut_down() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  Context *on_shut_down = create_context_callback<
+    PromoteRequest<I>, &PromoteRequest<I>::handle_shut_down>(this);
+  on_shut_down = create_async_context_callback(*m_image_ctx, on_shut_down);
+  m_journaler->shut_down(on_shut_down);
+}
+
+// Completion of SHUT_DOWN: logs a failure, frees the journaler allocated in
+// send_open() and finishes the request.
+template <typename I>
+void PromoteRequest<I>::handle_shut_down(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "failed to shut down journal: " << cpp_strerror(r) << dendl;
+  }
+
+  delete m_journaler;
+  finish(r);
+}
+
+template <typename I>
+void PromoteRequest<I>::finish(int r) {
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::PromoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/PromoteRequest.h b/src/librbd/journal/PromoteRequest.h
new file mode 100644
index 00000000..0d01f596
--- /dev/null
+++ b/src/librbd/journal/PromoteRequest.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H
+
+#include "include/int_types.h"
+#include "common/Mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/Future.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+/**
+ * Self-deleting asynchronous request that promotes an image's journal by
+ * allocating a new tag and appending a DemotePromoteEvent entry.
+ * Create it with create(), start it with send(); on_finish receives the
+ * result.
+ */
+template <typename ImageCtxT = ImageCtx>
+class PromoteRequest {
+public:
+  /// Heap-allocates a request; it deletes itself when it finishes.
+  static PromoteRequest* create(ImageCtxT *image_ctx, bool force,
+                                Context *on_finish) {
+    return new PromoteRequest(image_ctx, force, on_finish);
+  }
+
+  PromoteRequest(ImageCtxT *image_ctx, bool force, Context *on_finish);
+
+  /// Starts the state machine shown below.
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * OPEN * * * * * * * * * *
+   *    |                   *
+   *    v                   *
+   * ALLOCATE_TAG * * * * * *
+   *    |                   *
+   *    v                   *
+   * APPEND_EVENT * * *     *
+   *    |             *     *
+   *    v             *     *
+   * COMMIT_EVENT     *     *
+   *    |             *     *
+   *    v             *     *
+   * STOP_APPEND <* * *     *
+   *    |                   *
+   *    v                   *
+   * SHUT_DOWN <* * * * * * *
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  using Journaler = typename TypeTraits<ImageCtxT>::Journaler;
+  using Future = typename TypeTraits<ImageCtxT>::Future;
+
+  // request parameters
+  ImageCtxT *m_image_ctx;
+  bool m_force;
+  Context *m_on_finish;
+
+  // journaler allocated in send_open() and freed in handle_shut_down()
+  Journaler *m_journaler = nullptr;
+  // first error recorded by the state machine (0 if none)
+  int m_ret_val = 0;
+
+  // outputs filled in by the OpenRequest (m_lock is handed to it as well)
+  Mutex m_lock;
+  ImageClientMeta m_client_meta;
+  uint64_t m_tag_tid = 0;
+  TagData m_tag_data;
+
+  // newly allocated tag and the future of the appended promotion event
+  cls::journal::Tag m_tag;
+  Future m_future;
+
+  void send_open();
+  void handle_open(int r);
+
+  void allocate_tag();
+  void handle_allocate_tag(int r);
+
+  void append_event();
+  void handle_append_event(int r);
+
+  void commit_event();
+  void handle_commit_event(int r);
+
+  void stop_append();
+  void handle_stop_append(int r);
+
+  void shut_down();
+  void handle_shut_down(int r);
+
+  void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::PromoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H
diff --git a/src/librbd/journal/RemoveRequest.cc b/src/librbd/journal/RemoveRequest.cc
new file mode 100644
index 00000000..6ed5f54c
--- /dev/null
+++ b/src/librbd/journal/RemoveRequest.cc
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "journal/Settings.h"
+#include "include/ceph_assert.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Journal::RemoveRequest: "
+
+namespace librbd {
+
+using util::create_context_callback;
+
+namespace journal {
+
+// Records the request parameters and derives the CephContext from the
+// supplied IoCtx; no work is started until send() is called.
+template<typename I>
+RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, const std::string &image_id,
+                                const std::string &client_id,
+                                ContextWQ *op_work_queue,
+                                Context *on_finish)
+  : m_ioctx(ioctx),
+    m_image_id(image_id),
+    m_image_client_id(client_id),
+    m_op_work_queue(op_work_queue),
+    m_on_finish(on_finish),
+    m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())) {
+}
+
+// Entry point of the request: starts by checking that the journal exists.
+template<typename I>
+void RemoveRequest<I>::send() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  this->stat_journal();
+}
+
+// Creates the journaler (using the timer instance obtained from ImageCtx)
+// and asks it whether the journal exists.
+template<typename I>
+void RemoveRequest<I>::stat_journal() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  ImageCtx::get_timer_instance(m_cct, &m_timer, &m_timer_lock);
+  m_journaler = new Journaler(m_op_work_queue, m_timer, m_timer_lock,
+                              m_ioctx, m_image_id, m_image_client_id, {});
+
+  Context *on_stat = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_stat_journal>(this);
+  m_journaler->exists(on_stat);
+}
+
+/**
+ * Completion of the existence check.
+ *
+ * -ENOENT shuts the journaler down with a success result, any other error
+ * shuts it down with that error, and success proceeds to initialize the
+ * journaler. Always returns nullptr.
+ */
+template<typename I>
+Context *RemoveRequest<I>::handle_stat_journal(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  if (*result == -ENOENT) {
+    shut_down_journaler(0);
+    return nullptr;
+  }
+
+  if (*result < 0) {
+    lderr(m_cct) << "failed to stat journal header: " << cpp_strerror(*result) << dendl;
+    shut_down_journaler(*result);
+    return nullptr;
+  }
+
+  init_journaler();
+  return nullptr;
+}
+
+// Initializes the journaler; the result goes to handle_init_journaler().
+template<typename I>
+void RemoveRequest<I>::init_journaler() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  Context *on_init = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_init_journaler>(this);
+  m_journaler->init(on_init);
+}
+
+template<typename I>
+Context *RemoveRequest<I>::handle_init_journaler(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if ((*result < 0) && (*result != -ENOENT)) {
+ lderr(m_cct) << "failed to init journaler: " << cpp_strerror(*result) << dendl;
+ shut_down_journaler(*result);
+ return nullptr;
+ }
+
+ remove_journal();
+ return nullptr;
+}
+
+// Issues the journal removal (first argument `true` is passed through to
+// Journaler::remove); the result goes to handle_remove_journal().
+template<typename I>
+void RemoveRequest<I>::remove_journal() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  Context *on_removed = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_remove_journal>(this);
+  m_journaler->remove(true, on_removed);
+}
+
+// Completion of the removal: logs a failure and shuts the journaler down,
+// carrying the removal result forward. Always returns nullptr.
+template<typename I>
+Context *RemoveRequest<I>::handle_remove_journal(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  const bool failed = (*result < 0);
+  if (failed) {
+    lderr(m_cct) << "failed to remove journal: " << cpp_strerror(*result) << dendl;
+  }
+
+  shut_down_journaler(*result);
+  return nullptr;
+}
+
+// Saves r as the final request result and shuts the journaler down; the
+// shutdown result goes to handle_journaler_shutdown().
+template<typename I>
+void RemoveRequest<I>::shut_down_journaler(int r) {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  m_r_saved = r;
+
+  Context *on_shut_down = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_journaler_shutdown>(this);
+  m_journaler->shut_down(on_shut_down);
+}
+
+/**
+ * Completion of the journaler shutdown and final step of the request.
+ *
+ * A shutdown failure is only logged; the caller's context is completed with
+ * the result saved by shut_down_journaler(). Frees the journaler and then
+ * destroys this request object. Always returns nullptr.
+ */
+template<typename I>
+Context *RemoveRequest<I>::handle_journaler_shutdown(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  const bool shutdown_failed = (*result < 0);
+  if (shutdown_failed) {
+    lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(*result) << dendl;
+  }
+
+  delete m_journaler;
+
+  const int final_result = m_r_saved;
+  if (final_result == 0) {
+    ldout(m_cct, 20) << "done." << dendl;
+  }
+
+  m_on_finish->complete(final_result);
+  delete this;
+
+  return nullptr;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::RemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/RemoveRequest.h b/src/librbd/journal/RemoveRequest.h
new file mode 100644
index 00000000..594e62d5
--- /dev/null
+++ b/src/librbd/journal/RemoveRequest.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "common/Mutex.h"
+#include "librbd/ImageCtx.h"
+#include "journal/Journaler.h"
+#include "librbd/journal/TypeTraits.h"
+
+using librados::IoCtx;
+using journal::Journaler;
+
+class Context;
+class ContextWQ;
+class SafeTimer;
+
+namespace journal {
+ class Journaler;
+}
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace journal {
+
+/**
+ * Self-deleting asynchronous request that removes an image's journal.
+ * Create it with create() and start it with send(); on_finish receives the
+ * result.
+ *
+ * NOTE: the IoCtx is held by reference, so the caller's IoCtx must outlive
+ * the request.
+ */
+template<typename ImageCtxT = ImageCtx>
+class RemoveRequest {
+public:
+  /// Heap-allocates a request; it deletes itself when it finishes.
+  static RemoveRequest *create(IoCtx &ioctx, const std::string &image_id,
+                               const std::string &client_id,
+                               ContextWQ *op_work_queue, Context *on_finish) {
+    return new RemoveRequest(ioctx, image_id, client_id,
+                             op_work_queue, on_finish);
+  }
+
+  /// Starts the request (stat -> init -> remove -> shut down).
+  void send();
+
+private:
+  typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+
+  RemoveRequest(IoCtx &ioctx, const std::string &image_id,
+                const std::string &client_id,
+                ContextWQ *op_work_queue, Context *on_finish);
+
+  // request parameters
+  IoCtx &m_ioctx;
+  std::string m_image_id;
+  std::string m_image_client_id;
+  ContextWQ *m_op_work_queue;
+  Context *m_on_finish;
+
+  // Working state. Everything is given a defined default so that no member
+  // is ever read uninitialized, matching the other journal request classes.
+  CephContext *m_cct = nullptr;      // set by the constructor from m_ioctx
+  Journaler *m_journaler = nullptr;  // allocated in stat_journal()
+  SafeTimer *m_timer = nullptr;      // filled in by stat_journal()
+  Mutex *m_timer_lock = nullptr;     // filled in by stat_journal()
+  int m_r_saved = 0;                 // final result, set on shutdown
+
+  void stat_journal();
+  Context *handle_stat_journal(int *result);
+
+  void init_journaler();
+  Context *handle_init_journaler(int *result);
+
+  void remove_journal();
+  Context *handle_remove_journal(int *result);
+
+  void shut_down_journaler(int r);
+  Context *handle_journaler_shutdown(int *result);
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H
diff --git a/src/librbd/journal/Replay.cc b/src/librbd/journal/Replay.cc
new file mode 100644
index 00000000..f96153e7
--- /dev/null
+++ b/src/librbd/journal/Replay.cc
@@ -0,0 +1,1190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/Replay.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " "
+
+namespace librbd {
+namespace journal {
+
+namespace {
+
+// number of unsafe (not yet flushed) AIO modify contexts at which a flush is
+// scheduled when the writeback cache is enabled
+constexpr uint64_t IN_FLIGHT_IO_LOW_WATER_MARK = 32;
+// number of in-flight AIO modify ops at which replay is paused
+constexpr uint64_t IN_FLIGHT_IO_HIGH_WATER_MARK = 64;
+
+// shared progress callback handed to replayed ops that require one
+static NoOpProgressContext no_op_progress_callback;
+
+// Context that applies a journaled op event of type E to the image.  When
+// completed with r >= 0 (and the exclusive lock still accepts ops) it invokes
+// the matching image_ctx.operations->execute_*() call; otherwise the error is
+// passed to on_op_complete.
+template <typename I, typename E>
+struct ExecuteOp : public Context {
+  I &image_ctx;
+  E event;
+  Context *on_op_complete;
+
+  ExecuteOp(I &image_ctx, const E &event, Context *on_op_complete)
+    : image_ctx(image_ctx), event(event), on_op_complete(on_op_complete) {
+  }
+
+  // One overload per supported event type.  The argument is only used to
+  // select the overload -- all values are read from the stored `event` copy.
+
+  void execute(const journal::SnapCreateEvent &) {
+    image_ctx.operations->execute_snap_create(
+      event.snap_namespace, event.snap_name, on_op_complete, event.op_tid,
+      false);
+  }
+
+  void execute(const journal::SnapRemoveEvent &) {
+    image_ctx.operations->execute_snap_remove(
+      event.snap_namespace, event.snap_name, on_op_complete);
+  }
+
+  void execute(const journal::SnapRenameEvent &) {
+    image_ctx.operations->execute_snap_rename(
+      event.snap_id, event.dst_snap_name, on_op_complete);
+  }
+
+  void execute(const journal::SnapProtectEvent &) {
+    image_ctx.operations->execute_snap_protect(
+      event.snap_namespace, event.snap_name, on_op_complete);
+  }
+
+  void execute(const journal::SnapUnprotectEvent &) {
+    image_ctx.operations->execute_snap_unprotect(
+      event.snap_namespace, event.snap_name, on_op_complete);
+  }
+
+  void execute(const journal::SnapRollbackEvent &) {
+    image_ctx.operations->execute_snap_rollback(
+      event.snap_namespace, event.snap_name, no_op_progress_callback,
+      on_op_complete);
+  }
+
+  void execute(const journal::RenameEvent &) {
+    image_ctx.operations->execute_rename(event.image_name, on_op_complete);
+  }
+
+  void execute(const journal::ResizeEvent &) {
+    image_ctx.operations->execute_resize(
+      event.size, true, no_op_progress_callback, on_op_complete,
+      event.op_tid);
+  }
+
+  void execute(const journal::FlattenEvent &) {
+    image_ctx.operations->execute_flatten(
+      no_op_progress_callback, on_op_complete);
+  }
+
+  void execute(const journal::SnapLimitEvent &) {
+    image_ctx.operations->execute_snap_set_limit(event.limit, on_op_complete);
+  }
+
+  void execute(const journal::UpdateFeaturesEvent &) {
+    image_ctx.operations->execute_update_features(
+      event.features, event.enabled, on_op_complete, event.op_tid);
+  }
+
+  void execute(const journal::MetadataSetEvent &) {
+    image_ctx.operations->execute_metadata_set(
+      event.key, event.value, on_op_complete);
+  }
+
+  void execute(const journal::MetadataRemoveEvent &) {
+    image_ctx.operations->execute_metadata_remove(event.key, on_op_complete);
+  }
+
+  void finish(int r) override {
+    CephContext *cct = image_ctx.cct;
+    if (r < 0) {
+      // a previous step failed: propagate the error without executing the op
+      lderr(cct) << ": ExecuteOp::" << __func__ << ": r=" << r << dendl;
+      on_op_complete->complete(r);
+      return;
+    }
+
+    ldout(cct, 20) << ": ExecuteOp::" << __func__ << dendl;
+    RWLock::RLocker owner_locker(image_ctx.owner_lock);
+
+    const bool ops_accepted = (image_ctx.exclusive_lock != nullptr &&
+                               image_ctx.exclusive_lock->accept_ops());
+    if (ops_accepted) {
+      execute(event);
+    } else {
+      ldout(cct, 5) << ": lost exclusive lock -- skipping op" << dendl;
+      on_op_complete->complete(-ECANCELED);
+    }
+  }
+};
+
+// Context that refreshes the image (when the image state reports that a
+// refresh is required) before handing control to on_finish.  on_finish is
+// owned until finish() runs; if this context is destroyed without being
+// completed, the destructor deletes it.
+template <typename I>
+struct C_RefreshIfRequired : public Context {
+  I &image_ctx;
+  Context *on_finish;
+
+  C_RefreshIfRequired(I &image_ctx, Context *on_finish)
+    : image_ctx(image_ctx), on_finish(on_finish) {
+  }
+  ~C_RefreshIfRequired() override {
+    delete on_finish;
+  }
+
+  void finish(int r) override {
+    CephContext *cct = image_ctx.cct;
+
+    // detach the wrapped context so the destructor will not delete it
+    Context *next_ctx = nullptr;
+    std::swap(next_ctx, on_finish);
+
+    if (r < 0) {
+      lderr(cct) << ": C_RefreshIfRequired::" << __func__ << ": r=" << r << dendl;
+      image_ctx.op_work_queue->queue(next_ctx, r);
+    } else if (image_ctx.state->is_refresh_required()) {
+      ldout(cct, 20) << ": C_RefreshIfRequired::" << __func__ << ": "
+                     << "refresh required" << dendl;
+      image_ctx.state->refresh(next_ctx);
+    } else {
+      image_ctx.op_work_queue->queue(next_ctx, 0);
+    }
+  }
+};
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " " \
+ << __func__
+
+// Stores image_ctx in m_image_ctx and constructs m_lock with the name
+// "Replay<I>::m_lock".
+template <typename I>
+Replay<I>::Replay(I &image_ctx)
+  : m_image_ctx(image_ctx),
+    m_lock("Replay<I>::m_lock") {
+}
+
+// Asserts that nothing is outstanding at destruction time: no in-flight
+// flush/modify IO, no pending safe/unsafe modify contexts and no op events.
+template <typename I>
+Replay<I>::~Replay() {
+  // AIO bookkeeping must be drained
+  ceph_assert(m_in_flight_aio_flush == 0);
+  ceph_assert(m_in_flight_aio_modify == 0);
+  ceph_assert(m_aio_modify_unsafe_contexts.empty());
+  ceph_assert(m_aio_modify_safe_contexts.empty());
+
+  // op event bookkeeping must be drained
+  ceph_assert(m_op_events.empty());
+  ceph_assert(m_in_flight_op_events == 0);
+}
+
+// Decodes a single event entry from *it into *event_entry.
+// Returns 0 on success or -EBADMSG if decoding throws buffer::error.
+template <typename I>
+int Replay<I>::decode(bufferlist::const_iterator *it, EventEntry *event_entry) {
+  int r = 0;
+  try {
+    using ceph::decode;
+    decode(*event_entry, *it);
+  } catch (const buffer::error &) {
+    r = -EBADMSG;
+  }
+  return r;
+}
+
+// Dispatches a decoded event entry to the matching handle_event() overload.
+// on_ready is wrapped in an async callback before use.  If the exclusive lock
+// is missing or no longer accepts ops, the event is skipped: on_safe is queued
+// with -ECANCELED and on_ready is completed with 0.
+template <typename I>
+void Replay<I>::process(const EventEntry &event_entry,
+                        Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": on_ready=" << on_ready << ", on_safe=" << on_safe
+                 << dendl;
+
+  on_ready = util::create_async_context_callback(m_image_ctx, on_ready);
+
+  RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+  const bool can_replay = (m_image_ctx.exclusive_lock != nullptr &&
+                           m_image_ctx.exclusive_lock->accept_ops());
+  if (can_replay) {
+    boost::apply_visitor(EventVisitor(this, on_ready, on_safe),
+                         event_entry.event);
+    return;
+  }
+
+  ldout(cct, 5) << ": lost exclusive lock -- skipping event" << dendl;
+  m_image_ctx.op_work_queue->queue(on_safe, -ECANCELED);
+  on_ready->complete(0);
+}
+
+// Shuts the replayer down.  Marks it shut down (m_shut_down), issues a final
+// flush if any AIO flush/modify is still in flight and resolves every pending
+// op event:
+// * cancel_ops == true: ops still waiting for their OpFinishEvent are queued
+// with -ERESTART
+// * cancel_ops == false: ops waiting for their OpFinishEvent are started
+// (queued with 0) and ops waiting to become ready are flagged
+// finish_on_ready
+// on_finish (wrapped in an async callback) is completed immediately when
+// nothing is outstanding; otherwise it is stored in m_flush_ctx and completed
+// by handle_aio_flush_complete() / handle_op_complete() once the last
+// in-flight op event and AIO have drained.
+template <typename I>
+void Replay<I>::shut_down(bool cancel_ops, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ io::AioCompletion *flush_comp = nullptr;
+ on_finish = util::create_async_context_callback(
+ m_image_ctx, on_finish);
+
+ {
+ Mutex::Locker locker(m_lock);
+
+ // safely commit any remaining AIO modify operations
+ // (must happen before m_shut_down is set below, since
+ // create_aio_flush_completion() returns nullptr once shut down)
+ if ((m_in_flight_aio_flush + m_in_flight_aio_modify) != 0) {
+ flush_comp = create_aio_flush_completion(nullptr);
+ ceph_assert(flush_comp != nullptr);
+ }
+
+ for (auto &op_event_pair : m_op_events) {
+ OpEvent &op_event = op_event_pair.second;
+ if (cancel_ops) {
+ // cancel ops that are waiting to start (waiting for
+ // OpFinishEvent or waiting for ready)
+ if (op_event.on_start_ready == nullptr &&
+ op_event.on_op_finish_event != nullptr) {
+ Context *on_op_finish_event = nullptr;
+ std::swap(on_op_finish_event, op_event.on_op_finish_event);
+ m_image_ctx.op_work_queue->queue(on_op_finish_event, -ERESTART);
+ }
+ } else if (op_event.on_op_finish_event != nullptr) {
+ // start ops waiting for OpFinishEvent
+ Context *on_op_finish_event = nullptr;
+ std::swap(on_op_finish_event, op_event.on_op_finish_event);
+ m_image_ctx.op_work_queue->queue(on_op_finish_event, 0);
+ } else if (op_event.on_start_ready != nullptr) {
+ // waiting for op ready
+ op_event_pair.second.finish_on_ready = true;
+ }
+ }
+
+ ceph_assert(!m_shut_down);
+ m_shut_down = true;
+
+ ceph_assert(m_flush_ctx == nullptr);
+ // defer on_finish while anything is outstanding; the swap leaves the
+ // local on_finish null so it is not completed below
+ if (m_in_flight_op_events > 0 || flush_comp != nullptr) {
+ std::swap(m_flush_ctx, on_finish);
+ }
+ }
+
+ // execute the following outside of lock scope
+ if (flush_comp != nullptr) {
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ }
+ if (on_finish != nullptr) {
+ on_finish->complete(0);
+ }
+}
+
+// Issues an internal flush whose completion fires on_finish (wrapped in an
+// async callback).  If the replayer has already been shut down no flush is
+// sent; create_aio_flush_completion() disposes of the callback in that case.
+template <typename I>
+void Replay<I>::flush(Context *on_finish) {
+  io::AioCompletion *flush_comp = nullptr;
+  {
+    Mutex::Locker locker(m_lock);
+    Context *async_on_finish = util::create_async_context_callback(
+      m_image_ctx, on_finish);
+    flush_comp = create_aio_flush_completion(async_on_finish);
+  }
+
+  if (flush_comp == nullptr) {
+    return;
+  }
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+                                 io::FLUSH_SOURCE_INTERNAL, {});
+}
+
+// Notification that the in-progress op identified by op_tid has reached its
+// ready point.  Completes the op's on_start_ready context so that replay of
+// further events can resume, then decides what happens to on_resume:
+// * a shut down is pending (m_flush_ctx set) and the op was not flagged
+// finish_on_ready: on_resume is queued with -ERESTART
+// * otherwise on_resume is stored (wrapped in a FunctionContext) as the op's
+// on_op_finish_event
+// * if the op was flagged finish_on_ready, on_resume is additionally queued
+// with 0 since no OpFinishEvent is expected
+// The op must exist and be in progress (asserted).
+template <typename I>
+void Replay<I>::replay_op_ready(uint64_t op_tid, Context *on_resume) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": op_tid=" << op_tid << dendl;
+
+ Mutex::Locker locker(m_lock);
+ auto op_it = m_op_events.find(op_tid);
+ ceph_assert(op_it != m_op_events.end());
+
+ OpEvent &op_event = op_it->second;
+ ceph_assert(op_event.op_in_progress &&
+ op_event.on_op_finish_event == nullptr &&
+ op_event.on_finish_ready == nullptr &&
+ op_event.on_finish_safe == nullptr);
+
+ // resume processing replay events
+ Context *on_start_ready = nullptr;
+ std::swap(on_start_ready, op_event.on_start_ready);
+ on_start_ready->complete(0);
+
+ // cancel has been requested -- send error to paused state machine
+ if (!op_event.finish_on_ready && m_flush_ctx != nullptr) {
+ m_image_ctx.op_work_queue->queue(on_resume, -ERESTART);
+ return;
+ }
+
+ // resume the op state machine once the associated OpFinishEvent
+ // is processed
+ op_event.on_op_finish_event = new FunctionContext(
+ [on_resume](int r) {
+ on_resume->complete(r);
+ });
+
+ // shut down request -- don't expect OpFinishEvent
+ if (op_event.finish_on_ready) {
+ m_image_ctx.op_work_queue->queue(on_resume, 0);
+ }
+}
+
+// Replays a journaled discard.  Nothing is issued if the replayer has been
+// shut down or the offset lies beyond the current image size; a flush is
+// issued afterwards when the modify completion reports that one is required.
+template <typename I>
+void Replay<I>::handle_event(const journal::AioDiscardEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": AIO discard event" << dendl;
+
+  bool flush_required;
+  auto aio_comp = create_aio_modify_completion(
+    on_ready, on_safe, io::AIO_TYPE_DISCARD, &flush_required, {});
+  if (aio_comp == nullptr) {
+    // shut down: the callbacks were already handled
+    return;
+  }
+
+  if (!clipped_io(event.offset, aio_comp)) {
+    io::ImageRequest<I>::aio_discard(&m_image_ctx, aio_comp,
+                                     {{event.offset, event.length}},
+                                     event.discard_granularity_bytes, {});
+  }
+
+  if (!flush_required) {
+    return;
+  }
+
+  io::AioCompletion *flush_comp;
+  {
+    Mutex::Locker locker(m_lock);
+    flush_comp = create_aio_flush_completion(nullptr);
+  }
+  if (flush_comp != nullptr) {
+    io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+                                   io::FLUSH_SOURCE_INTERNAL, {});
+  }
+}
+
+// Replays a journaled write.  Nothing is issued if the replayer has been shut
+// down or the offset lies beyond the current image size; a flush is issued
+// afterwards when the modify completion reports that one is required.
+template <typename I>
+void Replay<I>::handle_event(const journal::AioWriteEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": AIO write event" << dendl;
+
+  // private copy of the payload that can be moved into the request
+  bufferlist data = event.data;
+
+  bool flush_required;
+  auto aio_comp = create_aio_modify_completion(
+    on_ready, on_safe, io::AIO_TYPE_WRITE, &flush_required, {});
+  if (aio_comp == nullptr) {
+    // shut down: the callbacks were already handled
+    return;
+  }
+
+  if (!clipped_io(event.offset, aio_comp)) {
+    io::ImageRequest<I>::aio_write(&m_image_ctx, aio_comp,
+                                   {{event.offset, event.length}},
+                                   std::move(data), 0, {});
+  }
+
+  if (!flush_required) {
+    return;
+  }
+
+  io::AioCompletion *flush_comp;
+  {
+    Mutex::Locker locker(m_lock);
+    flush_comp = create_aio_flush_completion(nullptr);
+  }
+  if (flush_comp != nullptr) {
+    io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+                                   io::FLUSH_SOURCE_INTERNAL, {});
+  }
+}
+
+// Replays a journaled flush: on_safe is attached to a new flush completion
+// (unless the replayer has been shut down) and on_ready is completed right
+// away.
+template <typename I>
+void Replay<I>::handle_event(const journal::AioFlushEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": AIO flush event" << dendl;
+
+  io::AioCompletion *flush_comp = nullptr;
+  {
+    Mutex::Locker locker(m_lock);
+    flush_comp = create_aio_flush_completion(on_safe);
+  }
+
+  if (flush_comp != nullptr) {
+    io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+                                   io::FLUSH_SOURCE_INTERNAL, {});
+  }
+
+  on_ready->complete(0);
+}
+
+// Replays a journaled writesame.  Nothing is issued if the replayer has been
+// shut down or the offset lies beyond the current image size; a flush is
+// issued afterwards when the modify completion reports that one is required.
+template <typename I>
+void Replay<I>::handle_event(const journal::AioWriteSameEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": AIO writesame event" << dendl;
+
+  // private copy of the payload that can be moved into the request
+  bufferlist data = event.data;
+
+  bool flush_required;
+  auto aio_comp = create_aio_modify_completion(
+    on_ready, on_safe, io::AIO_TYPE_WRITESAME, &flush_required, {});
+  if (aio_comp == nullptr) {
+    // shut down: the callbacks were already handled
+    return;
+  }
+
+  if (!clipped_io(event.offset, aio_comp)) {
+    io::ImageRequest<I>::aio_writesame(&m_image_ctx, aio_comp,
+                                       {{event.offset, event.length}},
+                                       std::move(data), 0, {});
+  }
+
+  if (!flush_required) {
+    return;
+  }
+
+  io::AioCompletion *flush_comp;
+  {
+    Mutex::Locker locker(m_lock);
+    flush_comp = create_aio_flush_completion(nullptr);
+  }
+  if (flush_comp != nullptr) {
+    io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+                                   io::FLUSH_SOURCE_INTERNAL, {});
+  }
+}
+
+// Replays a journaled compare-and-write.  -EILSEQ results are filtered (treated
+// as success) by the modify completion.  Nothing is issued if the replayer has
+// been shut down or the offset lies beyond the current image size; a flush is
+// issued afterwards when the modify completion reports that one is required.
+template <typename I>
+void Replay<I>::handle_event(const journal::AioCompareAndWriteEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": AIO CompareAndWrite event" << dendl;
+
+  bufferlist cmp_data = event.cmp_data;
+  bufferlist write_data = event.write_data;
+  bool flush_required = false;
+  auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
+                                               io::AIO_TYPE_COMPARE_AND_WRITE,
+                                               &flush_required,
+                                               {-EILSEQ});
+  if (aio_comp == nullptr) {
+    // replayer was shut down: the callbacks have already been handled and
+    // there is no completion to issue the request against
+    return;
+  }
+
+  if (!clipped_io(event.offset, aio_comp)) {
+    io::ImageRequest<I>::aio_compare_and_write(&m_image_ctx, aio_comp,
+                                               {{event.offset, event.length}},
+                                               std::move(cmp_data),
+                                               std::move(write_data),
+                                               nullptr, 0, {});
+  }
+
+  if (flush_required) {
+    m_lock.Lock();
+    auto flush_comp = create_aio_flush_completion(nullptr);
+    m_lock.Unlock();
+
+    // a shut down may have raced in since the modify completion was created,
+    // in which case no flush completion is handed out
+    if (flush_comp != nullptr) {
+      io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+                                     io::FLUSH_SOURCE_INTERNAL, {});
+    }
+  }
+}
+
+// Handles the journal record that closes a previously started op.
+// If no op with event.op_tid is tracked, the op is assumed to be already
+// committed and both callbacks are completed with 0.  Otherwise on_ready and
+// on_safe are recorded on the op event and:
+// * event.r < 0 and the op is in progress: the error is delivered to the op
+// through its on_op_finish_event context
+// * event.r < 0 and the op never started: the pending contexts are deleted
+// and the op is completed directly with the error (or 0 if the error is
+// listed in op_finish_error_codes)
+// * event.r >= 0: on_op_finish_event is completed with 0
+template <typename I>
+void Replay<I>::handle_event(const journal::OpFinishEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Op finish event: "
+ << "op_tid=" << event.op_tid << dendl;
+
+ bool op_in_progress;
+ bool filter_ret_val;
+ Context *on_op_complete = nullptr;
+ Context *on_op_finish_event = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ auto op_it = m_op_events.find(event.op_tid);
+ if (op_it == m_op_events.end()) {
+ ldout(cct, 10) << ": unable to locate associated op: assuming previously "
+ << "committed." << dendl;
+ on_ready->complete(0);
+ m_image_ctx.op_work_queue->queue(on_safe, 0);
+ return;
+ }
+
+ OpEvent &op_event = op_it->second;
+ ceph_assert(op_event.on_finish_safe == nullptr);
+ op_event.on_finish_ready = on_ready;
+ op_event.on_finish_safe = on_safe;
+ op_in_progress = op_event.op_in_progress;
+ // take both contexts out of the op event; they are used (or deleted)
+ // below after the lock has been released
+ std::swap(on_op_complete, op_event.on_op_complete);
+ std::swap(on_op_finish_event, op_event.on_op_finish_event);
+
+ // special errors which indicate op never started but was recorded
+ // as failed in the journal
+ filter_ret_val = (op_event.op_finish_error_codes.count(event.r) != 0);
+ }
+
+ if (event.r < 0) {
+ if (op_in_progress) {
+ // bubble the error up to the in-progress op to cancel it
+ on_op_finish_event->complete(event.r);
+ } else {
+ // op hasn't been started -- bubble the error up since
+ // our image is now potentially in an inconsistent state
+ // since simple errors should have been caught before
+ // creating the op event
+ delete on_op_complete;
+ delete on_op_finish_event;
+ handle_op_complete(event.op_tid, filter_ret_val ? 0 : event.r);
+ }
+ return;
+ }
+
+ // journal recorded success -- apply the op now
+ on_op_finish_event->complete(0);
+}
+
+// Starts replay of a snapshot create op.  The op is queued for execution
+// immediately and on_ready is withheld (stored as on_start_ready) so that no
+// further events are processed until the op reports that it is ready.
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapCreateEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteSnapCreate = ExecuteOp<I, journal::SnapCreateEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Snap create event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  // ignore errors caused due to replay
+  op_state->ignore_error_codes = {-EEXIST};
+
+  // avoid lock cycles: run the op from the work queue
+  Context *execute_ctx = new ExecuteSnapCreate(m_image_ctx, event,
+                                               op_done_ctx);
+  m_image_ctx.op_work_queue->queue(
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx), 0);
+
+  // do not process more events until the state machine is ready
+  // since it will affect IO
+  op_state->op_in_progress = true;
+  op_state->on_start_ready = on_ready;
+}
+
+// Registers a snapshot remove op.  Execution is deferred: the op is stored as
+// the event's on_op_finish_event context and on_ready is completed right away.
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapRemoveEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteSnapRemove = ExecuteOp<I, journal::SnapRemoveEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Snap remove event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *execute_ctx = new ExecuteSnapRemove(m_image_ctx, event,
+                                               op_done_ctx);
+  op_state->on_op_finish_event =
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx);
+
+  // ignore errors caused due to replay
+  op_state->ignore_error_codes = {-ENOENT};
+
+  on_ready->complete(0);
+}
+
+// Registers a snapshot rename op.  Execution is deferred: the op is stored as
+// the event's on_op_finish_event context and on_ready is completed right away.
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapRenameEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteSnapRename = ExecuteOp<I, journal::SnapRenameEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Snap rename event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *execute_ctx = new ExecuteSnapRename(m_image_ctx, event,
+                                               op_done_ctx);
+  op_state->on_op_finish_event =
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx);
+
+  // ignore errors caused due to replay
+  op_state->ignore_error_codes = {-EEXIST};
+
+  on_ready->complete(0);
+}
+
+// Registers a snapshot protect op.  Execution is deferred: the op is stored as
+// the event's on_op_finish_event context and on_ready is completed right away.
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapProtectEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteSnapProtect = ExecuteOp<I, journal::SnapProtectEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Snap protect event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *execute_ctx = new ExecuteSnapProtect(m_image_ctx, event,
+                                                op_done_ctx);
+  op_state->on_op_finish_event =
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx);
+
+  // ignore errors caused due to replay
+  op_state->ignore_error_codes = {-EBUSY};
+
+  on_ready->complete(0);
+}
+
+// Registers a snapshot unprotect op.  Execution is deferred: the op is stored
+// as the event's on_op_finish_event context and on_ready is completed right
+// away.  -EBUSY recorded in the journal and -EINVAL produced during replay are
+// both tolerated.
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapUnprotectEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteSnapUnprotect = ExecuteOp<I, journal::SnapUnprotectEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Snap unprotect event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *execute_ctx = new ExecuteSnapUnprotect(m_image_ctx, event,
+                                                  op_done_ctx);
+  op_state->on_op_finish_event =
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx);
+
+  // ignore errors recorded in the journal
+  op_state->op_finish_error_codes = {-EBUSY};
+
+  // ignore errors caused due to replay
+  op_state->ignore_error_codes = {-EINVAL};
+
+  on_ready->complete(0);
+}
+
+// Registers a snapshot rollback op.  Execution is deferred: the op is stored
+// as the event's on_op_finish_event context and on_ready is completed right
+// away.  No error codes are filtered for this op.
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapRollbackEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteSnapRollback = ExecuteOp<I, journal::SnapRollbackEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Snap rollback start event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *execute_ctx = new ExecuteSnapRollback(m_image_ctx, event,
+                                                 op_done_ctx);
+  op_state->on_op_finish_event =
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx);
+
+  on_ready->complete(0);
+}
+
+// Registers an image rename op.  Execution is deferred: the op is stored as
+// the event's on_op_finish_event context and on_ready is completed right away.
+template <typename I>
+void Replay<I>::handle_event(const journal::RenameEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteRename = ExecuteOp<I, journal::RenameEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Rename event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *execute_ctx = new ExecuteRename(m_image_ctx, event, op_done_ctx);
+  op_state->on_op_finish_event =
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx);
+
+  // ignore errors caused due to replay
+  op_state->ignore_error_codes = {-EEXIST};
+
+  on_ready->complete(0);
+}
+
+// Starts replay of a resize op.  The op is queued for execution immediately
+// and on_ready is withheld (stored as on_start_ready) so that no further
+// events are processed until the op reports that it is ready.
+template <typename I>
+void Replay<I>::handle_event(const journal::ResizeEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteResize = ExecuteOp<I, journal::ResizeEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Resize start event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  // avoid lock cycles: run the op from the work queue
+  Context *execute_ctx = new ExecuteResize(m_image_ctx, event, op_done_ctx);
+  m_image_ctx.op_work_queue->queue(
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx), 0);
+
+  // do not process more events until the state machine is ready
+  // since it will affect IO
+  op_state->op_in_progress = true;
+  op_state->on_start_ready = on_ready;
+}
+
+// Registers a flatten op.  Execution is deferred: the op is stored as the
+// event's on_op_finish_event context and on_ready is completed right away.
+template <typename I>
+void Replay<I>::handle_event(const journal::FlattenEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteFlatten = ExecuteOp<I, journal::FlattenEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Flatten start event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *execute_ctx = new ExecuteFlatten(m_image_ctx, event, op_done_ctx);
+  op_state->on_op_finish_event =
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx);
+
+  // ignore errors caused due to replay
+  op_state->ignore_error_codes = {-EINVAL};
+
+  on_ready->complete(0);
+}
+
+// Demote/promote events require no replay action: the event is logged and
+// both callbacks are completed with 0.
+template <typename I>
+void Replay<I>::handle_event(const journal::DemotePromoteEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Demote/Promote event" << dendl;
+
+  // nothing to apply -- acknowledge immediately
+  on_ready->complete(0);
+  on_safe->complete(0);
+}
+
+// Registers a snapshot limit op.  Execution is deferred: the op is stored as
+// the event's on_op_finish_event context and on_ready is completed right away.
+// An -ERANGE result from the op is treated as success.
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapLimitEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteSnapLimit = ExecuteOp<I, journal::SnapLimitEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Snap limit event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *execute_ctx = new ExecuteSnapLimit(m_image_ctx, event,
+                                              op_done_ctx);
+  op_state->on_op_finish_event =
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx);
+
+  op_state->ignore_error_codes = {-ERANGE};
+
+  on_ready->complete(0);
+}
+
+// Starts replay of an update-features op.  The op is queued for execution
+// immediately and on_ready is withheld (stored as on_start_ready) so that no
+// further events are processed until the op reports that it is ready.
+template <typename I>
+void Replay<I>::handle_event(const journal::UpdateFeaturesEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteUpdateFeatures = ExecuteOp<I, journal::UpdateFeaturesEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Update features event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  // avoid lock cycles: run the op from the work queue
+  Context *execute_ctx = new ExecuteUpdateFeatures(m_image_ctx, event,
+                                                   op_done_ctx);
+  m_image_ctx.op_work_queue->queue(
+    new C_RefreshIfRequired<I>(m_image_ctx, execute_ctx), 0);
+
+  // do not process more events until the state machine is ready
+  // since it will affect IO
+  op_state->op_in_progress = true;
+  op_state->on_start_ready = on_ready;
+}
+
+// Registers a metadata set op.  Execution is deferred: the op (wrapped in an
+// async callback) is stored as the event's on_op_finish_event context and
+// on_ready is completed right away.  Unlike the snapshot ops, the refresh
+// check wraps the op's completion context rather than the op itself.
+template <typename I>
+void Replay<I>::handle_event(const journal::MetadataSetEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteMetadataSet = ExecuteOp<I, journal::MetadataSetEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Metadata set event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *refresh_ctx = new C_RefreshIfRequired<I>(m_image_ctx, op_done_ctx);
+  Context *execute_ctx = new ExecuteMetadataSet(m_image_ctx, event,
+                                                refresh_ctx);
+  op_state->on_op_finish_event =
+    util::create_async_context_callback(m_image_ctx, execute_ctx);
+
+  on_ready->complete(0);
+}
+
+// Registers a metadata remove op.  Execution is deferred: the op (wrapped in
+// an async callback) is stored as the event's on_op_finish_event context and
+// on_ready is completed right away.  The refresh check wraps the op's
+// completion context rather than the op itself.
+template <typename I>
+void Replay<I>::handle_event(const journal::MetadataRemoveEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  using ExecuteMetadataRemove = ExecuteOp<I, journal::MetadataRemoveEvent>;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": Metadata remove event" << dendl;
+
+  Mutex::Locker locker(m_lock);
+  OpEvent *op_state = nullptr;
+  Context *op_done_ctx = create_op_context_callback(event.op_tid, on_ready,
+                                                    on_safe, &op_state);
+  if (op_done_ctx == nullptr) {
+    return;
+  }
+
+  Context *refresh_ctx = new C_RefreshIfRequired<I>(m_image_ctx, op_done_ctx);
+  Context *execute_ctx = new ExecuteMetadataRemove(m_image_ctx, event,
+                                                   refresh_ctx);
+  op_state->on_op_finish_event =
+    util::create_async_context_callback(m_image_ctx, execute_ctx);
+
+  // ignore errors caused due to replay
+  op_state->ignore_error_codes = {-ENOENT};
+
+  on_ready->complete(0);
+}
+
+// Events of unknown type are not replayed: the event is logged and both
+// callbacks are completed with 0.
+template <typename I>
+void Replay<I>::handle_event(const journal::UnknownEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": unknown event" << dendl;
+
+  // nothing to apply -- acknowledge immediately
+  on_ready->complete(0);
+  on_safe->complete(0);
+}
+
+// Completion handler for a replayed AIO modify op.
+//  * on_ready (may be null) is completed with 0
+//  * result codes listed in `filters` are treated as success
+//  * on failure on_safe is queued with the error
+//  * on success with the writeback cache enabled, on_safe is parked in
+//    m_aio_modify_safe_contexts until the next flush completes
+//  * on success without the cache, the in-flight count is decremented, a
+//    paused replay (m_on_aio_ready) is resumed and on_safe is queued with 0
+template <typename I>
+void Replay<I>::handle_aio_modify_complete(Context *on_ready, Context *on_safe,
+                                           int r, std::set<int> &filters,
+                                           bool writeback_cache_enabled) {
+  Mutex::Locker locker(m_lock);
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": on_ready=" << on_ready << ", "
+                 << "on_safe=" << on_safe << ", r=" << r << dendl;
+
+  if (on_ready != nullptr) {
+    on_ready->complete(0);
+  }
+
+  if (filters.count(r) != 0) {
+    r = 0;
+  }
+
+  if (r < 0) {
+    lderr(cct) << ": AIO modify op failed: " << cpp_strerror(r) << dendl;
+    m_image_ctx.op_work_queue->queue(on_safe, r);
+    return;
+  }
+
+  if (writeback_cache_enabled) {
+    // will be completed after next flush operation completes
+    m_aio_modify_safe_contexts.insert(on_safe);
+    return;
+  }
+
+  // IO is safely stored on disk
+  ceph_assert(m_in_flight_aio_modify > 0);
+  --m_in_flight_aio_modify;
+
+  if (m_on_aio_ready != nullptr) {
+    ldout(cct, 10) << ": resuming paused AIO" << dendl;
+    m_on_aio_ready->complete(0);
+    m_on_aio_ready = nullptr;
+  }
+
+  ldout(cct, 20) << ": completing safe context: " << on_safe << dendl;
+  m_image_ctx.op_work_queue->queue(on_safe, 0);
+}
+
+// Completion handler for a replayed (or internally generated) flush.
+// on_flush_safe is the optional safe context of the flush itself and
+// on_safe_ctxs holds the safe contexts of the modify ops associated with this
+// flush.  Under m_lock the in-flight counters are reduced, a paused replay
+// context is claimed, and contexts of modify ops that are not recorded in
+// m_aio_modify_safe_contexts (i.e. previously failed) are dropped from the
+// list.  After the lock is released the remaining contexts -- and the pending
+// shut down context, if this was the last outstanding work -- are completed
+// with the flush result r.
+template <typename I>
+void Replay<I>::handle_aio_flush_complete(Context *on_flush_safe,
+ Contexts &on_safe_ctxs, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << ": AIO flush failed: " << cpp_strerror(r) << dendl;
+ }
+
+ Context *on_aio_ready = nullptr;
+ Context *on_flush = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_in_flight_aio_flush > 0);
+ ceph_assert(m_in_flight_aio_modify >= on_safe_ctxs.size());
+ --m_in_flight_aio_flush;
+ // counted against the full list, before failed contexts are stripped
+ m_in_flight_aio_modify -= on_safe_ctxs.size();
+
+ std::swap(on_aio_ready, m_on_aio_ready);
+ // nothing left in flight: a pending shut down (if any) can now finish
+ if (m_in_flight_op_events == 0 &&
+ (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) {
+ on_flush = m_flush_ctx;
+ }
+
+ // strip out previously failed on_safe contexts
+ for (auto it = on_safe_ctxs.begin(); it != on_safe_ctxs.end(); ) {
+ if (m_aio_modify_safe_contexts.erase(*it)) {
+ ++it;
+ } else {
+ it = on_safe_ctxs.erase(it);
+ }
+ }
+ }
+
+ if (on_aio_ready != nullptr) {
+ ldout(cct, 10) << ": resuming paused AIO" << dendl;
+ on_aio_ready->complete(0);
+ }
+
+ if (on_flush_safe != nullptr) {
+ on_safe_ctxs.push_back(on_flush_safe);
+ }
+ for (auto ctx : on_safe_ctxs) {
+ ldout(cct, 20) << ": completing safe context: " << ctx << dendl;
+ ctx->complete(r);
+ }
+
+ if (on_flush != nullptr) {
+ ldout(cct, 20) << ": completing flush context: " << on_flush << dendl;
+ on_flush->complete(r);
+ }
+}
+
+// Registers a new op event for op_tid and returns the context that must be
+// completed when the op finishes (it invokes handle_op_complete()).
+// The caller must hold m_lock.
+//
+// On success *op_event points at the newly created entry in m_op_events, with
+// on_start_safe and on_op_complete already populated.
+//
+// Returns nullptr -- after completing on_ready with 0 and queueing on_safe
+// with an error -- when the replayer has been shut down (-ESHUTDOWN) or when
+// op_tid is already registered (-EINVAL).  *op_event is not written in that
+// case.
+template <typename I>
+Context *Replay<I>::create_op_context_callback(uint64_t op_tid,
+                                               Context *on_ready,
+                                               Context *on_safe,
+                                               OpEvent **op_event) {
+  // all state touched below (m_shut_down, m_op_events, m_in_flight_op_events)
+  // is accessed under m_lock elsewhere, so verify the precondition before the
+  // first read
+  ceph_assert(m_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  if (m_shut_down) {
+    ldout(cct, 5) << ": ignoring event after shut down" << dendl;
+    on_ready->complete(0);
+    m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN);
+    return nullptr;
+  }
+
+  if (m_op_events.count(op_tid) != 0) {
+    lderr(cct) << ": duplicate op tid detected: " << op_tid << dendl;
+
+    // on_ready is already async but on failure invoke on_safe async
+    // as well
+    on_ready->complete(0);
+    m_image_ctx.op_work_queue->queue(on_safe, -EINVAL);
+    return nullptr;
+  }
+
+  ++m_in_flight_op_events;
+  *op_event = &m_op_events[op_tid];
+  (*op_event)->on_start_safe = on_safe;
+
+  Context *on_op_complete = new C_OpOnComplete(this, op_tid);
+  (*op_event)->on_op_complete = on_op_complete;
+  return on_op_complete;
+}
+
+// Final completion of the op identified by op_tid with result r.
+// The op event is removed from m_op_events under m_lock and its contexts are
+// then completed without the lock held:
+// * on_start_ready (only still set if the op failed before becoming ready)
+// and on_finish_ready are completed with 0
+// * a still-pending on_op_finish_event is completed with r
+// * on_start_safe and on_finish_safe are completed with r, after r has been
+// reset to 0 if it is listed in the op's ignore_error_codes
+// Finally the in-flight op counter is decremented and, if no op or AIO
+// remains in flight, a pending shut down context (m_flush_ctx) is queued.
+template <typename I>
+void Replay<I>::handle_op_complete(uint64_t op_tid, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": op_tid=" << op_tid << ", "
+ << "r=" << r << dendl;
+
+ OpEvent op_event;
+ bool shutting_down = false;
+ {
+ Mutex::Locker locker(m_lock);
+ auto op_it = m_op_events.find(op_tid);
+ ceph_assert(op_it != m_op_events.end());
+
+ // take a private copy of the op state so its contexts can be completed
+ // after the lock is released
+ op_event = std::move(op_it->second);
+ m_op_events.erase(op_it);
+
+ if (m_shut_down) {
+ ceph_assert(m_flush_ctx != nullptr);
+ shutting_down = true;
+ }
+ }
+
+ ceph_assert(op_event.on_start_ready == nullptr || (r < 0 && r != -ERESTART));
+ if (op_event.on_start_ready != nullptr) {
+ // blocking op event failed before it became ready
+ ceph_assert(op_event.on_finish_ready == nullptr &&
+ op_event.on_finish_safe == nullptr);
+
+ op_event.on_start_ready->complete(0);
+ } else {
+ // event kicked off by OpFinishEvent
+ ceph_assert((op_event.on_finish_ready != nullptr &&
+ op_event.on_finish_safe != nullptr) || shutting_down);
+ }
+
+ if (op_event.on_op_finish_event != nullptr) {
+ op_event.on_op_finish_event->complete(r);
+ }
+
+ if (op_event.on_finish_ready != nullptr) {
+ op_event.on_finish_ready->complete(0);
+ }
+
+ // filter out errors caused by replay of the same op
+ if (r < 0 && op_event.ignore_error_codes.count(r) != 0) {
+ r = 0;
+ }
+
+ op_event.on_start_safe->complete(r);
+ if (op_event.on_finish_safe != nullptr) {
+ op_event.on_finish_safe->complete(r);
+ }
+
+ // shut down request might have occurred while lock was
+ // dropped -- handle if pending
+ Context *on_flush = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_in_flight_op_events > 0);
+ --m_in_flight_op_events;
+ if (m_in_flight_op_events == 0 &&
+ (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) {
+ on_flush = m_flush_ctx;
+ }
+ }
+ if (on_flush != nullptr) {
+ m_image_ctx.op_work_queue->queue(on_flush, 0);
+ }
+}
+
+// Creates the AIO completion for a replayed modify op (write, discard, ...).
+//
+// on_ready        completed once more events may be processed; withheld in
+//                 m_on_aio_ready while the high-water mark is reached
+// on_safe         completed once the modification is safe (directly, or by a
+//                 later flush when the writeback cache is enabled)
+// aio_type        type recorded on the created completion
+// flush_required  out: always written; true when the caller should issue a
+//                 flush because the low-water mark of unsafe contexts was hit
+// filters         result codes that are to be treated as success
+//
+// Returns nullptr if the replayer has been shut down; in that case on_ready
+// has been completed with 0, on_safe queued with -ESHUTDOWN and
+// *flush_required set to false.
+template <typename I>
+io::AioCompletion *
+Replay<I>::create_aio_modify_completion(Context *on_ready,
+                                        Context *on_safe,
+                                        io::aio_type_t aio_type,
+                                        bool *flush_required,
+                                        std::set<int> &&filters) {
+  Mutex::Locker locker(m_lock);
+  CephContext *cct = m_image_ctx.cct;
+  ceph_assert(m_on_aio_ready == nullptr);
+
+  // give the out-parameter a defined value on every path, including the
+  // shut down early return below, so callers never read an indeterminate bool
+  *flush_required = false;
+
+  if (m_shut_down) {
+    ldout(cct, 5) << ": ignoring event after shut down" << dendl;
+    on_ready->complete(0);
+    m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN);
+    return nullptr;
+  }
+
+  ++m_in_flight_aio_modify;
+
+  bool writeback_cache_enabled = m_image_ctx.is_writeback_cache_enabled();
+  if (writeback_cache_enabled) {
+    m_aio_modify_unsafe_contexts.push_back(on_safe);
+  }
+
+  // FLUSH if we hit the low-water mark -- on_safe contexts are
+  // completed by flushes-only so that we don't move the journal
+  // commit position until safely on-disk
+
+  *flush_required = (writeback_cache_enabled &&
+                     m_aio_modify_unsafe_contexts.size() ==
+                       IN_FLIGHT_IO_LOW_WATER_MARK);
+  if (*flush_required) {
+    ldout(cct, 10) << ": hit AIO replay low-water mark: scheduling flush"
+                   << dendl;
+  }
+
+  // READY for more events if:
+  // * not at high-water mark for IO
+  // * in-flight ops are at a consistent point (snap create has IO flushed,
+  //   shrink has adjusted clip boundary, etc) -- should have already been
+  //   flagged not-ready
+  if (m_in_flight_aio_modify == IN_FLIGHT_IO_HIGH_WATER_MARK) {
+    ldout(cct, 10) << ": hit AIO replay high-water mark: pausing replay"
+                   << dendl;
+    ceph_assert(m_on_aio_ready == nullptr);
+    // on_ready becomes null here, so the completion below will not fire it
+    std::swap(m_on_aio_ready, on_ready);
+  }
+
+  // when the modification is ACKed by librbd, we can process the next
+  // event. when flushed, the completion of the next flush will fire the
+  // on_safe callback
+  auto aio_comp = io::AioCompletion::create_and_start<Context>(
+    new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters),
+                            writeback_cache_enabled),
+    util::get_image_ctx(&m_image_ctx), aio_type);
+  return aio_comp;
+}
+
+// Creates the AIO completion for a flush.  The caller must hold m_lock.
+// All modify contexts currently listed as unsafe are handed to the flush
+// completion together with the optional on_safe context, and the list is
+// emptied.  Returns nullptr after shut down; on_safe (if any) is then queued
+// with -ESHUTDOWN.
+template <typename I>
+io::AioCompletion *Replay<I>::create_aio_flush_completion(Context *on_safe) {
+  ceph_assert(m_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  if (m_shut_down) {
+    ldout(cct, 5) << ": ignoring event after shut down" << dendl;
+    if (on_safe != nullptr) {
+      m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN);
+    }
+    return nullptr;
+  }
+
+  ++m_in_flight_aio_flush;
+
+  // associate all prior write/discard ops to this flush request
+  Context *flush_ctx = new C_AioFlushComplete(
+    this, on_safe, std::move(m_aio_modify_unsafe_contexts));
+  // the contexts now belong to the flush -- start over with an empty list
+  m_aio_modify_unsafe_contexts.clear();
+
+  return io::AioCompletion::create_and_start<Context>(
+    flush_ctx, util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH);
+}
+
+// Checks whether an IO event starting at image_offset lies beyond the current
+// image size.  If so, aio_comp is completed as a no-op (request count of 0)
+// and true is returned so the caller skips issuing the IO; otherwise false is
+// returned and aio_comp is left untouched.
+template <typename I>
+bool Replay<I>::clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp) {
+  CephContext *cct = m_image_ctx.cct;
+
+  // keep the size in a 64-bit variable: size_t may be only 32 bits wide,
+  // which would truncate large image sizes and wrongly clip in-range IO
+  m_image_ctx.snap_lock.get_read();
+  uint64_t image_size = m_image_ctx.size;
+  m_image_ctx.snap_lock.put_read();
+
+  if (image_offset >= image_size) {
+    // rbd-mirror image sync might race an IO event w/ associated resize between
+    // the point the peer is registered and the sync point is created, so no-op
+    // IO events beyond the current image extents since under normal conditions
+    // it wouldn't have been recorded in the journal
+    ldout(cct, 5) << ": no-op IO event beyond image size" << dendl;
+    aio_comp->get();
+    aio_comp->set_request_count(0);
+    aio_comp->put();
+    return true;
+  }
+
+  return false;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::Replay<librbd::ImageCtx>;
diff --git a/src/librbd/journal/Replay.h b/src/librbd/journal/Replay.h
new file mode 100644
index 00000000..206d7db4
--- /dev/null
+++ b/src/librbd/journal/Replay.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_REPLAY_H
+#define CEPH_LIBRBD_JOURNAL_REPLAY_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "common/Mutex.h"
+#include "librbd/io/Types.h"
+#include "librbd/journal/Types.h"
+#include <boost/variant.hpp>
+#include <list>
+#include <unordered_set>
+#include <unordered_map>
+
+namespace librbd {
+
+class ImageCtx;
+namespace io { struct AioCompletion; }
+
+namespace journal {
+
+template <typename ImageCtxT = ImageCtx>
+class Replay {
+public:
+ static Replay *create(ImageCtxT &image_ctx) { // factory: returns a raw pointer to a new-allocated Replay
+ return new Replay(image_ctx);
+ }
+
+ Replay(ImageCtxT &image_ctx);
+ ~Replay();
+
+ int decode(bufferlist::const_iterator *it, EventEntry *event_entry);
+ void process(const EventEntry &event_entry,
+ Context *on_ready, Context *on_safe);
+
+ void shut_down(bool cancel_ops, Context *on_finish);
+ void flush(Context *on_finish);
+
+ void replay_op_ready(uint64_t op_tid, Context *on_resume);
+
+private:
+ typedef std::unordered_set<int> ReturnValues;
+
+ struct OpEvent { // value type of the OpEvents map (uint64_t -> OpEvent); key is presumably the op_tid -- confirm
+ bool op_in_progress = false;
+ bool finish_on_ready = false;
+ Context *on_op_finish_event = nullptr; // all callbacks below start out unset
+ Context *on_start_ready = nullptr;
+ Context *on_start_safe = nullptr;
+ Context *on_finish_ready = nullptr;
+ Context *on_finish_safe = nullptr;
+ Context *on_op_complete = nullptr;
+ ReturnValues op_finish_error_codes; // sets of int return codes (std::unordered_set<int>)
+ ReturnValues ignore_error_codes;
+ };
+
+ typedef std::list<uint64_t> OpTids;
+ typedef std::list<Context *> Contexts;
+ typedef std::unordered_set<Context *> ContextSet;
+ typedef std::unordered_map<uint64_t, OpEvent> OpEvents;
+
+ struct C_OpOnComplete : public Context { // Context that reports an op's result back to Replay::handle_op_complete
+ Replay *replay; // not owned
+ uint64_t op_tid;
+ C_OpOnComplete(Replay *replay, uint64_t op_tid)
+ : replay(replay), op_tid(op_tid) {
+ }
+ void finish(int r) override {
+ replay->handle_op_complete(op_tid, r); // forward the stored op_tid together with the result code
+ }
+ };
+
+ struct C_AioModifyComplete : public Context { // Context that forwards a modify IO's result to Replay::handle_aio_modify_complete
+ Replay *replay; // not owned
+ Context *on_ready;
+ Context *on_safe;
+ std::set<int> filters; // set of int codes passed through to the handler; presumably error codes to tolerate -- confirm
+ bool writeback_cache_enabled;
+ C_AioModifyComplete(Replay *replay, Context *on_ready,
+ Context *on_safe, std::set<int> &&filters,
+ bool writeback_cache_enabled)
+ : replay(replay), on_ready(on_ready), on_safe(on_safe),
+ filters(std::move(filters)), // explicit move: the named rvalue-reference parameter is an lvalue here
+ writeback_cache_enabled(writeback_cache_enabled) {
+ }
+ void finish(int r) override {
+ replay->handle_aio_modify_complete(on_ready, on_safe, r, filters,
+ writeback_cache_enabled);
+ }
+ };
+
+  struct C_AioFlushComplete : public Context { // Context that forwards a flush result to Replay::handle_aio_flush_complete
+    Replay *replay; // not owned
+    Context *on_flush_safe;
+    Contexts on_safe_ctxs; // contexts taken over from the caller at construction
+    C_AioFlushComplete(Replay *replay, Context *on_flush_safe,
+                       Contexts &&on_safe_ctxs)
+      : replay(replay), on_flush_safe(on_flush_safe),
+        on_safe_ctxs(std::move(on_safe_ctxs)) { // a named rvalue reference is an lvalue: without std::move the list was copied
+    }
+    void finish(int r) override {
+      replay->handle_aio_flush_complete(on_flush_safe, on_safe_ctxs, r);
+    }
+  };
+
+ struct EventVisitor : public boost::static_visitor<void> { // routes each variant alternative to the matching Replay::handle_event overload
+ Replay *replay; // not owned
+ Context *on_ready;
+ Context *on_safe;
+
+ EventVisitor(Replay *_replay, Context *_on_ready, Context *_on_safe)
+ : replay(_replay), on_ready(_on_ready), on_safe(_on_safe) {
+ }
+
+ template <typename Event>
+ inline void operator()(const Event &event) const {
+ replay->handle_event(event, on_ready, on_safe); // overload is selected by the static type of Event
+ }
+ };
+
+ ImageCtxT &m_image_ctx;
+
+ Mutex m_lock;
+
+ uint64_t m_in_flight_aio_flush = 0;
+ uint64_t m_in_flight_aio_modify = 0;
+ Contexts m_aio_modify_unsafe_contexts;
+ ContextSet m_aio_modify_safe_contexts;
+
+ OpEvents m_op_events;
+ uint64_t m_in_flight_op_events = 0;
+
+ bool m_shut_down = false;
+ Context *m_flush_ctx = nullptr;
+ Context *m_on_aio_ready = nullptr;
+
+ void handle_event(const AioDiscardEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const AioWriteEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const AioWriteSameEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const AioCompareAndWriteEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const AioFlushEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const OpFinishEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapCreateEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapRemoveEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapRenameEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapProtectEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapUnprotectEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapRollbackEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const RenameEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const ResizeEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const FlattenEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const DemotePromoteEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapLimitEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const UpdateFeaturesEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const MetadataSetEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const MetadataRemoveEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const UnknownEvent &event, Context *on_ready,
+ Context *on_safe);
+
+ void handle_aio_modify_complete(Context *on_ready, Context *on_safe,
+ int r, std::set<int> &filters,
+ bool writeback_cache_enabled);
+ void handle_aio_flush_complete(Context *on_flush_safe, Contexts &on_safe_ctxs,
+ int r);
+
+ Context *create_op_context_callback(uint64_t op_tid, Context *on_ready,
+ Context *on_safe, OpEvent **op_event);
+ void handle_op_complete(uint64_t op_tid, int r);
+
+ io::AioCompletion *create_aio_modify_completion(Context *on_ready,
+ Context *on_safe,
+ io::aio_type_t aio_type,
+ bool *flush_required,
+ std::set<int> &&filters);
+ io::AioCompletion *create_aio_flush_completion(Context *on_safe);
+ void handle_aio_completion(io::AioCompletion *aio_comp);
+
+ bool clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::Replay<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_REPLAY_H
diff --git a/src/librbd/journal/ResetRequest.cc b/src/librbd/journal/ResetRequest.cc
new file mode 100644
index 00000000..9b67f3e4
--- /dev/null
+++ b/src/librbd/journal/ResetRequest.cc
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/ResetRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "include/ceph_assert.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::ResetRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+template<typename I>
+void ResetRequest<I>::send() { // entry point: kicks off the sequence with init_journaler()
+ init_journaler();
+}
+
+template<typename I>
+void ResetRequest<I>::init_journaler() { // allocates the Journaler and initializes it; result arrives in handle_init_journaler()
+ ldout(m_cct, 10) << dendl;
+
+ m_journaler = new Journaler(m_io_ctx, m_image_id, m_client_id, {}); // deleted in handle_journaler_shutdown()
+ Context *ctx = create_context_callback<
+ ResetRequest<I>, &ResetRequest<I>::handle_init_journaler>(this);
+ m_journaler->init(ctx);
+}
+
+template<typename I>
+void ResetRequest<I>::handle_init_journaler(int r) { // records the init result / journal metadata, then always shuts the journaler down
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ ldout(m_cct, 5) << "journal does not exist" << dendl; // logged at debug level rather than as an error
+ m_ret_val = r; // still stored, so the request finishes with -ENOENT after shutdown
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to init journaler: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ } else {
+ int64_t pool_id;
+ m_journaler->get_metadata(&m_order, &m_splay_width, &pool_id); // remembered for create_journal()
+
+ if (pool_id != -1) { // presumably -1 means no separate data pool is configured -- confirm
+ librados::Rados rados(m_io_ctx);
+ r = rados.pool_reverse_lookup(pool_id, &m_object_pool_name); // create_journal() passes the pool by name
+ if (r < 0) {
+ lderr(m_cct) << "failed to lookup data pool: " << cpp_strerror(r)
+ << dendl;
+ m_ret_val = r;
+ }
+ }
+ }
+
+ shut_down_journaler(); // runs on success and on failure alike
+}
+
+template<typename I>
+void ResetRequest<I>::shut_down_journaler() { // shuts the journaler down; handle_journaler_shutdown() is invoked via m_op_work_queue
+ ldout(m_cct, 10) << dendl;
+
+ Context *ctx = create_async_context_callback(
+ m_op_work_queue, create_context_callback<
+ ResetRequest<I>, &ResetRequest<I>::handle_journaler_shutdown>(this));
+ m_journaler->shut_down(ctx);
+}
+
+template<typename I>
+void ResetRequest<I>::handle_journaler_shutdown(int r) { // frees the journaler, then either finishes with the saved error or removes the journal
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ delete m_journaler; // allocated in init_journaler(); the pointer is not reset afterwards
+ if (r < 0) {
+ lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(r)
+ << dendl;
+ if (m_ret_val == 0) { // keep the earlier error, if any, in preference to the shutdown error
+ m_ret_val = r;
+ }
+ }
+
+ if (m_ret_val < 0) {
+ finish(m_ret_val);
+ return;
+ }
+
+ remove_journal();
+}
+
+template<typename I>
+void ResetRequest<I>::remove_journal() { // issues a RemoveRequest for the journal; continues in handle_remove_journal()
+ ldout(m_cct, 10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ResetRequest<I>, &ResetRequest<I>::handle_remove_journal>(this);
+ auto req = RemoveRequest<I>::create(m_io_ctx, m_image_id, m_client_id,
+ m_op_work_queue, ctx);
+ req->send();
+}
+
+template<typename I>
+void ResetRequest<I>::handle_remove_journal(int r) { // on failure finishes with r; otherwise proceeds to create_journal()
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove journal: " << cpp_strerror(r) << dendl;
+ finish(r); // deletes this, so nothing may touch members after the call
+ return;
+ }
+
+ create_journal();
+}
+
+template<typename I>
+void ResetRequest<I>::create_journal() { // issues a CreateRequest using the order/splay width/pool captured in handle_init_journaler()
+ ldout(m_cct, 10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ResetRequest<I>, &ResetRequest<I>::handle_create_journal>(this);
+ journal::TagData tag_data(m_mirror_uuid); // tag data is built from the mirror uuid given to the constructor
+ auto req = CreateRequest<I>::create(m_io_ctx, m_image_id, m_order,
+ m_splay_width, m_object_pool_name,
+ cls::journal::Tag::TAG_CLASS_NEW,
+ tag_data, m_client_id, m_op_work_queue,
+ ctx);
+ req->send();
+}
+
+template<typename I>
+void ResetRequest<I>::handle_create_journal(int r) { // last step: logs a failure, then finishes with r either way
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to create journal: " << cpp_strerror(r) << dendl;
+ }
+ finish(r);
+}
+
+template<typename I>
+void ResetRequest<I>::finish(int r) { // completes m_on_finish with r and destroys this request
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this; // self-owning request: no member access is valid past this line
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::ResetRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/ResetRequest.h b/src/librbd/journal/ResetRequest.h
new file mode 100644
index 00000000..9cc8e2e4
--- /dev/null
+++ b/src/librbd/journal/ResetRequest.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "common/Mutex.h"
+#include "librbd/journal/TypeTraits.h"
+#include <string>
+
+class Context;
+class ContextWQ;
+class SafeTimer;
+
+namespace journal { class Journaler; }
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace journal {
+
+template<typename ImageCtxT = ImageCtx>
+class ResetRequest { // removes an image's journal and re-creates it with the same order/splay width/pool (see state diagram below)
+public:
+ static ResetRequest *create(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ const std::string &client_id,
+ const std::string &mirror_uuid,
+ ContextWQ *op_work_queue, Context *on_finish) {
+ return new ResetRequest(io_ctx, image_id, client_id, mirror_uuid,
+ op_work_queue, on_finish); // the request deletes itself in finish()
+ }
+
+ ResetRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+ const std::string &client_id, const std::string &mirror_uuid,
+ ContextWQ *op_work_queue, Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id), m_client_id(client_id),
+ m_mirror_uuid(mirror_uuid), m_op_work_queue(op_work_queue),
+ m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(m_io_ctx.cct())) { // reads m_io_ctx, which is declared (and so initialized) before m_cct
+ }
+
+ void send(); // starts the state machine
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * INIT_JOURNALER
+ * |
+ * v
+ * SHUT_DOWN_JOURNALER
+ * |
+ * v
+ * REMOVE_JOURNAL
+ * |
+ * v
+ * CREATE_JOURNAL
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+
+ librados::IoCtx &m_io_ctx; // stored by reference: the caller's IoCtx must outlive this request
+ std::string m_image_id;
+ std::string m_client_id;
+ std::string m_mirror_uuid;
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish; // completed exactly once, from finish()
+
+ CephContext *m_cct;
+ Journaler *m_journaler = nullptr; // allocated in init_journaler(), deleted in handle_journaler_shutdown()
+ int m_ret_val = 0; // first error seen during init/shutdown
+
+ uint8_t m_order = 0; // m_order, m_splay_width and m_object_pool_name are read from the existing journal and reused on create
+ uint8_t m_splay_width = 0;
+ std::string m_object_pool_name;
+
+ void init_journaler();
+ void handle_init_journaler(int r);
+
+ void shut_down_journaler();
+ void handle_journaler_shutdown(int r);
+
+ void remove_journal();
+ void handle_remove_journal(int r);
+
+ void create_journal();
+ void handle_create_journal(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::ResetRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H
diff --git a/src/librbd/journal/StandardPolicy.cc b/src/librbd/journal/StandardPolicy.cc
new file mode 100644
index 00000000..58631801
--- /dev/null
+++ b/src/librbd/journal/StandardPolicy.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/StandardPolicy.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::StandardPolicy: "
+
+namespace librbd {
+namespace journal {
+
+template<typename I>
+void StandardPolicy<I>::allocate_tag_on_lock(Context *on_finish) {
+  auto &journal = m_image_ctx->journal;
+  ceph_assert(journal != nullptr);
+  if (journal->is_tag_owner()) {
+    journal->allocate_local_tag(on_finish);
+    return;
+  }
+
+  // not the tag owner: refuse and complete on_finish via the op work queue
+  lderr(m_image_ctx->cct) << "local image not promoted" << dendl;
+  m_image_ctx->op_work_queue->queue(on_finish, -EPERM);
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::StandardPolicy<librbd::ImageCtx>;
diff --git a/src/librbd/journal/StandardPolicy.h b/src/librbd/journal/StandardPolicy.h
new file mode 100644
index 00000000..ec8d0148
--- /dev/null
+++ b/src/librbd/journal/StandardPolicy.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
+#define CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
+
+#include "librbd/journal/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+template<typename ImageCtxT = ImageCtx>
+class StandardPolicy : public Policy { // Policy implementation that leaves appends and journaling enabled
+public:
+ StandardPolicy(ImageCtxT *image_ctx) : m_image_ctx(image_ctx) {
+ }
+
+ bool append_disabled() const override {
+ return false; // always false for this policy
+ }
+ bool journal_disabled() const override {
+ return false; // always false for this policy
+ }
+ void allocate_tag_on_lock(Context *on_finish) override; // defined in StandardPolicy.cc
+
+private:
+ ImageCtxT *m_image_ctx; // not owned
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::StandardPolicy<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
diff --git a/src/librbd/journal/TypeTraits.h b/src/librbd/journal/TypeTraits.h
new file mode 100644
index 00000000..d6dde690
--- /dev/null
+++ b/src/librbd/journal/TypeTraits.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
+#define CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
+
+namespace journal {
+class Future;
+class Journaler;
+class ReplayEntry;
+}
+
+namespace librbd {
+namespace journal {
+
+template <typename ImageCtxT>
+struct TypeTraits { // maps an image-context type to the journal classes to use; the primary template selects the ::journal ones
+ typedef ::journal::Journaler Journaler;
+ typedef ::journal::Future Future;
+ typedef ::journal::ReplayEntry ReplayEntry;
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
diff --git a/src/librbd/journal/Types.cc b/src/librbd/journal/Types.cc
new file mode 100644
index 00000000..ee919d35
--- /dev/null
+++ b/src/librbd/journal/Types.cc
@@ -0,0 +1,950 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/Types.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "include/types.h"
+#include "common/Formatter.h"
+
+namespace librbd {
+namespace journal {
+
+using ceph::encode;
+using ceph::decode;
+
+namespace {
+
+template <typename E>
+class GetTypeVisitor : public boost::static_visitor<E> { // yields the static TYPE constant of whichever alternative the variant holds
+public:
+ template <typename T>
+ inline E operator()(const T&) const {
+ return T::TYPE; // the value of the visited object is never inspected
+ }
+};
+
+class EncodeVisitor : public boost::static_visitor<void> { // appends "type tag + payload" for the held alternative to a bufferlist
+public:
+ explicit EncodeVisitor(bufferlist &bl) : m_bl(bl) {
+ }
+
+ template <typename T>
+ inline void operator()(const T& t) const {
+ encode(static_cast<uint32_t>(T::TYPE), m_bl); // tag is written as a uint32_t ahead of the payload
+ t.encode(m_bl);
+ }
+private:
+ bufferlist &m_bl; // output buffer, held by reference
+};
+
+class DecodeVisitor : public boost::static_visitor<void> { // decodes the payload of the already-selected alternative
+public:
+ DecodeVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {
+ }
+
+ template <typename T>
+ inline void operator()(T& t) const {
+ t.decode(m_version, m_iter); // the version lets each type skip fields older encodings lack
+ }
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter; // held by reference, so the caller's iterator advances
+};
+
+class DumpVisitor : public boost::static_visitor<void> { // dumps the held alternative's type under a caller-chosen key, then its fields
+public:
+ explicit DumpVisitor(Formatter *formatter, const std::string &key)
+ : m_formatter(formatter), m_key(key) {}
+
+ template <typename T>
+ inline void operator()(const T& t) const {
+ auto type = T::TYPE;
+ m_formatter->dump_string(m_key.c_str(), stringify(type)); // type is emitted in its stringify() form
+ t.dump(m_formatter);
+ }
+private:
+ ceph::Formatter *m_formatter; // not owned
+ std::string m_key;
+};
+
+} // anonymous namespace
+
+void AioDiscardEvent::encode(bufferlist& bl) const { // writes offset, length, a derived skip_partial_discard flag, then the granularity
+ using ceph::encode;
+ encode(offset, bl);
+ encode(length, bl);
+ bool skip_partial_discard = (discard_granularity_bytes > 0); // flag is still written because decode() reads it for version >= 4
+ encode(skip_partial_discard, bl);
+ encode(discard_granularity_bytes, bl);
+}
+
+void AioDiscardEvent::decode(__u8 version, bufferlist::const_iterator& it) { // version-aware decode; synthesizes the granularity for encodings older than 5
+ using ceph::decode;
+ decode(offset, it);
+ decode(length, it);
+
+ bool skip_partial_discard = false; // default when the encoding predates version 4
+ if (version >= 4) {
+ decode(skip_partial_discard, it);
+ }
+
+ if (version >= 5) {
+ decode(discard_granularity_bytes, it);
+ } else {
+ if (skip_partial_discard) {
+ // use a size larger than the maximum object size which will be
+ // truncated down to object size during IO processing
+ discard_granularity_bytes = std::numeric_limits<uint32_t>::max();
+ } else {
+ discard_granularity_bytes = 0;
+ }
+ }
+}
+
+void AioDiscardEvent::dump(Formatter *f) const { // emits offset, length and discard_granularity_bytes as unsigned fields
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+ f->dump_unsigned("discard_granularity_bytes", discard_granularity_bytes);
+}
+
+uint32_t AioWriteEvent::get_fixed_size() { // EventEntry's fixed size plus 16 bytes for this event's own fixed fields
+ return EventEntry::get_fixed_size() + 16 /* offset, length */; // presumably 2 x 8-byte integers -- confirm against Types.h
+}
+
+void AioWriteEvent::encode(bufferlist& bl) const { // writes offset, length, data in that order
+ using ceph::encode;
+ encode(offset, bl);
+ encode(length, bl);
+ encode(data, bl);
+}
+
+void AioWriteEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads offset, length, data; version is not consulted
+ using ceph::decode;
+ decode(offset, it);
+ decode(length, it);
+ decode(data, it);
+}
+
+void AioWriteEvent::dump(Formatter *f) const { // emits offset and length only; the data payload is not dumped
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
+void AioWriteSameEvent::encode(bufferlist& bl) const { // writes offset, length, data in that order
+ using ceph::encode;
+ encode(offset, bl);
+ encode(length, bl);
+ encode(data, bl);
+}
+
+void AioWriteSameEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads offset, length, data; version is not consulted
+ using ceph::decode;
+ decode(offset, it);
+ decode(length, it);
+ decode(data, it);
+}
+
+void AioWriteSameEvent::dump(Formatter *f) const { // emits offset and length only; the data payload is not dumped
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
+uint32_t AioCompareAndWriteEvent::get_fixed_size() { // EventEntry's fixed size plus 32 bytes for this event's own fixed fields
+ return EventEntry::get_fixed_size() + 32 /* offset, length */; // NOTE(review): 32 is twice AioWriteEvent's 16 for the same two named fields -- confirm what the extra bytes cover
+}
+
+void AioCompareAndWriteEvent::encode(bufferlist& bl) const { // writes offset, length, cmp_data, write_data in that order
+ using ceph::encode;
+ encode(offset, bl);
+ encode(length, bl);
+ encode(cmp_data, bl);
+ encode(write_data, bl);
+}
+
+void AioCompareAndWriteEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads the four fields in encode() order; version is not consulted
+ using ceph::decode;
+ decode(offset, it);
+ decode(length, it);
+ decode(cmp_data, it);
+ decode(write_data, it);
+}
+
+void AioCompareAndWriteEvent::dump(Formatter *f) const { // emits offset and length only; neither data buffer is dumped
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
+void AioFlushEvent::encode(bufferlist& bl) const { // intentionally empty: nothing is written for this event
+}
+
+void AioFlushEvent::decode(__u8 version, bufferlist::const_iterator& it) { // intentionally empty: nothing is read for this event
+}
+
+void AioFlushEvent::dump(Formatter *f) const { // intentionally empty: no fields to dump
+}
+
+void OpEventBase::encode(bufferlist& bl) const { // writes op_tid; derived events call this before their own fields
+ using ceph::encode;
+ encode(op_tid, bl);
+}
+
+void OpEventBase::decode(__u8 version, bufferlist::const_iterator& it) { // reads op_tid; version is not consulted
+ using ceph::decode;
+ decode(op_tid, it);
+}
+
+void OpEventBase::dump(Formatter *f) const { // emits op_tid as an unsigned field
+ f->dump_unsigned("op_tid", op_tid);
+}
+
+void OpFinishEvent::encode(bufferlist& bl) const { // writes the base fields, op_tid again, then the result code r
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(op_tid, bl); // second copy of op_tid (the base already wrote one); decode() reads both, so it is part of the format
+ encode(r, bl);
+}
+
+void OpFinishEvent::decode(__u8 version, bufferlist::const_iterator& it) { // mirrors encode(): base fields, op_tid again, then r
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(op_tid, it); // consumes the duplicate op_tid that encode() emits
+ decode(r, it);
+}
+
+void OpFinishEvent::dump(Formatter *f) const { // emits the base fields, then op_tid and the result
+ OpEventBase::dump(f);
+ f->dump_unsigned("op_tid", op_tid); // NOTE(review): OpEventBase::dump() already emitted an "op_tid" field
+ f->dump_int("result", r);
+}
+
+void SnapEventBase::encode(bufferlist& bl) const { // writes the base fields, snap_name, then snap_namespace
+ using ceph::encode;
+ OpEventBase::encode(bl);
+ encode(snap_name, bl);
+ encode(snap_namespace, bl);
+}
+
+void SnapEventBase::decode(__u8 version, bufferlist::const_iterator& it) { // reads base fields and snap_name; snap_namespace only for version >= 4
+ using ceph::decode;
+ OpEventBase::decode(version, it);
+ using ceph::decode; // redundant repeat of the using-declaration two lines up
+ decode(snap_name, it);
+ if (version >= 4) {
+ decode(snap_namespace, it); // otherwise snap_namespace keeps whatever value it already had
+ }
+}
+
+void SnapEventBase::dump(Formatter *f) const { // emits the base fields, snap_name, then the namespace's own dump
+ OpEventBase::dump(f);
+ f->dump_string("snap_name", snap_name);
+ snap_namespace.dump(f);
+}
+
+void SnapCreateEvent::encode(bufferlist &bl) const { // adds nothing beyond SnapEventBase's encoding
+ SnapEventBase::encode(bl);
+}
+
+void SnapCreateEvent::decode(__u8 version, bufferlist::const_iterator& it) { // base decode plus a version-3-only snap_namespace read
+ using ceph::decode;
+ SnapEventBase::decode(version, it);
+ if (version == 3) { // exactly 3: for version >= 4 SnapEventBase::decode() has already read snap_namespace
+ decode(snap_namespace, it);
+ }
+}
+
+void SnapCreateEvent::dump(Formatter *f) const { // adds nothing beyond SnapEventBase's dump
+ SnapEventBase::dump(f);
+}
+
+void SnapLimitEvent::encode(bufferlist &bl) const { // writes the base fields, then limit
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(limit, bl);
+}
+
+void SnapLimitEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads the base fields, then limit
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(limit, it);
+}
+
+void SnapLimitEvent::dump(Formatter *f) const { // emits the base fields, then limit as unsigned
+ OpEventBase::dump(f);
+ f->dump_unsigned("limit", limit);
+}
+
+void SnapRenameEvent::encode(bufferlist& bl) const { // writes base fields, dst_snap_name, snap_id, then src_snap_name
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(dst_snap_name, bl);
+ encode(snap_id, bl);
+ encode(src_snap_name, bl); // last, matching decode(), which reads it only for version >= 2
+}
+
+void SnapRenameEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads fields in encode() order; src_snap_name only for version >= 2
+ using ceph::decode;
+ OpEventBase::decode(version, it);
+ decode(dst_snap_name, it);
+ decode(snap_id, it);
+ if (version >= 2) {
+ decode(src_snap_name, it);
+ }
+}
+
+void SnapRenameEvent::dump(Formatter *f) const { // emits base fields, source snap id/name and destination name
+ OpEventBase::dump(f);
+ f->dump_unsigned("src_snap_id", snap_id);
+ f->dump_string("src_snap_name", src_snap_name);
+ f->dump_string("dest_snap_name", dst_snap_name);
+}
+
+void RenameEvent::encode(bufferlist& bl) const { // writes the base fields, then image_name
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(image_name, bl);
+}
+
+void RenameEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads the base fields, then image_name
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(image_name, it);
+}
+
+void RenameEvent::dump(Formatter *f) const { // emits the base fields, then image_name
+ OpEventBase::dump(f);
+ f->dump_string("image_name", image_name);
+}
+
+void ResizeEvent::encode(bufferlist& bl) const { // writes the base fields, then size
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(size, bl);
+}
+
+void ResizeEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads the base fields, then size
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(size, it);
+}
+
+void ResizeEvent::dump(Formatter *f) const { // emits the base fields, then size as unsigned
+ OpEventBase::dump(f);
+ f->dump_unsigned("size", size);
+}
+
+void DemotePromoteEvent::encode(bufferlist& bl) const { // intentionally empty: nothing is written for this event
+}
+
+void DemotePromoteEvent::decode(__u8 version, bufferlist::const_iterator& it) { // intentionally empty: nothing is read for this event
+}
+
+void DemotePromoteEvent::dump(Formatter *f) const { // intentionally empty: no fields to dump
+}
+
+void UpdateFeaturesEvent::encode(bufferlist& bl) const { // writes the base fields, features, then enabled
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(features, bl);
+ encode(enabled, bl);
+}
+
+void UpdateFeaturesEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads the base fields, features, then enabled
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(features, it);
+ decode(enabled, it);
+}
+
+void UpdateFeaturesEvent::dump(Formatter *f) const { // emits the base fields, features (unsigned) and enabled (bool)
+ OpEventBase::dump(f);
+ f->dump_unsigned("features", features);
+ f->dump_bool("enabled", enabled);
+}
+
+void MetadataSetEvent::encode(bufferlist& bl) const { // writes the base fields, key, then value
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(key, bl);
+ encode(value, bl);
+}
+
+void MetadataSetEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads the base fields, key, then value
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(key, it);
+ decode(value, it);
+}
+
+void MetadataSetEvent::dump(Formatter *f) const { // emits the base fields, key and value as strings
+ OpEventBase::dump(f);
+ f->dump_string("key", key);
+ f->dump_string("value", value);
+}
+
+void MetadataRemoveEvent::encode(bufferlist& bl) const { // writes the base fields, then key
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(key, bl);
+}
+
+void MetadataRemoveEvent::decode(__u8 version, bufferlist::const_iterator& it) { // reads the base fields, then key
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(key, it);
+}
+
+void MetadataRemoveEvent::dump(Formatter *f) const { // emits the base fields, then key
+ OpEventBase::dump(f);
+ f->dump_string("key", key);
+}
+
+void UnknownEvent::encode(bufferlist& bl) const { // an UnknownEvent must never be encoded: this aborts the process
+ ceph_abort();
+}
+
+void UnknownEvent::decode(__u8 version, bufferlist::const_iterator& it) { // intentionally empty: the iterator is left untouched
+}
+
+void UnknownEvent::dump(Formatter *f) const { // intentionally empty: no fields to dump
+}
+
+EventType EventEntry::get_event_type() const { // returns the TYPE constant of the alternative currently held in event
+ return boost::apply_visitor(GetTypeVisitor<EventType>(), event);
+}
+
+void EventEntry::encode(bufferlist& bl) const { // writes the versioned event section followed by a separate metadata section
+ ENCODE_START(5, 1, bl); // presumably (struct version 5, compat version 1) -- confirm against the macro definition
+ boost::apply_visitor(EncodeVisitor(bl), event); // type tag + payload
+ ENCODE_FINISH(bl);
+ encode_metadata(bl); // placed after ENCODE_FINISH, i.e. outside the event section
+}
+
+void EventEntry::decode(bufferlist::const_iterator& it) { // reads the type tag, selects the variant alternative, decodes payload and optional metadata
+ DECODE_START(1, it);
+
+ uint32_t event_type;
+ decode(event_type, it);
+
+ // select the correct payload variant based upon the encoded op
+ switch (event_type) {
+ case EVENT_TYPE_AIO_DISCARD:
+ event = AioDiscardEvent();
+ break;
+ case EVENT_TYPE_AIO_WRITE:
+ event = AioWriteEvent();
+ break;
+ case EVENT_TYPE_AIO_FLUSH:
+ event = AioFlushEvent();
+ break;
+ case EVENT_TYPE_OP_FINISH:
+ event = OpFinishEvent();
+ break;
+ case EVENT_TYPE_SNAP_CREATE:
+ event = SnapCreateEvent();
+ break;
+ case EVENT_TYPE_SNAP_REMOVE:
+ event = SnapRemoveEvent();
+ break;
+ case EVENT_TYPE_SNAP_RENAME:
+ event = SnapRenameEvent();
+ break;
+ case EVENT_TYPE_SNAP_PROTECT:
+ event = SnapProtectEvent();
+ break;
+ case EVENT_TYPE_SNAP_UNPROTECT:
+ event = SnapUnprotectEvent();
+ break;
+ case EVENT_TYPE_SNAP_ROLLBACK:
+ event = SnapRollbackEvent();
+ break;
+ case EVENT_TYPE_RENAME:
+ event = RenameEvent();
+ break;
+ case EVENT_TYPE_RESIZE:
+ event = ResizeEvent();
+ break;
+ case EVENT_TYPE_FLATTEN:
+ event = FlattenEvent();
+ break;
+ case EVENT_TYPE_DEMOTE_PROMOTE:
+ event = DemotePromoteEvent();
+ break;
+ case EVENT_TYPE_UPDATE_FEATURES:
+ event = UpdateFeaturesEvent();
+ break;
+ case EVENT_TYPE_METADATA_SET:
+ event = MetadataSetEvent();
+ break;
+ case EVENT_TYPE_METADATA_REMOVE:
+ event = MetadataRemoveEvent();
+ break;
+ case EVENT_TYPE_AIO_WRITESAME:
+ event = AioWriteSameEvent();
+ break;
+ case EVENT_TYPE_AIO_COMPARE_AND_WRITE:
+ event = AioCompareAndWriteEvent();
+ break;
+ default:
+ event = UnknownEvent(); // unrecognized tags become UnknownEvent (whose decode reads nothing) instead of failing
+ break;
+ }
+
+ boost::apply_visitor(DecodeVisitor(struct_v, it), event); // struct_v is supplied by DECODE_START
+ DECODE_FINISH(it);
+ if (struct_v >= 4) { // the trailing metadata section is only read for version >= 4
+ decode_metadata(it);
+ }
+}
+
+void EventEntry::dump(Formatter *f) const { // emits "event_type", the event's own fields, then "timestamp"
+ boost::apply_visitor(DumpVisitor(f, "event_type"), event);
+ f->dump_stream("timestamp") << timestamp;
+}
+
+void EventEntry::encode_metadata(bufferlist& bl) const { // writes the timestamp inside its own ENCODE_START/FINISH section
+ ENCODE_START(1, 1, bl);
+ encode(timestamp, bl);
+ ENCODE_FINISH(bl);
+}
+
+void EventEntry::decode_metadata(bufferlist::const_iterator& it) { // reads the timestamp section written by encode_metadata()
+ DECODE_START(1, it);
+ decode(timestamp, it);
+ DECODE_FINISH(it);
+}
+
+void EventEntry::generate_test_instances(std::list<EventEntry *> &o) { // appends new-allocated sample entries to o; freeing them is left to the caller
+ o.push_back(new EventEntry(AioDiscardEvent()));
+ o.push_back(new EventEntry(AioDiscardEvent(123, 345, 4096), utime_t(1, 1)));
+
+ bufferlist bl;
+ bl.append(std::string(32, '1')); // 32-byte payload of '1' characters for the write sample
+ o.push_back(new EventEntry(AioWriteEvent()));
+ o.push_back(new EventEntry(AioWriteEvent(123, 456, bl), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(AioFlushEvent()));
+
+ o.push_back(new EventEntry(OpFinishEvent(123, -1), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapCreateEvent(), utime_t(1, 1)));
+ o.push_back(new EventEntry(SnapCreateEvent(234, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapRemoveEvent()));
+ o.push_back(new EventEntry(SnapRemoveEvent(345, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapRenameEvent()));
+ o.push_back(new EventEntry(SnapRenameEvent(456, 1, "src snap", "dest snap"),
+ utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapProtectEvent()));
+ o.push_back(new EventEntry(SnapProtectEvent(567, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapUnprotectEvent()));
+ o.push_back(new EventEntry(SnapUnprotectEvent(678, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapRollbackEvent()));
+ o.push_back(new EventEntry(SnapRollbackEvent(789, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(RenameEvent()));
+ o.push_back(new EventEntry(RenameEvent(890, "image name"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(ResizeEvent()));
+ o.push_back(new EventEntry(ResizeEvent(901, 1234), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(FlattenEvent(123), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(DemotePromoteEvent()));
+
+ o.push_back(new EventEntry(UpdateFeaturesEvent()));
+ o.push_back(new EventEntry(UpdateFeaturesEvent(123, 127, true), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(MetadataSetEvent()));
+ o.push_back(new EventEntry(MetadataSetEvent(123, "key", "value"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(MetadataRemoveEvent()));
+ o.push_back(new EventEntry(MetadataRemoveEvent(123, "key"), utime_t(1, 1))); // NOTE(review): no samples for AioWriteSame, AioCompareAndWrite or SnapLimit events
+}
+
+// Journal Client
+
+void ImageClientMeta::encode(bufferlist& bl) const { // writes tag_class, then resync_requested
+ using ceph::encode;
+ encode(tag_class, bl);
+ encode(resync_requested, bl);
+}
+
+void ImageClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { // reads tag_class, then resync_requested; version is not consulted
+ using ceph::decode;
+ decode(tag_class, it);
+ decode(resync_requested, it);
+}
+
+void ImageClientMeta::dump(Formatter *f) const { // emits tag_class (unsigned) and resync_requested (bool)
+ f->dump_unsigned("tag_class", tag_class);
+ f->dump_bool("resync_requested", resync_requested);
+}
+
+void MirrorPeerSyncPoint::encode(bufferlist& bl) const { // writes snap_name, from_snap_name, object_number, then snap_namespace
+ using ceph::encode;
+ encode(snap_name, bl);
+ encode(from_snap_name, bl);
+ encode(object_number, bl);
+ encode(snap_namespace, bl); // last, matching decode(), which reads it only for version >= 2
+}
+
+void MirrorPeerSyncPoint::decode(__u8 version, bufferlist::const_iterator& it) { // Read a sync point; `version` gates the trailing field.
+ using ceph::decode;
+ decode(snap_name, it);
+ decode(from_snap_name, it);
+ decode(object_number, it);
+ if (version >= 2) { // below version 2 snap_namespace is not read and keeps its current value
+ decode(snap_namespace, it);
+ }
+}
+
+void MirrorPeerSyncPoint::dump(Formatter *f) const { // Emit the sync point through the formatter.
+ f->dump_string("snap_name", snap_name);
+ f->dump_string("from_snap_name", from_snap_name);
+ if (object_number) { // the optional is only dumped when it holds a value
+ f->dump_unsigned("object_number", *object_number);
+ }
+ snap_namespace.dump(f);
+}
+
+// Serialize the mirror peer client state.  Layout, in order: image id,
+// state (as uint32_t), sync object count, the number of sync points
+// followed by each sync point, and finally the snap sequence mapping.
+void MirrorPeerClientMeta::encode(bufferlist& bl) const {
+  using ceph::encode;
+
+  encode(image_id, bl);
+  encode(static_cast<uint32_t>(state), bl);
+  encode(sync_object_count, bl);
+
+  // element count first, then the elements themselves
+  encode(static_cast<uint32_t>(sync_points.size()), bl);
+  for (const auto &point : sync_points) {
+    point.encode(bl);
+  }
+
+  encode(snap_seqs, bl);
+}
+
+// Deserialize the mirror peer client state, mirroring encode().  `version`
+// is forwarded unchanged to each sync point's decode().
+void MirrorPeerClientMeta::decode(__u8 version, bufferlist::const_iterator& it) {
+  using ceph::decode;
+
+  decode(image_id, it);
+
+  // the state travels as a plain uint32_t
+  uint32_t raw_state;
+  decode(raw_state, it);
+  state = static_cast<MirrorPeerState>(raw_state);
+
+  decode(sync_object_count, it);
+
+  // size the list to the encoded count, then decode each entry in place
+  uint32_t num_sync_points;
+  decode(num_sync_points, it);
+  sync_points.resize(num_sync_points);
+  for (auto &point : sync_points) {
+    point.decode(version, it);
+  }
+
+  decode(snap_seqs, it);
+}
+
+// Emit the client state through the formatter: scalar fields first, then a
+// "sync_points" array and a "snap_seqs" array of local/peer pairs.
+void MirrorPeerClientMeta::dump(Formatter *f) const {
+  f->dump_string("image_id", image_id);
+  f->dump_stream("state") << state;
+  f->dump_unsigned("sync_object_count", sync_object_count);
+
+  f->open_array_section("sync_points");
+  for (const auto &point : sync_points) {
+    f->open_object_section("sync_point");
+    point.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("snap_seqs");
+  for (const auto &seq : snap_seqs) {
+    f->open_object_section("snap_seq");
+    f->dump_unsigned("local_snap_seq", seq.first);
+    f->dump_unsigned("peer_snap_seq", seq.second);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+void CliClientMeta::encode(bufferlist& bl) const { // no fields: nothing is written to bl
+}
+
+void CliClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { // no fields: nothing is read from it
+}
+
+void CliClientMeta::dump(Formatter *f) const { // no fields to dump
+}
+
+void UnknownClientMeta::encode(bufferlist& bl) const { // encoding the unknown placeholder aborts the process
+ ceph_abort();
+}
+
+void UnknownClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { // reads nothing from the iterator
+}
+
+void UnknownClientMeta::dump(Formatter *f) const { // nothing to dump
+}
+
+ClientMetaType ClientData::get_client_meta_type() const { // Type of whichever alternative client_meta currently holds.
+ return boost::apply_visitor(GetTypeVisitor<ClientMetaType>(), client_meta);
+}
+
+void ClientData::encode(bufferlist& bl) const { // Encode the active client_meta alternative between ENCODE_START/ENCODE_FINISH.
+ ENCODE_START(2, 1, bl); // presumably struct version 2, compat version 1 -- confirm against include/encoding.h
+ boost::apply_visitor(EncodeVisitor(bl), client_meta);
+ ENCODE_FINISH(bl);
+}
+
+// Decode a ClientData: read the encoded client meta type, switch the
+// variant to the matching alternative (UnknownClientMeta for any value
+// that is not recognized) and then let that alternative decode its own
+// payload using the struct version provided by DECODE_START.
+void ClientData::decode(bufferlist::const_iterator& it) {
+  DECODE_START(1, it);
+
+  uint32_t encoded_type;
+  decode(encoded_type, it);
+
+  // instantiate the variant alternative that matches the encoded type
+  switch (encoded_type) {
+  case IMAGE_CLIENT_META_TYPE:
+    client_meta = ImageClientMeta();
+    break;
+  case MIRROR_PEER_CLIENT_META_TYPE:
+    client_meta = MirrorPeerClientMeta();
+    break;
+  case CLI_CLIENT_META_TYPE:
+    client_meta = CliClientMeta();
+    break;
+  default:
+    client_meta = UnknownClientMeta();
+    break;
+  }
+
+  boost::apply_visitor(DecodeVisitor(struct_v, it), client_meta);
+  DECODE_FINISH(it);
+}
+
+void ClientData::dump(Formatter *f) const { // Dump the active alternative via DumpVisitor, passing "client_meta_type" as its key.
+ boost::apply_visitor(DumpVisitor(f, "client_meta_type"), client_meta);
+}
+
+void ClientData::generate_test_instances(std::list<ClientData *> &o) { // Append heap-allocated sample instances to o; this function does not free them.
+ o.push_back(new ClientData(ImageClientMeta()));
+ o.push_back(new ClientData(ImageClientMeta(123)));
+ o.push_back(new ClientData(MirrorPeerClientMeta()));
+ o.push_back(new ClientData(MirrorPeerClientMeta("image_id",
+ {{{}, "snap 2", "snap 1", 123}}, // one sync point: default namespace, snap/from-snap names, object number
+ {{1, 2}, {3, 4}}))); // two snap seq pairs
+ o.push_back(new ClientData(CliClientMeta()));
+}
+
+// Journal Tag
+
+void TagPredecessor::encode(bufferlist& bl) const { // Append the four predecessor fields to bl.
+ using ceph::encode;
+ encode(mirror_uuid, bl); // field order here is the order decode() reads back
+ encode(commit_valid, bl);
+ encode(tag_tid, bl);
+ encode(entry_tid, bl);
+}
+
+void TagPredecessor::decode(bufferlist::const_iterator& it) { // Read the four predecessor fields in encode() order.
+ using ceph::decode;
+ decode(mirror_uuid, it);
+ decode(commit_valid, it);
+ decode(tag_tid, it);
+ decode(entry_tid, it);
+}
+
+void TagPredecessor::dump(Formatter *f) const { // Emit the predecessor fields through the formatter.
+ f->dump_string("mirror_uuid", mirror_uuid);
+ f->dump_string("commit_valid", commit_valid ? "true" : "false"); // emitted as the string "true"/"false", not via dump_bool
+ f->dump_unsigned("tag_tid", tag_tid);
+ f->dump_unsigned("entry_tid", entry_tid);
+}
+
+void TagData::encode(bufferlist& bl) const { // Append mirror_uuid followed by the predecessor record.
+ using ceph::encode;
+ encode(mirror_uuid, bl);
+ predecessor.encode(bl); // predecessor fields follow directly, written by TagPredecessor::encode
+}
+
+void TagData::decode(bufferlist::const_iterator& it) { // Read mirror_uuid, then the predecessor record, in encode() order.
+ using ceph::decode;
+ decode(mirror_uuid, it);
+ predecessor.decode(it);
+}
+
+void TagData::dump(Formatter *f) const { // Emit mirror_uuid and a nested "predecessor" object.
+ f->dump_string("mirror_uuid", mirror_uuid);
+ f->open_object_section("predecessor");
+ predecessor.dump(f);
+ f->close_section();
+}
+
+void TagData::generate_test_instances(std::list<TagData *> &o) { // Append heap-allocated sample instances to o; this function does not free them.
+ o.push_back(new TagData()); // all defaults
+ o.push_back(new TagData("mirror-uuid")); // owner only, default predecessor
+ o.push_back(new TagData("mirror-uuid", "remote-mirror-uuid", true, 123, 234)); // fully populated predecessor
+}
+
+// Stream a human-readable name for a journal event type.
+//
+// Every enumerator declared in EventType has its own label, including
+// EVENT_TYPE_SNAP_LIMIT, which previously fell through to the default
+// branch and was reported as "Unknown (14)".  Values outside the enum are
+// printed as "Unknown (<numeric value>)".
+//
+// @param out   destination stream
+// @param type  event type to describe
+// @return      `out`, to allow chaining
+std::ostream &operator<<(std::ostream &out, const EventType &type) {
+  using namespace librbd::journal;
+
+  switch (type) {
+  case EVENT_TYPE_AIO_DISCARD:
+    out << "AioDiscard";
+    break;
+  case EVENT_TYPE_AIO_WRITE:
+    out << "AioWrite";
+    break;
+  case EVENT_TYPE_AIO_FLUSH:
+    out << "AioFlush";
+    break;
+  case EVENT_TYPE_OP_FINISH:
+    out << "OpFinish";
+    break;
+  case EVENT_TYPE_SNAP_CREATE:
+    out << "SnapCreate";
+    break;
+  case EVENT_TYPE_SNAP_REMOVE:
+    out << "SnapRemove";
+    break;
+  case EVENT_TYPE_SNAP_RENAME:
+    out << "SnapRename";
+    break;
+  case EVENT_TYPE_SNAP_PROTECT:
+    out << "SnapProtect";
+    break;
+  case EVENT_TYPE_SNAP_UNPROTECT:
+    out << "SnapUnprotect";
+    break;
+  case EVENT_TYPE_SNAP_ROLLBACK:
+    out << "SnapRollback";
+    break;
+  case EVENT_TYPE_RENAME:
+    out << "Rename";
+    break;
+  case EVENT_TYPE_RESIZE:
+    out << "Resize";
+    break;
+  case EVENT_TYPE_FLATTEN:
+    out << "Flatten";
+    break;
+  case EVENT_TYPE_DEMOTE_PROMOTE:
+    out << "Demote/Promote";
+    break;
+  case EVENT_TYPE_SNAP_LIMIT:
+    out << "SnapLimit";
+    break;
+  case EVENT_TYPE_UPDATE_FEATURES:
+    out << "UpdateFeatures";
+    break;
+  case EVENT_TYPE_METADATA_SET:
+    out << "MetadataSet";
+    break;
+  case EVENT_TYPE_METADATA_REMOVE:
+    out << "MetadataRemove";
+    break;
+  case EVENT_TYPE_AIO_WRITESAME:
+    out << "AioWriteSame";
+    break;
+  case EVENT_TYPE_AIO_COMPARE_AND_WRITE:
+    out << "AioCompareAndWrite";
+    break;
+  default:
+    out << "Unknown (" << static_cast<uint32_t>(type) << ")";
+    break;
+  }
+  return out;
+}
+
+// Stream a human-readable name for a journal client type; values that are
+// not a known enumerator are printed as "Unknown (<numeric value>)".
+std::ostream &operator<<(std::ostream &out, const ClientMetaType &type) {
+  using namespace librbd::journal;
+
+  switch (type) {
+  case IMAGE_CLIENT_META_TYPE:
+    return out << "Master Image";
+  case MIRROR_PEER_CLIENT_META_TYPE:
+    return out << "Mirror Peer";
+  case CLI_CLIENT_META_TYPE:
+    return out << "CLI Tool";
+  default:
+    return out << "Unknown (" << static_cast<uint32_t>(type) << ")";
+  }
+}
+
+// Stream the image client metadata as "[tag_class=<n>]".
+std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta) {
+  return out << "[tag_class=" << meta.tag_class << "]";
+}
+
+// Stream a sync point as
+// "[snap_name=<s>, from_snap_name=<s>[, object_number=<n>]]".
+//
+// The object number is only printed when the optional holds a value.  It is
+// now labelled like the other fields; previously the bare number was
+// appended with no key.
+//
+// @param out   destination stream
+// @param sync  sync point to describe
+// @return      `out`, to allow chaining
+std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync) {
+  out << "[snap_name=" << sync.snap_name << ", "
+      << "from_snap_name=" << sync.from_snap_name;
+  if (sync.object_number) {
+    out << ", " << "object_number=" << *sync.object_number;
+  }
+  out << "]";
+  return out;
+}
+
+// Stream a human-readable name for the mirror peer state; values that are
+// not a known enumerator are printed as "Unknown (<numeric value>)".
+std::ostream &operator<<(std::ostream &out, const MirrorPeerState &state) {
+  switch (state) {
+  case MIRROR_PEER_STATE_SYNCING:
+    return out << "Syncing";
+  case MIRROR_PEER_STATE_REPLAYING:
+    return out << "Replaying";
+  default:
+    return out << "Unknown (" << static_cast<uint32_t>(state) << ")";
+  }
+}
+
+// Stream the mirror peer client metadata: image id, state, sync object
+// count, the list of sync points and the local->peer snap sequence pairs.
+//
+// Fixes the "peer_snap_seq" label, which was missing its '=' so the label
+// and the value ran together (e.g. "peer_snap_seq4").
+//
+// @param out   destination stream
+// @param meta  client metadata to describe
+// @return      `out`, to allow chaining
+std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta) {
+  out << "[image_id=" << meta.image_id << ", "
+      << "state=" << meta.state << ", "
+      << "sync_object_count=" << meta.sync_object_count << ", "
+      << "sync_points=[";
+
+  // the delimiter stays empty for the first element of each list
+  std::string delimiter;
+  for (const auto &sync_point : meta.sync_points) {
+    out << delimiter << "[" << sync_point << "]";
+    delimiter = ", ";
+  }
+
+  out << "], snap_seqs=[";
+  delimiter = "";
+  for (const auto &pair : meta.snap_seqs) {
+    out << delimiter << "["
+        << "local_snap_seq=" << pair.first << ", "
+        << "peer_snap_seq=" << pair.second << "]";
+    delimiter = ", ";
+  }
+  out << "]";
+  return out;
+}
+
+// Stream a tag predecessor.  The tag/entry tids are only included when
+// commit_valid is set.
+std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor) {
+  out << "[" << "mirror_uuid=" << predecessor.mirror_uuid;
+  if (predecessor.commit_valid) {
+    out << ", " << "tag_tid=" << predecessor.tag_tid;
+    out << ", " << "entry_tid=" << predecessor.entry_tid;
+  }
+  return out << "]";
+}
+
+// Stream tag data as "[mirror_uuid=<s>, predecessor=<predecessor>]".
+std::ostream &operator<<(std::ostream &out, const TagData &tag_data) {
+  out << "[" << "mirror_uuid=" << tag_data.mirror_uuid << ", ";
+  out << "predecessor=" << tag_data.predecessor;
+  return out << "]";
+}
+
+} // namespace journal
+} // namespace librbd
+
diff --git a/src/librbd/journal/Types.h b/src/librbd/journal/Types.h
new file mode 100644
index 00000000..ae5681ad
--- /dev/null
+++ b/src/librbd/journal/Types.h
@@ -0,0 +1,685 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_TYPES_H
+#define CEPH_LIBRBD_JOURNAL_TYPES_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/types.h"
+#include "include/utime.h"
+#include "librbd/Types.h"
+#include <iosfwd>
+#include <list>
+#include <boost/none.hpp>
+#include <boost/optional.hpp>
+#include <boost/variant.hpp>
+#include <boost/mpl/vector.hpp>
+
+namespace ceph {
+class Formatter;
+}
+
+namespace librbd {
+namespace journal {
+
+enum EventType { // Identifiers for the journal event payloads; each struct's TYPE constant refers to one of these.
+ EVENT_TYPE_AIO_DISCARD = 0, // values are assigned explicitly, 0 through 19
+ EVENT_TYPE_AIO_WRITE = 1,
+ EVENT_TYPE_AIO_FLUSH = 2,
+ EVENT_TYPE_OP_FINISH = 3,
+ EVENT_TYPE_SNAP_CREATE = 4,
+ EVENT_TYPE_SNAP_REMOVE = 5,
+ EVENT_TYPE_SNAP_RENAME = 6,
+ EVENT_TYPE_SNAP_PROTECT = 7,
+ EVENT_TYPE_SNAP_UNPROTECT = 8,
+ EVENT_TYPE_SNAP_ROLLBACK = 9,
+ EVENT_TYPE_RENAME = 10,
+ EVENT_TYPE_RESIZE = 11,
+ EVENT_TYPE_FLATTEN = 12,
+ EVENT_TYPE_DEMOTE_PROMOTE = 13,
+ EVENT_TYPE_SNAP_LIMIT = 14,
+ EVENT_TYPE_UPDATE_FEATURES = 15,
+ EVENT_TYPE_METADATA_SET = 16,
+ EVENT_TYPE_METADATA_REMOVE = 17,
+ EVENT_TYPE_AIO_WRITESAME = 18,
+ EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19,
+};
+
+// Journal event payload describing a discard of `length` bytes at `offset`,
+// together with the discard granularity that was in effect.
+struct AioDiscardEvent {
+  static const EventType TYPE = EVENT_TYPE_AIO_DISCARD;
+
+  uint64_t offset;
+  uint64_t length;
+  uint32_t discard_granularity_bytes;
+
+  // all fields start out as zero
+  AioDiscardEvent() : offset(0), length(0), discard_granularity_bytes(0) {
+  }
+  AioDiscardEvent(uint64_t off, uint64_t len, uint32_t granularity_bytes)
+    : offset(off), length(len), discard_granularity_bytes(granularity_bytes) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload describing a write: target extent plus the data.
+struct AioWriteEvent {
+  static const EventType TYPE = EVENT_TYPE_AIO_WRITE;
+
+  uint64_t offset = 0;
+  uint64_t length = 0;
+  bufferlist data;
+
+  static uint32_t get_fixed_size();
+
+  // zero extent, empty data
+  AioWriteEvent() {
+  }
+  AioWriteEvent(uint64_t off, uint64_t len, const bufferlist &payload)
+    : offset(off), length(len), data(payload) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload describing a write-same: target extent plus the
+// data buffer supplied with the request.
+struct AioWriteSameEvent {
+  static const EventType TYPE = EVENT_TYPE_AIO_WRITESAME;
+
+  uint64_t offset = 0;
+  uint64_t length = 0;
+  bufferlist data;
+
+  // zero extent, empty data
+  AioWriteSameEvent() {
+  }
+  AioWriteSameEvent(uint64_t off, uint64_t len, const bufferlist &payload)
+    : offset(off), length(len), data(payload) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload describing a compare-and-write: target extent, the
+// buffer to compare against and the buffer to write.
+struct AioCompareAndWriteEvent {
+  static const EventType TYPE = EVENT_TYPE_AIO_COMPARE_AND_WRITE;
+
+  uint64_t offset = 0;
+  uint64_t length = 0;
+  bufferlist cmp_data;
+  bufferlist write_data;
+
+  static uint32_t get_fixed_size();
+
+  // zero extent, empty buffers
+  AioCompareAndWriteEvent() {
+  }
+  AioCompareAndWriteEvent(uint64_t off, uint64_t len,
+                          const bufferlist &compare_payload,
+                          const bufferlist &write_payload)
+    : offset(off), length(len), cmp_data(compare_payload),
+      write_data(write_payload) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+struct AioFlushEvent { // Journal event payload for a flush; it declares no data members.
+ static const EventType TYPE = EVENT_TYPE_AIO_FLUSH;
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+// Common base for operation events: carries the operation's tid.  The
+// constructors and the encode/decode/dump helpers are protected, so they
+// are only reachable through derived event types.
+struct OpEventBase {
+  uint64_t op_tid = 0;
+
+protected:
+  OpEventBase() {
+  }
+  OpEventBase(uint64_t tid) : op_tid(tid) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload recording that the operation identified by op_tid
+// finished with result code `r`.
+struct OpFinishEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_OP_FINISH;
+
+  int r = 0;
+
+  OpFinishEvent() {
+  }
+  OpFinishEvent(uint64_t op_tid, int result) : OpEventBase(op_tid), r(result) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Common base for snapshot operation events: adds the snapshot namespace
+// and name to the operation tid.  Constructors and encode/decode/dump are
+// protected; derived events re-export or override them.
+struct SnapEventBase : public OpEventBase {
+  cls::rbd::SnapshotNamespace snap_namespace;
+  std::string snap_name;
+
+protected:
+  SnapEventBase() {
+  }
+  SnapEventBase(uint64_t op_tid, const cls::rbd::SnapshotNamespace& ns,
+                const std::string &name)
+    : OpEventBase(op_tid), snap_namespace(ns), snap_name(name) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for a snapshot create operation.  Unlike most of
+// its siblings it declares its own encode/decode/dump rather than
+// re-exporting the SnapEventBase ones.
+struct SnapCreateEvent : public SnapEventBase {
+  static const EventType TYPE = EVENT_TYPE_SNAP_CREATE;
+
+  SnapCreateEvent() {
+  }
+  SnapCreateEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& ns,
+                  const std::string &name)
+    : SnapEventBase(op_tid, ns, name) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for a snapshot remove operation.
+struct SnapRemoveEvent : public SnapEventBase {
+  static const EventType TYPE = EVENT_TYPE_SNAP_REMOVE;
+
+  SnapRemoveEvent() {
+  }
+  SnapRemoveEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& ns,
+                  const std::string &name)
+    : SnapEventBase(op_tid, ns, name) {
+  }
+
+  // re-export the protected base implementations as public members
+  using SnapEventBase::encode;
+  using SnapEventBase::decode;
+  using SnapEventBase::dump;
+};
+
+// Journal event payload for a snapshot rename: the snapshot id plus the
+// source and destination names.  A default-constructed event carries
+// CEPH_NOSNAP as its snapshot id.
+struct SnapRenameEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_SNAP_RENAME;
+
+  uint64_t snap_id = CEPH_NOSNAP;
+  std::string src_snap_name;
+  std::string dst_snap_name;
+
+  SnapRenameEvent() {
+  }
+  SnapRenameEvent(uint64_t op_tid, uint64_t src_snap_id,
+                  const std::string &src_name,
+                  const std::string &dest_name)
+    : OpEventBase(op_tid),
+      snap_id(src_snap_id),
+      src_snap_name(src_name),
+      dst_snap_name(dest_name) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for a snapshot protect operation.
+struct SnapProtectEvent : public SnapEventBase {
+  static const EventType TYPE = EVENT_TYPE_SNAP_PROTECT;
+
+  SnapProtectEvent() {
+  }
+  SnapProtectEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& ns,
+                   const std::string &name)
+    : SnapEventBase(op_tid, ns, name) {
+  }
+
+  // re-export the protected base implementations as public members
+  using SnapEventBase::encode;
+  using SnapEventBase::decode;
+  using SnapEventBase::dump;
+};
+
+// Journal event payload for a snapshot unprotect operation.
+struct SnapUnprotectEvent : public SnapEventBase {
+  static const EventType TYPE = EVENT_TYPE_SNAP_UNPROTECT;
+
+  SnapUnprotectEvent() {
+  }
+  SnapUnprotectEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace &ns,
+                     const std::string &name)
+    : SnapEventBase(op_tid, ns, name) {
+  }
+
+  // re-export the protected base implementations as public members
+  using SnapEventBase::encode;
+  using SnapEventBase::decode;
+  using SnapEventBase::dump;
+};
+
+// Journal event payload for a snapshot limit change.
+//
+// `limit` now has a default member initializer: the default constructor
+// previously left it uninitialized, so encoding or dumping a
+// default-constructed event read an indeterminate value.
+struct SnapLimitEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_SNAP_LIMIT;
+  uint64_t limit = 0;
+
+  SnapLimitEvent() {
+  }
+  SnapLimitEvent(uint64_t op_tid, const uint64_t _limit)
+    : OpEventBase(op_tid), limit(_limit) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for a snapshot rollback operation.
+struct SnapRollbackEvent : public SnapEventBase {
+  static const EventType TYPE = EVENT_TYPE_SNAP_ROLLBACK;
+
+  SnapRollbackEvent() {
+  }
+  SnapRollbackEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& ns,
+                    const std::string &name)
+    : SnapEventBase(op_tid, ns, name) {
+  }
+
+  // re-export the protected base implementations as public members
+  using SnapEventBase::encode;
+  using SnapEventBase::decode;
+  using SnapEventBase::dump;
+};
+
+// Journal event payload for an image rename; carries the image name passed
+// to the constructor.
+struct RenameEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_RENAME;
+
+  std::string image_name;
+
+  RenameEvent() {
+  }
+  RenameEvent(uint64_t op_tid, const std::string &name)
+    : OpEventBase(op_tid), image_name(name) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for an image resize; carries the size passed to
+// the constructor (zero by default).
+struct ResizeEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_RESIZE;
+
+  uint64_t size = 0;
+
+  ResizeEvent() {
+  }
+  ResizeEvent(uint64_t op_tid, uint64_t new_size)
+    : OpEventBase(op_tid), size(new_size) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for a flatten operation; it has no fields beyond
+// the operation tid inherited from OpEventBase.
+struct FlattenEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_FLATTEN;
+
+  FlattenEvent() {
+  }
+  FlattenEvent(uint64_t tid) : OpEventBase(tid) {
+  }
+
+  // re-export the protected base implementations as public members
+  using OpEventBase::encode;
+  using OpEventBase::decode;
+  using OpEventBase::dump;
+};
+
+// Journal event payload for a demote/promote marker; it declares no data
+// members.
+struct DemotePromoteEvent {
+  static const EventType TYPE = EVENT_TYPE_DEMOTE_PROMOTE;
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for a feature update: a features value and a flag
+// that says whether they are being enabled.
+struct UpdateFeaturesEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_UPDATE_FEATURES;
+
+  uint64_t features = 0;
+  bool enabled = false;
+
+  UpdateFeaturesEvent() {
+  }
+  UpdateFeaturesEvent(uint64_t op_tid, uint64_t feature_bits, bool enable)
+    : OpEventBase(op_tid), features(feature_bits), enabled(enable) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for setting an image metadata key to a value.
+//
+// The string members and parameters are spelled std::string, matching the
+// rest of this header.  The unqualified `string` used before only compiled
+// when some other header had already pulled std::string into scope.
+struct MetadataSetEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_METADATA_SET;
+
+  std::string key;
+  std::string value;
+
+  MetadataSetEvent() {
+  }
+  MetadataSetEvent(uint64_t op_tid, const std::string &_key,
+                   const std::string &_value)
+    : OpEventBase(op_tid), key(_key), value(_value) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// Journal event payload for removing an image metadata key.
+//
+// The key is spelled std::string, matching the rest of this header.  The
+// unqualified `string` used before only compiled when some other header
+// had already pulled std::string into scope.
+struct MetadataRemoveEvent : public OpEventBase {
+  static const EventType TYPE = EVENT_TYPE_METADATA_REMOVE;
+
+  std::string key;
+
+  MetadataRemoveEvent() {
+  }
+  MetadataRemoveEvent(uint64_t op_tid, const std::string &_key)
+    : OpEventBase(op_tid), key(_key) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+struct UnknownEvent { // Placeholder payload; EventEntry's default constructor initializes its event with one.
+ static const EventType TYPE = static_cast<EventType>(-1); // -1 is not one of the declared EventType enumerators
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::mpl::vector<AioDiscardEvent, // type list of every event payload struct declared above
+ AioWriteEvent,
+ AioFlushEvent,
+ OpFinishEvent,
+ SnapCreateEvent,
+ SnapRemoveEvent,
+ SnapRenameEvent,
+ SnapProtectEvent,
+ SnapUnprotectEvent,
+ SnapRollbackEvent,
+ RenameEvent,
+ ResizeEvent,
+ FlattenEvent,
+ DemotePromoteEvent,
+ SnapLimitEvent,
+ UpdateFeaturesEvent,
+ MetadataSetEvent,
+ MetadataRemoveEvent,
+ AioWriteSameEvent,
+ AioCompareAndWriteEvent,
+ UnknownEvent> EventVector;
+typedef boost::make_variant_over<EventVector>::type Event; // variant able to hold any one of the listed payloads
+
+struct EventEntry { // One journal entry: an event payload variant plus a timestamp.
+ static uint32_t get_fixed_size() { // sum of the two fixed-size constants declared in the private section
+ return EVENT_FIXED_SIZE + METADATA_FIXED_SIZE;
+ }
+
+ EventEntry() : event(UnknownEvent()) { // default entry holds the UnknownEvent placeholder
+ }
+ EventEntry(const Event &_event, const utime_t &_timestamp = utime_t())
+ : event(_event), timestamp(_timestamp) {
+ }
+
+ Event event;
+ utime_t timestamp;
+
+ EventType get_event_type() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<EventEntry *> &o);
+
+private:
+ static const uint32_t EVENT_FIXED_SIZE = 14; /// version encoding, type
+ static const uint32_t METADATA_FIXED_SIZE = 14; /// version encoding, timestamp
+
+ void encode_metadata(bufferlist& bl) const;
+ void decode_metadata(bufferlist::const_iterator& it);
+};
+
+// Journal Client data structures
+
+enum ClientMetaType { // Identifiers for the journal client metadata alternatives; ClientData::decode switches on these values.
+ IMAGE_CLIENT_META_TYPE = 0,
+ MIRROR_PEER_CLIENT_META_TYPE = 1,
+ CLI_CLIENT_META_TYPE = 2
+};
+
+// Client metadata for the image client: its tag class and a flag recording
+// whether a resync has been requested.
+struct ImageClientMeta {
+  static const ClientMetaType TYPE = IMAGE_CLIENT_META_TYPE;
+
+  uint64_t tag_class;
+  bool resync_requested;
+
+  ImageClientMeta() : tag_class(0), resync_requested(false) {
+  }
+  // resync_requested always starts out false
+  ImageClientMeta(uint64_t tag_class)
+    : tag_class(tag_class), resync_requested(false) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+// A sync point tracked for a mirror peer: snapshot namespace and name, the
+// name of the snapshot it is based on, and an optional object number.
+struct MirrorPeerSyncPoint {
+  using ObjectNumber = boost::optional<uint64_t>;
+
+  cls::rbd::SnapshotNamespace snap_namespace;
+  std::string snap_name;
+  std::string from_snap_name;
+  ObjectNumber object_number;
+
+  // default: default namespace, empty names, no object number
+  MirrorPeerSyncPoint() : MirrorPeerSyncPoint({}, "", "", boost::none) {
+  }
+  // sync point with an empty from_snap_name
+  MirrorPeerSyncPoint(const cls::rbd::SnapshotNamespace& ns,
+                      const std::string &name,
+                      const ObjectNumber &number)
+    : MirrorPeerSyncPoint(ns, name, "", number) {
+  }
+  MirrorPeerSyncPoint(const cls::rbd::SnapshotNamespace& ns,
+                      const std::string &name,
+                      const std::string &from_name,
+                      const ObjectNumber &number)
+    : snap_namespace(ns), snap_name(name),
+      from_snap_name(from_name), object_number(number) {
+  }
+
+  // member-wise equality over all four fields
+  inline bool operator==(const MirrorPeerSyncPoint &sync) const {
+    if (!(snap_name == sync.snap_name)) {
+      return false;
+    }
+    if (!(from_snap_name == sync.from_snap_name)) {
+      return false;
+    }
+    if (!(object_number == sync.object_number)) {
+      return false;
+    }
+    return snap_namespace == sync.snap_namespace;
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+enum MirrorPeerState { // Replay state of a mirror peer; MirrorPeerClientMeta encodes it as a uint32_t.
+ MIRROR_PEER_STATE_SYNCING, // no explicit values: enumerators number from 0
+ MIRROR_PEER_STATE_REPLAYING
+};
+
+// Client metadata for a mirror peer: the image id, replay state and
+// sync bookkeeping.
+struct MirrorPeerClientMeta {
+  using SyncPoints = std::list<MirrorPeerSyncPoint>;
+
+  static const ClientMetaType TYPE = MIRROR_PEER_CLIENT_META_TYPE;
+
+  std::string image_id;
+  MirrorPeerState state = MIRROR_PEER_STATE_SYNCING; ///< replay state
+  uint64_t sync_object_count = 0; ///< maximum number of objects ever sync'ed
+  SyncPoints sync_points;         ///< max two in-use snapshots for sync
+  SnapSeqs snap_seqs;             ///< local to peer snap seq mapping
+
+  MirrorPeerClientMeta() {
+  }
+  // state and sync_object_count keep their defaults
+  MirrorPeerClientMeta(const std::string &image_id,
+                       const SyncPoints &sync_points = SyncPoints(),
+                       const SnapSeqs &snap_seqs = SnapSeqs())
+    : image_id(image_id), sync_points(sync_points), snap_seqs(snap_seqs) {
+  }
+
+  // member-wise equality over all five fields
+  inline bool operator==(const MirrorPeerClientMeta &meta) const {
+    if (!(image_id == meta.image_id)) {
+      return false;
+    }
+    if (!(state == meta.state)) {
+      return false;
+    }
+    if (!(sync_object_count == meta.sync_object_count)) {
+      return false;
+    }
+    if (!(sync_points == meta.sync_points)) {
+      return false;
+    }
+    return snap_seqs == meta.snap_seqs;
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+struct CliClientMeta { // Client metadata alternative for CLI_CLIENT_META_TYPE; declares no data members.
+ static const ClientMetaType TYPE = CLI_CLIENT_META_TYPE;
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownClientMeta { // Placeholder alternative; declares no data members.
+ static const ClientMetaType TYPE = static_cast<ClientMetaType>(-1); // -1 is not one of the declared ClientMetaType enumerators
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<ImageClientMeta, // variant able to hold any one of the client metadata structs
+ MirrorPeerClientMeta,
+ CliClientMeta,
+ UnknownClientMeta> ClientMeta;
+
+struct ClientData { // Wrapper around the ClientMeta variant that adds encode/decode/dump.
+ ClientData() {
+ }
+ ClientData(const ClientMeta &client_meta) : client_meta(client_meta) { // non-explicit: a ClientMeta converts implicitly
+ }
+
+ ClientMeta client_meta;
+
+ ClientMetaType get_client_meta_type() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<ClientData *> &o);
+};
+
+// Journal Tag data structures
+
+// Predecessor record stored with a journal tag: the predecessor's mirror
+// uuid, a commit_valid flag, and a tag tid / entry tid pair.
+struct TagPredecessor {
+  std::string mirror_uuid; // empty if local
+  bool commit_valid;
+  uint64_t tag_tid;
+  uint64_t entry_tid;
+
+  // default: empty uuid, commit_valid false, both tids zero
+  TagPredecessor() : commit_valid(false), tag_tid(0), entry_tid(0) {
+  }
+  TagPredecessor(const std::string &mirror_uuid, bool commit_valid,
+                 uint64_t tag_tid, uint64_t entry_tid)
+    : mirror_uuid(mirror_uuid), commit_valid(commit_valid),
+      tag_tid(tag_tid), entry_tid(entry_tid) {
+  }
+
+  // member-wise equality over all four fields
+  inline bool operator==(const TagPredecessor &rhs) const {
+    if (!(mirror_uuid == rhs.mirror_uuid)) {
+      return false;
+    }
+    if (commit_valid != rhs.commit_valid) {
+      return false;
+    }
+    return tag_tid == rhs.tag_tid && entry_tid == rhs.entry_tid;
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+};
+
+struct TagData { // Payload stored with a journal tag: owner uuid plus predecessor record.
+ // owner of the tag (exclusive lock epoch)
+ std::string mirror_uuid; // empty if local
+
+ // mapping to last committed record of previous tag
+ TagPredecessor predecessor;
+
+ TagData() {
+ }
+ TagData(const std::string &mirror_uuid) : mirror_uuid(mirror_uuid) { // predecessor keeps its defaults
+ }
+ TagData(const std::string &mirror_uuid,
+ const std::string &predecessor_mirror_uuid,
+ bool predecessor_commit_valid,
+ uint64_t predecessor_tag_tid, uint64_t predecessor_entry_tid)
+ : mirror_uuid(mirror_uuid),
+ predecessor(predecessor_mirror_uuid, predecessor_commit_valid, // forwarded to the TagPredecessor constructor
+ predecessor_tag_tid, predecessor_entry_tid) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<TagData *> &o);
+};
+
+std::ostream &operator<<(std::ostream &out, const EventType &type); // stream output for the journal types declared in this header
+std::ostream &operator<<(std::ostream &out, const ClientMetaType &type);
+std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta);
+std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync);
+std::ostream &operator<<(std::ostream &out, const MirrorPeerState &meta);
+std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta);
+std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor);
+std::ostream &operator<<(std::ostream &out, const TagData &tag_data);
+
+// Callback interface for journal state notifications.  All three
+// handlers are pure virtual.
+struct Listener {
+  virtual ~Listener() = default;
+
+  /// invoked when journal close is requested
+  virtual void handle_close() = 0;
+
+  /// invoked when journal is promoted to primary
+  virtual void handle_promoted() = 0;
+
+  /// invoked when journal resync is requested
+  virtual void handle_resync() = 0;
+};
+
+WRITE_CLASS_ENCODER(EventEntry); // presumably generates free encode()/decode() wrappers for the class -- confirm against include/encoding.h
+WRITE_CLASS_ENCODER(ClientData);
+WRITE_CLASS_ENCODER(TagData);
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_TYPES_H
diff --git a/src/librbd/journal/Utils.cc b/src/librbd/journal/Utils.cc
new file mode 100644
index 00000000..1721a9b2
--- /dev/null
+++ b/src/librbd/journal/Utils.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/journal/Types.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::"
+
+namespace librbd {
+namespace journal {
+namespace util {
+
+// Decode a TagData from the iterator.
+//
+// @return 0 on success, -EBADMSG if decoding throws a buffer::error
+int C_DecodeTag::decode(bufferlist::const_iterator *it, TagData *tag_data) {
+  try {
+    using ceph::decode;
+    decode(*tag_data, *it);
+    return 0;
+  } catch (const buffer::error &) {
+    // malformed payload
+    return -EBADMSG;
+  }
+}
+
+// Handle the result of a tag allocation.  On success, and while holding
+// *lock, publish the tag's tid through tag_tid and decode its data into
+// tag_data.
+//
+// @param r  result of the allocation request
+// @return   r if it was negative, the decode error if decoding failed,
+//           otherwise 0
+int C_DecodeTag::process(int r) {
+  if (r < 0) {
+    lderr(cct) << "C_DecodeTag: " << this << " " << __func__ << ": "
+               << "failed to allocate tag: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  Mutex::Locker locker(*lock);
+  *tag_tid = tag.tid;
+
+  auto payload_it = tag.data.cbegin();
+  const int decode_r = decode(&payload_it, tag_data);
+  if (decode_r < 0) {
+    lderr(cct) << "C_DecodeTag: " << this << " " << __func__ << ": "
+               << "failed to decode allocated tag" << dendl;
+    return decode_r;
+  }
+
+  ldout(cct, 20) << "C_DecodeTag: " << this << " " << __func__ << ": "
+                 << "allocated journal tag: "
+                 << "tid=" << tag.tid << ", "
+                 << "data=" << *tag_data << dendl;
+  return 0;
+}
+
+// Handle the result of a tag list retrieval.  On success, and while
+// holding *lock, publish the tid of the last tag in the list through
+// tag_tid and decode that tag's data into tag_data.
+//
+// @param r  result of the retrieval request
+// @return   r if it was negative, -ENOENT if no tags were retrieved, the
+//           decode error if decoding failed, otherwise 0
+int C_DecodeTags::process(int r) {
+  if (r < 0) {
+    lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": "
+               << "failed to retrieve journal tags: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  // the emptiness check runs before the lock is taken
+  if (tags.empty()) {
+    lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": "
+               << "no journal tags retrieved" << dendl;
+    return -ENOENT;
+  }
+
+  Mutex::Locker locker(*lock);
+  auto &last_tag = tags.back();
+  *tag_tid = last_tag.tid;
+
+  auto payload_it = last_tag.data.cbegin();
+  const int decode_r = C_DecodeTag::decode(&payload_it, tag_data);
+  if (decode_r < 0) {
+    lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": "
+               << "failed to decode journal tag" << dendl;
+    return decode_r;
+  }
+
+  ldout(cct, 20) << "C_DecodeTags: " << this << " " << __func__ << ": "
+                 << "most recent journal tag: "
+                 << "tid=" << *tag_tid << ", "
+                 << "data=" << *tag_data << dendl;
+  return 0;
+}
+
+} // namespace util
+} // namespace journal
+} // namespace librbd
diff --git a/src/librbd/journal/Utils.h b/src/librbd/journal/Utils.h
new file mode 100644
index 00000000..63d37c03
--- /dev/null
+++ b/src/librbd/journal/Utils.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_UTILS_H
+#define CEPH_LIBRBD_JOURNAL_UTILS_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "cls/journal/cls_journal_types.h"
+#include <list>
+
+struct CephContext;
+struct Mutex;
+
+namespace librbd {
+namespace journal {
+
+struct TagData;
+
+namespace util {
+
+// Completion context that post-processes a single journal tag: process()
+// decodes `tag` into *tag_data / *tag_tid and the outcome is forwarded to
+// on_finish.
+struct C_DecodeTag : public Context {
+  CephContext *cct;
+  Mutex *lock;
+  uint64_t *tag_tid;
+  TagData *tag_data;
+  Context *on_finish;
+
+  cls::journal::Tag tag;
+
+  C_DecodeTag(CephContext *cct, Mutex *lock, uint64_t *tag_tid,
+              TagData *tag_data, Context *on_finish)
+    : cct(cct), lock(lock), tag_tid(tag_tid), tag_data(tag_data),
+      on_finish(on_finish) {
+  }
+
+  // forward the processed result to on_finish first, then run the base
+  // class completion with 0
+  void complete(int r) override {
+    const int result = process(r);
+    on_finish->complete(result);
+    Context::complete(0);
+  }
+  // nothing to do here: the work happens in complete()
+  void finish(int r) override {
+  }
+
+  int process(int r);
+
+  static int decode(bufferlist::const_iterator *it, TagData *tag_data);
+
+};
+
+// Completion context that post-processes a list of journal tags:
+// process() decodes the last tag in `tags` into *tag_data / *tag_tid and
+// the outcome is forwarded to on_finish.
+struct C_DecodeTags : public Context {
+  using Tags = std::list<cls::journal::Tag>;
+
+  CephContext *cct;
+  Mutex *lock;
+  uint64_t *tag_tid;
+  TagData *tag_data;
+  Context *on_finish;
+
+  Tags tags;
+
+  C_DecodeTags(CephContext *cct, Mutex *lock, uint64_t *tag_tid,
+               TagData *tag_data, Context *on_finish)
+    : cct(cct), lock(lock), tag_tid(tag_tid), tag_data(tag_data),
+      on_finish(on_finish) {
+  }
+
+  // forward the processed result to on_finish first, then run the base
+  // class completion with 0
+  void complete(int r) override {
+    const int result = process(r);
+    on_finish->complete(result);
+    Context::complete(0);
+  }
+  // nothing to do here: the work happens in complete()
+  void finish(int r) override {
+  }
+
+  int process(int r);
+};
+
+} // namespace util
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_UTILS_H
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
new file mode 100644
index 00000000..ccab8893
--- /dev/null
+++ b/src/librbd/librbd.cc
@@ -0,0 +1,6356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/int_types.h"
+
+#include <errno.h>
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/TracepointProvider.h"
+#include "include/Context.h"
+
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/DiffIterate.h"
+#include "librbd/api/Group.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Migration.h"
+#include "librbd/api/Mirror.h"
+#include "librbd/api/Namespace.h"
+#include "librbd/api/Pool.h"
+#include "librbd/api/PoolMetadata.h"
+#include "librbd/api/Snapshot.h"
+#include "librbd/api/Trash.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ReadResult.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/librbd.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd: "
+
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+
+namespace {
+
+TracepointProvider::Traits tracepoint_traits("librbd_tp.so", "rbd_tracing");
+
+// Builds the raw buffer for a write by copying `len` bytes from `buf`.
+// `ictx` is currently unused.
+static auto create_write_raw(librbd::ImageCtx *ictx, const char *buf,
+                             size_t len) {
+  // TODO: until librados can guarantee memory won't be referenced after
+  // it ACKs a request, always make a copy of the user-provided memory
+  auto copied = buffer::copy(buf, len);
+  return copied;
+}
+
+// Returns the CephContext behind `io_ctx` by reinterpreting the opaque
+// handle that IoCtx::cct() exposes.
+CephContext* get_cct(IoCtx &io_ctx) {
+  auto *opaque_cct = io_ctx.cct();
+  return reinterpret_cast<CephContext*>(opaque_cct);
+}
+
+// Returns the internal io::AioCompletion stored in the public completion's
+// opaque `pc` member.
+librbd::io::AioCompletion* get_aio_completion(librbd::RBD::AioCompletion *comp) {
+  auto *internal = reinterpret_cast<librbd::io::AioCompletion *>(comp->pc);
+  return internal;
+}
+
+// Context that bridges a Context-style completion to an io::AioCompletion.
+// The constructor initialises the completion's timing for `aio_type` and
+// takes a reference on it; finish() fails or completes it accordingly.
+struct C_AioCompletion : public Context {
+  CephContext *cct;
+  librbd::io::aio_type_t aio_type;
+  librbd::io::AioCompletion* aio_comp;
+
+  C_AioCompletion(librbd::ImageCtx *ictx, librbd::io::aio_type_t aio_type,
+                  librbd::io::AioCompletion* aio_comp)
+    : cct(ictx->cct), aio_type(aio_type), aio_comp(aio_comp) {
+    aio_comp->init_time(ictx, aio_type);
+    aio_comp->get();
+  }
+
+  // r < 0: report the error through AioCompletion::fail().
+  // otherwise: complete under the completion's lock; put_unlock() then drops
+  // the reference and the lock together.
+  void finish(int r) override {
+    // Log prefix matches this class name, as the sibling contexts do.
+    ldout(cct, 20) << "C_AioCompletion::finish: r=" << r << dendl;
+    if (r < 0) {
+      aio_comp->fail(r);
+    } else {
+      aio_comp->lock.Lock();
+      aio_comp->complete();
+      aio_comp->put_unlock();
+    }
+  }
+};
+
+// Completion for an asynchronous image open. Publishes the opened ImageCtx
+// (or nullptr on failure) through `ictxp`, then completes the user's
+// AioCompletion via the base class.
+struct C_OpenComplete : public C_AioCompletion {
+  librbd::ImageCtx *ictx;
+  void **ictxp;
+
+  C_OpenComplete(librbd::ImageCtx *ictx, librbd::io::AioCompletion* comp,
+                 void **ictxp)
+    : C_AioCompletion(ictx, librbd::io::AIO_TYPE_OPEN, comp),
+      ictx(ictx), ictxp(ictxp) {}
+
+  void finish(int r) override {
+    ldout(ictx->cct, 20) << "C_OpenComplete::finish: r=" << r << dendl;
+
+    // Only hand the context to the caller when the open succeeded.
+    *ictxp = (r < 0) ? nullptr : ictx;
+
+    C_AioCompletion::finish(r);
+  }
+};
+
+// Completion run after the previously attached image has been closed:
+// deletes the old ImageCtx referenced by `*ictxp`, clears the slot and then
+// starts the asynchronous open of the new `ictx`.
+struct C_OpenAfterCloseComplete : public Context {
+  librbd::ImageCtx *ictx;
+  librbd::io::AioCompletion* comp;
+  void **ictxp;
+
+  C_OpenAfterCloseComplete(librbd::ImageCtx *ictx,
+                           librbd::io::AioCompletion* comp,
+                           void **ictxp)
+    : ictx(ictx), comp(comp), ictxp(ictxp) {}
+
+  // Note: the close result `r` is only logged; the open proceeds regardless.
+  void finish(int r) override {
+    ldout(ictx->cct, 20) << "C_OpenAfterCloseComplete::finish: r=" << r
+                         << dendl;
+
+    auto *closed_ictx = reinterpret_cast<librbd::ImageCtx*>(*ictxp);
+    delete closed_ictx;
+    *ictxp = nullptr;
+
+    auto *on_open = new C_OpenComplete(ictx, comp, ictxp);
+    ictx->state->open(0, on_open);
+  }
+};
+
+// Adapts a C rbd_update_callback_t to the librbd::UpdateWatchCtx interface:
+// handle_notify() calls the stored callback with its opaque argument.
+struct C_UpdateWatchCB : public librbd::UpdateWatchCtx {
+  rbd_update_callback_t watch_cb;
+  void *arg;
+  uint64_t handle = 0;
+
+  C_UpdateWatchCB(rbd_update_callback_t watch_cb, void *arg)
+    : watch_cb(watch_cb), arg(arg) {}
+
+  void handle_notify() override { watch_cb(arg); }
+};
+
+// Copies a C++ group_image_info_t into its C counterpart. The name is
+// duplicated with strdup(); the copy is not freed here.
+void group_image_status_cpp_to_c(const librbd::group_image_info_t &cpp_info,
+                                 rbd_group_image_info_t *c_info) {
+  const auto &image_name = cpp_info.name;
+  c_info->name = strdup(image_name.c_str());
+  c_info->pool = cpp_info.pool;
+  c_info->state = cpp_info.state;
+}
+
+// Copies a C++ group_info_t into its C counterpart. The name is duplicated
+// with strdup(); the copy is not freed here.
+void group_info_cpp_to_c(const librbd::group_info_t &cpp_info,
+                         rbd_group_info_t *c_info) {
+  const auto &group_name = cpp_info.name;
+  c_info->name = strdup(group_name.c_str());
+  c_info->pool = cpp_info.pool;
+}
+
+// Copies a C++ group_snap_info_t into its C counterpart. The name is
+// duplicated with strdup(); the copy is not freed here.
+void group_snap_info_cpp_to_c(const librbd::group_snap_info_t &cpp_info,
+                              rbd_group_snap_info_t *c_info) {
+  const auto &snap_name = cpp_info.name;
+  c_info->name = strdup(snap_name.c_str());
+  c_info->state = cpp_info.state;
+}
+
+// Copies a C++ mirror_image_info_t into its C counterpart. The global id is
+// duplicated with strdup(); the copy is not freed here.
+void mirror_image_info_cpp_to_c(const librbd::mirror_image_info_t &cpp_info,
+                                rbd_mirror_image_info_t *c_info) {
+  const auto &global_id = cpp_info.global_id;
+  c_info->global_id = strdup(global_id.c_str());
+  c_info->state = cpp_info.state;
+  c_info->primary = cpp_info.primary;
+}
+
+// Copies a C++ mirror_image_status_t into its C counterpart. The name and
+// description are duplicated with strdup(), and the nested info struct is
+// converted by mirror_image_info_cpp_to_c(). Nothing is freed here.
+void mirror_image_status_cpp_to_c(const librbd::mirror_image_status_t &cpp_status,
+                                  rbd_mirror_image_status_t *c_status) {
+  const auto &image_name = cpp_status.name;
+  const auto &description = cpp_status.description;
+
+  c_status->name = strdup(image_name.c_str());
+  mirror_image_info_cpp_to_c(cpp_status.info, &c_status->info);
+  c_status->state = cpp_status.state;
+  c_status->description = strdup(description.c_str());
+  c_status->last_update = cpp_status.last_update;
+  c_status->up = cpp_status.up;
+}
+
+// Copies a C++ trash_image_info_t into its C counterpart. The id and name
+// are duplicated with strdup(); the copies are not freed here.
+void trash_image_info_cpp_to_c(const librbd::trash_image_info_t &cpp_info,
+                               rbd_trash_image_info_t *c_info) {
+  const auto &image_id = cpp_info.id;
+  const auto &image_name = cpp_info.name;
+
+  c_info->id = strdup(image_id.c_str());
+  c_info->name = strdup(image_name.c_str());
+  c_info->source = cpp_info.source;
+  c_info->deletion_time = cpp_info.deletion_time;
+  c_info->deferment_end_time = cpp_info.deferment_end_time;
+}
+
+// Copies a C++ config_option_t into its C counterpart. The name and value
+// are duplicated with strdup(); config_option_cleanup() frees them.
+void config_option_cpp_to_c(const librbd::config_option_t &cpp_option,
+                            rbd_config_option_t *c_option) {
+  const auto &option_name = cpp_option.name;
+  const auto &option_value = cpp_option.value;
+
+  c_option->name = strdup(option_name.c_str());
+  c_option->value = strdup(option_value.c_str());
+  c_option->source = cpp_option.source;
+}
+
+// Frees the name and value strings held by `option`. The pointers are not
+// reset afterwards.
+void config_option_cleanup(rbd_config_option_t &option) {
+  char *const owned_name = option.name;
+  char *const owned_value = option.value;
+  free(owned_name);
+  free(owned_value);
+}
+
+// Completion that converts a C++ mirror_image_info_t into the caller's C
+// struct on success, then notifies on_finish. On error (r < 0) the C struct
+// is left untouched and the error is passed through.
+struct C_MirrorImageGetInfo : public Context {
+  rbd_mirror_image_info_t *mirror_image_info;
+  Context *on_finish;
+
+  // C++ result converted by finish(); not written by this struct itself.
+  librbd::mirror_image_info_t cpp_mirror_image_info;
+
+  C_MirrorImageGetInfo(rbd_mirror_image_info_t *mirror_image_info,
+                       Context *on_finish)
+    : mirror_image_info(mirror_image_info), on_finish(on_finish) {}
+
+  void finish(int r) override {
+    if (r >= 0) {
+      mirror_image_info_cpp_to_c(cpp_mirror_image_info, mirror_image_info);
+      // Successful results are always reported as 0.
+      r = 0;
+    }
+    on_finish->complete(r);
+  }
+};
+
+// Completion that converts a C++ mirror_image_status_t into the caller's C
+// struct on success, then notifies on_finish. On error (r < 0) the C struct
+// is left untouched and the error is passed through.
+struct C_MirrorImageGetStatus : public Context {
+  rbd_mirror_image_status_t *mirror_image_status;
+  Context *on_finish;
+
+  // C++ result converted by finish(); not written by this struct itself.
+  librbd::mirror_image_status_t cpp_mirror_image_status;
+
+  C_MirrorImageGetStatus(rbd_mirror_image_status_t *mirror_image_status,
+                         Context *on_finish)
+    : mirror_image_status(mirror_image_status), on_finish(on_finish) {}
+
+  void finish(int r) override {
+    if (r >= 0) {
+      mirror_image_status_cpp_to_c(cpp_mirror_image_status,
+                                   mirror_image_status);
+      // Successful results are always reported as 0.
+      r = 0;
+    }
+    on_finish->complete(r);
+  }
+};
+
+} // anonymous namespace
+
+namespace librbd {
+ // Out-of-line destructor definition; there is nothing to release.
+ ProgressContext::~ProgressContext() {}
+
+ // ProgressContext that relays progress updates to a C callback
+ // (librbd_progress_fn_t) together with the caller's opaque data pointer.
+ class CProgressContext : public ProgressContext
+ {
+ public:
+   CProgressContext(librbd_progress_fn_t fn, void *data)
+     : m_fn(fn), m_data(data) {}
+
+   // Returns whatever the C callback returns.
+   int update_progress(uint64_t offset, uint64_t src_size) override {
+     const int callback_result = m_fn(offset, src_size, m_data);
+     return callback_result;
+   }
+
+ private:
+   librbd_progress_fn_t m_fn;
+   void *m_data;
+ };
+
+ /*
+ * Pool stats
+ */
+ // Creates the underlying C pool-stats handle via rbd_pool_stats_create().
+ PoolStats::PoolStats()
+ {
+   rbd_pool_stats_create(&pool_stats);
+ }
+
+ // Destroys the underlying C pool-stats handle.
+ PoolStats::~PoolStats()
+ {
+   rbd_pool_stats_destroy(pool_stats);
+ }
+
+ // Registers `opt_val` as the destination for statistic `option`; returns the
+ // result of rbd_pool_stats_option_add_uint64().
+ int PoolStats::add(rbd_pool_stat_option_t option, uint64_t* opt_val) {
+   const int ret = rbd_pool_stats_option_add_uint64(pool_stats, option,
+                                                    opt_val);
+   return ret;
+ }
+
+ /*
+ * RBD
+ */
+ // Nothing to initialise.
+ RBD::RBD() {}
+
+ // Nothing to release.
+ RBD::~RBD() {}
+
+ // Reports the library version by delegating to the C API rbd_version().
+ void RBD::version(int *major, int *minor, int *extra) {
+   rbd_version(major, minor, extra);
+ }
+
+ // Convenience overload: opens `name` without selecting a snapshot.
+ int RBD::open(IoCtx& io_ctx, Image& image, const char *name)
+ {
+   const char *no_snapshot = nullptr;
+   return open(io_ctx, image, name, no_snapshot);
+ }
+
+ // Convenience overload: opens the image with `id` without selecting a
+ // snapshot.
+ int RBD::open_by_id(IoCtx& io_ctx, Image& image, const char *id)
+ {
+   const char *no_snapshot = nullptr;
+   return open_by_id(io_ctx, image, id, no_snapshot);
+ }
+
+ // Synchronously opens image `name` at `snap_name`, passing `false` as the
+ // final ImageCtx constructor argument (presumably the read-only flag --
+ // confirm in ImageCtx.h). An image already attached to `image` is closed
+ // first. On success image.ctx holds the new context and 0 is returned;
+ // otherwise the negative result of ImageState::open() is returned and
+ // image.ctx is left null.
+ int RBD::open(IoCtx& io_ctx, Image& image, const char *name,
+               const char *snap_name)
+ {
+   ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false);
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+   if (image.ctx != nullptr) {
+     auto *previous = reinterpret_cast<ImageCtx*>(image.ctx);
+     previous->state->close();
+     image.ctx = nullptr;
+   }
+
+   const int ret = ictx->state->open(0);
+   if (ret >= 0) {
+     image.ctx = static_cast<image_ctx_t>(ictx);
+     tracepoint(librbd, open_image_exit, 0);
+     return 0;
+   }
+
+   tracepoint(librbd, open_image_exit, ret);
+   return ret;
+ }
+
+ // Synchronously opens the image identified by `id` at `snap_name`, passing
+ // `false` as the final ImageCtx constructor argument (presumably the
+ // read-only flag -- confirm in ImageCtx.h). An image already attached to
+ // `image` is closed first. On success image.ctx holds the new context and 0
+ // is returned; otherwise the negative result of ImageState::open() is
+ // returned and image.ctx is left null.
+ int RBD::open_by_id(IoCtx& io_ctx, Image& image, const char *id,
+                     const char *snap_name)
+ {
+   ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, false);
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, open_image_by_id_enter, ictx, ictx->id.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+
+   if (image.ctx != nullptr) {
+     auto *previous = reinterpret_cast<ImageCtx*>(image.ctx);
+     previous->state->close();
+     image.ctx = nullptr;
+   }
+
+   const int ret = ictx->state->open(0);
+   if (ret >= 0) {
+     image.ctx = static_cast<image_ctx_t>(ictx);
+     tracepoint(librbd, open_image_by_id_exit, 0);
+     return 0;
+   }
+
+   tracepoint(librbd, open_image_by_id_exit, ret);
+   return ret;
+ }
+
+ // Starts an asynchronous open of image `name` at `snap_name`; the result is
+ // delivered through completion `c`. If `image` already has a context, that
+ // image is closed first and the open is chained after the close completes.
+ // Always returns 0.
+ int RBD::aio_open(IoCtx& io_ctx, Image& image, const char *name,
+                   const char *snap_name, RBD::AioCompletion *c)
+ {
+   ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false);
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc);
+
+   auto *aio_comp = get_aio_completion(c);
+   if (image.ctx == nullptr) {
+     ictx->state->open(0, new C_OpenComplete(ictx, aio_comp, &image.ctx));
+   } else {
+     auto *previous = reinterpret_cast<ImageCtx*>(image.ctx);
+     previous->state->close(
+       new C_OpenAfterCloseComplete(ictx, aio_comp, &image.ctx));
+   }
+   tracepoint(librbd, aio_open_image_exit, 0);
+   return 0;
+ }
+
+ // Starts an asynchronous open of the image identified by `id` at
+ // `snap_name`; the result is delivered through completion `c`. If `image`
+ // already has a context, that image is closed first and the open is chained
+ // after the close completes. Always returns 0.
+ int RBD::aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id,
+                         const char *snap_name, RBD::AioCompletion *c)
+ {
+   ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, false);
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, aio_open_image_by_id_enter, ictx, ictx->id.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, c->pc);
+
+   auto *aio_comp = get_aio_completion(c);
+   if (image.ctx == nullptr) {
+     ictx->state->open(0, new C_OpenComplete(ictx, aio_comp, &image.ctx));
+   } else {
+     auto *previous = reinterpret_cast<ImageCtx*>(image.ctx);
+     previous->state->close(
+       new C_OpenAfterCloseComplete(ictx, aio_comp, &image.ctx));
+   }
+   tracepoint(librbd, aio_open_image_by_id_exit, 0);
+   return 0;
+ }
+
+ // Synchronously opens image `name` at `snap_name`, passing `true` as the
+ // final ImageCtx constructor argument (presumably the read-only flag --
+ // confirm in ImageCtx.h). An image already attached to `image` is closed
+ // first. On success image.ctx holds the new context and 0 is returned;
+ // otherwise the negative result of ImageState::open() is returned and
+ // image.ctx is left null.
+ int RBD::open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+                         const char *snap_name)
+ {
+   ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, true);
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+   if (image.ctx != nullptr) {
+     auto *previous = reinterpret_cast<ImageCtx*>(image.ctx);
+     previous->state->close();
+     image.ctx = nullptr;
+   }
+
+   const int ret = ictx->state->open(0);
+   if (ret >= 0) {
+     image.ctx = static_cast<image_ctx_t>(ictx);
+     tracepoint(librbd, open_image_exit, 0);
+     return 0;
+   }
+
+   tracepoint(librbd, open_image_exit, ret);
+   return ret;
+ }
+
+ // Synchronously opens the image identified by `id` at `snap_name`, passing
+ // `true` as the final ImageCtx constructor argument (presumably the
+ // read-only flag -- confirm in ImageCtx.h). An image already attached to
+ // `image` is closed first. On success image.ctx holds the new context and 0
+ // is returned; otherwise the negative result of ImageState::open() is
+ // returned and image.ctx is left null.
+ int RBD::open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+                               const char *snap_name)
+ {
+   ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, true);
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, open_image_by_id_enter, ictx, ictx->id.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+
+   if (image.ctx != nullptr) {
+     auto *previous = reinterpret_cast<ImageCtx*>(image.ctx);
+     previous->state->close();
+     image.ctx = nullptr;
+   }
+
+   const int ret = ictx->state->open(0);
+   if (ret >= 0) {
+     image.ctx = static_cast<image_ctx_t>(ictx);
+     tracepoint(librbd, open_image_by_id_exit, 0);
+     return 0;
+   }
+
+   tracepoint(librbd, open_image_by_id_exit, ret);
+   return ret;
+ }
+
+ // Starts an asynchronous open of image `name` at `snap_name` with `true` as
+ // the final ImageCtx constructor argument; the result is delivered through
+ // completion `c`. If `image` already has a context, that image is closed
+ // first and the open is chained after the close completes. Always returns 0.
+ int RBD::aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+                             const char *snap_name, RBD::AioCompletion *c)
+ {
+   ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, true);
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc);
+
+   auto *aio_comp = get_aio_completion(c);
+   if (image.ctx == nullptr) {
+     ictx->state->open(0, new C_OpenComplete(ictx, aio_comp, &image.ctx));
+   } else {
+     auto *previous = reinterpret_cast<ImageCtx*>(image.ctx);
+     previous->state->close(
+       new C_OpenAfterCloseComplete(ictx, aio_comp, &image.ctx));
+   }
+   tracepoint(librbd, aio_open_image_exit, 0);
+   return 0;
+ }
+
+ // Starts an asynchronous open of the image identified by `id` at
+ // `snap_name` with `true` as the final ImageCtx constructor argument; the
+ // result is delivered through completion `c`. If `image` already has a
+ // context, that image is closed first and the open is chained after the
+ // close completes. Always returns 0.
+ int RBD::aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+                                   const char *snap_name, RBD::AioCompletion *c)
+ {
+   ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, true);
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, aio_open_image_by_id_enter, ictx, ictx->id.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, c->pc);
+
+   auto *aio_comp = get_aio_completion(c);
+   if (image.ctx == nullptr) {
+     ictx->state->open(0, new C_OpenComplete(ictx, aio_comp, &image.ctx));
+   } else {
+     auto *previous = reinterpret_cast<ImageCtx*>(image.ctx);
+     previous->state->close(
+       new C_OpenAfterCloseComplete(ictx, aio_comp, &image.ctx));
+   }
+   tracepoint(librbd, aio_open_image_by_id_exit, 0);
+   return 0;
+ }
+
+ // Creates image `name` of `size` via librbd::create() and returns its
+ // result. `order` is dereferenced by the tracepoints, so it must be
+ // non-null when tracing is compiled in.
+ int RBD::create(IoCtx& io_ctx, const char *name, uint64_t size, int *order)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order);
+   const int ret = librbd::create(io_ctx, name, size, order);
+   tracepoint(librbd, create_exit, ret, *order);
+   return ret;
+ }
+
+ // Creates image `name` with the given feature bits via librbd::create(),
+ // passing 0 for both striping parameters, and returns its result. `order`
+ // is dereferenced by the tracepoints when tracing is compiled in.
+ int RBD::create2(IoCtx& io_ctx, const char *name, uint64_t size,
+                  uint64_t features, int *order)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order);
+   const int ret = librbd::create(io_ctx, name, size, false, features, order,
+                                  0, 0);
+   tracepoint(librbd, create2_exit, ret, *order);
+   return ret;
+ }
+
+ // Creates image `name` with explicit feature bits and striping parameters
+ // via librbd::create() and returns its result. `order` is dereferenced by
+ // the tracepoints when tracing is compiled in.
+ int RBD::create3(IoCtx& io_ctx, const char *name, uint64_t size,
+                  uint64_t features, int *order, uint64_t stripe_unit,
+                  uint64_t stripe_count)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, create3_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count);
+   const int ret = librbd::create(io_ctx, name, size, false, features, order,
+                                  stripe_unit, stripe_count);
+   tracepoint(librbd, create3_exit, ret, *order);
+   return ret;
+ }
+
+ // Creates image `name` configured by `opts` via the ImageOptions overload
+ // of librbd::create() and returns its result.
+ int RBD::create4(IoCtx& io_ctx, const char *name, uint64_t size,
+                  ImageOptions& opts)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts.opts);
+   const int ret = librbd::create(io_ctx, name, "", size, opts, "", "", false);
+   tracepoint(librbd, create4_exit, ret);
+   return ret;
+ }
+
+ // Clones parent snapshot p_name@p_snap_name into c_name via
+ // librbd::clone(), passing 0 for both striping parameters, and returns its
+ // result. `c_order` is dereferenced by the exit tracepoint when tracing is
+ // compiled in.
+ int RBD::clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+                IoCtx& c_ioctx, const char *c_name, uint64_t features,
+                int *c_order)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
+   tracepoint(librbd, clone_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features);
+   const int ret = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx,
+                                 c_name, features, c_order, 0, 0);
+   tracepoint(librbd, clone_exit, ret, *c_order);
+   return ret;
+ }
+
+ // Clones parent snapshot p_name@p_snap_name into c_name with explicit
+ // striping parameters via librbd::clone() and returns its result. `c_order`
+ // is dereferenced by the exit tracepoint when tracing is compiled in.
+ int RBD::clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+                 IoCtx& c_ioctx, const char *c_name, uint64_t features,
+                 int *c_order, uint64_t stripe_unit, int stripe_count)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
+   tracepoint(librbd, clone2_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features, stripe_unit, stripe_count);
+   const int ret = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx,
+                                 c_name, features, c_order, stripe_unit,
+                                 stripe_count);
+   tracepoint(librbd, clone2_exit, ret, *c_order);
+   return ret;
+ }
+
+ // Clones parent snapshot p_name@p_snap_name into c_name configured by
+ // `c_opts`, using the ImageOptions overload of librbd::clone(), and returns
+ // its result.
+ int RBD::clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+                 IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
+   tracepoint(librbd, clone3_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, c_opts.opts);
+   const int ret = librbd::clone(p_ioctx, nullptr, p_name, p_snap_name,
+                                 c_ioctx, nullptr, c_name, c_opts, "", "");
+   tracepoint(librbd, clone3_exit, ret);
+   return ret;
+ }
+
+ // Removes image `name` via api::Image<>::remove() with a no-op progress
+ // context and returns its result.
+ int RBD::remove(IoCtx& io_ctx, const char *name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
+   librbd::NoOpProgressContext no_progress;
+   const int ret = librbd::api::Image<>::remove(io_ctx, name, no_progress);
+   tracepoint(librbd, remove_exit, ret);
+   return ret;
+ }
+
+ // Removes image `name` via api::Image<>::remove(), reporting progress
+ // through `pctx`, and returns its result.
+ int RBD::remove_with_progress(IoCtx& io_ctx, const char *name,
+                               ProgressContext& pctx)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
+   const int ret = librbd::api::Image<>::remove(io_ctx, name, pctx);
+   tracepoint(librbd, remove_exit, ret);
+   return ret;
+ }
+
+ // Moves image `name` to the trash with source RBD_TRASH_IMAGE_SOURCE_USER
+ // and the given `delay`; returns the result of api::Trash<>::move().
+ int RBD::trash_move(IoCtx &io_ctx, const char *name, uint64_t delay) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, trash_move_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), name);
+   const int ret = librbd::api::Trash<>::move(
+     io_ctx, RBD_TRASH_IMAGE_SOURCE_USER, name, delay);
+   tracepoint(librbd, trash_move_exit, ret);
+   return ret;
+ }
+
+ // Looks up trash entry `id`; forwards to api::Trash<>::get() and returns
+ // its result. No tracepoints are emitted here.
+ int RBD::trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info) {
+   const int ret = librbd::api::Trash<>::get(io_ctx, id, info);
+   return ret;
+ }
+
+ // Fills `entries` via api::Trash<>::list() (third argument `true`) and
+ // returns its result. With LTTng enabled, each entry id is traced on
+ // success.
+ int RBD::trash_list(IoCtx &io_ctx, vector<trash_image_info_t> &entries) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, trash_list_enter,
+              io_ctx.get_pool_name().c_str(), io_ctx.get_id());
+   const int ret = librbd::api::Trash<>::list(io_ctx, entries, true);
+#ifdef WITH_LTTNG
+   if (ret >= 0) {
+     for (const auto& trashed : entries) {
+       tracepoint(librbd, trash_list_entry, trashed.id.c_str());
+     }
+   }
+#endif
+   tracepoint(librbd, trash_list_exit, ret, ret);
+   return ret;
+ }
+
+ // Removes trash entry `image_id` via api::Trash<>::remove() with a no-op
+ // progress context and returns its result.
+ int RBD::trash_remove(IoCtx &io_ctx, const char *image_id, bool force) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_id, force);
+   librbd::NoOpProgressContext no_progress;
+   const int ret = librbd::api::Trash<>::remove(io_ctx, image_id, force,
+                                                no_progress);
+   tracepoint(librbd, trash_remove_exit, ret);
+   return ret;
+ }
+
+ // Removes trash entry `image_id` via api::Trash<>::remove(), reporting
+ // progress through `pctx`, and returns its result.
+ int RBD::trash_remove_with_progress(IoCtx &io_ctx, const char *image_id,
+                                     bool force, ProgressContext &pctx) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_id, force);
+   const int ret = librbd::api::Trash<>::remove(io_ctx, image_id, force,
+                                                pctx);
+   tracepoint(librbd, trash_remove_exit, ret);
+   return ret;
+ }
+
+ // Restores trash entry `id` under `name` via api::Trash<>::restore(),
+ // passing RESTORE_SOURCE_WHITELIST, and returns its result. The tracepoints
+ // are named "trash_undelete".
+ int RBD::trash_restore(IoCtx &io_ctx, const char *id, const char *name) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, trash_undelete_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), id, name);
+   const int ret = librbd::api::Trash<>::restore(
+     io_ctx, librbd::api::Trash<>::RESTORE_SOURCE_WHITELIST, id, name);
+   tracepoint(librbd, trash_undelete_exit, ret);
+   return ret;
+ }
+
+ // Purges the trash via api::Trash<>::purge() with `expire_ts` and
+ // `threshold`, using a no-op progress context; returns its result.
+ int RBD::trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), expire_ts, threshold);
+   NoOpProgressContext no_progress;
+   const int ret = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold,
+                                               no_progress);
+   tracepoint(librbd, trash_purge_exit, ret);
+   return ret;
+ }
+
+ // Purges the trash via api::Trash<>::purge() with `expire_ts` and
+ // `threshold`, reporting progress through `pctx`; returns its result.
+ int RBD::trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts,
+                                    float threshold, ProgressContext &pctx) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), expire_ts, threshold);
+   const int ret = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold,
+                                               pctx);
+   tracepoint(librbd, trash_purge_exit, ret);
+   return ret;
+ }
+
+ // Forwards to api::Namespace<>::create() and returns its result.
+ int RBD::namespace_create(IoCtx& io_ctx, const char *namespace_name) {
+   const int ret = librbd::api::Namespace<>::create(io_ctx, namespace_name);
+   return ret;
+ }
+
+ // Forwards to api::Namespace<>::remove() and returns its result.
+ int RBD::namespace_remove(IoCtx& io_ctx, const char *namespace_name) {
+   const int ret = librbd::api::Namespace<>::remove(io_ctx, namespace_name);
+   return ret;
+ }
+
+ // Forwards to api::Namespace<>::list(), which receives `namespace_names` as
+ // its output argument, and returns its result.
+ int RBD::namespace_list(IoCtx& io_ctx,
+                         std::vector<std::string>* namespace_names) {
+   const int ret = librbd::api::Namespace<>::list(io_ctx, namespace_names);
+   return ret;
+ }
+
+ // Forwards to api::Namespace<>::exists(), which receives `exists` as its
+ // output argument, and returns its result.
+ int RBD::namespace_exists(IoCtx& io_ctx, const char *namespace_name,
+                           bool *exists) {
+   const int ret = librbd::api::Namespace<>::exists(io_ctx, namespace_name,
+                                                    exists);
+   return ret;
+ }
+
+ // Forwards to api::Pool<>::init() and returns its result.
+ int RBD::pool_init(IoCtx& io_ctx, bool force) {
+   const int ret = librbd::api::Pool<>::init(io_ctx, force);
+   return ret;
+ }
+
+ // Reinterprets the opaque handle in `stats` as api::Pool<>::StatOptions and
+ // passes it to api::Pool<>::get_stats(); returns that call's result.
+ int RBD::pool_stats_get(IoCtx& io_ctx, PoolStats* stats) {
+   using StatOptions = librbd::api::Pool<>::StatOptions;
+   auto *options = reinterpret_cast<StatOptions*>(stats->pool_stats);
+   return librbd::api::Pool<>::get_stats(io_ctx, options);
+ }
+
+ // Lists image names in the pool. Delegates to list2() and keeps only the
+ // `name` field of each returned image spec. On failure the negative result
+ // of list2() is returned and `names` is left untouched; on success `names`
+ // is replaced and 0 is returned.
+ int RBD::list(IoCtx& io_ctx, vector<string>& names)
+ {
+   std::vector<image_spec_t> image_specs;
+   const int r = list2(io_ctx, &image_specs);
+   if (r < 0) {
+     return r;
+   }
+
+   names.clear();
+   // The final size is known up front; avoid repeated reallocation.
+   names.reserve(image_specs.size());
+   for (const auto& spec : image_specs) {
+     names.push_back(spec.name);
+   }
+   return 0;
+ }
+
+ int RBD::list2(IoCtx& io_ctx, std::vector<image_spec_t> *images)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id());
+
+ int r = librbd::api::Image<>::list_images(io_ctx, images);
+ if (r >= 0) {
+ for (auto& it : *images) {
+ tracepoint(librbd, list_entry, it.name.c_str());
+ }
+ }
+ tracepoint(librbd, list_exit, r, r);
+ return r;
+ }
+
+ // Renames image `srcname` to `destname` via librbd::rename() and returns
+ // its result.
+ int RBD::rename(IoCtx& src_io_ctx, const char *srcname, const char *destname)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(src_io_ctx));
+   tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname);
+   const int ret = librbd::rename(src_io_ctx, srcname, destname);
+   tracepoint(librbd, rename_exit, ret);
+   return ret;
+ }
+
+ // Prepares migration of `image_name` to `dest_image_name` in `dest_io_ctx`
+ // with `opts`; returns the result of api::Migration<>::prepare().
+ int RBD::migration_prepare(IoCtx& io_ctx, const char *image_name,
+                            IoCtx& dest_io_ctx, const char *dest_image_name,
+                            ImageOptions& opts)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, migration_prepare_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_name, dest_io_ctx.get_pool_name().c_str(),
+              dest_io_ctx.get_id(), dest_image_name, opts.opts);
+   const int ret = librbd::api::Migration<>::prepare(
+     io_ctx, image_name, dest_io_ctx, dest_image_name, opts);
+   tracepoint(librbd, migration_prepare_exit, ret);
+   return ret;
+ }
+
+ // Executes the migration of `image_name` with a no-op progress context;
+ // returns the result of api::Migration<>::execute().
+ int RBD::migration_execute(IoCtx& io_ctx, const char *image_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_name);
+   librbd::NoOpProgressContext no_progress;
+   const int ret = librbd::api::Migration<>::execute(io_ctx, image_name,
+                                                     no_progress);
+   tracepoint(librbd, migration_execute_exit, ret);
+   return ret;
+ }
+
+ // Executes the migration of `image_name`, reporting progress through
+ // `prog_ctx`; returns the result of api::Migration<>::execute().
+ int RBD::migration_execute_with_progress(IoCtx& io_ctx,
+                                          const char *image_name,
+                                          librbd::ProgressContext &prog_ctx)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_name);
+   const int ret = librbd::api::Migration<>::execute(io_ctx, image_name,
+                                                     prog_ctx);
+   tracepoint(librbd, migration_execute_exit, ret);
+   return ret;
+ }
+
+ // Aborts the migration of `image_name` with a no-op progress context;
+ // returns the result of api::Migration<>::abort().
+ int RBD::migration_abort(IoCtx& io_ctx, const char *image_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_name);
+   librbd::NoOpProgressContext no_progress;
+   const int ret = librbd::api::Migration<>::abort(io_ctx, image_name,
+                                                   no_progress);
+   tracepoint(librbd, migration_abort_exit, ret);
+   return ret;
+ }
+
+ // Aborts the migration of `image_name`, reporting progress through
+ // `prog_ctx`; returns the result of api::Migration<>::abort().
+ int RBD::migration_abort_with_progress(IoCtx& io_ctx, const char *image_name,
+                                        librbd::ProgressContext &prog_ctx)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_name);
+   const int ret = librbd::api::Migration<>::abort(io_ctx, image_name,
+                                                   prog_ctx);
+   tracepoint(librbd, migration_abort_exit, ret);
+   return ret;
+ }
+
+ // Commits the migration of `image_name` with a no-op progress context;
+ // returns the result of api::Migration<>::commit().
+ int RBD::migration_commit(IoCtx& io_ctx, const char *image_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_name);
+   librbd::NoOpProgressContext no_progress;
+   const int ret = librbd::api::Migration<>::commit(io_ctx, image_name,
+                                                    no_progress);
+   tracepoint(librbd, migration_commit_exit, ret);
+   return ret;
+ }
+
+ // Commits the migration of `image_name`, reporting progress through
+ // `prog_ctx`; returns the result of api::Migration<>::commit().
+ int RBD::migration_commit_with_progress(IoCtx& io_ctx, const char *image_name,
+                                         librbd::ProgressContext &prog_ctx)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_name);
+   const int ret = librbd::api::Migration<>::commit(io_ctx, image_name,
+                                                    prog_ctx);
+   tracepoint(librbd, migration_commit_exit, ret);
+   return ret;
+ }
+
+ // Retrieves the migration status of `image_name` into `status`.
+ // Returns -ERANGE, without querying, when `status_size` differs from
+ // sizeof(image_migration_status_t); otherwise returns the result of
+ // api::Migration<>::status().
+ int RBD::migration_status(IoCtx& io_ctx, const char *image_name,
+                           image_migration_status_t *status,
+                           size_t status_size)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, migration_status_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), image_name);
+
+   // Guard against a caller built with a different struct layout.
+   int ret = -ERANGE;
+   if (status_size == sizeof(image_migration_status_t)) {
+     ret = librbd::api::Migration<>::status(io_ctx, image_name, status);
+   }
+
+   tracepoint(librbd, migration_status_exit, ret);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::mode_get(), which receives `mirror_mode` as
+ // its output argument, and returns its result.
+ int RBD::mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode) {
+   const int ret = librbd::api::Mirror<>::mode_get(io_ctx, mirror_mode);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::site_name_get(), which receives `site_name` as
+ // its output argument, and returns its result.
+ int RBD::mirror_site_name_get(librados::Rados& rados,
+                               std::string* site_name) {
+   const int ret = librbd::api::Mirror<>::site_name_get(rados, site_name);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::site_name_set() and returns its result.
+ int RBD::mirror_site_name_set(librados::Rados& rados,
+                               const std::string& site_name) {
+   const int ret = librbd::api::Mirror<>::site_name_set(rados, site_name);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::mode_set() and returns its result.
+ int RBD::mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode) {
+   const int ret = librbd::api::Mirror<>::mode_set(io_ctx, mirror_mode);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_bootstrap_create(), which receives
+ // `token` as its output argument, and returns its result.
+ int RBD::mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token) {
+   const int ret = librbd::api::Mirror<>::peer_bootstrap_create(io_ctx,
+                                                                token);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_bootstrap_import() with the given
+ // direction and token, and returns its result.
+ int RBD::mirror_peer_bootstrap_import(IoCtx& io_ctx,
+                                       rbd_mirror_peer_direction_t direction,
+                                       const std::string& token) {
+   const int ret = librbd::api::Mirror<>::peer_bootstrap_import(
+     io_ctx, direction, token);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_add(), which receives `uuid` as a pointer
+ // argument alongside the cluster and client names, and returns its result.
+ int RBD::mirror_peer_add(IoCtx& io_ctx, std::string *uuid,
+                          const std::string &cluster_name,
+                          const std::string &client_name) {
+   const int ret = librbd::api::Mirror<>::peer_add(io_ctx, uuid, cluster_name,
+                                                   client_name);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_remove() and returns its result.
+ int RBD::mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid) {
+   const int ret = librbd::api::Mirror<>::peer_remove(io_ctx, uuid);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_list(), which receives `peers` as its
+ // output argument, and returns its result.
+ int RBD::mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers) {
+   const int ret = librbd::api::Mirror<>::peer_list(io_ctx, peers);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_set_client() and returns its result.
+ int RBD::mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid,
+                                 const std::string &client_name) {
+   const int ret = librbd::api::Mirror<>::peer_set_client(io_ctx, uuid,
+                                                          client_name);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_set_cluster() and returns its result.
+ int RBD::mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid,
+                                  const std::string &cluster_name) {
+   const int ret = librbd::api::Mirror<>::peer_set_cluster(io_ctx, uuid,
+                                                           cluster_name);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_get_attributes(), which receives
+ // `key_vals` as its output argument, and returns its result.
+ int RBD::mirror_peer_get_attributes(
+     IoCtx& io_ctx, const std::string &uuid,
+     std::map<std::string, std::string> *key_vals) {
+   const int ret = librbd::api::Mirror<>::peer_get_attributes(io_ctx, uuid,
+                                                              key_vals);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::peer_set_attributes() and returns its result.
+ int RBD::mirror_peer_set_attributes(
+     IoCtx& io_ctx, const std::string &uuid,
+     const std::map<std::string, std::string>& key_vals) {
+   const int ret = librbd::api::Mirror<>::peer_set_attributes(io_ctx, uuid,
+                                                              key_vals);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::image_status_list() with `start_id` and `max`;
+ // `images` is passed through as the output argument. Returns its result.
+ int RBD::mirror_image_status_list(
+     IoCtx& io_ctx, const std::string &start_id, size_t max,
+     std::map<std::string, mirror_image_status_t> *images) {
+   const int ret = librbd::api::Mirror<>::image_status_list(io_ctx, start_id,
+                                                            max, images);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::image_status_summary(), which receives
+ // `states` as its output argument, and returns its result.
+ int RBD::mirror_image_status_summary(
+     IoCtx& io_ctx, std::map<mirror_image_status_state_t, int> *states) {
+   const int ret = librbd::api::Mirror<>::image_status_summary(io_ctx,
+                                                               states);
+   return ret;
+ }
+
+ // Forwards to api::Mirror<>::image_instance_id_list() with `start_id` and
+ // `max`; `instance_ids` is passed through as the output argument. Returns
+ // its result.
+ int RBD::mirror_image_instance_id_list(
+     IoCtx& io_ctx, const std::string &start_id, size_t max,
+     std::map<std::string, std::string> *instance_ids) {
+   const int ret = librbd::api::Mirror<>::image_instance_id_list(
+     io_ctx, start_id, max, instance_ids);
+   return ret;
+ }
+
+ // Creates group `group_name`; returns the result of
+ // api::Group<>::create().
+ int RBD::group_create(IoCtx& io_ctx, const char *group_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, group_create_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), group_name);
+   const int ret = librbd::api::Group<>::create(io_ctx, group_name);
+   tracepoint(librbd, group_create_exit, ret);
+   return ret;
+ }
+
+ // Removes group `group_name`; returns the result of
+ // api::Group<>::remove().
+ int RBD::group_remove(IoCtx& io_ctx, const char *group_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, group_remove_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), group_name);
+   const int ret = librbd::api::Group<>::remove(io_ctx, group_name);
+   tracepoint(librbd, group_remove_exit, ret);
+   return ret;
+ }
+
+ int RBD::group_list(IoCtx& io_ctx, vector<string> *names)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_list_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id());
+
+ int r = librbd::api::Group<>::list(io_ctx, names);
+ if (r >= 0) {
+ for (auto itr : *names) {
+ tracepoint(librbd, group_list_entry, itr.c_str());
+ }
+ }
+ tracepoint(librbd, group_list_exit, r);
+ return r;
+ }
+
+ // Renames group `src_name` to `dest_name`; returns the result of
+ // api::Group<>::rename().
+ int RBD::group_rename(IoCtx& io_ctx, const char *src_name,
+                       const char *dest_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+   tracepoint(librbd, group_rename_enter, io_ctx.get_pool_name().c_str(),
+              io_ctx.get_id(), src_name, dest_name);
+   const int ret = librbd::api::Group<>::rename(io_ctx, src_name, dest_name);
+   tracepoint(librbd, group_rename_exit, ret);
+   return ret;
+ }
+
+ // Add image `image_name` (looked up through image_ioctx) to group
+ // `group_name` (looked up through group_ioctx) by calling
+ // librbd::api::Group<>::image_add; returns that call's result.
+ int RBD::group_image_add(IoCtx& group_ioctx, const char *group_name,
+                          IoCtx& image_ioctx, const char *image_name) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_image_add_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name,
+              image_ioctx.get_pool_name().c_str(), image_ioctx.get_id(),
+              image_name);
+   const int rc = librbd::api::Group<>::image_add(group_ioctx, group_name,
+                                                  image_ioctx, image_name);
+   tracepoint(librbd, group_image_add_exit, rc);
+   return rc;
+ }
+
+ // Remove image `image_name` from group `group_name` by calling
+ // librbd::api::Group<>::image_remove; returns that call's result.
+ int RBD::group_image_remove(IoCtx& group_ioctx, const char *group_name,
+                             IoCtx& image_ioctx, const char *image_name) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_image_remove_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name,
+              image_ioctx.get_pool_name().c_str(), image_ioctx.get_id(),
+              image_name);
+   const int rc = librbd::api::Group<>::image_remove(group_ioctx, group_name,
+                                                     image_ioctx, image_name);
+   tracepoint(librbd, group_image_remove_exit, rc);
+   return rc;
+ }
+
+ // Same as group_image_remove, but the image is identified by `image_id`;
+ // delegates to librbd::api::Group<>::image_remove_by_id.
+ int RBD::group_image_remove_by_id(IoCtx& group_ioctx, const char *group_name,
+                                   IoCtx& image_ioctx, const char *image_id) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_image_remove_by_id_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name,
+              image_ioctx.get_pool_name().c_str(), image_ioctx.get_id(),
+              image_id);
+   const int rc = librbd::api::Group<>::image_remove_by_id(
+     group_ioctx, group_name, image_ioctx, image_id);
+   tracepoint(librbd, group_image_remove_by_id_exit, rc);
+   return rc;
+ }
+
+ // List the images of group `group_name` into *images via
+ // librbd::api::Group<>::image_list. Returns -ERANGE without calling the
+ // backend when group_image_info_size differs from
+ // sizeof(group_image_info_t).
+ int RBD::group_image_list(IoCtx& group_ioctx, const char *group_name,
+                           std::vector<group_image_info_t> *images,
+                           size_t group_image_info_size) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_image_list_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name);
+
+   const bool size_matches =
+     (group_image_info_size == sizeof(group_image_info_t));
+   if (!size_matches) {
+     tracepoint(librbd, group_image_list_exit, -ERANGE);
+     return -ERANGE;
+   }
+
+   const int rc = librbd::api::Group<>::image_list(group_ioctx, group_name,
+                                                   images);
+   tracepoint(librbd, group_image_list_exit, rc);
+   return rc;
+ }
+
+ // Create snapshot `snap_name` of group `group_name` via
+ // librbd::api::Group<>::snap_create; returns that call's result.
+ int RBD::group_snap_create(IoCtx& group_ioctx, const char *group_name,
+                            const char *snap_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_snap_create_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name, snap_name);
+   const int rc = librbd::api::Group<>::snap_create(group_ioctx, group_name,
+                                                    snap_name);
+   tracepoint(librbd, group_snap_create_exit, rc);
+   return rc;
+ }
+
+ // Remove snapshot `snap_name` of group `group_name` via
+ // librbd::api::Group<>::snap_remove; returns that call's result.
+ int RBD::group_snap_remove(IoCtx& group_ioctx, const char *group_name,
+                            const char *snap_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_snap_remove_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name, snap_name);
+   const int rc = librbd::api::Group<>::snap_remove(group_ioctx, group_name,
+                                                    snap_name);
+   tracepoint(librbd, group_snap_remove_exit, rc);
+   return rc;
+ }
+
+ // List the snapshots of group `group_name` into *snaps via
+ // librbd::api::Group<>::snap_list. Returns -ERANGE without calling the
+ // backend when group_snap_info_size differs from sizeof(group_snap_info_t).
+ int RBD::group_snap_list(IoCtx& group_ioctx, const char *group_name,
+                          std::vector<group_snap_info_t> *snaps,
+                          size_t group_snap_info_size) {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_snap_list_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name);
+
+   const bool size_matches =
+     (group_snap_info_size == sizeof(group_snap_info_t));
+   if (!size_matches) {
+     tracepoint(librbd, group_snap_list_exit, -ERANGE);
+     return -ERANGE;
+   }
+
+   const int rc = librbd::api::Group<>::snap_list(group_ioctx, group_name,
+                                                  snaps);
+   tracepoint(librbd, group_snap_list_exit, rc);
+   return rc;
+ }
+
+ // Rename snapshot `old_snap_name` of group `group_name` to `new_snap_name`
+ // via librbd::api::Group<>::snap_rename; returns that call's result.
+ int RBD::group_snap_rename(IoCtx& group_ioctx, const char *group_name,
+                            const char *old_snap_name,
+                            const char *new_snap_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_snap_rename_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name, old_snap_name, new_snap_name);
+   int r = librbd::api::Group<>::snap_rename(group_ioctx, group_name,
+                                             old_snap_name, new_snap_name);
+   // Pair the rename_enter event with the rename exit event; the previous
+   // code emitted group_snap_list_exit here.
+   // NOTE(review): assumes group_snap_rename_exit is defined by the librbd
+   // tracepoint provider -- confirm.
+   tracepoint(librbd, group_snap_rename_exit, r);
+   return r;
+ }
+
+ // Roll group `group_name` back to snapshot `snap_name` via
+ // librbd::api::Group<>::snap_rollback, supplying a NoOpProgressContext
+ // as the progress context.
+ int RBD::group_snap_rollback(IoCtx& group_ioctx, const char *group_name,
+                              const char *snap_name)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_snap_rollback_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name, snap_name);
+   librbd::NoOpProgressContext no_progress;
+   const int rc = librbd::api::Group<>::snap_rollback(group_ioctx, group_name,
+                                                      snap_name, no_progress);
+   tracepoint(librbd, group_snap_rollback_exit, rc);
+   return rc;
+ }
+
+ // Roll group `group_name` back to snapshot `snap_name` via
+ // librbd::api::Group<>::snap_rollback, passing the caller's prog_ctx.
+ // Uses the same enter/exit tracepoints as group_snap_rollback.
+ int RBD::group_snap_rollback_with_progress(IoCtx& group_ioctx,
+                                            const char *group_name,
+                                            const char *snap_name,
+                                            ProgressContext& prog_ctx)
+ {
+   TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+   tracepoint(librbd, group_snap_rollback_enter,
+              group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+              group_name, snap_name);
+   const int rc = librbd::api::Group<>::snap_rollback(group_ioctx, group_name,
+                                                      snap_name, prog_ctx);
+   tracepoint(librbd, group_snap_rollback_exit, rc);
+   return rc;
+ }
+
+ // Read pool metadata entry `key` into *value via
+ // librbd::api::PoolMetadata<>::get.
+ int RBD::pool_metadata_get(IoCtx& ioctx, const std::string &key,
+                            std::string *value) {
+   return librbd::api::PoolMetadata<>::get(ioctx, key, value);
+ }
+
+ // Store `value` under pool metadata `key` via
+ // librbd::api::PoolMetadata<>::set.
+ int RBD::pool_metadata_set(IoCtx& ioctx, const std::string &key,
+                            const std::string &value) {
+   return librbd::api::PoolMetadata<>::set(ioctx, key, value);
+ }
+
+ // Remove pool metadata entry `key` via
+ // librbd::api::PoolMetadata<>::remove.
+ int RBD::pool_metadata_remove(IoCtx& ioctx, const std::string &key) {
+   return librbd::api::PoolMetadata<>::remove(ioctx, key);
+ }
+
+ // List pool metadata into *pairs via librbd::api::PoolMetadata<>::list,
+ // passing `start` and `max` through unchanged.
+ int RBD::pool_metadata_list(IoCtx& ioctx, const std::string &start,
+                             uint64_t max, map<string, bufferlist> *pairs) {
+   return librbd::api::PoolMetadata<>::list(ioctx, start, max, pairs);
+ }
+
+ // Fill *options by delegating to librbd::api::Config<>::list.
+ int RBD::config_list(IoCtx& io_ctx, std::vector<config_option_t> *options)
+ {
+   const int rc = librbd::api::Config<>::list(io_ctx, options);
+   return rc;
+ }
+
+ // Build the internal librbd::io::AioCompletion for this public wrapper and
+ // keep it in the opaque `pc` member.
+ RBD::AioCompletion::AioCompletion(void *cb_arg, callback_t complete_cb) {
+   auto *impl = librbd::io::AioCompletion::create(cb_arg, complete_cb, this);
+   pc = reinterpret_cast<void*>(impl);
+ }
+
+ // Report the internal completion's is_complete() state.
+ bool RBD::AioCompletion::is_complete() {
+   auto *impl = reinterpret_cast<librbd::io::AioCompletion *>(pc);
+   return impl->is_complete();
+ }
+
+ // Forward to the internal completion's wait_for_complete().
+ int RBD::AioCompletion::wait_for_complete() {
+   auto *impl = reinterpret_cast<librbd::io::AioCompletion *>(pc);
+   return impl->wait_for_complete();
+ }
+
+ // Forward to the internal completion's get_return_value().
+ ssize_t RBD::AioCompletion::get_return_value() {
+   auto *impl = reinterpret_cast<librbd::io::AioCompletion *>(pc);
+   return impl->get_return_value();
+ }
+
+ // Forward to the internal completion's get_arg().
+ void *RBD::AioCompletion::get_arg() {
+   auto *impl = reinterpret_cast<librbd::io::AioCompletion *>(pc);
+   return impl->get_arg();
+ }
+
+ // Release the internal completion, then destroy this wrapper object.
+ // The wrapper must not be used after this call because it deletes itself.
+ void RBD::AioCompletion::release() {
+   auto *impl = reinterpret_cast<librbd::io::AioCompletion *>(pc);
+   impl->release();
+   delete this;
+ }
+
+ /*
+ ImageOptions
+ */
+
+ // Default constructor: initialise `opts` through
+ // librbd::image_options_create.
+ ImageOptions::ImageOptions() {
+   librbd::image_options_create(&opts);
+ }
+
+ // Construct from an existing C options handle via
+ // librbd::image_options_create_ref.
+ ImageOptions::ImageOptions(rbd_image_options_t opts_) {
+   librbd::image_options_create_ref(&opts, opts_);
+ }
+
+ // Copy constructor: initialise `opts` from imgopts via
+ // librbd::image_options_copy.
+ ImageOptions::ImageOptions(const ImageOptions &imgopts) {
+   librbd::image_options_copy(&opts, imgopts);
+ }
+
+ // Destructor: hand `opts` to librbd::image_options_destroy.
+ ImageOptions::~ImageOptions() {
+   librbd::image_options_destroy(opts);
+ }
+
+ // Set option `optname` to a string value via librbd::image_options_set.
+ int ImageOptions::set(int optname, const std::string& optval) {
+   const int rc = librbd::image_options_set(opts, optname, optval);
+   return rc;
+ }
+
+ // Set option `optname` to an integer value via librbd::image_options_set.
+ int ImageOptions::set(int optname, uint64_t optval) {
+   const int rc = librbd::image_options_set(opts, optname, optval);
+   return rc;
+ }
+
+ // Read option `optname` into *optval (string form) via
+ // librbd::image_options_get.
+ int ImageOptions::get(int optname, std::string* optval) const {
+   const int rc = librbd::image_options_get(opts, optname, optval);
+   return rc;
+ }
+
+ // Read option `optname` into *optval (integer form) via
+ // librbd::image_options_get.
+ int ImageOptions::get(int optname, uint64_t* optval) const {
+   const int rc = librbd::image_options_get(opts, optname, optval);
+   return rc;
+ }
+
+ // Query whether option `optname` is set; the answer is written to *is_set
+ // by librbd::image_options_is_set, whose return code is passed back.
+ int ImageOptions::is_set(int optname, bool* is_set) {
+   const int rc = librbd::image_options_is_set(opts, optname, is_set);
+   return rc;
+ }
+
+ // Unset option `optname` via librbd::image_options_unset.
+ int ImageOptions::unset(int optname) {
+   const int rc = librbd::image_options_unset(opts, optname);
+   return rc;
+ }
+
+ // Clear the option set via librbd::image_options_clear.
+ void ImageOptions::clear() {
+   librbd::image_options_clear(opts);
+ }
+
+ // Return the result of librbd::image_options_is_empty for `opts`.
+ bool ImageOptions::empty() const {
+   const bool is_empty = librbd::image_options_is_empty(opts);
+   return is_empty;
+ }
+
+ /*
+ Image
+ */
+
+ // Construct an Image with no image context attached (ctx is null).
+ Image::Image()
+   : ctx(nullptr) {
+ }
+
+ // Destructor: calls close(); its return value is discarded.
+ Image::~Image() {
+   close();
+ }
+
+ // Close the image if one is attached. Returns 0 when ctx is null,
+ // otherwise the result of ImageState::close(). ctx is reset to null after
+ // the close call regardless of its result.
+ int Image::close() {
+   if (!ctx) {
+     return 0;
+   }
+
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(),
+              ictx->id.c_str());
+
+   const int rc = ictx->state->close();
+   ctx = NULL;
+
+   tracepoint(librbd, close_image_exit, rc);
+   return rc;
+ }
+
+ // Start an asynchronous close that signals completion `c`. Returns
+ // -EINVAL when no image is attached; otherwise queues the close, resets
+ // ctx to null and returns 0.
+ int Image::aio_close(RBD::AioCompletion *c) {
+   if (ctx == nullptr) {
+     return -EINVAL;
+   }
+
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, aio_close_image_enter, ictx, ictx->name.c_str(),
+              ictx->id.c_str(), c->pc);
+
+   auto *on_close = new C_AioCompletion(ictx, librbd::io::AIO_TYPE_CLOSE,
+                                        get_aio_completion(c));
+   ictx->state->close(on_close);
+   ctx = NULL;
+
+   tracepoint(librbd, aio_close_image_exit, 0);
+   return 0;
+ }
+
+ // Resize the image to `size` through Operations::resize, passing `true`
+ // for the allow-shrink argument and a NoOpProgressContext.
+ int Image::resize(uint64_t size) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, size);
+   librbd::NoOpProgressContext no_progress;
+   const int rc = ictx->operations->resize(size, true, no_progress);
+   tracepoint(librbd, resize_exit, rc);
+   return rc;
+ }
+
+ // Resize the image to `size` through Operations::resize with the caller's
+ // allow_shrink flag and progress context.
+ int Image::resize2(uint64_t size, bool allow_shrink,
+                    librbd::ProgressContext& pctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, size);
+   const int rc = ictx->operations->resize(size, allow_shrink, pctx);
+   tracepoint(librbd, resize_exit, rc);
+   return rc;
+ }
+
+ // Resize the image to `size` through Operations::resize, passing `true`
+ // for the allow-shrink argument and the caller's progress context.
+ int Image::resize_with_progress(uint64_t size,
+                                 librbd::ProgressContext& pctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, size);
+   const int rc = ictx->operations->resize(size, true, pctx);
+   tracepoint(librbd, resize_exit, rc);
+   return rc;
+ }
+
+ // Fill `info` by calling librbd::info with the caller-supplied infosize;
+ // returns that call's result.
+ int Image::stat(image_info_t& info, size_t infosize) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, stat_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+   const int rc = librbd::info(ictx, info, infosize);
+   tracepoint(librbd, stat_exit, rc, &info);
+   return rc;
+ }
+
+ // Write the image's old-format flag to *old via librbd::get_old_format;
+ // returns that call's result.
+ int Image::old_format(uint8_t *old) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_old_format_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+   const int rc = librbd::get_old_format(ictx, old);
+   tracepoint(librbd, get_old_format_exit, rc, *old);
+   return rc;
+ }
+
+ // Write the image size to *size via librbd::get_size; returns that call's
+ // result.
+ int Image::size(uint64_t *size) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_size_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+   const int rc = librbd::get_size(ictx, size);
+   tracepoint(librbd, get_size_exit, rc, *size);
+   return rc;
+ }
+
+ // Fill *group_info via librbd::api::Group<>::image_get_group. Returns
+ // -ERANGE without calling the backend when group_info_size differs from
+ // sizeof(group_info_t).
+ int Image::get_group(group_info_t *group_info, size_t group_info_size) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, image_get_group_enter, ictx->name.c_str());
+
+   const bool size_matches = (group_info_size == sizeof(group_info_t));
+   if (!size_matches) {
+     tracepoint(librbd, image_get_group_exit, -ERANGE);
+     return -ERANGE;
+   }
+
+   const int rc = librbd::api::Group<>::image_get_group(ictx, group_info);
+   tracepoint(librbd, image_get_group_exit, rc);
+   return rc;
+ }
+
+ // Write the image's feature bits to *features via librbd::get_features;
+ // returns that call's result.
+ int Image::features(uint64_t *features) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_features_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+   const int rc = librbd::get_features(ictx, features);
+   tracepoint(librbd, get_features_exit, rc, *features);
+   return rc;
+ }
+
+ // Pass `features` and `enabled` to Operations::update_features and return
+ // its result.
+ int Image::update_features(uint64_t features, bool enabled) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, update_features_enter, ictx, features, enabled);
+   const int rc = ictx->operations->update_features(features, enabled);
+   tracepoint(librbd, update_features_exit, rc);
+   return rc;
+ }
+
+ // Write the op-feature bits to *op_features via
+ // librbd::api::Image<>::get_op_features. No tracepoints are emitted.
+ int Image::get_op_features(uint64_t *op_features) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   const int rc = librbd::api::Image<>::get_op_features(ictx, op_features);
+   return rc;
+ }
+
+ // Return ImageCtx::get_stripe_unit() for the attached image; the exit
+ // tracepoint always records a return code of 0.
+ uint64_t Image::get_stripe_unit() const {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_stripe_unit_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+   const uint64_t unit = ictx->get_stripe_unit();
+   tracepoint(librbd, get_stripe_unit_exit, 0, unit);
+   return unit;
+ }
+
+ // Return ImageCtx::get_stripe_count() for the attached image; the exit
+ // tracepoint always records a return code of 0.
+ uint64_t Image::get_stripe_count() const {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_stripe_count_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+   const uint64_t count = ictx->get_stripe_count();
+   tracepoint(librbd, get_stripe_count_exit, 0, count);
+   return count;
+ }
+
+ // Convert ImageCtx::get_create_timestamp() into *timestamp.
+ // Always returns 0.
+ int Image::get_create_timestamp(struct timespec *timestamp) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_create_timestamp_enter, ictx, ictx->name.c_str(),
+              ictx->read_only);
+   utime_t created = ictx->get_create_timestamp();
+   created.to_timespec(timestamp);
+   tracepoint(librbd, get_create_timestamp_exit, 0, timestamp);
+   return 0;
+ }
+
+ // Convert ImageCtx::get_access_timestamp() into *timestamp while holding
+ // a read lock on ictx->timestamp_lock. Always returns 0.
+ int Image::get_access_timestamp(struct timespec *timestamp) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_access_timestamp_enter, ictx, ictx->name.c_str(),
+              ictx->read_only);
+   {
+     // The read lock is held only for the read and conversion.
+     RWLock::RLocker timestamp_locker(ictx->timestamp_lock);
+     utime_t accessed = ictx->get_access_timestamp();
+     accessed.to_timespec(timestamp);
+   }
+   tracepoint(librbd, get_access_timestamp_exit, 0, timestamp);
+   return 0;
+ }
+
+ // Convert ImageCtx::get_modify_timestamp() into *timestamp while holding
+ // a read lock on ictx->timestamp_lock. Always returns 0.
+ int Image::get_modify_timestamp(struct timespec *timestamp) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_modify_timestamp_enter, ictx, ictx->name.c_str(),
+              ictx->read_only);
+   {
+     // The read lock is held only for the read and conversion.
+     RWLock::RLocker timestamp_locker(ictx->timestamp_lock);
+     utime_t modified = ictx->get_modify_timestamp();
+     modified.to_timespec(timestamp);
+   }
+   tracepoint(librbd, get_modify_timestamp_exit, 0, timestamp);
+   return 0;
+ }
+
+ // Write the overlap value to *overlap via librbd::get_overlap; returns
+ // that call's result.
+ int Image::overlap(uint64_t *overlap) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_overlap_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+   const int rc = librbd::get_overlap(ictx, overlap);
+   tracepoint(librbd, get_overlap_exit, rc, *overlap);
+   return rc;
+ }
+
+ // Copy the image context's name into *name. Always returns 0.
+ int Image::get_name(std::string *name) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   name->assign(ictx->name);
+   return 0;
+ }
+
+ // Copy the image context's id into *id and return 0. Returns -EINVAL,
+ // leaving *id untouched, when the context's old_format flag is set.
+ int Image::get_id(std::string *id) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   if (!ictx->old_format) {
+     *id = ictx->id;
+     return 0;
+   }
+   return -EINVAL;
+ }
+
+ // Return a copy of the image context's object_prefix.
+ std::string Image::get_block_name_prefix() {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   return ictx->object_prefix;
+ }
+
+ // Return the result of librbd::api::Image<>::get_data_pool_id for the
+ // attached image.
+ int64_t Image::get_data_pool_id() {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   return librbd::api::Image<>::get_data_pool_id(ictx);
+ }
+
+ // Report the parent's pool name, image name and snapshot name. Each
+ // output pointer may be null, in which case that field is skipped.
+ // Implemented on top of parent_info2 with a null parent_id pointer,
+ // which leaves the id unreported.
+ int Image::parent_info(string *parent_pool_name, string *parent_name,
+                        string *parent_snap_name) {
+   return parent_info2(parent_pool_name, parent_name, nullptr,
+                       parent_snap_name);
+ }
+
+ // Report the parent's pool name, image name, image id and snapshot name
+ // as obtained from get_parent(). Each output pointer may be null, in which
+ // case that field is skipped. Outputs are only written when get_parent()
+ // returns a non-negative value; that value is returned either way.
+ int Image::parent_info2(string *parent_pool_name, string *parent_name,
+                         string *parent_id, string *parent_snap_name) {
+   librbd::linked_image_spec_t parent_image;
+   librbd::snap_spec_t parent_snap;
+   const int rc = get_parent(&parent_image, &parent_snap);
+   if (rc < 0) {
+     return rc;
+   }
+
+   if (parent_pool_name != nullptr) {
+     *parent_pool_name = parent_image.pool_name;
+   }
+   if (parent_name != nullptr) {
+     *parent_name = parent_image.image_name;
+   }
+   if (parent_id != nullptr) {
+     *parent_id = parent_image.image_id;
+   }
+   if (parent_snap_name != nullptr) {
+     *parent_snap_name = parent_snap.name;
+   }
+   return rc;
+ }
+
+ // Fill *parent_image and *parent_snap via
+ // librbd::api::Image<>::get_parent and return its result. Both output
+ // pointers are dereferenced by the exit tracepoint, so callers pass
+ // non-null pointers.
+ int Image::get_parent(linked_image_spec_t *parent_image,
+                       snap_spec_t *parent_snap) {
+   ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+
+   const int rc = librbd::api::Image<>::get_parent(ictx, parent_image,
+                                                   parent_snap);
+
+   tracepoint(librbd, get_parent_info_exit, rc,
+              parent_image->pool_name.c_str(),
+              parent_image->image_name.c_str(),
+              parent_image->image_id.c_str(),
+              parent_snap->name.c_str());
+   return rc;
+ }
+
+ // Write the image flags to *flags via librbd::get_flags; returns that
+ // call's result.
+ int Image::get_flags(uint64_t *flags) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, get_flags_enter, ictx);
+   const int rc = librbd::get_flags(ictx, flags);
+   tracepoint(librbd, get_flags_exit, ictx, rc, *flags);
+   return rc;
+ }
+
+ // Pass `fd` and `type` to librbd::set_image_notification and return its
+ // result.
+ int Image::set_image_notification(int fd, int type) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, set_image_notification_enter, ictx, fd, type);
+   const int rc = librbd::set_image_notification(ictx, fd, type);
+   tracepoint(librbd, set_image_notification_exit, ictx, rc);
+   return rc;
+ }
+
+ // Write the exclusive-lock ownership answer to *is_owner via
+ // librbd::is_exclusive_lock_owner; returns that call's result.
+ int Image::is_exclusive_lock_owner(bool *is_owner) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, is_exclusive_lock_owner_enter, ictx);
+   const int rc = librbd::is_exclusive_lock_owner(ictx, is_owner);
+   tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, rc, *is_owner);
+   return rc;
+ }
+
+ // Acquire the lock in `lock_mode` via librbd::lock_acquire; returns that
+ // call's result.
+ int Image::lock_acquire(rbd_lock_mode_t lock_mode) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, lock_acquire_enter, ictx, lock_mode);
+   const int rc = librbd::lock_acquire(ictx, lock_mode);
+   tracepoint(librbd, lock_acquire_exit, ictx, rc);
+   return rc;
+ }
+
+ // Release the lock via librbd::lock_release; returns that call's result.
+ int Image::lock_release() {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, lock_release_enter, ictx);
+   const int rc = librbd::lock_release(ictx);
+   tracepoint(librbd, lock_release_exit, ictx, rc);
+   return rc;
+ }
+
+ // Fill *lock_mode and *lock_owners via librbd::lock_get_owners; returns
+ // that call's result.
+ int Image::lock_get_owners(rbd_lock_mode_t *lock_mode,
+                            std::list<std::string> *lock_owners) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, lock_get_owners_enter, ictx);
+   const int rc = librbd::lock_get_owners(ictx, lock_mode, lock_owners);
+   tracepoint(librbd, lock_get_owners_exit, ictx, rc);
+   return rc;
+ }
+
+ // Break the lock held by `lock_owner` in `lock_mode` via
+ // librbd::lock_break; returns that call's result.
+ int Image::lock_break(rbd_lock_mode_t lock_mode,
+                       const std::string &lock_owner) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner.c_str());
+   const int rc = librbd::lock_break(ictx, lock_mode, lock_owner);
+   tracepoint(librbd, lock_break_exit, ictx, rc);
+   return rc;
+ }
+
+ // Forward to Operations::rebuild_object_map with the caller's progress
+ // context. No tracepoints are emitted.
+ int Image::rebuild_object_map(ProgressContext &prog_ctx) {
+   auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+   const int rc = ictx->operations->rebuild_object_map(prog_ctx);
+   return rc;
+ }
+
+ // Forward to Operations::check_object_map with the caller's progress
+ // context. No tracepoints are emitted.
+ int Image::check_object_map(ProgressContext &prog_ctx) {
+   auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+   const int rc = ictx->operations->check_object_map(prog_ctx);
+   return rc;
+ }
+
+ // Copy this image to `destname` in dest_io_ctx via librbd::copy, using
+ // default-constructed ImageOptions, a NoOpProgressContext and a sparse
+ // size argument of 0.
+ int Image::copy(IoCtx& dest_io_ctx, const char *destname) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only,
+              dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+              destname);
+   ImageOptions opts;
+   librbd::NoOpProgressContext no_progress;
+   const int rc = librbd::copy(ictx, dest_io_ctx, destname, opts,
+                               no_progress, 0);
+   tracepoint(librbd, copy_exit, rc);
+   return rc;
+ }
+
+ // Copy this image into the already-open image `dest` via librbd::copy,
+ // using a NoOpProgressContext and a sparse size argument of 0.
+ int Image::copy2(Image& dest) {
+   auto srcctx = reinterpret_cast<ImageCtx *>(ctx);
+   auto destctx = reinterpret_cast<ImageCtx *>(dest.ctx);
+   tracepoint(librbd, copy2_enter, srcctx, srcctx->name.c_str(),
+              srcctx->snap_name.c_str(), srcctx->read_only, destctx,
+              destctx->name.c_str(), destctx->snap_name.c_str(),
+              destctx->read_only);
+   librbd::NoOpProgressContext no_progress;
+   const int rc = librbd::copy(srcctx, destctx, no_progress, 0);
+   tracepoint(librbd, copy2_exit, rc);
+   return rc;
+ }
+
+ // Copy this image to `destname` in dest_io_ctx via librbd::copy with the
+ // caller's options, a NoOpProgressContext and a sparse size argument of 0.
+ int Image::copy3(IoCtx& dest_io_ctx, const char *destname,
+                  ImageOptions& opts) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only,
+              dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+              destname, opts.opts);
+   librbd::NoOpProgressContext no_progress;
+   const int rc = librbd::copy(ictx, dest_io_ctx, destname, opts,
+                               no_progress, 0);
+   tracepoint(librbd, copy3_exit, rc);
+   return rc;
+ }
+
+ // Copy this image to `destname` in dest_io_ctx via librbd::copy with the
+ // caller's options and sparse_size, using a NoOpProgressContext.
+ int Image::copy4(IoCtx& dest_io_ctx, const char *destname,
+                  ImageOptions& opts, size_t sparse_size) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only,
+              dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+              destname, opts.opts, sparse_size);
+   librbd::NoOpProgressContext no_progress;
+   const int rc = librbd::copy(ictx, dest_io_ctx, destname, opts,
+                               no_progress, sparse_size);
+   tracepoint(librbd, copy4_exit, rc);
+   return rc;
+ }
+
+ // Like copy(), but progress is reported through the caller's pctx.
+ // Uses default-constructed ImageOptions and a sparse size argument of 0.
+ int Image::copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+                               librbd::ProgressContext &pctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only,
+              dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+              destname);
+   ImageOptions opts;
+   const int rc = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, 0);
+   tracepoint(librbd, copy_exit, rc);
+   return rc;
+ }
+
+ // Like copy2(), but progress is reported through the caller's pctx.
+ int Image::copy_with_progress2(Image& dest, librbd::ProgressContext &pctx) {
+   auto srcctx = reinterpret_cast<ImageCtx *>(ctx);
+   auto destctx = reinterpret_cast<ImageCtx *>(dest.ctx);
+   tracepoint(librbd, copy2_enter, srcctx, srcctx->name.c_str(),
+              srcctx->snap_name.c_str(), srcctx->read_only, destctx,
+              destctx->name.c_str(), destctx->snap_name.c_str(),
+              destctx->read_only);
+   const int rc = librbd::copy(srcctx, destctx, pctx, 0);
+   tracepoint(librbd, copy2_exit, rc);
+   return rc;
+ }
+
+ // Like copy3(), but progress is reported through the caller's pctx.
+ int Image::copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+                                ImageOptions& opts,
+                                librbd::ProgressContext &pctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only,
+              dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+              destname, opts.opts);
+   const int rc = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, 0);
+   tracepoint(librbd, copy3_exit, rc);
+   return rc;
+ }
+
+ // Like copy4(), but progress is reported through the caller's pctx.
+ int Image::copy_with_progress4(IoCtx& dest_io_ctx, const char *destname,
+                                ImageOptions& opts,
+                                librbd::ProgressContext &pctx,
+                                size_t sparse_size) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only,
+              dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+              destname, opts.opts, sparse_size);
+   const int rc = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx,
+                               sparse_size);
+   tracepoint(librbd, copy4_exit, rc);
+   return rc;
+ }
+
+ // Deep-copy this image to `destname` in dest_io_ctx via
+ // librbd::api::Image<>::deep_copy with the caller's options and a
+ // NoOpProgressContext.
+ int Image::deep_copy(IoCtx& dest_io_ctx, const char *destname,
+                      ImageOptions& opts) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only,
+              dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+              destname, opts.opts);
+   librbd::NoOpProgressContext no_progress;
+   const int rc = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx,
+                                                  destname, opts,
+                                                  no_progress);
+   tracepoint(librbd, deep_copy_exit, rc);
+   return rc;
+ }
+
+ // Like deep_copy(), but progress is reported through the caller's
+ // prog_ctx.
+ int Image::deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+                                    ImageOptions& opts,
+                                    librbd::ProgressContext &prog_ctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only,
+              dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+              destname, opts.opts);
+   const int rc = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx,
+                                                  destname, opts, prog_ctx);
+   tracepoint(librbd, deep_copy_exit, rc);
+   return rc;
+ }
+
+ // Flatten the image through Operations::flatten with a
+ // NoOpProgressContext; returns that call's result.
+ int Image::flatten() {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(),
+              ictx->id.c_str());
+   librbd::NoOpProgressContext no_progress;
+   const int rc = ictx->operations->flatten(no_progress);
+   tracepoint(librbd, flatten_exit, rc);
+   return rc;
+ }
+
+ // Flatten the image through Operations::flatten, reporting progress via
+ // the caller's prog_ctx.
+ int Image::flatten_with_progress(librbd::ProgressContext& prog_ctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(),
+              ictx->id.c_str());
+   const int rc = ictx->operations->flatten(prog_ctx);
+   tracepoint(librbd, flatten_exit, rc);
+   return rc;
+ }
+
+ // Sparsify the image through Operations::sparsify with the given
+ // sparse_size and a NoOpProgressContext.
+ int Image::sparsify(size_t sparse_size) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+              ictx->id.c_str());
+   librbd::NoOpProgressContext no_progress;
+   const int rc = ictx->operations->sparsify(sparse_size, no_progress);
+   tracepoint(librbd, sparsify_exit, rc);
+   return rc;
+ }
+
+ // Sparsify the image through Operations::sparsify with the given
+ // sparse_size, reporting progress via the caller's prog_ctx.
+ int Image::sparsify_with_progress(size_t sparse_size,
+                                   librbd::ProgressContext& prog_ctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+              ictx->id.c_str());
+   const int rc = ictx->operations->sparsify(sparse_size, prog_ctx);
+   tracepoint(librbd, sparsify_exit, rc);
+   return rc;
+ }
+
+ // Insert a (pool name, image name) pair into *children for every child
+ // reported by list_children3() whose `trash` flag is not set. Returns the
+ // negative result of list_children3() on failure, otherwise 0. Existing
+ // entries in *children are left in place.
+ int Image::list_children(set<pair<string, string> > *children) {
+   std::vector<linked_image_spec_t> specs;
+   const int rc = list_children3(&specs);
+   if (rc < 0) {
+     return rc;
+   }
+
+   for (auto& spec : specs) {
+     if (spec.trash) {
+       continue;
+     }
+     children->insert({spec.pool_name, spec.image_name});
+   }
+   return 0;
+ }
+
+ // Append one child_info_t to *children for every child reported by
+ // list_children3(), including those with the `trash` flag set. Returns
+ // the negative result of list_children3() on failure, otherwise 0.
+ int Image::list_children2(vector<librbd::child_info_t> *children) {
+   std::vector<linked_image_spec_t> specs;
+   const int rc = list_children3(&specs);
+   if (rc < 0) {
+     return rc;
+   }
+
+   for (auto& spec : specs) {
+     children->push_back({
+       .pool_name = spec.pool_name,
+       .image_name = spec.image_name,
+       .image_id = spec.image_id,
+       .trash = spec.trash});
+   }
+
+   return 0;
+ }
+
+ // Fill *images via librbd::api::Image<>::list_children and return its
+ // result. On success one list_children_entry tracepoint is emitted per
+ // returned image.
+ int Image::list_children3(std::vector<linked_image_spec_t> *images) {
+   ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+
+   const int rc = librbd::api::Image<>::list_children(ictx, images);
+   if (rc >= 0) {
+     for (auto& child : *images) {
+       tracepoint(librbd, list_children_entry, child.pool_name.c_str(),
+                  child.image_name.c_str());
+     }
+   }
+
+   tracepoint(librbd, list_children_exit, rc);
+   return rc;
+ }
+
+ // Clear *images, then fill it via librbd::api::Image<>::list_descendants
+ // (called with an empty-braced second argument). Returns that call's
+ // result.
+ int Image::list_descendants(std::vector<linked_image_spec_t> *images) {
+   ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+
+   images->clear();
+   return librbd::api::Image<>::list_descendants(ictx, {}, images);
+ }
+
+ // Fill *lockers, *exclusive and *tag via librbd::list_lockers and return
+ // its result. On success one list_lockers_entry tracepoint is emitted per
+ // locker.
+ int Image::list_lockers(std::list<librbd::locker_t> *lockers,
+                         bool *exclusive, string *tag) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, list_lockers_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only);
+   const int rc = librbd::list_lockers(ictx, lockers, exclusive, tag);
+   if (rc >= 0) {
+     for (const auto& locker : *lockers) {
+       tracepoint(librbd, list_lockers_entry, locker.client.c_str(),
+                  locker.cookie.c_str(), locker.address.c_str());
+     }
+   }
+   tracepoint(librbd, list_lockers_exit, rc);
+   return rc;
+ }
+
+ // Take a lock with `cookie` via librbd::lock, passing `true` as the
+ // exclusive argument and an empty tag.
+ int Image::lock_exclusive(const string& cookie) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, lock_exclusive_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, cookie.c_str());
+   const int rc = librbd::lock(ictx, true, cookie, "");
+   tracepoint(librbd, lock_exclusive_exit, rc);
+   return rc;
+ }
+
+ // Take a lock with `cookie` and `tag` via librbd::lock, passing `false`
+ // as the exclusive argument.
+ int Image::lock_shared(const string& cookie, const std::string& tag) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, lock_shared_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, cookie.c_str(),
+              tag.c_str());
+   const int rc = librbd::lock(ictx, false, cookie, tag);
+   tracepoint(librbd, lock_shared_exit, rc);
+   return rc;
+ }
+
+ // Release the lock identified by `cookie` via librbd::unlock; returns
+ // that call's result.
+ int Image::unlock(const string& cookie) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, unlock_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, cookie.c_str());
+   const int rc = librbd::unlock(ictx, cookie);
+   tracepoint(librbd, unlock_exit, rc);
+   return rc;
+ }
+
+ // Break the lock held by `client` with `cookie` via librbd::break_lock;
+ // returns that call's result.
+ int Image::break_lock(const string& client, const string& cookie) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, break_lock_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, client.c_str(),
+              cookie.c_str());
+   const int rc = librbd::break_lock(ictx, client, cookie);
+   tracepoint(librbd, break_lock_exit, rc);
+   return rc;
+ }
+
+ // Create snapshot `snap_name` in the user snapshot namespace through
+ // Operations::snap_create; returns that call's result.
+ int Image::snap_create(const char *snap_name) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, snap_name);
+   const int rc = ictx->operations->snap_create(
+     cls::rbd::UserSnapshotNamespace(), snap_name);
+   tracepoint(librbd, snap_create_exit, rc);
+   return rc;
+ }
+
+ // Remove snapshot `snap_name` via librbd::snap_remove with flags 0 and a
+ // NoOpProgressContext.
+ int Image::snap_remove(const char *snap_name) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, snap_remove_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, snap_name);
+   librbd::NoOpProgressContext no_progress;
+   const int rc = librbd::snap_remove(ictx, snap_name, 0, no_progress);
+   tracepoint(librbd, snap_remove_exit, rc);
+   return rc;
+ }
+
+ // Remove snapshot `snap_name` via librbd::snap_remove with the caller's
+ // flags and progress context. The exit event is snap_remove_exit, shared
+ // with snap_remove().
+ int Image::snap_remove2(const char *snap_name, uint32_t flags,
+                         ProgressContext& pctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, snap_remove2_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, snap_name, flags);
+   const int rc = librbd::snap_remove(ictx, snap_name, flags, pctx);
+   tracepoint(librbd, snap_remove_exit, rc);
+   return rc;
+ }
+
+ // Remove the snapshot with id `snap_id` via
+ // librbd::api::Snapshot<>::remove. No tracepoints are emitted.
+ int Image::snap_remove_by_id(uint64_t snap_id) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   const int rc = librbd::api::Snapshot<>::remove(ictx, snap_id);
+   return rc;
+ }
+
+ // Roll the image back to user snapshot `snap_name` through
+ // Operations::snap_rollback with a NoOpProgressContext.
+ int Image::snap_rollback(const char *snap_name) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, snap_name);
+   librbd::NoOpProgressContext no_progress;
+   const int rc = ictx->operations->snap_rollback(
+     cls::rbd::UserSnapshotNamespace(), snap_name, no_progress);
+   tracepoint(librbd, snap_rollback_exit, rc);
+   return rc;
+ }
+
+ // Rename snapshot `srcname` to `dstname` through Operations::snap_rename;
+ // returns that call's result.
+ int Image::snap_rename(const char *srcname, const char *dstname) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, srcname, dstname);
+   const int rc = ictx->operations->snap_rename(srcname, dstname);
+   tracepoint(librbd, snap_rename_exit, rc);
+   return rc;
+ }
+
+ // Roll the image back to user snapshot `snap_name` through
+ // Operations::snap_rollback, reporting progress via the caller's prog_ctx.
+ int Image::snap_rollback_with_progress(const char *snap_name,
+                                        ProgressContext& prog_ctx) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, snap_name);
+   const int rc = ictx->operations->snap_rollback(
+     cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+   tracepoint(librbd, snap_rollback_exit, rc);
+   return rc;
+ }
+
+ // Protect user snapshot `snap_name` through Operations::snap_protect;
+ // returns that call's result.
+ int Image::snap_protect(const char *snap_name) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, snap_protect_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, snap_name);
+   const int rc = ictx->operations->snap_protect(
+     cls::rbd::UserSnapshotNamespace(), snap_name);
+   tracepoint(librbd, snap_protect_exit, rc);
+   return rc;
+ }
+
+ // Unprotect user snapshot `snap_name` through Operations::snap_unprotect;
+ // returns that call's result.
+ int Image::snap_unprotect(const char *snap_name) {
+   auto ictx = reinterpret_cast<ImageCtx *>(ctx);
+   tracepoint(librbd, snap_unprotect_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, snap_name);
+   const int rc = ictx->operations->snap_unprotect(
+     cls::rbd::UserSnapshotNamespace(), snap_name);
+   tracepoint(librbd, snap_unprotect_exit, rc);
+   return rc;
+ }
+
+ // Ask librbd::snap_is_protected() about snap_name; the answer is written
+ // through is_protected. The exit tracepoint reads *is_protected regardless
+ // of the return code.
+ int Image::snap_is_protected(const char *snap_name, bool *is_protected)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_is_protected_enter, image_ctx,
+              image_ctx->name.c_str(), image_ctx->snap_name.c_str(),
+              image_ctx->read_only, snap_name);
+   int ret = librbd::snap_is_protected(image_ctx, snap_name, is_protected);
+   tracepoint(librbd, snap_is_protected_exit, ret, *is_protected ? 1 : 0);
+   return ret;
+ }
+
+ // Fill snaps via librbd::snap_list() and emit one trace entry per snapshot
+ // on success. Returns the result of librbd::snap_list().
+ int Image::snap_list(vector<librbd::snap_info_t>& snaps)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_list_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, &snaps);
+   int ret = librbd::snap_list(image_ctx, snaps);
+   const bool succeeded = (ret >= 0);
+   if (succeeded) {
+     for (auto &snap : snaps) {
+       tracepoint(librbd, snap_list_entry, snap.id, snap.size,
+                  snap.name.c_str());
+     }
+   }
+   tracepoint(librbd, snap_list_exit, ret, snaps.size());
+   if (succeeded) {
+     // The C++ API has no Image::snap_list_end, but the trace stream should
+     // look like the one produced by the C API, so emit the pair here.
+     tracepoint(librbd, snap_list_end_enter, &snaps);
+     tracepoint(librbd, snap_list_end_exit);
+   }
+   return ret;
+ }
+
+ // Report whether the user-namespace snapshot snap_name exists.
+ // A failed lookup is reported as "does not exist" because this signature
+ // cannot carry an error code.
+ bool Image::snap_exists(const char *snap_name)
+ {
+   ImageCtx *ictx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, snap_name);
+   // Initialized so the exit tracepoint never reads an indeterminate value
+   // when the lookup fails before writing the result.
+   bool exists = false;
+   int r = librbd::snap_exists(ictx, cls::rbd::UserSnapshotNamespace(), snap_name, &exists);
+   tracepoint(librbd, snap_exists_exit, r, exists);
+   if (r < 0) {
+     // lie to caller since we don't know the real answer yet.
+     return false;
+   }
+   return exists;
+ }
+
+ // A safer version of snap_exists: the lookup result code is returned to the
+ // caller and the answer is written through exists.
+ int Image::snap_exists2(const char *snap_name, bool *exists)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_exists_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, snap_name);
+   int ret = librbd::snap_exists(image_ctx, cls::rbd::UserSnapshotNamespace(),
+                                 snap_name, exists);
+   tracepoint(librbd, snap_exists_exit, ret, *exists);
+   return ret;
+ }
+
+ // Forward to librbd::snap_get_timestamp() for snap_id; the value is written
+ // through timestamp. Returns that call's result.
+ int Image::snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_get_timestamp_enter, image_ctx,
+              image_ctx->name.c_str());
+   int ret = librbd::snap_get_timestamp(image_ctx, snap_id, timestamp);
+   tracepoint(librbd, snap_get_timestamp_exit, ret);
+   return ret;
+ }
+
+ // Forward to librbd::snap_get_limit(); the limit is written through limit,
+ // which is also read for the exit tracepoint.
+ int Image::snap_get_limit(uint64_t *limit)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_get_limit_enter, image_ctx,
+              image_ctx->name.c_str());
+   int ret = librbd::snap_get_limit(image_ctx, limit);
+   tracepoint(librbd, snap_get_limit_exit, ret, *limit);
+   return ret;
+ }
+
+ // Forward to api::Snapshot<>::get_namespace_type() for snap_id; the type is
+ // written through namespace_type.
+ int Image::snap_get_namespace_type(uint64_t snap_id,
+                                    snap_namespace_type_t *namespace_type) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_get_namespace_type_enter, image_ctx,
+              image_ctx->name.c_str());
+   int ret = librbd::api::Snapshot<>::get_namespace_type(image_ctx, snap_id,
+                                                         namespace_type);
+   tracepoint(librbd, snap_get_namespace_type_exit, ret);
+   return ret;
+ }
+
+ // Fetch the group namespace of snap_id into group_snap.
+ // Returns -ERANGE when group_snap_size does not equal
+ // sizeof(snap_group_namespace_t); otherwise the result of
+ // api::Snapshot<>::get_group_namespace().
+ int Image::snap_get_group_namespace(uint64_t snap_id,
+                                     snap_group_namespace_t *group_snap,
+                                     size_t group_snap_size) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_get_group_namespace_enter, image_ctx,
+              image_ctx->name.c_str());
+
+   int ret = -ERANGE;
+   if (sizeof(snap_group_namespace_t) == group_snap_size) {
+     ret = librbd::api::Snapshot<>::get_group_namespace(image_ctx, snap_id,
+                                                        group_snap);
+   }
+   tracepoint(librbd, snap_get_group_namespace_exit, ret);
+   return ret;
+ }
+
+ // Forward to api::Snapshot<>::get_trash_namespace() for snap_id; the name
+ // is written through original_name. No tracepoints are emitted here.
+ int Image::snap_get_trash_namespace(uint64_t snap_id,
+                                     std::string* original_name) {
+   return librbd::api::Snapshot<>::get_trash_namespace(
+     (ImageCtx *)ctx, snap_id, original_name);
+ }
+
+ // Forward the new snapshot limit to operations->snap_set_limit() and
+ // return its result.
+ int Image::snap_set_limit(uint64_t limit)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+
+   tracepoint(librbd, snap_set_limit_enter, image_ctx,
+              image_ctx->name.c_str(), limit);
+   int ret = image_ctx->operations->snap_set_limit(limit);
+   tracepoint(librbd, snap_set_limit_exit, ret);
+   return ret;
+ }
+
+ // Forward to api::Image<>::snap_set() with the user snapshot namespace and
+ // snap_name; returns that call's result.
+ int Image::snap_set(const char *snap_name)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, snap_set_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, snap_name);
+   int ret = librbd::api::Image<>::snap_set(
+     image_ctx, cls::rbd::UserSnapshotNamespace(), snap_name);
+   tracepoint(librbd, snap_set_exit, ret);
+   return ret;
+ }
+
+ // Forward to the snap-id overload of api::Image<>::snap_set().
+ // No tracepoints are emitted here.
+ int Image::snap_set_by_id(uint64_t snap_id)
+ {
+   return librbd::api::Image<>::snap_set((ImageCtx *)ctx, snap_id);
+ }
+
+ // Synchronous read of len bytes at ofs into bl with op_flags 0.
+ // A len-sized buffer is appended to bl before the request is issued.
+ // Returns the result of io_work_queue->read().
+ ssize_t Image::read(uint64_t ofs, size_t len, bufferlist& bl)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, read_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len);
+   bufferptr read_buf(len);
+   bl.push_back(std::move(read_buf));
+
+   int ret = image_ctx->io_work_queue->read(ofs, len, io::ReadResult{&bl}, 0);
+   tracepoint(librbd, read_exit, ret);
+   return ret;
+ }
+
+ // Synchronous read of len bytes at ofs into bl, passing op_flags through.
+ // A len-sized buffer is appended to bl before the request is issued.
+ // Shares the read_exit tracepoint with read().
+ ssize_t Image::read2(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, read2_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len,
+              op_flags);
+   bufferptr read_buf(len);
+   bl.push_back(std::move(read_buf));
+
+   int ret = image_ctx->io_work_queue->read(ofs, len, io::ReadResult{&bl},
+                                            op_flags);
+   tracepoint(librbd, read_exit, ret);
+   return ret;
+ }
+
+ // Forward to librbd::read_iterate(), which receives cb and arg, and return
+ // its 64-bit result unchanged.
+ int64_t Image::read_iterate(uint64_t ofs, size_t len,
+                             int (*cb)(uint64_t, size_t, const char *, void *),
+                             void *arg)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, read_iterate_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len);
+
+   int64_t ret = librbd::read_iterate(image_ctx, ofs, len, cb, arg);
+   tracepoint(librbd, read_iterate_exit, ret);
+   return ret;
+ }
+
+ // Like read_iterate(), but with a 64-bit length and an int result: any
+ // positive result from librbd::read_iterate() is collapsed to 0, negative
+ // results are returned as-is.
+ int Image::read_iterate2(uint64_t ofs, uint64_t len,
+                          int (*cb)(uint64_t, size_t, const char *, void *),
+                          void *arg)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, read_iterate2_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len);
+
+   const int64_t raw = librbd::read_iterate(image_ctx, ofs, len, cb, arg);
+   int64_t ret = (raw > 0) ? 0 : raw;
+   tracepoint(librbd, read_iterate2_exit, ret);
+   return (int)ret;
+ }
+
+ // Forward to api::DiffIterate<>::diff_iterate() with the two boolean
+ // options fixed to true and false (the positions diff_iterate2() fills with
+ // include_parent and whole_object). cb and arg are passed through.
+ int Image::diff_iterate(const char *fromsnapname,
+                         uint64_t ofs, uint64_t len,
+                         int (*cb)(uint64_t, size_t, int, void *),
+                         void *arg)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, diff_iterate_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only,
+              fromsnapname, ofs, len, true, false);
+   int ret = librbd::api::DiffIterate<>::diff_iterate(
+     image_ctx, cls::rbd::UserSnapshotNamespace(), fromsnapname, ofs, len,
+     true, false, cb, arg);
+   tracepoint(librbd, diff_iterate_exit, ret);
+   return ret;
+ }
+
+ // Forward to api::DiffIterate<>::diff_iterate() with caller-chosen
+ // include_parent and whole_object. Shares tracepoints with diff_iterate().
+ int Image::diff_iterate2(const char *fromsnapname, uint64_t ofs, uint64_t len,
+                          bool include_parent, bool whole_object,
+                          int (*cb)(uint64_t, size_t, int, void *), void *arg)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, diff_iterate_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only,
+              fromsnapname, ofs, len, include_parent, whole_object);
+   int ret = librbd::api::DiffIterate<>::diff_iterate(
+     image_ctx, cls::rbd::UserSnapshotNamespace(), fromsnapname, ofs, len,
+     include_parent, whole_object, cb, arg);
+   tracepoint(librbd, diff_iterate_exit, ret);
+   return ret;
+ }
+
+ // Synchronous write of len bytes from bl at ofs with op_flags 0.
+ // Returns -EINVAL when bl holds fewer than len bytes, otherwise the result
+ // of io_work_queue->write(), which is handed a copy of bl.
+ ssize_t Image::write(uint64_t ofs, size_t len, bufferlist& bl)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   const bool short_buffer = bl.length() < len;
+   tracepoint(librbd, write_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len,
+              short_buffer ? NULL : bl.c_str());
+
+   int ret = -EINVAL;
+   if (!short_buffer) {
+     ret = image_ctx->io_work_queue->write(ofs, len, bufferlist{bl}, 0);
+   }
+   tracepoint(librbd, write_exit, ret);
+   return ret;
+ }
+
+ // Synchronous write of len bytes from bl at ofs, passing op_flags through.
+ // Returns -EINVAL when bl holds fewer than len bytes. Shares the write_exit
+ // tracepoint with write().
+ ssize_t Image::write2(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   const bool short_buffer = bl.length() < len;
+   tracepoint(librbd, write2_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len,
+              short_buffer ? NULL : bl.c_str(), op_flags);
+
+   int ret = -EINVAL;
+   if (!short_buffer) {
+     ret = image_ctx->io_work_queue->write(ofs, len, bufferlist{bl}, op_flags);
+   }
+   tracepoint(librbd, write_exit, ret);
+   return ret;
+ }
+
+ // Synchronous discard of len bytes at ofs using the image's
+ // discard_granularity_bytes. Lengths above INT32_MAX are rejected with
+ // -EINVAL.
+ int Image::discard(uint64_t ofs, uint64_t len)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, discard_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len);
+
+   int ret = -EINVAL;
+   if (!(len > std::numeric_limits<int32_t>::max())) {
+     ret = image_ctx->io_work_queue->discard(
+       ofs, len, image_ctx->discard_granularity_bytes);
+   }
+   tracepoint(librbd, discard_exit, ret);
+   return ret;
+ }
+
+ // Synchronous write-same of the pattern in bl across len bytes at ofs.
+ // Returns -EINVAL when bl is empty, len is not a multiple of bl.length(),
+ // or len exceeds INT_MAX. When rbd_discard_on_zeroed_write_same is set and
+ // bl is all zeroes, the request is issued as write_zeroes() instead.
+ ssize_t Image::writesame(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, writesame_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len,
+              bl.length() <= 0 ? NULL : bl.c_str(), bl.length(), op_flags);
+   if (bl.length() <= 0 || len % bl.length() ||
+       len > std::numeric_limits<int>::max()) {
+     tracepoint(librbd, writesame_exit, -EINVAL);
+     return -EINVAL;
+   }
+
+   int ret;
+   const bool discard_zero =
+     image_ctx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+   if (discard_zero && bl.is_zero()) {
+     ret = image_ctx->io_work_queue->write_zeroes(ofs, len, 0U, op_flags);
+   } else {
+     ret = image_ctx->io_work_queue->writesame(ofs, len, bufferlist{bl},
+                                               op_flags);
+   }
+   tracepoint(librbd, writesame_exit, ret);
+   return ret;
+ }
+
+ // Forward directly to io_work_queue->write_zeroes() and return its result.
+ // No tracepoints are emitted here.
+ ssize_t Image::write_zeroes(uint64_t ofs, size_t len, int zero_flags,
+                             int op_flags)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   return image_ctx->io_work_queue->write_zeroes(ofs, len, zero_flags,
+                                                 op_flags);
+ }
+
+ // Synchronous compare-and-write of len bytes at ofs: cmp_bl is the compare
+ // buffer, bl the write buffer, and mismatch_off is handed to the I/O queue.
+ // Returns -EINVAL when either buffer holds fewer than len bytes, otherwise
+ // the result of io_work_queue->compare_and_write().
+ ssize_t Image::compare_and_write(uint64_t ofs, size_t len,
+                                  ceph::bufferlist &cmp_bl, ceph::bufferlist& bl,
+                                  uint64_t *mismatch_off, int op_flags)
+ {
+   ImageCtx *ictx = (ImageCtx *)ctx;
+   tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(),
+              ictx->read_only, ofs, len, cmp_bl.length() < len ? NULL : cmp_bl.c_str(),
+              bl.length() < len ? NULL : bl.c_str(), op_flags);
+
+   // Both buffers must cover the full extent. The failure is reported on
+   // this call's own exit tracepoint so enter/exit events stay paired.
+   if (bl.length() < len || cmp_bl.length() < len) {
+     tracepoint(librbd, compare_and_write_exit, -EINVAL);
+     return -EINVAL;
+   }
+
+   int r = ictx->io_work_queue->compare_and_write(ofs, len, bufferlist{cmp_bl},
+                                                  bufferlist{bl}, mismatch_off,
+                                                  op_flags);
+
+   tracepoint(librbd, compare_and_write_exit, r);
+
+   return r;
+ }
+
+ // Queue an asynchronous write of len bytes from bl at off with op_flags 0;
+ // c is the completion passed to the I/O queue.
+ // Returns -EINVAL when bl holds fewer than len bytes, otherwise 0 once the
+ // request has been handed to the queue.
+ int Image::aio_write(uint64_t off, size_t len, bufferlist& bl,
+                      RBD::AioCompletion *c)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   const bool short_buffer = bl.length() < len;
+   tracepoint(librbd, aio_write_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+              short_buffer ? NULL : bl.c_str(), c->pc);
+   if (short_buffer) {
+     tracepoint(librbd, aio_write_exit, -EINVAL);
+     return -EINVAL;
+   }
+
+   image_ctx->io_work_queue->aio_write(get_aio_completion(c), off, len,
+                                       bufferlist{bl}, 0);
+   tracepoint(librbd, aio_write_exit, 0);
+   return 0;
+ }
+
+ // Queue an asynchronous write of len bytes from bl at off, passing op_flags
+ // through. Returns -EINVAL when bl holds fewer than len bytes, otherwise 0.
+ // Shares the aio_write_exit tracepoint with aio_write().
+ int Image::aio_write2(uint64_t off, size_t len, bufferlist& bl,
+                       RBD::AioCompletion *c, int op_flags)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   const bool short_buffer = bl.length() < len;
+   tracepoint(librbd, aio_write2_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+              short_buffer ? NULL : bl.c_str(), c->pc, op_flags);
+   if (short_buffer) {
+     tracepoint(librbd, aio_write_exit, -EINVAL);
+     return -EINVAL;
+   }
+
+   image_ctx->io_work_queue->aio_write(get_aio_completion(c), off, len,
+                                       bufferlist{bl}, op_flags);
+   tracepoint(librbd, aio_write_exit, 0);
+   return 0;
+ }
+
+ // Queue an asynchronous discard of len bytes at off using the image's
+ // discard_granularity_bytes. Always returns 0.
+ int Image::aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, aio_discard_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+              c->pc);
+   image_ctx->io_work_queue->aio_discard(get_aio_completion(c), off, len,
+                                         image_ctx->discard_granularity_bytes);
+   tracepoint(librbd, aio_discard_exit, 0);
+   return 0;
+ }
+
+ // Queue an asynchronous read of len bytes at off into bl with op_flags 0.
+ // The buffer address range is logged at debug level 10. Always returns 0.
+ int Image::aio_read(uint64_t off, size_t len, bufferlist& bl,
+                     RBD::AioCompletion *c)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, aio_read_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+              bl.c_str(), c->pc);
+   ldout(image_ctx->cct, 10) << "Image::aio_read() buf="
+                             << (void *)bl.c_str() << "~"
+                             << (void *)(bl.c_str() + len - 1) << dendl;
+
+   image_ctx->io_work_queue->aio_read(get_aio_completion(c), off, len,
+                                      io::ReadResult{&bl}, 0);
+   tracepoint(librbd, aio_read_exit, 0);
+   return 0;
+ }
+
+ // Queue an asynchronous read of len bytes at off into bl, passing op_flags
+ // through. Logs with the same message as aio_read() and shares its exit
+ // tracepoint. Always returns 0.
+ int Image::aio_read2(uint64_t off, size_t len, bufferlist& bl,
+                      RBD::AioCompletion *c, int op_flags)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, aio_read2_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+              bl.c_str(), c->pc, op_flags);
+   ldout(image_ctx->cct, 10) << "Image::aio_read() buf="
+                             << (void *)bl.c_str() << "~"
+                             << (void *)(bl.c_str() + len - 1) << dendl;
+
+   image_ctx->io_work_queue->aio_read(get_aio_completion(c), off, len,
+                                      io::ReadResult{&bl}, op_flags);
+   tracepoint(librbd, aio_read_exit, 0);
+   return 0;
+ }
+
+ // Synchronous flush: returns the result of io_work_queue->flush().
+ int Image::flush()
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, flush_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only);
+   int ret = image_ctx->io_work_queue->flush();
+   tracepoint(librbd, flush_exit, ret);
+   return ret;
+ }
+
+ // Queue an asynchronous flush with completion c. Always returns 0.
+ int Image::aio_flush(RBD::AioCompletion *c)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, aio_flush_enter, image_ctx, image_ctx->name.c_str(),
+              image_ctx->snap_name.c_str(), image_ctx->read_only, c->pc);
+   image_ctx->io_work_queue->aio_flush(get_aio_completion(c));
+   tracepoint(librbd, aio_flush_exit, 0);
+   return 0;
+ }
+
+ // Queue an asynchronous write-same of the pattern in bl across len bytes at
+ // off. Returns -EINVAL when bl is empty or len is not a multiple of
+ // bl.length(); otherwise 0 once the request is queued. When
+ // rbd_discard_on_zeroed_write_same is set and bl is all zeroes, the request
+ // is queued as aio_write_zeroes() instead.
+ int Image::aio_writesame(uint64_t off, size_t len, bufferlist& bl,
+                          RBD::AioCompletion *c, int op_flags)
+ {
+   ImageCtx *ictx = (ImageCtx *)ctx;
+   // Trace the pattern buffer whenever one exists, matching writesame().
+   // The previous "<= len" test suppressed the pointer for valid requests
+   // and passed it only when the pattern was longer than len.
+   tracepoint(librbd, aio_writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+              ictx->read_only, off, len, bl.length() <= 0 ? NULL : bl.c_str(), bl.length(),
+              c->pc, op_flags);
+   if (bl.length() <= 0 || len % bl.length()) {
+     tracepoint(librbd, aio_writesame_exit, -EINVAL);
+     return -EINVAL;
+   }
+
+   bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+   if (discard_zero && bl.is_zero()) {
+     ictx->io_work_queue->aio_write_zeroes(get_aio_completion(c), off, len, 0U,
+                                           op_flags, true);
+     tracepoint(librbd, aio_writesame_exit, 0);
+     return 0;
+   }
+
+   ictx->io_work_queue->aio_writesame(get_aio_completion(c), off, len,
+                                      bufferlist{bl}, op_flags);
+   tracepoint(librbd, aio_writesame_exit, 0);
+   return 0;
+ }
+
+ // Queue an asynchronous write-zeroes request on the I/O queue with
+ // completion c; the final argument to the queue call is fixed to true.
+ // Always returns 0; no tracepoints are emitted here.
+ int Image::aio_write_zeroes(uint64_t off, size_t len, RBD::AioCompletion *c,
+                             int zero_flags, int op_flags)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   image_ctx->io_work_queue->aio_write_zeroes(get_aio_completion(c), off, len,
+                                              zero_flags, op_flags, true);
+   return 0;
+ }
+
+ // Queue an asynchronous compare-and-write of len bytes at off: cmp_bl is
+ // the compare buffer, bl the write buffer, and mismatch_off is handed to
+ // the I/O queue. Returns -EINVAL when either buffer holds fewer than len
+ // bytes, otherwise 0 once the request is queued.
+ int Image::aio_compare_and_write(uint64_t off, size_t len,
+                                  ceph::bufferlist& cmp_bl, ceph::bufferlist& bl,
+                                  RBD::AioCompletion *c, uint64_t *mismatch_off,
+                                  int op_flags)
+ {
+   ImageCtx *ictx = (ImageCtx *)ctx;
+   tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(),
+              ictx->read_only, off, len, cmp_bl.length() < len ? NULL : cmp_bl.c_str(),
+              bl.length() < len ? NULL : bl.c_str(), c->pc, op_flags);
+
+   // Both buffers must cover the full extent. The failure is reported on
+   // the aio exit tracepoint so it pairs with the aio enter event above.
+   if (bl.length() < len || cmp_bl.length() < len) {
+     tracepoint(librbd, aio_compare_and_write_exit, -EINVAL);
+     return -EINVAL;
+   }
+
+   ictx->io_work_queue->aio_compare_and_write(get_aio_completion(c), off, len,
+                                              bufferlist{cmp_bl}, bufferlist{bl},
+                                              mismatch_off, op_flags, false);
+
+   tracepoint(librbd, aio_compare_and_write_exit, 0);
+
+   return 0;
+ }
+
+ // Forward to librbd::invalidate_cache() and return its result.
+ int Image::invalidate_cache()
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, invalidate_cache_enter, image_ctx,
+              image_ctx->name.c_str(), image_ctx->snap_name.c_str(),
+              image_ctx->read_only);
+   int ret = librbd::invalidate_cache(image_ctx);
+   tracepoint(librbd, invalidate_cache_exit, ret);
+   return ret;
+ }
+
+ // Collect up to numcomp completed I/O events via librbd::poll_io_events()
+ // and store the public completion handle of each in comps.
+ // Returns -EINVAL for a non-positive numcomp, otherwise the result of
+ // librbd::poll_io_events(); when that is positive, comps[0..r) are filled.
+ int Image::poll_io_events(RBD::AioCompletion **comps, int numcomp)
+ {
+   ImageCtx *ictx = (ImageCtx *)ctx;
+   tracepoint(librbd, poll_io_events_enter, ictx, numcomp);
+
+   // Reject non-positive counts up front: the scratch array cannot be sized
+   // from them. NOTE(review): librbd::poll_io_events() is expected to report
+   // -EINVAL for such counts as well -- confirm against its definition.
+   if (numcomp <= 0) {
+     tracepoint(librbd, poll_io_events_exit, -EINVAL);
+     return -EINVAL;
+   }
+
+   // Heap-backed scratch storage instead of a variable-length array, which
+   // is not standard C++ and would put a caller-controlled size on the stack.
+   std::vector<io::AioCompletion *> cs(numcomp);
+   int r = librbd::poll_io_events(ictx, cs.data(), numcomp);
+   tracepoint(librbd, poll_io_events_exit, r);
+   for (int i = 0; i < r; ++i) {
+     comps[i] = (RBD::AioCompletion *)cs[i]->rbd_comp;
+   }
+   return r;
+ }
+
+ // Look up key via librbd::metadata_get(); the value is written through
+ // value. The exit tracepoint carries the value only when the lookup
+ // succeeded, and NULL otherwise.
+ int Image::metadata_get(const std::string &key, std::string *value)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, metadata_get_enter, image_ctx, key.c_str());
+   int ret = librbd::metadata_get(image_ctx, key, value);
+   if (ret >= 0) {
+     tracepoint(librbd, metadata_get_exit, ret, key.c_str(), value->c_str());
+   } else {
+     tracepoint(librbd, metadata_get_exit, ret, key.c_str(), NULL);
+   }
+   return ret;
+ }
+
+ // Forward the key/value pair to operations->metadata_set() and return its
+ // result.
+ int Image::metadata_set(const std::string &key, const std::string &value)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, metadata_set_enter, image_ctx, key.c_str(),
+              value.c_str());
+   int ret = image_ctx->operations->metadata_set(key, value);
+   tracepoint(librbd, metadata_set_exit, ret);
+   return ret;
+ }
+
+ // Forward key to operations->metadata_remove() and return its result.
+ int Image::metadata_remove(const std::string &key)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, metadata_remove_enter, image_ctx, key.c_str());
+   int ret = image_ctx->operations->metadata_remove(key);
+   tracepoint(librbd, metadata_remove_exit, ret);
+   return ret;
+ }
+
+ // Fill pairs via librbd::metadata_list(start, max) and, on success, emit
+ // one trace entry per key/value pair. Returns that call's result.
+ int Image::metadata_list(const std::string &start, uint64_t max, map<string, bufferlist> *pairs)
+ {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, metadata_list_enter, image_ctx);
+   int ret = librbd::metadata_list(image_ctx, start, max, pairs);
+   if (ret >= 0) {
+     for (auto &entry : *pairs) {
+       tracepoint(librbd, metadata_list_entry, entry.first.c_str(),
+                  entry.second.c_str());
+     }
+   }
+   tracepoint(librbd, metadata_list_exit, ret);
+   return ret;
+ }
+
+ // Forward to api::Mirror<>::image_enable() with its second argument fixed
+ // to false.
+ int Image::mirror_image_enable() {
+   return librbd::api::Mirror<>::image_enable((ImageCtx *)ctx, false);
+ }
+
+ // Forward to api::Mirror<>::image_disable(), passing force through.
+ int Image::mirror_image_disable(bool force) {
+   return librbd::api::Mirror<>::image_disable((ImageCtx *)ctx, force);
+ }
+
+ // Forward to the synchronous api::Mirror<>::image_promote(), passing force
+ // through.
+ int Image::mirror_image_promote(bool force) {
+   return librbd::api::Mirror<>::image_promote((ImageCtx *)ctx, force);
+ }
+
+ // Forward to the synchronous api::Mirror<>::image_demote().
+ int Image::mirror_image_demote() {
+   return librbd::api::Mirror<>::image_demote((ImageCtx *)ctx);
+ }
+
+ // Forward to api::Mirror<>::image_resync().
+ int Image::mirror_image_resync()
+ {
+   return librbd::api::Mirror<>::image_resync((ImageCtx *)ctx);
+ }
+
+ // Fetch mirroring info into mirror_image_info.
+ // Returns -ERANGE when info_size does not equal
+ // sizeof(mirror_image_info_t); otherwise the result of
+ // api::Mirror<>::image_get_info().
+ int Image::mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+                                  size_t info_size) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+
+   const bool size_matches = (info_size == sizeof(mirror_image_info_t));
+   if (!size_matches) {
+     return -ERANGE;
+   }
+   return librbd::api::Mirror<>::image_get_info(image_ctx, mirror_image_info);
+ }
+
+ // Fetch mirroring status into mirror_image_status.
+ // Returns -ERANGE when status_size does not equal
+ // sizeof(mirror_image_status_t); otherwise the result of
+ // api::Mirror<>::image_get_status().
+ int Image::mirror_image_get_status(mirror_image_status_t *mirror_image_status,
+                                    size_t status_size) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+
+   const bool size_matches = (status_size == sizeof(mirror_image_status_t));
+   if (!size_matches) {
+     return -ERANGE;
+   }
+   return librbd::api::Mirror<>::image_get_status(image_ctx,
+                                                  mirror_image_status);
+ }
+
+ // Forward to api::Mirror<>::image_get_instance_id(); the id is written
+ // through instance_id.
+ int Image::mirror_image_get_instance_id(std::string *instance_id) {
+   return librbd::api::Mirror<>::image_get_instance_id((ImageCtx *)ctx,
+                                                       instance_id);
+ }
+
+ // Start an asynchronous promote: a C_AioCompletion wrapping c is passed to
+ // api::Mirror<>::image_promote() as its final argument. Always returns 0.
+ int Image::aio_mirror_image_promote(bool force, RBD::AioCompletion *c) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   auto *on_finish = new C_AioCompletion(
+     image_ctx, librbd::io::AIO_TYPE_GENERIC, get_aio_completion(c));
+   librbd::api::Mirror<>::image_promote(image_ctx, force, on_finish);
+   return 0;
+ }
+
+ // Start an asynchronous demote: a C_AioCompletion wrapping c is passed to
+ // api::Mirror<>::image_demote() as its final argument. Always returns 0.
+ int Image::aio_mirror_image_demote(RBD::AioCompletion *c) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   auto *on_finish = new C_AioCompletion(
+     image_ctx, librbd::io::AIO_TYPE_GENERIC, get_aio_completion(c));
+   librbd::api::Mirror<>::image_demote(image_ctx, on_finish);
+   return 0;
+ }
+
+ // Start an asynchronous fetch of mirroring info into mirror_image_info.
+ // Returns -ERANGE (without starting anything) when info_size does not
+ // equal sizeof(mirror_image_info_t); otherwise 0.
+ int Image::aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+                                      size_t info_size,
+                                      RBD::AioCompletion *c) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+
+   if (info_size != sizeof(mirror_image_info_t)) {
+     return -ERANGE;
+   }
+
+   auto *on_finish = new C_AioCompletion(
+     image_ctx, librbd::io::AIO_TYPE_GENERIC, get_aio_completion(c));
+   librbd::api::Mirror<>::image_get_info(image_ctx, mirror_image_info,
+                                         on_finish);
+   return 0;
+ }
+
+ // Start an asynchronous fetch of mirroring status into status.
+ // Returns -ERANGE (without starting anything) when status_size does not
+ // equal sizeof(mirror_image_status_t); otherwise 0.
+ int Image::aio_mirror_image_get_status(mirror_image_status_t *status,
+                                        size_t status_size,
+                                        RBD::AioCompletion *c) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+
+   if (status_size != sizeof(mirror_image_status_t)) {
+     return -ERANGE;
+   }
+
+   auto *on_finish = new C_AioCompletion(
+     image_ctx, librbd::io::AIO_TYPE_GENERIC, get_aio_completion(c));
+   librbd::api::Mirror<>::image_get_status(image_ctx, status, on_finish);
+   return 0;
+ }
+
+ // Register wctx through state->register_update_watcher(); the handle is
+ // written through handle, which is also read for the exit tracepoint.
+ int Image::update_watch(UpdateWatchCtx *wctx, uint64_t *handle) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, update_watch_enter, image_ctx, wctx);
+   int ret = image_ctx->state->register_update_watcher(wctx, handle);
+   tracepoint(librbd, update_watch_exit, ret, *handle);
+   return ret;
+ }
+
+ // Unregister the watcher identified by handle through
+ // state->unregister_update_watcher() and return its result.
+ int Image::update_unwatch(uint64_t handle) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, update_unwatch_enter, image_ctx, handle);
+   int ret = image_ctx->state->unregister_update_watcher(handle);
+   tracepoint(librbd, update_unwatch_exit, ret);
+   return ret;
+ }
+
+ // Fill watchers via librbd::list_watchers() and, on success, emit one
+ // trace entry per watcher. Returns that call's result.
+ int Image::list_watchers(std::list<librbd::image_watcher_t> &watchers) {
+   ImageCtx *image_ctx = (ImageCtx *)ctx;
+   tracepoint(librbd, list_watchers_enter, image_ctx,
+              image_ctx->name.c_str(), image_ctx->snap_name.c_str(),
+              image_ctx->read_only);
+   int ret = librbd::list_watchers(image_ctx, watchers);
+   if (ret >= 0) {
+     for (auto it = watchers.begin(); it != watchers.end(); ++it) {
+       tracepoint(librbd, list_watchers_entry, it->addr.c_str(), it->id,
+                  it->cookie);
+     }
+   }
+   tracepoint(librbd, list_watchers_exit, ret, watchers.size());
+   return ret;
+ }
+
+ // Forward to api::Config<>::list(); the options are written through
+ // options.
+ int Image::config_list(std::vector<config_option_t> *options) {
+   return librbd::api::Config<>::list((ImageCtx *)ctx, options);
+ }
+
+} // namespace librbd
+
+// Report the library version. Each output pointer is optional: a null
+// pointer is skipped.
+extern "C" void rbd_version(int *major, int *minor, int *extra)
+{
+  if (major != nullptr) {
+    *major = LIBRBD_VER_MAJOR;
+  }
+  if (minor != nullptr) {
+    *minor = LIBRBD_VER_MINOR;
+  }
+  if (extra != nullptr) {
+    *extra = LIBRBD_VER_EXTRA;
+  }
+}
+
+// C entry point: forwards opts to librbd::image_options_create().
+extern "C" void rbd_image_options_create(rbd_image_options_t* opts)
+{
+  librbd::image_options_create(opts);
+}
+
+// C entry point: forwards opts to librbd::image_options_destroy().
+extern "C" void rbd_image_options_destroy(rbd_image_options_t opts)
+{
+  librbd::image_options_destroy(opts);
+}
+
+// C entry point: sets option optname to the string optval through
+// librbd::image_options_set() and returns its result.
+extern "C" int rbd_image_options_set_string(rbd_image_options_t opts,
+                                            int optname,
+                                            const char* optval)
+{
+  int ret = librbd::image_options_set(opts, optname, optval);
+  return ret;
+}
+
+// C entry point: sets option optname to the integer optval through
+// librbd::image_options_set() and returns its result.
+extern "C" int rbd_image_options_set_uint64(rbd_image_options_t opts,
+                                            int optname,
+                                            uint64_t optval)
+{
+  int ret = librbd::image_options_set(opts, optname, optval);
+  return ret;
+}
+
+// C entry point: copies the string value of option optname into optval,
+// a buffer of maxlen bytes.
+// Returns the negative result of librbd::image_options_get() on failure,
+// -E2BIG when the value plus its terminator does not fit, and 0 otherwise.
+extern "C" int rbd_image_options_get_string(rbd_image_options_t opts,
+                                            int optname,
+                                            char* optval, size_t maxlen)
+{
+  std::string value;
+  int ret = librbd::image_options_get(opts, optname, &value);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Room for the terminating NUL is required, hence "<=" rather than "<".
+  if (maxlen <= value.size()) {
+    return -E2BIG;
+  }
+
+  strncpy(optval, value.c_str(), maxlen);
+  return 0;
+}
+
+// C entry point: reads the integer value of option optname into *optval
+// through librbd::image_options_get() and returns its result.
+extern "C" int rbd_image_options_get_uint64(rbd_image_options_t opts,
+                                            int optname,
+                                            uint64_t* optval)
+{
+  int ret = librbd::image_options_get(opts, optname, optval);
+  return ret;
+}
+
+// C entry point: forwards to librbd::image_options_is_set(); the answer is
+// written through is_set.
+extern "C" int rbd_image_options_is_set(rbd_image_options_t opts,
+                                        int optname,
+                                        bool* is_set)
+{
+  int ret = librbd::image_options_is_set(opts, optname, is_set);
+  return ret;
+}
+
+// C entry point: forwards to librbd::image_options_unset() for optname.
+extern "C" int rbd_image_options_unset(rbd_image_options_t opts, int optname)
+{
+  int ret = librbd::image_options_unset(opts, optname);
+  return ret;
+}
+
+// C entry point: forwards opts to librbd::image_options_clear().
+extern "C" void rbd_image_options_clear(rbd_image_options_t opts)
+{
+  librbd::image_options_clear(opts);
+}
+
+// C entry point: returns the result of librbd::image_options_is_empty()
+// converted to int.
+extern "C" int rbd_image_options_is_empty(rbd_image_options_t opts)
+{
+  int ret = librbd::image_options_is_empty(opts);
+  return ret;
+}
+
+/* pool mirroring */
+// C entry point: copies the mirror site name into name.
+// On entry *max_len is the capacity of name; once the lookup succeeds it is
+// set to the required size (name length plus terminator).
+// Returns the negative lookup result, -ERANGE when the buffer is too small,
+// or 0 after copying.
+extern "C" int rbd_mirror_site_name_get(rados_t cluster, char *name,
+                                        size_t *max_len) {
+  librados::Rados rados;
+  librados::Rados::from_rados_t(cluster, rados);
+
+  std::string site_name;
+  int ret = librbd::api::Mirror<>::site_name_get(rados, &site_name);
+  if (ret < 0) {
+    return ret;
+  }
+
+  const auto required_len = site_name.size() + 1;
+  const bool fits = !(*max_len < required_len);
+  *max_len = required_len;
+  if (!fits) {
+    return -ERANGE;
+  }
+
+  strcpy(name, site_name.c_str());
+  return 0;
+}
+
+// C entry point: forwards name to api::Mirror<>::site_name_set() for the
+// given cluster handle and returns its result.
+extern "C" int rbd_mirror_site_name_set(rados_t cluster, const char *name) {
+  librados::Rados rados;
+  librados::Rados::from_rados_t(cluster, rados);
+  int ret = librbd::api::Mirror<>::site_name_set(rados, name);
+  return ret;
+}
+
+// C entry point: forwards to api::Mirror<>::mode_get(); the mode is written
+// through mirror_mode.
+extern "C" int rbd_mirror_mode_get(rados_ioctx_t p,
+                                   rbd_mirror_mode_t *mirror_mode) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  int ret = librbd::api::Mirror<>::mode_get(io_ctx, mirror_mode);
+  return ret;
+}
+
+// C entry point: forwards mirror_mode to api::Mirror<>::mode_set() and
+// returns its result.
+extern "C" int rbd_mirror_mode_set(rados_ioctx_t p,
+                                   rbd_mirror_mode_t mirror_mode) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  int ret = librbd::api::Mirror<>::mode_set(io_ctx, mirror_mode);
+  return ret;
+}
+
+// C entry point: creates a peer bootstrap token and copies it into token.
+// On entry *max_len is the capacity of token; once creation succeeds it is
+// set to the required size (token length plus terminator).
+// Returns the negative creation result, -ERANGE when the buffer is too
+// small, or 0 after copying.
+extern "C" int rbd_mirror_peer_bootstrap_create(rados_ioctx_t p, char *token,
+                                                size_t *max_len) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::string token_str;
+  int ret = librbd::api::Mirror<>::peer_bootstrap_create(io_ctx, &token_str);
+  if (ret < 0) {
+    return ret;
+  }
+
+  const auto required_len = token_str.size() + 1;
+  const bool fits = !(*max_len < required_len);
+  *max_len = required_len;
+  if (!fits) {
+    return -ERANGE;
+  }
+
+  strcpy(token, token_str.c_str());
+  return 0;
+}
+
+// C entry point: forwards direction and token to
+// api::Mirror<>::peer_bootstrap_import() and returns its result.
+extern "C" int rbd_mirror_peer_bootstrap_import(
+    rados_ioctx_t p, rbd_mirror_peer_direction_t direction,
+    const char *token) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  int ret = librbd::api::Mirror<>::peer_bootstrap_import(io_ctx, direction,
+                                                         token);
+  return ret;
+}
+
+// C entry point: adds a mirror peer and copies its UUID into uuid.
+// Returns -E2BIG when uuid_max_length cannot hold a 36-character UUID plus
+// terminator; otherwise the result of api::Mirror<>::peer_add(). The UUID
+// is copied (always NUL-terminated) only when that result is non-negative.
+extern "C" int rbd_mirror_peer_add(rados_ioctx_t p, char *uuid,
+                                   size_t uuid_max_length,
+                                   const char *cluster_name,
+                                   const char *client_name) {
+  static const std::size_t UUID_LENGTH = 36;
+
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  if (!(uuid_max_length >= UUID_LENGTH + 1)) {
+    return -E2BIG;
+  }
+
+  std::string new_uuid;
+  int ret = librbd::api::Mirror<>::peer_add(io_ctx, &new_uuid, cluster_name,
+                                            client_name);
+  if (ret < 0) {
+    return ret;
+  }
+
+  strncpy(uuid, new_uuid.c_str(), uuid_max_length);
+  uuid[uuid_max_length - 1] = '\0';
+  return ret;
+}
+
+// C entry point: forwards uuid to api::Mirror<>::peer_remove() and returns
+// its result.
+extern "C" int rbd_mirror_peer_remove(rados_ioctx_t p, const char *uuid) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::api::Mirror<>::peer_remove(io_ctx, uuid);
+}
+
+// C entry point: lists mirror peers into the caller's peers array.
+// On entry *max_peers is the array capacity. When it is too small it is set
+// to the required count and -ERANGE is returned; on success it is set to
+// the number of entries written. Each string field is strdup()'d.
+extern "C" int rbd_mirror_peer_list(rados_ioctx_t p,
+                                    rbd_mirror_peer_t *peers, int *max_peers) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::vector<librbd::mirror_peer_t> cpp_peers;
+  int ret = librbd::api::Mirror<>::peer_list(io_ctx, &cpp_peers);
+  if (ret < 0) {
+    return ret;
+  }
+
+  const int peer_count = static_cast<int>(cpp_peers.size());
+  if (*max_peers < peer_count) {
+    *max_peers = peer_count;
+    return -ERANGE;
+  }
+
+  for (int idx = 0; idx < peer_count; ++idx) {
+    const librbd::mirror_peer_t &src = cpp_peers[idx];
+    rbd_mirror_peer_t &dst = peers[idx];
+    dst.uuid = strdup(src.uuid.c_str());
+    dst.cluster_name = strdup(src.cluster_name.c_str());
+    dst.client_name = strdup(src.client_name.c_str());
+  }
+  *max_peers = peer_count;
+  return 0;
+}
+
+// C entry point: frees the three string fields of each of the first
+// max_peers entries of peers. The array itself is not freed.
+extern "C" void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers,
+                                             int max_peers) {
+  for (int idx = 0; idx < max_peers; ++idx) {
+    rbd_mirror_peer_t &peer = peers[idx];
+    free(peer.uuid);
+    free(peer.cluster_name);
+    free(peer.client_name);
+  }
+}
+
+// C entry point: forwards uuid and client_name to
+// api::Mirror<>::peer_set_client() and returns its result.
+extern "C" int rbd_mirror_peer_set_client(rados_ioctx_t p, const char *uuid,
+                                          const char *client_name) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  int ret = librbd::api::Mirror<>::peer_set_client(io_ctx, uuid, client_name);
+  return ret;
+}
+
+// C entry point: forwards uuid and cluster_name to
+// api::Mirror<>::peer_set_cluster() and returns its result.
+extern "C" int rbd_mirror_peer_set_cluster(rados_ioctx_t p, const char *uuid,
+                                           const char *cluster_name) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  int ret = librbd::api::Mirror<>::peer_set_cluster(io_ctx, uuid,
+                                                    cluster_name);
+  return ret;
+}
+
+// C entry point: fetches the attributes of peer uuid and packs them into
+// two caller buffers as consecutive NUL-terminated strings (keys in keys,
+// values in values, in map order).
+// On entry *max_key_len / *max_val_len are the buffer capacities. Once the
+// lookup succeeds they are set to the required sizes and *key_value_count
+// to the number of pairs, whether or not the buffers were large enough.
+// Returns the negative lookup result, -ERANGE when either buffer is too
+// small, or 0 after copying.
+extern "C" int rbd_mirror_peer_get_attributes(
+    rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+    char *values, size_t *max_val_len, size_t *key_value_count) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::map<std::string, std::string> attributes;
+  int ret = librbd::api::Mirror<>::peer_get_attributes(io_ctx, uuid,
+                                                       &attributes);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Each string occupies its length plus one byte for the terminator.
+  size_t keys_needed = 0;
+  size_t vals_needed = 0;
+  for (auto& attr : attributes) {
+    keys_needed += attr.first.size() + 1;
+    vals_needed += attr.second.length() + 1;
+  }
+
+  const bool keys_fit = !(*max_key_len < keys_needed);
+  const bool vals_fit = !(*max_val_len < vals_needed);
+
+  *max_key_len = keys_needed;
+  *max_val_len = vals_needed;
+  *key_value_count = attributes.size();
+  if (!keys_fit || !vals_fit) {
+    return -ERANGE;
+  }
+
+  char *key_cursor = keys;
+  char *val_cursor = values;
+  for (auto& attr : attributes) {
+    const size_t key_bytes = attr.first.size() + 1;
+    strncpy(key_cursor, attr.first.c_str(), key_bytes);
+    key_cursor += key_bytes;
+
+    const size_t val_bytes = attr.second.length() + 1;
+    strncpy(val_cursor, attr.second.c_str(), val_bytes);
+    val_cursor += val_bytes;
+  }
+
+  return 0;
+}
+
+// C entry point: sets the attributes of peer uuid. keys and values are read
+// as count consecutive NUL-terminated strings each; the i-th key is paired
+// with the i-th value. A repeated key keeps the last value seen.
+// Returns the result of api::Mirror<>::peer_set_attributes().
+extern "C" int rbd_mirror_peer_set_attributes(
+    rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+    size_t count) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::map<std::string, std::string> attributes;
+
+  const char *key_cursor = keys;
+  const char *val_cursor = values;
+  for (size_t remaining = count; remaining > 0; --remaining) {
+    attributes[key_cursor] = val_cursor;
+    // Step over each string and its terminator.
+    key_cursor += strlen(key_cursor) + 1;
+    val_cursor += strlen(val_cursor) + 1;
+  }
+
+  return librbd::api::Mirror<>::peer_set_attributes(io_ctx, uuid, attributes);
+}
+
+// C entry point: lists up to max mirror image statuses starting from
+// start_id. For each result the image id is strdup()'d into image_ids[i]
+// and the status converted into images[i]; *len receives the entry count.
+// Asserts that no more than max entries were produced.
+// Returns the negative lookup result, or 0.
+extern "C" int rbd_mirror_image_status_list(rados_ioctx_t p,
+    const char *start_id, size_t max, char **image_ids,
+    rbd_mirror_image_status_t *images, size_t *len) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  std::map<std::string, librbd::mirror_image_status_t> cpp_images;
+
+  int ret = librbd::api::Mirror<>::image_status_list(io_ctx, start_id, max,
+                                                     &cpp_images);
+  if (ret < 0) {
+    return ret;
+  }
+
+  size_t filled = 0;
+  for (auto entry = cpp_images.begin(); entry != cpp_images.end();
+       ++entry, ++filled) {
+    ceph_assert(filled < max);
+    image_ids[filled] = strdup(entry->first.c_str());
+    mirror_image_status_cpp_to_c(entry->second, &images[filled]);
+  }
+  *len = filled;
+  return 0;
+}
+
+// C entry point: frees, for each of the first len entries, image_ids[i]
+// and the name, info.global_id and description strings of images[i].
+// The arrays themselves are not freed.
+extern "C" void rbd_mirror_image_status_list_cleanup(char **image_ids,
+    rbd_mirror_image_status_t *images, size_t len) {
+  for (size_t idx = 0; idx < len; ++idx) {
+    rbd_mirror_image_status_t &status = images[idx];
+    free(image_ids[idx]);
+    free(status.name);
+    free(status.info.global_id);
+    free(status.description);
+  }
+}
+
+// C entry point: summarizes mirror image states as parallel arrays:
+// states[i] is a state and counts[i] the count reported for it.
+// On entry *maxlen is the capacity of both arrays. When it is too small,
+// nothing is written to the arrays, *maxlen is set to the required number
+// of entries and -ERANGE is returned. On success *maxlen is set to the
+// number of entries written and 0 is returned. A negative result from
+// api::Mirror<>::image_status_summary() is returned unchanged.
+extern "C" int rbd_mirror_image_status_summary(rados_ioctx_t p,
+    rbd_mirror_image_status_state_t *states, int *counts, size_t *maxlen) {
+
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::map<librbd::mirror_image_status_state_t, int> states_;
+  int r = librbd::api::Mirror<>::image_status_summary(io_ctx, &states_);
+  if (r < 0) {
+    return r;
+  }
+
+  // Check capacity before copying so a short buffer is never left partially
+  // filled, and tell the caller how many entries a retry needs.
+  if (states_.size() > *maxlen) {
+    *maxlen = states_.size();
+    return -ERANGE;
+  }
+
+  size_t i = 0;
+  for (auto &it : states_) {
+    states[i] = it.first;
+    counts[i] = it.second;
+    i++;
+  }
+  *maxlen = i;
+  return 0;
+}
+
+// C entry point: lists up to max (image id, instance id) pairs starting
+// from start_id. Both strings of each pair are strdup()'d into
+// image_ids[i] and instance_ids[i]; *len receives the pair count.
+// Asserts that no more than max pairs were produced.
+// Returns the negative lookup result, or 0.
+extern "C" int rbd_mirror_image_instance_id_list(
+    rados_ioctx_t p, const char *start_id, size_t max, char **image_ids,
+    char **instance_ids, size_t *len) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  std::map<std::string, std::string> cpp_instance_ids;
+
+  int ret = librbd::api::Mirror<>::image_instance_id_list(io_ctx, start_id,
+                                                          max,
+                                                          &cpp_instance_ids);
+  if (ret < 0) {
+    return ret;
+  }
+
+  size_t filled = 0;
+  for (auto entry = cpp_instance_ids.begin();
+       entry != cpp_instance_ids.end(); ++entry, ++filled) {
+    ceph_assert(filled < max);
+    image_ids[filled] = strdup(entry->first.c_str());
+    instance_ids[filled] = strdup(entry->second.c_str());
+  }
+  *len = filled;
+  return 0;
+}
+
+// Frees the strings stored in the first `len` slots of both arrays.
+extern "C" void rbd_mirror_image_instance_id_list_cleanup(
+    char **image_ids, char **instance_ids, size_t len) {
+  size_t idx = 0;
+  while (idx < len) {
+    free(image_ids[idx]);
+    free(instance_ids[idx]);
+    ++idx;
+  }
+}
+
+/* helpers */
+
+// Frees the `id` and `name` strings held by `image`; the struct itself is
+// not freed.
+extern "C" void rbd_image_spec_cleanup(rbd_image_spec_t *image)
+{
+  rbd_image_spec_t &spec = *image;
+  free(spec.id);
+  free(spec.name);
+}
+
+// Applies rbd_image_spec_cleanup() to each of the first `num_images`
+// entries of `images`.
+extern "C" void rbd_image_spec_list_cleanup(rbd_image_spec_t *images,
+                                            size_t num_images)
+{
+  size_t pos = 0;
+  while (pos < num_images) {
+    rbd_image_spec_cleanup(images + pos);
+    ++pos;
+  }
+}
+
+// Frees the four strings held by `image` (pool name, pool namespace, image
+// id and image name); the struct itself is not freed.
+extern "C" void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image)
+{
+  rbd_linked_image_spec_t &spec = *image;
+  free(spec.pool_name);
+  free(spec.pool_namespace);
+  free(spec.image_id);
+  free(spec.image_name);
+}
+
+// Applies rbd_linked_image_spec_cleanup() to each of the first `num_images`
+// entries of `images`.
+extern "C" void rbd_linked_image_spec_list_cleanup(
+    rbd_linked_image_spec_t *images, size_t num_images)
+{
+  size_t pos = 0;
+  while (pos < num_images) {
+    rbd_linked_image_spec_cleanup(images + pos);
+    ++pos;
+  }
+}
+
+// Frees the `name` string held by `snap`; the struct itself is not freed.
+extern "C" void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap)
+{
+  char *snap_name = snap->name;
+  free(snap_name);
+}
+
+/* images */
+// Writes the names of all images in the pool into `names` as consecutive
+// NUL-terminated strings.
+//
+// *size is the capacity of `names` on entry. If it is smaller than the
+// total needed, *size is set to the required byte count and -ERANGE is
+// returned. A NULL `names` with sufficient *size yields -EINVAL. On success
+// the number of bytes written is returned.
+extern "C" int rbd_list(rados_ioctx_t p, char *names, size_t *size)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+  tracepoint(librbd, list_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id());
+
+  std::vector<librbd::image_spec_t> specs;
+  int ret = librbd::api::Image<>::list_images(ioctx, &specs);
+  if (ret < 0) {
+    tracepoint(librbd, list_exit, ret, *size);
+    return ret;
+  }
+
+  // Each name occupies its length plus one byte for the terminator.
+  size_t needed = 0;
+  for (size_t idx = 0; idx < specs.size(); ++idx) {
+    needed += specs[idx].name.size() + 1;
+  }
+
+  if (*size < needed) {
+    *size = needed;
+    tracepoint(librbd, list_exit, -ERANGE, *size);
+    return -ERANGE;
+  }
+  if (names == nullptr) {
+    tracepoint(librbd, list_exit, -EINVAL, *size);
+    return -EINVAL;
+  }
+
+  char *cursor = names;
+  for (const auto &spec : specs) {
+    const char *name = spec.name.c_str();
+    tracepoint(librbd, list_entry, name);
+    strcpy(cursor, name);
+    cursor += strlen(name) + 1;
+  }
+
+  int total = (int)needed;
+  tracepoint(librbd, list_exit, total, *size);
+  return total;
+}
+
+// Fills `images` with strdup()'d id/name pairs for every image in the pool.
+//
+// *size is the capacity of `images` on entry; the first *size entries are
+// zeroed before listing. If the capacity is too small, *size is set to the
+// number of images and -ERANGE is returned. On success *size is the number
+// of entries filled and 0 is returned.
+extern "C" int rbd_list2(rados_ioctx_t p, rbd_image_spec_t *images,
+                         size_t *size)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+  tracepoint(librbd, list_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id());
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(images, 0, sizeof(*images) * *size);
+
+  std::vector<librbd::image_spec_t> specs;
+  int ret = librbd::api::Image<>::list_images(ioctx, &specs);
+  if (ret < 0) {
+    tracepoint(librbd, list_exit, ret, *size);
+    return ret;
+  }
+
+  const size_t count = specs.size();
+  if (*size < count) {
+    *size = count;
+    tracepoint(librbd, list_exit, -ERANGE, *size);
+    return -ERANGE;
+  }
+
+  *size = count;
+  size_t slot = 0;
+  for (const auto &spec : specs) {
+    images[slot].id = strdup(spec.id.c_str());
+    images[slot].name = strdup(spec.name.c_str());
+    ++slot;
+  }
+  tracepoint(librbd, list_exit, 0, *size);
+  return 0;
+}
+
+// Creates image `name` with the requested `size` through the four-argument
+// librbd::create() overload. `*order` is read for the enter tracepoint,
+// handed to create() and read again for the exit tracepoint.
+extern "C" int rbd_create(rados_ioctx_t p, const char *name, uint64_t size, int *order)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, create_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name, size, *order);
+  int ret = librbd::create(ioctx, name, size, order);
+  tracepoint(librbd, create_exit, ret, *order);
+  return ret;
+}
+
+// Creates image `name` with the given `size` and `features`. Calls
+// librbd::create() with `false` as the fourth argument and 0 for the final
+// two arguments.
+extern "C" int rbd_create2(rados_ioctx_t p, const char *name,
+                           uint64_t size, uint64_t features,
+                           int *order)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, create2_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name, size, features, *order);
+  int ret = librbd::create(ioctx, name, size, false, features, order, 0, 0);
+  tracepoint(librbd, create2_exit, ret, *order);
+  return ret;
+}
+
+// Creates image `name` with the given `size`, `features` and striping
+// parameters. Calls librbd::create() with `false` as the fourth argument.
+extern "C" int rbd_create3(rados_ioctx_t p, const char *name,
+                           uint64_t size, uint64_t features,
+                           int *order,
+                           uint64_t stripe_unit, uint64_t stripe_count)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, create3_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name, size, features, *order, stripe_unit,
+             stripe_count);
+  int ret = librbd::create(ioctx, name, size, false, features, order,
+                           stripe_unit, stripe_count);
+  tracepoint(librbd, create3_exit, ret, *order);
+  return ret;
+}
+
+// Creates image `name` with the given `size`, taking the remaining settings
+// from the option set `opts`. The id and the two trailing string arguments
+// passed to librbd::create() are empty and the final flag is `false`.
+extern "C" int rbd_create4(rados_ioctx_t p, const char *name,
+                           uint64_t size, rbd_image_options_t opts)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, create4_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name, size, opts);
+  librbd::ImageOptions image_opts(opts);
+  int ret = librbd::create(ioctx, name, "", size, image_opts, "", "", false);
+  tracepoint(librbd, create4_exit, ret);
+  return ret;
+}
+
+// Clones snapshot `p_snap_name` of parent image `p_name` (in `p_ioctx`) to
+// a new image `c_name` in `c_ioctx`. Calls librbd::clone() with 0 for the
+// final two arguments. `*c_order` is read for the exit tracepoint.
+extern "C" int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
+                         const char *p_snap_name, rados_ioctx_t c_ioctx,
+                         const char *c_name, uint64_t features, int *c_order)
+{
+  librados::IoCtx parent_ioctx, child_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p_ioctx, parent_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(c_ioctx, child_ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(parent_ioctx));
+
+  tracepoint(librbd, clone_enter, parent_ioctx.get_pool_name().c_str(),
+             parent_ioctx.get_id(), p_name, p_snap_name,
+             child_ioctx.get_pool_name().c_str(), child_ioctx.get_id(),
+             c_name, features);
+  int ret = librbd::clone(parent_ioctx, p_name, p_snap_name, child_ioctx,
+                          c_name, features, c_order, 0, 0);
+  tracepoint(librbd, clone_exit, ret, *c_order);
+  return ret;
+}
+
+// Same as rbd_clone() but forwards the caller's `stripe_unit` and
+// `stripe_count` to librbd::clone() and uses the clone2 tracepoints.
+extern "C" int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
+                          const char *p_snap_name, rados_ioctx_t c_ioctx,
+                          const char *c_name, uint64_t features, int *c_order,
+                          uint64_t stripe_unit, int stripe_count)
+{
+  librados::IoCtx parent_ioctx, child_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p_ioctx, parent_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(c_ioctx, child_ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(parent_ioctx));
+
+  tracepoint(librbd, clone2_enter, parent_ioctx.get_pool_name().c_str(),
+             parent_ioctx.get_id(), p_name, p_snap_name,
+             child_ioctx.get_pool_name().c_str(), child_ioctx.get_id(),
+             c_name, features, stripe_unit, stripe_count);
+  int ret = librbd::clone(parent_ioctx, p_name, p_snap_name, child_ioctx,
+                          c_name, features, c_order, stripe_unit,
+                          stripe_count);
+  tracepoint(librbd, clone2_exit, ret, *c_order);
+  return ret;
+}
+
+// Clones snapshot `p_snap_name` of parent image `p_name` to `c_name`,
+// taking the child's settings from the option set `c_opts`. The librbd::clone()
+// overload used here receives nullptr for the second and sixth arguments
+// and empty strings for the final two.
+extern "C" int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+                          const char *p_snap_name, rados_ioctx_t c_ioctx,
+                          const char *c_name, rbd_image_options_t c_opts)
+{
+  librados::IoCtx parent_ioctx, child_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p_ioctx, parent_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(c_ioctx, child_ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(parent_ioctx));
+
+  tracepoint(librbd, clone3_enter, parent_ioctx.get_pool_name().c_str(),
+             parent_ioctx.get_id(), p_name, p_snap_name,
+             child_ioctx.get_pool_name().c_str(), child_ioctx.get_id(),
+             c_name, c_opts);
+  librbd::ImageOptions child_opts(c_opts);
+  int ret = librbd::clone(parent_ioctx, nullptr, p_name, p_snap_name,
+                          child_ioctx, nullptr, c_name, child_opts, "", "");
+  tracepoint(librbd, clone3_exit, ret);
+  return ret;
+}
+
+// Removes image `name` from the pool via Image<>::remove(), using a no-op
+// progress context.
+extern "C" int rbd_remove(rados_ioctx_t p, const char *name)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, remove_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::api::Image<>::remove(ioctx, name, no_progress);
+  tracepoint(librbd, remove_exit, ret);
+  return ret;
+}
+
+// Removes image `name` from the pool via Image<>::remove(), wrapping `cb`
+// and `cbdata` in a CProgressContext that is handed to the removal.
+extern "C" int rbd_remove_with_progress(rados_ioctx_t p, const char *name,
+                                        librbd_progress_fn_t cb, void *cbdata)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, remove_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name);
+  librbd::CProgressContext progress(cb, cbdata);
+  int ret = librbd::api::Image<>::remove(ioctx, name, progress);
+  tracepoint(librbd, remove_exit, ret);
+  return ret;
+}
+
+// Moves image `name` to the trash via Trash<>::move(), tagging it with
+// RBD_TRASH_IMAGE_SOURCE_USER and forwarding `delay` unchanged.
+extern "C" int rbd_trash_move(rados_ioctx_t p, const char *name,
+                              uint64_t delay) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, trash_move_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name);
+  int ret = librbd::api::Trash<>::move(ioctx, RBD_TRASH_IMAGE_SOURCE_USER,
+                                       name, delay);
+  tracepoint(librbd, trash_move_exit, ret);
+  return ret;
+}
+
+// Looks up trash entry `id` via Trash<>::get() and converts the result into
+// the C struct `info`. Negative results are returned without touching
+// `info`; 0 is returned on success.
+extern "C" int rbd_trash_get(rados_ioctx_t io, const char *id,
+                             rbd_trash_image_info_t *info) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+
+  librbd::trash_image_info_t trash_info;
+  int ret = librbd::api::Trash<>::get(ioctx, id, &trash_info);
+  if (ret >= 0) {
+    trash_image_info_cpp_to_c(trash_info, info);
+    return 0;
+  }
+  return ret;
+}
+
+// Frees the `id` and `name` strings held by `info`; the struct itself is
+// not freed.
+extern "C" void rbd_trash_get_cleanup(rbd_trash_image_info_t *info) {
+  rbd_trash_image_info_t &entry = *info;
+  free(entry.id);
+  free(entry.name);
+}
+
+// Lists the pool's trash entries into the caller-provided array `entries`.
+//
+// *num_entries is the capacity of `entries` on entry; the first *num_entries
+// slots are zeroed before listing. If the capacity is too small,
+// *num_entries is set to the number of trash entries and -ERANGE is
+// returned. On success *num_entries is set to the number of entries
+// converted and that count is also the return value. Negative results from
+// Trash<>::list() are returned unchanged.
+extern "C" int rbd_trash_list(rados_ioctx_t p, rbd_trash_image_info_t *entries,
+                              size_t *num_entries) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+  tracepoint(librbd, trash_list_enter,
+             io_ctx.get_pool_name().c_str(), io_ctx.get_id());
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(entries, 0, sizeof(*entries) * *num_entries);
+
+  vector<librbd::trash_image_info_t> cpp_entries;
+  int r = librbd::api::Trash<>::list(io_ctx, cpp_entries, true);
+  if (r < 0) {
+    tracepoint(librbd, trash_list_exit, r, *num_entries);
+    return r;
+  }
+
+  if (*num_entries < cpp_entries.size()) {
+    *num_entries = cpp_entries.size();
+    tracepoint(librbd, trash_list_exit, -ERANGE, *num_entries);
+    return -ERANGE;
+  }
+
+  size_t i = 0;
+  for (const auto &entry : cpp_entries) {
+    trash_image_info_cpp_to_c(entry, &entries[i++]);
+  }
+  *num_entries = cpp_entries.size();
+
+  // Emit the exit tracepoint on the success path as well, so every
+  // trash_list_enter event has a matching trash_list_exit.
+  r = static_cast<int>(*num_entries);
+  tracepoint(librbd, trash_list_exit, r, *num_entries);
+  return r;
+}
+
+// Applies rbd_trash_get_cleanup() to each of the first `num_entries`
+// elements of `entries`.
+extern "C" void rbd_trash_list_cleanup(rbd_trash_image_info_t *entries,
+                                       size_t num_entries) {
+  size_t pos = 0;
+  while (pos < num_entries) {
+    rbd_trash_get_cleanup(entries + pos);
+    ++pos;
+  }
+}
+
+// Purges the pool's trash via Trash<>::purge(), forwarding `expire_ts` and
+// `threshold` and using a no-op progress context.
+extern "C" int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts,
+                               float threshold) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, trash_purge_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), expire_ts, threshold);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::api::Trash<>::purge(ioctx, expire_ts, threshold,
+                                        no_progress);
+  tracepoint(librbd, trash_purge_exit, ret);
+  return ret;
+}
+
+// Purges the pool's trash via Trash<>::purge(), wrapping `cb` and `cbdata`
+// in a CProgressContext that is handed to the purge.
+extern "C" int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts,
+    float threshold, librbd_progress_fn_t cb, void* cbdata) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, trash_purge_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), expire_ts, threshold);
+  librbd::CProgressContext progress(cb, cbdata);
+  int ret = librbd::api::Trash<>::purge(ioctx, expire_ts, threshold,
+                                        progress);
+  tracepoint(librbd, trash_purge_exit, ret);
+  return ret;
+}
+
+// Removes trash entry `image_id` via Trash<>::remove(), forwarding `force`
+// and using a no-op progress context.
+extern "C" int rbd_trash_remove(rados_ioctx_t p, const char *image_id,
+                                bool force) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, trash_remove_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), image_id, force);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::api::Trash<>::remove(ioctx, image_id, force,
+                                         no_progress);
+  tracepoint(librbd, trash_remove_exit, ret);
+  return ret;
+}
+
+// Removes trash entry `image_id` via Trash<>::remove(), wrapping `cb` and
+// `cbdata` in a CProgressContext that is handed to the removal.
+extern "C" int rbd_trash_remove_with_progress(rados_ioctx_t p,
+                                              const char *image_id,
+                                              bool force,
+                                              librbd_progress_fn_t cb,
+                                              void *cbdata) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, trash_remove_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), image_id, force);
+  librbd::CProgressContext progress(cb, cbdata);
+  int ret = librbd::api::Trash<>::remove(ioctx, image_id, force, progress);
+  tracepoint(librbd, trash_remove_exit, ret);
+  return ret;
+}
+
+// Restores trash entry `id` under the image name `name` via
+// Trash<>::restore(), passing RESTORE_SOURCE_WHITELIST as the source
+// filter. Traced with the trash_undelete enter/exit tracepoints.
+extern "C" int rbd_trash_restore(rados_ioctx_t p, const char *id,
+                                 const char *name) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, trash_undelete_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), id, name);
+  int ret = librbd::api::Trash<>::restore(
+      ioctx, librbd::api::Trash<>::RESTORE_SOURCE_WHITELIST, id, name);
+  tracepoint(librbd, trash_undelete_exit, ret);
+  return ret;
+}
+
+// Forwards to Namespace<>::create() for `namespace_name` and returns its
+// result.
+extern "C" int rbd_namespace_create(rados_ioctx_t io,
+                                    const char *namespace_name) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+
+  int ret = librbd::api::Namespace<>::create(ioctx, namespace_name);
+  return ret;
+}
+
+// Forwards to Namespace<>::remove() for `namespace_name` and returns its
+// result.
+extern "C" int rbd_namespace_remove(rados_ioctx_t io,
+                                    const char *namespace_name) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+
+  int ret = librbd::api::Namespace<>::remove(ioctx, namespace_name);
+  return ret;
+}
+
+// Writes the pool's namespace names into `names` as consecutive
+// NUL-terminated strings.
+//
+// Returns -EINVAL if `names` or `size` is null. *size is the capacity of
+// `names` on entry; if it is too small, *size is set to the required byte
+// count and -ERANGE is returned. On success *size is set to the number of
+// bytes written, which is also the return value.
+extern "C" int rbd_namespace_list(rados_ioctx_t io, char *names, size_t *size) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+
+  if (!names || !size) {
+    return -EINVAL;
+  }
+
+  std::vector<std::string> ns_names;
+  int ret = librbd::api::Namespace<>::list(ioctx, &ns_names);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Each name occupies its length plus one byte for the terminator.
+  size_t needed = 0;
+  for (const auto &ns : ns_names) {
+    needed += ns.size() + 1;
+  }
+  if (*size < needed) {
+    *size = needed;
+    return -ERANGE;
+  }
+
+  *size = needed;
+  char *cursor = names;
+  for (const auto &ns : ns_names) {
+    const char *name = ns.c_str();
+    strcpy(cursor, name);
+    cursor += strlen(name) + 1;
+  }
+
+  return (int)needed;
+}
+
+// Forwards to Namespace<>::exists() for `namespace_name`, which reports its
+// answer through `exists`, and returns that call's result.
+extern "C" int rbd_namespace_exists(rados_ioctx_t io,
+                                    const char *namespace_name,
+                                    bool *exists) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+
+  int ret = librbd::api::Namespace<>::exists(ioctx, namespace_name, exists);
+  return ret;
+}
+
+// Forwards to Pool<>::init() with the caller's `force` flag and returns its
+// result.
+extern "C" int rbd_pool_init(rados_ioctx_t io, bool force) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+
+  int ret = librbd::api::Pool<>::init(ioctx, force);
+  return ret;
+}
+
+// Allocates a new, value-initialized Pool<>::StatOptions and stores it in
+// *stats as an opaque handle.
+extern "C" void rbd_pool_stats_create(rbd_pool_stats_t *stats) {
+  auto options = new librbd::api::Pool<>::StatOptions{};
+  *stats = reinterpret_cast<rbd_pool_stats_t>(options);
+}
+
+// Deletes the Pool<>::StatOptions object behind the opaque handle `stats`.
+extern "C" void rbd_pool_stats_destroy(rbd_pool_stats_t stats) {
+  delete reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats);
+}
+
+// Registers `stat_val` as the destination for statistic `stat_option` on
+// the StatOptions object behind `stats`, via Pool<>::add_stat_option().
+extern "C" int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats,
+                                                int stat_option,
+                                                uint64_t* stat_val) {
+  auto options = reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats);
+  const auto option = static_cast<rbd_pool_stat_option_t>(stat_option);
+  return librbd::api::Pool<>::add_stat_option(options, option, stat_val);
+}
+
+// Calls Pool<>::get_stats() for the pool behind `io` with the StatOptions
+// object behind the opaque handle `pool_stats`, and returns its result.
+extern "C" int rbd_pool_stats_get(
+    rados_ioctx_t io, rbd_pool_stats_t pool_stats) {
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(io, ioctx);
+
+  auto options =
+    reinterpret_cast<librbd::api::Pool<>::StatOptions*>(pool_stats);
+  int ret = librbd::api::Pool<>::get_stats(ioctx, options);
+  return ret;
+}
+
+// Copies the open image `image` to a new image `destname` in the pool
+// behind `dest_p`, using default ImageOptions, a no-op progress context and
+// 0 as the final argument to librbd::copy().
+extern "C" int rbd_copy(rbd_image_t image, rados_ioctx_t dest_p,
+                        const char *destname)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  librados::IoCtx dest_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_ioctx);
+
+  tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only,
+             dest_ioctx.get_pool_name().c_str(), dest_ioctx.get_id(),
+             destname);
+  librbd::ImageOptions default_opts;
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::copy(ictx, dest_ioctx, destname, default_opts,
+                         no_progress, 0);
+  tracepoint(librbd, copy_exit, ret);
+  return ret;
+}
+
+// Copies the open image `srcp` into the already open image `destp`, using
+// a no-op progress context and 0 as the final argument to librbd::copy().
+extern "C" int rbd_copy2(rbd_image_t srcp, rbd_image_t destp)
+{
+  auto src_ictx = reinterpret_cast<librbd::ImageCtx *>(srcp);
+  auto dest_ictx = reinterpret_cast<librbd::ImageCtx *>(destp);
+
+  tracepoint(librbd, copy2_enter, src_ictx, src_ictx->name.c_str(),
+             src_ictx->snap_name.c_str(), src_ictx->read_only, dest_ictx,
+             dest_ictx->name.c_str(), dest_ictx->snap_name.c_str(),
+             dest_ictx->read_only);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::copy(src_ictx, dest_ictx, no_progress, 0);
+  tracepoint(librbd, copy2_exit, ret);
+  return ret;
+}
+
+// Copies the open image `image` to a new image `destname`, taking the
+// destination settings from `c_opts`. Uses a no-op progress context and 0
+// as the final argument to librbd::copy().
+extern "C" int rbd_copy3(rbd_image_t image, rados_ioctx_t dest_p,
+                         const char *destname, rbd_image_options_t c_opts)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  librados::IoCtx dest_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_ioctx);
+
+  tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only,
+             dest_ioctx.get_pool_name().c_str(), dest_ioctx.get_id(),
+             destname, c_opts);
+  librbd::ImageOptions dest_image_opts(c_opts);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::copy(ictx, dest_ioctx, destname, dest_image_opts,
+                         no_progress, 0);
+  tracepoint(librbd, copy3_exit, ret);
+  return ret;
+}
+
+// Copies the open image `image` to a new image `destname`, taking the
+// destination settings from `c_opts` and forwarding `sparse_size` to
+// librbd::copy(). Uses a no-op progress context.
+extern "C" int rbd_copy4(rbd_image_t image, rados_ioctx_t dest_p,
+                         const char *destname, rbd_image_options_t c_opts, size_t sparse_size)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  librados::IoCtx dest_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_ioctx);
+
+  tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only,
+             dest_ioctx.get_pool_name().c_str(), dest_ioctx.get_id(),
+             destname, c_opts, sparse_size);
+  librbd::ImageOptions dest_image_opts(c_opts);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::copy(ictx, dest_ioctx, destname, dest_image_opts,
+                         no_progress, sparse_size);
+  tracepoint(librbd, copy4_exit, ret);
+  return ret;
+}
+
+// Same copy as rbd_copy() (default ImageOptions, 0 as the final argument)
+// but wraps `fn` and `data` in a CProgressContext handed to librbd::copy().
+// Traced with the copy enter/exit tracepoints.
+extern "C" int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
+                                      const char *destname,
+                                      librbd_progress_fn_t fn, void *data)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  librados::IoCtx dest_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_ioctx);
+
+  tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only,
+             dest_ioctx.get_pool_name().c_str(), dest_ioctx.get_id(),
+             destname);
+  librbd::ImageOptions default_opts;
+  librbd::CProgressContext progress(fn, data);
+  int result = librbd::copy(ictx, dest_ioctx, destname, default_opts,
+                            progress, 0);
+  tracepoint(librbd, copy_exit, result);
+  return result;
+}
+
+// Copies the open image `srcp` into the open image `destp`, wrapping `fn`
+// and `data` in a CProgressContext handed to librbd::copy(). Traced with
+// the copy2 enter/exit tracepoints.
+extern "C" int rbd_copy_with_progress2(rbd_image_t srcp, rbd_image_t destp,
+                                       librbd_progress_fn_t fn, void *data)
+{
+  auto src_ictx = reinterpret_cast<librbd::ImageCtx *>(srcp);
+  auto dest_ictx = reinterpret_cast<librbd::ImageCtx *>(destp);
+
+  tracepoint(librbd, copy2_enter, src_ictx, src_ictx->name.c_str(),
+             src_ictx->snap_name.c_str(), src_ictx->read_only, dest_ictx,
+             dest_ictx->name.c_str(), dest_ictx->snap_name.c_str(),
+             dest_ictx->read_only);
+  librbd::CProgressContext progress(fn, data);
+  int result = librbd::copy(src_ictx, dest_ictx, progress, 0);
+  tracepoint(librbd, copy2_exit, result);
+  return result;
+}
+
+// Copies the open image `image` to a new image `destname` with settings
+// from `dest_opts`, wrapping `fn` and `data` in a CProgressContext handed
+// to librbd::copy(). Traced with the copy3 enter/exit tracepoints.
+extern "C" int rbd_copy_with_progress3(rbd_image_t image, rados_ioctx_t dest_p,
+                                       const char *destname,
+                                       rbd_image_options_t dest_opts,
+                                       librbd_progress_fn_t fn, void *data)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  librados::IoCtx dest_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_ioctx);
+
+  tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only,
+             dest_ioctx.get_pool_name().c_str(), dest_ioctx.get_id(),
+             destname, dest_opts);
+  librbd::ImageOptions dest_image_opts(dest_opts);
+  librbd::CProgressContext progress(fn, data);
+  int result = librbd::copy(ictx, dest_ioctx, destname, dest_image_opts,
+                            progress, 0);
+  tracepoint(librbd, copy3_exit, result);
+  return result;
+}
+
+// Copies the open image `image` to a new image `destname` with settings
+// from `dest_opts`, forwarding `sparse_size` and wrapping `fn` and `data`
+// in a CProgressContext handed to librbd::copy(). Traced with the copy4
+// enter/exit tracepoints.
+extern "C" int rbd_copy_with_progress4(rbd_image_t image, rados_ioctx_t dest_p,
+                                       const char *destname,
+                                       rbd_image_options_t dest_opts,
+                                       librbd_progress_fn_t fn, void *data, size_t sparse_size)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  librados::IoCtx dest_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_ioctx);
+
+  tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only,
+             dest_ioctx.get_pool_name().c_str(), dest_ioctx.get_id(),
+             destname, dest_opts, sparse_size);
+  librbd::ImageOptions dest_image_opts(dest_opts);
+  librbd::CProgressContext progress(fn, data);
+  int result = librbd::copy(ictx, dest_ioctx, destname, dest_image_opts,
+                            progress, sparse_size);
+  tracepoint(librbd, copy4_exit, result);
+  return result;
+}
+
+// Deep-copies the open image `image` to a new image `destname` via
+// Image<>::deep_copy(), taking the destination settings from `c_opts` and
+// using a no-op progress context.
+extern "C" int rbd_deep_copy(rbd_image_t image, rados_ioctx_t dest_p,
+                             const char *destname, rbd_image_options_t c_opts)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  librados::IoCtx dest_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_ioctx);
+
+  tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only,
+             dest_ioctx.get_pool_name().c_str(), dest_ioctx.get_id(),
+             destname, c_opts);
+  librbd::ImageOptions dest_image_opts(c_opts);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::api::Image<>::deep_copy(ictx, dest_ioctx, destname,
+                                            dest_image_opts, no_progress);
+  tracepoint(librbd, deep_copy_exit, ret);
+  return ret;
+}
+
+// Deep-copies the open image `image` to a new image `destname` via
+// Image<>::deep_copy(), taking the destination settings from `dest_opts`
+// and wrapping `fn` and `data` in a CProgressContext handed to the copy.
+extern "C" int rbd_deep_copy_with_progress(rbd_image_t image,
+                                           rados_ioctx_t dest_p,
+                                           const char *destname,
+                                           rbd_image_options_t dest_opts,
+                                           librbd_progress_fn_t fn, void *data)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  librados::IoCtx dest_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_ioctx);
+
+  tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only,
+             dest_ioctx.get_pool_name().c_str(), dest_ioctx.get_id(),
+             destname, dest_opts);
+  librbd::ImageOptions dest_image_opts(dest_opts);
+  librbd::CProgressContext progress(fn, data);
+  int result = librbd::api::Image<>::deep_copy(ictx, dest_ioctx, destname,
+                                               dest_image_opts, progress);
+  tracepoint(librbd, deep_copy_exit, result);
+  return result;
+}
+
+// Runs the image's flatten operation with a no-op progress context and
+// returns its result.
+extern "C" int rbd_flatten(rbd_image_t image)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+
+  tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str());
+  librbd::NoOpProgressContext no_progress;
+  int ret = ictx->operations->flatten(no_progress);
+  tracepoint(librbd, flatten_exit, ret);
+  return ret;
+}
+
+// Runs the image's flatten operation, wrapping `cb` and `cbdata` in a
+// CProgressContext handed to the operation.
+extern "C" int rbd_flatten_with_progress(rbd_image_t image,
+                                         librbd_progress_fn_t cb, void *cbdata)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+
+  tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str());
+  librbd::CProgressContext progress(cb, cbdata);
+  int ret = ictx->operations->flatten(progress);
+  tracepoint(librbd, flatten_exit, ret);
+  return ret;
+}
+
+// Runs the image's sparsify operation with the given `sparse_size` and a
+// no-op progress context, and returns its result.
+extern "C" int rbd_sparsify(rbd_image_t image, size_t sparse_size)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+
+  tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+             ictx->id.c_str());
+  librbd::NoOpProgressContext no_progress;
+  int ret = ictx->operations->sparsify(sparse_size, no_progress);
+  tracepoint(librbd, sparsify_exit, ret);
+  return ret;
+}
+
+// Runs the image's sparsify operation with the given `sparse_size`,
+// wrapping `cb` and `cbdata` in a CProgressContext handed to the operation.
+extern "C" int rbd_sparsify_with_progress(rbd_image_t image, size_t sparse_size,
+                                          librbd_progress_fn_t cb, void *cbdata)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+
+  tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+             ictx->id.c_str());
+  librbd::CProgressContext progress(cb, cbdata);
+  int ret = ictx->operations->sparsify(sparse_size, progress);
+  tracepoint(librbd, sparsify_exit, ret);
+  return ret;
+}
+
+// Renames image `srcname` to `destname` within the pool behind `src_p` via
+// librbd::rename() and returns its result.
+extern "C" int rbd_rename(rados_ioctx_t src_p, const char *srcname,
+                          const char *destname)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(src_p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, rename_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), srcname, destname);
+  int ret = librbd::rename(ioctx, srcname, destname);
+  tracepoint(librbd, rename_exit, ret);
+  return ret;
+}
+
+// Prepares migration of image `image_name` (pool behind `p`) to
+// `dest_image_name` (pool behind `dest_p`) via Migration<>::prepare(),
+// taking the destination settings from `opts_`. Returns that call's result.
+extern "C" int rbd_migration_prepare(rados_ioctx_t p, const char *image_name,
+                                     rados_ioctx_t dest_p,
+                                     const char *dest_image_name,
+                                     rbd_image_options_t opts_)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  librados::IoCtx dest_io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+  // Initialize the tracepoint provider before tracing, as the other
+  // pool-level entry points (e.g. rbd_migration_execute) do; this call may
+  // be the first librbd function the application invokes.
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+  tracepoint(librbd, migration_prepare_enter, io_ctx.get_pool_name().c_str(),
+             io_ctx.get_id(), image_name, dest_io_ctx.get_pool_name().c_str(),
+             dest_io_ctx.get_id(), dest_image_name, opts_);
+  librbd::ImageOptions opts(opts_);
+  int r = librbd::api::Migration<>::prepare(io_ctx, image_name, dest_io_ctx,
+                                            dest_image_name, opts);
+  tracepoint(librbd, migration_prepare_exit, r);
+  return r;
+}
+
+// Executes the prepared migration of `image_name` via Migration<>::execute()
+// with a no-op progress context.
+extern "C" int rbd_migration_execute(rados_ioctx_t p, const char *image_name)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, migration_execute_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), image_name);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::api::Migration<>::execute(ioctx, image_name,
+                                              no_progress);
+  tracepoint(librbd, migration_execute_exit, ret);
+  return ret;
+}
+
+// Executes the prepared migration of image `name` via
+// Migration<>::execute(), wrapping `fn` and `data` in a CProgressContext
+// handed to the operation.
+extern "C" int rbd_migration_execute_with_progress(rados_ioctx_t p,
+                                                   const char *name,
+                                                   librbd_progress_fn_t fn,
+                                                   void *data)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, migration_execute_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name);
+  librbd::CProgressContext progress(fn, data);
+  int ret = librbd::api::Migration<>::execute(ioctx, name, progress);
+  tracepoint(librbd, migration_execute_exit, ret);
+  return ret;
+}
+
+// Aborts the migration of `image_name` via Migration<>::abort() with a
+// no-op progress context.
+extern "C" int rbd_migration_abort(rados_ioctx_t p, const char *image_name)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, migration_abort_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), image_name);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::api::Migration<>::abort(ioctx, image_name, no_progress);
+  tracepoint(librbd, migration_abort_exit, ret);
+  return ret;
+}
+
+// Aborts the migration of image `name` via Migration<>::abort(), wrapping
+// `fn` and `data` in a CProgressContext handed to the operation.
+extern "C" int rbd_migration_abort_with_progress(rados_ioctx_t p,
+                                                 const char *name,
+                                                 librbd_progress_fn_t fn,
+                                                 void *data)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, migration_abort_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name);
+  librbd::CProgressContext progress(fn, data);
+  int ret = librbd::api::Migration<>::abort(ioctx, name, progress);
+  tracepoint(librbd, migration_abort_exit, ret);
+  return ret;
+}
+
+// Commits the migration of `image_name` via Migration<>::commit() with a
+// no-op progress context.
+extern "C" int rbd_migration_commit(rados_ioctx_t p, const char *image_name)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, migration_commit_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), image_name);
+  librbd::NoOpProgressContext no_progress;
+  int ret = librbd::api::Migration<>::commit(ioctx, image_name, no_progress);
+  tracepoint(librbd, migration_commit_exit, ret);
+  return ret;
+}
+
+// Commits the migration of image `name` via Migration<>::commit(), wrapping
+// `fn` and `data` in a CProgressContext handed to the operation.
+extern "C" int rbd_migration_commit_with_progress(rados_ioctx_t p,
+                                                  const char *name,
+                                                  librbd_progress_fn_t fn,
+                                                  void *data)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+
+  tracepoint(librbd, migration_commit_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), name);
+  librbd::CProgressContext progress(fn, data);
+  int ret = librbd::api::Migration<>::commit(ioctx, name, progress);
+  tracepoint(librbd, migration_commit_exit, ret);
+  return ret;
+}
+
+// Fetches the migration status of `image_name` via Migration<>::status()
+// and copies it into the C struct `status`, strdup()'ing every string
+// field.
+//
+// Returns -ERANGE if `status_size` differs from
+// sizeof(rbd_image_migration_status_t). `status` is only written when the
+// underlying call returns a non-negative value, which is then returned.
+extern "C" int rbd_migration_status(rados_ioctx_t p, const char *image_name,
+                                    rbd_image_migration_status_t *status,
+                                    size_t status_size)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(ioctx));
+  tracepoint(librbd, migration_status_enter, ioctx.get_pool_name().c_str(),
+             ioctx.get_id(), image_name);
+
+  if (status_size != sizeof(rbd_image_migration_status_t)) {
+    tracepoint(librbd, migration_status_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  librbd::image_migration_status_t src;
+  int ret = librbd::api::Migration<>::status(ioctx, image_name, &src);
+  if (ret < 0) {
+    tracepoint(librbd, migration_status_exit, ret);
+    return ret;
+  }
+
+  // Source side.
+  status->source_pool_id = src.source_pool_id;
+  status->source_pool_namespace = strdup(src.source_pool_namespace.c_str());
+  status->source_image_name = strdup(src.source_image_name.c_str());
+  status->source_image_id = strdup(src.source_image_id.c_str());
+  // Destination side.
+  status->dest_pool_id = src.dest_pool_id;
+  status->dest_pool_namespace = strdup(src.dest_pool_namespace.c_str());
+  status->dest_image_name = strdup(src.dest_image_name.c_str());
+  status->dest_image_id = strdup(src.dest_image_id.c_str());
+  // Migration state.
+  status->state = src.state;
+  status->state_description = strdup(src.state_description.c_str());
+
+  tracepoint(librbd, migration_status_exit, ret);
+  return ret;
+}
+
+// Frees the seven string fields of `s`; the struct itself is not freed.
+extern "C" void rbd_migration_status_cleanup(rbd_image_migration_status_t *s)
+{
+  rbd_image_migration_status_t &status = *s;
+  free(status.source_pool_namespace);
+  free(status.source_image_name);
+  free(status.source_image_id);
+  free(status.dest_pool_namespace);
+  free(status.dest_image_name);
+  free(status.dest_image_id);
+  free(status.state_description);
+}
+
+// Reads pool metadata value `key` via PoolMetadata<>::get() and copies it,
+// NUL-terminated, into `value`.
+//
+// *vallen is the capacity of `value` on entry. If the lookup fails, its
+// negative result is returned and neither `value` nor *vallen is touched.
+// If the buffer is too small for the value plus terminator, *vallen is set
+// to the required size and -ERANGE is returned. Otherwise the lookup's
+// result is returned.
+extern "C" int rbd_pool_metadata_get(rados_ioctx_t p, const char *key,
+                                     char *value, size_t *vallen)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  string val_s;
+  int r = librbd::api::PoolMetadata<>::get(io_ctx, key, &val_s);
+  // Report lookup failures first: otherwise a too-small buffer would
+  // overwrite the real error with -ERANGE, and a large enough buffer would
+  // be written even though the lookup failed.
+  if (r < 0) {
+    return r;
+  }
+  if (*vallen < val_s.size() + 1) {
+    r = -ERANGE;
+    *vallen = val_s.size() + 1;
+  } else {
+    strncpy(value, val_s.c_str(), val_s.size() + 1);
+  }
+
+  return r;
+}
+
+// Forwards to PoolMetadata<>::set() for `key` and `value` and returns its
+// result.
+extern "C" int rbd_pool_metadata_set(rados_ioctx_t p, const char *key,
+                                     const char *value)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  return librbd::api::PoolMetadata<>::set(ioctx, key, value);
+}
+
+// Forwards to PoolMetadata<>::remove() for `key` and returns its result.
+extern "C" int rbd_pool_metadata_remove(rados_ioctx_t p, const char *key)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+  return librbd::api::PoolMetadata<>::remove(ioctx, key);
+}
+
+// Lists pool metadata pairs via PoolMetadata<>::list() and packs them into
+// two buffers: keys into `key` and values into `value`, each entry followed
+// by a NUL byte.
+//
+// *key_len and *val_len are the buffer capacities on entry and are always
+// set to the required totals once the listing succeeds. If either buffer is
+// too small, -ERANGE is returned and nothing is copied. Returns 0 on
+// success; negative results from the listing are returned unchanged.
+extern "C" int rbd_pool_metadata_list(rados_ioctx_t p, const char *start,
+                                      uint64_t max, char *key, size_t *key_len,
+                                      char *value, size_t *val_len)
+{
+  librados::IoCtx ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, ioctx);
+
+  map<string, bufferlist> pairs;
+  int ret = librbd::api::PoolMetadata<>::list(ioctx, start, max, &pairs);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Every key and every value needs room for one trailing NUL.
+  size_t keys_needed = 0;
+  size_t vals_needed = 0;
+  for (auto &kv : pairs) {
+    keys_needed += kv.first.size() + 1;
+    vals_needed += kv.second.length() + 1;
+  }
+
+  const bool too_small = *key_len < keys_needed || *val_len < vals_needed;
+  *key_len = keys_needed;
+  *val_len = vals_needed;
+  if (too_small) {
+    return -ERANGE;
+  }
+
+  char *key_cursor = key;
+  char *val_cursor = value;
+  for (auto &kv : pairs) {
+    const size_t key_bytes = kv.first.size() + 1;
+    strncpy(key_cursor, kv.first.c_str(), key_bytes);
+    key_cursor += key_bytes;
+
+    // The value is copied without a terminator, which is appended by hand.
+    const size_t val_bytes = kv.second.length();
+    strncpy(val_cursor, kv.second.c_str(), val_bytes);
+    val_cursor += val_bytes;
+    *val_cursor++ = '\0';
+  }
+  return 0;
+}
+
+/**
+ * List the pool-level configuration options into a caller-supplied array.
+ *
+ * @param p            rados ioctx handle identifying the pool
+ * @param options      output array with room for *max_options entries
+ * @param max_options  in: capacity of @p options; out: number of options
+ *                     available (also set on -ERANGE)
+ * @return 0 on success, -ERANGE if the array is too small, or the negative
+ *         error from Config<>::list
+ */
+extern "C" int rbd_config_pool_list(rados_ioctx_t p,
+                                    rbd_config_option_t *options,
+                                    int *max_options) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::vector<librbd::config_option_t> option_vector;
+  const int rc = librbd::api::Config<>::list(io_ctx, &option_vector);
+  if (rc < 0) {
+    return rc;
+  }
+
+  const int available = static_cast<int>(option_vector.size());
+  if (*max_options < available) {
+    *max_options = available;
+    return -ERANGE;
+  }
+
+  for (int idx = 0; idx < available; ++idx) {
+    config_option_cpp_to_c(option_vector[idx], &options[idx]);
+  }
+  *max_options = available;
+  return 0;
+}
+
+/**
+ * Run config_option_cleanup on each of the first @p max_options entries.
+ *
+ * @param options      array of options to clean up
+ * @param max_options  number of entries in @p options to process
+ */
+extern "C" void rbd_config_pool_list_cleanup(rbd_config_option_t *options,
+                                             int max_options) {
+  int idx = 0;
+  while (idx < max_options) {
+    config_option_cleanup(options[idx]);
+    ++idx;
+  }
+}
+
+/**
+ * Open an image by name (read_only flag false) and return its handle.
+ *
+ * @param p          rados ioctx handle identifying the pool
+ * @param name       image name
+ * @param image      receives the image handle when the open succeeds
+ * @param snap_name  snapshot name passed to the ImageCtx constructor
+ * @return the result of ImageState::open; @p image is written only when it
+ *         is non-negative
+ *
+ * NOTE(review): the ImageCtx is not released here on failure - presumably
+ * ImageState::open disposes of it; confirm.
+ */
+extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image,
+                        const char *snap_name)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+
+  auto *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, false);
+  tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = ictx->state->open(0);
+  if (rc >= 0) {
+    *image = reinterpret_cast<rbd_image_t>(ictx);
+  }
+  tracepoint(librbd, open_image_exit, rc);
+  return rc;
+}
+
+/**
+ * Open an image by id (read_only flag false) and return its handle.
+ *
+ * @param p          rados ioctx handle identifying the pool
+ * @param id         image id
+ * @param image      receives the image handle when the open succeeds
+ * @param snap_name  snapshot name passed to the ImageCtx constructor
+ * @return the result of ImageState::open; @p image is written only when it
+ *         is non-negative
+ *
+ * NOTE(review): the ImageCtx is not released here on failure - presumably
+ * ImageState::open disposes of it; confirm.
+ */
+extern "C" int rbd_open_by_id(rados_ioctx_t p, const char *id,
+                              rbd_image_t *image, const char *snap_name)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+
+  auto *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, false);
+  tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = ictx->state->open(0);
+  if (rc >= 0) {
+    *image = reinterpret_cast<rbd_image_t>(ictx);
+  }
+  tracepoint(librbd, open_image_exit, rc);
+  return rc;
+}
+
+/**
+ * Start an asynchronous open of an image by name (read_only flag false).
+ *
+ * The open is handed to ImageState::open together with a C_OpenComplete
+ * context built from the image context, the completion and @p image.
+ *
+ * @param p          rados ioctx handle identifying the pool
+ * @param name       image name
+ * @param image      handle location forwarded to C_OpenComplete
+ * @param snap_name  snapshot name passed to the ImageCtx constructor
+ * @param c          completion handle
+ * @return always 0
+ */
+extern "C" int rbd_aio_open(rados_ioctx_t p, const char *name,
+                            rbd_image_t *image, const char *snap_name,
+                            rbd_completion_t c)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+
+  auto *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, false);
+  auto *comp = reinterpret_cast<librbd::RBD::AioCompletion *>(c);
+  tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only,
+             comp->pc);
+
+  auto *on_open = new C_OpenComplete(ictx, get_aio_completion(comp), image);
+  ictx->state->open(0, on_open);
+  tracepoint(librbd, aio_open_image_exit, 0);
+  return 0;
+}
+
+/**
+ * Start an asynchronous open of an image by id (read_only flag false).
+ *
+ * The open is handed to ImageState::open together with a C_OpenComplete
+ * context built from the image context, the completion and @p image.
+ *
+ * @param p          rados ioctx handle identifying the pool
+ * @param id         image id
+ * @param image      handle location forwarded to C_OpenComplete
+ * @param snap_name  snapshot name passed to the ImageCtx constructor
+ * @param c          completion handle
+ * @return always 0
+ */
+extern "C" int rbd_aio_open_by_id(rados_ioctx_t p, const char *id,
+                                  rbd_image_t *image, const char *snap_name,
+                                  rbd_completion_t c)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+
+  auto *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, false);
+  auto *comp = reinterpret_cast<librbd::RBD::AioCompletion *>(c);
+  tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only,
+             comp->pc);
+
+  auto *on_open = new C_OpenComplete(ictx, get_aio_completion(comp), image);
+  ictx->state->open(0, on_open);
+  tracepoint(librbd, aio_open_image_exit, 0);
+  return 0;
+}
+
+/**
+ * Open an image by name with the read_only flag set and return its handle.
+ *
+ * @param p          rados ioctx handle identifying the pool
+ * @param name       image name
+ * @param image      receives the image handle when the open succeeds
+ * @param snap_name  snapshot name passed to the ImageCtx constructor
+ * @return the result of ImageState::open; @p image is written only when it
+ *         is non-negative
+ *
+ * NOTE(review): the ImageCtx is not released here on failure - presumably
+ * ImageState::open disposes of it; confirm.
+ */
+extern "C" int rbd_open_read_only(rados_ioctx_t p, const char *name,
+                                  rbd_image_t *image, const char *snap_name)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+
+  auto *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, true);
+  tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = ictx->state->open(0);
+  if (rc >= 0) {
+    *image = reinterpret_cast<rbd_image_t>(ictx);
+  }
+  tracepoint(librbd, open_image_exit, rc);
+  return rc;
+}
+
+/**
+ * Open an image by id with the read_only flag set and return its handle.
+ *
+ * @param p          rados ioctx handle identifying the pool
+ * @param id         image id
+ * @param image      receives the image handle when the open succeeds
+ * @param snap_name  snapshot name passed to the ImageCtx constructor
+ * @return the result of ImageState::open; @p image is written only when it
+ *         is non-negative
+ *
+ * NOTE(review): the ImageCtx is not released here on failure - presumably
+ * ImageState::open disposes of it; confirm.
+ */
+extern "C" int rbd_open_by_id_read_only(rados_ioctx_t p, const char *id,
+                                        rbd_image_t *image,
+                                        const char *snap_name)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+
+  auto *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, true);
+  tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = ictx->state->open(0);
+  if (rc >= 0) {
+    *image = reinterpret_cast<rbd_image_t>(ictx);
+  }
+  tracepoint(librbd, open_image_exit, rc);
+  return rc;
+}
+
+/**
+ * Start an asynchronous open of an image by name with the read_only flag set.
+ *
+ * The open is handed to ImageState::open together with a C_OpenComplete
+ * context built from the image context, the completion and @p image.
+ *
+ * @param p          rados ioctx handle identifying the pool
+ * @param name       image name
+ * @param image      handle location forwarded to C_OpenComplete
+ * @param snap_name  snapshot name passed to the ImageCtx constructor
+ * @param c          completion handle
+ * @return always 0
+ */
+extern "C" int rbd_aio_open_read_only(rados_ioctx_t p, const char *name,
+                                      rbd_image_t *image, const char *snap_name,
+                                      rbd_completion_t c)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+
+  auto *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, true);
+  auto *comp = reinterpret_cast<librbd::RBD::AioCompletion *>(c);
+  tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only,
+             comp->pc);
+
+  auto *on_open = new C_OpenComplete(ictx, get_aio_completion(comp), image);
+  ictx->state->open(0, on_open);
+  tracepoint(librbd, aio_open_image_exit, 0);
+  return 0;
+}
+
+/**
+ * Start an asynchronous open of an image by id with the read_only flag set.
+ *
+ * The open is handed to ImageState::open together with a C_OpenComplete
+ * context built from the image context, the completion and @p image.
+ *
+ * @param p          rados ioctx handle identifying the pool
+ * @param id         image id
+ * @param image      handle location forwarded to C_OpenComplete
+ * @param snap_name  snapshot name passed to the ImageCtx constructor
+ * @param c          completion handle
+ * @return always 0
+ */
+extern "C" int rbd_aio_open_by_id_read_only(rados_ioctx_t p, const char *id,
+                                            rbd_image_t *image,
+                                            const char *snap_name,
+                                            rbd_completion_t c)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+
+  auto *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, true);
+  auto *comp = reinterpret_cast<librbd::RBD::AioCompletion *>(c);
+  tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only,
+             comp->pc);
+
+  auto *on_open = new C_OpenComplete(ictx, get_aio_completion(comp), image);
+  ictx->state->open(0, on_open);
+  tracepoint(librbd, aio_open_image_exit, 0);
+  return 0;
+}
+
+/**
+ * Close an image handle synchronously.
+ *
+ * @param image  image handle
+ * @return the result of ImageState::close
+ */
+extern "C" int rbd_close(rbd_image_t image)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str());
+
+  const int rc = ictx->state->close();
+
+  tracepoint(librbd, close_image_exit, rc);
+  return rc;
+}
+
+/**
+ * Start an asynchronous close of an image handle.
+ *
+ * @param image  image handle
+ * @param c      completion handle wrapped in a C_AioCompletion of type
+ *               AIO_TYPE_CLOSE
+ * @return always 0
+ */
+extern "C" int rbd_aio_close(rbd_image_t image, rbd_completion_t c)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  auto *comp = reinterpret_cast<librbd::RBD::AioCompletion *>(c);
+  tracepoint(librbd, aio_close_image_enter, ictx, ictx->name.c_str(),
+             ictx->id.c_str(), comp->pc);
+
+  auto *on_close = new C_AioCompletion(ictx, librbd::io::AIO_TYPE_CLOSE,
+                                       get_aio_completion(comp));
+  ictx->state->close(on_close);
+  tracepoint(librbd, aio_close_image_exit, 0);
+  return 0;
+}
+
+/**
+ * Resize an image without progress reporting.
+ *
+ * Calls Operations::resize with the second argument fixed to true and a
+ * no-op progress context.
+ *
+ * @param image  image handle
+ * @param size   requested size
+ * @return the result of Operations::resize
+ */
+extern "C" int rbd_resize(rbd_image_t image, uint64_t size)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, size);
+
+  librbd::NoOpProgressContext no_progress;
+  const int rc = ictx->operations->resize(size, true, no_progress);
+
+  tracepoint(librbd, resize_exit, rc);
+  return rc;
+}
+
+/**
+ * Resize an image with an explicit shrink policy and progress callback.
+ *
+ * @param image         image handle
+ * @param size          requested size
+ * @param allow_shrink  forwarded to Operations::resize
+ * @param cb            progress callback wrapped in a CProgressContext
+ * @param cbdata        opaque pointer handed to the CProgressContext
+ * @return the result of Operations::resize
+ */
+extern "C" int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink,
+                           librbd_progress_fn_t cb, void *cbdata)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, size);
+
+  librbd::CProgressContext progress(cb, cbdata);
+  const int rc = ictx->operations->resize(size, allow_shrink, progress);
+
+  tracepoint(librbd, resize_exit, rc);
+  return rc;
+}
+
+/**
+ * Resize an image, reporting progress through a callback.
+ *
+ * Calls Operations::resize with the second argument fixed to true.
+ *
+ * @param image   image handle
+ * @param size    requested size
+ * @param cb      progress callback wrapped in a CProgressContext
+ * @param cbdata  opaque pointer handed to the CProgressContext
+ * @return the result of Operations::resize
+ */
+extern "C" int rbd_resize_with_progress(rbd_image_t image, uint64_t size,
+                                        librbd_progress_fn_t cb, void *cbdata)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, size);
+
+  librbd::CProgressContext progress(cb, cbdata);
+  const int rc = ictx->operations->resize(size, true, progress);
+
+  tracepoint(librbd, resize_exit, rc);
+  return rc;
+}
+
+/**
+ * Fill in an rbd_image_info_t for the image.
+ *
+ * @param image     image handle
+ * @param info      structure populated by librbd::info
+ * @param infosize  size value forwarded to librbd::info
+ * @return the result of librbd::info
+ */
+extern "C" int rbd_stat(rbd_image_t image, rbd_image_info_t *info,
+                        size_t infosize)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, stat_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = librbd::info(ictx, *info, infosize);
+
+  tracepoint(librbd, stat_exit, rc, info);
+  return rc;
+}
+
+/**
+ * Query the image's old-format flag.
+ *
+ * @param image  image handle
+ * @param old    output written by librbd::get_old_format
+ * @return the result of librbd::get_old_format
+ */
+extern "C" int rbd_get_old_format(rbd_image_t image, uint8_t *old)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_old_format_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = librbd::get_old_format(ictx, old);
+
+  tracepoint(librbd, get_old_format_exit, rc, *old);
+  return rc;
+}
+
+/**
+ * Query the image size.
+ *
+ * @param image  image handle
+ * @param size   output written by librbd::get_size
+ * @return the result of librbd::get_size
+ */
+extern "C" int rbd_get_size(rbd_image_t image, uint64_t *size)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_size_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = librbd::get_size(ictx, size);
+
+  tracepoint(librbd, get_size_exit, rc, *size);
+  return rc;
+}
+
+/**
+ * Query the image feature bits.
+ *
+ * @param image     image handle
+ * @param features  output written by librbd::get_features
+ * @return the result of librbd::get_features
+ */
+extern "C" int rbd_get_features(rbd_image_t image, uint64_t *features)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_features_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = librbd::get_features(ictx, features);
+
+  tracepoint(librbd, get_features_exit, rc, *features);
+  return rc;
+}
+
+/**
+ * Enable or disable a set of image features.
+ *
+ * @param image     image handle
+ * @param features  feature bits forwarded to Operations::update_features
+ * @param enabled   any non-zero value is treated as "enable"
+ * @return the result of Operations::update_features
+ */
+extern "C" int rbd_update_features(rbd_image_t image, uint64_t features,
+                                   uint8_t enabled)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  const bool features_enabled = (enabled != 0);
+  tracepoint(librbd, update_features_enter, ictx, features, features_enabled);
+
+  const int rc = ictx->operations->update_features(features, features_enabled);
+
+  tracepoint(librbd, update_features_exit, rc);
+  return rc;
+}
+
+/**
+ * Query the image's operation feature bits.
+ *
+ * @param image        image handle
+ * @param op_features  output written by Image<>::get_op_features
+ * @return the result of Image<>::get_op_features
+ */
+extern "C" int rbd_get_op_features(rbd_image_t image, uint64_t *op_features)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  return librbd::api::Image<>::get_op_features(ictx, op_features);
+}
+
+/**
+ * Read the image's stripe unit from the image context.
+ *
+ * @param image        image handle
+ * @param stripe_unit  receives ImageCtx::get_stripe_unit()
+ * @return always 0
+ */
+extern "C" int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_stripe_unit_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  *stripe_unit = ictx->get_stripe_unit();
+
+  tracepoint(librbd, get_stripe_unit_exit, 0, *stripe_unit);
+  return 0;
+}
+
+/**
+ * Read the image's stripe count from the image context.
+ *
+ * @param image         image handle
+ * @param stripe_count  receives ImageCtx::get_stripe_count()
+ * @return always 0
+ */
+extern "C" int rbd_get_stripe_count(rbd_image_t image, uint64_t *stripe_count)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_stripe_count_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  *stripe_count = ictx->get_stripe_count();
+
+  tracepoint(librbd, get_stripe_count_exit, 0, *stripe_count);
+  return 0;
+}
+
+/**
+ * Read the image's creation timestamp.
+ *
+ * @param image      image handle
+ * @param timestamp  receives ImageCtx::get_create_timestamp() converted via
+ *                   utime_t::to_timespec
+ * @return always 0
+ */
+extern "C" int rbd_get_create_timestamp(rbd_image_t image,
+                                        struct timespec *timestamp)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_create_timestamp_enter, ictx, ictx->name.c_str(),
+             ictx->read_only);
+
+  const utime_t created = ictx->get_create_timestamp();
+  created.to_timespec(timestamp);
+
+  tracepoint(librbd, get_create_timestamp_exit, 0, timestamp);
+  return 0;
+}
+
+/**
+ * Read the image's access timestamp.
+ *
+ * @param image      image handle
+ * @param timestamp  receives ImageCtx::get_access_timestamp() converted via
+ *                   utime_t::to_timespec
+ * @return always 0
+ */
+extern "C" int rbd_get_access_timestamp(rbd_image_t image,
+                                        struct timespec *timestamp)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_access_timestamp_enter, ictx, ictx->name.c_str(),
+             ictx->read_only);
+
+  const utime_t accessed = ictx->get_access_timestamp();
+  accessed.to_timespec(timestamp);
+
+  tracepoint(librbd, get_access_timestamp_exit, 0, timestamp);
+  return 0;
+}
+
+/**
+ * Read the image's modification timestamp.
+ *
+ * @param image      image handle
+ * @param timestamp  receives ImageCtx::get_modify_timestamp() converted via
+ *                   utime_t::to_timespec
+ * @return always 0
+ */
+extern "C" int rbd_get_modify_timestamp(rbd_image_t image,
+                                        struct timespec *timestamp)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_modify_timestamp_enter, ictx, ictx->name.c_str(),
+             ictx->read_only);
+
+  const utime_t modified = ictx->get_modify_timestamp();
+  modified.to_timespec(timestamp);
+
+  tracepoint(librbd, get_modify_timestamp_exit, 0, timestamp);
+  return 0;
+}
+
+
+/**
+ * Query the image's overlap value.
+ *
+ * @param image    image handle
+ * @param overlap  output written by librbd::get_overlap
+ * @return the result of librbd::get_overlap
+ */
+extern "C" int rbd_get_overlap(rbd_image_t image, uint64_t *overlap)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_overlap_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  const int rc = librbd::get_overlap(ictx, overlap);
+
+  tracepoint(librbd, get_overlap_exit, rc, *overlap);
+  return rc;
+}
+
+/**
+ * Copy the image name into a caller-supplied buffer.
+ *
+ * @param image     image handle
+ * @param name      output buffer for the NUL-terminated name
+ * @param name_len  in: capacity of @p name; out: name length plus one for
+ *                  the terminator (also set on -ERANGE)
+ * @return 0 on success, -ERANGE if the buffer cannot hold the name and its
+ *         terminator
+ */
+extern "C" int rbd_get_name(rbd_image_t image, char *name, size_t *name_len)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  const size_t name_size = ictx->name.size();
+
+  const bool too_small = (*name_len <= name_size);
+  *name_len = name_size + 1;
+  if (too_small) {
+    return -ERANGE;
+  }
+
+  strncpy(name, ictx->name.c_str(), name_size);
+  name[name_size] = '\0';
+  return 0;
+}
+
+/**
+ * Copy the image id into a caller-supplied buffer.
+ *
+ * @param image   image handle
+ * @param id      output buffer for the NUL-terminated id
+ * @param id_len  capacity of @p id in bytes
+ * @return 0 on success, -EINVAL if the image context has old_format set,
+ *         -ERANGE if the buffer cannot hold the id and its terminator
+ */
+extern "C" int rbd_get_id(rbd_image_t image, char *id, size_t id_len)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  if (ictx->old_format) {
+    return -EINVAL;
+  }
+  if (id_len <= ictx->id.size()) {
+    return -ERANGE;
+  }
+
+  const size_t last = id_len - 1;
+  strncpy(id, ictx->id.c_str(), last);
+  id[last] = '\0';
+  return 0;
+}
+
+/**
+ * Copy the image's object prefix into a caller-supplied buffer.
+ *
+ * @param image       image handle
+ * @param prefix      output buffer for the NUL-terminated prefix
+ * @param prefix_len  capacity of @p prefix in bytes
+ * @return 0 on success, -ERANGE if the buffer cannot hold the prefix and its
+ *         terminator
+ */
+extern "C" int rbd_get_block_name_prefix(rbd_image_t image, char *prefix,
+                                         size_t prefix_len)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  if (prefix_len <= ictx->object_prefix.size()) {
+    return -ERANGE;
+  }
+
+  const size_t last = prefix_len - 1;
+  strncpy(prefix, ictx->object_prefix.c_str(), last);
+  prefix[last] = '\0';
+  return 0;
+}
+
+/**
+ * Query the id of the image's data pool.
+ *
+ * @param image  image handle
+ * @return the result of Image<>::get_data_pool_id
+ */
+extern "C" int64_t rbd_get_data_pool_id(rbd_image_t image)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  return librbd::api::Image<>::get_data_pool_id(ictx);
+}
+
+/**
+ * Copy the parent's pool name, image name and snapshot name into
+ * caller-supplied buffers.
+ *
+ * Any output pointer may be NULL, in which case that field is skipped. If a
+ * supplied buffer is too small the call fails with -ERANGE; fields that did
+ * fit may already have been copied.
+ *
+ * @param image             image handle
+ * @param parent_pool_name  buffer for the parent pool name (or NULL)
+ * @param ppool_namelen     capacity of @p parent_pool_name
+ * @param parent_name       buffer for the parent image name (or NULL)
+ * @param pnamelen          capacity of @p parent_name
+ * @param parent_snap_name  buffer for the parent snapshot name (or NULL)
+ * @param psnap_namelen     capacity of @p parent_snap_name
+ * @return 0 on success, -ERANGE if a buffer is too small, or the negative
+ *         error from Image<>::get_parent
+ */
+extern "C" int rbd_get_parent_info(rbd_image_t image,
+                                   char *parent_pool_name, size_t ppool_namelen,
+                                   char *parent_name, size_t pnamelen,
+                                   char *parent_snap_name, size_t psnap_namelen)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  librbd::linked_image_spec_t parent_image;
+  librbd::snap_spec_t parent_snap;
+  int r = librbd::api::Image<>::get_parent(ictx, &parent_image, &parent_snap);
+
+  // Copies src into dst when dst is supplied and large enough; otherwise
+  // records -ERANGE. A NULL dst means the caller does not want the field.
+  auto export_field = [&r](char *dst, size_t capacity,
+                           const std::string &src) {
+    if (!dst) {
+      return;
+    }
+    if (src.length() + 1 > capacity) {
+      r = -ERANGE;
+    } else {
+      strcpy(dst, src.c_str());
+    }
+  };
+
+  if (r >= 0) {
+    export_field(parent_pool_name, ppool_namelen, parent_image.pool_name);
+    export_field(parent_name, pnamelen, parent_image.image_name);
+    export_field(parent_snap_name, psnap_namelen, parent_snap.name);
+  }
+
+  if (r < 0) {
+    tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL);
+    return r;
+  }
+
+  tracepoint(librbd, get_parent_info_exit, r,
+             parent_image.pool_name.c_str(),
+             parent_image.image_name.c_str(),
+             parent_image.image_id.c_str(),
+             parent_snap.name.c_str());
+  return 0;
+}
+
+/**
+ * Copy the parent's pool name, image name, image id and snapshot name into
+ * caller-supplied buffers.
+ *
+ * Any output pointer may be NULL, in which case that field is skipped. If a
+ * supplied buffer is too small the call fails with -ERANGE; fields that did
+ * fit may already have been copied.
+ *
+ * @param image             image handle
+ * @param parent_pool_name  buffer for the parent pool name (or NULL)
+ * @param ppool_namelen     capacity of @p parent_pool_name
+ * @param parent_name       buffer for the parent image name (or NULL)
+ * @param pnamelen          capacity of @p parent_name
+ * @param parent_id         buffer for the parent image id (or NULL)
+ * @param pidlen            capacity of @p parent_id
+ * @param parent_snap_name  buffer for the parent snapshot name (or NULL)
+ * @param psnap_namelen     capacity of @p parent_snap_name
+ * @return 0 on success, -ERANGE if a buffer is too small, or the negative
+ *         error from Image<>::get_parent
+ */
+extern "C" int rbd_get_parent_info2(rbd_image_t image,
+                                    char *parent_pool_name,
+                                    size_t ppool_namelen,
+                                    char *parent_name, size_t pnamelen,
+                                    char *parent_id, size_t pidlen,
+                                    char *parent_snap_name,
+                                    size_t psnap_namelen)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  librbd::linked_image_spec_t parent_image;
+  librbd::snap_spec_t parent_snap;
+  int r = librbd::api::Image<>::get_parent(ictx, &parent_image, &parent_snap);
+
+  // Copies src into dst when dst is supplied and large enough; otherwise
+  // records -ERANGE. A NULL dst means the caller does not want the field.
+  auto export_field = [&r](char *dst, size_t capacity,
+                           const std::string &src) {
+    if (!dst) {
+      return;
+    }
+    if (src.length() + 1 > capacity) {
+      r = -ERANGE;
+    } else {
+      strcpy(dst, src.c_str());
+    }
+  };
+
+  if (r >= 0) {
+    export_field(parent_pool_name, ppool_namelen, parent_image.pool_name);
+    export_field(parent_name, pnamelen, parent_image.image_name);
+    export_field(parent_id, pidlen, parent_image.image_id);
+    export_field(parent_snap_name, psnap_namelen, parent_snap.name);
+  }
+
+  if (r < 0) {
+    tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL);
+    return r;
+  }
+
+  tracepoint(librbd, get_parent_info_exit, r,
+             parent_image.pool_name.c_str(),
+             parent_image.image_name.c_str(),
+             parent_image.image_id.c_str(),
+             parent_snap.name.c_str());
+  return 0;
+}
+
+/**
+ * Return the parent image and snapshot specs as C structures.
+ *
+ * On success every string member is a fresh strdup() copy. On failure both
+ * output structures are zero-filled.
+ *
+ * @param image         image handle
+ * @param parent_image  receives the parent image spec
+ * @param parent_snap   receives the parent snapshot spec
+ * @return the result of Image<>::get_parent
+ *
+ * NOTE(review): strdup() results are not checked for NULL here.
+ */
+extern "C" int rbd_get_parent(rbd_image_t image,
+                              rbd_linked_image_spec_t *parent_image,
+                              rbd_snap_spec_t *parent_snap)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  librbd::linked_image_spec_t cpp_parent_image;
+  librbd::snap_spec_t cpp_parent_snap;
+  const int rc = librbd::api::Image<>::get_parent(ictx, &cpp_parent_image,
+                                                  &cpp_parent_snap);
+  if (rc >= 0) {
+    *parent_image = {
+      .pool_id = cpp_parent_image.pool_id,
+      .pool_name = strdup(cpp_parent_image.pool_name.c_str()),
+      .pool_namespace = strdup(cpp_parent_image.pool_namespace.c_str()),
+      .image_id = strdup(cpp_parent_image.image_id.c_str()),
+      .image_name = strdup(cpp_parent_image.image_name.c_str()),
+      .trash = cpp_parent_image.trash};
+    *parent_snap = {
+      .id = cpp_parent_snap.id,
+      .namespace_type = cpp_parent_snap.namespace_type,
+      .name = strdup(cpp_parent_snap.name.c_str())};
+  } else {
+    // FIPS zeroization audit 20191117: these memsets are not security related.
+    memset(parent_image, 0, sizeof(rbd_linked_image_spec_t));
+    memset(parent_snap, 0, sizeof(rbd_snap_spec_t));
+  }
+
+  tracepoint(librbd, get_parent_info_exit, rc,
+             parent_image->pool_name,
+             parent_image->image_name,
+             parent_image->image_id,
+             parent_snap->name);
+  return rc;
+}
+
+/**
+ * Query the image flags.
+ *
+ * @param image  image handle
+ * @param flags  output written by librbd::get_flags
+ * @return the result of librbd::get_flags
+ */
+extern "C" int rbd_get_flags(rbd_image_t image, uint64_t *flags)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, get_flags_enter, ictx);
+
+  const int rc = librbd::get_flags(ictx, flags);
+
+  tracepoint(librbd, get_flags_exit, ictx, rc, *flags);
+  return rc;
+}
+
+/**
+ * Return the group the image belongs to.
+ *
+ * @param image            image handle
+ * @param group_info       receives the group info; on failure of the lookup
+ *                         only its name member is set (to NULL)
+ * @param group_info_size  must equal sizeof(rbd_group_info_t)
+ * @return -ERANGE if @p group_info_size does not match, otherwise the
+ *         result of Group<>::image_get_group
+ */
+extern "C" int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info,
+                             size_t group_info_size)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, image_get_group_enter, ictx->name.c_str());
+
+  if (group_info_size != sizeof(rbd_group_info_t)) {
+    tracepoint(librbd, image_get_group_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  librbd::group_info_t cpp_group_info;
+  const int rc = librbd::api::Group<>::image_get_group(ictx, &cpp_group_info);
+  if (rc < 0) {
+    group_info->name = NULL;
+  } else {
+    group_info_cpp_to_c(cpp_group_info, group_info);
+  }
+
+  tracepoint(librbd, image_get_group_exit, rc);
+  return rc;
+}
+
+/**
+ * Register a notification descriptor for the image.
+ *
+ * @param image  image handle
+ * @param fd     descriptor forwarded to librbd::set_image_notification
+ * @param type   type value forwarded to librbd::set_image_notification
+ * @return the result of librbd::set_image_notification
+ */
+extern "C" int rbd_set_image_notification(rbd_image_t image, int fd, int type)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, set_image_notification_enter, ictx, fd, type);
+
+  const int rc = librbd::set_image_notification(ictx, fd, type);
+
+  tracepoint(librbd, set_image_notification_exit, ictx, rc);
+  return rc;
+}
+
+/**
+ * Report whether this client owns the image's exclusive lock.
+ *
+ * @param image     image handle
+ * @param is_owner  receives 1 if the lock is owned, 0 otherwise
+ * @return the result of librbd::is_exclusive_lock_owner
+ */
+extern "C" int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, is_exclusive_lock_owner_enter, ictx);
+  // Start from "not owner" so that *is_owner is well defined even if the
+  // callee fails without writing its output.
+  bool owner = false;
+  int r = librbd::is_exclusive_lock_owner(ictx, &owner);
+  *is_owner = owner ? 1 : 0;
+  tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner);
+  return r;
+}
+
+/**
+ * Acquire the image lock in the requested mode.
+ *
+ * @param image      image handle
+ * @param lock_mode  mode forwarded to librbd::lock_acquire
+ * @return the result of librbd::lock_acquire
+ */
+extern "C" int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, lock_acquire_enter, ictx, lock_mode);
+
+  const int rc = librbd::lock_acquire(ictx, lock_mode);
+
+  tracepoint(librbd, lock_acquire_exit, ictx, rc);
+  return rc;
+}
+
+/**
+ * Release the image lock.
+ *
+ * @param image  image handle
+ * @return the result of librbd::lock_release
+ */
+extern "C" int rbd_lock_release(rbd_image_t image)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, lock_release_enter, ictx);
+
+  const int rc = librbd::lock_release(ictx);
+
+  tracepoint(librbd, lock_release_exit, ictx, rc);
+  return rc;
+}
+
+/**
+ * List the current lock owners into a caller-supplied array of strings.
+ *
+ * The array is zero-filled first. On success each stored entry is a
+ * strdup() copy.
+ *
+ * @param image            image handle
+ * @param lock_mode        output forwarded to librbd::lock_get_owners
+ * @param lock_owners      array with room for *max_lock_owners pointers
+ * @param max_lock_owners  in: capacity of @p lock_owners; out: number of
+ *                         owners stored, or the number required on -ERANGE
+ * @return the result of librbd::lock_get_owners, or -ERANGE if the array is
+ *         too small
+ */
+extern "C" int rbd_lock_get_owners(rbd_image_t image,
+                                   rbd_lock_mode_t *lock_mode,
+                                   char **lock_owners,
+                                   size_t *max_lock_owners)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, lock_get_owners_enter, ictx);
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(lock_owners, 0, sizeof(*lock_owners) * *max_lock_owners);
+
+  std::list<std::string> owner_names;
+  int rc = librbd::lock_get_owners(ictx, lock_mode, &owner_names);
+  if (rc >= 0) {
+    if (*max_lock_owners < owner_names.size()) {
+      *max_lock_owners = owner_names.size();
+      rc = -ERANGE;
+    } else {
+      size_t stored = 0;
+      for (auto &owner_name : owner_names) {
+        lock_owners[stored++] = strdup(owner_name.c_str());
+      }
+      *max_lock_owners = stored;
+    }
+  }
+
+  tracepoint(librbd, lock_get_owners_exit, ictx, rc);
+  return rc;
+}
+
+/**
+ * Free the first @p lock_owner_count strings of a lock owner array.
+ *
+ * The array itself is not freed.
+ *
+ * @param lock_owners       array of string pointers
+ * @param lock_owner_count  number of entries to free
+ */
+extern "C" void rbd_lock_get_owners_cleanup(char **lock_owners,
+                                            size_t lock_owner_count)
+{
+  size_t idx = 0;
+  while (idx < lock_owner_count) {
+    free(lock_owners[idx]);
+    ++idx;
+  }
+}
+
+/**
+ * Break the lock held by a given owner.
+ *
+ * @param image       image handle
+ * @param lock_mode   mode forwarded to librbd::lock_break
+ * @param lock_owner  owner string forwarded to librbd::lock_break
+ * @return the result of librbd::lock_break
+ */
+extern "C" int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode,
+                              const char *lock_owner)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner);
+
+  const int rc = librbd::lock_break(ictx, lock_mode, lock_owner);
+
+  tracepoint(librbd, lock_break_exit, ictx, rc);
+  return rc;
+}
+
+/**
+ * Rebuild the image's object map, reporting progress through a callback.
+ *
+ * @param image   image handle
+ * @param cb      progress callback wrapped in a CProgressContext
+ * @param cbdata  opaque pointer handed to the CProgressContext
+ * @return the result of Operations::rebuild_object_map
+ */
+extern "C" int rbd_rebuild_object_map(rbd_image_t image,
+                                      librbd_progress_fn_t cb, void *cbdata)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  librbd::CProgressContext progress(cb, cbdata);
+  return ictx->operations->rebuild_object_map(progress);
+}
+
+/* snapshots */
+/**
+ * Create a snapshot in the user snapshot namespace.
+ *
+ * @param image      image handle
+ * @param snap_name  name of the snapshot to create
+ * @return the result of Operations::snap_create
+ */
+extern "C" int rbd_snap_create(rbd_image_t image, const char *snap_name)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name);
+
+  const int rc = ictx->operations->snap_create(
+    cls::rbd::UserSnapshotNamespace(), snap_name);
+
+  tracepoint(librbd, snap_create_exit, rc);
+  return rc;
+}
+
+/**
+ * Rename a snapshot.
+ *
+ * @param image    image handle
+ * @param srcname  current snapshot name
+ * @param dstname  new snapshot name
+ * @return the result of Operations::snap_rename
+ */
+extern "C" int rbd_snap_rename(rbd_image_t image, const char *srcname,
+                               const char *dstname)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, srcname, dstname);
+
+  const int rc = ictx->operations->snap_rename(srcname, dstname);
+
+  tracepoint(librbd, snap_rename_exit, rc);
+  return rc;
+}
+
+/**
+ * Remove a snapshot by name, with flags 0 and no progress reporting.
+ *
+ * @param image      image handle
+ * @param snap_name  name of the snapshot to remove
+ * @return the result of librbd::snap_remove
+ */
+extern "C" int rbd_snap_remove(rbd_image_t image, const char *snap_name)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_remove_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name);
+
+  librbd::NoOpProgressContext no_progress;
+  const int rc = librbd::snap_remove(ictx, snap_name, 0, no_progress);
+
+  tracepoint(librbd, snap_remove_exit, rc);
+  return rc;
+}
+
+/**
+ * Remove a snapshot by name with caller-chosen flags and progress callback.
+ *
+ * @param image      image handle
+ * @param snap_name  name of the snapshot to remove
+ * @param flags      flags forwarded to librbd::snap_remove
+ * @param cb         progress callback wrapped in a CProgressContext
+ * @param cbdata     opaque pointer handed to the CProgressContext
+ * @return the result of librbd::snap_remove
+ */
+extern "C" int rbd_snap_remove2(rbd_image_t image, const char *snap_name,
+                                uint32_t flags, librbd_progress_fn_t cb,
+                                void *cbdata)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_remove2_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name, flags);
+
+  librbd::CProgressContext progress(cb, cbdata);
+  const int rc = librbd::snap_remove(ictx, snap_name, flags, progress);
+
+  tracepoint(librbd, snap_remove_exit, rc);
+  return rc;
+}
+
+/**
+ * Remove a snapshot identified by its id.
+ *
+ * @param image    image handle
+ * @param snap_id  id of the snapshot to remove
+ * @return the result of Snapshot<>::remove
+ */
+extern "C" int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  return librbd::api::Snapshot<>::remove(ictx, snap_id);
+}
+
+/**
+ * Roll the image back to a user-namespace snapshot, without progress
+ * reporting.
+ *
+ * @param image      image handle
+ * @param snap_name  snapshot to roll back to
+ * @return the result of Operations::snap_rollback
+ */
+extern "C" int rbd_snap_rollback(rbd_image_t image, const char *snap_name)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name);
+
+  librbd::NoOpProgressContext no_progress;
+  const int rc = ictx->operations->snap_rollback(
+    cls::rbd::UserSnapshotNamespace(), snap_name, no_progress);
+
+  tracepoint(librbd, snap_rollback_exit, rc);
+  return rc;
+}
+
+/**
+ * Roll the image back to a user-namespace snapshot, reporting progress
+ * through a callback.
+ *
+ * @param image      image handle
+ * @param snap_name  snapshot to roll back to
+ * @param cb         progress callback wrapped in a CProgressContext
+ * @param cbdata     opaque pointer handed to the CProgressContext
+ * @return the result of Operations::snap_rollback
+ */
+extern "C" int rbd_snap_rollback_with_progress(rbd_image_t image,
+                                               const char *snap_name,
+                                               librbd_progress_fn_t cb,
+                                               void *cbdata)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name);
+
+  librbd::CProgressContext progress(cb, cbdata);
+  const int rc = ictx->operations->snap_rollback(
+    cls::rbd::UserSnapshotNamespace(), snap_name, progress);
+
+  tracepoint(librbd, snap_rollback_exit, rc);
+  return rc;
+}
+
+/**
+ * List the image's snapshots into a caller-supplied array.
+ *
+ * The array is zero-filled first. On success the entries are followed by a
+ * terminating entry whose name is NULL, so the array needs room for one more
+ * element than there are snapshots. Each name is a strdup() copy.
+ *
+ * @param image      image handle
+ * @param snaps      output array with room for *max_snaps entries
+ * @param max_snaps  in: capacity of @p snaps; on -ERANGE it is set to the
+ *                   required capacity (snapshot count plus one)
+ * @return the number of snapshots on success; 0 if librbd::snap_list reports
+ *         -ENOENT; -EINVAL if @p max_snaps is NULL; -ERANGE if the array is
+ *         too small; -ENOMEM if a name cannot be duplicated; otherwise the
+ *         negative error from librbd::snap_list
+ */
+extern "C" int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
+                             int *max_snaps)
+{
+  vector<librbd::snap_info_t> cpp_snaps;
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_list_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snaps);
+
+  if (max_snaps == nullptr) {
+    tracepoint(librbd, snap_list_exit, -EINVAL, 0);
+    return -EINVAL;
+  }
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(snaps, 0, sizeof(*snaps) * *max_snaps);
+
+  int rc = librbd::snap_list(ictx, cpp_snaps);
+  if (rc == -ENOENT) {
+    tracepoint(librbd, snap_list_exit, 0, *max_snaps);
+    return 0;
+  }
+  if (rc < 0) {
+    tracepoint(librbd, snap_list_exit, rc, *max_snaps);
+    return rc;
+  }
+
+  const int snap_count = static_cast<int>(cpp_snaps.size());
+  if (*max_snaps < snap_count + 1) {
+    *max_snaps = snap_count + 1;
+    tracepoint(librbd, snap_list_exit, -ERANGE, *max_snaps);
+    return -ERANGE;
+  }
+
+  for (int idx = 0; idx < snap_count; ++idx) {
+    snaps[idx].id = cpp_snaps[idx].id;
+    snaps[idx].size = cpp_snaps[idx].size;
+    snaps[idx].name = strdup(cpp_snaps[idx].name.c_str());
+    if (snaps[idx].name == nullptr) {
+      // Undo the copies made so far before reporting the failure.
+      for (int done = 0; done < idx; ++done) {
+        free(const_cast<char *>(snaps[done].name));
+      }
+      tracepoint(librbd, snap_list_exit, -ENOMEM, *max_snaps);
+      return -ENOMEM;
+    }
+    tracepoint(librbd, snap_list_entry, snaps[idx].id, snaps[idx].size,
+               snaps[idx].name);
+  }
+
+  // Terminating entry.
+  snaps[snap_count].id = 0;
+  snaps[snap_count].size = 0;
+  snaps[snap_count].name = NULL;
+
+  rc = snap_count;
+  tracepoint(librbd, snap_list_exit, rc, *max_snaps);
+  return rc;
+}
+
+/**
+ * Free the snapshot names in an array terminated by an entry whose name is
+ * NULL.
+ *
+ * The array itself is not freed.
+ *
+ * @param snaps  first element of the array
+ */
+extern "C" void rbd_snap_list_end(rbd_snap_info_t *snaps)
+{
+  tracepoint(librbd, snap_list_end_enter, snaps);
+  for (rbd_snap_info_t *entry = snaps; entry->name; ++entry) {
+    free(const_cast<char *>(entry->name));
+  }
+  tracepoint(librbd, snap_list_end_exit);
+}
+
+/**
+ * Protect a user-namespace snapshot.
+ *
+ * @param image      image handle
+ * @param snap_name  snapshot to protect
+ * @return the result of Operations::snap_protect
+ */
+extern "C" int rbd_snap_protect(rbd_image_t image, const char *snap_name)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_protect_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name);
+
+  const int rc = ictx->operations->snap_protect(
+    cls::rbd::UserSnapshotNamespace(), snap_name);
+
+  tracepoint(librbd, snap_protect_exit, rc);
+  return rc;
+}
+
+/**
+ * Unprotect a user-namespace snapshot.
+ *
+ * @param image      image handle
+ * @param snap_name  snapshot to unprotect
+ * @return the result of Operations::snap_unprotect
+ */
+extern "C" int rbd_snap_unprotect(rbd_image_t image, const char *snap_name)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_unprotect_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name);
+
+  const int rc = ictx->operations->snap_unprotect(
+    cls::rbd::UserSnapshotNamespace(), snap_name);
+
+  tracepoint(librbd, snap_unprotect_exit, rc);
+  return rc;
+}
+
+/**
+ * Report whether a snapshot is protected.
+ *
+ * @param image         image handle
+ * @param snap_name     snapshot to query
+ * @param is_protected  receives 1 or 0 on success; left untouched on failure
+ * @return 0 on success, otherwise the negative error from
+ *         librbd::snap_is_protected
+ *
+ * NOTE(review): the failure path traces the caller's current *is_protected,
+ * which this function has not written - confirm callers initialise it.
+ */
+extern "C" int rbd_snap_is_protected(rbd_image_t image, const char *snap_name,
+                                     int *is_protected)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_is_protected_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name);
+
+  bool protected_snap;
+  const int rc = librbd::snap_is_protected(ictx, snap_name, &protected_snap);
+  if (rc >= 0) {
+    *is_protected = protected_snap ? 1 : 0;
+    tracepoint(librbd, snap_is_protected_exit, 0, *is_protected ? 1 : 0);
+    return 0;
+  }
+
+  tracepoint(librbd, snap_is_protected_exit, rc, *is_protected ? 1 : 0);
+  return rc;
+}
+
+/**
+ * Query the image's snapshot limit.
+ *
+ * @param image  image handle
+ * @param limit  output written by librbd::snap_get_limit
+ * @return the result of librbd::snap_get_limit
+ */
+extern "C" int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_get_limit_enter, ictx, ictx->name.c_str());
+
+  const int rc = librbd::snap_get_limit(ictx, limit);
+
+  tracepoint(librbd, snap_get_limit_exit, rc, *limit);
+  return rc;
+}
+
+/**
+ * Query the timestamp of a snapshot identified by id.
+ *
+ * @param image      image handle
+ * @param snap_id    snapshot id
+ * @param timestamp  output written by librbd::snap_get_timestamp
+ * @return the result of librbd::snap_get_timestamp
+ */
+extern "C" int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id,
+                                      struct timespec *timestamp)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_get_timestamp_enter, ictx, ictx->name.c_str());
+
+  const int rc = librbd::snap_get_timestamp(ictx, snap_id, timestamp);
+
+  tracepoint(librbd, snap_get_timestamp_exit, rc);
+  return rc;
+}
+
+/**
+ * Set the image's snapshot limit.
+ *
+ * @param image  image handle
+ * @param limit  limit forwarded to librbd::snap_set_limit
+ * @return the result of librbd::snap_set_limit
+ */
+extern "C" int rbd_snap_set_limit(rbd_image_t image, uint64_t limit)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_set_limit_enter, ictx, ictx->name.c_str(), limit);
+
+  const int rc = librbd::snap_set_limit(ictx, limit);
+
+  tracepoint(librbd, snap_set_limit_exit, rc);
+  return rc;
+}
+
+/**
+ * Switch the image handle to a user-namespace snapshot selected by name.
+ *
+ * @param image      image handle
+ * @param snap_name  snapshot name forwarded to Image<>::snap_set
+ * @return the result of Image<>::snap_set
+ */
+extern "C" int rbd_snap_set(rbd_image_t image, const char *snap_name)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_set_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snap_name);
+
+  const int rc = librbd::api::Image<>::snap_set(
+    ictx, cls::rbd::UserSnapshotNamespace(), snap_name);
+
+  tracepoint(librbd, snap_set_exit, rc);
+  return rc;
+}
+
+/**
+ * Switch the image handle to a snapshot selected by id.
+ *
+ * @param image    image handle
+ * @param snap_id  snapshot id forwarded to Image<>::snap_set
+ * @return the result of Image<>::snap_set
+ */
+extern "C" int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id)
+{
+  auto *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  return librbd::api::Image<>::snap_set(ictx, snap_id);
+}
+
+/**
+ * List the image's non-trash children as two packed, NUL-separated buffers.
+ *
+ * Children are de-duplicated on (pool name, image name) and emitted in the
+ * ordering of a std::set of those pairs. Pool names go to @p pools and image
+ * names to @p images, each followed by a NUL byte.
+ *
+ * @param image       image handle
+ * @param pools       output buffer for the packed pool names
+ * @param pools_len   in: capacity of @p pools; out: bytes required
+ * @param images      output buffer for the packed image names
+ * @param images_len  in: capacity of @p images; out: bytes required
+ * @return the number of children written, -ERANGE if either buffer is too
+ *         small, or the negative error from Image<>::list_children
+ */
+extern "C" ssize_t rbd_list_children(rbd_image_t image, char *pools,
+                                     size_t *pools_len, char *images,
+                                     size_t *images_len)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  std::vector<librbd::linked_image_spec_t> cpp_images;
+  int r = librbd::api::Image<>::list_children(ictx, &cpp_images);
+  if (r < 0) {
+    tracepoint(librbd, list_children_exit, r);
+    return r;
+  }
+
+  // Children flagged as trash are left out.
+  std::set<std::pair<std::string, std::string>> image_set;
+  for (const auto &child : cpp_images) {
+    if (child.trash) {
+      continue;
+    }
+    image_set.insert({child.pool_name, child.image_name});
+  }
+
+  size_t pools_total = 0;
+  size_t images_total = 0;
+  for (const auto &entry : image_set) {
+    pools_total += entry.first.length() + 1;
+    images_total += entry.second.length() + 1;
+  }
+
+  const bool too_short =
+    (pools_total > *pools_len) || (images_total > *images_len);
+  *pools_len = pools_total;
+  *images_len = images_total;
+  if (too_short) {
+    tracepoint(librbd, list_children_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  char *pools_p = pools;
+  char *images_p = images;
+  for (const auto &entry : image_set) {
+    const char *pool_name = entry.first.c_str();
+    strcpy(pools_p, pool_name);
+    pools_p += entry.first.length() + 1;
+
+    const char *image_name = entry.second.c_str();
+    strcpy(images_p, image_name);
+    images_p += entry.second.length() + 1;
+
+    tracepoint(librbd, list_children_entry, pool_name, image_name);
+  }
+
+  ssize_t ret = image_set.size();
+  tracepoint(librbd, list_children_exit, ret);
+  return ret;
+}
+
+/**
+ * List the image's children into a caller-supplied array.
+ *
+ * On success the entries are followed by a terminating entry whose string
+ * members are NULL, so the array needs room for one more element than there
+ * are children. Each string member is a strdup() copy.
+ *
+ * @param image         image handle
+ * @param children      output array with room for *max_children entries
+ * @param max_children  in: capacity of @p children; on -ERANGE it is set to
+ *                      the required capacity (child count plus one)
+ * @return the number of children on success, -EINVAL if @p max_children is
+ *         NULL, -ERANGE if the array is too small, or the negative error
+ *         from Image<>::list_children
+ */
+extern "C" int rbd_list_children2(rbd_image_t image,
+                                  rbd_child_info_t *children,
+                                  int *max_children)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  // Validate the pointer before it is dereferenced for the memset below.
+  if (!max_children) {
+    tracepoint(librbd, list_children_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  // Only zero the array for a positive capacity: a negative count would be
+  // converted to a huge size_t.
+  if (*max_children > 0) {
+    // FIPS zeroization audit 20191117: this memset is not security related.
+    memset(children, 0, sizeof(*children) * *max_children);
+  }
+
+  std::vector<librbd::linked_image_spec_t> cpp_children;
+  int r = librbd::api::Image<>::list_children(ictx, &cpp_children);
+  if (r < 0) {
+    tracepoint(librbd, list_children_exit, r);
+    return r;
+  }
+
+  const int child_count = static_cast<int>(cpp_children.size());
+  if (*max_children < child_count + 1) {
+    *max_children = child_count + 1;
+    // Trace the value actually returned, as the other exit points do.
+    tracepoint(librbd, list_children_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  for (int i = 0; i < child_count; i++) {
+    children[i].pool_name = strdup(cpp_children[i].pool_name.c_str());
+    children[i].image_name = strdup(cpp_children[i].image_name.c_str());
+    children[i].image_id = strdup(cpp_children[i].image_id.c_str());
+    children[i].trash = cpp_children[i].trash;
+    tracepoint(librbd, list_children_entry, children[i].pool_name,
+               children[i].image_name);
+  }
+
+  // Terminating entry.
+  children[child_count].pool_name = NULL;
+  children[child_count].image_name = NULL;
+  children[child_count].image_id = NULL;
+
+  r = child_count;
+  tracepoint(librbd, list_children_exit, r);
+  return r;
+}
+
+// Releases the pool_name, image_name and image_id strings of one child
+// record with free().  The record itself is not freed.
+extern "C" void rbd_list_child_cleanup(rbd_child_info_t *child)
+{
+  void *owned_strings[] = {
+    (void *)child->pool_name,
+    (void *)child->image_name,
+    (void *)child->image_id,
+  };
+  for (void *str : owned_strings) {
+    free(str);
+  }
+}
+
+// Releases the pool_name, image_name and image_id strings of the first
+// num_children records of `children` with free().  The array itself is not
+// freed.
+extern "C" void rbd_list_children_cleanup(rbd_child_info_t *children,
+                                          size_t num_children)
+{
+  size_t idx = 0;
+  while (idx < num_children) {
+    rbd_child_info_t &entry = children[idx];
+    free((void *)entry.pool_name);
+    free((void *)entry.image_name);
+    free((void *)entry.image_id);
+    ++idx;
+  }
+}
+
+// Lists the children of an image into a caller-provided array of
+// rbd_linked_image_spec_t.
+//
+// *max_images is the array capacity on entry.  Once the listing has succeeded
+// it is overwritten with the number of children found.  The string fields of
+// each entry are strdup() copies.
+//
+// Returns 0 on success, -ERANGE if the array is too small, or the negative
+// value returned by Image<>::list_children().
+extern "C" int rbd_list_children3(rbd_image_t image,
+                                  rbd_linked_image_spec_t *images,
+                                  size_t *max_images)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(images, 0, sizeof(*images) * *max_images);
+
+  std::vector<librbd::linked_image_spec_t> cpp_children;
+  int r = librbd::api::Image<>::list_children(ictx, &cpp_children);
+  if (r < 0) {
+    tracepoint(librbd, list_children_exit, r);
+    return r;
+  }
+
+  if (*max_images < cpp_children.size()) {
+    *max_images = cpp_children.size();
+    // Emit the exit event on this path too, so that every
+    // list_children_enter event has a matching list_children_exit.
+    tracepoint(librbd, list_children_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  *max_images = cpp_children.size();
+  for (size_t idx = 0; idx < cpp_children.size(); ++idx) {
+    images[idx] = {
+      .pool_id = cpp_children[idx].pool_id,
+      .pool_name = strdup(cpp_children[idx].pool_name.c_str()),
+      .pool_namespace = strdup(cpp_children[idx].pool_namespace.c_str()),
+      .image_id = strdup(cpp_children[idx].image_id.c_str()),
+      .image_name = strdup(cpp_children[idx].image_name.c_str()),
+      .trash = cpp_children[idx].trash};
+    tracepoint(librbd, list_children_entry, images[idx].pool_name,
+               images[idx].image_name);
+  }
+  // Success path: pair the enter event with an exit event as well.
+  tracepoint(librbd, list_children_exit, 0);
+  return 0;
+}
+
+// Fills a caller-provided array with the specs returned by
+// Image<>::list_descendants().
+//
+// *max_images is the array capacity on entry.  Once the listing has succeeded
+// it is overwritten with the number of specs found.  The string fields of
+// each entry are strdup() copies.
+//
+// Returns 0 on success, -ERANGE if the array is too small, or the negative
+// value returned by Image<>::list_descendants().
+extern "C" int rbd_list_descendants(rbd_image_t image,
+                                    rbd_linked_image_spec_t *images,
+                                    size_t *max_images)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(images, 0, sizeof(*images) * *max_images);
+
+  std::vector<librbd::linked_image_spec_t> descendants;
+  const int rc = librbd::api::Image<>::list_descendants(image_ctx, {},
+                                                        &descendants);
+  if (rc < 0) {
+    return rc;
+  }
+
+  // The required count is reported back on both the -ERANGE and the
+  // success path.
+  const bool fits = !(*max_images < descendants.size());
+  *max_images = descendants.size();
+  if (!fits) {
+    return -ERANGE;
+  }
+
+  size_t slot = 0;
+  for (const auto& spec : descendants) {
+    images[slot] = {
+      .pool_id = spec.pool_id,
+      .pool_name = strdup(spec.pool_name.c_str()),
+      .pool_namespace = strdup(spec.pool_namespace.c_str()),
+      .image_id = strdup(spec.image_id.c_str()),
+      .image_name = strdup(spec.image_name.c_str()),
+      .trash = spec.trash};
+    ++slot;
+  }
+  return 0;
+}
+
+// Reports the lockers returned by list_lockers() through caller-provided
+// buffers.
+//
+// `clients`, `cookies` and `addrs` each receive one NUL-terminated string per
+// locker, packed back to back; `tag` receives the tag string and *exclusive
+// the exclusive flag.
+//
+// Each *_len argument is the buffer capacity on entry.  Once the lookup has
+// succeeded it is overwritten with the number of bytes required.
+//
+// Returns the number of lockers, -ERANGE if any buffer is too small, or the
+// negative value returned by list_lockers().
+extern "C" ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive,
+                                    char *tag, size_t *tag_len,
+                                    char *clients, size_t *clients_len,
+                                    char *cookies, size_t *cookies_len,
+                                    char *addrs, size_t *addrs_len)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, list_lockers_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only);
+
+  std::list<librbd::locker_t> lockers;
+  bool is_exclusive;
+  std::string lock_tag;
+  int r = list_lockers(image_ctx, &lockers, &is_exclusive, &lock_tag);
+  if (r < 0) {
+    tracepoint(librbd, list_lockers_exit, r);
+    return r;
+  }
+
+  ldout(image_ctx->cct, 20) << "list_lockers r = " << r << " lockers.size() = " << lockers.size() << dendl;
+
+  *exclusive = (int)is_exclusive;
+
+  // First pass: bytes needed per buffer (string length plus terminator).
+  size_t clients_needed = 0;
+  size_t cookies_needed = 0;
+  size_t addrs_needed = 0;
+  for (const auto& locker : lockers) {
+    clients_needed += locker.client.length() + 1;
+    cookies_needed += locker.cookie.length() + 1;
+    addrs_needed += locker.address.length() + 1;
+  }
+  const size_t tag_needed = lock_tag.length() + 1;
+
+  const bool too_short = ((clients_needed > *clients_len) ||
+                          (cookies_needed > *cookies_len) ||
+                          (addrs_needed > *addrs_len) ||
+                          (tag_needed > *tag_len));
+  *clients_len = clients_needed;
+  *cookies_len = cookies_needed;
+  *addrs_len = addrs_needed;
+  *tag_len = tag_needed;
+  if (too_short) {
+    tracepoint(librbd, list_lockers_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  // Second pass: copy the strings out.
+  strcpy(tag, lock_tag.c_str());
+  char *client_out = clients;
+  char *cookie_out = cookies;
+  char *addr_out = addrs;
+  for (const auto& locker : lockers) {
+    const char* client = locker.client.c_str();
+    strcpy(client_out, client);
+    client_out += locker.client.length() + 1;
+
+    const char* cookie = locker.cookie.c_str();
+    strcpy(cookie_out, cookie);
+    cookie_out += locker.cookie.length() + 1;
+
+    const char* address = locker.address.c_str();
+    strcpy(addr_out, address);
+    addr_out += locker.address.length() + 1;
+
+    tracepoint(librbd, list_lockers_entry, client, cookie, address);
+  }
+
+  ssize_t ret = lockers.size();
+  tracepoint(librbd, list_lockers_exit, ret);
+  return ret;
+}
+
+// Calls librbd::lock() with its second argument set to true and an empty
+// tag.  A NULL cookie is passed on as "".
+extern "C" int rbd_lock_exclusive(rbd_image_t image, const char *cookie)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, lock_exclusive_enter, image_ctx,
+             image_ctx->name.c_str(), image_ctx->snap_name.c_str(),
+             image_ctx->read_only, cookie);
+  const char *lock_cookie = cookie ? cookie : "";
+  int r = librbd::lock(image_ctx, true, lock_cookie, "");
+  tracepoint(librbd, lock_exclusive_exit, r);
+  return r;
+}
+
+// Calls librbd::lock() with its second argument set to false.  A NULL cookie
+// or tag is passed on as "".
+extern "C" int rbd_lock_shared(rbd_image_t image, const char *cookie,
+                               const char *tag)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, lock_shared_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, cookie,
+             tag);
+  const char *lock_cookie = cookie ? cookie : "";
+  const char *lock_tag = tag ? tag : "";
+  int r = librbd::lock(image_ctx, false, lock_cookie, lock_tag);
+  tracepoint(librbd, lock_shared_exit, r);
+  return r;
+}
+
+// Calls librbd::unlock() for the given cookie.  A NULL cookie is passed on
+// as "".
+extern "C" int rbd_unlock(rbd_image_t image, const char *cookie)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, unlock_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, cookie);
+  const char *lock_cookie = cookie ? cookie : "";
+  int r = librbd::unlock(image_ctx, lock_cookie);
+  tracepoint(librbd, unlock_exit, r);
+  return r;
+}
+
+// Calls librbd::break_lock() for the given client and cookie.  A NULL cookie
+// is passed on as "".
+// NOTE(review): `client` is forwarded without a NULL guard, unlike `cookie`
+// -- confirm callers never pass NULL.
+extern "C" int rbd_break_lock(rbd_image_t image, const char *client,
+                              const char *cookie)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, break_lock_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, client,
+             cookie);
+  const char *lock_cookie = cookie ? cookie : "";
+  int r = librbd::break_lock(image_ctx, client, lock_cookie);
+  tracepoint(librbd, break_lock_exit, r);
+  return r;
+}
+
+/* I/O */
+// Reads `len` bytes at offset `ofs` into `buf` through the image's
+// io_work_queue, with op_flags 0, and returns the queue's result.
+// NOTE(review): the result is held in an int although the function returns
+// ssize_t -- confirm the queue's return type.
+extern "C" ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len,
+                            char *buf)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, read_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len);
+  auto *queue = image_ctx->io_work_queue;
+  int r = queue->read(ofs, len, librbd::io::ReadResult{buf, len}, 0);
+  tracepoint(librbd, read_exit, r);
+  return r;
+}
+
+// Same as rbd_read(), but forwards the caller's op_flags to the
+// io_work_queue read call.
+extern "C" ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
+                             char *buf, int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, read2_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len,
+             op_flags);
+  auto *queue = image_ctx->io_work_queue;
+  int r = queue->read(ofs, len, librbd::io::ReadResult{buf, len}, op_flags);
+  tracepoint(librbd, read_exit, r);
+  return r;
+}
+
+
+// Forwards to librbd::read_iterate() with the caller's callback and argument
+// and returns its 64-bit result unchanged.
+extern "C" int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len,
+                                    int (*cb)(uint64_t, size_t, const char *, void *),
+                                    void *arg)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, read_iterate_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len);
+  int64_t r = librbd::read_iterate(image_ctx, ofs, len, cb, arg);
+  tracepoint(librbd, read_iterate_exit, r);
+  return r;
+}
+
+// Variant of rbd_read_iterate() with a 64-bit length and an int result:
+// any positive result from librbd::read_iterate() is reported as 0, other
+// values are returned as-is (narrowed to int).
+extern "C" int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len,
+                                 int (*cb)(uint64_t, size_t, const char *, void *),
+                                 void *arg)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, read_iterate2_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len);
+  const int64_t raw = librbd::read_iterate(image_ctx, ofs, len, cb, arg);
+  int64_t r = (raw > 0) ? 0 : raw;
+  tracepoint(librbd, read_iterate2_exit, r);
+  return (int)r;
+}
+
+// Forwards to DiffIterate<>::diff_iterate() in the user snapshot namespace,
+// with the two boolean arguments fixed to (true, false).
+extern "C" int rbd_diff_iterate(rbd_image_t image,
+                                const char *fromsnapname,
+                                uint64_t ofs, uint64_t len,
+                                int (*cb)(uint64_t, size_t, int, void *),
+                                void *arg)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, diff_iterate_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only,
+             fromsnapname, ofs, len, true, false);
+  int r = librbd::api::DiffIterate<>::diff_iterate(
+    image_ctx, cls::rbd::UserSnapshotNamespace(), fromsnapname, ofs, len,
+    true, false, cb, arg);
+  tracepoint(librbd, diff_iterate_exit, r);
+  return r;
+}
+
+// Forwards to DiffIterate<>::diff_iterate() in the user snapshot namespace,
+// passing the caller's include_parent / whole_object values through.
+extern "C" int rbd_diff_iterate2(rbd_image_t image, const char *fromsnapname,
+                                 uint64_t ofs, uint64_t len,
+                                 uint8_t include_parent, uint8_t whole_object,
+                                 int (*cb)(uint64_t, size_t, int, void *),
+                                 void *arg)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  // The tracepoint records the two flags as booleans.
+  tracepoint(librbd, diff_iterate_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only,
+             fromsnapname, ofs, len, include_parent != 0, whole_object != 0);
+  int r = librbd::api::DiffIterate<>::diff_iterate(
+    image_ctx, cls::rbd::UserSnapshotNamespace(), fromsnapname, ofs, len,
+    include_parent, whole_object, cb, arg);
+  tracepoint(librbd, diff_iterate_exit, r);
+  return r;
+}
+
+// Wraps `buf` in a bufferlist via create_write_raw() and submits it to the
+// image's io_work_queue write call with op_flags 0.
+extern "C" ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
+                             const char *buf)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, write_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len,
+             buf);
+
+  bufferlist payload;
+  payload.push_back(create_write_raw(image_ctx, buf, len));
+  int r = image_ctx->io_work_queue->write(ofs, len, std::move(payload), 0);
+  tracepoint(librbd, write_exit, r);
+  return r;
+}
+
+// Same as rbd_write(), but forwards the caller's op_flags to the
+// io_work_queue write call.
+extern "C" ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
+                              const char *buf, int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, write2_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len,
+             buf, op_flags);
+
+  bufferlist payload;
+  payload.push_back(create_write_raw(image_ctx, buf, len));
+  int r = image_ctx->io_work_queue->write(ofs, len, std::move(payload),
+                                          op_flags);
+  tracepoint(librbd, write_exit, r);
+  return r;
+}
+
+
+// Submits a discard of `len` bytes at `ofs` to the image's io_work_queue,
+// using the image's discard_granularity_bytes.
+//
+// Returns -EINVAL when `len` exceeds INT_MAX, otherwise the queue's result.
+extern "C" int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, discard_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len);
+
+  const bool len_fits_int = !(len > std::numeric_limits<int>::max());
+  if (!len_fits_int) {
+    tracepoint(librbd, discard_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  int r = image_ctx->io_work_queue->discard(
+    ofs, len, image_ctx->discard_granularity_bytes);
+  tracepoint(librbd, discard_exit, r);
+  return r;
+}
+
+// Submits a writesame of `len` bytes at `ofs` using the `data_len`-byte
+// pattern in `buf`.
+//
+// Returns -EINVAL when data_len is 0, len is not a multiple of data_len, or
+// len exceeds INT_MAX.  When the "rbd_discard_on_zeroed_write_same" option is
+// set and the pattern is all zeroes, the request is issued as write_zeroes
+// instead.  Otherwise returns the io_work_queue result.
+extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
+                                 const char *buf, size_t data_len, int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, writesame_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, ofs, len,
+             data_len == 0 ? NULL : buf, data_len, op_flags);
+
+  // data_len == 0 is tested first so the modulo is never evaluated with a
+  // zero divisor.
+  const bool invalid = (data_len == 0) || (len % data_len) ||
+                       (len > std::numeric_limits<int>::max());
+  if (invalid) {
+    tracepoint(librbd, writesame_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  bool discard_zero = image_ctx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+  if (discard_zero && mem_is_zero(buf, data_len)) {
+    int r = image_ctx->io_work_queue->write_zeroes(ofs, len, 0, op_flags);
+    tracepoint(librbd, writesame_exit, r);
+    return r;
+  }
+
+  bufferlist pattern;
+  pattern.push_back(create_write_raw(image_ctx, buf, data_len));
+  int r = image_ctx->io_work_queue->writesame(ofs, len, std::move(pattern),
+                                              op_flags);
+  tracepoint(librbd, writesame_exit, r);
+  return r;
+}
+
+// Forwards directly to the image's io_work_queue write_zeroes call and
+// returns its result.  No tracepoints are emitted here.
+extern "C" ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, size_t len,
+                                    int zero_flags, int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto *queue = image_ctx->io_work_queue;
+  return queue->write_zeroes(ofs, len, zero_flags, op_flags);
+}
+
+// Wraps `cmp_buf` and `buf` (both `len` bytes) in bufferlists and submits
+// them to the image's io_work_queue compare_and_write call, together with
+// the caller's mismatch_off pointer.  Returns the queue's result.
+extern "C" ssize_t rbd_compare_and_write(rbd_image_t image,
+                                         uint64_t ofs, size_t len,
+                                         const char *cmp_buf,
+                                         const char *buf,
+                                         uint64_t *mismatch_off,
+                                         int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, compare_and_write_enter, image_ctx,
+             image_ctx->name.c_str(), image_ctx->snap_name.c_str(),
+             image_ctx->read_only, ofs, len, cmp_buf, buf, op_flags);
+
+  bufferlist expected;
+  expected.push_back(create_write_raw(image_ctx, cmp_buf, len));
+  bufferlist replacement;
+  replacement.push_back(create_write_raw(image_ctx, buf, len));
+
+  int r = image_ctx->io_work_queue->compare_and_write(
+    ofs, len, std::move(expected), std::move(replacement), mismatch_off,
+    op_flags);
+  tracepoint(librbd, compare_and_write_exit, r);
+  return r;
+}
+
+// Allocates a librbd::RBD::AioCompletion for the given callback and argument
+// and stores it in *c as an opaque handle.  Always returns 0.
+extern "C" int rbd_aio_create_completion(void *cb_arg,
+                                         rbd_callback_t complete_cb,
+                                         rbd_completion_t *c)
+{
+  auto *completion = new librbd::RBD::AioCompletion(cb_arg, complete_cb);
+  *c = (rbd_completion_t) completion;
+  return 0;
+}
+
+// Queues an asynchronous write of `len` bytes from `buf` at `off` with
+// op_flags 0.  Always returns 0 once the request has been handed to the
+// io_work_queue.
+extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
+                             const char *buf, rbd_completion_t c)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  tracepoint(librbd, aio_write_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+             buf, completion->pc);
+
+  bufferlist payload;
+  payload.push_back(create_write_raw(image_ctx, buf, len));
+  image_ctx->io_work_queue->aio_write(get_aio_completion(completion), off,
+                                      len, std::move(payload), 0);
+  tracepoint(librbd, aio_write_exit, 0);
+  return 0;
+}
+
+// Same as rbd_aio_write(), but forwards the caller's op_flags to the
+// io_work_queue aio_write call.  Always returns 0.
+extern "C" int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len,
+                              const char *buf, rbd_completion_t c, int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  tracepoint(librbd, aio_write2_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+             buf, completion->pc, op_flags);
+
+  bufferlist payload;
+  payload.push_back(create_write_raw(image_ctx, buf, len));
+  image_ctx->io_work_queue->aio_write(get_aio_completion(completion), off,
+                                      len, std::move(payload), op_flags);
+  tracepoint(librbd, aio_write_exit, 0);
+  return 0;
+}
+
+// Queues an asynchronous vectored write at `off`.
+//
+// The iovec entries are appended to a single bufferlist.  Returns -EINVAL
+// (without queueing anything) when iovcnt is not positive or the summed
+// length becomes negative; otherwise returns 0.
+extern "C" int rbd_aio_writev(rbd_image_t image, const struct iovec *iov,
+                              int iovcnt, uint64_t off, rbd_completion_t c)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+
+  // convert the scatter list into a bufferlist
+  ssize_t total_len = 0;
+  bufferlist payload;
+  int idx = 0;
+  while (idx < iovcnt) {
+    const struct iovec &segment = iov[idx];
+    ++idx;
+    total_len += segment.iov_len;
+    if (total_len < 0) {
+      // Stop before appending the segment that made the total negative.
+      break;
+    }
+    payload.push_back(create_write_raw(image_ctx,
+                                       static_cast<char*>(segment.iov_base),
+                                       segment.iov_len));
+  }
+
+  const int r = (iovcnt <= 0 || total_len < 0) ? -EINVAL : 0;
+
+  tracepoint(librbd, aio_write_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, off,
+             total_len, NULL, completion->pc);
+  if (r == 0) {
+    image_ctx->io_work_queue->aio_write(get_aio_completion(completion), off,
+                                        total_len, std::move(payload), 0);
+  }
+  tracepoint(librbd, aio_write_exit, r);
+  return r;
+}
+
+// Queues an asynchronous discard of `len` bytes at `off`, using the image's
+// discard_granularity_bytes.  Always returns 0.
+extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
+                               rbd_completion_t c)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  tracepoint(librbd, aio_discard_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+             completion->pc);
+  image_ctx->io_work_queue->aio_discard(
+    get_aio_completion(completion), off, len,
+    image_ctx->discard_granularity_bytes);
+  tracepoint(librbd, aio_discard_exit, 0);
+  return 0;
+}
+
+// Queues an asynchronous read of `len` bytes at `off` into `buf` with
+// op_flags 0.  Always returns 0.
+extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
+                            char *buf, rbd_completion_t c)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  tracepoint(librbd, aio_read_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+             buf, completion->pc);
+  image_ctx->io_work_queue->aio_read(get_aio_completion(completion), off,
+                                     len, librbd::io::ReadResult{buf, len},
+                                     0);
+  tracepoint(librbd, aio_read_exit, 0);
+  return 0;
+}
+
+// Same as rbd_aio_read(), but forwards the caller's op_flags to the
+// io_work_queue aio_read call.  Always returns 0.
+extern "C" int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
+                             char *buf, rbd_completion_t c, int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  tracepoint(librbd, aio_read2_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+             buf, completion->pc, op_flags);
+  image_ctx->io_work_queue->aio_read(get_aio_completion(completion), off,
+                                     len, librbd::io::ReadResult{buf, len},
+                                     op_flags);
+  tracepoint(librbd, aio_read_exit, 0);
+  return 0;
+}
+
+// Queues an asynchronous vectored read at `off`.
+//
+// A single-entry vector is read into its buffer directly; otherwise the
+// iovec array itself is handed to ReadResult.
+//
+// Returns -EINVAL (without queueing anything) when iovcnt is not positive or
+// the summed length becomes negative; otherwise returns 0.
+extern "C" int rbd_aio_readv(rbd_image_t image, const struct iovec *iov,
+                             int iovcnt, uint64_t off, rbd_completion_t c)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  ssize_t len = 0;
+  for (int i = 0; i < iovcnt; ++i) {
+    len += iov[i].iov_len;
+    if (len < 0) {
+      break;
+    }
+  }
+
+  // Reject negative counts as well as zero (matching rbd_aio_writev): with
+  // the previous `iovcnt == 0` test a negative iovcnt skipped the loop,
+  // passed validation and was handed to ReadResult(iov, iovcnt).
+  int r = 0;
+  if (iovcnt <= 0 || len < 0) {
+    r = -EINVAL;
+  }
+
+  tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, off, len, NULL,
+             comp->pc);
+  if (r == 0) {
+    librbd::io::ReadResult read_result;
+    if (iovcnt == 1) {
+      read_result = librbd::io::ReadResult(
+        static_cast<char *>(iov[0].iov_base), iov[0].iov_len);
+    } else {
+      read_result = librbd::io::ReadResult(iov, iovcnt);
+    }
+    ictx->io_work_queue->aio_read(get_aio_completion(comp), off, len,
+                                  std::move(read_result), 0);
+  }
+  tracepoint(librbd, aio_read_exit, r);
+  return r;
+}
+
+// Calls flush() on the image's io_work_queue and returns its result.
+extern "C" int rbd_flush(rbd_image_t image)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, flush_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only);
+  auto *queue = image_ctx->io_work_queue;
+  int r = queue->flush();
+  tracepoint(librbd, flush_exit, r);
+  return r;
+}
+
+// Queues an asynchronous flush on the image's io_work_queue.  Always
+// returns 0.
+extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  tracepoint(librbd, aio_flush_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only,
+             completion->pc);
+  image_ctx->io_work_queue->aio_flush(get_aio_completion(completion));
+  tracepoint(librbd, aio_flush_exit, 0);
+  return 0;
+}
+
+// Queues an asynchronous writesame of `len` bytes at `off` using the
+// `data_len`-byte pattern in `buf`.
+//
+// Returns -EINVAL when data_len is 0 or len is not a multiple of data_len.
+// When the "rbd_discard_on_zeroed_write_same" option is set and the pattern
+// is all zeroes, the request is queued as aio_write_zeroes instead.
+// Returns 0 once a request has been queued.
+extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
+                                 const char *buf, size_t data_len, rbd_completion_t c,
+                                 int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  tracepoint(librbd, aio_writesame_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only, off, len,
+             data_len == 0 ? NULL : buf, data_len, completion->pc, op_flags);
+
+  // data_len == 0 is tested first so the modulo is never evaluated with a
+  // zero divisor.
+  const bool invalid = (data_len == 0) || (len % data_len);
+  if (invalid) {
+    tracepoint(librbd, aio_writesame_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  bool discard_zero = image_ctx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+  if (discard_zero && mem_is_zero(buf, data_len)) {
+    image_ctx->io_work_queue->aio_write_zeroes(
+      get_aio_completion(completion), off, len, 0, op_flags, true);
+    tracepoint(librbd, aio_writesame_exit, 0);
+    return 0;
+  }
+
+  bufferlist pattern;
+  pattern.push_back(create_write_raw(image_ctx, buf, data_len));
+  image_ctx->io_work_queue->aio_writesame(get_aio_completion(completion), off,
+                                          len, std::move(pattern), op_flags);
+  tracepoint(librbd, aio_writesame_exit, 0);
+  return 0;
+}
+
+// Queues an asynchronous write_zeroes request on the image's io_work_queue
+// (last argument fixed to true).  Always returns 0.  No tracepoints are
+// emitted here.
+extern "C" int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, size_t len,
+                                    rbd_completion_t c, int zero_flags,
+                                    int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+
+  auto *queue = image_ctx->io_work_queue;
+  queue->aio_write_zeroes(get_aio_completion(completion), off, len,
+                          zero_flags, op_flags, true);
+  return 0;
+}
+
+// Queues an asynchronous compare-and-write: `cmp_buf` and `buf` (both `len`
+// bytes) are wrapped in bufferlists and handed to the io_work_queue together
+// with the caller's mismatch_off pointer (last argument fixed to false).
+// Always returns 0.
+extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off,
+                                             size_t len, const char *cmp_buf,
+                                             const char *buf, rbd_completion_t c,
+                                             uint64_t *mismatch_off,
+                                             int op_flags)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  tracepoint(librbd, aio_compare_and_write_enter, image_ctx,
+             image_ctx->name.c_str(), image_ctx->snap_name.c_str(),
+             image_ctx->read_only, off, len, cmp_buf, buf, completion->pc,
+             op_flags);
+
+  bufferlist expected;
+  expected.push_back(create_write_raw(image_ctx, cmp_buf, len));
+  bufferlist replacement;
+  replacement.push_back(create_write_raw(image_ctx, buf, len));
+  image_ctx->io_work_queue->aio_compare_and_write(
+    get_aio_completion(completion), off, len, std::move(expected),
+    std::move(replacement), mismatch_off, op_flags, false);
+
+  tracepoint(librbd, aio_compare_and_write_exit, 0);
+  return 0;
+}
+
+// Calls librbd::invalidate_cache() for the image and returns its result.
+extern "C" int rbd_invalidate_cache(rbd_image_t image)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, invalidate_cache_enter, image_ctx,
+             image_ctx->name.c_str(), image_ctx->snap_name.c_str(),
+             image_ctx->read_only);
+  int r = librbd::invalidate_cache(image_ctx);
+  tracepoint(librbd, invalidate_cache_exit, r);
+  return r;
+}
+
+// Collects up to `numcomp` completions via librbd::poll_io_events() and
+// stores their public handles (rbd_comp) in `comps`.
+//
+// Returns the value returned by librbd::poll_io_events(); when it is
+// positive, that many entries of `comps` have been filled in.
+extern "C" int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  // Heap-backed scratch array instead of a variable-length array: VLAs are a
+  // compiler extension in C++, are undefined for a non-positive length and
+  // put a caller-controlled amount of data on the stack.
+  // NOTE(review): for numcomp <= 0 the scratch array is empty and numcomp is
+  // still forwarded unchanged -- presumably poll_io_events rejects it;
+  // confirm.
+  std::vector<librbd::io::AioCompletion *> cs(
+    numcomp > 0 ? static_cast<size_t>(numcomp) : 0);
+  tracepoint(librbd, poll_io_events_enter, ictx, numcomp);
+  int r = librbd::poll_io_events(ictx, cs.data(), numcomp);
+  tracepoint(librbd, poll_io_events_exit, r);
+  if (r > 0) {
+    for (int i = 0; i < r; ++i)
+      comps[i] = cs[i]->rbd_comp;
+  }
+  return r;
+}
+
+// Looks up `key` via librbd::metadata_get() and copies the value, including
+// its terminating NUL, into `value`.
+//
+// *vallen is the buffer capacity on entry.  If it is smaller than the value
+// length plus one, it is set to that required size and -ERANGE is returned.
+// Otherwise returns the result of librbd::metadata_get().
+extern "C" int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *vallen)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  std::string stored;
+  tracepoint(librbd, metadata_get_enter, image_ctx, key);
+  int r = librbd::metadata_get(image_ctx, key, &stored);
+  if (r < 0) {
+    tracepoint(librbd, metadata_get_exit, r, key, NULL);
+    return r;
+  }
+
+  const size_t needed = stored.size() + 1;
+  if (*vallen < needed) {
+    r = -ERANGE;
+    *vallen = needed;
+    tracepoint(librbd, metadata_get_exit, r, key, NULL);
+    return r;
+  }
+
+  strncpy(value, stored.c_str(), needed);
+  tracepoint(librbd, metadata_get_exit, r, key, value);
+  return r;
+}
+
+// Forwards key and value to the image's operations->metadata_set() and
+// returns its result.
+extern "C" int rbd_metadata_set(rbd_image_t image, const char *key, const char *value)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, metadata_set_enter, image_ctx, key, value);
+  auto *ops = image_ctx->operations;
+  int r = ops->metadata_set(key, value);
+  tracepoint(librbd, metadata_set_exit, r);
+  return r;
+}
+
+// Forwards key to the image's operations->metadata_remove() and returns its
+// result.
+extern "C" int rbd_metadata_remove(rbd_image_t image, const char *key)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, metadata_remove_enter, image_ctx, key);
+  auto *ops = image_ctx->operations;
+  int r = ops->metadata_remove(key);
+  tracepoint(librbd, metadata_remove_exit, r);
+  return r;
+}
+
+// Lists metadata pairs via librbd::metadata_list() into two buffers: `key`
+// receives the keys and `value` the values, each entry NUL-terminated and
+// packed back to back.
+//
+// *key_len / *val_len are the buffer capacities on entry and are always
+// overwritten with the number of bytes required.  Returns -ERANGE if either
+// buffer is too small, otherwise the result of librbd::metadata_list().
+// NOTE(review): the result of metadata_list() is not checked before the
+// sizes are computed; on failure the code simply works with whatever is in
+// `pairs` and returns the error at the end.
+extern "C" int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+                                 char *key, size_t *key_len, char *value, size_t *val_len)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, metadata_list_enter, image_ctx);
+  std::map<std::string, bufferlist> pairs;
+  int r = librbd::metadata_list(image_ctx, start, max, &pairs);
+
+  // First pass: bytes needed per buffer (entry length plus terminator).
+  size_t keys_needed = 0;
+  size_t vals_needed = 0;
+  for (auto& kv : pairs) {
+    keys_needed += kv.first.size() + 1;
+    vals_needed += kv.second.length() + 1;
+  }
+
+  const bool too_short = (*key_len < keys_needed) || (*val_len < vals_needed);
+  *key_len = keys_needed;
+  *val_len = vals_needed;
+  if (too_short) {
+    tracepoint(librbd, metadata_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  // Second pass: copy keys and values out.
+  char *key_out = key;
+  char *val_out = value;
+  for (auto& kv : pairs) {
+    strncpy(key_out, kv.first.c_str(), kv.first.size() + 1);
+    key_out += kv.first.size() + 1;
+
+    // The value bytes are copied without a terminator, which is then
+    // written explicitly.
+    strncpy(val_out, kv.second.c_str(), kv.second.length());
+    val_out += kv.second.length();
+    *val_out = '\0';
+    val_out++;
+    tracepoint(librbd, metadata_list_entry, kv.first.c_str(), kv.second.c_str());
+  }
+  tracepoint(librbd, metadata_list_exit, r);
+  return r;
+}
+
+// Forwards to librbd::api::Mirror<>::image_enable() with its second argument
+// fixed to false.
+extern "C" int rbd_mirror_image_enable(rbd_image_t image)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  const int rc = librbd::api::Mirror<>::image_enable(image_ctx, false);
+  return rc;
+}
+
+// Forwards to librbd::api::Mirror<>::image_disable() with the caller's
+// `force` flag.
+extern "C" int rbd_mirror_image_disable(rbd_image_t image, bool force)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  const int rc = librbd::api::Mirror<>::image_disable(image_ctx, force);
+  return rc;
+}
+
+// Forwards to librbd::api::Mirror<>::image_promote() with the caller's
+// `force` flag.
+extern "C" int rbd_mirror_image_promote(rbd_image_t image, bool force)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  const int rc = librbd::api::Mirror<>::image_promote(image_ctx, force);
+  return rc;
+}
+
+// Forwards to librbd::api::Mirror<>::image_demote().
+extern "C" int rbd_mirror_image_demote(rbd_image_t image)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  const int rc = librbd::api::Mirror<>::image_demote(image_ctx);
+  return rc;
+}
+
+// Forwards to librbd::api::Mirror<>::image_resync().
+extern "C" int rbd_mirror_image_resync(rbd_image_t image)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  const int rc = librbd::api::Mirror<>::image_resync(image_ctx);
+  return rc;
+}
+
+// Fetches mirror image info via Mirror<>::image_get_info() and converts it
+// into the caller's C struct.
+//
+// Returns -ERANGE when info_size differs from
+// sizeof(rbd_mirror_image_info_t), the negative result of image_get_info()
+// on failure, and 0 on success.
+extern "C" int rbd_mirror_image_get_info(rbd_image_t image,
+                                         rbd_mirror_image_info_t *mirror_image_info,
+                                         size_t info_size)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+
+  const bool size_matches = (sizeof(rbd_mirror_image_info_t) == info_size);
+  if (!size_matches) {
+    return -ERANGE;
+  }
+
+  librbd::mirror_image_info_t cpp_info;
+  const int rc = librbd::api::Mirror<>::image_get_info(image_ctx, &cpp_info);
+  if (rc < 0) {
+    return rc;
+  }
+
+  mirror_image_info_cpp_to_c(cpp_info, mirror_image_info);
+  return 0;
+}
+
+// Fetches mirror image status via Mirror<>::image_get_status() and converts
+// it into the caller's C struct.
+//
+// Returns -ERANGE when status_size differs from
+// sizeof(rbd_mirror_image_status_t), the negative result of
+// image_get_status() on failure, and 0 on success.
+extern "C" int rbd_mirror_image_get_status(rbd_image_t image,
+                                           rbd_mirror_image_status_t *status,
+                                           size_t status_size)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+
+  const bool size_matches = (sizeof(rbd_mirror_image_status_t) == status_size);
+  if (!size_matches) {
+    return -ERANGE;
+  }
+
+  librbd::mirror_image_status_t cpp_status;
+  const int rc = librbd::api::Mirror<>::image_get_status(image_ctx,
+                                                         &cpp_status);
+  if (rc < 0) {
+    return rc;
+  }
+
+  mirror_image_status_cpp_to_c(cpp_status, status);
+  return 0;
+}
+
+// Fetches the instance id via Mirror<>::image_get_instance_id() and copies
+// it, NUL-terminated, into `instance_id`.
+//
+// *instance_id_max_length is the buffer capacity on entry.  Once the lookup
+// has succeeded it is overwritten with the id length plus one.  Returns
+// -ERANGE if the buffer cannot hold the id and its terminator, the negative
+// result of image_get_instance_id() on failure, and 0 on success.
+extern "C" int rbd_mirror_image_get_instance_id(rbd_image_t image,
+                                                char *instance_id,
+                                                size_t *instance_id_max_length)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+
+  std::string id;
+  const int rc = librbd::api::Mirror<>::image_get_instance_id(image_ctx, &id);
+  if (rc < 0) {
+    return rc;
+  }
+
+  // The required size is reported back on both paths below.
+  const bool fits = !(id.size() >= *instance_id_max_length);
+  *instance_id_max_length = id.size() + 1;
+  if (!fits) {
+    return -ERANGE;
+  }
+
+  strcpy(instance_id, id.c_str());
+  return 0;
+}
+
+// Asynchronous promote: wraps the caller's completion in a C_AioCompletion
+// (AIO_TYPE_GENERIC) and passes it to Mirror<>::image_promote().  Always
+// returns 0.
+extern "C" int rbd_aio_mirror_image_promote(rbd_image_t image, bool force,
+                                            rbd_completion_t c) {
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  auto *on_finish = new C_AioCompletion(image_ctx,
+                                        librbd::io::AIO_TYPE_GENERIC,
+                                        get_aio_completion(completion));
+  librbd::api::Mirror<>::image_promote(image_ctx, force, on_finish);
+  return 0;
+}
+
+// Asynchronous demote: wraps the caller's completion in a C_AioCompletion
+// (AIO_TYPE_GENERIC) and passes it to Mirror<>::image_demote().  Always
+// returns 0.
+extern "C" int rbd_aio_mirror_image_demote(rbd_image_t image,
+                                           rbd_completion_t c) {
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  auto *on_finish = new C_AioCompletion(image_ctx,
+                                        librbd::io::AIO_TYPE_GENERIC,
+                                        get_aio_completion(completion));
+  librbd::api::Mirror<>::image_demote(image_ctx, on_finish);
+  return 0;
+}
+
+// Asynchronous variant of rbd_mirror_image_get_info().
+//
+// Returns -ERANGE (nothing is queued) when info_size differs from
+// sizeof(rbd_mirror_image_info_t).  Otherwise a C_MirrorImageGetInfo context
+// bound to `info` and the caller's completion is created, its
+// cpp_mirror_image_info member is passed to Mirror<>::image_get_info() as
+// the output location, and 0 is returned.
+extern "C" int rbd_aio_mirror_image_get_info(rbd_image_t image,
+                                             rbd_mirror_image_info_t *info,
+                                             size_t info_size,
+                                             rbd_completion_t c) {
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+
+  const bool size_matches = (sizeof(rbd_mirror_image_info_t) == info_size);
+  if (!size_matches) {
+    return -ERANGE;
+  }
+
+  auto *on_finish = new C_AioCompletion(image_ctx,
+                                        librbd::io::AIO_TYPE_GENERIC,
+                                        get_aio_completion(completion));
+  auto get_info_ctx = new C_MirrorImageGetInfo(info, on_finish);
+  librbd::api::Mirror<>::image_get_info(
+    image_ctx, &get_info_ctx->cpp_mirror_image_info, get_info_ctx);
+  return 0;
+}
+
+// Asynchronous variant of rbd_mirror_image_get_status().
+//
+// Returns -ERANGE (nothing is queued) when status_size differs from
+// sizeof(rbd_mirror_image_status_t).  Otherwise a C_MirrorImageGetStatus
+// context bound to `status` and the caller's completion is created, its
+// cpp_mirror_image_status member is passed to Mirror<>::image_get_status()
+// as the output location, and 0 is returned.
+extern "C" int rbd_aio_mirror_image_get_status(rbd_image_t image,
+                                               rbd_mirror_image_status_t *status,
+                                               size_t status_size,
+                                               rbd_completion_t c) {
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+
+  const bool size_matches = (sizeof(rbd_mirror_image_status_t) == status_size);
+  if (!size_matches) {
+    return -ERANGE;
+  }
+
+  auto *on_finish = new C_AioCompletion(image_ctx,
+                                        librbd::io::AIO_TYPE_GENERIC,
+                                        get_aio_completion(completion));
+  auto get_status_ctx = new C_MirrorImageGetStatus(status, on_finish);
+  librbd::api::Mirror<>::image_get_status(
+    image_ctx, &get_status_ctx->cpp_mirror_image_status, get_status_ctx);
+  return 0;
+}
+
+// Registers an update watcher for the image.
+//
+// A C_UpdateWatchCB holding the callback and its argument is allocated and
+// registered with the image state.  *handle receives the address of that
+// object, which rbd_update_unwatch() later casts back and deletes.
+// NOTE(review): the object is neither freed nor withheld from *handle when
+// registration fails -- confirm whether register_update_watcher can fail.
+extern "C" int rbd_update_watch(rbd_image_t image, uint64_t *handle,
+                                rbd_update_callback_t watch_cb, void *arg)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto *watcher = new C_UpdateWatchCB(watch_cb, arg);
+  tracepoint(librbd, update_watch_enter, image_ctx, watcher);
+  int r = image_ctx->state->register_update_watcher(watcher,
+                                                    &watcher->handle);
+  tracepoint(librbd, update_watch_exit, r, watcher->handle);
+  *handle = reinterpret_cast<uint64_t>(watcher);
+  return r;
+}
+
+// Unregisters an update watcher.  `handle` is cast back to the
+// C_UpdateWatchCB pointer stored by rbd_update_watch(); the object is
+// deleted after unregistering, whatever the result.
+extern "C" int rbd_update_unwatch(rbd_image_t image, uint64_t handle)
+{
+  auto image_ctx = reinterpret_cast<librbd::ImageCtx*>(image);
+  auto *watcher = reinterpret_cast<C_UpdateWatchCB *>(handle);
+  tracepoint(librbd, update_unwatch_enter, image_ctx, watcher->handle);
+  int r = image_ctx->state->unregister_update_watcher(watcher->handle);
+  delete watcher;
+  tracepoint(librbd, update_unwatch_exit, r);
+  return r;
+}
+
+// Returns the result of is_complete() on the completion behind handle `c`.
+extern "C" int rbd_aio_is_complete(rbd_completion_t c)
+{
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  return completion->is_complete();
+}
+
+// Returns the result of wait_for_complete() on the completion behind
+// handle `c`.
+extern "C" int rbd_aio_wait_for_complete(rbd_completion_t c)
+{
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  return completion->wait_for_complete();
+}
+
+// Returns the result of get_return_value() on the completion behind
+// handle `c`.
+extern "C" ssize_t rbd_aio_get_return_value(rbd_completion_t c)
+{
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  return completion->get_return_value();
+}
+
+// Returns the result of get_arg() on the completion behind handle `c`.
+extern "C" void *rbd_aio_get_arg(rbd_completion_t c)
+{
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  return completion->get_arg();
+}
+
+// Calls release() on the completion behind handle `c`.
+extern "C" void rbd_aio_release(rbd_completion_t c)
+{
+  auto completion = reinterpret_cast<librbd::RBD::AioCompletion*>(c);
+  completion->release();
+}
+
+// Creates a group named `name` in the pool behind `p` via
+// librbd::api::Group<>::create() and returns its result.
+extern "C" int rbd_group_create(rados_ioctx_t p, const char *name)
+{
+  librados::IoCtx pool_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, pool_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(pool_ctx));
+  tracepoint(librbd, group_create_enter, pool_ctx.get_pool_name().c_str(),
+             pool_ctx.get_id(), name);
+  const int r = librbd::api::Group<>::create(pool_ctx, name);
+  tracepoint(librbd, group_create_exit, r);
+  return r;
+}
+
+// C binding for librbd::api::Group<>::remove(); returns that call's result.
+extern "C" int rbd_group_remove(rados_ioctx_t p, const char *name)
+{
+  librados::IoCtx pool_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, pool_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(pool_ioctx));
+  tracepoint(librbd, group_remove_enter, pool_ioctx.get_pool_name().c_str(),
+             pool_ioctx.get_id(), name);
+
+  const int ret = librbd::api::Group<>::remove(pool_ioctx, name);
+
+  tracepoint(librbd, group_remove_exit, ret);
+  return ret;
+}
+
+// Copies the group names of the pool behind p into names as a sequence of
+// NUL-terminated strings.
+//
+// *size is the capacity of names on input.  When it is too small it is
+// updated to the required byte count and -ERANGE is returned.  A NULL names
+// buffer (with sufficient *size) yields -EINVAL.  On success the return value
+// is the number of bytes required for all names including terminators.
+extern "C" int rbd_group_list(rados_ioctx_t p, char *names, size_t *size)
+{
+  librados::IoCtx pool_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, pool_ioctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(pool_ioctx));
+  tracepoint(librbd, group_list_enter, pool_ioctx.get_pool_name().c_str(),
+             pool_ioctx.get_id());
+
+  vector<string> group_names;
+  const int list_ret = librbd::api::Group<>::list(pool_ioctx, &group_names);
+  if (list_ret < 0) {
+    tracepoint(librbd, group_list_exit, list_ret);
+    return list_ret;
+  }
+
+  // Each name is stored together with its terminating NUL.
+  size_t required = 0;
+  for (const auto &group_name : group_names) {
+    required += group_name.size() + 1;
+  }
+
+  if (*size < required) {
+    *size = required;
+    tracepoint(librbd, group_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  if (names == nullptr) {
+    tracepoint(librbd, group_list_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  char *cursor = names;
+  for (const auto &group_name : group_names) {
+    const char *name = group_name.c_str();
+    tracepoint(librbd, group_list_entry, name);
+    strcpy(cursor, name);
+    cursor += strlen(cursor) + 1;
+  }
+
+  const int total = (int)required;
+  tracepoint(librbd, group_list_exit, total);
+  return total;
+}
+
+// C binding for librbd::api::Group<>::rename(); returns that call's result.
+extern "C" int rbd_group_rename(rados_ioctx_t p, const char *src_name,
+                                const char *dest_name)
+{
+  librados::IoCtx pool_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(p, pool_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(pool_ioctx));
+  tracepoint(librbd, group_rename_enter, pool_ioctx.get_pool_name().c_str(),
+             pool_ioctx.get_id(), src_name, dest_name);
+
+  const int ret = librbd::api::Group<>::rename(pool_ioctx, src_name,
+                                               dest_name);
+
+  tracepoint(librbd, group_rename_exit, ret);
+  return ret;
+}
+
+// C binding for librbd::api::Group<>::image_add().  group_p/group_name select
+// the group and image_p/image_name the image; returns that call's result.
+extern "C" int rbd_group_image_add(rados_ioctx_t group_p,
+                                   const char *group_name,
+                                   rados_ioctx_t image_p,
+                                   const char *image_name)
+{
+  librados::IoCtx grp_ioctx;
+  librados::IoCtx img_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, grp_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(image_p, img_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(grp_ioctx));
+  tracepoint(librbd, group_image_add_enter,
+             grp_ioctx.get_pool_name().c_str(), grp_ioctx.get_id(),
+             group_name,
+             img_ioctx.get_pool_name().c_str(), img_ioctx.get_id(),
+             image_name);
+
+  const int ret = librbd::api::Group<>::image_add(grp_ioctx, group_name,
+                                                  img_ioctx, image_name);
+
+  tracepoint(librbd, group_image_add_exit, ret);
+  return ret;
+}
+
+// C binding for librbd::api::Group<>::image_remove(), addressing the image by
+// name; returns that call's result.
+extern "C" int rbd_group_image_remove(rados_ioctx_t group_p,
+                                      const char *group_name,
+                                      rados_ioctx_t image_p,
+                                      const char *image_name)
+{
+  librados::IoCtx grp_ioctx;
+  librados::IoCtx img_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, grp_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(image_p, img_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(grp_ioctx));
+  tracepoint(librbd, group_image_remove_enter,
+             grp_ioctx.get_pool_name().c_str(), grp_ioctx.get_id(),
+             group_name,
+             img_ioctx.get_pool_name().c_str(), img_ioctx.get_id(),
+             image_name);
+
+  const int ret = librbd::api::Group<>::image_remove(grp_ioctx, group_name,
+                                                     img_ioctx, image_name);
+
+  tracepoint(librbd, group_image_remove_exit, ret);
+  return ret;
+}
+
+// C binding for librbd::api::Group<>::image_remove_by_id(), addressing the
+// image by id; returns that call's result.
+extern "C" int rbd_group_image_remove_by_id(rados_ioctx_t group_p,
+                                            const char *group_name,
+                                            rados_ioctx_t image_p,
+                                            const char *image_id)
+{
+  librados::IoCtx grp_ioctx;
+  librados::IoCtx img_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, grp_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(image_p, img_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(grp_ioctx));
+  tracepoint(librbd, group_image_remove_by_id_enter,
+             grp_ioctx.get_pool_name().c_str(), grp_ioctx.get_id(),
+             group_name,
+             img_ioctx.get_pool_name().c_str(), img_ioctx.get_id(),
+             image_id);
+
+  const int ret = librbd::api::Group<>::image_remove_by_id(
+    grp_ioctx, group_name, img_ioctx, image_id);
+
+  tracepoint(librbd, group_image_remove_by_id_exit, ret);
+  return ret;
+}
+
+// Lists the images that belong to group_name.
+//
+// images                 caller-allocated array with room for *image_size
+//                        entries; it is zero-filled before being populated
+// group_image_info_size  the caller's sizeof(rbd_group_image_info_t); a
+//                        mismatch is rejected with -ERANGE and *image_size = 0
+// image_size             in: capacity of images; out: number of entries
+//                        written, or the required capacity when -ERANGE
+//                        reports that the array is too small
+//
+// Returns the number of images on success (0 when the lookup reports
+// -ENOENT) or a negative errno value.
+extern "C" int rbd_group_image_list(rados_ioctx_t group_p,
+                                    const char *group_name,
+                                    rbd_group_image_info_t *images,
+                                    size_t group_image_info_size,
+                                    size_t *image_size)
+{
+  librados::IoCtx group_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+  tracepoint(librbd, group_image_list_enter,
+             group_ioctx.get_pool_name().c_str(),
+             group_ioctx.get_id(), group_name);
+
+  // Validate the caller's struct layout before touching the output array:
+  // the memset below is sized with this library's sizeof(), so running it on
+  // an array built from a different struct size could write out of bounds.
+  if (group_image_info_size != sizeof(rbd_group_image_info_t)) {
+    *image_size = 0;
+    tracepoint(librbd, group_image_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(images, 0, sizeof(*images) * *image_size);
+
+  std::vector<librbd::group_image_info_t> cpp_images;
+  int r = librbd::api::Group<>::image_list(group_ioctx, group_name,
+                                           &cpp_images);
+
+  // A missing group/listing is reported to the caller as an empty list.
+  if (r == -ENOENT) {
+    tracepoint(librbd, group_image_list_exit, 0);
+    *image_size = 0;
+    return 0;
+  }
+
+  if (r < 0) {
+    tracepoint(librbd, group_image_list_exit, r);
+    return r;
+  }
+
+  // Tell the caller how many entries are needed when the array is too small.
+  if (*image_size < cpp_images.size()) {
+    *image_size = cpp_images.size();
+    tracepoint(librbd, group_image_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  for (size_t i = 0; i < cpp_images.size(); ++i) {
+    group_image_status_cpp_to_c(cpp_images[i], &images[i]);
+  }
+
+  r = *image_size = cpp_images.size();
+  tracepoint(librbd, group_image_list_exit, r);
+  return r;
+}
+
+// Frees the name string held by *group_info.  Returns -ERANGE (and frees
+// nothing) when group_info_size is not sizeof(rbd_group_info_t), 0 otherwise.
+extern "C" int rbd_group_info_cleanup(rbd_group_info_t *group_info,
+                                      size_t group_info_size) {
+  const bool size_matches = (group_info_size == sizeof(rbd_group_info_t));
+  if (!size_matches) {
+    return -ERANGE;
+  }
+
+  free(group_info->name);
+  return 0;
+}
+
+// Frees the name string of each of the first len entries of images.  Returns
+// -ERANGE (and frees nothing) when group_image_info_size is not
+// sizeof(rbd_group_image_info_t), 0 otherwise.
+extern "C" int rbd_group_image_list_cleanup(rbd_group_image_info_t *images,
+                                            size_t group_image_info_size,
+                                            size_t len) {
+  const bool size_matches =
+    (group_image_info_size == sizeof(rbd_group_image_info_t));
+  if (!size_matches) {
+    return -ERANGE;
+  }
+
+  for (size_t idx = 0; idx != len; ++idx) {
+    free(images[idx].name);
+  }
+  return 0;
+}
+
+// C binding for librbd::api::Group<>::snap_create(); returns that call's
+// result.
+extern "C" int rbd_group_snap_create(rados_ioctx_t group_p,
+                                     const char *group_name,
+                                     const char *snap_name)
+{
+  librados::IoCtx grp_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, grp_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(grp_ioctx));
+  tracepoint(librbd, group_snap_create_enter,
+             grp_ioctx.get_pool_name().c_str(), grp_ioctx.get_id(),
+             group_name, snap_name);
+
+  const int ret = librbd::api::Group<>::snap_create(grp_ioctx, group_name,
+                                                    snap_name);
+
+  tracepoint(librbd, group_snap_create_exit, ret);
+  return ret;
+}
+
+// C binding for librbd::api::Group<>::snap_remove(); returns that call's
+// result.
+extern "C" int rbd_group_snap_remove(rados_ioctx_t group_p,
+                                     const char *group_name,
+                                     const char *snap_name)
+{
+  librados::IoCtx grp_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, grp_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(grp_ioctx));
+  tracepoint(librbd, group_snap_remove_enter,
+             grp_ioctx.get_pool_name().c_str(), grp_ioctx.get_id(),
+             group_name, snap_name);
+
+  const int ret = librbd::api::Group<>::snap_remove(grp_ioctx, group_name,
+                                                    snap_name);
+
+  tracepoint(librbd, group_snap_remove_exit, ret);
+  return ret;
+}
+
+// C binding for librbd::api::Group<>::snap_rename(): renames group snapshot
+// old_snap_name of group_name to new_snap_name and returns that call's
+// result.
+extern "C" int rbd_group_snap_rename(rados_ioctx_t group_p,
+                                     const char *group_name,
+                                     const char *old_snap_name,
+                                     const char *new_snap_name)
+{
+  librados::IoCtx group_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+  tracepoint(librbd, group_snap_rename_enter,
+             group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+             group_name, old_snap_name, new_snap_name);
+
+  int r = librbd::api::Group<>::snap_rename(group_ioctx, group_name,
+                                            old_snap_name, new_snap_name);
+
+  // Pair the enter event with the rename exit event; this previously emitted
+  // group_snap_list_exit, which mislabelled the trace.
+  tracepoint(librbd, group_snap_rename_exit, r);
+  return r;
+}
+
+// Lists the snapshots of group_name.
+//
+// snaps                 caller-allocated array with room for *snaps_size
+//                       entries; it is zero-filled before being populated
+// group_snap_info_size  the caller's sizeof(rbd_group_snap_info_t); a
+//                       mismatch is rejected with -ERANGE and *snaps_size = 0
+// snaps_size            in: capacity of snaps; out: number of entries
+//                       written, or the required capacity when -ERANGE
+//                       reports that the array is too small
+//
+// Returns the number of snapshots on success (0 when the lookup reports
+// -ENOENT) or a negative errno value.
+extern "C" int rbd_group_snap_list(rados_ioctx_t group_p,
+                                   const char *group_name,
+                                   rbd_group_snap_info_t *snaps,
+                                   size_t group_snap_info_size,
+                                   size_t *snaps_size)
+{
+  librados::IoCtx group_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+  tracepoint(librbd, group_snap_list_enter, group_ioctx.get_pool_name().c_str(),
+             group_ioctx.get_id(), group_name);
+
+  // Validate the caller's struct layout before touching the output array:
+  // the memset below is sized with this library's sizeof(), so running it on
+  // an array built from a different struct size could write out of bounds.
+  if (group_snap_info_size != sizeof(rbd_group_snap_info_t)) {
+    *snaps_size = 0;
+    tracepoint(librbd, group_snap_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(snaps, 0, sizeof(*snaps) * *snaps_size);
+
+  std::vector<librbd::group_snap_info_t> cpp_snaps;
+  int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &cpp_snaps);
+
+  // A missing group/listing is reported to the caller as an empty list.
+  if (r == -ENOENT) {
+    *snaps_size = 0;
+    tracepoint(librbd, group_snap_list_exit, 0);
+    return 0;
+  }
+
+  if (r < 0) {
+    tracepoint(librbd, group_snap_list_exit, r);
+    return r;
+  }
+
+  // Tell the caller how many entries are needed when the array is too small.
+  if (*snaps_size < cpp_snaps.size()) {
+    *snaps_size = cpp_snaps.size();
+    tracepoint(librbd, group_snap_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  for (size_t i = 0; i < cpp_snaps.size(); ++i) {
+    group_snap_info_cpp_to_c(cpp_snaps[i], &snaps[i]);
+  }
+
+  r = *snaps_size = cpp_snaps.size();
+  tracepoint(librbd, group_snap_list_exit, r);
+  return r;
+}
+
+// Frees the name string of each of the first len entries of snaps.  Returns
+// -ERANGE (and frees nothing) when group_snap_info_size is not
+// sizeof(rbd_group_snap_info_t), 0 otherwise.
+extern "C" int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps,
+                                           size_t group_snap_info_size,
+                                           size_t len) {
+  const bool size_matches =
+    (group_snap_info_size == sizeof(rbd_group_snap_info_t));
+  if (!size_matches) {
+    return -ERANGE;
+  }
+
+  for (size_t idx = 0; idx != len; ++idx) {
+    free(snaps[idx].name);
+  }
+  return 0;
+}
+
+// C binding for librbd::api::Group<>::snap_rollback() using a
+// NoOpProgressContext; returns that call's result.
+extern "C" int rbd_group_snap_rollback(rados_ioctx_t group_p,
+                                       const char *group_name,
+                                       const char *snap_name)
+{
+  librados::IoCtx grp_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, grp_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(grp_ioctx));
+  tracepoint(librbd, group_snap_rollback_enter,
+             grp_ioctx.get_pool_name().c_str(), grp_ioctx.get_id(),
+             group_name, snap_name);
+
+  librbd::NoOpProgressContext no_progress;
+  const int ret = librbd::api::Group<>::snap_rollback(
+    grp_ioctx, group_name, snap_name, no_progress);
+
+  tracepoint(librbd, group_snap_rollback_exit, ret);
+  return ret;
+}
+
+// Same rollback call as rbd_group_snap_rollback(), but progress is reported
+// through a CProgressContext built from cb/cbdata.  It emits the same
+// group_snap_rollback enter/exit tracepoints.
+extern "C" int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p,
+                                                     const char *group_name,
+                                                     const char *snap_name,
+                                                     librbd_progress_fn_t cb,
+                                                     void *cbdata)
+{
+  librados::IoCtx grp_ioctx;
+  librados::IoCtx::from_rados_ioctx_t(group_p, grp_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(grp_ioctx));
+  tracepoint(librbd, group_snap_rollback_enter,
+             grp_ioctx.get_pool_name().c_str(), grp_ioctx.get_id(),
+             group_name, snap_name);
+
+  librbd::CProgressContext progress(cb, cbdata);
+  const int ret = librbd::api::Group<>::snap_rollback(
+    grp_ioctx, group_name, snap_name, progress);
+
+  tracepoint(librbd, group_snap_rollback_exit, ret);
+  return ret;
+}
+
+// C binding for librbd::api::Snapshot<>::get_namespace_type(); the result
+// for snap_id is written through namespace_type by that call.
+extern "C" int rbd_snap_get_namespace_type(rbd_image_t image,
+                                           uint64_t snap_id,
+                                           rbd_snap_namespace_type_t *namespace_type) {
+  auto *image_ctx = static_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_get_namespace_type_enter, image_ctx,
+             image_ctx->name.c_str());
+
+  const int ret = librbd::api::Snapshot<>::get_namespace_type(
+    image_ctx, snap_id, namespace_type);
+
+  tracepoint(librbd, snap_get_namespace_type_exit, ret);
+  return ret;
+}
+
+// Fetches the group namespace of snapshot snap_id into *group_snap.
+//
+// snap_group_namespace_size must equal sizeof(rbd_snap_group_namespace_t),
+// otherwise -ERANGE is returned.  On success group_name and group_snap_name
+// are strdup()'d copies; rbd_snap_group_namespace_cleanup() frees them.
+extern "C" int rbd_snap_get_group_namespace(rbd_image_t image, uint64_t snap_id,
+                                            rbd_snap_group_namespace_t *group_snap,
+                                            size_t snap_group_namespace_size) {
+  auto *image_ctx = static_cast<librbd::ImageCtx *>(image);
+  tracepoint(librbd, snap_get_group_namespace_enter, image_ctx,
+             image_ctx->name.c_str());
+
+  if (snap_group_namespace_size != sizeof(rbd_snap_group_namespace_t)) {
+    tracepoint(librbd, snap_get_group_namespace_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  librbd::snap_group_namespace_t cpp_namespace;
+  const int ret = librbd::api::Snapshot<>::get_group_namespace(
+    image_ctx, snap_id, &cpp_namespace);
+  if (ret >= 0) {
+    // Hand C-string copies of the C++ fields to the caller.
+    group_snap->group_pool = cpp_namespace.group_pool;
+    group_snap->group_name = strdup(cpp_namespace.group_name.c_str());
+    group_snap->group_snap_name =
+      strdup(cpp_namespace.group_snap_name.c_str());
+  }
+
+  tracepoint(librbd, snap_get_group_namespace_exit, ret);
+  return ret;
+}
+
+// Frees the group_name and group_snap_name strings held by *group_snap.
+// Returns -ERANGE (and frees nothing) when snap_group_namespace_size is not
+// sizeof(rbd_snap_group_namespace_t), 0 otherwise.
+extern "C" int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap,
+                                                size_t snap_group_namespace_size) {
+  if (snap_group_namespace_size != sizeof(rbd_snap_group_namespace_t)) {
+    // No tracepoint here: this helper never emits an enter event, so firing
+    // snap_get_group_namespace_exit (as it used to) produced an unpaired
+    // event that belongs to rbd_snap_get_group_namespace().
+    return -ERANGE;
+  }
+
+  free(group_snap->group_name);
+  free(group_snap->group_snap_name);
+  return 0;
+}
+
+// Copies the original name of trashed snapshot snap_id into original_name.
+// Returns the lookup error if it fails, -ERANGE when the name plus its
+// terminating NUL does not fit in max_length bytes, and 0 on success.
+extern "C" int rbd_snap_get_trash_namespace(rbd_image_t image, uint64_t snap_id,
+                                            char *original_name,
+                                            size_t max_length) {
+  auto *image_ctx = static_cast<librbd::ImageCtx *>(image);
+
+  std::string trash_name;
+  const int ret = librbd::api::Snapshot<>::get_trash_namespace(
+    image_ctx, snap_id, &trash_name);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // One byte must remain for the terminator.
+  const bool fits = trash_name.length() < max_length;
+  if (!fits) {
+    return -ERANGE;
+  }
+
+  strcpy(original_name, trash_name.c_str());
+  return 0;
+}
+// Fills watchers with the image's current watchers.
+//
+// *max_watchers is the capacity of watchers on input.  When the list does not
+// fit, *max_watchers is set to the required count and -ERANGE is returned.
+// On success it is set to the number of entries written; each entry's addr is
+// a strdup()'d string that rbd_watchers_list_cleanup() frees.
+extern "C" int rbd_watchers_list(rbd_image_t image,
+                                 rbd_image_watcher_t *watchers,
+                                 size_t *max_watchers) {
+  std::list<librbd::image_watcher_t> found;
+  auto *image_ctx = static_cast<librbd::ImageCtx *>(image);
+
+  tracepoint(librbd, list_watchers_enter, image_ctx, image_ctx->name.c_str(),
+             image_ctx->snap_name.c_str(), image_ctx->read_only);
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(watchers, 0, sizeof(*watchers) * *max_watchers);
+
+  const int r = librbd::list_watchers(image_ctx, found);
+  if (r < 0) {
+    tracepoint(librbd, list_watchers_exit, r, 0);
+    return r;
+  }
+
+  if (found.size() > *max_watchers) {
+    *max_watchers = found.size();
+    tracepoint(librbd, list_watchers_exit, -ERANGE, found.size());
+    return -ERANGE;
+  }
+
+  size_t filled = 0;
+  for (auto &watcher : found) {
+    tracepoint(librbd, list_watchers_entry, watcher.addr.c_str(), watcher.id,
+               watcher.cookie);
+    rbd_image_watcher_t &slot = watchers[filled];
+    slot.addr = strdup(watcher.addr.c_str());
+    slot.id = watcher.id;
+    slot.cookie = watcher.cookie;
+    ++filled;
+  }
+  *max_watchers = filled;
+
+  tracepoint(librbd, list_watchers_exit, r, found.size());
+  return 0;
+}
+
+// Frees the addr string of each of the first num_watchers entries.
+extern "C" void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers,
+                                          size_t num_watchers) {
+  for (size_t idx = 0; idx != num_watchers; ++idx) {
+    free(watchers[idx].addr);
+  }
+}
+
+// Fills options with the image's config options.
+//
+// *max_options is the capacity of options on input.  When the list does not
+// fit, *max_options is set to the required count and -ERANGE is returned.
+// On success it is set to the number of entries written and 0 is returned.
+extern "C" int rbd_config_image_list(rbd_image_t image,
+                                     rbd_config_option_t *options,
+                                     int *max_options) {
+  auto *image_ctx = static_cast<librbd::ImageCtx *>(image);
+
+  std::vector<librbd::config_option_t> cpp_options;
+  const int ret = librbd::api::Config<>::list(image_ctx, &cpp_options);
+  if (ret < 0) {
+    return ret;
+  }
+
+  const int option_count = static_cast<int>(cpp_options.size());
+  if (*max_options < option_count) {
+    *max_options = option_count;
+    return -ERANGE;
+  }
+
+  for (int idx = 0; idx < option_count; ++idx) {
+    config_option_cpp_to_c(cpp_options[idx], &options[idx]);
+  }
+  *max_options = option_count;
+  return 0;
+}
+
+// Runs config_option_cleanup() on each of the first max_options entries.
+extern "C" void rbd_config_image_list_cleanup(rbd_config_option_t *options,
+                                              int max_options) {
+  for (int idx = 0; idx < max_options; ++idx) {
+    config_option_cleanup(options[idx]);
+  }
+}
diff --git a/src/librbd/managed_lock/AcquireRequest.cc b/src/librbd/managed_lock/AcquireRequest.cc
new file mode 100644
index 00000000..9b3e5e87
--- /dev/null
+++ b/src/librbd/managed_lock/AcquireRequest.cc
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/AcquireRequest.h"
+#include "librbd/Watcher.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/managed_lock/BreakRequest.h"
+#include "librbd/managed_lock/GetLockerRequest.h"
+#include "librbd/managed_lock/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::AcquireRequest: " << this \
+ << " " << __func__ << ": "
+
+using std::string;
+
+namespace librbd {
+
+using librbd::util::detail::C_AsyncCallback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace managed_lock {
+
+// Factory: heap-allocates a request from the given arguments.  The request
+// deletes itself in finish().
+template <typename I>
+AcquireRequest<I>* AcquireRequest<I>::create(librados::IoCtx& ioctx,
+                                             Watcher *watcher,
+                                             ContextWQ *work_queue,
+                                             const string& oid,
+                                             const string& cookie,
+                                             bool exclusive,
+                                             bool blacklist_on_break_lock,
+                                             uint32_t blacklist_expire_seconds,
+                                             Context *on_finish) {
+  auto *request = new AcquireRequest(
+    ioctx, watcher, work_queue, oid, cookie, exclusive,
+    blacklist_on_break_lock, blacklist_expire_seconds, on_finish);
+  return request;
+}
+
+// Stores the request parameters.  The CephContext is taken from the IoCtx and
+// on_finish is wrapped in a C_AsyncCallback bound to work_queue.
+template <typename I>
+AcquireRequest<I>::AcquireRequest(librados::IoCtx& ioctx, Watcher *watcher,
+                                  ContextWQ *work_queue, const string& oid,
+                                  const string& cookie, bool exclusive,
+                                  bool blacklist_on_break_lock,
+                                  uint32_t blacklist_expire_seconds,
+                                  Context *on_finish)
+  : m_ioctx(ioctx),
+    m_watcher(watcher),
+    m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())),
+    m_work_queue(work_queue),
+    m_oid(oid),
+    m_cookie(cookie),
+    m_exclusive(exclusive),
+    m_blacklist_on_break_lock(blacklist_on_break_lock),
+    m_blacklist_expire_seconds(blacklist_expire_seconds),
+    m_on_finish(new C_AsyncCallback<ContextWQ>(work_queue, on_finish)) {
+}
+
+// Nothing to release explicitly.
+template <typename I>
+AcquireRequest<I>::~AcquireRequest() = default;
+
+// Entry point: the request begins by looking up the current locker.
+template <typename I>
+void AcquireRequest<I>::send() {
+  this->send_get_locker();
+}
+
+// Issues a GetLockerRequest that fills m_locker and resumes in
+// handle_get_locker().
+template <typename I>
+void AcquireRequest<I>::send_get_locker() {
+  ldout(m_cct, 10) << dendl;
+
+  using klass = AcquireRequest<I>;
+  Context *on_get_locker =
+    create_context_callback<klass, &klass::handle_get_locker>(this);
+  auto get_locker_req = GetLockerRequest<I>::create(
+    m_ioctx, m_oid, m_exclusive, &m_locker, on_get_locker);
+  get_locker_req->send();
+}
+
+template <typename I>
+void AcquireRequest<I>::handle_get_locker(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ ldout(m_cct, 20) << "no lockers detected" << dendl;
+ m_locker = {};
+ } else if (r == -EBUSY) {
+ ldout(m_cct, 5) << "incompatible lock detected" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_lock();
+}
+
+// Submits the cls lock operation (exclusive or shared, per m_exclusive) on
+// m_oid; completion is delivered to handle_lock().
+template <typename I>
+void AcquireRequest<I>::send_lock() {
+  ldout(m_cct, 10) << "entity=client." << m_ioctx.get_instance_id() << ", "
+                   << "cookie=" << m_cookie << dendl;
+
+  const auto lock_type = m_exclusive ? LOCK_EXCLUSIVE : LOCK_SHARED;
+
+  librados::ObjectWriteOperation lock_op;
+  rados::cls::lock::lock(&lock_op, RBD_LOCK_NAME, lock_type, m_cookie,
+                         util::get_watcher_lock_tag(), "", utime_t(), 0);
+
+  librados::AioCompletion *completion = create_rados_callback<
+    AcquireRequest<I>, &AcquireRequest<I>::handle_lock>(this);
+  int r = m_ioctx.aio_operate(m_oid, completion, &lock_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+template <typename I>
+void AcquireRequest<I>::handle_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ finish(0);
+ return;
+ } else if (r == -EBUSY && m_locker.cookie.empty()) {
+ ldout(m_cct, 5) << "already locked, refreshing locker" << dendl;
+ send_get_locker();
+ return;
+ } else if (r != -EBUSY) {
+ lderr(m_cct) << "failed to lock: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_break_lock();
+}
+
+// Starts a BreakRequest against the cached locker (force_break_lock is passed
+// as false); completion is delivered to handle_break_lock().
+template <typename I>
+void AcquireRequest<I>::send_break_lock() {
+  ldout(m_cct, 10) << dendl;
+
+  using klass = AcquireRequest<I>;
+  Context *on_break =
+    create_context_callback<klass, &klass::handle_break_lock>(this);
+  auto break_req = BreakRequest<I>::create(
+    m_ioctx, m_work_queue, m_oid, m_locker, m_exclusive,
+    m_blacklist_on_break_lock, m_blacklist_expire_seconds, false, on_break);
+  break_req->send();
+}
+
+// Result of the break attempt: any failure ends the request with that code
+// (-EAGAIN is logged as "owner still alive" rather than as an error); on
+// success the cached locker is cleared and the lock is retried.
+template <typename I>
+void AcquireRequest<I>::handle_break_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -EAGAIN) {
+      ldout(m_cct, 5) << "lock owner is still alive" << dendl;
+    } else {
+      lderr(m_cct) << "failed to break lock : " << cpp_strerror(r) << dendl;
+    }
+    finish(r);
+    return;
+  }
+
+  m_locker = {};
+  send_lock();
+}
+
+// Completes the finish context with r, then destroys this request.
+template <typename I>
+void AcquireRequest<I>::finish(int r) {
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::AcquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/AcquireRequest.h b/src/librbd/managed_lock/AcquireRequest.h
new file mode 100644
index 00000000..20af0693
--- /dev/null
+++ b/src/librbd/managed_lock/AcquireRequest.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "msg/msg_types.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/watcher/Types.h"
+#include <string>
+
+class Context;
+class ContextWQ;
+
+namespace librbd {
+
+class Watcher;
+
+namespace managed_lock {
+
+template <typename ImageCtxT>
+class AcquireRequest { // asynchronous request that takes the RBD lock on an object
+private:
+ typedef watcher::Traits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Watcher Watcher;
+
+public:
+ static AcquireRequest* create(librados::IoCtx& ioctx, Watcher *watcher,
+ ContextWQ *work_queue, const std::string& oid,
+ const std::string& cookie,
+ bool exclusive,
+ bool blacklist_on_break_lock,
+ uint32_t blacklist_expire_seconds,
+ Context *on_finish);
+
+ ~AcquireRequest();
+ void send(); // starts the request at GET_LOCKER
+
+private:
+
+ /**
+ * State flow of the request; each state has a send_* / handle_* pair below.
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_LOCKER
+ * | ^
+ * | . (EBUSY && no cached locker)
+ * | .
+ * | . (EBUSY && cached locker)
+ * \--> LOCK_IMAGE * * * * * * * * > BREAK_LOCK . . . . .
+ * | ^ | .
+ * | | | (success) .
+ * | \-------------------------/ .
+ * v .
+ * <finish> < . . . . . . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ AcquireRequest(librados::IoCtx& ioctx, Watcher *watcher,
+ ContextWQ *work_queue, const std::string& oid,
+ const std::string& cookie, bool exclusive,
+ bool blacklist_on_break_lock,
+ uint32_t blacklist_expire_seconds, Context *on_finish);
+
+ librados::IoCtx& m_ioctx;
+ Watcher *m_watcher;
+ CephContext *m_cct; // obtained from m_ioctx.cct() in the constructor
+ ContextWQ *m_work_queue;
+ std::string m_oid; // object the lock operations are issued against
+ std::string m_cookie; // cookie passed to the cls lock call
+ bool m_exclusive; // true: LOCK_EXCLUSIVE, false: LOCK_SHARED
+ bool m_blacklist_on_break_lock; // forwarded to BreakRequest
+ uint32_t m_blacklist_expire_seconds; // forwarded to BreakRequest
+ Context *m_on_finish; // constructor wraps the caller's context in C_AsyncCallback
+
+ bufferlist m_out_bl; // NOTE(review): not referenced in AcquireRequest.cc -- confirm it is still needed
+
+ Locker m_locker; // locker filled by GET_LOCKER; reset to {} when none exists or after a break
+
+ void send_get_locker();
+ void handle_get_locker(int r);
+
+ void send_lock();
+ void handle_lock(int r);
+
+ void send_break_lock();
+ void handle_break_lock(int r);
+
+ void finish(int r); // completes m_on_finish and deletes this
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H
diff --git a/src/librbd/managed_lock/BreakRequest.cc b/src/librbd/managed_lock/BreakRequest.cc
new file mode 100644
index 00000000..8caaea6f
--- /dev/null
+++ b/src/librbd/managed_lock/BreakRequest.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/BreakRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/managed_lock/GetLockerRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::BreakRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace managed_lock {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+namespace {
+
+// Context that blacklists locker_address for expire_seconds via
+// librados::Rados::blacklist_add() and completes on_finish with the result.
+struct C_BlacklistClient : public Context {
+  librados::IoCtx &ioctx;
+  std::string locker_address;
+  uint32_t expire_seconds;
+  Context *on_finish;
+
+  C_BlacklistClient(librados::IoCtx &ioctx, const std::string &locker_address,
+                    uint32_t expire_seconds, Context *on_finish)
+    : ioctx(ioctx),
+      locker_address(locker_address),
+      expire_seconds(expire_seconds),
+      on_finish(on_finish) {
+  }
+
+  // The incoming result is ignored; on_finish receives the blacklist_add()
+  // return value.
+  void finish(int /*r*/) override {
+    librados::Rados rados(ioctx);
+    const int blacklist_ret =
+      rados.blacklist_add(locker_address, expire_seconds);
+    on_finish->complete(blacklist_ret);
+  }
+};
+
+} // anonymous namespace
+
+// Stores the request parameters; the CephContext is taken from the IoCtx.
+template <typename I>
+BreakRequest<I>::BreakRequest(librados::IoCtx& ioctx, ContextWQ *work_queue,
+                              const std::string& oid, const Locker &locker,
+                              bool exclusive, bool blacklist_locker,
+                              uint32_t blacklist_expire_seconds,
+                              bool force_break_lock, Context *on_finish)
+  : m_ioctx(ioctx),
+    m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())),
+    m_work_queue(work_queue),
+    m_oid(oid),
+    m_locker(locker),
+    m_exclusive(exclusive),
+    m_blacklist_locker(blacklist_locker),
+    m_blacklist_expire_seconds(blacklist_expire_seconds),
+    m_force_break_lock(force_break_lock),
+    m_on_finish(on_finish) {
+}
+
+// Entry point: the request begins by listing the object's watchers.
+template <typename I>
+void BreakRequest<I>::send() {
+  this->send_get_watchers();
+}
+
+// Submits a list_watchers read on m_oid that fills m_watchers and
+// m_watchers_ret_val; completion is delivered to handle_get_watchers().
+template <typename I>
+void BreakRequest<I>::send_get_watchers() {
+  ldout(m_cct, 10) << dendl;
+
+  librados::ObjectReadOperation read_op;
+  read_op.list_watchers(&m_watchers, &m_watchers_ret_val);
+
+  librados::AioCompletion *completion = create_rados_callback<
+    BreakRequest<I>, &BreakRequest<I>::handle_get_watchers>(this);
+  m_out_bl.clear();
+  int r = m_ioctx.aio_operate(m_oid, completion, &read_op, &m_out_bl);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Result of the watcher listing.  A watcher whose address and cookie match
+// the locker's address and handle means the lock owner is alive; unless
+// m_force_break_lock is set the request then ends with -EAGAIN.  Otherwise
+// the locker is re-read.
+template <typename I>
+void BreakRequest<I>::handle_get_watchers(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  // Fall back to the per-operation result when the aio itself succeeded.
+  if (r == 0) {
+    r = m_watchers_ret_val;
+  }
+  if (r < 0) {
+    lderr(m_cct) << "failed to retrieve watchers: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  bool owner_alive = false;
+  for (auto &watcher : m_watchers) {
+    ldout(m_cct, 20) << "watcher=["
+                     << "addr=" << watcher.addr << ", "
+                     << "entity=client." << watcher.watcher_id << "]" << dendl;
+
+    const bool same_address = (strncmp(m_locker.address.c_str(), watcher.addr,
+                                       sizeof(watcher.addr)) == 0);
+    if (same_address && (m_locker.handle == watcher.cookie)) {
+      ldout(m_cct, 10) << "lock owner is still alive" << dendl;
+      owner_alive = true;
+    }
+  }
+
+  if (owner_alive && !m_force_break_lock) {
+    finish(-EAGAIN);
+    return;
+  }
+
+  send_get_locker();
+}
+
+// Issues a GetLockerRequest that fills m_refreshed_locker and resumes in
+// handle_get_locker().
+template <typename I>
+void BreakRequest<I>::send_get_locker() {
+  ldout(m_cct, 10) << dendl;
+
+  Context *on_get_locker = create_context_callback<
+    BreakRequest<I>, &BreakRequest<I>::handle_get_locker>(this);
+  auto get_locker_req = GetLockerRequest<I>::create(
+    m_ioctx, m_oid, m_exclusive, &m_refreshed_locker, on_get_locker);
+  get_locker_req->send();
+}
+
+// Result of the locker refresh.  -ENOENT finishes successfully (nothing to
+// break); -EBUSY clears the refreshed locker; other errors fail the request.
+// If the refreshed locker differs from the one to break, or is empty, the
+// request ends with -EAGAIN; otherwise blacklisting follows.
+template <typename I>
+void BreakRequest<I>::handle_get_locker(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    ldout(m_cct, 5) << "no lock owner" << dendl;
+    finish(0);
+    return;
+  }
+
+  if (r == -EBUSY) {
+    m_refreshed_locker = {};
+  } else if (r < 0) {
+    lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  const bool ownership_changed =
+    (m_refreshed_locker != m_locker || m_refreshed_locker == Locker{});
+  if (ownership_changed) {
+    ldout(m_cct, 5) << "no longer lock owner" << dendl;
+    finish(-EAGAIN);
+    return;
+  }
+
+  send_blacklist();
+}
+
+// Queues a C_BlacklistClient for the locker's address on the work queue, or
+// goes straight to breaking the lock when blacklisting is disabled.  A locker
+// entity equal to the local client ends the request with -EINVAL.
+template <typename I>
+void BreakRequest<I>::send_blacklist() {
+  if (!m_blacklist_locker) {
+    send_break_lock();
+    return;
+  }
+
+  entity_name_t local_entity =
+    entity_name_t::CLIENT(m_ioctx.get_instance_id());
+  ldout(m_cct, 10) << "local entity=" << local_entity << ", "
+                   << "locker entity=" << m_locker.entity << dendl;
+
+  if (m_locker.entity == local_entity) {
+    lderr(m_cct) << "attempting to self-blacklist" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  // TODO: need async version of RadosClient::blacklist_add
+  Context *on_blacklist = create_context_callback<
+    BreakRequest<I>, &BreakRequest<I>::handle_blacklist>(this);
+  auto *blacklist_ctx = new C_BlacklistClient(
+    m_ioctx, m_locker.address, m_blacklist_expire_seconds, on_blacklist);
+  m_work_queue->queue(blacklist_ctx, 0);
+}
+
+// Result of the blacklist step: a failure ends the request with that code,
+// success proceeds to breaking the lock.
+template <typename I>
+void BreakRequest<I>::handle_blacklist(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    send_break_lock();
+    return;
+  }
+
+  lderr(m_cct) << "failed to blacklist lock owner: " << cpp_strerror(r)
+               << dendl;
+  finish(r);
+}
+
+// Submits the cls break_lock operation for the locker's cookie and entity on
+// m_oid; completion is delivered to handle_break_lock().
+template <typename I>
+void BreakRequest<I>::send_break_lock() {
+  ldout(m_cct, 10) << dendl;
+
+  librados::ObjectWriteOperation break_op;
+  rados::cls::lock::break_lock(&break_op, RBD_LOCK_NAME, m_locker.cookie,
+                               m_locker.entity);
+
+  librados::AioCompletion *completion = create_rados_callback<
+    BreakRequest<I>, &BreakRequest<I>::handle_break_lock>(this);
+  int r = m_ioctx.aio_operate(m_oid, completion, &break_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Result of the break operation: -ENOENT is treated like success; any other
+// error ends the request with that code.
+template <typename I>
+void BreakRequest<I>::handle_break_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  const bool broken = (r >= 0 || r == -ENOENT);
+  if (broken) {
+    finish(0);
+    return;
+  }
+
+  lderr(m_cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
+  finish(r);
+}
+
+// Logs r, completes the finish context with it, then destroys this request.
+template <typename I>
+void BreakRequest<I>::finish(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::BreakRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/BreakRequest.h b/src/librbd/managed_lock/BreakRequest.h
new file mode 100644
index 00000000..4531a3c5
--- /dev/null
+++ b/src/librbd/managed_lock/BreakRequest.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "msg/msg_types.h"
+#include <list>
+#include <string>
+#include <boost/optional.hpp>
+#include "librbd/managed_lock/Types.h"
+
+class Context;
+class ContextWQ;
+class obj_watch_t;
+
+namespace librbd {
+
+class ImageCtx;
+template <typename> class Journal;
+
+namespace managed_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class BreakRequest { // asynchronous request that breaks another client's RBD lock on an object
+public:
+ static BreakRequest* create(librados::IoCtx& ioctx, ContextWQ *work_queue,
+ const std::string& oid, const Locker &locker,
+ bool exclusive, bool blacklist_locker,
+ uint32_t blacklist_expire_seconds,
+ bool force_break_lock, Context *on_finish) {
+ return new BreakRequest(ioctx, work_queue, oid, locker, exclusive,
+ blacklist_locker, blacklist_expire_seconds,
+ force_break_lock, on_finish);
+ }
+
+ void send(); // starts the request at GET_WATCHERS
+
+private:
+ /**
+ * State flow of the request; each state has a send_* / handle_* pair below.
+ * Any state may jump to <finish> on error.
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_WATCHERS
+ * |
+ * v
+ * GET_LOCKER
+ * |
+ * v
+ * BLACKLIST (skip if disabled)
+ * |
+ * v
+ * BREAK_LOCK
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_ioctx;
+ CephContext *m_cct; // obtained from m_ioctx.cct() in the constructor
+ ContextWQ *m_work_queue; // runs the blocking blacklist call
+ std::string m_oid; // object the lock operations are issued against
+ Locker m_locker; // the locker whose lock is to be broken
+ bool m_exclusive;
+ bool m_blacklist_locker; // false: the BLACKLIST state is skipped
+ uint32_t m_blacklist_expire_seconds;
+ bool m_force_break_lock; // true: proceed even when the owner still watches the object
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ std::list<obj_watch_t> m_watchers; // filled by the list_watchers operation
+ int m_watchers_ret_val; // per-operation result of list_watchers
+
+ Locker m_refreshed_locker; // locker re-read in GET_LOCKER and compared with m_locker
+
+ BreakRequest(librados::IoCtx& ioctx, ContextWQ *work_queue,
+ const std::string& oid, const Locker &locker,
+ bool exclusive, bool blacklist_locker,
+ uint32_t blacklist_expire_seconds, bool force_break_lock,
+ Context *on_finish);
+
+ void send_get_watchers();
+ void handle_get_watchers(int r);
+
+ void send_get_locker();
+ void handle_get_locker(int r);
+
+ void send_blacklist();
+ void handle_blacklist(int r);
+
+ void send_break_lock();
+ void handle_break_lock(int r);
+
+ void finish(int r); // completes m_on_finish and deletes this
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+extern template class librbd::managed_lock::BreakRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
diff --git a/src/librbd/managed_lock/GetLockerRequest.cc b/src/librbd/managed_lock/GetLockerRequest.cc
new file mode 100644
index 00000000..fb5bb322
--- /dev/null
+++ b/src/librbd/managed_lock/GetLockerRequest.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/GetLockerRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/managed_lock/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::GetLockerRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace managed_lock {
+
+using librbd::util::create_rados_callback;
+
+// Stores the request parameters. m_cct is derived from the IoCtx and is what
+// the logging statements in this file use; `locker` is the caller-owned
+// output that handle_get_lockers() fills in.
+template <typename I>
+GetLockerRequest<I>::GetLockerRequest(librados::IoCtx& ioctx,
+ const std::string& oid, bool exclusive,
+ Locker *locker, Context *on_finish)
+ : m_ioctx(ioctx), m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())),
+ m_oid(oid), m_exclusive(exclusive), m_locker(locker),
+ m_on_finish(on_finish) {
+}
+
+// Entry point: starts the single-step state machine by querying the lockers.
+template <typename I>
+void GetLockerRequest<I>::send() {
+ send_get_lockers();
+}
+
+// Issues an asynchronous cls_lock "get lock info" read for RBD_LOCK_NAME on
+// m_oid. The reply lands in m_out_bl and is processed by
+// handle_get_lockers().
+template <typename I>
+void GetLockerRequest<I>::send_get_lockers() {
+  ldout(m_cct, 10) << dendl;
+
+  // start from an empty reply buffer
+  m_out_bl.clear();
+
+  librados::ObjectReadOperation read_op;
+  rados::cls::lock::get_lock_info_start(&read_op, RBD_LOCK_NAME);
+
+  auto *comp = create_rados_callback<
+    GetLockerRequest<I>, &GetLockerRequest<I>::handle_get_lockers>(this);
+  int r = m_ioctx.aio_operate(m_oid, comp, &read_op, &m_out_bl);
+  ceph_assert(r == 0);
+  // drop our handle on the completion now that the operation is submitted
+  comp->release();
+}
+
+// Completion for the get-lock-info read. Decodes the reply and either fills
+// *m_locker from the first reported locker (finishing with 0) or finishes
+// with an error:
+//   negative r from the read or decode -> propagated unchanged
+//   -ENOENT -> no lockers, or the locker has an empty cookie/address
+//   -EBUSY  -> lock tag differs from the watcher tag, the lock type is
+//              incompatible with m_exclusive, or the cookie is rejected by
+//              decode_lock_cookie()
+template <typename I>
+void GetLockerRequest<I>::handle_get_lockers(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  using LockerMap = std::map<rados::cls::lock::locker_id_t,
+                             rados::cls::lock::locker_info_t>;
+  LockerMap lockers;
+  ClsLockType lock_type = LOCK_NONE;
+  std::string lock_tag;
+  if (r == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    r = rados::cls::lock::get_lock_info_finish(&bl_it, &lockers, &lock_type,
+                                               &lock_tag);
+  }
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  if (lockers.empty()) {
+    ldout(m_cct, 20) << "no lockers detected" << dendl;
+    finish(-ENOENT);
+    return;
+  }
+
+  if (lock_tag != util::get_watcher_lock_tag()) {
+    ldout(m_cct, 5) <<"locked by external mechanism: tag=" << lock_tag << dendl;
+    finish(-EBUSY);
+    return;
+  }
+
+  // the existing lock must have the same mode the caller asked about
+  const bool found_shared = (m_exclusive && lock_type == LOCK_SHARED);
+  if (found_shared) {
+    ldout(m_cct, 5) << "incompatible shared lock type detected" << dendl;
+    finish(-EBUSY);
+    return;
+  }
+  const bool found_exclusive = (!m_exclusive && lock_type == LOCK_EXCLUSIVE);
+  if (found_exclusive) {
+    ldout(m_cct, 5) << "incompatible exclusive lock type detected" << dendl;
+    finish(-EBUSY);
+    return;
+  }
+
+  // only the first locker in the map is reported back
+  auto locker_it = lockers.begin();
+  if (!util::decode_lock_cookie(locker_it->first.cookie, &m_locker->handle)) {
+    ldout(m_cct, 5) << "locked by external mechanism: "
+                    << "cookie=" << locker_it->first.cookie << dendl;
+    finish(-EBUSY);
+    return;
+  }
+
+  m_locker->entity = locker_it->first.locker;
+  m_locker->cookie = locker_it->first.cookie;
+  m_locker->address = locker_it->second.addr.get_legacy_str();
+  if (m_locker->cookie.empty() || m_locker->address.empty()) {
+    ldout(m_cct, 20) << "no valid lockers detected" << dendl;
+    finish(-ENOENT);
+    return;
+  }
+
+  ldout(m_cct, 10) << "retrieved exclusive locker: "
+                   << m_locker->entity << "@" << m_locker->address << dendl;
+  finish(0);
+}
+
+// Completes the caller's context with r and then destroys this request; the
+// object must not be touched after this call returns.
+template <typename I>
+void GetLockerRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::GetLockerRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/GetLockerRequest.h b/src/librbd/managed_lock/GetLockerRequest.h
new file mode 100644
index 00000000..b8fd08f6
--- /dev/null
+++ b/src/librbd/managed_lock/GetLockerRequest.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace managed_lock {
+
+struct Locker;
+
+// Asynchronous request that reads the lock info of object `oid` and reports
+// the current locker through the caller-supplied Locker pointer. The request
+// is heap-allocated via create() and deletes itself in finish() after
+// completing on_finish.
+template <typename ImageCtxT = ImageCtx>
+class GetLockerRequest {
+public:
+ // Allocates a new request; arguments are forwarded to the private
+ // constructor unchanged.
+ static GetLockerRequest* create(librados::IoCtx& ioctx,
+ const std::string& oid, bool exclusive,
+ Locker *locker, Context *on_finish) {
+ return new GetLockerRequest(ioctx, oid, exclusive, locker, on_finish);
+ }
+
+ // Starts the request.
+ void send();
+
+private:
+ // m_ioctx is declared ahead of m_cct because the constructor initialises
+ // m_cct from m_ioctx.cct()
+ librados::IoCtx &m_ioctx;
+ CephContext *m_cct;
+ std::string m_oid;
+ bool m_exclusive;
+ Locker *m_locker; // caller-owned output
+ Context *m_on_finish;
+
+ bufferlist m_out_bl; // reply payload of the lock-info read
+
+ GetLockerRequest(librados::IoCtx& ioctx, const std::string& oid,
+ bool exclusive, Locker *locker, Context *on_finish);
+
+ void send_get_lockers();
+ void handle_get_lockers(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+extern template class librbd::managed_lock::GetLockerRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H
diff --git a/src/librbd/managed_lock/ReacquireRequest.cc b/src/librbd/managed_lock/ReacquireRequest.cc
new file mode 100644
index 00000000..dc5624d4
--- /dev/null
+++ b/src/librbd/managed_lock/ReacquireRequest.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/ReacquireRequest.h"
+#include "librbd/Watcher.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/managed_lock/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::ReacquireRequest: " \
+ << this << ": " << __func__
+
+using std::string;
+
+namespace librbd {
+namespace managed_lock {
+
+using librbd::util::create_rados_callback;
+
+// Stores the request parameters; the object id and both cookies are copied,
+// the IoCtx is held by reference.
+template <typename I>
+ReacquireRequest<I>::ReacquireRequest(librados::IoCtx& ioctx,
+ const string& oid,
+ const string& old_cookie,
+ const string &new_cookie,
+ bool exclusive,
+ Context *on_finish)
+ : m_ioctx(ioctx), m_oid(oid), m_old_cookie(old_cookie),
+ m_new_cookie(new_cookie), m_exclusive(exclusive), m_on_finish(on_finish) {
+}
+
+
+// Entry point: starts the single SET_COOKIE step.
+template <typename I>
+void ReacquireRequest<I>::send() {
+ set_cookie();
+}
+
+// Submits an asynchronous cls_lock set_cookie write on m_oid that replaces
+// m_old_cookie with m_new_cookie for RBD_LOCK_NAME; the lock type follows
+// m_exclusive. The result is delivered to handle_set_cookie().
+template <typename I>
+void ReacquireRequest<I>::set_cookie() {
+  auto *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << dendl;
+
+  const auto lock_type = m_exclusive ? LOCK_EXCLUSIVE : LOCK_SHARED;
+
+  librados::ObjectWriteOperation write_op;
+  rados::cls::lock::set_cookie(&write_op, RBD_LOCK_NAME, lock_type,
+                               m_old_cookie, util::get_watcher_lock_tag(),
+                               m_new_cookie);
+
+  using klass = ReacquireRequest;
+  librados::AioCompletion *comp =
+    create_rados_callback<klass, &klass::handle_set_cookie>(this);
+  int r = m_ioctx.aio_operate(m_oid, comp, &write_op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion for the set_cookie write. Failures are only logged here
+// (-EOPNOTSUPP at debug level, anything else as an error); r is always
+// handed to the caller's context, after which the request deletes itself.
+template <typename I>
+void ReacquireRequest<I>::handle_set_cookie(int r) {
+  auto *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -EOPNOTSUPP) {
+      ldout(cct, 10) << ": OSD doesn't support updating lock" << dendl;
+    } else {
+      lderr(cct) << ": failed to update lock: " << cpp_strerror(r) << dendl;
+    }
+  }
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::ReacquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/ReacquireRequest.h b/src/librbd/managed_lock/ReacquireRequest.h
new file mode 100644
index 00000000..3f2b7d7e
--- /dev/null
+++ b/src/librbd/managed_lock/ReacquireRequest.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/int_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class Watcher;
+
+namespace managed_lock {
+
+// Asynchronous request that swaps the cookie of an existing lock on `oid`
+// from old_cookie to new_cookie (see the state diagram below).
+// NOTE(review): the constructor is public, but handle_set_cookie() ends with
+// `delete this`, so instances are expected to come from create().
+template <typename ImageCtxT>
+class ReacquireRequest {
+public:
+
+ // Allocates a new request; arguments are forwarded to the constructor.
+ static ReacquireRequest *create(librados::IoCtx& ioctx,
+ const std::string& oid,
+ const std::string& old_cookie,
+ const std::string &new_cookie,
+ bool exclusive,
+ Context *on_finish) {
+ return new ReacquireRequest(ioctx, oid, old_cookie, new_cookie, exclusive,
+ on_finish);
+ }
+
+ ReacquireRequest(librados::IoCtx& ioctx, const std::string& oid,
+ const std::string& old_cookie,
+ const std::string &new_cookie, bool exclusive,
+ Context *on_finish);
+
+ // Starts the request.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * SET_COOKIE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ librados::IoCtx& m_ioctx;
+ std::string m_oid;
+ std::string m_old_cookie;
+ std::string m_new_cookie;
+ bool m_exclusive; // selects LOCK_EXCLUSIVE vs LOCK_SHARED in set_cookie()
+ Context *m_on_finish;
+
+ void set_cookie();
+ void handle_set_cookie(int r);
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H
diff --git a/src/librbd/managed_lock/ReleaseRequest.cc b/src/librbd/managed_lock/ReleaseRequest.cc
new file mode 100644
index 00000000..d74e1462
--- /dev/null
+++ b/src/librbd/managed_lock/ReleaseRequest.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/ReleaseRequest.h"
+#include "librbd/Watcher.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::ReleaseRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace managed_lock {
+
+using util::detail::C_AsyncCallback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Factory: allocates a ReleaseRequest on the heap. The request deletes itself
+// in finish(), so the caller only needs to invoke send() on the result.
+// The string parameters are spelled std::string to match the declaration in
+// ReleaseRequest.h instead of depending on a using-declaration leaking in
+// from an included header.
+template <typename I>
+ReleaseRequest<I>* ReleaseRequest<I>::create(librados::IoCtx& ioctx,
+                                             Watcher *watcher,
+                                             ContextWQ *work_queue,
+                                             const std::string& oid,
+                                             const std::string& cookie,
+                                             Context *on_finish) {
+  return new ReleaseRequest(ioctx, watcher, work_queue, oid, cookie,
+                            on_finish);
+}
+
+// Stores the request parameters. The caller's on_finish is wrapped in a
+// C_AsyncCallback bound to work_queue, and that wrapper is what finish()
+// later completes.
+// The string parameters are spelled std::string to match the declaration in
+// ReleaseRequest.h instead of depending on a using-declaration leaking in
+// from an included header.
+template <typename I>
+ReleaseRequest<I>::ReleaseRequest(librados::IoCtx& ioctx, Watcher *watcher,
+                                  ContextWQ *work_queue,
+                                  const std::string& oid,
+                                  const std::string& cookie,
+                                  Context *on_finish)
+  : m_ioctx(ioctx), m_watcher(watcher), m_oid(oid), m_cookie(cookie),
+    m_on_finish(new C_AsyncCallback<ContextWQ>(work_queue, on_finish)) {
+}
+
+// Nothing to release here: m_on_finish is not deleted by the destructor.
+template <typename I>
+ReleaseRequest<I>::~ReleaseRequest() {
+}
+
+
+// Entry point: starts the single UNLOCK step.
+template <typename I>
+void ReleaseRequest<I>::send() {
+ send_unlock();
+}
+
+// Submits an asynchronous cls_lock unlock of RBD_LOCK_NAME on m_oid using
+// m_cookie; the result is delivered to handle_unlock().
+template <typename I>
+void ReleaseRequest<I>::send_unlock() {
+  auto *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << "entity=client." << m_ioctx.get_instance_id() << ", "
+                 << "cookie=" << m_cookie << dendl;
+
+  librados::ObjectWriteOperation unlock_op;
+  rados::cls::lock::unlock(&unlock_op, RBD_LOCK_NAME, m_cookie);
+
+  auto *comp = create_rados_callback<
+    ReleaseRequest, &ReleaseRequest::handle_unlock>(this);
+  int r = m_ioctx.aio_operate(m_oid, comp, &unlock_op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion for the unlock write. Any failure other than -ENOENT is logged
+// as an error; in every case the request then proceeds to finish().
+template <typename I>
+void ReleaseRequest<I>::handle_unlock(int r) {
+  auto *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  const bool unexpected_failure = (r < 0 && r != -ENOENT);
+  if (unexpected_failure) {
+    lderr(cct) << "failed to unlock: " << cpp_strerror(r) << dendl;
+  }
+
+  finish();
+}
+
+// Completes the wrapped on_finish context with 0 -- the unlock result is not
+// forwarded -- and then destroys this request.
+template <typename I>
+void ReleaseRequest<I>::finish() {
+ m_on_finish->complete(0);
+ delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::ReleaseRequest<librbd::ImageCtx>;
+
diff --git a/src/librbd/managed_lock/ReleaseRequest.h b/src/librbd/managed_lock/ReleaseRequest.h
new file mode 100644
index 00000000..89205136
--- /dev/null
+++ b/src/librbd/managed_lock/ReleaseRequest.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "librbd/watcher/Types.h"
+#include <string>
+
+class Context;
+class ContextWQ;
+
+namespace librbd {
+
+class Watcher;
+
+namespace managed_lock {
+
+// Asynchronous request that releases the lock identified by `cookie` on
+// object `oid` (see the state diagram below). Created through create(); the
+// request deletes itself in finish().
+template <typename ImageCtxT>
+class ReleaseRequest {
+private:
+ // the watcher type is resolved through watcher::Traits for ImageCtxT
+ typedef watcher::Traits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Watcher Watcher;
+
+public:
+ static ReleaseRequest* create(librados::IoCtx& ioctx, Watcher *watcher,
+ ContextWQ *work_queue,
+ const std::string& oid,
+ const std::string& cookie,
+ Context *on_finish);
+
+ ~ReleaseRequest();
+ // Starts the request.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UNLOCK
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ReleaseRequest(librados::IoCtx& ioctx, Watcher *watcher,
+ ContextWQ *work_queue, const std::string& oid,
+ const std::string& cookie, Context *on_finish);
+
+ librados::IoCtx& m_ioctx;
+ // stored by the constructor; not otherwise referenced by the definitions in
+ // ReleaseRequest.cc
+ Watcher *m_watcher;
+ std::string m_oid;
+ std::string m_cookie;
+ // the caller's on_finish wrapped in a C_AsyncCallback on the work queue
+ Context *m_on_finish;
+
+ void send_unlock();
+ void handle_unlock(int r);
+
+ void finish();
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H
diff --git a/src/librbd/managed_lock/Types.h b/src/librbd/managed_lock/Types.h
new file mode 100644
index 00000000..319789c8
--- /dev/null
+++ b/src/librbd/managed_lock/Types.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_TYPES_H
+#define CEPH_LIBRBD_MANAGED_LOCK_TYPES_H
+
+#include "msg/msg_types.h"
+#include <string>
+
+namespace librbd {
+namespace managed_lock {
+
+// Value type describing a lock holder: its entity name, the lock cookie, the
+// holder's address string and the numeric handle decoded from the cookie.
+// A default-constructed Locker has empty strings and handle 0.
+struct Locker {
+  entity_name_t entity;
+  std::string cookie;
+  std::string address;
+  uint64_t handle = 0;
+
+  Locker() {
+  }
+  Locker(const entity_name_t& entity, const std::string &cookie,
+         const std::string &address, uint64_t handle)
+    : entity(entity), cookie(cookie), address(address), handle(handle) {
+  }
+
+  // Two lockers are equal only when all four fields match.
+  inline bool operator==(const Locker &rhs) const {
+    if (!(entity == rhs.entity)) {
+      return false;
+    }
+    if (!(cookie == rhs.cookie)) {
+      return false;
+    }
+    if (!(address == rhs.address)) {
+      return false;
+    }
+    return handle == rhs.handle;
+  }
+  inline bool operator!=(const Locker &rhs) const {
+    return !operator==(rhs);
+  }
+};
+
+// Lock mode selector: exclusive or shared.
+enum Mode {
+ EXCLUSIVE,
+ SHARED
+};
+
+
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_TYPES_H
diff --git a/src/librbd/managed_lock/Utils.cc b/src/librbd/managed_lock/Utils.cc
new file mode 100644
index 00000000..0b4f908d
--- /dev/null
+++ b/src/librbd/managed_lock/Utils.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/ceph_assert.h"
+#include "librbd/managed_lock/Utils.h"
+#include <sstream>
+
+namespace librbd {
+namespace managed_lock {
+namespace util {
+
+namespace {
+
+// First token of every cookie produced by encode_lock_cookie() and required
+// by decode_lock_cookie().
+const std::string WATCHER_LOCK_COOKIE_PREFIX = "auto";
+// Lock tag returned by get_watcher_lock_tag().
+const std::string WATCHER_LOCK_TAG("internal");
+
+} // anonymous namespace
+
+// Returns a reference to the file-local lock tag constant ("internal").
+const std::string &get_watcher_lock_tag() {
+ return WATCHER_LOCK_TAG;
+}
+
+// Parses a cookie of the form "<prefix> <handle>" as written by
+// encode_lock_cookie(). Returns true only when both tokens could be
+// extracted and the prefix equals WATCHER_LOCK_COOKIE_PREFIX. Note that
+// *handle is written by the stream extraction even when the prefix check
+// subsequently fails.
+bool decode_lock_cookie(const std::string &tag, uint64_t *handle) {
+  std::istringstream cookie_stream(tag);
+  std::string prefix;
+  cookie_stream >> prefix >> *handle;
+  return !cookie_stream.fail() && prefix == WATCHER_LOCK_COOKIE_PREFIX;
+}
+
+// Builds the cookie string "<prefix> <watch_handle>" that
+// decode_lock_cookie() parses. A zero watch handle trips the assertion.
+std::string encode_lock_cookie(uint64_t watch_handle) {
+ ceph_assert(watch_handle != 0);
+ std::ostringstream ss;
+ ss << WATCHER_LOCK_COOKIE_PREFIX << " " << watch_handle;
+ return ss.str();
+}
+
+} // namespace util
+} // namespace managed_lock
+} // namespace librbd
+
+
diff --git a/src/librbd/managed_lock/Utils.h b/src/librbd/managed_lock/Utils.h
new file mode 100644
index 00000000..679cbfe8
--- /dev/null
+++ b/src/librbd/managed_lock/Utils.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_UTILS_H
+#define CEPH_LIBRBD_MANAGED_LOCK_UTILS_H
+
+#include "include/int_types.h"
+#include <string>
+
+namespace librbd {
+namespace managed_lock {
+namespace util {
+
+// Returns the lock tag that managed-lock requests compare against / set.
+const std::string &get_watcher_lock_tag();
+
+// Parses a lock cookie into its numeric handle; returns false when the
+// string is not in the format produced by encode_lock_cookie(). Despite the
+// parameter name, callers pass the cookie string (not the lock tag).
+bool decode_lock_cookie(const std::string &tag, uint64_t *handle);
+// Builds the lock cookie string for the given watch handle.
+std::string encode_lock_cookie(uint64_t watch_handle);
+
+} // namespace util
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_UTILS_H
diff --git a/src/librbd/mirror/DemoteRequest.cc b/src/librbd/mirror/DemoteRequest.cc
new file mode 100644
index 00000000..c5d38752
--- /dev/null
+++ b/src/librbd/mirror/DemoteRequest.cc
@@ -0,0 +1,198 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/DemoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/GetInfoRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::DemoteRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+
+// Entry point: starts the state machine with the GET_INFO step.
+template <typename I>
+void DemoteRequest<I>::send() {
+ get_info();
+}
+
+// GET_INFO: launches a GetInfoRequest that fills m_mirror_image and
+// m_promotion_state, then continues in handle_get_info().
+template <typename I>
+void DemoteRequest<I>::get_info() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  using klass = DemoteRequest<I>;
+  auto on_info = create_context_callback<klass, &klass::handle_get_info>(this);
+  auto info_req = GetInfoRequest<I>::create(m_image_ctx, &m_mirror_image,
+                                            &m_promotion_state, on_info);
+  info_req->send();
+}
+
+// Completion for GET_INFO. Demotion only proceeds when the lookup succeeded,
+// mirroring is enabled and the image is currently primary; otherwise the
+// request finishes with the lookup error or -EINVAL.
+template <typename I>
+void DemoteRequest<I>::handle_get_info(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+
+  if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+    lderr(cct) << "mirroring is not currently enabled" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  if (m_promotion_state != PROMOTION_STATE_PRIMARY) {
+    lderr(cct) << "image is not primary" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  acquire_lock();
+}
+
+// ACQUIRE_LOCK: fails with -EINVAL when no exclusive lock object exists.
+// Otherwise blocks peer requests, then either goes straight to demote() when
+// this client already owns the lock or asks the exclusive lock to acquire it
+// (continuing in handle_acquire_lock()). owner_lock is read-locked manually
+// and released on every path before finish()/demote() is called.
+template <typename I>
+void DemoteRequest<I>::acquire_lock() {
+ CephContext *cct = m_image_ctx.cct;
+
+ m_image_ctx.owner_lock.get_read();
+ if (m_image_ctx.exclusive_lock == nullptr) {
+ m_image_ctx.owner_lock.put_read();
+ lderr(cct) << "exclusive lock is not active" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ // avoid accepting new requests from peers while we demote
+ // the image
+ m_image_ctx.exclusive_lock->block_requests(0);
+ // remembered so that finish() knows to unblock again
+ m_blocked_requests = true;
+
+ if (m_image_ctx.exclusive_lock->is_lock_owner()) {
+ m_image_ctx.owner_lock.put_read();
+ demote();
+ return;
+ }
+
+ // only logged when an acquire is actually issued
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_acquire_lock>(this);
+ m_image_ctx.exclusive_lock->acquire_lock(ctx);
+ m_image_ctx.owner_lock.put_read();
+}
+
+// Completion for ACQUIRE_LOCK. Finishes with r on failure. On success it
+// re-checks ownership under owner_lock: if the exclusive lock exists but is
+// not owned, the request finishes with get_unlocked_op_error(); otherwise it
+// continues with demote().
+// NOTE(review): a null exclusive_lock at this point also falls through to
+// demote() -- confirm that this is intended.
+template <typename I>
+void DemoteRequest<I>::handle_acquire_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to lock image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_image_ctx.owner_lock.get_read();
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ !m_image_ctx.exclusive_lock->is_lock_owner()) {
+ r = m_image_ctx.exclusive_lock->get_unlocked_op_error();
+ // release owner_lock before finish(), which takes it again
+ m_image_ctx.owner_lock.put_read();
+ lderr(cct) << "failed to acquire exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+ m_image_ctx.owner_lock.put_read();
+
+ demote();
+}
+
+// DEMOTE: hands the image to Journal<I>::demote(); the result arrives in
+// handle_demote().
+template <typename I>
+void DemoteRequest<I>::demote() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  using klass = DemoteRequest<I>;
+  auto on_demote = create_context_callback<klass, &klass::handle_demote>(this);
+  Journal<I>::demote(&m_image_ctx, on_demote);
+}
+
+// Completion for DEMOTE. A failure is recorded in m_ret_val (so that finish()
+// reports it) and logged; the lock is released in either case.
+template <typename I>
+void DemoteRequest<I>::handle_demote(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  const bool demote_failed = (r < 0);
+  if (demote_failed) {
+    m_ret_val = r;
+    lderr(cct) << "failed to demote image: " << cpp_strerror(r) << dendl;
+  }
+
+  release_lock();
+}
+
+// RELEASE_LOCK: asks the exclusive lock to release, continuing in
+// handle_release_lock(). If the exclusive lock object no longer exists the
+// request finishes right away; finish() still reports m_ret_val if a demote
+// error was recorded.
+template <typename I>
+void DemoteRequest<I>::release_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ m_image_ctx.owner_lock.get_read();
+ if (m_image_ctx.exclusive_lock == nullptr) {
+ // drop owner_lock first: finish() acquires it again
+ m_image_ctx.owner_lock.put_read();
+ finish(0);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_release_lock>(this);
+ m_image_ctx.exclusive_lock->release_lock(ctx);
+ m_image_ctx.owner_lock.put_read();
+}
+
+// Completion for RELEASE_LOCK: logs a failure and finishes the request with
+// r (finish() substitutes m_ret_val when an earlier error was recorded).
+template <typename I>
+void DemoteRequest<I>::handle_release_lock(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  const bool release_failed = (r < 0);
+  if (release_failed) {
+    lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  finish(r);
+}
+
+// Final step. An error recorded in m_ret_val takes precedence over r. Peer
+// requests are unblocked again if acquire_lock() blocked them and the
+// exclusive lock still exists. The caller's context is then completed and
+// the request destroys itself.
+template <typename I>
+void DemoteRequest<I>::finish(int r) {
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ }
+
+ {
+ // scoped so owner_lock is released before on_finish is completed
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ if (m_blocked_requests && m_image_ctx.exclusive_lock != nullptr) {
+ m_image_ctx.exclusive_lock->unblock_requests();
+ }
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::DemoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/DemoteRequest.h b/src/librbd/mirror/DemoteRequest.h
new file mode 100644
index 00000000..7dc0585e
--- /dev/null
+++ b/src/librbd/mirror/DemoteRequest.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+// Asynchronous request that demotes a primary mirrored image, following the
+// state diagram below. finish() completes on_finish and deletes the request.
+// NOTE(review): the constructor is public, but because finish() ends with
+// `delete this`, instances are expected to come from create().
+template <typename ImageCtxT = librbd::ImageCtx>
+class DemoteRequest {
+public:
+ // Allocates a new request on the heap.
+ static DemoteRequest *create(ImageCtxT &image_ctx, Context *on_finish) {
+ return new DemoteRequest(image_ctx, on_finish);
+ }
+
+ DemoteRequest(ImageCtxT &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+ }
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_INFO
+ * |
+ * v
+ * ACQUIRE_LOCK * * * *
+ * | *
+ * v *
+ * DEMOTE *
+ * | *
+ * v *
+ * RELEASE_LOCK *
+ * | *
+ * v *
+ * <finish> < * * * * *
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ // first error recorded by handle_demote(); overrides the value passed to
+ // finish()
+ int m_ret_val = 0;
+ // set once acquire_lock() has blocked peer requests, so finish() unblocks
+ bool m_blocked_requests = false;
+
+ // outputs of the GET_INFO step
+ cls::rbd::MirrorImage m_mirror_image;
+ PromotionState m_promotion_state = PROMOTION_STATE_PRIMARY;
+
+ void get_info();
+ void handle_get_info(int r);
+
+ void acquire_lock();
+ void handle_acquire_lock(int r);
+
+ void demote();
+ void handle_demote(int r);
+
+ void release_lock();
+ void handle_release_lock(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::DemoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H
diff --git a/src/librbd/mirror/DisableRequest.cc b/src/librbd/mirror/DisableRequest.cc
new file mode 100644
index 00000000..6a21c560
--- /dev/null
+++ b/src/librbd/mirror/DisableRequest.cc
@@ -0,0 +1,492 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/DisableRequest.h"
+#include "common/WorkQueue.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/journal/cls_journal_client.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/PromoteRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::DisableRequest: "
+
+namespace librbd {
+namespace mirror {
+
+using util::create_rados_callback;
+
+// Stores the request parameters and names the internal mutex. `force` allows
+// disabling a non-primary image (see handle_get_tag_owner()); `remove`
+// controls whether handle_get_clients() goes on to remove the mirror image
+// record when no clients are left to process.
+template <typename I>
+DisableRequest<I>::DisableRequest(I *image_ctx, bool force, bool remove,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_force(force), m_remove(remove),
+ m_on_finish(on_finish), m_lock("mirror::DisableRequest::m_lock") {
+}
+
+// Entry point: starts the state machine by fetching the mirror image record.
+template <typename I>
+void DisableRequest<I>::send() {
+ send_get_mirror_image();
+}
+
+// Reads this image's mirror image record from the RBD_MIRRORING object; the
+// reply is decoded in handle_get_mirror_image().
+template <typename I>
+void DisableRequest<I>::send_get_mirror_image() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  // start from an empty reply buffer
+  m_out_bl.clear();
+
+  librados::ObjectReadOperation read_op;
+  cls_client::mirror_image_get_start(&read_op, m_image_ctx->id);
+
+  auto *aio_comp = create_rados_callback<
+    DisableRequest<I>, &DisableRequest<I>::handle_get_mirror_image>(this);
+  int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, aio_comp, &read_op,
+                                          &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion for the mirror image read. On success the record is decoded
+// into m_mirror_image and the tag-owner check is started (returns nullptr).
+// On failure m_on_finish is returned with *result adjusted: -ENOENT is
+// turned into 0 (mirroring was not enabled), other errors are kept.
+template <typename I>
+Context *DisableRequest<I>::handle_get_mirror_image(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    *result = cls_client::mirror_image_get_finish(&bl_it, &m_mirror_image);
+  }
+
+  if (*result >= 0) {
+    send_get_tag_owner();
+    return nullptr;
+  }
+
+  switch (*result) {
+  case -ENOENT:
+    ldout(cct, 20) << this << " " << __func__
+                   << ": mirroring is not enabled for this image" << dendl;
+    *result = 0;
+    break;
+  case -EOPNOTSUPP:
+    ldout(cct, 5) << this << " " << __func__
+                  << ": mirroring is not supported by OSD" << dendl;
+    break;
+  default:
+    lderr(cct) << "failed to retrieve mirror image: " << cpp_strerror(*result)
+               << dendl;
+    break;
+  }
+  return m_on_finish;
+}
+
+// Asks the journal whether this image is the tag owner; the answer is stored
+// in m_is_primary and processed by handle_get_tag_owner().
+template <typename I>
+void DisableRequest<I>::send_get_tag_owner() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  Context *on_tag_owner = util::create_context_callback<
+    DisableRequest<I>, &DisableRequest<I>::handle_get_tag_owner>(this);
+  Journal<I>::is_tag_owner(m_image_ctx, &m_is_primary, on_tag_owner);
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_get_tag_owner(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to check tag ownership: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ if (!m_is_primary && !m_force) {
+ lderr(cct) << "mirrored image is not primary, "
+ << "add force option to disable mirroring" << dendl;
+ *result = -EINVAL;
+ return m_on_finish;
+ }
+
+ send_set_mirror_image();
+ return nullptr;
+}
+
+// Writes the mirror image record back with its state set to DISABLING; the
+// result is handled by handle_set_mirror_image().
+template <typename I>
+void DisableRequest<I>::send_set_mirror_image() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
+  m_out_bl.clear();
+
+  librados::ObjectWriteOperation write_op;
+  cls_client::mirror_image_set(&write_op, m_image_ctx->id, m_mirror_image);
+
+  auto *aio_comp = create_rados_callback<
+    DisableRequest<I>, &DisableRequest<I>::handle_set_mirror_image>(this);
+  int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, aio_comp, &write_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion for the DISABLING state write: returns m_on_finish on failure,
+// otherwise notifies the mirroring watchers and returns nullptr.
+template <typename I>
+Context *DisableRequest<I>::handle_set_mirror_image(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_notify_mirroring_watcher();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to disable mirroring: " << cpp_strerror(*result)
+             << dendl;
+  return m_on_finish;
+}
+
+// Broadcasts an "image updated" notification carrying the DISABLING state,
+// the local image id and the global image id; continues in
+// handle_notify_mirroring_watcher().
+template <typename I>
+void DisableRequest<I>::send_notify_mirroring_watcher() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  Context *on_notified = util::create_context_callback<
+    DisableRequest<I>,
+    &DisableRequest<I>::handle_notify_mirroring_watcher>(this);
+
+  MirroringWatcher<I>::notify_image_updated(
+    m_image_ctx->md_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLING,
+    m_image_ctx->id, m_mirror_image.global_image_id, on_notified);
+}
+
+// Completion for the watcher notification. A failed notification is logged
+// and then ignored (*result is reset to 0); the request always continues
+// with send_promote_image() and returns nullptr.
+template <typename I>
+Context *DisableRequest<I>::handle_notify_mirroring_watcher(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool notify_failed = (*result < 0);
+  if (notify_failed) {
+    lderr(cct) << "failed to send update notification: "
+               << cpp_strerror(*result) << dendl;
+    *result = 0;
+  }
+
+  send_promote_image();
+  return nullptr;
+}
+
+// A primary image skips straight to listing the journal clients. A
+// non-primary image is first promoted through journal::PromoteRequest,
+// continuing in handle_promote_image().
+template <typename I>
+void DisableRequest<I>::send_promote_image() {
+  if (!m_is_primary) {
+    CephContext *cct = m_image_ctx->cct;
+    ldout(cct, 10) << this << " " << __func__ << dendl;
+
+    // Not primary -- shouldn't have the journal open
+    ceph_assert(m_image_ctx->journal == nullptr);
+
+    Context *on_promoted = util::create_context_callback<
+      DisableRequest<I>, &DisableRequest<I>::handle_promote_image>(this);
+    auto promote_req = journal::PromoteRequest<I>::create(m_image_ctx, true,
+                                                          on_promoted);
+    promote_req->send();
+    return;
+  }
+
+  send_get_clients();
+}
+
+// Completion for the promotion: returns m_on_finish on failure, otherwise
+// lists the journal clients and returns nullptr.
+template <typename I>
+Context *DisableRequest<I>::handle_promote_image(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_get_clients();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to promote image: " << cpp_strerror(*result) << dendl;
+  return m_on_finish;
+}
+
+// Lists the clients registered on this image's journal header object into
+// m_clients; continues in handle_get_clients().
+template <typename I>
+void DisableRequest<I>::send_get_clients() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_clients.clear();
+
+  const std::string journal_header_oid =
+    ::journal::Journaler::header_oid(m_image_ctx->id);
+  Context *on_clients = util::create_context_callback<
+    DisableRequest<I>, &DisableRequest<I>::handle_get_clients>(this);
+  cls::journal::client::client_list(m_image_ctx->md_ctx, journal_header_oid,
+                                    &m_clients, on_clients);
+}
+
+// Completion for the journal client listing. Under m_lock, every registered
+// mirror-peer client gets an entry in m_current_ops/m_ret; its sync-point
+// snapshots are queued for removal, or the client is unregistered right away
+// when it has none. Clients whose data cannot be decoded are skipped and
+// recorded as -EBADMSG in m_error_result.
+//
+// Returns m_on_finish when the listing failed, or when no mirror-peer client
+// was found and either an error was recorded or `remove` was not requested.
+// Returns nullptr whenever further asynchronous work has been started.
+template <typename I>
+Context *DisableRequest<I>::handle_get_clients(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to get registered clients: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  Mutex::Locker locker(m_lock);
+
+  ceph_assert(m_current_ops.empty());
+
+  // bind by const reference so each client (and its encoded data payload) is
+  // not copied per iteration
+  for (const auto &client : m_clients) {
+    journal::ClientData client_data;
+    auto bl_it = client.data.cbegin();
+    try {
+      using ceph::decode;
+      decode(client_data, bl_it);
+    } catch (const buffer::error &err) {
+      lderr(cct) << "failed to decode client data" << dendl;
+      m_error_result = -EBADMSG;
+      continue;
+    }
+
+    // only mirror peer clients are handled by this request
+    journal::ClientMetaType type = client_data.get_client_meta_type();
+    if (type != journal::ClientMetaType::MIRROR_PEER_CLIENT_META_TYPE) {
+      continue;
+    }
+
+    if (m_current_ops.find(client.id) != m_current_ops.end()) {
+      // Should not happen.
+      lderr(cct) << this << " " << __func__ << ": clients with the same id "
+                 << client.id << dendl;
+      continue;
+    }
+
+    m_current_ops[client.id] = 0;
+    m_ret[client.id] = 0;
+
+    // client_data outlives this reference, so no copy of the meta is needed
+    const auto &client_meta =
+      boost::get<journal::MirrorPeerClientMeta>(client_data.client_meta);
+
+    // send_remove_snap() bumps m_current_ops[client.id] for each snapshot
+    for (const auto& sync : client_meta.sync_points) {
+      send_remove_snap(client.id, sync.snap_namespace, sync.snap_name);
+    }
+
+    if (m_current_ops[client.id] == 0) {
+      // no snaps to remove
+      send_unregister_client(client.id);
+    }
+  }
+
+  if (m_current_ops.empty()) {
+    if (m_error_result < 0) {
+      *result = m_error_result;
+      return m_on_finish;
+    } else if (!m_remove) {
+      return m_on_finish;
+    }
+
+    // no mirror clients to unregister
+    send_remove_mirror_image();
+  }
+
+  return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::send_remove_snap(const std::string &client_id,
+    const cls::rbd::SnapshotNamespace &snap_namespace,
+    const std::string &snap_name) {
+  // REMOVE_SYNC_SNAP: queue removal of one sync-point snapshot belonging to
+  // the given client.  Must be called with m_lock held.
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": client_id=" << client_id
+                 << ", snap_name=" << snap_name << dendl;
+
+  ceph_assert(m_lock.is_locked());
+
+  // one more outstanding operation for this client
+  ++m_current_ops[client_id];
+
+  Context *on_removed = create_context_callback(
+    &DisableRequest<I>::handle_remove_snap, client_id);
+
+  // the snap_remove call itself is issued from the op work queue
+  Context *deferred = new FunctionContext(
+    [this, snap_namespace, snap_name, on_removed](int) {
+      m_image_ctx->operations->snap_remove(snap_namespace, snap_name.c_str(),
+                                           on_removed);
+    });
+  m_image_ctx->op_work_queue->queue(deferred, 0);
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_remove_snap(int *result,
+    const std::string &client_id) {
+  // Completion handler for one REMOVE_SYNC_SNAP operation.  Records a real
+  // failure in m_ret (-ENOENT is tolerated) and, once the client has no
+  // outstanding removals left, starts UNREGISTER_CLIENT.  Always returns
+  // nullptr.
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  Mutex::Locker locker(m_lock);
+
+  int &pending = m_current_ops[client_id];
+  ceph_assert(pending > 0);
+  --pending;
+
+  const int r = *result;
+  const bool tolerated = (r >= 0 || r == -ENOENT);
+  if (!tolerated) {
+    lderr(cct) <<
+      "failed to remove temporary snapshot created by remote peer: "
+               << cpp_strerror(r) << dendl;
+    m_ret[client_id] = r;
+  }
+
+  if (pending == 0) {
+    send_unregister_client(client_id);
+  }
+  return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::send_unregister_client(
+    const std::string &client_id) {
+  // UNREGISTER_CLIENT: unregister the given client from the journal header.
+  // Must be called with m_lock held and with no outstanding operations for
+  // the client.
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  ceph_assert(m_lock.is_locked());
+  ceph_assert(m_current_ops[client_id] == 0);
+
+  Context *on_unregistered = create_context_callback(
+    &DisableRequest<I>::handle_unregister_client, client_id);
+
+  const int snap_result = m_ret[client_id];
+  if (snap_result < 0) {
+    // a snapshot removal failed: do not unregister, just deliver the stored
+    // error to the handler through the work queue
+    m_image_ctx->op_work_queue->queue(on_unregistered, snap_result);
+    return;
+  }
+
+  const std::string journal_header =
+    ::journal::Journaler::header_oid(m_image_ctx->id);
+  librados::ObjectWriteOperation op;
+  cls::journal::client::client_unregister(&op, client_id);
+
+  librados::AioCompletion *comp = create_rados_callback(on_unregistered);
+  const int r = m_image_ctx->md_ctx.aio_operate(journal_header, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_unregister_client(
+    int *result, const std::string &client_id) {
+  // Completion handler for UNREGISTER_CLIENT.  Drops the client from the
+  // in-flight table and, once the table is empty, either finishes with the
+  // recorded error or re-runs GET_CLIENTS.
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  Mutex::Locker locker(m_lock);
+  ceph_assert(m_current_ops[client_id] == 0);
+  m_current_ops.erase(client_id);
+
+  const int r = *result;
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to unregister remote journal client: "
+               << cpp_strerror(r) << dendl;
+    m_error_result = r;
+  }
+
+  if (m_current_ops.empty()) {
+    if (m_error_result < 0) {
+      *result = m_error_result;
+      return m_on_finish;
+    }
+    // all clients of this pass are processed: list the clients again
+    send_get_clients();
+  }
+
+  // other clients are still being processed, or a new pass was started
+  return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::send_remove_mirror_image() {
+  // REMOVE_MIRROR_IMAGE: issue a write op on the RBD_MIRRORING object that
+  // removes this image's mirror image entry.
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_out_bl.clear();
+
+  librados::ObjectWriteOperation op;
+  cls_client::mirror_image_remove(&op, m_image_ctx->id);
+
+  librados::AioCompletion *comp = create_rados_callback<
+    DisableRequest<I>, &DisableRequest<I>::handle_remove_mirror_image>(this);
+  const int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_remove_mirror_image(int *result) {
+  // Completion handler for REMOVE_MIRROR_IMAGE.  -ENOENT is treated as
+  // success; any other error finishes the request.
+  CephContext *cct = m_image_ctx->cct;
+  int &r = *result;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    r = 0;
+  }
+
+  if (r >= 0) {
+    ldout(cct, 20) << this << " " << __func__
+                   << ": removed image state from rbd_mirroring object" << dendl;
+
+    send_notify_mirroring_watcher_removed();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to remove mirror image: " << cpp_strerror(r)
+             << dendl;
+  return m_on_finish;
+}
+
+template <typename I>
+void DisableRequest<I>::send_notify_mirroring_watcher_removed() {
+  // NOTIFY_MIRRORING_WATCHER_REMOVED: send an image-updated notification
+  // carrying the DISABLED state for this image.
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  Context *on_notified = util::create_context_callback<
+    DisableRequest<I>,
+    &DisableRequest<I>::handle_notify_mirroring_watcher_removed>(this);
+
+  MirroringWatcher<I>::notify_image_updated(
+    m_image_ctx->md_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLED,
+    m_image_ctx->id, m_mirror_image.global_image_id, on_notified);
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_notify_mirroring_watcher_removed(
+    int *result) {
+  // Final step: a failed notification is logged but does not fail the
+  // request, so the result is reset to 0 before finishing.
+  CephContext *cct = m_image_ctx->cct;
+  int &r = *result;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  const bool notify_failed = (r < 0);
+  if (notify_failed) {
+    lderr(cct) << "failed to send update notification: "
+               << cpp_strerror(r) << dendl;
+    r = 0;
+  }
+
+  return m_on_finish;
+}
+
+template <typename I>
+Context *DisableRequest<I>::create_context_callback(
+    Context*(DisableRequest<I>::*handle)(int*, const std::string &client_id),
+    const std::string &client_id) {
+  // Wraps a per-client handler in a Context.  When the handler returns a
+  // non-null context, that context is completed with the (possibly
+  // rewritten) result and this request deletes itself.
+  return new FunctionContext([this, handle, client_id](int r) {
+      Context *final_ctx = (this->*handle)(&r, client_id);
+      if (final_ctx == nullptr) {
+        // the state machine is still running
+        return;
+      }
+      final_ctx->complete(r);
+      delete this;
+    });
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::DisableRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/DisableRequest.h b/src/librbd/mirror/DisableRequest.h
new file mode 100644
index 00000000..1a3b1223
--- /dev/null
+++ b/src/librbd/mirror/DisableRequest.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
+
+#include "include/buffer.h"
+#include "common/Mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace mirror {
+
+// Asynchronous state machine (see the diagram below) used to disable
+// mirroring on an image.  Each step is a send_*() / handle_*() pair; a
+// handler returns nullptr while more work is in flight and a Context once
+// the request is finished.
+template <typename ImageCtxT = ImageCtx>
+class DisableRequest {
+public:
+ // Heap-allocates a request; on_finish receives the final result.
+ static DisableRequest *create(ImageCtxT *image_ctx, bool force,
+ bool remove, Context *on_finish) {
+ return new DisableRequest(image_ctx, force, remove, on_finish);
+ }
+
+ DisableRequest(ImageCtxT *image_ctx, bool force, bool remove,
+ Context *on_finish);
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_IMAGE * * * * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * v *
+ * GET_TAG_OWNER * * * * * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * v *
+ * SET_MIRROR_IMAGE * * * * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * v *
+ * NOTIFY_MIRRORING_WATCHER *
+ * | *
+ * v *
+ * PROMOTE_IMAGE (skip if primary) *
+ * | *
+ * v *
+ * GET_CLIENTS <----------------------------------------\ * * * *
+ * | | (unregister clients) | * (on error)
+ * | |/----------------------------\ | *
+ * | | | | *
+ * | | /-----------\ (repeat | (repeat | (repeat
+ * | | | | as needed) | as needed) | as needed)
+ * | v v | | | *
+ * | REMOVE_SYNC_SNAP --/ * * * * * * | * * * * * * | * * * *
+ * | | | | *
+ * | v | | *
+ * | UNREGISTER_CLIENT ---------------/-------------/ * * * *
+ * | *
+ * | (no more clients *
+ * | to unregister) *
+ * v *
+ * REMOVE_MIRROR_IMAGE * * * * * * * * * * * * * * * * * * * * *
+ * | (skip if no remove) *
+ * v *
+ * NOTIFY_MIRRORING_WATCHER_REMOVED *
+ * | (skip if not primary or no remove) *
+ * v *
+ * <finish> < * * * * * * * * * * * * * * * * * * * * * * * * * *
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ // NOTE(review): m_force is consumed outside the visible part of the
+ // implementation -- confirm its exact effect there.
+ bool m_force;
+ // when false the request finishes after the clients are unregistered,
+ // without removing the mirror image entry
+ bool m_remove;
+ Context *m_on_finish;
+
+ bool m_is_primary = false;
+ bufferlist m_out_bl;
+ cls::rbd::MirrorImage m_mirror_image;
+ // journal clients returned by the most recent GET_CLIENTS step
+ std::set<cls::journal::Client> m_clients;
+ // client id -> error recorded while removing that client's sync snapshots
+ std::map<std::string, int> m_ret;
+ // client id -> number of snapshot removals still outstanding
+ std::map<std::string, int> m_current_ops;
+ // error recorded while processing clients; reported when the pass ends
+ int m_error_result = 0;
+ // held by the per-client handlers while they touch the maps above
+ mutable Mutex m_lock;
+
+ void send_get_mirror_image();
+ Context *handle_get_mirror_image(int *result);
+
+ void send_get_tag_owner();
+ Context *handle_get_tag_owner(int *result);
+
+ void send_set_mirror_image();
+ Context *handle_set_mirror_image(int *result);
+
+ void send_notify_mirroring_watcher();
+ Context *handle_notify_mirroring_watcher(int *result);
+
+ void send_promote_image();
+ Context *handle_promote_image(int *result);
+
+ void send_get_clients();
+ Context *handle_get_clients(int *result);
+
+ void send_remove_snap(const std::string &client_id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name);
+ Context *handle_remove_snap(int *result, const std::string &client_id);
+
+ void send_unregister_client(const std::string &client_id);
+ Context *handle_unregister_client(int *result, const std::string &client_id);
+
+ void send_remove_mirror_image();
+ Context *handle_remove_mirror_image(int *result);
+
+ void send_notify_mirroring_watcher_removed();
+ Context *handle_notify_mirroring_watcher_removed(int *result);
+
+ // Builds a Context that invokes the given per-client handler with
+ // client_id bound.
+ Context *create_context_callback(
+ Context*(DisableRequest<ImageCtxT>::*handle)(
+ int*, const std::string &client_id),
+ const std::string &client_id);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::DisableRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
diff --git a/src/librbd/mirror/EnableRequest.cc b/src/librbd/mirror/EnableRequest.cc
new file mode 100644
index 00000000..a5c5b125
--- /dev/null
+++ b/src/librbd/mirror/EnableRequest.cc
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/EnableRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::EnableRequest: "
+
+namespace librbd {
+namespace mirror {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Stores the request parameters; the CephContext is taken from the supplied
+// IoCtx.  No I/O is started until send() is called.
+template <typename I>
+EnableRequest<I>::EnableRequest(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ const std::string &non_primary_global_image_id,
+ ContextWQ *op_work_queue, Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id),
+ m_non_primary_global_image_id(non_primary_global_image_id),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())) {
+}
+
+// Entry point: starts the state machine with the GET_MIRROR_IMAGE step.
+template <typename I>
+void EnableRequest<I>::send() {
+ send_get_mirror_image();
+}
+
+template <typename I>
+void EnableRequest<I>::send_get_mirror_image() {
+  // GET_MIRROR_IMAGE: read the image's mirror image entry from the
+  // RBD_MIRRORING object into m_out_bl.
+  ldout(m_cct, 10) << this << " " << __func__ << dendl;
+
+  m_out_bl.clear();
+
+  librados::ObjectReadOperation op;
+  cls_client::mirror_image_get_start(&op, m_image_id);
+
+  librados::AioCompletion *comp = create_rados_callback<
+    EnableRequest<I>, &EnableRequest<I>::handle_get_mirror_image>(this);
+  const int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *EnableRequest<I>::handle_get_mirror_image(int *result) {
+  // Completion handler for GET_MIRROR_IMAGE.
+  //  - entry exists and is ENABLED: nothing to do, finish with 0
+  //  - entry exists in another state: finish with -EINVAL
+  //  - error other than -ENOENT: finish with that error
+  //  - -ENOENT: prepare a new ENABLED entry and continue
+  int &r = *result;
+  ldout(m_cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r == 0) {
+    auto iter = m_out_bl.cbegin();
+    r = cls_client::mirror_image_get_finish(&iter, &m_mirror_image);
+  }
+
+  if (r == 0) {
+    if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+      lderr(m_cct) << "currently disabling" << dendl;
+      r = -EINVAL;
+    } else {
+      ldout(m_cct, 10) << this << " " << __func__
+                       << ": mirroring is already enabled" << dendl;
+    }
+    return m_on_finish;
+  }
+
+  if (r != -ENOENT) {
+    lderr(m_cct) << "failed to retrieve mirror image: " << cpp_strerror(r)
+                 << dendl;
+    return m_on_finish;
+  }
+
+  // no entry yet: build the one that will be written by SET_MIRROR_IMAGE
+  r = 0;
+  m_mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_ENABLED;
+  if (!m_non_primary_global_image_id.empty()) {
+    // reuse the global image id supplied by the caller
+    m_mirror_image.global_image_id = m_non_primary_global_image_id;
+  } else {
+    uuid_d uuid_gen;
+    uuid_gen.generate_random();
+    m_mirror_image.global_image_id = uuid_gen.to_string();
+  }
+
+  send_get_tag_owner();
+  return nullptr;
+}
+
+template <typename I>
+void EnableRequest<I>::send_get_tag_owner() {
+  // GET_TAG_OWNER: query whether the image is the tag owner.  The step is
+  // skipped when a non-primary global image id was supplied.
+  const bool skip_check = !m_non_primary_global_image_id.empty();
+  if (skip_check) {
+    send_set_mirror_image();
+    return;
+  }
+  ldout(m_cct, 10) << this << " " << __func__ << dendl;
+
+  Context *on_checked = create_context_callback<
+    EnableRequest<I>, &EnableRequest<I>::handle_get_tag_owner>(this);
+  librbd::Journal<>::is_tag_owner(m_io_ctx, m_image_id, &m_is_primary,
+                                  m_op_work_queue, on_checked);
+}
+
+template <typename I>
+Context *EnableRequest<I>::handle_get_tag_owner(int *result) {
+  // Completion handler for GET_TAG_OWNER: continues with SET_MIRROR_IMAGE
+  // only when the check succeeded and reported the image as primary.
+  int &r = *result;
+  ldout(m_cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to check tag ownership: " << cpp_strerror(r)
+                 << dendl;
+    return m_on_finish;
+  }
+
+  if (m_is_primary) {
+    send_set_mirror_image();
+    return nullptr;
+  }
+
+  lderr(m_cct) << "last journal tag not owned by local cluster" << dendl;
+  r = -EINVAL;
+  return m_on_finish;
+}
+
+template <typename I>
+void EnableRequest<I>::send_set_mirror_image() {
+  // SET_MIRROR_IMAGE: write m_mirror_image for this image id to the
+  // RBD_MIRRORING object.
+  ldout(m_cct, 10) << this << " " << __func__ << dendl;
+
+  m_out_bl.clear();
+
+  librados::ObjectWriteOperation op;
+  cls_client::mirror_image_set(&op, m_image_id, m_mirror_image);
+
+  librados::AioCompletion *comp = create_rados_callback<
+    EnableRequest<I>, &EnableRequest<I>::handle_set_mirror_image>(this);
+  const int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *EnableRequest<I>::handle_set_mirror_image(int *result) {
+  // Completion handler for SET_MIRROR_IMAGE: on success continue with the
+  // watcher notification, otherwise finish with the error.
+  const int r = *result;
+  ldout(m_cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r >= 0) {
+    send_notify_mirroring_watcher();
+    return nullptr;
+  }
+
+  lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r)
+               << dendl;
+  return m_on_finish;
+}
+
+template <typename I>
+void EnableRequest<I>::send_notify_mirroring_watcher() {
+  // NOTIFY_MIRRORING_WATCHER: send an image-updated notification carrying
+  // the ENABLED state and the image's global id.
+  ldout(m_cct, 10) << this << " " << __func__ << dendl;
+
+  Context *on_notified = create_context_callback<
+    EnableRequest<I>,
+    &EnableRequest<I>::handle_notify_mirroring_watcher>(this);
+
+  MirroringWatcher<>::notify_image_updated(
+    m_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED, m_image_id,
+    m_mirror_image.global_image_id, on_notified);
+}
+
+template <typename I>
+Context *EnableRequest<I>::handle_notify_mirroring_watcher(int *result) {
+  // Final step: a failed notification is only logged; the request still
+  // finishes with 0.
+  int &r = *result;
+  ldout(m_cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  const bool notify_failed = (r < 0);
+  if (notify_failed) {
+    lderr(m_cct) << "failed to send update notification: "
+                 << cpp_strerror(r) << dendl;
+    r = 0;
+  }
+
+  return m_on_finish;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::EnableRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/EnableRequest.h b/src/librbd/mirror/EnableRequest.h
new file mode 100644
index 00000000..965c2a36
--- /dev/null
+++ b/src/librbd/mirror/EnableRequest.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <map>
+#include <string>
+
+class Context;
+class ContextWQ;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace mirror {
+
+// Asynchronous state machine (see the diagram below) that enables mirroring
+// for an image by creating its mirror image entry.  Instances are created
+// through create(); the constructor is private.
+template <typename ImageCtxT = ImageCtx>
+class EnableRequest {
+public:
+ // Convenience overload: takes the IoCtx, image id and work queue from an
+ // image context and passes an empty non-primary global image id.
+ static EnableRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+ return create(image_ctx->md_ctx, image_ctx->id, "",
+ image_ctx->op_work_queue, on_finish);
+ }
+ // When non_primary_global_image_id is non-empty it is used as the global
+ // image id and the tag-owner check is skipped; otherwise a random id is
+ // generated.
+ static EnableRequest *create(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ const std::string &non_primary_global_image_id,
+ ContextWQ *op_work_queue, Context *on_finish) {
+ return new EnableRequest(io_ctx, image_id, non_primary_global_image_id,
+ op_work_queue, on_finish);
+ }
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_IMAGE * * * * * * *
+ * | * (on error)
+ * v *
+ * GET_TAG_OWNER * * * * * * * *
+ * | *
+ * v *
+ * SET_MIRROR_IMAGE * * * * * * *
+ * | *
+ * v *
+ * NOTIFY_MIRRORING_WATCHER * * *
+ * | *
+ * v *
+ * <finish> < * * * * * * * * *
+ *
+ * @endverbatim
+ */
+
+ EnableRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+ const std::string &non_primary_global_image_id,
+ ContextWQ *op_work_queue, Context *on_finish);
+
+ // stored as a reference -- presumably the caller keeps the IoCtx alive
+ // until on_finish fires; TODO confirm
+ librados::IoCtx &m_io_ctx;
+ std::string m_image_id;
+ std::string m_non_primary_global_image_id;
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct = nullptr;
+ // filled in by the GET_TAG_OWNER step
+ bool m_is_primary = false;
+ bufferlist m_out_bl;
+ // the entry read by GET_MIRROR_IMAGE / written by SET_MIRROR_IMAGE
+ cls::rbd::MirrorImage m_mirror_image;
+
+ void send_get_mirror_image();
+ Context *handle_get_mirror_image(int *result);
+
+ void send_get_tag_owner();
+ Context *handle_get_tag_owner(int *result);
+
+ void send_set_mirror_image();
+ Context *handle_set_mirror_image(int *result);
+
+ void send_notify_mirroring_watcher();
+ Context *handle_notify_mirroring_watcher(int *result);
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::EnableRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H
diff --git a/src/librbd/mirror/GetInfoRequest.cc b/src/librbd/mirror/GetInfoRequest.cc
new file mode 100644
index 00000000..460c8cb1
--- /dev/null
+++ b/src/librbd/mirror/GetInfoRequest.cc
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/GetInfoRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::GetInfoRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+// Entry point: starts the state machine with the GET_MIRROR_IMAGE step.
+template <typename I>
+void GetInfoRequest<I>::send() {
+ get_mirror_image();
+}
+
+template <typename I>
+void GetInfoRequest<I>::get_mirror_image() {
+  // GET_MIRROR_IMAGE: read the image's mirror image entry from the
+  // RBD_MIRRORING object into m_out_bl.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  using klass = GetInfoRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_callback<klass, &klass::handle_get_mirror_image>(this);
+
+  librados::ObjectReadOperation op;
+  cls_client::mirror_image_get_start(&op, m_image_ctx.id);
+
+  const int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op,
+                                               &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void GetInfoRequest<I>::handle_get_mirror_image(int r) {
+  // Completion handler for GET_MIRROR_IMAGE.
+  //  - error other than -ENOENT: finish with that error
+  //  - -ENOENT, or an entry that is not ENABLED: mirroring is disabled,
+  //    finish with 0 (outputs keep the defaults assigned below, or the
+  //    decoded non-enabled state)
+  //  - ENABLED entry: continue with GET_TAG_OWNER
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  // defaults reported to the caller unless the steps below say otherwise
+  m_mirror_image->state = cls::rbd::MIRROR_IMAGE_STATE_DISABLED;
+  *m_promotion_state = PROMOTION_STATE_NON_PRIMARY;
+  if (r == 0) {
+    auto iter = m_out_bl.cbegin();
+    r = cls_client::mirror_image_get_finish(&iter, m_mirror_image);
+  }
+
+  // Real failures have to be checked before the state is looked at: the
+  // state was just reset to DISABLED, so testing it first would report
+  // every read/decode error as "mirroring is disabled" with r=0.
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+
+  if (r == -ENOENT ||
+      m_mirror_image->state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+    ldout(cct, 20) << "mirroring is disabled" << dendl;
+    finish(0);
+    return;
+  }
+
+  get_tag_owner();
+}
+
+template <typename I>
+void GetInfoRequest<I>::get_tag_owner() {
+  // GET_TAG_OWNER: fetch the mirror uuid of the tag owner into
+  // m_mirror_uuid.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  using klass = GetInfoRequest<I>;
+  auto on_owner =
+    create_context_callback<klass, &klass::handle_get_tag_owner>(this);
+  Journal<I>::get_tag_owner(m_image_ctx.md_ctx, m_image_ctx.id, &m_mirror_uuid,
+                            m_image_ctx.op_work_queue, on_owner);
+}
+
+template <typename I>
+void GetInfoRequest<I>::handle_get_tag_owner(int r) {
+  // Completion handler for GET_TAG_OWNER: maps the owner's mirror uuid to a
+  // promotion state.  Any uuid other than the local or orphan one leaves
+  // the NON_PRIMARY value assigned earlier.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    if (m_mirror_uuid == Journal<>::LOCAL_MIRROR_UUID) {
+      *m_promotion_state = PROMOTION_STATE_PRIMARY;
+    } else if (m_mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID) {
+      *m_promotion_state = PROMOTION_STATE_ORPHAN;
+    }
+
+    finish(0);
+    return;
+  }
+
+  lderr(cct) << "failed to determine tag ownership: " << cpp_strerror(r)
+             << dendl;
+  finish(r);
+}
+
+template <typename I>
+void GetInfoRequest<I>::finish(int r) {
+  // Completes the caller's context with r, then destroys this request.
+  ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::GetInfoRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/GetInfoRequest.h b/src/librbd/mirror/GetInfoRequest.h
new file mode 100644
index 00000000..db8073c6
--- /dev/null
+++ b/src/librbd/mirror/GetInfoRequest.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H
+
+#include "include/buffer.h"
+#include "librbd/mirror/Types.h"
+#include <string>
+
+struct Context;
+namespace cls { namespace rbd { struct MirrorImage; } }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+// Asynchronous request that fills in the mirror image entry and the
+// promotion state of an image.  The request deletes itself after
+// completing on_finish.
+template <typename ImageCtxT = librbd::ImageCtx>
+class GetInfoRequest {
+public:
+ // mirror_image and promotion_state are output parameters written by the
+ // request; they must stay valid until on_finish is completed.
+ static GetInfoRequest *create(ImageCtxT &image_ctx,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ Context *on_finish) {
+ return new GetInfoRequest(image_ctx, mirror_image, promotion_state,
+ on_finish);
+ }
+
+ GetInfoRequest(ImageCtxT &image_ctx, cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state, Context *on_finish)
+ : m_image_ctx(image_ctx), m_mirror_image(mirror_image),
+ m_promotion_state(promotion_state), m_on_finish(on_finish) {
+ }
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_IMAGE
+ * |
+ * v
+ * GET_TAG_OWNER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ cls::rbd::MirrorImage *m_mirror_image;
+ PromotionState *m_promotion_state;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+ // mirror uuid of the tag owner, filled in by GET_TAG_OWNER
+ std::string m_mirror_uuid;
+
+ void get_mirror_image();
+ void handle_get_mirror_image(int r);
+
+ void get_tag_owner();
+ void handle_get_tag_owner(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::GetInfoRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H
+
diff --git a/src/librbd/mirror/GetStatusRequest.cc b/src/librbd/mirror/GetStatusRequest.cc
new file mode 100644
index 00000000..57096883
--- /dev/null
+++ b/src/librbd/mirror/GetStatusRequest.cc
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/GetStatusRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/GetInfoRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::GetStatusRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void GetStatusRequest<I>::send() {
+  // Pre-fill the output with an "unknown / status not found" placeholder so
+  // the caller sees a defined value if no status is retrieved later on.
+  const cls::rbd::MirrorImageStatus not_found(
+    cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN, "status not found");
+  *m_mirror_image_status = not_found;
+
+  get_info();
+}
+
+template <typename I>
+void GetStatusRequest<I>::get_info() {
+  // GET_INFO: delegate to GetInfoRequest, which writes straight into the
+  // caller's mirror image and promotion state outputs.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  using klass = GetStatusRequest<I>;
+  auto on_info = create_context_callback<klass, &klass::handle_get_info>(this);
+  GetInfoRequest<I>::create(m_image_ctx, m_mirror_image, m_promotion_state,
+                            on_info)->send();
+}
+
+template <typename I>
+void GetStatusRequest<I>::handle_get_info(int r) {
+  // Completion handler for GET_INFO: the status is only fetched when
+  // mirroring is enabled; otherwise the request finishes right away.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+
+  const bool enabled =
+    (m_mirror_image->state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED);
+  if (enabled) {
+    get_status();
+  } else {
+    finish(0);
+  }
+}
+
+template <typename I>
+void GetStatusRequest<I>::get_status() {
+  // GET_STATUS: read the status stored for the image's global id from the
+  // RBD_MIRRORING object into m_out_bl.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  using klass = GetStatusRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_callback<klass, &klass::handle_get_status>(this);
+
+  librados::ObjectReadOperation op;
+  cls_client::mirror_image_status_get_start(
+    &op, m_mirror_image->global_image_id);
+
+  const int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op,
+                                               &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void GetStatusRequest<I>::handle_get_status(int r) {
+  // Completion handler for GET_STATUS: -ENOENT is not an error (the request
+  // finishes with 0); any other failure is propagated.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r == 0) {
+    auto iter = m_out_bl.cbegin();
+    r = cls_client::mirror_image_status_get_finish(&iter,
+                                                   m_mirror_image_status);
+  }
+
+  const bool acceptable = (r >= 0 || r == -ENOENT);
+  if (acceptable) {
+    finish(0);
+    return;
+  }
+
+  lderr(cct) << "failed to retrieve mirror image status: " << cpp_strerror(r)
+             << dendl;
+  finish(r);
+}
+
+template <typename I>
+void GetStatusRequest<I>::finish(int r) {
+  // Completes the caller's context with r, then destroys this request.
+  ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::GetStatusRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/GetStatusRequest.h b/src/librbd/mirror/GetStatusRequest.h
new file mode 100644
index 00000000..4c1a81f0
--- /dev/null
+++ b/src/librbd/mirror/GetStatusRequest.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H
+
+#include "include/buffer.h"
+#include "librbd/mirror/Types.h"
+#include <string>
+
+struct Context;
+namespace cls { namespace rbd { struct MirrorImage; } }
+namespace cls { namespace rbd { struct MirrorImageStatus; } }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+// Asynchronous request that retrieves an image's mirroring info and, when
+// mirroring is enabled, its mirror image status.  The request deletes
+// itself after completing on_finish.
+template <typename ImageCtxT = librbd::ImageCtx>
+class GetStatusRequest {
+public:
+ // status, mirror_image and promotion_state are output parameters written
+ // by the request; they must stay valid until on_finish is completed.
+ static GetStatusRequest *create(ImageCtxT &image_ctx,
+ cls::rbd::MirrorImageStatus *status,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ Context *on_finish) {
+ return new GetStatusRequest(image_ctx, status, mirror_image,
+ promotion_state, on_finish);
+ }
+
+ GetStatusRequest(ImageCtxT &image_ctx, cls::rbd::MirrorImageStatus *status,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state, Context *on_finish)
+ : m_image_ctx(image_ctx), m_mirror_image_status(status),
+ m_mirror_image(mirror_image), m_promotion_state(promotion_state),
+ m_on_finish(on_finish) {
+ }
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_INFO
+ * |
+ * v
+ * GET_STATUS
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ cls::rbd::MirrorImageStatus *m_mirror_image_status;
+ cls::rbd::MirrorImage *m_mirror_image;
+ PromotionState *m_promotion_state;
+ Context *m_on_finish;
+
+ // receives the raw reply of the GET_STATUS read
+ bufferlist m_out_bl;
+
+ void get_info();
+ void handle_get_info(int r);
+
+ void get_status();
+ void handle_get_status(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::GetStatusRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H
+
diff --git a/src/librbd/mirror/PromoteRequest.cc b/src/librbd/mirror/PromoteRequest.cc
new file mode 100644
index 00000000..5603cb13
--- /dev/null
+++ b/src/librbd/mirror/PromoteRequest.cc
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/PromoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/GetInfoRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::PromoteRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+
+// Entry point: starts the state machine with the GET_INFO step.
+template <typename I>
+void PromoteRequest<I>::send() {
+ get_info();
+}
+
+template <typename I>
+void PromoteRequest<I>::get_info() {
+  // GET_INFO: load the mirror image entry and promotion state into the
+  // request's own members via GetInfoRequest.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  using klass = PromoteRequest<I>;
+  auto on_info = create_context_callback<klass, &klass::handle_get_info>(this);
+  GetInfoRequest<I>::create(m_image_ctx, &m_mirror_image, &m_promotion_state,
+                            on_info)->send();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_get_info(int r) {
+  // Completion handler for GET_INFO: validates the preconditions for a
+  // promotion and finishes early when one of them is not met.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+
+  if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+    lderr(cct) << "mirroring is not currently enabled" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  if (m_promotion_state == PROMOTION_STATE_PRIMARY) {
+    lderr(cct) << "image is already primary" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  // a non-primary image may only be promoted when forced
+  const bool needs_force = (m_promotion_state == PROMOTION_STATE_NON_PRIMARY);
+  if (needs_force && !m_force) {
+    lderr(cct) << "image is still primary within a remote cluster" << dendl;
+    finish(-EBUSY);
+    return;
+  }
+
+  promote();
+}
+
+template <typename I>
+void PromoteRequest<I>::promote() {
+  // PROMOTE: hand the promotion itself to Journal<I>::promote().
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  using klass = PromoteRequest<I>;
+  auto on_promoted =
+    create_context_callback<klass, &klass::handle_promote>(this);
+  Journal<I>::promote(&m_image_ctx, on_promoted);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_promote(int r) {
+  // Completion handler for PROMOTE: logs a failure and finishes with the
+  // result either way.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "failed to promote image: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  finish(r);
+}
+
+template <typename I>
+void PromoteRequest<I>::finish(int r) {
+  // Completes the caller's context with r, then destroys this request.
+  ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+
+  Context *on_finish = m_on_finish;
+  on_finish->complete(r);
+  delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::PromoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/PromoteRequest.h b/src/librbd/mirror/PromoteRequest.h
new file mode 100644
index 00000000..17609c5f
--- /dev/null
+++ b/src/librbd/mirror/PromoteRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PromoteRequest {
+public:
+ static PromoteRequest *create(ImageCtxT &image_ctx, bool force,
+ Context *on_finish) {
+ return new PromoteRequest(image_ctx, force, on_finish);
+ }
+ // force: allow promotion while the state is PROMOTION_STATE_NON_PRIMARY
+ PromoteRequest(ImageCtxT &image_ctx, bool force, Context *on_finish)
+ : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish) {
+ }
+ // entry point (defined in PromoteRequest.cc); presumably starts at GET_INFO
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v
+ * GET_INFO * * * * * * * * * * * * * * * *
+ *    |                                   *
+ *    |  (lookup failed, mirroring not    *
+ *    |   enabled, already primary, or    *
+ *    |   non-primary without force)      *
+ *    v                                   *
+ * PROMOTE                                *
+ *    |                                   *
+ *    v                                   *
+ * <finish> < * * * * * * * * * * * * * * *
+ *
+ * @endverbatim
+ */
+ // request inputs, fixed at construction
+ ImageCtxT &m_image_ctx;
+ bool m_force;
+ Context *m_on_finish;
+ // handed to GetInfoRequest by get_info(); checked in handle_get_info()
+ cls::rbd::MirrorImage m_mirror_image;
+ PromotionState m_promotion_state = PROMOTION_STATE_PRIMARY;
+
+ void get_info();
+ void handle_get_info(int r);
+
+ void promote();
+ void handle_promote(int r);
+ // completes m_on_finish with r and then deletes this request
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::PromoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
diff --git a/src/librbd/mirror/Types.h b/src/librbd/mirror/Types.h
new file mode 100644
index 00000000..38511bdb
--- /dev/null
+++ b/src/librbd/mirror/Types.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_TYPES_H
+#define CEPH_LIBRBD_MIRROR_TYPES_H
+
+namespace librbd {
+namespace mirror {
+
+enum PromotionState { // filled in by GetInfoRequest for PromoteRequest
+ PROMOTION_STATE_PRIMARY, // PromoteRequest rejects this: "image is already primary"
+ PROMOTION_STATE_NON_PRIMARY, // PromoteRequest promotes from here only when forced
+ PROMOTION_STATE_ORPHAN // NOTE(review): presumably no primary exists anywhere -- confirm
+};
+
+} // namespace mirror
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIRROR_TYPES_H
+
diff --git a/src/librbd/mirroring_watcher/Types.cc b/src/librbd/mirroring_watcher/Types.cc
new file mode 100644
index 00000000..3226b635
--- /dev/null
+++ b/src/librbd/mirroring_watcher/Types.cc
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "librbd/mirroring_watcher/Types.h"
+#include "librbd/watcher/Utils.h"
+
+namespace librbd {
+namespace mirroring_watcher {
+
+namespace {
+
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+ // writes the payload's op name, then the payload's own fields, to the formatter
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ NotifyOp notify_op = Payload::NOTIFY_OP; // local copy of the class constant
+ m_formatter->dump_string("notify_op", stringify(notify_op));
+ payload.dump(m_formatter);
+ }
+ // NOTE(review): the local copy above presumably keeps stringify() from binding to the in-class constant -- confirm before inlining
+private:
+ ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void ModeUpdatedPayload::encode(bufferlist &bl) const {
+  const auto wire_mode = static_cast<uint32_t>(mirror_mode);  // 32 bits wide
+  ceph::encode(wire_mode, bl);
+}
+
+void ModeUpdatedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  // the mode is read back as a uint32_t; 'version' is not consulted here
+  uint32_t wire_mode;
+  ceph::decode(wire_mode, iter);
+  mirror_mode = static_cast<cls::rbd::MirrorMode>(wire_mode);
+}
+
+void ModeUpdatedPayload::dump(Formatter *f) const { // one field: "mirror_mode"
+ f->dump_stream("mirror_mode") << mirror_mode;
+}
+
+void ImageUpdatedPayload::encode(bufferlist &bl) const {
+  // field order: state (as uint32_t), image id, global image id
+  ceph::encode(static_cast<uint32_t>(mirror_image_state), bl);
+  ceph::encode(image_id, bl);
+  ceph::encode(global_image_id, bl);
+}
+
+void ImageUpdatedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  // reads the fields in the order encode() wrote them; 'version' is unused
+  using ceph::decode;
+  uint32_t wire_state;
+  decode(wire_state, iter);
+  mirror_image_state = static_cast<cls::rbd::MirrorImageState>(wire_state);
+  decode(image_id, iter);
+  decode(global_image_id, iter);
+}
+
+void ImageUpdatedPayload::dump(Formatter *f) const { // one entry per member
+ f->dump_stream("mirror_image_state") << mirror_image_state;
+ f->dump_string("image_id", image_id);
+ f->dump_string("global_image_id", global_image_id);
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort(); // encoding the placeholder payload aborts the process
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+} // no-op: nothing is read from iter
+
+void UnknownPayload::dump(Formatter *f) const {
+} // no-op: nothing is written to f
+
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload); // visits whichever payload alternative is active
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+  DECODE_START(1, iter);
+
+  // the op code is read first ...
+  uint32_t op_code;
+  decode(op_code, iter);
+
+  // ... and selects which variant alternative gets default-constructed;
+  // codes that are not recognized map to UnknownPayload
+  if (op_code == NOTIFY_OP_MODE_UPDATED) {
+    payload = ModeUpdatedPayload();
+  } else if (op_code == NOTIFY_OP_IMAGE_UPDATED) {
+    payload = ImageUpdatedPayload();
+  } else {
+    payload = UnknownPayload();
+  }
+
+  // the selected alternative then decodes its own fields via the visitor,
+  // which receives the struct version established by DECODE_START
+  apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload);
+  DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const { // delegates to DumpPayloadVisitor
+ apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage(ModeUpdatedPayload(cls::rbd::MIRROR_MODE_DISABLED))); // sample with a mode payload
+ o.push_back(new NotifyMessage(ImageUpdatedPayload(cls::rbd::MIRROR_IMAGE_STATE_DISABLING,
+ "image id", "global image id"))); // sample with an image payload
+} // the heap-allocated samples stay in 'o'; nothing in this function frees them
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+  const char *name = nullptr;  // stays null for an unrecognized op
+  if (op == NOTIFY_OP_MODE_UPDATED) {
+    name = "ModeUpdated";
+  } else if (op == NOTIFY_OP_IMAGE_UPDATED) {
+    name = "ImageUpdated";
+  }
+  if (name != nullptr) {
+    out << name;
+  } else {
+    out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+  }
+  return out;
+}
+
+} // namespace mirroring_watcher
+} // namespace librbd
diff --git a/src/librbd/mirroring_watcher/Types.h b/src/librbd/mirroring_watcher/Types.h
new file mode 100644
index 00000000..1e096a9d
--- /dev/null
+++ b/src/librbd/mirroring_watcher/Types.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
+#define CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+namespace mirroring_watcher {
+
+enum NotifyOp { // op code that NotifyMessage::decode() uses to pick a payload type
+ NOTIFY_OP_MODE_UPDATED = 0, // ModeUpdatedPayload
+ NOTIFY_OP_IMAGE_UPDATED = 1 // ImageUpdatedPayload
+};
+
+struct ModeUpdatedPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_MODE_UPDATED;
+ // the only field; encode() writes it as a uint32_t
+ cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+
+ ModeUpdatedPayload() {
+ }
+ ModeUpdatedPayload(cls::rbd::MirrorMode mirror_mode)
+ : mirror_mode(mirror_mode) {
+ }
+ // serialization hooks, defined in Types.cc
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct ImageUpdatedPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_UPDATED;
+ // fields are declared in the same order in which encode() writes them
+ cls::rbd::MirrorImageState mirror_image_state =
+ cls::rbd::MIRROR_IMAGE_STATE_ENABLED;
+ std::string image_id;
+ std::string global_image_id;
+
+ ImageUpdatedPayload() {
+ }
+ ImageUpdatedPayload(cls::rbd::MirrorImageState mirror_image_state,
+ const std::string &image_id,
+ const std::string &global_image_id)
+ : mirror_image_state(mirror_image_state), image_id(image_id),
+ global_image_id(global_image_id) {
+ }
+ // serialization hooks, defined in Types.cc
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownPayload {
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+ // no data members: NotifyMessage::decode() selects this for unrecognized op codes
+ UnknownPayload() {
+ }
+ // in Types.cc: encode() aborts, decode() and dump() are empty
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<ModeUpdatedPayload,
+ ImageUpdatedPayload,
+ UnknownPayload> Payload; // every payload type a NotifyMessage can hold
+
+struct NotifyMessage {
+ NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+ }
+ // active alternative; replaced by decode() according to the decoded op code
+ Payload payload;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+ // appends heap-allocated sample messages to 'o'
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+} // namespace mirroring_watcher
+} // namespace librbd
+
+using librbd::mirroring_watcher::encode;
+using librbd::mirroring_watcher::decode;
+
+#endif // CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
diff --git a/src/librbd/object_map/CreateRequest.cc b/src/librbd/object_map/CreateRequest.cc
new file mode 100644
index 00000000..a2b9d8f6
--- /dev/null
+++ b/src/librbd/object_map/CreateRequest.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/CreateRequest.h"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "osdc/Striper.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::CreateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+CreateRequest<I>::CreateRequest(I *image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+} // only stores the two pointers; m_snap_ids is filled later by send()
+
+template <typename I>
+void CreateRequest<I>::send() {
+  // Collects CEPH_NOSNAP plus every snapshot id, checks that the largest
+  // image size is compatible with an object map, then issues the resizes.
+  CephContext *cct = m_image_ctx->cct;
+  {
+    RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+    uint64_t max_size = m_image_ctx->size;  // read together with snap_info
+    m_snap_ids.push_back(CEPH_NOSNAP);
+    for (const auto &snap : m_image_ctx->snap_info) {  // no per-entry copy
+      max_size = std::max(max_size, snap.second.size);
+      m_snap_ids.push_back(snap.first);
+    }
+    if (ObjectMap<>::is_compatible(m_image_ctx->layout, max_size)) {
+      send_object_map_resize();  // snap_lock is still held for this call
+      return;
+    }
+  }
+
+  lderr(cct) << "image size not compatible with object map" << dendl;
+  m_on_finish->complete(-EINVAL);
+  delete this;  // no resize callback will ever run: release the request here
+}
+
+template <typename I>
+void CreateRequest<I>::send_object_map_resize() {
+  // Queues one object-map resize per id in m_snap_ids; a C_Gather funnels the
+  // per-object completions into handle_object_map_resize().
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  using klass = CreateRequest<I>;
+  Context *on_all_resized = create_context_callback<
+    klass, &klass::handle_object_map_resize>(this);
+  C_Gather *gather = new C_Gather(cct, on_all_resized);
+  for (uint64_t snap_id : m_snap_ids) {
+    const uint64_t image_size = m_image_ctx->get_image_size(snap_id);
+    const uint64_t num_objects = Striper::get_num_objects(
+      m_image_ctx->layout, image_size);
+    librados::ObjectWriteOperation resize_op;
+    cls_client::object_map_resize(&resize_op, num_objects, OBJECT_NONEXISTENT);
+    const std::string oid = ObjectMap<>::object_map_name(m_image_ctx->id, snap_id);
+    librados::AioCompletion *aio_comp = create_rados_callback(gather->new_sub());
+    int r = m_image_ctx->md_ctx.aio_operate(oid, aio_comp, &resize_op);
+    ceph_assert(r == 0);
+    aio_comp->release();
+  }
+  gather->activate();
+}
+
+template <typename I>
+Context *CreateRequest<I>::handle_object_map_resize(int *result) {
+  // last step: log a failure, leave *result untouched, return the caller's ctx
+  CephContext *cct = m_image_ctx->cct;
+  const int r = *result;
+  ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+  if (r < 0) {
+    lderr(cct) << "object map resize failed: " << cpp_strerror(r) << dendl;
+  }
+  return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::CreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/CreateRequest.h b/src/librbd/object_map/CreateRequest.h
new file mode 100644
index 00000000..6929abe7
--- /dev/null
+++ b/src/librbd/object_map/CreateRequest.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H
+
+#include "include/buffer.h"
+#include "common/Mutex.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class CreateRequest {
+public:
+ static CreateRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+ return new CreateRequest(image_ctx, on_finish);
+ }
+ // completes on_finish with -EINVAL if the image size is not object-map compatible
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ *    |         . . .
+ *    v         v   .
+ * OBJECT_MAP_RESIZE . (for HEAD and every snapshot)
+ *    |         .   .
+ *    v         . . .
+ * <finish>
+ *
+ * @endverbatim
+ */
+ // private: instances are only created through create()
+ CreateRequest(ImageCtxT *image_ctx, Context *on_finish);
+
+ ImageCtxT *m_image_ctx;
+ Context *m_on_finish;
+ // CEPH_NOSNAP followed by the ids found in snap_info; filled by send()
+ std::vector<uint64_t> m_snap_ids;
+
+ void send_object_map_resize();
+ Context *handle_object_map_resize(int *result);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::CreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H
diff --git a/src/librbd/object_map/InvalidateRequest.cc b/src/librbd/object_map/InvalidateRequest.cc
new file mode 100644
index 00000000..9f7a2fa1
--- /dev/null
+++ b/src/librbd/object_map/InvalidateRequest.cc
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/InvalidateRequest.h"
+#include "common/dout.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::InvalidateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+template <typename I>
+InvalidateRequest<I>* InvalidateRequest<I>::create(I &image_ctx,
+ uint64_t snap_id, bool force,
+ Context *on_finish) {
+ return new InvalidateRequest<I>(image_ctx, snap_id, force, on_finish); // plain heap allocation
+}
+
+template <typename I>
+void InvalidateRequest<I>::send() { // flags the object map invalid in memory and, when permitted, on disk
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(image_ctx.owner_lock.is_locked());
+ ceph_assert(image_ctx.snap_lock.is_wlocked());
+ // stop early if the flags cannot be read or the map is already flagged invalid
+ uint64_t snap_flags;
+ int r = image_ctx.get_flags(m_snap_id, &snap_flags);
+ if (r < 0 || ((snap_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0)) {
+ this->async_complete(r);
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ lderr(cct) << this << " invalidating object map in-memory" << dendl;
+
+ // update in-memory flags (fast-diff is invalidated too when that feature is on)
+ uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID;
+ if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+ flags |= RBD_FLAG_FAST_DIFF_INVALID;
+ }
+
+ r = image_ctx.update_flags(m_snap_id, flags, true);
+ if (r < 0) {
+ this->async_complete(r);
+ return;
+ }
+
+ // do not update on-disk flags if not image owner; m_force skips the owner test
+ if (image_ctx.image_watcher == nullptr ||
+ (!m_force && m_snap_id == CEPH_NOSNAP &&
+ image_ctx.exclusive_lock != nullptr &&
+ !image_ctx.exclusive_lock->is_lock_owner())) {
+ this->async_complete(-EROFS);
+ return;
+ }
+
+ lderr(cct) << this << " invalidating object map on-disk" << dendl;
+ librados::ObjectWriteOperation op;
+ cls_client::set_flags(&op, m_snap_id, flags, flags);
+ // the write targets the image header object, not the object map object
+ librados::AioCompletion *rados_completion =
+ this->create_callback_completion();
+ r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion,
+ &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+bool InvalidateRequest<I>::should_complete(int r) {
+  // logs the result at error level and always answers true
+  CephContext *cct = this->m_image_ctx.cct;
+  lderr(cct) << this << " " << __func__ << ": r=" << r << dendl;
+  return true;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::InvalidateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/InvalidateRequest.h b/src/librbd/object_map/InvalidateRequest.h
new file mode 100644
index 00000000..11c42037
--- /dev/null
+++ b/src/librbd/object_map/InvalidateRequest.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class InvalidateRequest : public AsyncRequest<ImageCtxT> {
+public:
+ static InvalidateRequest* create(ImageCtxT &image_ctx, uint64_t snap_id,
+ bool force, Context *on_finish);
+ // force: skip the exclusive-lock owner test before the on-disk update
+ InvalidateRequest(ImageCtxT &image_ctx, uint64_t snap_id, bool force,
+ Context *on_finish)
+ : AsyncRequest<ImageCtxT>(image_ctx, on_finish),
+ m_snap_id(snap_id), m_force(force) {
+ }
+ // asserts that owner_lock is held and snap_lock is write-locked
+ void send() override;
+
+protected:
+ bool should_complete(int r) override;
+ int filter_return_code(int r) const override{
+ // never propagate an error back to the caller
+ return 0;
+ }
+
+private:
+ uint64_t m_snap_id; // snapshot whose flags are updated
+ bool m_force;
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::InvalidateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
diff --git a/src/librbd/object_map/LockRequest.cc b/src/librbd/object_map/LockRequest.cc
new file mode 100644
index 00000000..4ed4fa1b
--- /dev/null
+++ b/src/librbd/object_map/LockRequest.cc
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/LockRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::LockRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_callback;
+
+template <typename I>
+LockRequest<I>::LockRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish), m_broke_lock(false) {
+} // m_broke_lock starts false: no break attempt has been made yet
+
+template <typename I>
+void LockRequest<I>::send() {
+ send_lock(); // the state machine starts with the lock attempt
+}
+
+template <typename I>
+void LockRequest<I>::send_lock() {
+  // LOCK_OBJECT_MAP: request the exclusive RBD_LOCK_NAME lock on the
+  // CEPH_NOSNAP object map object; handle_lock() receives the result.
+  CephContext *cct = m_image_ctx.cct;
+  const std::string oid = ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP);
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  librados::ObjectWriteOperation lock_op;
+  rados::cls::lock::lock(&lock_op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "", "",
+                         utime_t(), 0);
+  librados::AioCompletion *aio_comp =
+    create_rados_callback<LockRequest<I>, &LockRequest<I>::handle_lock>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, aio_comp, &lock_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_lock(int *ret_val) {
+  // Every terminal path reports success to the caller; only -EBUSY before
+  // any break attempt continues the state machine.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+  const int r = *ret_val;
+  if (r == -EBUSY && !m_broke_lock) {
+    // held by someone else and no break attempted yet: look up the lockers
+    send_get_lock_info();
+    return nullptr;
+  }
+  if (r != 0 && r != -EEXIST) {
+    // best-effort: the failure is logged but not propagated
+    lderr(cct) << "failed to lock object map: " << cpp_strerror(r) << dendl;
+  }
+  // -EEXIST: already locked by myself
+  *ret_val = 0;
+  return m_on_finish;
+}
+
+template <typename I>
+void LockRequest<I>::send_get_lock_info() {
+  // GET_LOCK_INFO: read the RBD_LOCK_NAME lock info into m_out_bl
+  CephContext *cct = m_image_ctx.cct;
+  const std::string oid = ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP);
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  librados::ObjectReadOperation info_op;
+  rados::cls::lock::get_lock_info_start(&info_op, RBD_LOCK_NAME);
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    LockRequest<I>, &LockRequest<I>::handle_get_lock_info>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, aio_comp, &info_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_get_lock_info(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+  int &r = *ret_val;
+  if (r == -ENOENT) {
+    // nothing left to inspect: go straight back to the lock attempt
+    send_lock();
+    return nullptr;
+  }
+  if (r == 0) {
+    // decode the lockers; type and tag are decoded but not used afterwards
+    ClsLockType lock_type;
+    std::string lock_tag;
+    auto bl_it = m_out_bl.cbegin();
+    r = rados::cls::lock::get_lock_info_finish(&bl_it, &m_lockers,
+                                               &lock_type, &lock_tag);
+  }
+  if (r < 0) {
+    lderr(cct) << "failed to list object map locks: " << cpp_strerror(r)
+               << dendl;
+    r = 0;  // best-effort: the caller still sees success
+    return m_on_finish;
+  }
+  send_break_locks();
+  return nullptr;
+}
+
+template <typename I>
+void LockRequest<I>::send_break_locks() {
+  // BREAK_LOCKS: a single write op that breaks every locker in m_lockers
+  CephContext *cct = m_image_ctx.cct;
+  const std::string oid = ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP);
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << ", "
+                 << "num_lockers=" << m_lockers.size() << dendl;
+
+  librados::ObjectWriteOperation break_op;
+  for (const auto &entry : m_lockers) {
+    const rados::cls::lock::locker_id_t &id = entry.first;
+    rados::cls::lock::break_lock(&break_op, RBD_LOCK_NAME, id.cookie, id.locker);
+  }
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    LockRequest<I>, &LockRequest<I>::handle_break_locks>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, aio_comp, &break_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_break_locks(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+  // remember the attempt so handle_lock() will not loop back here on -EBUSY
+  m_broke_lock = true;
+  const bool retry_lock = (*ret_val == 0 || *ret_val == -ENOENT);
+  if (!retry_lock) {
+    lderr(cct) << "failed to break object map lock: "
+               << cpp_strerror(*ret_val) << dendl;
+    *ret_val = 0;
+    return m_on_finish;
+  }
+  send_lock();
+  return nullptr;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::LockRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/LockRequest.h b/src/librbd/object_map/LockRequest.h
new file mode 100644
index 00000000..0333548e
--- /dev/null
+++ b/src/librbd/object_map/LockRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
+
+#include "include/buffer.h"
+#include "cls/lock/cls_lock_types.h"
+#include <map>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class LockRequest {
+public:
+ static LockRequest* create(ImageCtxT &image_ctx, Context *on_finish) {
+ return new LockRequest(image_ctx, on_finish);
+ }
+ LockRequest(ImageCtxT &image_ctx, Context *on_finish);
+ // every terminal path visible in LockRequest.cc hands on_finish a result of 0
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> /------------------------------------- BREAK_LOCKS * * *
+ * | | ^ *
+ * | | | *
+ * | | | *
+ * | v (EBUSY && !broke_lock) | *
+ * \---------> LOCK_OBJECT_MAP * * * * * * * * * * * > GET_LOCK_INFO * *
+ * | * ^ * *
+ * | * * * *
+ * | * * (ENOENT) * *
+ * | * * * * * * * * * * * * * * * * * *
+ * | * *
+ * | * (other errors) *
+ * | * *
+ * v v (other errors) *
+ * <finish> < * * * * * * * * * * * * * * * * * * * * * * * *
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+ // set once BREAK_LOCKS has completed; limits the request to one break attempt
+ bool m_broke_lock;
+ std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> m_lockers;
+ bufferlist m_out_bl; // receives the GET_LOCK_INFO reply
+
+ void send_lock();
+ Context *handle_lock(int *ret_val);
+
+ void send_get_lock_info();
+ Context *handle_get_lock_info(int *ret_val);
+
+ void send_break_locks();
+ Context *handle_break_locks(int *ret_val);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::LockRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
diff --git a/src/librbd/object_map/RefreshRequest.cc b/src/librbd/object_map/RefreshRequest.cc
new file mode 100644
index 00000000..c4fc2539
--- /dev/null
+++ b/src/librbd/object_map/RefreshRequest.cc
@@ -0,0 +1,303 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/RefreshRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "librbd/object_map/LockRequest.h"
+#include "librbd/object_map/ResizeRequest.h"
+#include "librbd/Utils.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::RefreshRequest: "
+
+namespace librbd {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+namespace object_map {
+
+template <typename I>
+RefreshRequest<I>::RefreshRequest(I &image_ctx, ceph::BitVector<2> *object_map,
+ uint64_t snap_id, Context *on_finish)
+ : m_image_ctx(image_ctx), m_object_map(object_map), m_snap_id(snap_id),
+ m_on_finish(on_finish), m_object_count(0), // the real count is computed in send()
+ m_truncate_on_disk_object_map(false) { // set only when the on-disk map is found corrupt
+}
+
+template <typename I>
+void RefreshRequest<I>::send() {
+  // Record the expected object count under snap_lock, then start the state
+  // machine with the LOCK step.
+  CephContext *cct = m_image_ctx.cct;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    const uint64_t image_size = m_image_ctx.get_image_size(m_snap_id);
+    m_object_count = Striper::get_num_objects(m_image_ctx.layout, image_size);
+  }
+  ldout(cct, 20) << this << " " << __func__ << ": "
+                 << "object_count=" << m_object_count << dendl;
+  send_lock();
+}
+
+template <typename I>
+void RefreshRequest<I>::apply() {
+  // copy the working map to the caller's BitVector once its size is verified
+  uint64_t expected_objs = 0;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    const uint64_t image_size = m_image_ctx.get_image_size(m_snap_id);
+    expected_objs = Striper::get_num_objects(m_image_ctx.layout, image_size);
+  }
+  ceph_assert(m_on_disk_object_map.size() >= expected_objs);
+  *m_object_map = m_on_disk_object_map;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_lock() {
+  if (m_object_count > cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT) {
+    // too many objects for an object map: invalidate instead of loading
+    send_invalidate_and_close();
+    return;
+  }
+  if (m_snap_id != CEPH_NOSNAP) {
+    // snapshot object maps are loaded without taking the lock
+    send_load();
+    return;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  const std::string oid = ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id);
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  Context *on_locked = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_lock>(this);
+  LockRequest<I>::create(m_image_ctx, on_locked)->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_lock(int *ret_val) {
+  // LockRequest maps all of its failures to 0, so 0 is the only valid result;
+  // the result is logged like in every other handler of this request.
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+  ceph_assert(*ret_val == 0);
+  send_load();
+  return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_load() {
+  // LOAD: asynchronously read the on-disk object map into m_out_bl
+  CephContext *cct = m_image_ctx.cct;
+  const std::string oid = ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id);
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  librados::ObjectReadOperation load_op;
+  cls_client::object_map_load_start(&load_op);
+
+  m_out_bl.clear();  // start from an empty reply buffer
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_load>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, aio_comp, &load_op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_load(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+  int &r = *ret_val;
+  if (r == 0) {
+    // the read succeeded: decode the reply into m_on_disk_object_map
+    auto bl_it = m_out_bl.cbegin();
+    r = cls_client::object_map_load_finish(&bl_it, &m_on_disk_object_map);
+  }
+
+  const std::string oid = ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id);
+  if (r == -EINVAL) {
+    // object map is corrupt on-disk -- clear it and properly size it
+    // so future IO can keep the object map in sync
+    lderr(cct) << "object map corrupt on-disk: " << oid << dendl;
+    m_truncate_on_disk_object_map = true;
+    send_resize_invalidate();
+    return nullptr;
+  }
+  if (r < 0) {
+    lderr(cct) << "failed to load object map: " << oid << dendl;
+    send_invalidate();
+    return nullptr;
+  }
+
+  const uint64_t loaded_objs = m_on_disk_object_map.size();
+  if (loaded_objs < m_object_count) {
+    lderr(cct) << "object map smaller than current object count: "
+               << loaded_objs << " != " << m_object_count << dendl;
+    send_resize_invalidate();
+    return nullptr;
+  }
+
+  ldout(cct, 20) << "refreshed object map: num_objs=" << loaded_objs << dendl;
+  if (loaded_objs > m_object_count) {
+    // resize op might have been interrupted
+    ldout(cct, 1) << "object map larger than current object count: "
+                  << loaded_objs << " != " << m_object_count << dendl;
+  }
+
+  apply();
+  return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_invalidate() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  // start over with a cleared map resized to the expected object count
+  m_on_disk_object_map.clear();
+  object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count,
+                                    OBJECT_EXISTS);
+
+  Context *on_invalidated = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_invalidate>(this);
+  auto *invalidate_req = InvalidateRequest<I>::create(
+    m_image_ctx, m_snap_id, true, on_invalidated);
+
+  // InvalidateRequest::send() asserts that both of these locks are held
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  invalidate_req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_invalidate(int *ret_val) {
+  // terminal step: the rebuilt in-memory map is applied even after an error
+  CephContext *cct = m_image_ctx.cct;
+  const int r = *ret_val;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+  if (r < 0) {
+    lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r)
+               << dendl;
+  }
+  apply();
+  return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_resize_invalidate() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  // start over with a cleared map resized to the expected object count
+  m_on_disk_object_map.clear();
+  object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count,
+                                    OBJECT_EXISTS);
+
+  Context *on_invalidated = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_resize_invalidate>(this);
+  auto *invalidate_req = InvalidateRequest<I>::create(
+    m_image_ctx, m_snap_id, true, on_invalidated);
+
+  // InvalidateRequest::send() asserts that both of these locks are held
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  invalidate_req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_resize_invalidate(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+  if (*ret_val >= 0) {
+    // the flags are invalidated: now fix the size of the on-disk object map
+    send_resize();
+    return nullptr;
+  }
+  lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val)
+             << dendl;
+  apply();
+  return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_resize() {
+  CephContext *cct = m_image_ctx.cct;
+  const std::string oid = ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id);
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  // the steps below are added, in this order, to one write operation
+  librados::ObjectWriteOperation resize_op;
+  if (m_snap_id == CEPH_NOSNAP) {
+    rados::cls::lock::assert_locked(&resize_op, RBD_LOCK_NAME, LOCK_EXCLUSIVE,
+                                    "", "");
+  }
+  if (m_truncate_on_disk_object_map) {
+    resize_op.truncate(0);  // drop the corrupt contents before resizing
+  }
+  cls_client::object_map_resize(&resize_op, m_object_count, OBJECT_NONEXISTENT);
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_resize>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, aio_comp, &resize_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_resize(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  int &r = *ret_val;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+  if (r < 0) {
+    // best-effort: report the failure but still complete with success
+    lderr(cct) << "failed to adjust object map size: " << cpp_strerror(r)
+               << dendl;
+    r = 0;
+  }
+  apply();
+  return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_invalidate_and_close() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+  lderr(cct) << "object map too large: " << m_object_count << dendl;
+
+  // note: this invalidation is created with force=false
+  Context *on_invalidated = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_invalidate_and_close>(this);
+  auto *invalidate_req = InvalidateRequest<I>::create(
+    m_image_ctx, m_snap_id, false, on_invalidated);
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  invalidate_req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_invalidate_and_close(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+  if (*ret_val >= 0) {
+    // invalidation worked; the refresh still fails because the map is too big
+    *ret_val = -EFBIG;
+  } else {
+    lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val)
+               << dendl;
+  }
+  m_object_map->clear();  // the caller is left with an empty object map
+  return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::RefreshRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/RefreshRequest.h b/src/librbd/object_map/RefreshRequest.h
new file mode 100644
index 00000000..24a7998a
--- /dev/null
+++ b/src/librbd/object_map/RefreshRequest.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/bit_vector.hpp"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+// Asynchronous state machine that refreshes an in-memory object map
+// (ceph::BitVector<2>) for the given snapshot id and invokes on_finish when
+// done. The state diagram below describes the possible transitions.
+template <typename ImageCtxT = ImageCtx>
+class RefreshRequest {
+public:
+ // Heap-allocates a request; arguments are forwarded to the constructor.
+ static RefreshRequest *create(ImageCtxT &image_ctx,
+ ceph::BitVector<2> *object_map,
+ uint64_t snap_id, Context *on_finish) {
+ return new RefreshRequest(image_ctx, object_map, snap_id, on_finish);
+ }
+
+ RefreshRequest(ImageCtxT &image_ctx, ceph::BitVector<2> *object_map,
+ uint64_t snap_id, Context *on_finish);
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> -----> LOCK (skip if snapshot)
+ * * |
+ * * v (other errors)
+ * * LOAD * * * * * * * > INVALIDATE ------------\
+ * * | * |
+ * * | * (-EINVAL or too small) |
+ * * | * * * * * * > INVALIDATE_AND_RESIZE |
+ * * | | * |
+ * * | | * |
+ * * | v * |
+ * * | RESIZE * |
+ * * | | * |
+ * * | | * * * * * * * |
+ * * | | * |
+ * * | v v |
+ * * \--------------------> LOCK <-------------/
+ * * |
+ * v v
+ * INVALIDATE_AND_CLOSE ---------------> <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ // caller-provided map; cleared when the map is found to be too large
+ ceph::BitVector<2> *m_object_map;
+ // snapshot id whose object map is refreshed / invalidated
+ uint64_t m_snap_id;
+ // returned from the final handler of the state machine
+ Context *m_on_finish;
+
+ // object count used when resizing the on-disk map and in the
+ // "object map too large" error message
+ uint64_t m_object_count;
+ // NOTE(review): presumably the map as loaded from disk before it is applied
+ // to m_object_map -- confirm against the .cc file
+ ceph::BitVector<2> m_on_disk_object_map;
+ bool m_truncate_on_disk_object_map;
+ bufferlist m_out_bl;
+
+ // Each send_*() issues one asynchronous step; the matching handle_*()
+ // receives its result and returns the Context to complete, if any.
+ void send_lock();
+ Context *handle_lock(int *ret_val);
+
+ void send_load();
+ Context *handle_load(int *ret_val);
+
+ void send_invalidate();
+ Context *handle_invalidate(int *ret_val);
+
+ void send_resize_invalidate();
+ Context *handle_resize_invalidate(int *ret_val);
+
+ void send_resize();
+ Context *handle_resize(int *ret_val);
+
+ void send_invalidate_and_close();
+ Context *handle_invalidate_and_close(int *ret_val);
+
+ void apply();
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::RefreshRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
diff --git a/src/librbd/object_map/RemoveRequest.cc b/src/librbd/object_map/RemoveRequest.cc
new file mode 100644
index 00000000..a7db0c03
--- /dev/null
+++ b/src/librbd/object_map/RemoveRequest.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/RemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::RemoveRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_callback;
+
+// Stores the image context and completion callback and names the mutex under
+// which m_ref_counter and m_error_result are updated.
+template <typename I>
+RemoveRequest<I>::RemoveRequest(I *image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish),
+ m_lock("object_map::RemoveRequest::m_lock") {
+}
+
+// Entry point: starts removal of all object map objects of the image.
+template <typename I>
+void RemoveRequest<I>::send() {
+ send_remove_object_map();
+}
+
+// Issues one asynchronous removal per object map object: the HEAD map
+// (CEPH_NOSNAP) plus one for every snapshot id found in the image's
+// snap_info. Each completion is delivered to handle_remove_object_map(), and
+// m_ref_counter tracks how many are still outstanding.
+template <typename I>
+void RemoveRequest<I>::send_remove_object_map() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+  std::vector<uint64_t> snap_ids;
+  snap_ids.reserve(m_image_ctx->snap_info.size() + 1);
+  snap_ids.push_back(CEPH_NOSNAP);
+  // bind by const reference: only the key is needed, so do not copy each
+  // (snap id, snap info) entry
+  for (const auto &snap : m_image_ctx->snap_info) {
+    snap_ids.push_back(snap.first);
+  }
+
+  // m_lock is held for the whole loop; the completion handler also takes
+  // m_lock before touching the counter
+  Mutex::Locker locker(m_lock);
+  ceph_assert(m_ref_counter == 0);
+
+  for (auto snap_id : snap_ids) {
+    m_ref_counter++;
+    std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, snap_id));
+    using klass = RemoveRequest<I>;
+    librados::AioCompletion *comp =
+      create_rados_callback<klass, &klass::handle_remove_object_map>(this);
+
+    int r = m_image_ctx->md_ctx.aio_remove(oid, comp);
+    ceph_assert(r == 0);
+    comp->release();
+  }
+}
+
+template <typename I>
+Context *RemoveRequest<I>::handle_remove_object_map(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_ref_counter > 0);
+ m_ref_counter--;
+
+ if (*result < 0 && *result != -ENOENT) {
+ lderr(cct) << "failed to remove object map: " << cpp_strerror(*result)
+ << dendl;
+ m_error_result = *result;
+ }
+ if (m_ref_counter > 0) {
+ return nullptr;
+ }
+ }
+ if (m_error_result < 0) {
+ *result = m_error_result;
+ }
+ return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::RemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/RemoveRequest.h b/src/librbd/object_map/RemoveRequest.h
new file mode 100644
index 00000000..1353ef9b
--- /dev/null
+++ b/src/librbd/object_map/RemoveRequest.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H
+
+#include "include/buffer.h"
+#include "common/Mutex.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+// Asynchronous request that removes the object map objects of an image (HEAD
+// and every snapshot) and then completes on_finish.
+template <typename ImageCtxT = ImageCtx>
+class RemoveRequest {
+public:
+ // Heap-allocates a request (the constructor is private).
+ static RemoveRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+ return new RemoveRequest(image_ctx, on_finish);
+ }
+
+ // Starts the removal.
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . .
+ * v v .
+ * REMOVE_OBJECT_MAP . (for every snapshot)
+ * | . .
+ * v . . .
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ RemoveRequest(ImageCtxT *image_ctx, Context *on_finish);
+
+ ImageCtxT *m_image_ctx;
+ Context *m_on_finish;
+
+ // last error (other than -ENOENT) reported by any removal; 0 if none
+ int m_error_result = 0;
+ // number of removals still in flight
+ int m_ref_counter = 0;
+ // taken while m_ref_counter / m_error_result are updated
+ mutable Mutex m_lock;
+
+ void send_remove_object_map();
+ Context *handle_remove_object_map(int *result);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H
diff --git a/src/librbd/object_map/Request.cc b/src/librbd/object_map/Request.cc
new file mode 100644
index 00000000..a5e3a8bd
--- /dev/null
+++ b/src/librbd/object_map/Request.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/Request.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/RWLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/object_map/InvalidateRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::Request: "
+
+namespace librbd {
+namespace object_map {
+
+// Decides whether the request is finished after an asynchronous step returned
+// r. In STATE_REQUEST a failure triggers invalidate() (whose result is
+// returned) and a success runs finish_request(). In STATE_INVALIDATE the
+// request always finishes. Any other state aborts.
+bool Request::should_complete(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " should_complete: r=" << r << dendl;
+
+  if (m_state == STATE_REQUEST) {
+    if (r >= 0) {
+      finish_request();
+      return true;
+    }
+
+    lderr(cct) << "failed to update object map: " << cpp_strerror(r)
+               << dendl;
+    return invalidate();
+  }
+
+  if (m_state == STATE_INVALIDATE) {
+    ldout(cct, 20) << "INVALIDATE" << dendl;
+    if (r < 0) {
+      lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r)
+                 << dendl;
+    }
+    return true;
+  }
+
+  lderr(cct) << "invalid state: " << m_state << dendl;
+  ceph_abort();
+  return false;
+}
+
+// Flags the object map of m_snap_id as invalid unless the flag is already
+// set. Returns true when no further work is needed (flag already set) and
+// false when an InvalidateRequest was sent and its completion is pending.
+bool Request::invalidate() {
+ // only read below when test_flags() returned 0
+ bool flags_set;
+ int r = m_image_ctx.test_flags(m_snap_id, RBD_FLAG_OBJECT_MAP_INVALID,
+ &flags_set);
+ if (r == 0 && flags_set) {
+ return true;
+ }
+
+ m_state = STATE_INVALIDATE;
+
+ // owner_lock (read) is taken before snap_lock (write); both are held while
+ // the invalidate request is sent
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id,
+ true,
+ create_callback_context());
+ req->send();
+ return false;
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/Request.h b/src/librbd/object_map/Request.h
new file mode 100644
index 00000000..aef8800d
--- /dev/null
+++ b/src/librbd/object_map/Request.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+// Base class for object map update requests. If the derived request's
+// operation fails, the object map of m_snap_id is flagged invalid before the
+// request finishes (see should_complete() / invalidate()).
+class Request : public AsyncRequest<> {
+public:
+ Request(ImageCtx &image_ctx, uint64_t snap_id, Context *on_finish)
+ : AsyncRequest(image_ctx, on_finish), m_snap_id(snap_id),
+ m_state(STATE_REQUEST)
+ {
+ }
+
+ void send() override = 0;
+
+protected:
+ // snapshot id whose object map is invalidated on failure
+ const uint64_t m_snap_id;
+
+ bool should_complete(int r) override;
+ // Maps the result to 0 while still in STATE_REQUEST; in STATE_INVALIDATE the
+ // result is passed through unchanged.
+ int filter_return_code(int r) const override {
+ if (m_state == STATE_REQUEST) {
+ // never propagate an error back to the caller
+ return 0;
+ }
+ return r;
+ }
+ // Hook invoked from should_complete() when the request step succeeded;
+ // the default implementation does nothing.
+ virtual void finish_request() {
+ }
+
+private:
+ /**
+ * <start> ---> STATE_REQUEST ---> <finish>
+ * | ^
+ * v |
+ * STATE_INVALIDATE -------/
+ */
+ enum State {
+ STATE_REQUEST,
+ STATE_INVALIDATE
+ };
+
+ State m_state;
+
+ bool invalidate();
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
diff --git a/src/librbd/object_map/ResizeRequest.cc b/src/librbd/object_map/ResizeRequest.cc
new file mode 100644
index 00000000..1ec0d99a
--- /dev/null
+++ b/src/librbd/object_map/ResizeRequest.cc
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/ResizeRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "cls/lock/cls_lock_client.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::ResizeRequest: "
+
+namespace librbd {
+namespace object_map {
+
+// Resizes the in-memory object map to num_objs entries. When the map grows,
+// every newly added entry is set to default_state; when it shrinks or keeps
+// its size, no entry is rewritten.
+void ResizeRequest::resize(ceph::BitVector<2> *object_map, uint64_t num_objs,
+                           uint8_t default_state) {
+  // keep the container's own size type: num_objs is 64-bit, and storing the
+  // previous size in size_t could narrow it where size_t is 32 bits wide
+  const auto orig_object_map_size = object_map->size();
+  object_map->resize(num_objs);
+  if (num_objs <= orig_object_map_size) {
+    return;
+  }
+
+  auto it = object_map->begin() + orig_object_map_size;
+  auto end_it = object_map->begin() + num_objs;
+  for (; it != end_it; ++it) {
+    *it = default_state;
+  }
+}
+
+// Computes the object count for m_new_size from the image layout and issues
+// an asynchronous resize of the on-disk object map of m_snap_id.
+void ResizeRequest::send() {
+ CephContext *cct = m_image_ctx.cct;
+
+ // object_map_lock is write-held for the remainder of this function
+ RWLock::WLocker l(m_image_ctx.object_map_lock);
+ m_num_objs = Striper::get_num_objects(m_image_ctx.layout, m_new_size);
+
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 5) << this << " resizing on-disk object map: "
+ << "ictx=" << &m_image_ctx << ", "
+ << "oid=" << oid << ", num_objs=" << m_num_objs << dendl;
+
+ librados::ObjectWriteOperation op;
+ // the exclusive-lock assertion is only added for the HEAD object map
+ if (m_snap_id == CEPH_NOSNAP) {
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+ }
+ cls_client::object_map_resize(&op, m_num_objs, m_default_object_state);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+// Success hook: applies the same resize to the in-memory object map, using
+// the object count computed in send().
+void ResizeRequest::finish_request() {
+ RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " resizing in-memory object map: "
+ << m_num_objs << dendl;
+
+ resize(m_object_map, m_num_objs, m_default_object_state);
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/ResizeRequest.h b/src/librbd/object_map/ResizeRequest.h
new file mode 100644
index 00000000..51839400
--- /dev/null
+++ b/src/librbd/object_map/ResizeRequest.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+#include "common/bit_vector.hpp"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+// Object map request that resizes the on-disk map for a snapshot id and, on
+// success, the in-memory map to match.
+class ResizeRequest : public Request {
+public:
+ // new_size is converted into an object count in send();
+ // default_object_state is the state given to newly added entries.
+ ResizeRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
+ uint64_t snap_id, uint64_t new_size,
+ uint8_t default_object_state, Context *on_finish)
+ : Request(image_ctx, snap_id, on_finish), m_object_map(object_map),
+ m_num_objs(0), m_new_size(new_size),
+ m_default_object_state(default_object_state)
+ {
+ }
+
+ // Resizes object_map to num_objs entries, initializing any newly added
+ // entries to default_state.
+ static void resize(ceph::BitVector<2> *object_map, uint64_t num_objs,
+ uint8_t default_state);
+
+ void send() override;
+
+protected:
+ void finish_request() override;
+
+private:
+ ceph::BitVector<2> *m_object_map;
+ // object count derived from m_new_size; set in send()
+ uint64_t m_num_objs;
+ uint64_t m_new_size;
+ uint8_t m_default_object_state;
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
diff --git a/src/librbd/object_map/SnapshotCreateRequest.cc b/src/librbd/object_map/SnapshotCreateRequest.cc
new file mode 100644
index 00000000..2949dec8
--- /dev/null
+++ b/src/librbd/object_map/SnapshotCreateRequest.cc
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotCreateRequest.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "cls/lock/cls_lock_client.h"
+#include <iostream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotCreateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+// Streams a human-readable name for a SnapshotCreateRequest state; values
+// outside the enum are printed as "UNKNOWN (<number>)".
+std::ostream& operator<<(std::ostream& os,
+                         const SnapshotCreateRequest::State& state) {
+  switch (state) {
+  case SnapshotCreateRequest::STATE_READ_MAP:
+    return os << "READ_MAP";
+  case SnapshotCreateRequest::STATE_WRITE_MAP:
+    return os << "WRITE_MAP";
+  case SnapshotCreateRequest::STATE_ADD_SNAPSHOT:
+    return os << "ADD_SNAPSHOT";
+  default:
+    return os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+  }
+}
+
+} // anonymous namespace
+
+// Entry point: the state machine starts by reading the HEAD object map.
+void SnapshotCreateRequest::send() {
+ send_read_map();
+}
+
+// Advances the READ_MAP -> WRITE_MAP -> ADD_SNAPSHOT state machine. The first
+// error is remembered in m_ret_val; once an error has been recorded every
+// completion is delegated to Request::should_complete(). Returns true when the
+// request is finished.
+bool SnapshotCreateRequest::should_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+ << "r=" << r << dendl;
+ if (r < 0 && m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ if (m_ret_val < 0) {
+ // pass errors down to base class to invalidate the object map
+ return Request::should_complete(r);
+ }
+
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ bool finished = false;
+ switch (m_state) {
+ case STATE_READ_MAP:
+ send_write_map();
+ break;
+ case STATE_WRITE_MAP:
+ // send_add_snapshot() returns true when the step is skipped
+ finished = send_add_snapshot();
+ break;
+ case STATE_ADD_SNAPSHOT:
+ update_object_map();
+ finished = true;
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ return finished;
+}
+
+// Enters STATE_READ_MAP and asynchronously reads the whole HEAD object map
+// object into m_read_bl. The caller must hold snap_lock.
+void SnapshotCreateRequest::send_read_map() {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl;
+  m_state = STATE_READ_MAP;
+
+  // IO is blocked due to the snapshot creation -- consistent to read from disk
+  librados::ObjectReadOperation read_op;
+  read_op.read(0, 0, nullptr, nullptr);
+
+  librados::AioCompletion *comp = create_callback_completion();
+  const int r = m_image_ctx.md_ctx.aio_operate(oid, comp, &read_op,
+                                               &m_read_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Enters STATE_WRITE_MAP and asynchronously writes the data read from the
+// HEAD object map (m_read_bl) to the object map object of m_snap_id.
+void SnapshotCreateRequest::send_write_map() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+ << dendl;
+ m_state = STATE_WRITE_MAP;
+
+ librados::ObjectWriteOperation op;
+ op.write_full(m_read_bl);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+// Issues object_map_snap_add against the HEAD object map when the FAST_DIFF
+// feature is enabled. Returns true if the step was skipped (feature disabled,
+// state left unchanged) and false if the operation was sent.
+bool SnapshotCreateRequest::send_add_snapshot() {
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) == 0) {
+ return true;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+ ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl;
+ m_state = STATE_ADD_SNAPSHOT;
+
+ librados::ObjectWriteOperation op;
+ // the operation asserts the exclusive lock on the HEAD object map object
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+ cls_client::object_map_snap_add(&op);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+ return false;
+}
+
+// Rewrites every OBJECT_EXISTS entry of the in-memory object map to
+// OBJECT_EXISTS_CLEAN while holding snap_lock and object_map_lock for write.
+void SnapshotCreateRequest::update_object_map() {
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+
+  auto entry = m_object_map.begin();
+  auto last = m_object_map.end();
+  while (entry != last) {
+    if (*entry == OBJECT_EXISTS) {
+      *entry = OBJECT_EXISTS_CLEAN;
+    }
+    ++entry;
+  }
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/SnapshotCreateRequest.h b/src/librbd/object_map/SnapshotCreateRequest.h
new file mode 100644
index 00000000..dd15abef
--- /dev/null
+++ b/src/librbd/object_map/SnapshotCreateRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "common/bit_vector.hpp"
+#include "librbd/object_map/Request.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+// Object map request run as part of snapshot creation: copies the HEAD
+// object map to the new snapshot's object map object and, with fast-diff
+// enabled, records the snapshot in the HEAD map.
+class SnapshotCreateRequest : public Request {
+public:
+ /**
+ * Snapshot create goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_READ_MAP
+ * |
+ * v (skip)
+ * STATE_WRITE_MAP . . . . . . .
+ * | .
+ * v v
+ * STATE_ADD_SNAPSHOT ---> <finish>
+ *
+ * @endverbatim
+ *
+ * The _ADD_SNAPSHOT state is skipped if the FAST_DIFF feature isn't enabled.
+ */
+ enum State {
+ STATE_READ_MAP,
+ STATE_WRITE_MAP,
+ STATE_ADD_SNAPSHOT
+ };
+
+ // object_map must be non-null: it is dereferenced and kept by reference.
+ SnapshotCreateRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
+ uint64_t snap_id, Context *on_finish)
+ : Request(image_ctx, snap_id, on_finish),
+ m_object_map(*object_map), m_ret_val(0) {
+ }
+
+ void send() override;
+
+protected:
+ bool should_complete(int r) override;
+
+private:
+ State m_state = STATE_READ_MAP;
+ // in-memory object map updated once the snapshot has been added
+ ceph::BitVector<2> &m_object_map;
+
+ // raw contents of the HEAD object map object, copied to the snapshot's map
+ bufferlist m_read_bl;
+ // first error seen by should_complete(); 0 if none
+ int m_ret_val;
+
+ void send_read_map();
+ void send_write_map();
+ bool send_add_snapshot();
+
+ void update_object_map();
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
diff --git a/src/librbd/object_map/SnapshotRemoveRequest.cc b/src/librbd/object_map/SnapshotRemoveRequest.cc
new file mode 100644
index 00000000..54f67c8d
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRemoveRequest.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "cls/lock/cls_lock_client.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotRemoveRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace object_map {
+
+// Entry point; the caller must hold owner_lock and write-hold snap_lock.
+// Without fast-diff the snapshot's object map is removed directly; with
+// fast-diff the snapshot flags and the next snapshot id are captured and the
+// snapshot's object map is loaded first.
+void SnapshotRemoveRequest::send() {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ceph_assert(m_image_ctx.snap_lock.is_wlocked());
+
+  const bool fast_diff_enabled =
+    (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0;
+  if (!fast_diff_enabled) {
+    remove_map();
+    return;
+  }
+
+  int r = m_image_ctx.get_flags(m_snap_id, &m_flags);
+  ceph_assert(r == 0);
+
+  compute_next_snap_id();
+  load_map();
+}
+
+// Asynchronously loads the object map of the snapshot being removed into
+// m_out_bl; the result is delivered to handle_load_map().
+void SnapshotRemoveRequest::load_map() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 5) << "snap_oid=" << snap_oid << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::object_map_load_start(&op);
+
+ auto rados_completion = librbd::util::create_rados_callback<
+ SnapshotRemoveRequest, &SnapshotRemoveRequest::handle_load_map>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+// Completion handler for load_map(). Decodes the loaded map into
+// m_snap_object_map, then: -ENOENT completes the request successfully, any
+// other error invalidates the next object map, and success continues with
+// remove_snapshot().
+void SnapshotRemoveRequest::handle_load_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = cls_client::object_map_load_finish(&it, &m_snap_object_map);
+ }
+ if (r == -ENOENT) {
+ // implies we have already deleted this snapshot and handled the
+ // necessary fast-diff cleanup
+ complete(0);
+ return;
+ } else if (r < 0) {
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ lderr(cct) << "failed to load object map " << oid << ": "
+ << cpp_strerror(r) << dendl;
+
+ // invalidate_next_map() asserts that both locks are held
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ invalidate_next_map();
+ return;
+ }
+
+ remove_snapshot();
+}
+
+// Issues object_map_snap_remove, with the removed snapshot's map as input,
+// against the object map of the next snapshot (or HEAD). If the removed
+// snapshot's map was flagged invalid, the next map is invalidated instead.
+void SnapshotRemoveRequest::remove_snapshot() {
+ if ((m_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
+ // snapshot object map exists on disk but is invalid. cannot clean fast-diff
+ // on next snapshot if current snapshot was invalid.
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ invalidate_next_map();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_next_snap_id));
+ ldout(cct, 5) << "oid=" << oid << dendl;
+
+ librados::ObjectWriteOperation op;
+ // the exclusive-lock assertion is only added when the target is HEAD
+ if (m_next_snap_id == CEPH_NOSNAP) {
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+ }
+ cls_client::object_map_snap_remove(&op, m_snap_object_map);
+
+ auto rados_completion = librbd::util::create_rados_callback<
+ SnapshotRemoveRequest,
+ &SnapshotRemoveRequest::handle_remove_snapshot>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+// Completion handler for remove_snapshot(). Errors other than -ENOENT
+// invalidate the next object map; otherwise the in-memory map is updated and
+// the snapshot's object map object is removed.
+void SnapshotRemoveRequest::handle_remove_snapshot(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+ if (r < 0 && r != -ENOENT) {
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id,
+ m_next_snap_id));
+ lderr(cct) << "failed to remove object map snapshot " << oid << ": "
+ << cpp_strerror(r) << dendl;
+
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ invalidate_next_map();
+ return;
+ }
+
+ // update_object_map() asserts that snap_lock is held
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ update_object_map();
+ remove_map();
+}
+
+// Sends an InvalidateRequest for the object map of m_next_snap_id; the
+// result is delivered to handle_invalidate_next_map(). The caller must hold
+// owner_lock and write-hold snap_lock.
+void SnapshotRemoveRequest::invalidate_next_map() {
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ ceph_assert(m_image_ctx.snap_lock.is_wlocked());
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = librbd::util::create_context_callback<
+ SnapshotRemoveRequest,
+ &SnapshotRemoveRequest::handle_invalidate_next_map>(this);
+ InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx,
+ m_next_snap_id, true, ctx);
+ req->send();
+}
+
+// Completion handler for invalidate_next_map(). On success the snapshot's
+// object map object is removed; on failure the request completes with the
+// error.
+void SnapshotRemoveRequest::handle_invalidate_next_map(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    remove_map();
+    return;
+  }
+
+  std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id,
+                                               m_next_snap_id));
+  lderr(cct) << "failed to invalidate object map " << oid << ": "
+             << cpp_strerror(r) << dendl;
+  complete(r);
+}
+
+// Asynchronously removes the object map object of the snapshot being
+// removed; the result is delivered to handle_remove_map().
+void SnapshotRemoveRequest::remove_map() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 5) << "oid=" << oid << dendl;
+
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ auto rados_completion = librbd::util::create_rados_callback<
+ SnapshotRemoveRequest, &SnapshotRemoveRequest::handle_remove_map>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+// Completion handler for remove_map(). A missing object (-ENOENT) counts as
+// success; any other error is logged and becomes the request's result.
+void SnapshotRemoveRequest::handle_remove_map(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool removal_failed = (r < 0) && (r != -ENOENT);
+  if (removal_failed) {
+    std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+    lderr(cct) << "failed to remove object map " << oid << ": "
+               << cpp_strerror(r) << dendl;
+  }
+
+  complete(removal_failed ? r : 0);
+}
+
+// Sets m_next_snap_id to the key following m_snap_id in the image's
+// snap_info map, or to CEPH_NOSNAP when m_snap_id is the last entry. The
+// caller must hold snap_lock and m_snap_id must be present in snap_info.
+void SnapshotRemoveRequest::compute_next_snap_id() {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+
+  m_next_snap_id = CEPH_NOSNAP;
+  const auto &snaps = m_image_ctx.snap_info;
+  auto snap_it = snaps.find(m_snap_id);
+  ceph_assert(snap_it != snaps.end());
+
+  ++snap_it;
+  if (snap_it == snaps.end()) {
+    return;
+  }
+  m_next_snap_id = snap_it->first;
+}
+
+// Updates the in-memory object map after the snapshot has been removed. It
+// only acts when the next snapshot id is CEPH_NOSNAP and the image's snap_id
+// is also CEPH_NOSNAP. In that case an OBJECT_EXISTS_CLEAN entry becomes
+// OBJECT_EXISTS when the removed snapshot's map has no entry for that index
+// or holds OBJECT_EXISTS there. The caller must hold snap_lock.
+void SnapshotRemoveRequest::update_object_map() {
+  // use ceph_assert like the other lock checks in this file; plain assert()
+  // is compiled out when NDEBUG is defined
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+  if (m_next_snap_id == m_image_ctx.snap_id && m_next_snap_id == CEPH_NOSNAP) {
+    CephContext *cct = m_image_ctx.cct;
+    ldout(cct, 5) << dendl;
+
+    auto it = m_object_map.begin();
+    auto end_it = m_object_map.end();
+    auto snap_it = m_snap_object_map.begin();
+    uint64_t i = 0;
+    for (; it != end_it; ++it) {
+      if (*it == OBJECT_EXISTS_CLEAN &&
+          (i >= m_snap_object_map.size() ||
+           *snap_it == OBJECT_EXISTS)) {
+        *it = OBJECT_EXISTS;
+      }
+      // the snapshot map may be shorter than the in-memory map; never
+      // advance its iterator past the end
+      if (i < m_snap_object_map.size()) {
+        ++snap_it;
+      }
+      ++i;
+    }
+  }
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/SnapshotRemoveRequest.h b/src/librbd/object_map/SnapshotRemoveRequest.h
new file mode 100644
index 00000000..bd020d1a
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRemoveRequest.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/bit_vector.hpp"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+namespace object_map {
+
+// Asynchronous request that removes the object map of a snapshot and, with
+// fast-diff enabled, first folds its state into the next snapshot's (or
+// HEAD's) object map.
+class SnapshotRemoveRequest : public AsyncRequest<> {
+public:
+ /**
+ * Snapshot removal goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start> -----------> STATE_LOAD_MAP ----\
+ * . * |
+ * . * (error) |
+ * . (invalid object map) v |
+ * . . . > STATE_INVALIDATE_NEXT_MAP |
+ * . | |
+ * . | |
+ * . (fast diff disabled) v v
+ * . . . . . . . . . . > STATE_REMOVE_MAP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ * The _LOAD_MAP state is skipped if the fast diff feature is disabled.
+ * If the fast diff feature is enabled and the snapshot is flagged as
+ * invalid, the next snapshot / HEAD object map is flagged as invalid;
+ * otherwise, the state machine proceeds to remove the object map.
+ */
+
+ // object_map must be non-null: it is dereferenced and kept by reference.
+ SnapshotRemoveRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
+ uint64_t snap_id, Context *on_finish)
+ : AsyncRequest(image_ctx, on_finish), m_object_map(*object_map),
+ m_snap_id(snap_id), m_next_snap_id(CEPH_NOSNAP) {
+ }
+
+ void send() override;
+
+protected:
+ // Always reports completion; the handlers call complete() only when the
+ // state machine has reached its end.
+ bool should_complete(int r) override {
+ return true;
+ }
+
+private:
+ // in-memory object map updated after the snapshot has been removed
+ ceph::BitVector<2> &m_object_map;
+ // snapshot being removed
+ uint64_t m_snap_id;
+ // snapshot following m_snap_id in snap_info, or CEPH_NOSNAP for HEAD
+ uint64_t m_next_snap_id;
+
+ // flags of m_snap_id, fetched in send() when fast-diff is enabled
+ uint64_t m_flags = 0;
+
+ // object map of the snapshot being removed, decoded from m_out_bl
+ ceph::BitVector<2> m_snap_object_map;
+ bufferlist m_out_bl;
+
+ void load_map();
+ void handle_load_map(int r);
+
+ void remove_snapshot();
+ void handle_remove_snapshot(int r);
+
+ void invalidate_next_map();
+ void handle_invalidate_next_map(int r);
+
+ void remove_map();
+ void handle_remove_map(int r);
+
+ void compute_next_snap_id();
+ void update_object_map();
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
diff --git a/src/librbd/object_map/SnapshotRollbackRequest.cc b/src/librbd/object_map/SnapshotRollbackRequest.cc
new file mode 100644
index 00000000..d32123ff
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRollbackRequest.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotRollbackRequest.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include <iostream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotRollbackRequest: "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+// Streams a human-readable name for a SnapshotRollbackRequest state; values
+// outside the enum are printed as "UNKNOWN (<number>)".
+std::ostream& operator<<(std::ostream& os,
+                         const SnapshotRollbackRequest::State& state) {
+  switch (state) {
+  case SnapshotRollbackRequest::STATE_READ_MAP:
+    return os << "READ_MAP";
+  case SnapshotRollbackRequest::STATE_INVALIDATE_MAP:
+    return os << "INVALIDATE_MAP";
+  case SnapshotRollbackRequest::STATE_WRITE_MAP:
+    return os << "WRITE_MAP";
+  default:
+    return os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+  }
+}
+
+} // anonymous namespace
+
+// Entry point: the state machine starts by reading the snapshot's object map.
+void SnapshotRollbackRequest::send() {
+ send_read_map();
+}
+
+// Advances the rollback state machine and returns true when the request is
+// finished. The first error is remembered in m_ret_val. A failed read
+// invalidates the snapshot's object map; afterwards the remembered error is
+// handed to Request::should_complete(), as is the result of the write.
+bool SnapshotRollbackRequest::should_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+ << "r=" << r << dendl;
+ if (r < 0 && m_ret_val == 0) {
+ m_ret_val = r;
+ }
+
+ bool finished = false;
+ switch (m_state) {
+ case STATE_READ_MAP:
+ if (r < 0) {
+ // invalidate the snapshot object map
+ send_invalidate_map();
+ } else {
+ send_write_map();
+ }
+ break;
+ case STATE_INVALIDATE_MAP:
+ // invalidate the HEAD object map as well
+ finished = Request::should_complete(m_ret_val);
+ break;
+ case STATE_WRITE_MAP:
+ finished = Request::should_complete(r);
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ return finished;
+}
+
+// Enters STATE_READ_MAP and asynchronously reads the whole object map object
+// of the snapshot being rolled back to into m_read_bl.
+void SnapshotRollbackRequest::send_read_map() {
+  std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+                << dendl;
+  m_state = STATE_READ_MAP;
+
+  librados::ObjectReadOperation read_op;
+  read_op.read(0, 0, nullptr, nullptr);
+
+  librados::AioCompletion *comp = create_callback_completion();
+  const int r = m_image_ctx.md_ctx.aio_operate(snap_oid, comp, &read_op,
+                                               &m_read_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Enters STATE_WRITE_MAP and asynchronously overwrites the HEAD object map
+// object with the snapshot's map data (m_read_bl); the write also asserts the
+// exclusive lock.
+void SnapshotRollbackRequest::send_write_map() {
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+
+ CephContext *cct = m_image_ctx.cct;
+ // despite its name, snap_oid is the HEAD (CEPH_NOSNAP) object map object
+ std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id,
+ CEPH_NOSNAP));
+ ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+ << dendl;
+ m_state = STATE_WRITE_MAP;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+ op.write_full(m_read_bl);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+// Enters STATE_INVALIDATE_MAP and sends an InvalidateRequest for the object
+// map of the snapshot that could not be read.
+void SnapshotRollbackRequest::send_invalidate_map() {
+ // owner_lock (read) is taken before snap_lock (write); both are held while
+ // the invalidate request is sent
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ m_state = STATE_INVALIDATE_MAP;
+
+ InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id,
+ false,
+ create_callback_context());
+ req->send();
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/SnapshotRollbackRequest.h b/src/librbd/object_map/SnapshotRollbackRequest.h
new file mode 100644
index 00000000..e26b1e0a
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRollbackRequest.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+// Object map request that copies a snapshot's object map over the HEAD
+// object map as part of a snapshot rollback.
+class SnapshotRollbackRequest : public Request {
+public:
+  /**
+   * Snapshot rollback goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v                           (error)
+   * STATE_READ_MAP * * * * > STATE_INVALIDATE_MAP
+   *    |                             |
+   *    v                             v
+   * STATE_WRITE_MAP -------> <finish>
+   *
+   * @endverbatim
+   *
+   * A failure while in READ_MAP flags the snapshot's own object map as
+   * invalid. A failure in any other state is left to the base class, which
+   * flags the HEAD object map as invalid.
+   */
+  enum State {
+    STATE_READ_MAP,
+    STATE_INVALIDATE_MAP,
+    STATE_WRITE_MAP
+  };
+
+  // snap_id names the snapshot whose map is rolled back to; it must refer
+  // to a real snapshot (never CEPH_NOSNAP). The base request itself is
+  // bound to the HEAD revision.
+  SnapshotRollbackRequest(ImageCtx &image_ctx, uint64_t snap_id,
+                          Context *on_finish)
+    : Request(image_ctx, CEPH_NOSNAP, on_finish),
+      m_snap_id(snap_id) {
+    ceph_assert(snap_id != CEPH_NOSNAP);
+  }
+
+  void send() override;
+
+protected:
+  bool should_complete(int r) override;
+
+private:
+  State m_state = STATE_READ_MAP;
+  uint64_t m_snap_id;
+  int m_ret_val = 0;
+
+  // snapshot object map contents, filled by READ_MAP, consumed by WRITE_MAP
+  bufferlist m_read_bl;
+
+  void send_read_map();
+  void send_invalidate_map();
+  void send_write_map();
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
diff --git a/src/librbd/object_map/UnlockRequest.cc b/src/librbd/object_map/UnlockRequest.cc
new file mode 100644
index 00000000..0220ec90
--- /dev/null
+++ b/src/librbd/object_map/UnlockRequest.cc
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/UnlockRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::UnlockRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_callback;
+
+// Bind the request to the image context and to the context returned once
+// the unlock attempt has been handled.
+template <typename I>
+UnlockRequest<I>::UnlockRequest(I &image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_on_finish(on_finish) {
+}
+
+// Entry point of the state machine; the only state is UNLOCK.
+template <typename I>
+void UnlockRequest<I>::send() {
+  send_unlock();
+}
+
+// Asynchronously issue a cls_lock unlock of RBD_LOCK_NAME against the HEAD
+// object map object; handle_unlock() is invoked on completion.
+template <typename I>
+void UnlockRequest<I>::send_unlock() {
+  CephContext *cct = m_image_ctx.cct;
+  const std::string oid(
+    ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  librados::ObjectWriteOperation op;
+  rados::cls::lock::unlock(&op, RBD_LOCK_NAME, "");
+
+  auto *comp = create_rados_callback<
+    UnlockRequest<I>, &UnlockRequest<I>::handle_unlock>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion of the unlock operation. Failures other than -ENOENT are
+// logged, but the result is always rewritten to 0 before the on_finish
+// context is returned, so the caller never sees an unlock error.
+template <typename I>
+Context *UnlockRequest<I>::handle_unlock(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  const int r = *ret_val;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  const bool unexpected_error = (r < 0 && r != -ENOENT);
+  if (unexpected_error) {
+    lderr(cct) << "failed to release object map lock: "
+               << cpp_strerror(r) << dendl;
+  }
+
+  *ret_val = 0;
+  return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::UnlockRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/UnlockRequest.h b/src/librbd/object_map/UnlockRequest.h
new file mode 100644
index 00000000..ae1d9e93
--- /dev/null
+++ b/src/librbd/object_map/UnlockRequest.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+// Single-step asynchronous request that unlocks the HEAD object map object.
+template <typename ImageCtxT = ImageCtx>
+class UnlockRequest {
+public:
+  // Heap-allocating factory; arguments are forwarded to the constructor.
+  static UnlockRequest *create(ImageCtxT &image_ctx, Context *on_finish) {
+    return new UnlockRequest(image_ctx, on_finish);
+  }
+
+  UnlockRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+  // Start the request.
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start> ----> UNLOCK ----> <finish>
+   *
+   * @endverbatim
+   */
+
+  ImageCtxT &m_image_ctx;
+  Context *m_on_finish;
+
+  void send_unlock();
+  Context* handle_unlock(int *ret_val);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::UnlockRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
diff --git a/src/librbd/object_map/UpdateRequest.cc b/src/librbd/object_map/UpdateRequest.cc
new file mode 100644
index 00000000..4b305334
--- /dev/null
+++ b/src/librbd/object_map/UpdateRequest.cc
@@ -0,0 +1,129 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/UpdateRequest.h"
+#include "include/rbd/object_map_types.h"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "cls/lock/cls_lock_client.h"
+#include <string>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+// largest number of objects covered by a single update operation;
+// keep aligned to bit_vector 4K block sizes
+const uint64_t MAX_OBJECTS_PER_UPDATE = uint64_t{256} << 10;
+
+}
+
+// Entry point: dispatch the first (possibly only) batch of the update.
+template <typename I>
+void UpdateRequest<I>::send() {
+  update_object_map();
+}
+
+// Send one batch of the on-disk object map update covering
+// [m_update_start_object_no, m_update_end_object_no). Callers must hold
+// snap_lock and object_map_lock (asserted below).
+template <typename I>
+void UpdateRequest<I>::update_object_map() {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert(m_image_ctx.object_map_lock.is_locked());
+  CephContext *cct = m_image_ctx.cct;
+
+  // break very large requests into manageable batches
+  const uint64_t batch_limit =
+    m_update_start_object_no + MAX_OBJECTS_PER_UPDATE;
+  m_update_end_object_no = std::min(m_end_object_no, batch_limit);
+
+  const std::string oid(
+    ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", oid=" << oid << ", "
+                 << "[" << m_update_start_object_no << ","
+                 << m_update_end_object_no << ") = "
+                 << (m_current_state ?
+                       stringify(static_cast<uint32_t>(*m_current_state)) : "")
+                 << "->" << static_cast<uint32_t>(m_new_state)
+                 << dendl;
+
+  librados::ObjectWriteOperation op;
+  // the lock assertion is only attached when updating the HEAD map
+  if (m_snap_id == CEPH_NOSNAP) {
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  }
+  cls_client::object_map_update(&op, m_update_start_object_no,
+                                m_update_end_object_no, m_new_state,
+                                m_current_state);
+
+  using klass = UpdateRequest<I>;
+  auto comp = librbd::util::create_rados_callback<
+    klass, &klass::handle_update_object_map>(this);
+  std::vector<librados::snap_t> snaps;
+  int r = m_image_ctx.md_ctx.aio_operate(
+    oid, comp, &op, 0, snaps,
+    (m_trace.valid() ? m_trace.get_info() : nullptr));
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion of one batch. The first error is remembered in m_ret_val
+// (-ENOENT is forgiven when m_ignore_enoent is set), the in-memory map is
+// refreshed, and either the next batch is sent or the request completes.
+template <typename I>
+void UpdateRequest<I>::handle_update_object_map(int r) {
+  ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+
+  if (m_ignore_enoent && r == -ENOENT) {
+    r = 0;
+  }
+  if (m_ret_val == 0 && r < 0) {
+    m_ret_val = r;
+  }
+
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+    update_in_memory_object_map();
+
+    const bool batches_remaining = (m_update_end_object_no < m_end_object_no);
+    if (batches_remaining) {
+      // the next batch must be issued while both locks are still held
+      m_update_start_object_no = m_update_end_object_no;
+      update_object_map();
+      return;
+    }
+  }
+
+  // no more batch updates to send
+  complete(m_ret_val);
+}
+
+// Mirror the just-completed batch into the in-memory bit vector, but only
+// when this request targets the snapshot the image context currently has
+// open. Both ends of the batch range are clamped to the in-memory size.
+template <typename I>
+void UpdateRequest<I>::update_in_memory_object_map() {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  ceph_assert(m_image_ctx.object_map_lock.is_locked());
+
+  // rebuilding the object map might update on-disk only
+  if (m_snap_id != m_image_ctx.snap_id) {
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 20) << dendl;
+
+  const uint64_t map_size = m_object_map.size();
+  auto it = m_object_map.begin() +
+    std::min(m_update_start_object_no, map_size);
+  const auto end_it = m_object_map.begin() +
+    std::min(m_update_end_object_no, map_size);
+  while (it != end_it) {
+    auto state_ref = *it;
+    const uint8_t state = state_ref;
+    // an expected state of OBJECT_EXISTS also matches OBJECT_EXISTS_CLEAN
+    const bool matches =
+      !m_current_state || state == *m_current_state ||
+      (*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN);
+    if (matches) {
+      state_ref = m_new_state;
+    }
+    ++it;
+  }
+}
+
+// Intentionally empty: this request has nothing to do in the base class
+// finish hook.
+template <typename I>
+void UpdateRequest<I>::finish_request() {
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::UpdateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/UpdateRequest.h b/src/librbd/object_map/UpdateRequest.h
new file mode 100644
index 00000000..f5dcce15
--- /dev/null
+++ b/src/librbd/object_map/UpdateRequest.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+#include "common/bit_vector.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Utils.h"
+#include <boost/optional.hpp>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+// Object map request that moves the objects in [start_object_no,
+// end_object_no) to new_state, optionally only when they are currently in
+// current_state. The on-disk update is sent in batches.
+template <typename ImageCtxT = librbd::ImageCtx>
+class UpdateRequest : public Request {
+public:
+  // Heap-allocating factory; arguments are forwarded to the constructor.
+  static UpdateRequest *create(ImageCtx &image_ctx,
+                               ceph::BitVector<2> *object_map,
+                               uint64_t snap_id, uint64_t start_object_no,
+                               uint64_t end_object_no, uint8_t new_state,
+                               const boost::optional<uint8_t> &current_state,
+                               const ZTracer::Trace &parent_trace,
+                               bool ignore_enoent, Context *on_finish) {
+    return new UpdateRequest(image_ctx, object_map, snap_id, start_object_no,
+                             end_object_no, new_state, current_state,
+                             parent_trace, ignore_enoent, on_finish);
+  }
+
+  // object_map is dereferenced immediately and kept by reference, so it
+  // must be non-null. A child trace of parent_trace is opened and receives
+  // a "start" event.
+  UpdateRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
+                uint64_t snap_id, uint64_t start_object_no,
+                uint64_t end_object_no, uint8_t new_state,
+                const boost::optional<uint8_t> &current_state,
+                const ZTracer::Trace &parent_trace, bool ignore_enoent,
+                Context *on_finish)
+    : Request(image_ctx, snap_id, on_finish),
+      m_object_map(*object_map),
+      m_start_object_no(start_object_no),
+      m_end_object_no(end_object_no),
+      m_update_start_object_no(start_object_no),
+      m_new_state(new_state),
+      m_current_state(current_state),
+      m_trace(util::create_trace(image_ctx, "update object map", parent_trace)),
+      m_ignore_enoent(ignore_enoent)
+  {
+    m_trace.event("start");
+  }
+
+  // Emits the matching "finish" trace event.
+  virtual ~UpdateRequest() {
+    m_trace.event("finish");
+  }
+
+  void send() override;
+
+protected:
+  void finish_request() override;
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    |/------------------\
+   *    v                   | (repeat in batches)
+   * UPDATE_OBJECT_MAP -----/
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  ceph::BitVector<2> &m_object_map;
+  uint64_t m_start_object_no;
+  uint64_t m_end_object_no;
+  // bounds of the batch currently in flight
+  uint64_t m_update_start_object_no;
+  uint64_t m_update_end_object_no = 0;
+  uint8_t m_new_state;
+  boost::optional<uint8_t> m_current_state;
+  ZTracer::Trace m_trace;
+  bool m_ignore_enoent;
+
+  // first error seen across all batches (0 if none)
+  int m_ret_val = 0;
+
+  void update_object_map();
+  void handle_update_object_map(int r);
+
+  void update_in_memory_object_map();
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::UpdateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
diff --git a/src/librbd/operation/DisableFeaturesRequest.cc b/src/librbd/operation/DisableFeaturesRequest.cc
new file mode 100644
index 00000000..27dfa458
--- /dev/null
+++ b/src/librbd/operation/DisableFeaturesRequest.cc
@@ -0,0 +1,646 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/DisableFeaturesRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/image/SetFlagsRequest.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/object_map/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DisableFeaturesRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Record the feature bits to disable and the force flag; the remaining
+// arguments are forwarded to the base operation Request.
+template <typename I>
+DisableFeaturesRequest<I>::DisableFeaturesRequest(I &image_ctx,
+                                                  Context *on_finish,
+                                                  uint64_t journal_op_tid,
+                                                  uint64_t features,
+                                                  bool force)
+  : Request<I>(image_ctx, on_finish, journal_op_tid),
+    m_features(features),
+    m_force(force) {
+}
+
+// Entry point of the operation; the caller must already hold owner_lock.
+// Starts the state machine at PREPARE_LOCK.
+template <typename I>
+void DisableFeaturesRequest<I>::send_op() {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": features=" << m_features
+                 << dendl;
+
+  send_prepare_lock();
+}
+
+// Always reports completion; a negative result is only logged here.
+template <typename I>
+bool DisableFeaturesRequest<I>::should_complete(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+// PREPARE_LOCK: ask the image state machine to prepare for a lock change;
+// handle_prepare_lock() is invoked through an async context wrapper.
+template <typename I>
+void DisableFeaturesRequest<I>::send_prepare_lock() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = DisableFeaturesRequest<I>;
+  auto ctx = create_async_context_callback(
+    ictx, create_context_callback<klass, &klass::handle_prepare_lock>(this));
+  ictx.state->prepare_lock(ctx);
+}
+
+// Completion of PREPARE_LOCK. On failure the request is finished directly
+// through the context finisher (handle_finish() is not used on this path);
+// otherwise continue with BLOCK_WRITES.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_prepare_lock(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_block_writes();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl;
+  return this->create_context_finisher(*result);
+}
+
+// BLOCK_WRITES: block writes on the IO work queue while holding owner_lock
+// for write; handle_block_writes() runs on completion.
+template <typename I>
+void DisableFeaturesRequest<I>::send_block_writes() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = DisableFeaturesRequest<I>;
+  RWLock::WLocker owner_locker(ictx.owner_lock);
+  ictx.io_work_queue->block_writes(
+    create_context_callback<klass, &klass::handle_block_writes>(this));
+}
+
+// Completion of BLOCK_WRITES. On success, peer requests are additionally
+// blocked on the exclusive lock (unless a journal replay is in progress)
+// before moving on to ACQUIRE_EXCLUSIVE_LOCK.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_block_writes(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+    return handle_finish(*result);
+  }
+  m_writes_blocked = true;
+
+  {
+    RWLock::WLocker owner_locker(ictx.owner_lock);
+    // avoid accepting new requests from peers while we manipulate
+    // the image features
+    if (ictx.exclusive_lock != nullptr &&
+        !(ictx.journal != nullptr &&
+          ictx.journal->is_journal_replaying())) {
+      ictx.exclusive_lock->block_requests(0);
+      // remembered so that handle_finish() knows to unblock again
+      m_requests_blocked = true;
+    }
+  }
+
+  send_acquire_exclusive_lock();
+  return nullptr;
+}
+
+// ACQUIRE_EXCLUSIVE_LOCK: request the exclusive lock when the feature is
+// active and this client is not the owner; otherwise complete the handler
+// context immediately with 0.
+template <typename I>
+void DisableFeaturesRequest<I>::send_acquire_exclusive_lock() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = DisableFeaturesRequest<I>;
+  Context *on_acquired =
+    create_context_callback<klass, &klass::handle_acquire_exclusive_lock>(this);
+
+  {
+    RWLock::WLocker owner_locker(ictx.owner_lock);
+    // if disabling features w/ exclusive lock supported, we need to
+    // acquire the lock to temporarily block IO against the image
+    const bool must_acquire = (ictx.exclusive_lock != nullptr &&
+                               !ictx.exclusive_lock->is_lock_owner());
+    if (must_acquire) {
+      // remembered so the lock is released again once the op is done
+      m_acquired_lock = true;
+
+      ictx.exclusive_lock->acquire_lock(on_acquired);
+      return;
+    }
+  }
+
+  // nothing to acquire: continue outside of owner_lock
+  on_acquired->complete(0);
+}
+
+// Completion of ACQUIRE_EXCLUSIVE_LOCK. Verifies lock ownership, then
+// derives the final set of features to disable (m_features), the resulting
+// feature set (m_new_features), the mask sent to the OSD (m_features_mask)
+// and the image flags to update (m_disable_flags). Continues with
+// GET_MIRROR_MODE on success.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_acquire_exclusive_lock(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  bool abort_request = false;
+  {
+    // handle_finish() takes owner_lock for write, so the read lock is
+    // confined to this scope and dropped before bailing out
+    RWLock::RLocker owner_locker(ictx.owner_lock);
+    if (*result < 0) {
+      lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl;
+      abort_request = true;
+    } else if (ictx.exclusive_lock != nullptr &&
+               !ictx.exclusive_lock->is_lock_owner()) {
+      lderr(cct) << "failed to acquire exclusive lock" << dendl;
+      *result = ictx.exclusive_lock->get_unlocked_op_error();
+      abort_request = true;
+    } else {
+      // only features that are currently enabled can be disabled
+      m_features &= ictx.features;
+
+      // interlock object-map and fast-diff together
+      const uint64_t object_map_features =
+        (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF);
+      if ((m_features & object_map_features) != 0) {
+        m_features |= object_map_features;
+      }
+
+      m_new_features = ictx.features & ~m_features;
+      m_features_mask = m_features;
+
+      if ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) {
+        const uint64_t lock_dependents =
+          (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_JOURNALING);
+        if ((m_new_features & lock_dependents) != 0) {
+          lderr(cct) << "cannot disable exclusive-lock. object-map "
+                        "or journaling must be disabled before "
+                        "disabling exclusive-lock." << dendl;
+          *result = -EINVAL;
+          abort_request = true;
+        } else {
+          m_features_mask |= (RBD_FEATURE_OBJECT_MAP |
+                              RBD_FEATURE_FAST_DIFF |
+                              RBD_FEATURE_JOURNALING);
+        }
+      }
+
+      if (!abort_request) {
+        if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) {
+          m_disable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
+        }
+        if ((m_features & RBD_FEATURE_OBJECT_MAP) != 0) {
+          m_disable_flags |= RBD_FLAG_OBJECT_MAP_INVALID;
+        }
+      }
+    }
+  }
+
+  if (abort_request) {
+    return handle_finish(*result);
+  }
+
+  send_get_mirror_mode();
+  return nullptr;
+}
+
+// GET_MIRROR_MODE: fetch the mirror mode from the RBD_MIRRORING object.
+// Skipped (jumping straight to APPEND_OP_EVENT) when journaling is not
+// among the features being disabled.
+template <typename I>
+void DisableFeaturesRequest<I>::send_get_mirror_mode() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  const bool disabling_journaling =
+    ((m_features & RBD_FEATURE_JOURNALING) != 0);
+  if (!disabling_journaling) {
+    send_append_op_event();
+    return;
+  }
+
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::mirror_mode_get_start(&op);
+
+  auto *comp = create_rados_callback<
+    DisableFeaturesRequest<I>,
+    &DisableFeaturesRequest<I>::handle_get_mirror_mode>(this);
+  m_out_bl.clear();
+  int r = ictx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion of GET_MIRROR_MODE: decode the reply into m_mirror_mode.
+// -ENOENT is tolerated; any other error finishes the request.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_get_mirror_mode(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    *result = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode);
+  }
+
+  const bool fatal = (*result < 0 && *result != -ENOENT);
+  if (fatal) {
+    lderr(cct) << "failed to retrieve pool mirror mode: "
+               << cpp_strerror(*result) << dendl;
+    return handle_finish(*result);
+  }
+
+  ldout(cct, 20) << this << " " << __func__ << ": m_mirror_mode="
+                 << m_mirror_mode << dendl;
+
+  send_get_mirror_image();
+  return nullptr;
+}
+
+// GET_MIRROR_IMAGE: fetch this image's mirror record from RBD_MIRRORING.
+// Only needed in per-image mirror mode; otherwise go straight to
+// DISABLE_MIRROR_IMAGE.
+template <typename I>
+void DisableFeaturesRequest<I>::send_get_mirror_image() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  const bool image_mode = (m_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE);
+  if (!image_mode) {
+    send_disable_mirror_image();
+    return;
+  }
+
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::mirror_image_get_start(&op, ictx.id);
+
+  auto *comp = create_rados_callback<
+    DisableFeaturesRequest<I>,
+    &DisableFeaturesRequest<I>::handle_get_mirror_image>(this);
+  m_out_bl.clear();
+  int r = ictx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion of GET_MIRROR_IMAGE: decode the mirror image record. -ENOENT
+// is tolerated; any other error finishes the request. If the image is
+// still enabled for mirroring and the request was not forced, journaling
+// cannot be disabled and the request fails with -EINVAL.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_get_mirror_image(int *result) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  // trace the incoming result like every other handler in this request
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  cls::rbd::MirrorImage mirror_image;
+
+  if (*result == 0) {
+    auto it = m_out_bl.cbegin();
+    *result = cls_client::mirror_image_get_finish(&it, &mirror_image);
+  }
+
+  if (*result < 0 && *result != -ENOENT) {
+    lderr(cct) << "failed to retrieve pool mirror image: "
+               << cpp_strerror(*result) << dendl;
+    return handle_finish(*result);
+  }
+
+  if ((mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) && !m_force) {
+    lderr(cct) << "cannot disable journaling: image mirroring "
+               << "enabled and mirror pool mode set to image"
+               << dendl;
+    *result = -EINVAL;
+    return handle_finish(*result);
+  }
+
+  send_disable_mirror_image();
+  return nullptr;
+}
+
+// DISABLE_MIRROR_IMAGE: delegate to mirror::DisableRequest, passing the
+// force flag through; handle_disable_mirror_image() runs on completion.
+template <typename I>
+void DisableFeaturesRequest<I>::send_disable_mirror_image() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = DisableFeaturesRequest<I>;
+  Context *on_disabled =
+    create_context_callback<klass, &klass::handle_disable_mirror_image>(this);
+
+  auto *disable_req =
+    mirror::DisableRequest<I>::create(&ictx, m_force, true, on_disabled);
+  disable_req->send();
+}
+
+// Completion of DISABLE_MIRROR_IMAGE. A failure is logged but deliberately
+// ignored; the state machine always proceeds to CLOSE_JOURNAL.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_disable_mirror_image(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool failed = (*result < 0);
+  if (failed) {
+    // not fatal
+    lderr(cct) << "failed to disable image mirroring: " << cpp_strerror(*result)
+               << dendl;
+  }
+
+  send_close_journal();
+  return nullptr;
+}
+
+// CLOSE_JOURNAL: if the image currently has a journal, swap it into
+// m_journal under owner_lock and close it asynchronously. Without one, go
+// directly to REMOVE_JOURNAL.
+template <typename I>
+void DisableFeaturesRequest<I>::send_close_journal() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  {
+    RWLock::WLocker owner_locker(ictx.owner_lock);
+    if (ictx.journal != nullptr) {
+      ldout(cct, 20) << this << " " << __func__ << dendl;
+
+      // the journal pointer is exchanged with this request's m_journal
+      std::swap(m_journal, ictx.journal);
+
+      using klass = DisableFeaturesRequest<I>;
+      Context *on_closed =
+        create_context_callback<klass, &klass::handle_close_journal>(this);
+      m_journal->close(on_closed);
+      return;
+    }
+  }
+
+  send_remove_journal();
+}
+
+// Completion of CLOSE_JOURNAL. A close failure is only logged; the journal
+// object taken over in send_close_journal() is destroyed either way and
+// the state machine moves on to REMOVE_JOURNAL.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_close_journal(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool failed = (*result < 0);
+  if (failed) {
+    lderr(cct) << "failed to close image journal: " << cpp_strerror(*result)
+               << dendl;
+  }
+
+  ceph_assert(m_journal != nullptr);
+  delete m_journal;
+  m_journal = nullptr;
+
+  send_remove_journal();
+  return nullptr;
+}
+
+// REMOVE_JOURNAL: delegate removal of the image journal to
+// journal::RemoveRequest; handle_remove_journal() runs on completion.
+template <typename I>
+void DisableFeaturesRequest<I>::send_remove_journal() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = DisableFeaturesRequest<I>;
+  Context *on_removed =
+    create_context_callback<klass, &klass::handle_remove_journal>(this);
+
+  auto *remove_req = journal::RemoveRequest<I>::create(
+    ictx.md_ctx, ictx.id, librbd::Journal<>::IMAGE_CLIENT_ID,
+    ictx.op_work_queue, on_removed);
+
+  remove_req->send();
+}
+
+// Completion of REMOVE_JOURNAL: any error finishes the request, success
+// continues with APPEND_OP_EVENT.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_remove_journal(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_append_op_event();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to remove image journal: " << cpp_strerror(*result)
+             << dendl;
+  return handle_finish(*result);
+}
+
+// APPEND_OP_EVENT: try to append the op event via the base class. When
+// append_op_event() reports that nothing was scheduled (returns false),
+// continue synchronously with REMOVE_OBJECT_MAP; otherwise
+// handle_append_op_event() is invoked later.
+template <typename I>
+void DisableFeaturesRequest<I>::send_append_op_event() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  // trace on entry, before any follow-up state is dispatched, so that log
+  // output reflects the real order of state transitions
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  if (!this->template append_op_event<
+        DisableFeaturesRequest<I>,
+        &DisableFeaturesRequest<I>::handle_append_op_event>(this)) {
+    send_remove_object_map();
+  }
+}
+
+// Completion of APPEND_OP_EVENT: any error finishes the request, success
+// continues with REMOVE_OBJECT_MAP.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_append_op_event(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_remove_object_map();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result)
+             << dendl;
+  return handle_finish(*result);
+}
+
+// REMOVE_OBJECT_MAP: delegate to object_map::RemoveRequest when object-map
+// is among the features being disabled; otherwise skip to SET_FEATURES.
+template <typename I>
+void DisableFeaturesRequest<I>::send_remove_object_map() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  const bool disabling_object_map =
+    ((m_features & RBD_FEATURE_OBJECT_MAP) != 0);
+  if (!disabling_object_map) {
+    send_set_features();
+    return;
+  }
+
+  using klass = DisableFeaturesRequest<I>;
+  Context *on_removed =
+    create_context_callback<klass, &klass::handle_remove_object_map>(this);
+
+  auto *remove_req = object_map::RemoveRequest<I>::create(&ictx, on_removed);
+  remove_req->send();
+}
+
+// Completion of REMOVE_OBJECT_MAP: -ENOENT is tolerated, any other error
+// finishes the request; otherwise continue with SET_FEATURES.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_remove_object_map(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool fatal = (*result < 0 && *result != -ENOENT);
+  if (fatal) {
+    lderr(cct) << "failed to remove object map: " << cpp_strerror(*result) << dendl;
+    return handle_finish(*result);
+  }
+
+  send_set_features();
+  return nullptr;
+}
+
+// SET_FEATURES: write the new feature bits and mask to the image header
+// object; handle_set_features() runs on completion.
+template <typename I>
+void DisableFeaturesRequest<I>::send_set_features() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": new_features="
+                 << m_new_features << ", features_mask=" << m_features_mask
+                 << dendl;
+
+  librados::ObjectWriteOperation op;
+  librbd::cls_client::set_features(&op, m_new_features, m_features_mask);
+
+  auto *comp = create_rados_callback<
+    DisableFeaturesRequest<I>,
+    &DisableFeaturesRequest<I>::handle_set_features>(this);
+  int r = ictx.md_ctx.aio_operate(ictx.header_oid, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+// Completion of SET_FEATURES. An -EINVAL while the journaling bit is part
+// of the mask triggers exactly one retry with that bit removed from the
+// mask; any other error finishes the request. On success continue with
+// UPDATE_FLAGS.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_set_features(int *result) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == -EINVAL && (m_features_mask & RBD_FEATURE_JOURNALING) != 0) {
+    // NOTE: infernalis OSDs will not accept a mask with new features, so
+    // re-attempt with a reduced mask.
+    ldout(cct, 5) << this << " " << __func__
+                  << ": re-attempt with a reduced mask" << dendl;
+    m_features_mask &= ~RBD_FEATURE_JOURNALING;
+    send_set_features();
+    // the retry re-enters this handler on completion; return here so the
+    // request is not also finished with the stale -EINVAL while the retry
+    // is still in flight
+    return nullptr;
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to update features: " << cpp_strerror(*result)
+               << dendl;
+    return handle_finish(*result);
+  }
+
+  send_update_flags();
+  return nullptr;
+}
+
+// UPDATE_FLAGS: delegate to image::SetFlagsRequest with m_disable_flags as
+// the mask and 0 as the flags value. Skipped (going to NOTIFY_UPDATE) when
+// no flags were collected.
+template <typename I>
+void DisableFeaturesRequest<I>::send_update_flags() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  const bool nothing_to_update = (m_disable_flags == 0);
+  if (nothing_to_update) {
+    send_notify_update();
+    return;
+  }
+
+  ldout(cct, 20) << this << " " << __func__ << ": disable_flags="
+                 << m_disable_flags << dendl;
+
+  using klass = DisableFeaturesRequest<I>;
+  Context *on_updated =
+    create_context_callback<klass, &klass::handle_update_flags>(this);
+
+  auto *flags_req =
+    image::SetFlagsRequest<I>::create(&ictx, 0, m_disable_flags, on_updated);
+  flags_req->send();
+}
+
+// Completion of UPDATE_FLAGS: any error finishes the request, success
+// continues with NOTIFY_UPDATE.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_update_flags(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_notify_update();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to update image flags: " << cpp_strerror(*result)
+             << dendl;
+  return handle_finish(*result);
+}
+
+// NOTIFY_UPDATE: send an image update notification through the image
+// context; handle_notify_update() runs on completion.
+template <typename I>
+void DisableFeaturesRequest<I>::send_notify_update() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = DisableFeaturesRequest<I>;
+  Context *on_notified =
+    create_context_callback<klass, &klass::handle_notify_update>(this);
+
+  ictx.notify_update(on_notified);
+}
+
+// Completion of NOTIFY_UPDATE. If this request acquired the exclusive lock
+// itself (and the lock object still exists), release it again; otherwise
+// finish with the notification result.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_notify_update(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool must_release = (ictx.exclusive_lock != nullptr && m_acquired_lock);
+  if (must_release) {
+    send_release_exclusive_lock();
+    return nullptr;
+  }
+
+  return handle_finish(*result);
+}
+
+// RELEASE_EXCLUSIVE_LOCK: give back the exclusive lock;
+// handle_release_exclusive_lock() runs on completion.
+template <typename I>
+void DisableFeaturesRequest<I>::send_release_exclusive_lock() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = DisableFeaturesRequest<I>;
+  Context *on_released =
+    create_context_callback<klass, &klass::handle_release_exclusive_lock>(this);
+
+  ictx.exclusive_lock->release_lock(on_released);
+}
+
+// Completion of RELEASE_EXCLUSIVE_LOCK: the release result becomes the
+// final result of the request.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_release_exclusive_lock(int *result) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  return handle_finish(*result);
+}
+
+// Common exit path: undo the blocking done earlier (peer requests on the
+// exclusive lock if they were blocked, then writes on the IO work queue),
+// tell the image state that the prepare-lock phase is over, and return the
+// context finisher carrying r.
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_finish(int r) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  {
+    RWLock::WLocker owner_locker(ictx.owner_lock);
+    const bool unblock_peer_requests =
+      (ictx.exclusive_lock != nullptr && m_requests_blocked);
+    if (unblock_peer_requests) {
+      ictx.exclusive_lock->unblock_requests();
+    }
+
+    ictx.io_work_queue->unblock_writes();
+  }
+  ictx.state->handle_prepare_lock_complete();
+
+  return this->create_context_finisher(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::DisableFeaturesRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/DisableFeaturesRequest.h b/src/librbd/operation/DisableFeaturesRequest.h
new file mode 100644
index 00000000..b064bc45
--- /dev/null
+++ b/src/librbd/operation/DisableFeaturesRequest.h
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/operation/Request.h"
+#include "cls/rbd/cls_rbd_client.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+/**
+ * Operation request implementing the "disable features" state machine
+ * documented in the diagram below. Each state has a send_* / handle_* pair;
+ * the handlers receive the step's result through an int pointer and return
+ * a Context pointer.
+ */
+template <typename ImageCtxT = ImageCtx>
+class DisableFeaturesRequest : public Request<ImageCtxT> {
+public:
+ // Heap-allocates a request; all arguments are forwarded to the constructor.
+ static DisableFeaturesRequest *create(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid,
+ uint64_t features, bool force) {
+ return new DisableFeaturesRequest(image_ctx, on_finish, journal_op_tid,
+ features, force);
+ }
+
+ DisableFeaturesRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid, uint64_t features, bool force);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ bool can_affect_io() const override {
+ return true;
+ }
+ // Journal event describing this operation; the trailing 'false' is the
+ // counterpart of the 'true' passed by EnableFeaturesRequest.
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::UpdateFeaturesEvent(op_tid, m_features, false);
+ }
+
+private:
+ /**
+ * DisableFeatures goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_PREPARE_LOCK
+ * |
+ * v
+ * STATE_BLOCK_WRITES
+ * |
+ * v
+ * STATE_ACQUIRE_EXCLUSIVE_LOCK (skip if not
+ * | required)
+ * | (disabling journaling)
+ * \-------------------\
+ * | |
+ * | V
+ * | STATE_GET_MIRROR_MODE
+ * |(not |
+ * | disabling v
+ * | journaling) STATE_GET_MIRROR_IMAGE
+ * | |
+ * | v
+ * | STATE_DISABLE_MIRROR_IMAGE (skip if not
+ * | | required)
+ * | v
+ * | STATE_CLOSE_JOURNAL
+ * | |
+ * | v
+ * | STATE_REMOVE_JOURNAL
+ * | |
+ * |/-------------------/
+ * |
+ * v
+ * STATE_APPEND_OP_EVENT (skip if journaling
+ * | disabled)
+ * v
+ * STATE_REMOVE_OBJECT_MAP (skip if not
+ * | disabling object map)
+ * v
+ * STATE_SET_FEATURES
+ * |
+ * v
+ * STATE_UPDATE_FLAGS
+ * |
+ * v
+ * STATE_NOTIFY_UPDATE
+ * |
+ * v
+ * STATE_RELEASE_EXCLUSIVE_LOCK (skip if not
+ * | required)
+ * | (unblock writes)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+
+ // Constructor arguments (feature bits to disable and the force flag).
+ uint64_t m_features;
+ bool m_force;
+
+ // Progress flags consulted by later states and by handle_finish() to
+ // decide what has to be undone or released.
+ bool m_acquired_lock = false;
+ bool m_writes_blocked = false;
+ bool m_snap_lock_acquired = false;
+ bool m_requests_blocked = false;
+
+ // Values computed by the state machine before the on-disk update.
+ uint64_t m_new_features = 0;
+ uint64_t m_disable_flags = 0;
+ uint64_t m_features_mask = 0;
+
+ decltype(ImageCtxT::journal) m_journal = nullptr;
+ cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ // Output buffer for asynchronous reads issued by the state machine.
+ bufferlist m_out_bl;
+
+ void send_prepare_lock();
+ Context *handle_prepare_lock(int *result);
+
+ void send_block_writes();
+ Context *handle_block_writes(int *result);
+
+ void send_acquire_exclusive_lock();
+ Context *handle_acquire_exclusive_lock(int *result);
+
+ void send_get_mirror_mode();
+ Context *handle_get_mirror_mode(int *result);
+
+ void send_get_mirror_image();
+ Context *handle_get_mirror_image(int *result);
+
+ void send_disable_mirror_image();
+ Context *handle_disable_mirror_image(int *result);
+
+ void send_close_journal();
+ Context *handle_close_journal(int *result);
+
+ void send_remove_journal();
+ Context *handle_remove_journal(int *result);
+
+ void send_append_op_event();
+ Context *handle_append_op_event(int *result);
+
+ void send_remove_object_map();
+ Context *handle_remove_object_map(int *result);
+
+ void send_set_features();
+ Context *handle_set_features(int *result);
+
+ void send_update_flags();
+ Context *handle_update_flags(int *result);
+
+ void send_notify_update();
+ Context *handle_notify_update(int *result);
+
+ void send_release_exclusive_lock();
+ Context *handle_release_exclusive_lock(int *result);
+
+ // Common exit path shared by success and failure.
+ Context *handle_finish(int r);
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::DisableFeaturesRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H
diff --git a/src/librbd/operation/EnableFeaturesRequest.cc b/src/librbd/operation/EnableFeaturesRequest.cc
new file mode 100644
index 00000000..e2c1113d
--- /dev/null
+++ b/src/librbd/operation/EnableFeaturesRequest.cc
@@ -0,0 +1,489 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/EnableFeaturesRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/image/SetFlagsRequest.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "librbd/object_map/CreateRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::EnableFeaturesRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+EnableFeaturesRequest<I>::EnableFeaturesRequest(I &image_ctx,
+ Context *on_finish,
+ uint64_t journal_op_tid,
+ uint64_t features)
+ : Request<I>(image_ctx, on_finish, journal_op_tid), m_features(features) {
+}
+
+// Entry point of the state machine. Asserts that owner_lock is held, logs
+// the requested features and starts with the prepare-lock step.
+template <typename I>
+void EnableFeaturesRequest<I>::send_op() {
+  auto &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  ldout(ictx.cct, 20) << this << " " << __func__ << ": features=" << m_features
+                      << dendl;
+  send_prepare_lock();
+}
+
+// Always reports completion (returns true); a negative r is additionally
+// logged as an error.
+template <typename I>
+bool EnableFeaturesRequest<I>::should_complete(int r) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+// Calls ImageState::prepare_lock(); handle_prepare_lock() is wrapped in an
+// async context callback before being passed as the completion.
+template <typename I>
+void EnableFeaturesRequest<I>::send_prepare_lock() {
+  auto &ictx = this->m_image_ctx;
+  ldout(ictx.cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = EnableFeaturesRequest<I>;
+  ictx.state->prepare_lock(create_async_context_callback(
+    ictx, create_context_callback<klass, &klass::handle_prepare_lock>(this)));
+}
+
+// Completion of prepare_lock. On failure the request is finished right away
+// through create_context_finisher() (handle_finish() is not used here);
+// on success the block-writes step is started.
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_prepare_lock(int *result) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_block_writes();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl;
+  return this->create_context_finisher(*result);
+}
+
+// Requests that the IO work queue block writes; the owner write lock is
+// held for the duration of the call. Completion: handle_block_writes().
+template <typename I>
+void EnableFeaturesRequest<I>::send_block_writes() {
+  auto &ictx = this->m_image_ctx;
+  ldout(ictx.cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = EnableFeaturesRequest<I>;
+  RWLock::WLocker owner_locker(ictx.owner_lock);
+  ictx.io_work_queue->block_writes(
+    create_context_callback<klass, &klass::handle_block_writes>(this));
+}
+
+// Completion of block_writes. A failure goes to handle_finish(); on success
+// m_writes_blocked is recorded and the mirror-mode lookup is started.
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_block_writes(int *result) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool failed = (*result < 0);
+  if (failed) {
+    lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+    return handle_finish(*result);
+  }
+
+  m_writes_blocked = true;
+  send_get_mirror_mode();
+  return nullptr;
+}
+
+// Reads the mirror mode from the RBD_MIRRORING object. When journaling is
+// not among the requested features, no read is issued: the handler is
+// completed inline with -ENOENT instead.
+template <typename I>
+void EnableFeaturesRequest<I>::send_get_mirror_mode() {
+  using klass = EnableFeaturesRequest<I>;
+  auto &ictx = this->m_image_ctx;
+
+  const bool journaling_requested =
+    (m_features & RBD_FEATURE_JOURNALING) != 0;
+  if (!journaling_requested) {
+    Context *skip_ctx =
+      create_context_callback<klass, &klass::handle_get_mirror_mode>(this);
+    skip_ctx->complete(-ENOENT);
+    return;
+  }
+
+  ldout(ictx.cct, 20) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation read_op;
+  cls_client::mirror_mode_get_start(&read_op);
+
+  librados::AioCompletion *aio_comp =
+    create_rados_callback<klass, &klass::handle_get_mirror_mode>(this);
+  m_out_bl.clear();
+  int r = ictx.md_ctx.aio_operate(RBD_MIRRORING, aio_comp, &read_op,
+                                  &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of the mirror-mode lookup. Decodes the mode, then, under the
+// owner write lock, blocks peer lock requests and computes the feature
+// bits, feature mask and image flags to apply. Continues with journal
+// creation when journaling is being enabled, otherwise with the op-event
+// step; any failure is routed to handle_finish().
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_get_mirror_mode(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ // -ENOENT is not an error: mirror_mode keeps its DISABLED default.
+ cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::mirror_mode_get_finish(&it, &mirror_mode);
+ } else if (*result == -ENOENT) {
+ *result = 0;
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve pool mirror mode: "
+ << cpp_strerror(*result) << dendl;
+ return handle_finish(*result);
+ }
+
+ // mirroring is enabled later only when the mode read above is "pool"
+ m_enable_mirroring = (mirror_mode == cls::rbd::MIRROR_MODE_POOL);
+
+ bool create_journal = false;
+ // single-pass loop: 'break' leaves the validation early, and the locker
+ // declared inside the body is released when the body is exited
+ do {
+ RWLock::WLocker locker(image_ctx.owner_lock);
+
+ // avoid accepting new requests from peers while we manipulate
+ // the image features
+ if (image_ctx.exclusive_lock != nullptr &&
+ (image_ctx.journal == nullptr ||
+ !image_ctx.journal->is_journal_replaying())) {
+ image_ctx.exclusive_lock->block_requests(0);
+ m_requests_blocked = true;
+ }
+
+ // drop feature bits that are already set on the image
+ m_features &= ~image_ctx.features;
+
+ // interlock object-map and fast-diff together
+ if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) ||
+ ((m_features & RBD_FEATURE_FAST_DIFF) != 0)) {
+ m_features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF);
+ }
+
+ m_new_features = image_ctx.features | m_features;
+ m_features_mask = m_features;
+
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) != 0) {
+ if ((m_new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+ lderr(cct) << "cannot enable object-map. exclusive-lock must be "
+ "enabled before enabling object-map." << dendl;
+ *result = -EINVAL;
+ break;
+ }
+ // the newly enabled object map starts out flagged as invalid
+ m_enable_flags |= RBD_FLAG_OBJECT_MAP_INVALID;
+ m_features_mask |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_FAST_DIFF);
+ }
+ if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) {
+ m_enable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
+ m_features_mask |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_OBJECT_MAP);
+ }
+
+ if ((m_features & RBD_FEATURE_JOURNALING) != 0) {
+ if ((m_new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+ lderr(cct) << "cannot enable journaling. exclusive-lock must be "
+ "enabled before enabling journaling." << dendl;
+ *result = -EINVAL;
+ break;
+ }
+ m_features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
+ create_journal = true;
+ }
+ } while (false);
+
+ // validation failures set *result and broke out of the block above
+ if (*result < 0) {
+ return handle_finish(*result);
+ }
+ if (create_journal) {
+ send_create_journal();
+ return nullptr;
+ }
+ send_append_op_event();
+ return nullptr;
+}
+
+// Creates the image journal through journal::CreateRequest, using the
+// rbd_journal_order / rbd_journal_splay_width / rbd_journal_pool config
+// values of the image. Completion: handle_create_journal().
+template <typename I>
+void EnableFeaturesRequest<I>::send_create_journal() {
+  using klass = EnableFeaturesRequest<I>;
+  auto &ictx = this->m_image_ctx;
+
+  ldout(ictx.cct, 20) << this << " " << __func__ << dendl;
+
+  journal::TagData tag_data(librbd::Journal<>::LOCAL_MIRROR_UUID);
+  Context *on_created =
+    create_context_callback<klass, &klass::handle_create_journal>(this);
+
+  const auto journal_order =
+    ictx.config.template get_val<uint64_t>("rbd_journal_order");
+  const auto journal_splay_width =
+    ictx.config.template get_val<uint64_t>("rbd_journal_splay_width");
+  const auto journal_pool =
+    ictx.config.template get_val<std::string>("rbd_journal_pool");
+
+  auto *create_req = journal::CreateRequest<I>::create(
+    ictx.md_ctx, ictx.id, journal_order, journal_splay_width, journal_pool,
+    cls::journal::Tag::TAG_CLASS_NEW, tag_data,
+    librbd::Journal<>::IMAGE_CLIENT_ID, ictx.op_work_queue, on_created);
+
+  create_req->send();
+}
+
+// Completion of journal creation: failure goes to handle_finish(), success
+// moves on to the op-event step.
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_create_journal(int *result) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_append_op_event();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to create journal: " << cpp_strerror(*result)
+             << dendl;
+  return handle_finish(*result);
+}
+
+// Appends the operation's journal event through Request::append_op_event().
+// When that call returns false (no event was appended) the state machine
+// continues synchronously with send_update_flags(); otherwise it resumes in
+// handle_append_op_event().
+template <typename I>
+void EnableFeaturesRequest<I>::send_append_op_event() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  // Log before dispatching: the calls below may already advance the state
+  // machine, and logging afterwards put this entry after the next state's
+  // entries in the debug log.
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  if (!this->template append_op_event<
+        EnableFeaturesRequest<I>,
+        &EnableFeaturesRequest<I>::handle_append_op_event>(this)) {
+    send_update_flags();
+  }
+}
+
+// Completion of the journal append: failure goes to handle_finish(),
+// success continues with the image-flags update.
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_append_op_event(int *result) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_update_flags();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result)
+             << dendl;
+  return handle_finish(*result);
+}
+
+// Applies m_enable_flags to the image through image::SetFlagsRequest (the
+// same value is passed as flags and as mask). With no flags to set, the
+// step is skipped silently and send_set_features() runs instead.
+template <typename I>
+void EnableFeaturesRequest<I>::send_update_flags() {
+  const bool nothing_to_set = (m_enable_flags == 0);
+  if (nothing_to_set) {
+    send_set_features();
+    return;
+  }
+
+  auto &ictx = this->m_image_ctx;
+  ldout(ictx.cct, 20) << this << " " << __func__ << ": enable_flags="
+                      << m_enable_flags << dendl;
+
+  using klass = EnableFeaturesRequest<I>;
+  Context *on_updated =
+    create_context_callback<klass, &klass::handle_update_flags>(this);
+
+  auto *flags_req = image::SetFlagsRequest<I>::create(
+    &ictx, m_enable_flags, m_enable_flags, on_updated);
+  flags_req->send();
+}
+
+// Completion of the flags update: failure goes to handle_finish(), success
+// continues with the feature-bits update.
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_update_flags(int *result) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_set_features();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to update image flags: " << cpp_strerror(*result)
+             << dendl;
+  return handle_finish(*result);
+}
+
+// Issues the set_features class call on the image header object with
+// m_new_features / m_features_mask. Completion: handle_set_features().
+template <typename I>
+void EnableFeaturesRequest<I>::send_set_features() {
+  using klass = EnableFeaturesRequest<I>;
+  auto &ictx = this->m_image_ctx;
+
+  ldout(ictx.cct, 20) << this << " " << __func__ << ": new_features="
+                      << m_new_features << ", features_mask=" << m_features_mask
+                      << dendl;
+
+  librados::ObjectWriteOperation write_op;
+  librbd::cls_client::set_features(&write_op, m_new_features, m_features_mask);
+
+  librados::AioCompletion *aio_comp =
+    create_rados_callback<klass, &klass::handle_set_features>(this);
+  int r = ictx.md_ctx.aio_operate(ictx.header_oid, aio_comp, &write_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of set_features: failure goes to handle_finish(), success
+// continues with the object-map creation step.
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_set_features(int *result) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_create_object_map();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to update features: " << cpp_strerror(*result)
+             << dendl;
+  return handle_finish(*result);
+}
+
+// Creates the object map through object_map::CreateRequest. The step is
+// skipped (no log entry) when the image context already reports the
+// object-map feature or when object-map is not among the features being
+// enabled; the mirror-image step runs next in that case.
+template <typename I>
+void EnableFeaturesRequest<I>::send_create_object_map() {
+  auto &ictx = this->m_image_ctx;
+
+  const bool already_present =
+    (ictx.features & RBD_FEATURE_OBJECT_MAP) != 0;
+  const bool not_requested = (m_features & RBD_FEATURE_OBJECT_MAP) == 0;
+  if (already_present || not_requested) {
+    send_enable_mirror_image();
+    return;
+  }
+
+  ldout(ictx.cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = EnableFeaturesRequest<I>;
+  Context *on_created =
+    create_context_callback<klass, &klass::handle_create_object_map>(this);
+
+  auto *map_req = object_map::CreateRequest<I>::create(&ictx, on_created);
+  map_req->send();
+}
+
+// Completion of object-map creation: failure goes to handle_finish(),
+// success continues with the mirror-image step.
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_create_object_map(int *result) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_enable_mirror_image();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to create object map: " << cpp_strerror(*result)
+             << dendl;
+  return handle_finish(*result);
+}
+
+// Enables mirroring for the image through mirror::EnableRequest when
+// m_enable_mirroring is set; otherwise goes straight to the notify step.
+// The log entry is written in both cases.
+template <typename I>
+void EnableFeaturesRequest<I>::send_enable_mirror_image() {
+  auto &ictx = this->m_image_ctx;
+  ldout(ictx.cct, 20) << this << " " << __func__ << dendl;
+
+  if (m_enable_mirroring) {
+    using klass = EnableFeaturesRequest<I>;
+    Context *on_enabled =
+      create_context_callback<klass, &klass::handle_enable_mirror_image>(this);
+
+    auto *mirror_req = mirror::EnableRequest<I>::create(&ictx, on_enabled);
+    mirror_req->send();
+    return;
+  }
+
+  send_notify_update();
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_enable_mirror_image(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to enable mirroring: " << cpp_strerror(*result)
+ << dendl;
+ // not fatal
+ }
+
+ send_notify_update();
+ return nullptr;
+}
+
+// Calls notify_update() on the image context; completion is delivered to
+// handle_notify_update().
+template <typename I>
+void EnableFeaturesRequest<I>::send_notify_update() {
+  auto &ictx = this->m_image_ctx;
+  ldout(ictx.cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = EnableFeaturesRequest<I>;
+  Context *on_notified =
+    create_context_callback<klass, &klass::handle_notify_update>(this);
+
+  ictx.notify_update(on_notified);
+}
+
+// Completion of the notification: logs the result and forwards it,
+// unchanged, to handle_finish().
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_notify_update(int *result) {
+  auto *cct = this->m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  return handle_finish(r);
+}
+
+// Common exit path: under the owner write lock, undoes whatever was blocked
+// earlier (peer lock requests if m_requests_blocked, writes if
+// m_writes_blocked); then signals prepare-lock completion and returns the
+// finisher context carrying r.
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_finish(int r) {
+  auto &ictx = this->m_image_ctx;
+  ldout(ictx.cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  {
+    RWLock::WLocker owner_locker(ictx.owner_lock);
+
+    const bool unblock_peers =
+      (ictx.exclusive_lock != nullptr) && m_requests_blocked;
+    if (unblock_peers) {
+      ictx.exclusive_lock->unblock_requests();
+    }
+    if (m_writes_blocked) {
+      ictx.io_work_queue->unblock_writes();
+    }
+  }
+
+  ictx.state->handle_prepare_lock_complete();
+  return this->create_context_finisher(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::EnableFeaturesRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/EnableFeaturesRequest.h b/src/librbd/operation/EnableFeaturesRequest.h
new file mode 100644
index 00000000..1c91b4dc
--- /dev/null
+++ b/src/librbd/operation/EnableFeaturesRequest.h
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H
+
+#include "librbd/operation/Request.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+/**
+ * Operation request implementing the "enable features" state machine
+ * documented in the diagram below. Each state has a send_* / handle_* pair;
+ * the handlers receive the step's result through an int pointer and return
+ * a Context pointer.
+ */
+template <typename ImageCtxT = ImageCtx>
+class EnableFeaturesRequest : public Request<ImageCtxT> {
+public:
+ // Heap-allocates a request; all arguments are forwarded to the constructor.
+ static EnableFeaturesRequest *create(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid,
+ uint64_t features) {
+ return new EnableFeaturesRequest(image_ctx, on_finish, journal_op_tid,
+ features);
+ }
+
+ EnableFeaturesRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid, uint64_t features);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ bool can_affect_io() const override {
+ return true;
+ }
+ // Journal event describing this operation; the trailing 'true' is the
+ // counterpart of the 'false' passed by DisableFeaturesRequest.
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::UpdateFeaturesEvent(op_tid, m_features, true);
+ }
+
+private:
+ /**
+ * EnableFeatures goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_PREPARE_LOCK
+ * |
+ * v
+ * STATE_BLOCK_WRITES
+ * |
+ * v
+ * STATE_GET_MIRROR_MODE
+ * |
+ * v
+ * STATE_CREATE_JOURNAL (skip if not
+ * | required)
+ * v
+ * STATE_APPEND_OP_EVENT (skip if journaling
+ * | disabled)
+ * v
+ * STATE_UPDATE_FLAGS
+ * |
+ * v
+ * STATE_SET_FEATURES
+ * |
+ * v
+ * STATE_CREATE_OBJECT_MAP (skip if not
+ * | required)
+ * v
+ * STATE_ENABLE_MIRROR_IMAGE
+ * |
+ * V
+ * STATE_NOTIFY_UPDATE
+ * |
+ * | (unblock writes)
+ * v
+ * <finish>
+ * @endverbatim
+ *
+ */
+
+ // Requested feature bits; narrowed by the state machine to the bits that
+ // are not yet set on the image.
+ uint64_t m_features;
+
+ // Set when the pool mirror mode read by the state machine is "pool".
+ bool m_enable_mirroring = false;
+ // Record what handle_finish() has to unblock again.
+ bool m_requests_blocked = false;
+ bool m_writes_blocked = false;
+
+ // Feature bits, image flags and feature mask computed before the update.
+ uint64_t m_new_features = 0;
+ uint64_t m_enable_flags = 0;
+ uint64_t m_features_mask = 0;
+
+ // Output buffer of the asynchronous mirror-mode read.
+ bufferlist m_out_bl;
+
+ void send_prepare_lock();
+ Context *handle_prepare_lock(int *result);
+
+ void send_block_writes();
+ Context *handle_block_writes(int *result);
+
+ void send_get_mirror_mode();
+ Context *handle_get_mirror_mode(int *result);
+
+ void send_create_journal();
+ Context *handle_create_journal(int *result);
+
+ void send_append_op_event();
+ Context *handle_append_op_event(int *result);
+
+ void send_update_flags();
+ Context *handle_update_flags(int *result);
+
+ void send_set_features();
+ Context *handle_set_features(int *result);
+
+ void send_create_object_map();
+ Context *handle_create_object_map(int *result);
+
+ void send_enable_mirror_image();
+ Context *handle_enable_mirror_image(int *result);
+
+ void send_notify_update();
+ Context *handle_notify_update(int *result);
+
+ // Common exit path shared by success and failure.
+ Context *handle_finish(int r);
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::EnableFeaturesRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H
diff --git a/src/librbd/operation/FlattenRequest.cc b/src/librbd/operation/FlattenRequest.cc
new file mode 100644
index 00000000..af21bd4e
--- /dev/null
+++ b/src/librbd/operation/FlattenRequest.cc
@@ -0,0 +1,230 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/FlattenRequest.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/image/DetachChildRequest.h"
+#include "librbd/image/DetachParentRequest.h"
+#include "librbd/Types.h"
+#include "librbd/io/ObjectRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::FlattenRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Per-object unit of work for a flatten, driven by an AsyncObjectThrottle.
+// send() returns:
+//   -ERESTART  the exclusive lock is no longer owned
+//   1          nothing was issued for this object (it already exists
+//              according to the object map, or there is no parent anymore)
+//   0          a write request was issued with this context as completion
+template <typename I>
+class C_FlattenObject : public C_AsyncObjectThrottle<I> {
+public:
+  C_FlattenObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                  ::SnapContext snapc, uint64_t object_no)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx),
+      m_snapc(std::move(snapc)),  // snapc is our own copy; avoid a second one
+      m_object_no(object_no) {
+  }
+
+  int send() override {
+    I &image_ctx = this->m_image_ctx;
+    ceph_assert(image_ctx.owner_lock.is_locked());
+    CephContext *cct = image_ctx.cct;
+
+    if (image_ctx.exclusive_lock != nullptr &&
+        !image_ctx.exclusive_lock->is_lock_owner()) {
+      ldout(cct, 1) << "lost exclusive lock during flatten" << dendl;
+      return -ERESTART;
+    }
+
+    {
+      RWLock::RLocker snap_lock(image_ctx.snap_lock);
+      if (image_ctx.object_map != nullptr &&
+          !image_ctx.object_map->object_may_not_exist(m_object_no)) {
+        // can skip because the object already exists
+        return 1;
+      }
+    }
+
+    // zero-length write at offset 0 with an empty buffer
+    bufferlist bl;
+    std::string oid = image_ctx.get_object_name(m_object_no);
+    auto req = new io::ObjectWriteRequest<I>(&image_ctx, oid, m_object_no, 0,
+                                             std::move(bl), m_snapc, 0, {},
+                                             this);
+    if (!req->has_parent()) {
+      // stop early if the parent went away - it just means
+      // another flatten finished first or the image was resized
+      delete req;
+      return 1;
+    }
+
+    req->send();
+    return 0;
+  }
+
+private:
+  ::SnapContext m_snapc;
+  uint64_t m_object_no;
+};
+
+// Always reports completion (returns true); a negative r is additionally
+// logged as an error.
+template <typename I>
+bool FlattenRequest<I>::should_complete(int r) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+// Entry point of the operation: the first step is flattening the objects.
+template <typename I>
+void FlattenRequest<I>::send_op() {
+  this->flatten_objects();
+}
+
+// Starts an AsyncObjectThrottle that creates one C_FlattenObject per object
+// number in [0, m_overlap_objects), running up to
+// rbd_concurrent_management_ops of them at a time. The throttle completes
+// into handle_flatten_objects(). Requires owner_lock to be held.
+template <typename I>
+void FlattenRequest<I>::flatten_objects() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  auto ctx = create_context_callback<
+    FlattenRequest<I>,
+    &FlattenRequest<I>::handle_flatten_objects>(this);
+  // factory arguments: _1 = throttle, _2 = object number
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_FlattenObject<I> >(),
+      boost::lambda::_1, &image_ctx, m_snapc, boost::lambda::_2));
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, m_overlap_objects);
+  throttle->start_ops(
+    image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+// Completion of the object throttle. Any negative result completes the
+// request with that result (-ERESTART is logged as an interruption, other
+// errors as failures); success continues with detach_child().
+template <typename I>
+void FlattenRequest<I>::handle_flatten_objects(int r) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -ERESTART) {
+      ldout(cct, 5) << "flatten operation interrupted" << dendl;
+    } else {
+      lderr(cct) << "flatten encountered an error: " << cpp_strerror(r) << dendl;
+    }
+    this->complete(r);
+    return;
+  }
+
+  detach_child();
+}
+
+// Issues an image::DetachChildRequest (completion: handle_detach_child()),
+// unless the image still has snapshots and the deep-flatten feature is not
+// set, in which case this step is skipped and detach_parent() runs directly.
+// owner_lock and snap_lock are taken and released manually; both are
+// released before detach_parent() is called, which takes owner_lock itself.
+template <typename I>
+void FlattenRequest<I>::detach_child() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ // should have been canceled prior to releasing lock
+ image_ctx.owner_lock.get_read();
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ // if there are no snaps, remove from the children object as well
+ // (if snapshots remain, they have their own parent info, and the child
+ // will be removed when the last snap goes away)
+ image_ctx.snap_lock.get_read();
+ if ((image_ctx.features & RBD_FEATURE_DEEP_FLATTEN) == 0 &&
+ !image_ctx.snaps.empty()) {
+ image_ctx.snap_lock.put_read();
+ image_ctx.owner_lock.put_read();
+ detach_parent();
+ return;
+ }
+ image_ctx.snap_lock.put_read();
+
+ ldout(cct, 5) << dendl;
+ auto ctx = create_context_callback<
+ FlattenRequest<I>,
+ &FlattenRequest<I>::handle_detach_child>(this);
+ auto req = image::DetachChildRequest<I>::create(image_ctx, ctx);
+ // the request is sent while owner_lock is still read-held
+ req->send();
+ image_ctx.owner_lock.put_read();
+}
+
+// Completion of the child detach. -ENOENT is tolerated; any other negative
+// result completes the request with that error. Otherwise continues with
+// detach_parent().
+template <typename I>
+void FlattenRequest<I>::handle_detach_child(int r) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool fatal = (r < 0) && (r != -ENOENT);
+  if (!fatal) {
+    detach_parent();
+    return;
+  }
+
+  lderr(cct) << "detach encountered an error: " << cpp_strerror(r) << dendl;
+  this->complete(r);
+}
+
+// Issues an image::DetachParentRequest (completion: handle_detach_parent()).
+// If the image context no longer has a parent, the request is completed
+// with 0 instead. owner_lock and parent_lock are taken and released
+// manually; both are released before complete(0) is called.
+template <typename I>
+void FlattenRequest<I>::detach_parent() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ // should have been canceled prior to releasing lock
+ image_ctx.owner_lock.get_read();
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ // stop early if the parent went away - it just means
+ // another flatten finished first, so this one is useless.
+ image_ctx.parent_lock.get_read();
+ if (!image_ctx.parent) {
+ ldout(cct, 5) << "image already flattened" << dendl;
+ image_ctx.parent_lock.put_read();
+ image_ctx.owner_lock.put_read();
+ this->complete(0);
+ return;
+ }
+ image_ctx.parent_lock.put_read();
+
+ // remove parent from this (base) image
+ auto ctx = create_context_callback<
+ FlattenRequest<I>,
+ &FlattenRequest<I>::handle_detach_parent>(this);
+ auto req = image::DetachParentRequest<I>::create(image_ctx, ctx);
+ // the request is sent while owner_lock is still read-held
+ req->send();
+ image_ctx.owner_lock.put_read();
+}
+
+// Completion of the parent detach: the request is completed with r in every
+// case; a negative r is logged first.
+template <typename I>
+void FlattenRequest<I>::handle_detach_parent(int r) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "remove parent encountered an error: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  this->complete(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::FlattenRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/FlattenRequest.h b/src/librbd/operation/FlattenRequest.h
new file mode 100644
index 00000000..78730963
--- /dev/null
+++ b/src/librbd/operation/FlattenRequest.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "common/snap_types.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+/**
+ * Operation request that flattens an image in three steps (see the diagram
+ * below): flatten the objects, detach the child, detach the parent.
+ */
+template <typename ImageCtxT = ImageCtx>
+class FlattenRequest : public Request<ImageCtxT>
+{
+public:
+ // prog_ctx is stored by reference, so it has to outlive the request.
+ FlattenRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t overlap_objects, const ::SnapContext &snapc,
+ ProgressContext &prog_ctx)
+ : Request<ImageCtxT>(image_ctx, on_finish),
+ m_overlap_objects(overlap_objects),
+ m_snapc(snapc), m_prog_ctx(prog_ctx) {
+ }
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ // Journal event describing this operation.
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::FlattenEvent(op_tid);
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * FLATTEN_OBJECTS
+ * |
+ * v
+ * DETACH_CHILD
+ * |
+ * v
+ * DETACH_PARENT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ // Passed to the object throttle as the end of the object range to process.
+ uint64_t m_overlap_objects;
+ // Snapshot context copied at construction and handed to each object step.
+ ::SnapContext m_snapc;
+ ProgressContext &m_prog_ctx;
+
+ void flatten_objects();
+ void handle_flatten_objects(int r);
+
+ void detach_child();
+ void handle_detach_child(int r);
+
+ void detach_parent();
+ void handle_detach_parent(int r);
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::FlattenRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
diff --git a/src/librbd/operation/MetadataRemoveRequest.cc b/src/librbd/operation/MetadataRemoveRequest.cc
new file mode 100644
index 00000000..828e7a5b
--- /dev/null
+++ b/src/librbd/operation/MetadataRemoveRequest.cc
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/MetadataRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MetadataRemoveRequest: "
+
+namespace librbd {
+namespace operation {
+
+// Keeps a copy of the metadata key; the remaining arguments are handed to
+// the Request<I> base class.
+template <typename I>
+MetadataRemoveRequest<I>::MetadataRemoveRequest(I &image_ctx,
+                                                Context *on_finish,
+                                                const std::string &key)
+  : Request<I>(image_ctx, on_finish),
+    m_key(key) {
+}
+
+// Entry point of the operation: its only step is the metadata removal.
+template <typename I>
+void MetadataRemoveRequest<I>::send_op() {
+  this->send_metadata_remove();
+}
+
+// Always reports completion (returns true); a negative r is additionally
+// logged as an error.
+template <typename I>
+bool MetadataRemoveRequest<I>::should_complete(int r) {
+  auto *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+// Issues the metadata_remove class call for m_key on the image header
+// object, using the request's callback completion. Requires owner_lock to
+// be held.
+template <typename I>
+void MetadataRemoveRequest<I>::send_metadata_remove() {
+  auto &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  ldout(ictx.cct, 20) << this << " " << __func__ << dendl;
+
+  librados::ObjectWriteOperation write_op;
+  cls_client::metadata_remove(&write_op, m_key);
+
+  librados::AioCompletion *aio_comp = this->create_callback_completion();
+  int r = ictx.md_ctx.aio_operate(ictx.header_oid, aio_comp, &write_op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::MetadataRemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/MetadataRemoveRequest.h b/src/librbd/operation/MetadataRemoveRequest.h
new file mode 100644
index 00000000..1d7f2a46
--- /dev/null
+++ b/src/librbd/operation/MetadataRemoveRequest.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class MetadataRemoveRequest : public Request<ImageCtxT> { // operation request that removes one image metadata key
+public:
+ MetadataRemoveRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const std::string &key); // key: metadata key to remove (copied into m_key)
+
+protected:
+ void send_op() override; // starts the removal
+ bool should_complete(int r) override; // logs r (errors included) and always returns true
+
+ journal::Event create_event(uint64_t op_tid) const override { // journal event carrying op_tid and the key
+ return journal::MetadataRemoveEvent(op_tid, m_key);
+ }
+
+private:
+ std::string m_key; // metadata key targeted by this request
+
+ void send_metadata_remove(); // queues the cls metadata_remove write on the image header object
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::MetadataRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
diff --git a/src/librbd/operation/MetadataSetRequest.cc b/src/librbd/operation/MetadataSetRequest.cc
new file mode 100644
index 00000000..760e9b1e
--- /dev/null
+++ b/src/librbd/operation/MetadataSetRequest.cc
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/MetadataSetRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MetadataSetRequest: "
+
+namespace librbd {
+namespace operation {
+
+// Captures the key/value pair that will be written to the image metadata.
+template <typename I>
+MetadataSetRequest<I>::MetadataSetRequest(
+    I &image_ctx, Context *on_finish,
+    const std::string &key, const std::string &value)
+  : Request<I>(image_ctx, on_finish), m_key(key), m_value(value) {
+}
+
+template <typename I>
+void MetadataSetRequest<I>::send_op() {
+  this->send_metadata_set();  // single-step operation: go straight to the update
+}
+
+// Completion hook: logs the result (errors at error level); always reports done.
+template <typename I>
+bool MetadataSetRequest<I>::should_complete(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+// Appends m_value under m_key in m_data and queues an async cls metadata_set.
+template <typename I>
+void MetadataSetRequest<I>::send_metadata_set() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+  ldout(image_ctx.cct, 20) << this << " " << __func__ << dendl;
+
+  auto &value_bl = m_data[m_key];
+  value_bl.append(m_value);
+  librados::ObjectWriteOperation set_op;
+  cls_client::metadata_set(&set_op, m_data);
+
+  librados::AioCompletion *completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, completion, &set_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::MetadataSetRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/MetadataSetRequest.h b/src/librbd/operation/MetadataSetRequest.h
new file mode 100644
index 00000000..5f8daa2f
--- /dev/null
+++ b/src/librbd/operation/MetadataSetRequest.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "include/buffer.h"
+#include <string>
+#include <map>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class MetadataSetRequest : public Request<ImageCtxT> { // operation request that sets one image metadata key/value pair
+public:
+ MetadataSetRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const std::string &key, const std::string &value); // key/value are copied into m_key/m_value
+
+protected:
+ void send_op() override; // starts the metadata update
+ bool should_complete(int r) override; // logs r (errors included) and always returns true
+
+ journal::Event create_event(uint64_t op_tid) const override { // journal event carrying op_tid, key and value
+ return journal::MetadataSetEvent(op_tid, m_key, m_value);
+ }
+
+private:
+ std::string m_key; // metadata key to set
+ std::string m_value; // value to store under m_key
+ std::map<std::string, bufferlist> m_data; // send_metadata_set() appends m_value under m_key here, then passes the map to cls
+
+ void send_metadata_set(); // queues the cls metadata_set write on the image header object
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::MetadataSetRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H
diff --git a/src/librbd/operation/MigrateRequest.cc b/src/librbd/operation/MigrateRequest.cc
new file mode 100644
index 00000000..e3ddff0e
--- /dev/null
+++ b/src/librbd/operation/MigrateRequest.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/MigrateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/ObjectCopyRequest.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectRequest.h"
+#include "osdc/Striper.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MigrateRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+using util::create_async_context_callback;
+
+namespace {
+
+template <typename I>  // I: image context type
+class C_MigrateObject : public C_AsyncObjectThrottle<I> {  // throttled context that migrates one object for MigrateRequest
+public:
+  C_MigrateObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                  ::SnapContext snapc, uint64_t object_no)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snapc(snapc),
+      m_object_no(object_no) {
+  }
+  // Starts migrating this object; returns -ERESTART if the exclusive lock was lost, else 0.
+  int send() override {
+    I &image_ctx = this->m_image_ctx;
+    ceph_assert(image_ctx.owner_lock.is_locked());
+    CephContext *cct = image_ctx.cct;
+
+    if (image_ctx.exclusive_lock != nullptr &&
+        !image_ctx.exclusive_lock->is_lock_owner()) {
+      ldout(cct, 1) << "lost exclusive lock during migrate" << dendl;
+      return -ERESTART;
+    }
+
+    start_async_op();
+    return 0;
+  }
+
+private:
+  // per-object inputs, set once by the constructor
+  ::SnapContext m_snapc;
+  uint64_t m_object_no;
+
+  io::AsyncOperation *m_async_op = nullptr;  // allocated in start_async_op(); freed when the attempt is deferred or finishes
+  // Registers an async op and migrates right away, or defers until writes are unblocked.
+  void start_async_op() {
+    I &image_ctx = this->m_image_ctx;
+    ceph_assert(image_ctx.owner_lock.is_locked());
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 10) << dendl;
+
+    ceph_assert(m_async_op == nullptr);
+    m_async_op = new io::AsyncOperation();
+    m_async_op->start_op(image_ctx);
+
+    if (!image_ctx.io_work_queue->writes_blocked()) {
+      migrate_object();
+      return;
+    }
+
+    // writes are blocked: drop the async op and retry via handle_start_async_op()
+    auto ctx = create_async_context_callback(
+      image_ctx, create_context_callback<
+        C_MigrateObject<I>, &C_MigrateObject<I>::handle_start_async_op>(this));
+    m_async_op->finish_op();
+    delete m_async_op;
+    m_async_op = nullptr;
+    image_ctx.io_work_queue->wait_on_writes_unblocked(ctx);
+  }
+  // Callback once writes are unblocked: completes with r on error, else retries start_async_op().
+  void handle_start_async_op(int r) {
+    I &image_ctx = this->m_image_ctx;
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 10) << "r=" << r << dendl;
+
+    if (r < 0) {
+      lderr(cct) << "failed to start async op: " << cpp_strerror(r) << dendl;
+      this->complete(r);
+      return;
+    }
+
+    RWLock::RLocker owner_locker(image_ctx.owner_lock);
+    start_async_op();
+  }
+  // True when m_object_no lies within the objects covered by min(image size, migration overlap).
+  bool is_within_overlap_bounds() {
+    I &image_ctx = this->m_image_ctx;
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+
+    auto overlap = std::min(image_ctx.size, image_ctx.migration_info.overlap);
+    return overlap > 0 &&
+      Striper::get_num_objects(image_ctx.layout, overlap) > m_object_no;
+  }
+  // Sends either an empty object write ("copyup") or a deep-copy from image_ctx.parent.
+  void migrate_object() {
+    I &image_ctx = this->m_image_ctx;
+    ceph_assert(image_ctx.owner_lock.is_locked());
+    CephContext *cct = image_ctx.cct;
+
+    auto ctx = create_context_callback<
+      C_MigrateObject<I>, &C_MigrateObject<I>::handle_migrate_object>(this);
+
+    if (is_within_overlap_bounds()) {
+      bufferlist bl;  // intentionally empty: zero-length write at offset 0
+      std::string oid = image_ctx.get_object_name(m_object_no);
+      auto req = new io::ObjectWriteRequest<I>(&image_ctx, oid, m_object_no, 0,
+                                               std::move(bl), m_snapc, 0, {},
+                                               ctx);
+
+      ldout(cct, 20) << "copyup object req " << req << ", object_no "
+                     << m_object_no << dendl;
+
+      req->send();
+    } else {
+      ceph_assert(image_ctx.parent != nullptr);
+
+      auto req = deep_copy::ObjectCopyRequest<I>::create(
+        image_ctx.parent, &image_ctx, 0, 0, image_ctx.migration_info.snap_map,
+        m_object_no, image_ctx.migration_info.flatten, ctx);
+
+      ldout(cct, 20) << "deep copy object req " << req << ", object_no "
+                     << m_object_no << dendl;
+      req->send();
+    }
+  }
+  // Completion of the per-object request: -ENOENT counts as success; ends the async op.
+  void handle_migrate_object(int r) {
+    CephContext *cct = this->m_image_ctx.cct;
+    ldout(cct, 10) << "r=" << r << dendl;
+
+    if (r == -ENOENT) {
+      r = 0;
+    }
+
+    m_async_op->finish_op();
+    delete m_async_op;
+    this->complete(r);
+  }
+};
+
+} // anonymous namespace
+
+// Request entry point: asserts owner_lock is held, then starts the object migration.
+template <typename I>
+void MigrateRequest<I>::send_op() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+  ldout(image_ctx.cct, 10) << dendl;
+
+  this->migrate_objects();
+}
+
+// Completion hook: logs the result (errors at error level); always reports done.
+template <typename I>
+bool MigrateRequest<I>::should_complete(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+// Runs a C_MigrateObject for each object in [0, overlap objects) through an
+// AsyncObjectThrottle; handle_migrate_objects() is its completion callback.
+template <typename I>
+void MigrateRequest<I>::migrate_objects() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+
+  const uint64_t end_object_no = get_num_overlap_objects();
+  ldout(image_ctx.cct, 10) << "from 0 to " << end_object_no << dendl;
+
+  using Throttle = AsyncObjectThrottle<I>;
+  typename Throttle::ContextFactory object_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_MigrateObject<I> >(),
+      boost::lambda::_1, &image_ctx, image_ctx.snapc, boost::lambda::_2));
+  auto on_finish = create_context_callback<
+    MigrateRequest<I>, &MigrateRequest<I>::handle_migrate_objects>(this);
+  Throttle *throttle = new Throttle(
+    this, image_ctx, object_factory, on_finish, &m_prog_ctx, 0, end_object_no);
+  throttle->start_ops(
+    image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+// Throttle completion callback: logs the outcome and completes the request with r.
+template <typename I>
+void MigrateRequest<I>::handle_migrate_objects(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool failed = (r < 0);
+  if (failed) {
+    lderr(cct) << "failed to migrate objects: " << cpp_strerror(r) << dendl;
+  }
+  this->complete(r);
+}
+
+// Number of objects covered by the migration overlap (0 when the overlap is 0).
+template <typename I>
+uint64_t MigrateRequest<I>::get_num_overlap_objects() {
+  I &image_ctx = this->m_image_ctx;
+  ldout(image_ctx.cct, 10) << dendl;
+
+  RWLock::RLocker snap_locker(image_ctx.snap_lock);
+  RWLock::RLocker parent_locker(image_ctx.parent_lock);
+  const auto overlap = image_ctx.migration_info.overlap;
+  if (overlap > 0) {
+    return Striper::get_num_objects(image_ctx.layout, overlap);
+  }
+  return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::MigrateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/MigrateRequest.h b/src/librbd/operation/MigrateRequest.h
new file mode 100644
index 00000000..99f3b012
--- /dev/null
+++ b/src/librbd/operation/MigrateRequest.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "common/snap_types.h"
+#include "librbd/Types.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class MigrateRequest : public Request<ImageCtxT> // operation request that migrates the image's objects (state diagram below)
+{
+public:
+ MigrateRequest(ImageCtxT &image_ctx, Context *on_finish,
+ ProgressContext &prog_ctx) // prog_ctx is stored by reference and handed to the object throttle
+ : Request<ImageCtxT>(image_ctx, on_finish), m_prog_ctx(prog_ctx) {
+ }
+
+protected:
+ void send_op() override; // asserts owner_lock is held, then calls migrate_objects()
+ bool should_complete(int r) override; // logs r (errors included) and always returns true
+ bool can_affect_io() const override { // always reports true
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override {
+ ceph_abort(); // aborts if ever called; NOTE(review): presumably this op is never journaled -- confirm
+ return journal::UnknownEvent(); // presumably unreachable after ceph_abort(); satisfies the return type
+ }
+
+private:
+ /**
+ * Migrate goes through the following state machine to copy objects
+ * from the parent (migrating source) image:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * MIGRATE_OBJECTS
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+
+ ProgressContext &m_prog_ctx; // reference to the caller-supplied progress context
+
+ void migrate_objects(); // starts the throttled per-object migration
+ void handle_migrate_objects(int r); // throttle completion: logs and completes the request with r
+
+ uint64_t get_num_overlap_objects(); // object count for migration_info.overlap; takes snap_lock and parent_lock for read
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::MigrateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H
diff --git a/src/librbd/operation/ObjectMapIterate.cc b/src/librbd/operation/ObjectMapIterate.cc
new file mode 100644
index 00000000..66958416
--- /dev/null
+++ b/src/librbd/operation/ObjectMapIterate.cc
@@ -0,0 +1,310 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/ObjectMapIterate.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "librbd/Utils.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ObjectMapIterateRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>  // I: image context type
+class C_VerifyObjectCallback : public C_AsyncObjectThrottle<I> {  // checks one object's map state against its snap set
+public:
+  C_VerifyObjectCallback(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                         uint64_t snap_id, uint64_t object_no,
+                         ObjectIterateWork<I> handle_mismatch,
+                         std::atomic_flag *invalidate)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx),
+      m_snap_id(snap_id), m_object_no(object_no),
+      m_oid(image_ctx->get_object_name(m_object_no)),
+      m_handle_mismatch(handle_mismatch),
+      m_invalidate(invalidate)
+  {
+    m_io_ctx.dup(image_ctx->data_ctx);  // private IoCtx copy so the snap read context below stays local
+    m_io_ctx.snap_set_read(CEPH_SNAPDIR);
+  }
+  // Completion of the list_snaps op: evaluates the result, then finishes and deletes this context.
+  void complete(int r) override {
+    I &image_ctx = this->m_image_ctx;
+    if (should_complete(r)) {
+      ldout(image_ctx.cct, 20) << m_oid << " C_VerifyObjectCallback completed "
+                               << dendl;
+      m_io_ctx.close();
+
+      this->finish(r);
+      delete this;
+    }
+  }
+  // Throttle entry point: issues the async list_snaps; always returns 0.
+  int send() override {
+    send_list_snaps();
+    return 0;
+  }
+
+private:
+  librados::IoCtx m_io_ctx;
+  uint64_t m_snap_id;
+  uint64_t m_object_no;
+  std::string m_oid;
+  ObjectIterateWork<I> m_handle_mismatch;  // called when map state and computed state differ
+  std::atomic_flag *m_invalidate;          // set when the handler returns non-zero
+
+  librados::snap_set_t m_snap_set;
+  int m_snap_list_ret = 0;
+  // Returns true when the context may finish: on a hard error, or after the map check ran.
+  bool should_complete(int r) {
+    I &image_ctx = this->m_image_ctx;
+    CephContext *cct = image_ctx.cct;
+    if (r == 0) {
+      r = m_snap_list_ret;  // the op succeeded; use the list_snaps result code
+    }
+    if (r < 0 && r != -ENOENT) {
+      lderr(cct) << m_oid << " C_VerifyObjectCallback::should_complete: "
+                 << "encountered an error: " << cpp_strerror(r) << dendl;
+      return true;
+    }
+
+    ldout(cct, 20) << m_oid << " C_VerifyObjectCallback::should_complete: "
+                   << " r="
+                   << r << dendl;
+    return object_map_action(get_object_state());
+  }
+  // Queues an async list_snaps read on m_oid; results land in m_snap_set / m_snap_list_ret.
+  void send_list_snaps() {
+    I &image_ctx = this->m_image_ctx;
+    ceph_assert(image_ctx.owner_lock.is_locked());
+    ldout(image_ctx.cct, 5) << m_oid
+                            << " C_VerifyObjectCallback::send_list_snaps"
+                            << dendl;
+
+    librados::ObjectReadOperation op;
+    op.list_snaps(&m_snap_set, &m_snap_list_ret);
+
+    librados::AioCompletion *comp = util::create_rados_callback(this);
+    int r = m_io_ctx.aio_operate(m_oid, comp, &op, nullptr);
+    ceph_assert(r == 0);
+    comp->release();
+  }
+  // Derives the expected object map state for m_snap_id from the clones in m_snap_set.
+  uint8_t get_object_state() {
+    I &image_ctx = this->m_image_ctx;
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    for (std::vector<librados::clone_info_t>::const_iterator r =
+           m_snap_set.clones.begin(); r != m_snap_set.clones.end(); ++r) {
+      librados::snap_t from_snap_id;
+      librados::snap_t to_snap_id;
+      if (r->cloneid == librados::SNAP_HEAD) {
+        from_snap_id = next_valid_snap_id(m_snap_set.seq + 1);
+        to_snap_id = librados::SNAP_HEAD;
+      } else {
+        from_snap_id = next_valid_snap_id(r->snaps[0]);  // NOTE(review): assumes r->snaps is non-empty -- confirm
+        to_snap_id = r->snaps[r->snaps.size()-1];
+      }
+
+      if (to_snap_id < m_snap_id) {
+        continue;  // this clone ends before the snapshot being verified
+      } else if (m_snap_id < from_snap_id) {
+        break;     // this clone starts after the snapshot being verified
+      }
+
+      if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 &&
+          from_snap_id != m_snap_id) {
+        return OBJECT_EXISTS_CLEAN;
+      }
+      return OBJECT_EXISTS;
+    }
+    return OBJECT_NONEXISTENT;
+  }
+  // First snap id >= snap_id present in image_ctx.snap_info, or CEPH_NOSNAP if there is none.
+  uint64_t next_valid_snap_id(uint64_t snap_id) {
+    I &image_ctx = this->m_image_ctx;
+    ceph_assert(image_ctx.snap_lock.is_locked());
+
+    std::map<librados::snap_t, SnapInfo>::iterator it =
+      image_ctx.snap_info.lower_bound(snap_id);
+    if (it == image_ctx.snap_info.end()) {
+      return CEPH_NOSNAP;
+    }
+    return it->first;
+  }
+  // Compares the map entry with new_state and invokes the mismatch handler; always returns true.
+  bool object_map_action(uint8_t new_state) {
+    I &image_ctx = this->m_image_ctx;
+    CephContext *cct = image_ctx.cct;
+    RWLock::RLocker owner_locker(image_ctx.owner_lock);
+
+    // should have been canceled prior to releasing lock
+    ceph_assert(image_ctx.exclusive_lock == nullptr ||
+                image_ctx.exclusive_lock->is_lock_owner());
+
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    ceph_assert(image_ctx.object_map != nullptr);
+
+    RWLock::WLocker l(image_ctx.object_map_lock);
+    uint8_t state = (*image_ctx.object_map)[m_object_no];
+
+    ldout(cct, 10) << "C_VerifyObjectCallback::object_map_action"
+                   << " object " << image_ctx.get_object_name(m_object_no)
+                   << " state " << (int)state
+                   << " new_state " << (int)new_state << dendl;
+
+    if (state != new_state) {
+      int r = 0;
+
+      ceph_assert(m_handle_mismatch);
+      r = m_handle_mismatch(image_ctx, m_object_no, state, new_state);
+      if (r) {
+        lderr(cct) << "object map error: object "
+                   << image_ctx.get_object_name(m_object_no)
+                   << " marked as " << (int)state << ", but should be "
+                   << (int)new_state << dendl;
+        m_invalidate->test_and_set();  // request invalidation of the object map
+      } else {
+        ldout(cct, 1) << "object map inconsistent: object "
+                      << image_ctx.get_object_name(m_object_no)
+                      << " marked as " << (int)state << ", but should be "
+                      << (int)new_state << dendl;
+      }
+    }
+
+    return true;
+  }
+};
+
+} // anonymous namespace
+
+// Completes asynchronously with -ENODEV if data_ctx is invalid, else starts verifying.
+template <typename I>
+void ObjectMapIterateRequest<I>::send() {
+  if (m_image_ctx.data_ctx.is_valid()) {
+    send_verify_objects();
+    return;
+  }
+  this->async_complete(-ENODEV);
+}
+
+template <typename I>
+bool ObjectMapIterateRequest<I>::should_complete(int r) { // state-machine step; returns true when the request is finished
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
+
+ if (r == -ENODEV) { // produced by send() when data_ctx is invalid
+ lderr(cct) << "missing data pool" << dendl;
+ return true;
+ }
+
+ if (r < 0) {
+ lderr(cct) << "object map operation encountered an error: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+ switch (m_state) {
+ case STATE_VERIFY_OBJECTS:
+ if (m_invalidate.test_and_set()) { // returns the previous value: true means a verifier flagged a mismatch
+ send_invalidate_object_map();
+ return false; // wait for the invalidate step to call back
+ } else if (r == 0) {
+ return true;
+ }
+ break;
+
+ case STATE_INVALIDATE_OBJECT_MAP:
+ if (r == 0) {
+ return true;
+ }
+ break;
+
+ default:
+ ceph_abort();
+ break;
+ }
+
+ if (r < 0) { // error already logged above; finish with it
+ return true;
+ }
+
+ return false; // only reachable from a known state with r > 0
+}
+
+// Reads snap id and object count under snap_lock, then verifies every object via a throttle.
+template <typename I>
+void ObjectMapIterateRequest<I>::send_verify_objects() {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+  uint64_t snap_id = 0;
+  uint64_t object_count = 0;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    snap_id = m_image_ctx.snap_id;
+    const uint64_t image_size = m_image_ctx.get_image_size(snap_id);
+    object_count = Striper::get_num_objects(m_image_ctx.layout, image_size);
+  }
+  ldout(m_image_ctx.cct, 5) << this << " send_verify_objects" << dendl;
+  m_state = STATE_VERIFY_OBJECTS;
+
+  using Throttle = AsyncObjectThrottle<I>;
+  typename Throttle::ContextFactory verify_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_VerifyObjectCallback<I> >(),
+      boost::lambda::_1, &m_image_ctx, snap_id,
+      boost::lambda::_2, m_handle_mismatch, &m_invalidate));
+  Throttle *throttle = new Throttle(
+    this, m_image_ctx, verify_factory, this->create_callback_context(),
+    &m_prog_ctx, 0, object_count);
+  throttle->start_ops(
+    m_image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+// Size for the current snap id; with CEPH_NOSNAP, the front resize request's size if any.
+template <typename I>
+uint64_t ObjectMapIterateRequest<I>::get_image_size() const {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  if (m_image_ctx.snap_id != CEPH_NOSNAP) {
+    return m_image_ctx.get_image_size(m_image_ctx.snap_id);
+  }
+  if (m_image_ctx.resize_reqs.empty()) {
+    return m_image_ctx.size;
+  }
+  return m_image_ctx.resize_reqs.front()->get_image_size();
+}
+
+// Enters STATE_INVALIDATE_OBJECT_MAP and sends an InvalidateRequest for the
+// current snap id; the request is sent with snap_lock held for write.
+template <typename I>
+void ObjectMapIterateRequest<I>::send_invalidate_object_map() {
+  ldout(m_image_ctx.cct, 5) << this << " send_invalidate_object_map" << dendl;
+  m_state = STATE_INVALIDATE_OBJECT_MAP;
+
+  using InvalidateReq = object_map::InvalidateRequest<I>;
+  auto on_invalidated = this->create_callback_context();
+  InvalidateReq *invalidate_req = InvalidateReq::create(
+    m_image_ctx, m_image_ctx.snap_id, true, on_invalidated);
+
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  invalidate_req->send();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::ObjectMapIterateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/ObjectMapIterate.h b/src/librbd/operation/ObjectMapIterate.h
new file mode 100644
index 00000000..14215902
--- /dev/null
+++ b/src/librbd/operation/ObjectMapIterate.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_OBJECT_MAP_ITERATE_H
+#define CEPH_LIBRBD_OPERATION_OBJECT_MAP_ITERATE_H
+
+#include <iostream>
+#include <atomic>
+
+#include "include/int_types.h"
+#include "include/rbd/object_map_types.h"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+using ObjectIterateWork = bool(*)(ImageCtxT &image_ctx, // mismatch handler; a true return makes the verifier flag the object map for invalidation
+ uint64_t object_no, // object whose map entry differs
+ uint8_t current_state, // state currently recorded in the object map
+ uint8_t new_state); // state computed from the object's snap set
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectMapIterateRequest : public AsyncRequest<ImageCtxT> { // verifies every object's map entry and invalidates the map if a handler asks for it
+public:
+ ObjectMapIterateRequest(ImageCtxT &image_ctx, Context *on_finish,
+ ProgressContext &prog_ctx,
+ ObjectIterateWork<ImageCtxT> handle_mismatch) // handle_mismatch: invoked per object whose recorded state differs
+ : AsyncRequest<ImageCtxT>(image_ctx, on_finish), m_image_ctx(image_ctx),
+ m_prog_ctx(prog_ctx), m_handle_mismatch(handle_mismatch)
+ {
+ }
+
+ void send() override; // completes with -ENODEV if data_ctx is invalid, else starts verification
+
+protected:
+ bool should_complete(int r) override; // advances the two-state machine below
+
+private:
+ enum State {
+ STATE_VERIFY_OBJECTS, // throttled per-object verification in progress
+ STATE_INVALIDATE_OBJECT_MAP // InvalidateRequest in progress
+ };
+
+ ImageCtxT &m_image_ctx;
+ ProgressContext &m_prog_ctx; // passed to the object throttle
+ ObjectIterateWork<ImageCtxT> m_handle_mismatch;
+ std::atomic_flag m_invalidate = ATOMIC_FLAG_INIT; // set by a verifier when the handler returns non-zero
+ State m_state = STATE_VERIFY_OBJECTS;
+
+ void send_verify_objects();
+ void send_invalidate_object_map();
+
+ uint64_t get_image_size() const; // requires snap_lock; NOTE(review): no caller in the visible .cc -- confirm it is still needed
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::ObjectMapIterateRequest<librbd::ImageCtx>;
+
+#endif
diff --git a/src/librbd/operation/RebuildObjectMapRequest.cc b/src/librbd/operation/RebuildObjectMapRequest.cc
new file mode 100644
index 00000000..f923f36c
--- /dev/null
+++ b/src/librbd/operation/RebuildObjectMapRequest.cc
@@ -0,0 +1,248 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/RebuildObjectMapRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/operation/TrimRequest.h"
+#include "librbd/operation/ObjectMapIterate.h"
+#include "librbd/Utils.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::RebuildObjectMapRequest: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+void RebuildObjectMapRequest<I>::send() {
+  this->send_resize_object_map();  // first step; it jumps to verification if the map is already sized
+}
+
+template <typename I>
+bool RebuildObjectMapRequest<I>::should_complete(int r) { // state-machine step; returns true when the rebuild is finished
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
+
+ RWLock::RLocker owner_lock(m_image_ctx.owner_lock); // the send_* helpers below assert or take owner_lock
+ switch (m_state) {
+ case STATE_RESIZE_OBJECT_MAP:
+ ldout(cct, 5) << "RESIZE_OBJECT_MAP" << dendl;
+ if (r == -ESTALE && !m_attempted_trim) { // the trim fallback is attempted at most once
+ // objects are still flagged as in-use -- delete them
+ m_attempted_trim = true;
+ send_trim_image();
+ return false;
+ } else if (r == 0) {
+ send_verify_objects();
+ }
+ break;
+
+ case STATE_TRIM_IMAGE:
+ ldout(cct, 5) << "TRIM_IMAGE" << dendl;
+ if (r == 0) {
+ send_resize_object_map(); // retry the resize now that the trim succeeded
+ }
+ break;
+
+ case STATE_VERIFY_OBJECTS:
+ ldout(cct, 5) << "VERIFY_OBJECTS" << dendl;
+ if (r == 0) {
+ send_save_object_map();
+ }
+ break;
+
+ case STATE_SAVE_OBJECT_MAP:
+ ldout(cct, 5) << "SAVE_OBJECT_MAP" << dendl;
+ if (r == 0) {
+ send_update_header();
+ }
+ break;
+ case STATE_UPDATE_HEADER:
+ ldout(cct, 5) << "UPDATE_HEADER" << dendl;
+ if (r == 0) {
+ return true; // final state succeeded
+ }
+ break;
+
+ default:
+ ceph_abort();
+ break;
+ }
+
+ if (r == -ERESTART) { // logged at debug level rather than as an error
+ ldout(cct, 5) << "rebuild object map operation interrupted" << dendl;
+ return true;
+ } else if (r < 0) {
+ lderr(cct) << "rebuild object map encountered an error: " << cpp_strerror(r)
+ << dendl;
+ return true;
+ }
+ return false; // r >= 0: a next step was started above (when r == 0); wait for its callback
+}
+
+template <typename I>
+void RebuildObjectMapRequest<I>::send_resize_object_map() { // resizes the object map to the image's object count, or skips to verification
+ ceph_assert(m_image_ctx.owner_lock.is_locked());
+ CephContext *cct = m_image_ctx.cct;
+
+ m_image_ctx.snap_lock.get_read(); // manual read lock; released on both exit paths below
+ ceph_assert(m_image_ctx.object_map != nullptr);
+
+ uint64_t size = get_image_size();
+ uint64_t num_objects = Striper::get_num_objects(m_image_ctx.layout, size);
+
+ if (m_image_ctx.object_map->size() == num_objects) { // map already has the right size
+ m_image_ctx.snap_lock.put_read(); // snap_lock is dropped before the verify step starts
+ send_verify_objects();
+ return;
+ }
+
+ ldout(cct, 5) << this << " send_resize_object_map" << dendl;
+ m_state = STATE_RESIZE_OBJECT_MAP;
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ m_image_ctx.object_map->aio_resize(size, OBJECT_NONEXISTENT, // second argument: OBJECT_NONEXISTENT (presumably the state for new entries -- confirm)
+ this->create_callback_context());
+ m_image_ctx.snap_lock.put_read(); // aio_resize was issued while snap_lock was still held
+}
+
+// Enters STATE_TRIM_IMAGE and sends a TrimRequest from the size implied by the
+// current object map down to get_image_size().
+template <typename I>
+void RebuildObjectMapRequest<I>::send_trim_image() {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+
+  // should have been canceled prior to releasing lock
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+  ldout(m_image_ctx.cct, 5) << this << " send_trim_image" << dendl;
+  m_state = STATE_TRIM_IMAGE;
+
+  uint64_t target_size = 0;
+  uint64_t mapped_size = 0;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    ceph_assert(m_image_ctx.object_map != nullptr);
+
+    target_size = get_image_size();
+    mapped_size = m_image_ctx.get_object_size() * m_image_ctx.object_map->size();
+  }
+
+  TrimRequest<I> *trim_req = TrimRequest<I>::create(
+    m_image_ctx, this->create_callback_context(), mapped_size, target_size,
+    m_prog_ctx);
+  trim_req->send();
+}
+
+// Mismatch handler used by the rebuild: stores the verified state in the
+// in-memory object map. Always returns false, so it never flags invalidation.
+template <typename I>
+bool update_object_map(I& image_ctx, uint64_t object_no, uint8_t current_state,
+                       uint8_t new_state) {
+  auto &omap = *image_ctx.object_map;
+  const uint8_t mapped_state = omap[object_no];
+
+  // might be writing object to OSD concurrently
+  const bool maybe_in_flight = (mapped_state == OBJECT_EXISTS &&
+                                new_state == OBJECT_NONEXISTENT &&
+                                image_ctx.snap_id == CEPH_NOSNAP);
+  const uint8_t target_state = maybe_in_flight ? mapped_state : new_state;
+  if (target_state != mapped_state) {
+    ldout(image_ctx.cct, 15) << image_ctx.get_object_name(object_no)
+                             << " rebuild updating object map "
+                             << static_cast<uint32_t>(mapped_state) << "->"
+                             << static_cast<uint32_t>(target_state) << dendl;
+    omap[object_no] = target_state;
+  }
+  return false;
+}
+
+// Enters STATE_VERIFY_OBJECTS and walks every object with an
+// ObjectMapIterateRequest, using update_object_map() as the mismatch handler.
+template <typename I>
+void RebuildObjectMapRequest<I>::send_verify_objects() {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+  m_state = STATE_VERIFY_OBJECTS;
+  ldout(m_image_ctx.cct, 5) << this << " send_verify_objects" << dendl;
+
+  ObjectIterateWork<I> on_mismatch = &update_object_map<I>;
+
+  auto *iterate_req = new ObjectMapIterateRequest<I>(
+    m_image_ctx, this->create_callback_context(), m_prog_ctx, on_mismatch);
+  iterate_req->send();
+}
+
+// Enters STATE_SAVE_OBJECT_MAP and asynchronously saves the object map.
+template <typename I>
+void RebuildObjectMapRequest<I>::send_save_object_map() {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+  ldout(m_image_ctx.cct, 5) << this << " send_save_object_map" << dendl;
+  m_state = STATE_SAVE_OBJECT_MAP;
+
+  // should have been canceled prior to releasing lock
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+  ceph_assert(m_image_ctx.object_map != nullptr);
+  auto on_saved = this->create_callback_context();
+  m_image_ctx.object_map->aio_save(on_saved);
+}
+
+// Enters STATE_UPDATE_HEADER: async set_flags on the header, then update_flags in memory.
+template <typename I>
+void RebuildObjectMapRequest<I>::send_update_header() {
+  ceph_assert(m_image_ctx.owner_lock.is_locked());
+
+  // should have been canceled prior to releasing lock
+  ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+              m_image_ctx.exclusive_lock->is_lock_owner());
+
+  ldout(m_image_ctx.cct, 5) << this << " send_update_header" << dendl;
+  m_state = STATE_UPDATE_HEADER;
+
+  const uint64_t flag_mask = RBD_FLAG_OBJECT_MAP_INVALID | RBD_FLAG_FAST_DIFF_INVALID;
+  librados::ObjectWriteOperation flags_op;
+  cls_client::set_flags(&flags_op, m_image_ctx.snap_id, 0, flag_mask);
+
+  librados::AioCompletion *completion = this->create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, completion, &flags_op);
+  ceph_assert(r == 0);
+  completion->release();
+
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  m_image_ctx.update_flags(m_image_ctx.snap_id, flag_mask, false);
+}
+
+// Size for the current snap id; with CEPH_NOSNAP, the front resize request's size if any.
+template <typename I>
+uint64_t RebuildObjectMapRequest<I>::get_image_size() const {
+  ceph_assert(m_image_ctx.snap_lock.is_locked());
+  if (m_image_ctx.snap_id != CEPH_NOSNAP) {
+    return m_image_ctx.get_image_size(m_image_ctx.snap_id);
+  }
+  if (m_image_ctx.resize_reqs.empty()) {
+    return m_image_ctx.size;
+  }
+  return m_image_ctx.resize_reqs.front()->get_image_size();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::RebuildObjectMapRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/RebuildObjectMapRequest.h b/src/librbd/operation/RebuildObjectMapRequest.h
new file mode 100644
index 00000000..c7f1aa3b
--- /dev/null
+++ b/src/librbd/operation/RebuildObjectMapRequest.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+// Asynchronous request that rebuilds an image's object map by walking the
+// state machine documented below.  Progress is reported through the
+// ProgressContext supplied by the caller.
+template <typename ImageCtxT = ImageCtx>
+class RebuildObjectMapRequest : public AsyncRequest<ImageCtxT> {
+public:
+
+ // image_ctx and prog_ctx are stored by reference and must outlive the
+ // request; on_finish is forwarded to AsyncRequest.
+ RebuildObjectMapRequest(ImageCtxT &image_ctx, Context *on_finish,
+ ProgressContext &prog_ctx)
+ : AsyncRequest<ImageCtxT>(image_ctx, on_finish), m_image_ctx(image_ctx),
+ m_prog_ctx(prog_ctx), m_attempted_trim(false)
+ {
+ }
+
+ // Starts the state machine.
+ void send() override;
+
+protected:
+ // Invoked with the result of each asynchronous step.
+ bool should_complete(int r) override;
+
+private:
+ /**
+ * Rebuild object map goes through the following state machine to
+ * verify per-object state:
+ *
+ * <start>
+ * . | . . . . . . . . . .
+ * . | . .
+ * . v v .
+ * . STATE_RESIZE_OBJECT_MAP . . . > STATE_TRIM_IMAGE
+ * . |
+ * . v
+ * . . . > STATE_VERIFY_OBJECTS
+ * |
+ * v
+ * STATE_SAVE_OBJECT_MAP
+ * |
+ * v
+ * STATE_UPDATE_HEADER
+ *
+ * The _RESIZE_OBJECT_MAP state will be skipped if the object map
+ * is appropriately sized for the image. The _TRIM_IMAGE state will
+ * only be hit if the resize failed due to an in-use object.
+ */
+ enum State {
+ STATE_RESIZE_OBJECT_MAP,
+ STATE_TRIM_IMAGE,
+ STATE_VERIFY_OBJECTS,
+ STATE_SAVE_OBJECT_MAP,
+ STATE_UPDATE_HEADER
+ };
+
+ ImageCtxT &m_image_ctx;
+ ProgressContext &m_prog_ctx;
+ // state whose asynchronous operation is currently outstanding
+ State m_state = STATE_RESIZE_OBJECT_MAP;
+ // initialized to false; presumably set once a trim has been tried so it is
+ // not retried -- confirm against the .cc implementation
+ bool m_attempted_trim;
+
+ // one sender per state in the diagram above
+ void send_resize_object_map();
+ void send_trim_image();
+ void send_verify_objects();
+ void send_save_object_map();
+ void send_update_header();
+
+ // size the object map is rebuilt for; requires snap_lock to be held
+ uint64_t get_image_size() const;
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::RebuildObjectMapRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
diff --git a/src/librbd/operation/RenameRequest.cc b/src/librbd/operation/RenameRequest.cc
new file mode 100644
index 00000000..44c5e2cd
--- /dev/null
+++ b/src/librbd/operation/RenameRequest.cc
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/RenameRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/rados/librados.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::RenameRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+// Streams a RenameRequest state as a human-readable name; values outside the
+// enumeration are printed as "UNKNOWN (<numeric value>)".
+//
+// NOTE(review): the template parameter I only appears inside the nested type
+// RenameRequest<I>::State, which is a non-deduced context.  An expression of
+// the form "os << m_state" therefore cannot select this overload by
+// deduction -- confirm whether the state is really logged by name or falls
+// back to its integer value.
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+ const typename RenameRequest<I>::State& state) {
+ switch(state) {
+ case RenameRequest<I>::STATE_READ_SOURCE_HEADER:
+ os << "READ_SOURCE_HEADER";
+ break;
+ case RenameRequest<I>::STATE_WRITE_DEST_HEADER:
+ os << "WRITE_DEST_HEADER";
+ break;
+ case RenameRequest<I>::STATE_UPDATE_DIRECTORY:
+ os << "UPDATE_DIRECTORY";
+ break;
+ case RenameRequest<I>::STATE_REMOVE_SOURCE_HEADER:
+ os << "REMOVE_SOURCE_HEADER";
+ break;
+ default:
+ // not one of the declared enumerators: print the raw value
+ os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+ break;
+ }
+ return os;
+}
+
+} // anonymous namespace
+
+// Records the destination name and precomputes the source and destination
+// object names: util::old_header_name() is used when image_ctx.old_format is
+// set, util::id_obj_name() otherwise.
+template <typename I>
+RenameRequest<I>::RenameRequest(I &image_ctx, Context *on_finish,
+ const std::string &dest_name)
+ : Request<I>(image_ctx, on_finish), m_dest_name(dest_name),
+ m_source_oid(image_ctx.old_format ? util::old_header_name(image_ctx.name) :
+ util::id_obj_name(image_ctx.name)),
+ m_dest_oid(image_ctx.old_format ? util::old_header_name(dest_name) :
+ util::id_obj_name(dest_name)) {
+}
+
+// Entry point of the rename state machine: begins by reading the source
+// header object.
+template <typename I>
+void RenameRequest<I>::send_op() {
+ send_read_source_header();
+}
+
+// Advances the rename state machine after the operation issued for the
+// current state has completed with result r.  Returns true when the request
+// is finished (on error, or once the source header removal has completed) and
+// false when the next step has been sent.
+template <typename I>
+bool RenameRequest<I>::should_complete(int r) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+
+  r = filter_return_code(r);
+  if (r < 0) {
+    if (r != -EEXIST) {
+      lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+    } else {
+      ldout(cct, 1) << "image already exists" << dendl;
+    }
+    return true;
+  }
+
+  if (m_state == STATE_REMOVE_SOURCE_HEADER) {
+    // last step of the state machine has completed
+    return true;
+  }
+  if (m_state == STATE_UPDATE_DIRECTORY) {
+    // update in-memory name before removing source header
+    apply();
+  }
+
+  // the senders below run with the owner lock held for read
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  switch (m_state) {
+  case STATE_READ_SOURCE_HEADER:
+    send_write_destination_header();
+    break;
+  case STATE_WRITE_DEST_HEADER:
+    send_update_directory();
+    break;
+  case STATE_UPDATE_DIRECTORY:
+    send_remove_source_header();
+    break;
+  default:
+    ceph_abort();
+    break;
+  }
+  return false;
+}
+
+// Translates the raw result of the current step:
+//  - a missing source header while the image is already known under the
+//    destination name is reported as -EEXIST;
+//  - any failure to remove the old source header is downgraded to success
+//    (logged unless the object simply did not exist).
+// All other results are passed through unchanged.
+template <typename I>
+int RenameRequest<I>::filter_return_code(int r) const {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  switch (m_state) {
+  case STATE_READ_SOURCE_HEADER:
+    if (r == -ENOENT) {
+      RWLock::RLocker snap_locker(ictx.snap_lock);
+      if (ictx.name == m_dest_name) {
+        // signal that replay raced with itself
+        return -EEXIST;
+      }
+    }
+    break;
+  case STATE_REMOVE_SOURCE_HEADER:
+    if (r < 0) {
+      if (r != -ENOENT) {
+        lderr(cct) << "warning: couldn't remove old source object ("
+                   << m_source_oid << ")" << dendl;
+      }
+      return 0;
+    }
+    break;
+  default:
+    break;
+  }
+  return r;
+}
+
+// STATE_READ_SOURCE_HEADER: asynchronously reads the whole source header
+// object into m_header_bl.
+template <typename I>
+void RenameRequest<I>::send_read_source_header() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_READ_SOURCE_HEADER;
+
+  // no per-op output buffer or return value: the data is delivered through
+  // the bufferlist handed to aio_operate() below
+  librados::ObjectReadOperation op;
+  op.read(0, 0, nullptr, nullptr);
+
+  // TODO: old code read omap values but there are no omap values on the
+  //       old format header nor the new format id object
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op,
+                                       &m_header_bl);
+  ceph_assert(r == 0);
+  rados_completion->release();
+}
+
+// STATE_WRITE_DEST_HEADER: exclusively creates the destination header object
+// and fills it with the bytes previously read from the source header.
+template <typename I>
+void RenameRequest<I>::send_write_destination_header() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_WRITE_DEST_HEADER;
+
+  librados::ObjectWriteOperation write_op;
+  write_op.create(true);
+  write_op.write_full(m_header_bl);
+
+  librados::AioCompletion *completion = this->create_callback_completion();
+  int ret = ictx.md_ctx.aio_operate(m_dest_oid, completion, &write_op);
+  ceph_assert(ret == 0);
+  completion->release();
+}
+
+// STATE_UPDATE_DIRECTORY: replaces the image's entry in RBD_DIRECTORY.  New
+// format images use the dir_rename_image class method; old format images
+// submit a tmap update that sets the new name and removes the old one.
+template <typename I>
+void RenameRequest<I>::send_update_directory() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_UPDATE_DIRECTORY;
+
+  librados::ObjectWriteOperation dir_op;
+  if (!ictx.old_format) {
+    cls_client::dir_rename_image(&dir_op, ictx.name, m_dest_name, ictx.id);
+  } else {
+    bufferlist tmap_cmds;
+    bufferlist no_value;
+
+    // add <dest name> -> <empty value>
+    encode(static_cast<__u8>(CEPH_OSD_TMAP_SET), tmap_cmds);
+    encode(m_dest_name, tmap_cmds);
+    encode(no_value, tmap_cmds);
+
+    // drop the entry for the current name
+    encode(static_cast<__u8>(CEPH_OSD_TMAP_RM), tmap_cmds);
+    encode(ictx.name, tmap_cmds);
+
+    dir_op.tmap_update(tmap_cmds);
+  }
+
+  librados::AioCompletion *completion = this->create_callback_completion();
+  int ret = ictx.md_ctx.aio_operate(RBD_DIRECTORY, completion, &dir_op);
+  ceph_assert(ret == 0);
+  completion->release();
+}
+
+// STATE_REMOVE_SOURCE_HEADER: asynchronously deletes the old source header
+// object.
+template <typename I>
+void RenameRequest<I>::send_remove_source_header() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_REMOVE_SOURCE_HEADER;
+
+  librados::ObjectWriteOperation remove_op;
+  remove_op.remove();
+
+  librados::AioCompletion *completion = this->create_callback_completion();
+  int ret = ictx.md_ctx.aio_operate(m_source_oid, completion, &remove_op);
+  ceph_assert(ret == 0);
+  completion->release();
+}
+
+// Switches the in-memory image context over to the destination name.
+template <typename I>
+void RenameRequest<I>::apply() {
+  this->m_image_ctx.set_image_name(m_dest_name);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::RenameRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/RenameRequest.h b/src/librbd/operation/RenameRequest.h
new file mode 100644
index 00000000..6534d36c
--- /dev/null
+++ b/src/librbd/operation/RenameRequest.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_RENAME_REQUEST_H
+#define CEPH_LIBRBD_RENAME_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+// Operation request that renames an image by copying its header object to
+// the destination name, updating the RBD directory and removing the old
+// header object.
+template <typename ImageCtxT = ImageCtx>
+class RenameRequest : public Request<ImageCtxT>
+{
+public:
+ /**
+ * Rename goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_READ_SOURCE_HEADER
+ * |
+ * v
+ * STATE_WRITE_DEST_HEADER
+ * |
+ * v
+ * STATE_UPDATE_DIRECTORY
+ * |
+ * v
+ * STATE_REMOVE_SOURCE_HEADER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+ enum State {
+ STATE_READ_SOURCE_HEADER,
+ STATE_WRITE_DEST_HEADER,
+ STATE_UPDATE_DIRECTORY,
+ STATE_REMOVE_SOURCE_HEADER
+ };
+
+ // dest_name is the new image name; it is copied into the request.
+ RenameRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const std::string &dest_name);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ int filter_return_code(int r) const override;
+
+ // journal event describing this rename
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::RenameEvent(op_tid, m_dest_name);
+ }
+
+private:
+ std::string m_dest_name;
+
+ // header object names derived from the current and destination image names
+ std::string m_source_oid;
+ std::string m_dest_oid;
+
+ // state whose asynchronous operation is currently outstanding
+ State m_state = STATE_READ_SOURCE_HEADER;
+
+ // contents of the source header, read in the first state and written to
+ // the destination header in the second
+ bufferlist m_header_bl;
+
+ // one sender per state in the diagram above
+ void send_read_source_header();
+ void send_write_destination_header();
+ void send_update_directory();
+ void send_remove_source_header();
+
+ // applies the new name to the in-memory image context
+ void apply();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::RenameRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_RENAME_REQUEST_H
diff --git a/src/librbd/operation/Request.cc b/src/librbd/operation/Request.cc
new file mode 100644
index 00000000..429cc01e
--- /dev/null
+++ b/src/librbd/operation/Request.cc
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/Request.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Request: "
+
+namespace librbd {
+namespace operation {
+
+// Forwards image_ctx / on_finish to AsyncRequest and records the journal op
+// tid supplied by the caller.
+template <typename I>
+Request<I>::Request(I &image_ctx, Context *on_finish, uint64_t journal_op_tid)
+ : AsyncRequest<I>(image_ctx, on_finish), m_op_tid(journal_op_tid) {
+}
+
+// Starts the request.  Requests that cannot affect IO first try to append
+// their op event to the journal; if that was started, send_op() is deferred
+// to the journal callback.  In every other case send_op() runs immediately.
+// The caller must hold owner_lock (asserted).
+template <typename I>
+void Request<I>::send() {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  // automatically create the event if we don't need to worry
+  // about affecting concurrent IO ops
+  if (!can_affect_io() && append_op_event()) {
+    // send_op() will be invoked once the event is safe
+    return;
+  }
+  send_op();
+}
+
+// Returns a context that finishes this request, or nullptr when an op event
+// commit was started instead (the request is then completed from the commit
+// callback).
+template <typename I>
+Context *Request<I>::create_context_finisher(int r) {
+  // automatically commit the event if required (delete after commit)
+  const bool commit_pending = m_appended_op_event && !m_committed_op_event;
+  if (commit_pending && commit_op_event(r)) {
+    return nullptr;
+  }
+
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  Context *finisher =
+    util::create_context_callback<Request<I>, &Request<I>::finish>(this);
+  return finisher;
+}
+
+// Completes the request with result r.  If an appended op event still has to
+// be committed and the commit could be started, completion is left to the
+// commit callback; otherwise the base class finishes and destroys the
+// request.
+template <typename I>
+void Request<I>::finish_and_destroy(int r) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  // automatically commit the event if required (delete after commit)
+  const bool commit_pending = m_appended_op_event && !m_committed_op_event;
+  if (commit_pending && commit_op_event(r)) {
+    return;
+  }
+
+  AsyncRequest<I>::finish_and_destroy(r);
+}
+
+// Finishes the request through the base class.  An op event that was
+// appended must have been committed by now (asserted).
+template <typename I>
+void Request<I>::finish(int r) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  ceph_assert(m_committed_op_event || !m_appended_op_event);
+  AsyncRequest<I>::finish(r);
+}
+
+// Appends this request's op event to the journal when a journal exists and
+// is appending.  Returns true if the append was started, in which case
+// handle_op_event_safe() is called back with the result; returns false if
+// there is nothing to journal.  The caller must hold owner_lock (asserted).
+template <typename I>
+bool Request<I>::append_op_event() {
+  I &ictx = this->m_image_ctx;
+
+  ceph_assert(ictx.owner_lock.is_locked());
+  RWLock::RLocker snap_locker(ictx.snap_lock);
+
+  if (ictx.journal == nullptr || !ictx.journal->is_journal_appending()) {
+    return false;
+  }
+
+  Context *on_safe = util::create_context_callback<
+    Request<I>, &Request<I>::handle_op_event_safe>(this);
+  append_op_event(on_safe);
+  return true;
+}
+
+// Commits a previously appended op event with result r.  Returns true when
+// the commit was handed to the journal (the request is then finished and
+// deleted from the commit callback) and false when no event was appended or
+// the journal is absent / not appending.  The event is marked as committed
+// as soon as it is known to have been appended, whether or not the journal
+// accepts the commit.
+template <typename I>
+bool Request<I>::commit_op_event(int r) {
+  I &ictx = this->m_image_ctx;
+  RWLock::RLocker snap_locker(ictx.snap_lock);
+
+  if (!m_appended_op_event) {
+    return false;
+  }
+
+  ceph_assert(m_op_tid != 0);
+  ceph_assert(!m_committed_op_event);
+  m_committed_op_event = true;
+
+  const bool journal_appending = (ictx.journal != nullptr &&
+                                  ictx.journal->is_journal_appending());
+  if (!journal_appending) {
+    return false;
+  }
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  // ops will be canceled / completed before closing journal
+  ceph_assert(ictx.journal->is_journal_ready());
+  ictx.journal->commit_op_event(m_op_tid, r, new C_CommitOpEvent(this, r));
+  return true;
+}
+
+// Journal callback for commit_op_event().  r is the result of the commit and
+// original_ret_val the result of the operation itself; a failure of the
+// operation takes precedence when finishing the request.
+template <typename I>
+void Request<I>::handle_commit_op_event(int r, int original_ret_val) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to commit op event to journal: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  const int final_ret = (original_ret_val < 0) ? original_ret_val : r;
+  finish(final_ret);
+}
+
+// Used while the journal is replaying: marks the op event as already
+// appended and tells the journal that the replayed op (m_op_tid) is ready.
+// on_safe is wrapped in an asynchronous callback before being handed over.
+// owner_lock and snap_lock must be held, and the op tid must be set.
+template <typename I>
+void Request<I>::replay_op_ready(Context *on_safe) {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+  ceph_assert(ictx.snap_lock.is_locked());
+  ceph_assert(m_op_tid != 0);
+
+  m_appended_op_event = true;
+
+  Context *async_on_safe = util::create_async_context_callback(ictx, on_safe);
+  ictx.journal->replay_op_ready(m_op_tid, async_on_safe);
+}
+
+// Allocates a fresh op tid from the journal and appends this request's event
+// under it.  on_safe is completed (via C_AppendOpEvent) with the result of
+// the append.  owner_lock and snap_lock must be held (asserted).
+template <typename I>
+void Request<I>::append_op_event(Context *on_safe) {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+  ceph_assert(ictx.snap_lock.is_locked());
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_op_tid = ictx.journal->allocate_op_tid();
+  ictx.journal->append_op_event(
+    m_op_tid,
+    journal::EventEntry{create_event(m_op_tid)},
+    new C_AppendOpEvent(this, on_safe));
+}
+
+// Callback for the journal append started by append_op_event().  On failure
+// the request is finished with the error and deleted; on success the actual
+// operation is started with owner_lock held for read.
+template <typename I>
+void Request<I>::handle_op_event_safe(int r) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to commit op event to journal: " << cpp_strerror(r)
+               << dendl;
+    this->finish(r);
+    delete this;
+    return;
+  }
+
+  ceph_assert(!can_affect_io());
+
+  // haven't started the request state machine yet
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  send_op();
+}
+
+} // namespace operation
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::operation::Request<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/operation/Request.h b/src/librbd/operation/Request.h
new file mode 100644
index 00000000..315a3c96
--- /dev/null
+++ b/src/librbd/operation/Request.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_REQUEST_H
+
+#include "librbd/AsyncRequest.h"
+#include "include/Context.h"
+#include "common/RWLock.h"
+#include "librbd/Utils.h"
+#include "librbd/Journal.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+/**
+ * Base class for image operation requests that can record an op event in the
+ * image journal before the operation runs and commit that event when the
+ * operation finishes.
+ *
+ * Derived classes implement send_op() (the operation itself) and
+ * create_event() (the journal event describing it).
+ */
+template <typename ImageCtxT = ImageCtx>
+class Request : public AsyncRequest<ImageCtxT> {
+public:
+  // journal_op_tid is non-zero when the op tid is already known to the
+  // caller; 0 means it has not been allocated yet.
+  Request(ImageCtxT &image_ctx, Context *on_finish,
+          uint64_t journal_op_tid = 0);
+
+  // Starts the request; owner_lock must be held by the caller.
+  void send() override;
+
+protected:
+  void finish(int r) override;
+
+  // Starts the actual operation.
+  virtual void send_op() = 0;
+
+  // Requests returning true append their op event themselves (through the
+  // templated append_op_event below) instead of having send() do it.
+  virtual bool can_affect_io() const {
+    return false;
+  }
+
+  // Builds the journal event recorded for this operation.
+  virtual journal::Event create_event(uint64_t op_tid) const = 0;
+
+  // Journals the op event for a request that can affect IO.  MF is the
+  // member function of T invoked once the journal has processed the event.
+  // Returns true when the journal was engaged (replay acknowledgement or
+  // append) and false when there is no journal or it is in neither state.
+  template <typename T, Context*(T::*MF)(int*)>
+  bool append_op_event(T *request) {
+    ImageCtxT &image_ctx = this->m_image_ctx;
+
+    ceph_assert(can_affect_io());
+    RWLock::RLocker owner_locker(image_ctx.owner_lock);
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    if (image_ctx.journal != nullptr) {
+      if (image_ctx.journal->is_journal_replaying()) {
+        Context *ctx = util::create_context_callback<T, MF>(request);
+        replay_op_ready(ctx);
+        return true;
+      } else if (image_ctx.journal->is_journal_appending()) {
+        Context *ctx = util::create_context_callback<T, MF>(request);
+        append_op_event(ctx);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Variant used by send() for requests that cannot affect IO.
+  bool append_op_event();
+
+  // NOTE: temporary until converted to new state machine format
+  Context *create_context_finisher(int r);
+  void finish_and_destroy(int r) override;
+
+private:
+  // Completion for a journal append: records that the event was appended
+  // (on success) and then forwards the result to on_safe.
+  struct C_AppendOpEvent : public Context {
+    Request *request;
+    Context *on_safe;
+    C_AppendOpEvent(Request *request, Context *on_safe)
+      : request(request), on_safe(on_safe) {
+    }
+    void finish(int r) override {
+      if (r >= 0) {
+        request->m_appended_op_event = true;
+      }
+      on_safe->complete(r);
+    }
+  };
+
+  // Completion for a journal commit: finishes the request with the
+  // operation's original result taken into account, then deletes it.
+  struct C_CommitOpEvent : public Context {
+    Request *request;
+    int ret_val;
+    C_CommitOpEvent(Request *request, int ret_val)
+      : request(request), ret_val(ret_val) {
+    }
+    void finish(int r) override {
+      request->handle_commit_op_event(r, ret_val);
+      delete request;
+    }
+  };
+
+  uint64_t m_op_tid = 0;               // journal op tid (0 == not assigned)
+  bool m_appended_op_event = false;    // op event has been appended/replayed
+  bool m_committed_op_event = false;   // commit of the op event was attempted
+
+  void replay_op_ready(Context *on_safe);
+  void append_op_event(Context *on_safe);
+  void handle_op_event_safe(int r);
+
+  bool commit_op_event(int r);
+  void handle_commit_op_event(int r, int original_ret_val);
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::Request<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_REQUEST_H
diff --git a/src/librbd/operation/ResizeRequest.cc b/src/librbd/operation/ResizeRequest.cc
new file mode 100644
index 00000000..537436ce
--- /dev/null
+++ b/src/librbd/operation/ResizeRequest.cc
@@ -0,0 +1,467 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/operation/TrimRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::ResizeRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace operation {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Stores the resize parameters.  The original size and the new parent
+// overlap start at 0 and are filled in by send(); m_xlist_item wraps 'this'
+// so that the request can be linked into image_ctx.resize_reqs.
+template <typename I>
+ResizeRequest<I>::ResizeRequest(I &image_ctx, Context *on_finish,
+ uint64_t new_size, bool allow_shrink, ProgressContext &prog_ctx,
+ uint64_t journal_op_tid, bool disable_journal)
+ : Request<I>(image_ctx, on_finish, journal_op_tid),
+ m_original_size(0), m_new_size(new_size), m_allow_shrink(allow_shrink),
+ m_prog_ctx(prog_ctx), m_new_parent_overlap(0), m_disable_journal(disable_journal),
+ m_xlist_item(this)
+{
+}
+
+// Unlinks this request from the image's queue of resize requests and, if
+// another request is now at the head of the queue, starts it.
+template <typename I>
+ResizeRequest<I>::~ResizeRequest() {
+  I &image_ctx = this->m_image_ctx;
+  ResizeRequest *next_req = nullptr;
+  {
+    RWLock::WLocker snap_locker(image_ctx.snap_lock);
+    // the request is expected to be on the queue at this point
+    ceph_assert(m_xlist_item.remove_myself());
+    if (!image_ctx.resize_reqs.empty()) {
+      next_req = image_ctx.resize_reqs.front();
+    }
+  }
+
+  // snap_lock has been released; send() requires owner_lock instead
+  if (next_req != nullptr) {
+    RWLock::RLocker owner_locker(image_ctx.owner_lock);
+    next_req->send();
+  }
+}
+
+// Queues the request on image_ctx.resize_reqs (unless it is already linked)
+// and only proceeds when it is at the front of that queue; a request that is
+// queued behind another one returns here and is started later from the
+// destructor of the request ahead of it.  The caller must hold owner_lock
+// (asserted).
+template <typename I>
+void ResizeRequest<I>::send() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(image_ctx.owner_lock.is_locked());
+
+ {
+ RWLock::WLocker snap_locker(image_ctx.snap_lock);
+ if (!m_xlist_item.is_on_list()) {
+ image_ctx.resize_reqs.push_back(&m_xlist_item);
+ if (image_ctx.resize_reqs.front() != this) {
+ // another resize is in flight; wait for our turn
+ return;
+ }
+ }
+
+ ceph_assert(image_ctx.resize_reqs.front() == this);
+ // capture the size as of the moment this request actually starts
+ m_original_size = image_ctx.size;
+ compute_parent_overlap();
+ }
+
+ Request<I>::send();
+}
+
+// Starts the resize state machine by blocking writes, unless the request has
+// already been canceled, in which case it completes with -ERESTART.
+template <typename I>
+void ResizeRequest<I>::send_op() {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  if (!this->is_canceled()) {
+    send_pre_block_writes();
+    return;
+  }
+  this->async_complete(-ERESTART);
+}
+
+// Asks the IO work queue to block writes; handle_pre_block_writes() is
+// called back with the result.
+template <typename I>
+void ResizeRequest<I>::send_pre_block_writes() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  Context *on_blocked = create_context_callback<
+    ResizeRequest<I>, &ResizeRequest<I>::handle_pre_block_writes>(this);
+  ictx.io_work_queue->block_writes(on_blocked);
+}
+
+// Continues with the journal op event once writes are blocked.  If blocking
+// failed, writes are unblocked again and the request finishes with the
+// error.
+template <typename I>
+Context *ResizeRequest<I>::handle_pre_block_writes(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "r=" << *result << dendl;
+
+  if (*result >= 0) {
+    return send_append_op_event();
+  }
+
+  lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+  ictx.io_work_queue->unblock_writes();
+  return this->create_context_finisher(*result);
+}
+
+// Rejects a shrink that was not explicitly allowed (-EINVAL), otherwise
+// records the resize in the journal.  When journaling is disabled for this
+// request or no journal event was started, the state machine continues
+// directly with the object map step.
+template <typename I>
+Context *ResizeRequest<I>::send_append_op_event() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  const bool is_shrink = m_new_size < m_original_size;
+  if (is_shrink && !m_allow_shrink) {
+    ldout(cct, 1) << "shrinking the image is not permitted" << dendl;
+    ictx.io_work_queue->unblock_writes();
+    this->async_complete(-EINVAL);
+    return nullptr;
+  }
+
+  // the append is only attempted when journaling is not disabled
+  const bool event_pending = !m_disable_journal &&
+    this->template append_op_event<
+      ResizeRequest<I>, &ResizeRequest<I>::handle_append_op_event>(this);
+  if (!event_pending) {
+    return send_grow_object_map();
+  }
+
+  ldout(cct, 5) << dendl;
+  return nullptr;
+}
+
+// Journal callback for the resize op event.  On failure writes are unblocked
+// and the request finishes with the error; on success the state machine
+// continues with the object map step.
+template <typename I>
+Context *ResizeRequest<I>::handle_append_op_event(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "r=" << *result << dendl;
+
+  if (*result >= 0) {
+    return send_grow_object_map();
+  }
+
+  lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result)
+             << dendl;
+  ictx.io_work_queue->unblock_writes();
+  return this->create_context_finisher(*result);
+}
+
+// Starts a TrimRequest from the original size down to the new size;
+// handle_trim_image() receives the result.
+template <typename I>
+void ResizeRequest<I>::send_trim_image() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  Context *on_trimmed = create_context_callback<
+    ResizeRequest<I>, &ResizeRequest<I>::handle_trim_image>(this);
+  TrimRequest<I> *trim_req = TrimRequest<I>::create(
+    ictx, on_trimmed, m_original_size, m_new_size, m_prog_ctx);
+  trim_req->send();
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_trim_image(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result == -ERESTART) {
+ ldout(cct, 5) << "resize operation interrupted" << dendl;
+ return this->create_context_finisher(*result);
+ } else if (*result < 0) {
+ lderr(cct) << "failed to trim image: " << cpp_strerror(*result) << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_post_block_writes();
+ return nullptr;
+}
+
+// Issues an internal flush request through the image dispatch layer;
+// handle_flush_cache() is called when the flush completes.
+template <typename I>
+void ResizeRequest<I>::send_flush_cache() {
+  I &ictx = this->m_image_ctx;
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  auto on_flushed = create_context_callback<
+    ResizeRequest<I>, &ResizeRequest<I>::handle_flush_cache>(this);
+  auto flush_comp = io::AioCompletion::create_and_start(
+    on_flushed, util::get_image_ctx(&ictx), io::AIO_TYPE_FLUSH);
+  auto flush_req = io::ImageDispatchSpec<I>::create_flush_request(
+    ictx, flush_comp, io::FLUSH_SOURCE_INTERNAL, {});
+  flush_req->send();
+  // the dispatch spec is not needed once it has been sent
+  delete flush_req;
+}
+
+// Result of the cache flush: a failure finishes the request, success moves
+// on to invalidating the cache.
+template <typename I>
+Context *ResizeRequest<I>::handle_flush_cache(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_invalidate_cache();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to flush cache: " << cpp_strerror(*result) << dendl;
+  return this->create_context_finisher(*result);
+}
+
+// Asks the object dispatcher to invalidate its cache;
+// handle_invalidate_cache() receives the result.
+template <typename I>
+void ResizeRequest<I>::send_invalidate_cache() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  // need to invalidate since we're deleting objects, and
+  // ObjectCacher doesn't track non-existent objects
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  Context *on_invalidated = create_context_callback<
+    ResizeRequest<I>, &ResizeRequest<I>::handle_invalidate_cache>(this);
+  ictx.io_object_dispatcher->invalidate_cache(on_invalidated);
+}
+
+// Result of the cache invalidation.  -EBUSY is tolerated; any other failure
+// finishes the request.  Otherwise the image is trimmed next.
+template <typename I>
+Context *ResizeRequest<I>::handle_invalidate_cache(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "r=" << *result << dendl;
+
+  // ignore busy error -- writeback was successfully flushed so we might be
+  // wasting some cache space for trimmed objects, but they will get purged
+  // eventually. Most likely cause of the issue was a in-flight cache read
+  const bool fatal = (*result < 0) && (*result != -EBUSY);
+  if (fatal) {
+    lderr(cct) << "failed to invalidate cache: " << cpp_strerror(*result)
+               << dendl;
+    return this->create_context_finisher(*result);
+  }
+
+  send_trim_image();
+  return nullptr;
+}
+
+// Dispatches on the direction of the resize:
+//  - same size: unblock writes and finish successfully;
+//  - shrink:    unblock writes and start flushing the cache;
+//  - grow:      resize the object map to the new size first (or go straight
+//               to the header update when there is no object map).
+// Returns a finisher context only in the "same size" case, nullptr otherwise.
+template <typename I>
+Context *ResizeRequest<I>::send_grow_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ {
+ // from here on shrinking() may report true for this request
+ RWLock::WLocker snap_locker(image_ctx.snap_lock);
+ m_shrink_size_visible = true;
+ }
+
+ if (m_original_size == m_new_size) {
+ image_ctx.io_work_queue->unblock_writes();
+ return this->create_context_finisher(0);
+ } else if (m_new_size < m_original_size) {
+ image_ctx.io_work_queue->unblock_writes();
+ send_flush_cache();
+ return nullptr;
+ }
+
+ // locks are taken manually because they must be dropped before
+ // send_update_header(), which acquires owner_lock itself
+ image_ctx.owner_lock.get_read();
+ image_ctx.snap_lock.get_read();
+ if (image_ctx.object_map == nullptr) {
+ image_ctx.snap_lock.put_read();
+ image_ctx.owner_lock.put_read();
+
+ // IO is still blocked
+ send_update_header();
+ return nullptr;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ // newly covered objects are recorded as OBJECT_NONEXISTENT
+ image_ctx.object_map->aio_resize(
+ m_new_size, OBJECT_NONEXISTENT, create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_grow_object_map>(this));
+ image_ctx.snap_lock.put_read();
+ image_ctx.owner_lock.put_read();
+ return nullptr;
+}
+
+// Result of growing the object map.  On failure writes are unblocked and the
+// request finishes with the error; on success the header is updated while IO
+// remains blocked.
+template <typename I>
+Context *ResizeRequest<I>::handle_grow_object_map(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "r=" << *result << dendl;
+
+  if (*result >= 0) {
+    // IO is still blocked
+    send_update_header();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to resize object map: "
+             << cpp_strerror(*result) << dendl;
+  ictx.io_work_queue->unblock_writes();
+  return this->create_context_finisher(*result);
+}
+
+// Runs after the header update.  When there is no object map, or the image
+// grew, only the in-memory size/overlap are updated and the request
+// finishes.  Otherwise the object map is resized to the new size and
+// handle_shrink_object_map() completes the request.
+template <typename I>
+Context *ResizeRequest<I>::send_shrink_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ // locks are taken manually because update_size_and_overlap() acquires
+ // snap_lock for write itself
+ image_ctx.owner_lock.get_read();
+ image_ctx.snap_lock.get_read();
+ if (image_ctx.object_map == nullptr || m_new_size > m_original_size) {
+ image_ctx.snap_lock.put_read();
+ image_ctx.owner_lock.put_read();
+
+ update_size_and_overlap();
+ return this->create_context_finisher(0);
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "original_size=" << m_original_size << ", "
+ << "new_size=" << m_new_size << dendl;
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ image_ctx.object_map->aio_resize(
+ m_new_size, OBJECT_NONEXISTENT, create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_shrink_object_map>(this));
+ image_ctx.snap_lock.put_read();
+ image_ctx.owner_lock.put_read();
+ return nullptr;
+}
+
+// Result of shrinking the object map.  On failure writes are unblocked and
+// the request finishes with the error; on success the in-memory size and
+// parent overlap are updated and the request finishes successfully.
+template <typename I>
+Context *ResizeRequest<I>::handle_shrink_object_map(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "r=" << *result << dendl;
+
+  if (*result >= 0) {
+    update_size_and_overlap();
+    return this->create_context_finisher(0);
+  }
+
+  lderr(cct) << "failed to resize object map: "
+             << cpp_strerror(*result) << dendl;
+  ictx.io_work_queue->unblock_writes();
+  return this->create_context_finisher(*result);
+}
+
+// Blocks writes again (with owner_lock held for read) before the header is
+// updated; handle_post_block_writes() receives the result.
+template <typename I>
+void ResizeRequest<I>::send_post_block_writes() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  Context *on_blocked = create_context_callback<
+    ResizeRequest<I>, &ResizeRequest<I>::handle_post_block_writes>(this);
+  ictx.io_work_queue->block_writes(on_blocked);
+}
+
+// Result of re-blocking writes.  On failure writes are unblocked and the
+// request finishes with the error; otherwise the header update is sent.
+template <typename I>
+Context *ResizeRequest<I>::handle_post_block_writes(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_update_header();
+    return nullptr;
+  }
+
+  ictx.io_work_queue->unblock_writes();
+  lderr(cct) << "failed to block writes prior to header update: "
+             << cpp_strerror(*result) << dendl;
+  return this->create_context_finisher(*result);
+}
+
+// Writes the new image size to the header object.  Old format images get the
+// size field of the on-disk header overwritten in place; new format images
+// use the set_size class method.  handle_update_header() receives the
+// result.
+template <typename I>
+void ResizeRequest<I>::send_update_header() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << "original_size=" << m_original_size << ", "
+                << "new_size=" << m_new_size << dendl;
+
+  // should have been canceled prior to releasing lock
+  RWLock::RLocker owner_locker(image_ctx.owner_lock);
+  ceph_assert(image_ctx.exclusive_lock == nullptr ||
+              image_ctx.exclusive_lock->is_lock_owner());
+
+  librados::ObjectWriteOperation op;
+  if (image_ctx.old_format) {
+    // rewrite only the size field of the header
+    ceph_le64 new_size = init_le64(m_new_size);
+    bufferlist bl;
+    bl.append(reinterpret_cast<const char*>(&new_size), sizeof(new_size));
+    op.write(offsetof(rbd_obj_header_ondisk, image_size), bl);
+  } else {
+    cls_client::set_size(&op, m_new_size);
+  }
+
+  librados::AioCompletion *rados_completion = create_rados_callback<
+    ResizeRequest<I>, &ResizeRequest<I>::handle_update_header>(this);
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
+                                       rados_completion, &op);
+  ceph_assert(r == 0);
+  rados_completion->release();
+}
+
+// Result of the header update.  On failure writes are unblocked and the
+// request finishes with the error; on success the state machine continues
+// with the shrink-object-map step.
+template <typename I>
+Context *ResizeRequest<I>::handle_update_header(int *result) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "r=" << *result << dendl;
+
+  if (*result >= 0) {
+    return send_shrink_object_map();
+  }
+
+  lderr(cct) << "failed to update image header: " << cpp_strerror(*result)
+             << dendl;
+  ictx.io_work_queue->unblock_writes();
+  return this->create_context_finisher(*result);
+}
+
+// Computes the parent overlap that applies after the resize: 0 when the
+// image has no parent, otherwise the smaller of the new size and the current
+// overlap.  parent_lock is taken for read while the parent data is examined.
+template <typename I>
+void ResizeRequest<I>::compute_parent_overlap() {
+  I &image_ctx = this->m_image_ctx;
+  RWLock::RLocker parent_locker(image_ctx.parent_lock);
+  if (image_ctx.parent == nullptr) {
+    m_new_parent_overlap = 0;
+  } else {
+    m_new_parent_overlap = std::min(m_new_size, image_ctx.parent_md.overlap);
+  }
+}
+
+// Applies the resize to the in-memory image context: the size is always
+// updated, the parent overlap only when the image has a parent and was
+// shrunk.  Writes are unblocked afterwards.
+template <typename I>
+void ResizeRequest<I>::update_size_and_overlap() {
+  I &image_ctx = this->m_image_ctx;
+  {
+    RWLock::WLocker snap_locker(image_ctx.snap_lock);
+    image_ctx.size = m_new_size;
+
+    RWLock::WLocker parent_locker(image_ctx.parent_lock);
+    if (image_ctx.parent != nullptr && m_new_size < m_original_size) {
+      image_ctx.parent_md.overlap = m_new_parent_overlap;
+    }
+  }
+
+  // blocked by PRE_BLOCK_WRITES (grow) or POST_BLOCK_WRITES (shrink) state
+  image_ctx.io_work_queue->unblock_writes();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::ResizeRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/ResizeRequest.h b/src/librbd/operation/ResizeRequest.h
new file mode 100644
index 00000000..f5e2f807
--- /dev/null
+++ b/src/librbd/operation/ResizeRequest.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "include/xlist.h"
+
+namespace librbd
+{
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+ template <typename ImageCtxT = ImageCtx>
+ class ResizeRequest : public Request<ImageCtxT> { // asynchronous image resize state machine (diagram in the private section)
+ public:
+ static ResizeRequest *create(ImageCtxT &image_ctx, Context *on_finish, // factory: heap-allocates a request, forwarding every argument to the constructor
+ uint64_t new_size, bool allow_shrink,
+ ProgressContext &prog_ctx, uint64_t journal_op_tid,
+ bool disable_journal) {
+ return new ResizeRequest(image_ctx, on_finish, new_size, allow_shrink, prog_ctx,
+ journal_op_tid, disable_journal);
+ }
+
+ ResizeRequest(ImageCtxT &image_ctx, Context *on_finish, uint64_t new_size,
+ bool allow_shrink, ProgressContext &prog_ctx, uint64_t journal_op_tid,
+ bool disable_journal);
+ ~ResizeRequest() override;
+
+ inline bool shrinking() const { // true only when m_shrink_size_visible is set AND the new size is below the original size
+ return (m_shrink_size_visible && m_new_size < m_original_size);
+ }
+
+ inline uint64_t get_image_size() const { // returns the requested (new) size, not the original one
+ return m_new_size;
+ }
+
+ void send() override;
+
+ protected:
+ void send_op() override;
+ bool should_complete(int r) override { // unconditionally true; r is ignored here
+ return true;
+ }
+ bool can_affect_io() const override {
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override { // journal record carries op_tid and the new size
+ return journal::ResizeEvent(op_tid, m_new_size);
+ }
+
+ private:
+ /**
+ * Resize goes through the following state machine to resize the image
+ * and update the object map:
+ *
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v
+ * STATE_PRE_BLOCK_WRITES
+ *    |
+ *    v
+ * STATE_APPEND_OP_EVENT (skip if journaling
+ *    |                   disabled)
+ *    |
+ *    | (grow)
+ *    |\--------> STATE_GROW_OBJECT_MAP (skip if object map
+ *    |                 |                disabled)
+ *    |                 v
+ *    |           STATE_UPDATE_HEADER ----------------------------\
+ *    |                               (unblock writes)            |
+ *    |                                                           |
+ *    | (unblock writes)                                          |
+ *    |                                                           |
+ *    | (shrink)                                                  |
+ *    |\--------> STATE_FLUSH_CACHE                               |
+ *    |                 |                                         |
+ *    |                 v                                         |
+ *    |           STATE_INVALIDATE_CACHE                          |
+ *    |                 |                                         |
+ *    |                 v                                         |
+ *    |           STATE_TRIM_IMAGE                                |
+ *    |                 |                                         |
+ *    |                 v                                         |
+ *    |           STATE_POST_BLOCK_WRITES                         |
+ *    |                 |                                         |
+ *    |                 v                                         |
+ *    |           STATE_UPDATE_HEADER                             |
+ *    |                 |                                         |
+ *    |                 v                                         |
+ *    |           STATE_SHRINK_OBJECT_MAP (skip if object map     |
+ *    |                 |                  disabled)              |
+ *    |                 | (unblock writes)                        |
+ *    | (no change)     v                                         |
+ *    \------------> <finish> <-----------------------------------/
+ *
+ * @endverbatim
+ *
+ * The _OBJECT_MAP states are skipped if the object map isn't enabled.
+ * The state machine will immediately transition to _FINISHED if there
+ * are no objects to trim.
+ */
+
+ uint64_t m_original_size; // compared with m_new_size to tell a shrink from a grow
+ uint64_t m_new_size; // requested size; reported by get_image_size() and the journal event
+ bool m_allow_shrink = true;
+ ProgressContext &m_prog_ctx;
+ uint64_t m_new_parent_overlap; // filled by compute_parent_overlap(); applied by update_size_and_overlap() on shrink
+ bool m_shrink_size_visible = false; // gates shrinking(); the place where it is set is not visible in this header
+ bool m_disable_journal = false;
+
+ typename xlist<ResizeRequest<ImageCtxT>*>::item m_xlist_item;
+
+ void send_pre_block_writes(); // each send_X()/handle_X() pair below corresponds to one STATE_X in the diagram
+ Context *handle_pre_block_writes(int *result);
+
+ Context *send_append_op_event();
+ Context *handle_append_op_event(int *result);
+
+ void send_flush_cache();
+ Context *handle_flush_cache(int *result);
+
+ void send_invalidate_cache();
+ Context *handle_invalidate_cache(int *result);
+
+ void send_trim_image();
+ Context *handle_trim_image(int *result);
+
+ Context *send_grow_object_map();
+ Context *handle_grow_object_map(int *result);
+
+ Context *send_shrink_object_map();
+ Context *handle_shrink_object_map(int *result);
+
+ void send_post_block_writes();
+ Context *handle_post_block_writes(int *result);
+
+ void send_update_header();
+ Context *handle_update_header(int *result);
+
+ void compute_parent_overlap(); // sets m_new_parent_overlap: 0 without a parent, else min(new size, parent overlap)
+ void update_size_and_overlap(); // writes size/overlap into the image context, then unblocks writes
+
+ };
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::ResizeRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
diff --git a/src/librbd/operation/SnapshotCreateRequest.cc b/src/librbd/operation/SnapshotCreateRequest.cc
new file mode 100644
index 00000000..66293609
--- /dev/null
+++ b/src/librbd/operation/SnapshotCreateRequest.cc
@@ -0,0 +1,342 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/operation/SnapshotCreateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ImageRequestWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotCreateRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+ // Builds a snapshot-create request.  The journal op tid is forwarded to the
+ // Request<I> base.  The request starts with no recorded error
+ // (m_ret_val == 0) and no allocated snapshot id (m_snap_id == CEPH_NOSNAP).
+ template <typename I>
+ SnapshotCreateRequest<I>::SnapshotCreateRequest(
+     I &image_ctx, Context *on_finish,
+     const cls::rbd::SnapshotNamespace &snap_namespace,
+     const std::string &snap_name, uint64_t journal_op_tid,
+     bool skip_object_map)
+   : Request<I>(image_ctx, on_finish, journal_op_tid),
+     m_snap_namespace(snap_namespace),
+     m_snap_name(snap_name),
+     m_skip_object_map(skip_object_map),
+     m_ret_val(0),
+     m_snap_id(CEPH_NOSNAP) {
+ }
+
+ // Entry point of the state machine.  Completes asynchronously with -ENODEV
+ // when the image's data IoCtx is not valid; otherwise starts the suspend
+ // phase.
+ template <typename I>
+ void SnapshotCreateRequest<I>::send_op() {
+   I &ictx = this->m_image_ctx;
+
+   if (ictx.data_ctx.is_valid()) {
+     send_suspend_requests();
+     return;
+   }
+
+   lderr(ictx.cct) << "missing data pool" << dendl;
+   this->async_complete(-ENODEV);
+ }
+
+ // STATE_SUSPEND_REQUESTS: currently only logs and falls straight through to
+ // send_suspend_aio().
+ template <typename I>
+ void SnapshotCreateRequest<I>::send_suspend_requests() {
+   ldout(this->m_image_ctx.cct, 5) << this << " " << __func__ << dendl;
+
+   // TODO suspend (shrink) resize to ensure consistent RBD mirror
+   send_suspend_aio();
+ }
+
+ // Completion handler counterpart of send_suspend_requests().  It logs the
+ // result, continues with send_suspend_aio() regardless of its value, and
+ // returns nullptr.
+ template <typename I>
+ Context *SnapshotCreateRequest<I>::handle_suspend_requests(int *result) {
+   ldout(this->m_image_ctx.cct, 5) << this << " " << __func__ << ": r="
+                                   << *result << dendl;
+
+   // TODO
+   send_suspend_aio();
+   return nullptr;
+ }
+
+ // STATE_SUSPEND_AIO: asks the IO work queue to block writes;
+ // handle_suspend_aio() is invoked on completion.  The caller must hold
+ // owner_lock (asserted).
+ template <typename I>
+ void SnapshotCreateRequest<I>::send_suspend_aio() {
+   I &image_ctx = this->m_image_ctx;
+   ceph_assert(image_ctx.owner_lock.is_locked());
+
+   ldout(image_ctx.cct, 5) << this << " " << __func__ << dendl;
+
+   auto *on_writes_blocked = create_context_callback<
+     SnapshotCreateRequest<I>,
+     &SnapshotCreateRequest<I>::handle_suspend_aio>(this);
+   image_ctx.io_work_queue->block_writes(on_writes_blocked);
+ }
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_suspend_aio(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+ image_ctx.io_work_queue->unblock_writes();
+ return this->create_context_finisher(*result);
+ }
+
+ send_append_op_event();
+ return nullptr;
+}
+
+ // STATE_APPEND_OP_EVENT: tries to append the operation event via the base
+ // class helper.  When the helper returns false the step is skipped and the
+ // snapshot id allocation starts immediately; otherwise
+ // handle_append_op_event() continues the state machine.
+ template <typename I>
+ void SnapshotCreateRequest<I>::send_append_op_event() {
+   I &ictx = this->m_image_ctx;
+
+   const bool event_appended = this->template append_op_event<
+     SnapshotCreateRequest<I>,
+     &SnapshotCreateRequest<I>::handle_append_op_event>(this);
+   if (event_appended) {
+     ldout(ictx.cct, 5) << this << " " << __func__ << dendl;
+     return;
+   }
+
+   send_allocate_snap_id();
+ }
+
+ // Completion handler for the journal append.  A negative result unblocks
+ // writes, logs the failure and finishes the request with that result;
+ // otherwise snapshot id allocation follows.
+ template <typename I>
+ Context *SnapshotCreateRequest<I>::handle_append_op_event(int *result) {
+   I &ictx = this->m_image_ctx;
+   ldout(ictx.cct, 5) << this << " " << __func__ << ": r=" << *result
+                      << dendl;
+
+   if (*result >= 0) {
+     send_allocate_snap_id();
+     return nullptr;
+   }
+
+   ictx.io_work_queue->unblock_writes();
+   lderr(ictx.cct) << "failed to commit journal entry: "
+                   << cpp_strerror(*result) << dendl;
+   return this->create_context_finisher(*result);
+ }
+
+ // STATE_ALLOCATE_SNAP_ID: issues an asynchronous self-managed snapshot
+ // create on the data IoCtx; the new id is written to m_snap_id and
+ // handle_allocate_snap_id() runs on completion.
+ template <typename I>
+ void SnapshotCreateRequest<I>::send_allocate_snap_id() {
+   I &ictx = this->m_image_ctx;
+   ldout(ictx.cct, 5) << this << " " << __func__ << dendl;
+
+   librados::AioCompletion *comp = create_rados_callback<
+     SnapshotCreateRequest<I>,
+     &SnapshotCreateRequest<I>::handle_allocate_snap_id>(this);
+   ictx.data_ctx.aio_selfmanaged_snap_create(&m_snap_id, comp);
+   comp->release();
+ }
+
+ // Completion handler for the snapshot id allocation.  On failure the
+ // result is recorded, writes are unblocked, the error is logged and the
+ // request finishes; on success the snapshot is added to the image header.
+ template <typename I>
+ Context *SnapshotCreateRequest<I>::handle_allocate_snap_id(int *result) {
+   I &ictx = this->m_image_ctx;
+   ldout(ictx.cct, 5) << this << " " << __func__ << ": r=" << *result << ", "
+                      << "snap_id=" << m_snap_id << dendl;
+
+   if (*result >= 0) {
+     send_create_snap();
+     return nullptr;
+   }
+
+   save_result(result);
+   ictx.io_work_queue->unblock_writes();
+   lderr(ictx.cct) << "failed to allocate snapshot id: "
+                   << cpp_strerror(*result) << dendl;
+   return this->create_context_finisher(*result);
+ }
+
+ // STATE_CREATE_SNAP: records the snapshot in the image header object.
+ //
+ // While holding owner/snap/parent read locks it captures the current image
+ // size and parent info (used later when the snapshot is added to the
+ // in-memory image context) and submits the format-specific snapshot_add
+ // class call.  handle_create_snap() runs on completion.
+ template <typename I>
+ void SnapshotCreateRequest<I>::send_create_snap() {
+   I &image_ctx = this->m_image_ctx;
+   ldout(image_ctx.cct, 5) << this << " " << __func__ << dendl;
+
+   RWLock::RLocker owner_rlock(image_ctx.owner_lock);
+   RWLock::RLocker snap_rlock(image_ctx.snap_lock);
+   RWLock::RLocker parent_rlock(image_ctx.parent_lock);
+
+   // should have been canceled prior to releasing lock
+   ceph_assert(image_ctx.exclusive_lock == nullptr ||
+               image_ctx.exclusive_lock->is_lock_owner());
+
+   // save current size / parent info for creating snapshot record in ImageCtx
+   m_size = image_ctx.size;
+   m_parent_info = image_ctx.parent_md;
+
+   librados::ObjectWriteOperation op;
+   if (!image_ctx.old_format) {
+     cls_client::snapshot_add(&op, m_snap_id, m_snap_name, m_snap_namespace);
+   } else {
+     cls_client::old_snapshot_add(&op, m_snap_id, m_snap_name);
+   }
+
+   librados::AioCompletion *comp = create_rados_callback<
+     SnapshotCreateRequest<I>,
+     &SnapshotCreateRequest<I>::handle_create_snap>(this);
+   int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+   ceph_assert(r == 0);
+   comp->release();
+ }
+
+ // Completion handler for the header snapshot_add call.
+ //   -ESTALE      -> retry from snapshot id allocation
+ //   other r < 0  -> record the error and release the allocated snapshot id
+ //   r >= 0       -> continue with the object map snapshot
+ template <typename I>
+ Context *SnapshotCreateRequest<I>::handle_create_snap(int *result) {
+   ldout(this->m_image_ctx.cct, 5) << this << " " << __func__ << ": r="
+                                   << *result << dendl;
+
+   if (*result >= 0) {
+     return send_create_object_map();
+   }
+
+   if (*result == -ESTALE) {
+     send_allocate_snap_id();
+   } else {
+     save_result(result);
+     send_release_snap_id();
+   }
+   return nullptr;
+ }
+
+ // STATE_CREATE_OBJECT_MAP: snapshots the object map, if there is one.
+ //
+ // When the image has no object map (or the caller asked to skip it) the
+ // snap context is updated, writes are unblocked and the finisher context is
+ // returned.  Otherwise the object map snapshot is started and nullptr is
+ // returned; handle_create_object_map() completes the request.
+ //
+ // snap_lock is taken and released manually because update_snap_context()
+ // acquires it for writing and so must run only after the read lock is
+ // dropped.
+ template <typename I>
+ Context *SnapshotCreateRequest<I>::send_create_object_map() {
+   I &ictx = this->m_image_ctx;
+
+   ictx.snap_lock.get_read();
+   const bool snapshot_object_map =
+     (ictx.object_map != nullptr && !m_skip_object_map);
+
+   if (snapshot_object_map) {
+     ldout(ictx.cct, 5) << this << " " << __func__ << dendl;
+
+     {
+       RWLock::RLocker object_map_rlock(ictx.object_map_lock);
+       auto *on_snapshotted = create_context_callback<
+         SnapshotCreateRequest<I>,
+         &SnapshotCreateRequest<I>::handle_create_object_map>(this);
+       ictx.object_map->snapshot_add(m_snap_id, on_snapshotted);
+     }
+     ictx.snap_lock.put_read();
+     return nullptr;
+   }
+
+   ictx.snap_lock.put_read();
+
+   update_snap_context();
+   ictx.io_work_queue->unblock_writes();
+   return this->create_context_finisher(0);
+ }
+
+ // Completion handler for the object map snapshot.  The snap context is
+ // updated and writes are unblocked regardless of the outcome; a negative
+ // result is logged and propagated, anything else finishes with 0.
+ template <typename I>
+ Context *SnapshotCreateRequest<I>::handle_create_object_map(int *result) {
+   I &ictx = this->m_image_ctx;
+   ldout(ictx.cct, 5) << this << " " << __func__ << ": r=" << *result
+                      << dendl;
+
+   update_snap_context();
+   ictx.io_work_queue->unblock_writes();
+
+   const bool failed = (*result < 0);
+   if (failed) {
+     lderr(ictx.cct) << this << " " << __func__
+                     << ": failed to snapshot object map: "
+                     << cpp_strerror(*result) << dendl;
+   }
+   return this->create_context_finisher(failed ? *result : 0);
+ }
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_release_snap_id() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ ceph_assert(m_snap_id != CEPH_NOSNAP);
+
+ librados::AioCompletion *rados_completion = create_rados_callback<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_release_snap_id>(this);
+ image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
+ rados_completion->release();
+}
+
+ // Completion handler for the snapshot id release.  The outcome of the
+ // release itself is only logged; the request always finishes with the
+ // previously saved error (m_ret_val, asserted negative), which is also
+ // written back through *result.
+ template <typename I>
+ Context *SnapshotCreateRequest<I>::handle_release_snap_id(int *result) {
+   I &ictx = this->m_image_ctx;
+   ldout(ictx.cct, 5) << this << " " << __func__ << ": r=" << *result
+                      << dendl;
+
+   ceph_assert(m_ret_val < 0);
+   *result = m_ret_val;
+
+   ictx.io_work_queue->unblock_writes();
+   return this->create_context_finisher(m_ret_val);
+ }
+
+ // Makes the freshly created snapshot visible in the in-memory image
+ // context without waiting for a header refresh.
+ //
+ // Does nothing for old-format images or when the snapshot id is already
+ // known to the image context.  Otherwise it adds the snapshot record,
+ // prepends the new id to the snap context, installs the updated write
+ // context on the data IoCtx and, if migration info is present, replaces a
+ // leading CEPH_NOSNAP placeholder in its snap map entry.
+ template <typename I>
+ void SnapshotCreateRequest<I>::update_snap_context() {
+   I &image_ctx = this->m_image_ctx;
+
+   RWLock::RLocker owner_rlock(image_ctx.owner_lock);
+   RWLock::WLocker snap_wlock(image_ctx.snap_lock);
+   if (image_ctx.old_format ||
+       image_ctx.get_snap_info(m_snap_id) != NULL) {
+     return;
+   }
+
+   CephContext *cct = image_ctx.cct;
+   ldout(cct, 5) << this << " " << __func__ << dendl;
+
+   // should have been canceled prior to releasing lock
+   ceph_assert(image_ctx.exclusive_lock == nullptr ||
+               image_ctx.exclusive_lock->is_lock_owner());
+
+   // immediately add a reference to the new snapshot
+   utime_t snap_time = ceph_clock_now();
+   image_ctx.add_snap(m_snap_namespace, m_snap_name, m_snap_id, m_size,
+                      m_parent_info, RBD_PROTECTION_STATUS_UNPROTECTED,
+                      0, snap_time);
+
+   // immediately start using the new snap context if we
+   // own the exclusive lock
+   std::vector<snapid_t> updated_snaps;
+   updated_snaps.push_back(m_snap_id);
+   updated_snaps.insert(updated_snaps.end(),
+                        image_ctx.snapc.snaps.begin(),
+                        image_ctx.snapc.snaps.end());
+
+   image_ctx.snapc.seq = m_snap_id;
+   image_ctx.snapc.snaps.swap(updated_snaps);
+   image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(
+     image_ctx.snapc.seq, image_ctx.snaps);
+
+   if (image_ctx.migration_info.empty()) {
+     return;
+   }
+
+   auto it = image_ctx.migration_info.snap_map.find(CEPH_NOSNAP);
+   ceph_assert(it != image_ctx.migration_info.snap_map.end());
+   ceph_assert(!it->second.empty());
+   if (it->second[0] == CEPH_NOSNAP) {
+     ldout(cct, 5) << this << " " << __func__
+                   << ": updating migration snap_map" << dendl;
+     it->second[0] = m_snap_id;
+   }
+ }
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotCreateRequest.h b/src/librbd/operation/SnapshotCreateRequest.h
new file mode 100644
index 00000000..406d2f01
--- /dev/null
+++ b/src/librbd/operation/SnapshotCreateRequest.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Types.h"
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+ template <typename ImageCtxT = ImageCtx>
+ class SnapshotCreateRequest : public Request<ImageCtxT> { // asynchronous snapshot-create state machine (diagram below)
+ public:
+ /**
+ * Snap Create goes through the following state machine:
+ *
+ * @verbatim
+ *
+ *            <start>
+ *               |
+ *               v
+ *           STATE_SUSPEND_REQUESTS
+ *               |
+ *               v
+ *           STATE_SUSPEND_AIO * * * * * * * * * * * * *
+ *               |                                     *
+ *               v                                     *
+ *           STATE_APPEND_OP_EVENT (skip if journal    *
+ *               |                  disabled)          *
+ *   (retry)     v                                     *
+ *   . . . > STATE_ALLOCATE_SNAP_ID                    *
+ *   .           |                                     *
+ *   .           v                                     *
+ *   . . . . STATE_CREATE_SNAP * * * * * * * * * *     *
+ *               |                               *     *
+ *               v                               *     *
+ *           STATE_CREATE_OBJECT_MAP (skip if    *     *
+ *               |                    disabled)  *     *
+ *               |                               *     *
+ *               |                               v     *
+ *               |              STATE_RELEASE_SNAP_ID  *
+ *               |                        |            *
+ *               |                        v            *
+ *               \----------------> <finish> < * * * * *
+ *
+ * @endverbatim
+ *
+ * The _CREATE_SNAP state may repeat back to the _ALLOCATE_SNAP_ID state
+ * if a stale snapshot context is allocated. If the create operation needs
+ * to abort, the error path is followed to record the result in the journal
+ * (if enabled) and bubble the originating error code back to the client.
+ */
+ SnapshotCreateRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t journal_op_tid,
+ bool skip_object_map);
+
+ protected:
+ void send_op() override;
+ bool should_complete(int r) override { // unconditionally true; r is ignored here
+ return true;
+ }
+ bool can_affect_io() const override {
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override { // journal record carries op_tid, snapshot namespace and name
+ return journal::SnapCreateEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+ private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+ bool m_skip_object_map; // when true the STATE_CREATE_OBJECT_MAP step is bypassed
+
+ int m_ret_val; // first negative result recorded through save_result(); 0 until then
+
+ uint64_t m_snap_id; // CEPH_NOSNAP until the allocate step fills it in
+ uint64_t m_size; // image size captured in send_create_snap()
+ ParentImageInfo m_parent_info; // parent info captured in send_create_snap()
+
+ void send_suspend_requests(); // each send_X()/handle_X() pair below corresponds to one STATE_X in the diagram
+ Context *handle_suspend_requests(int *result);
+
+ void send_suspend_aio();
+ Context *handle_suspend_aio(int *result);
+
+ void send_append_op_event();
+ Context *handle_append_op_event(int *result);
+
+ void send_allocate_snap_id();
+ Context *handle_allocate_snap_id(int *result);
+
+ void send_create_snap();
+ Context *handle_create_snap(int *result);
+
+ Context *send_create_object_map();
+ Context *handle_create_object_map(int *result);
+
+ void send_release_snap_id();
+ Context *handle_release_snap_id(int *result);
+
+ void update_snap_context();
+
+ void save_result(int *result) { // keeps only the FIRST negative result; later errors do not overwrite it
+ if (m_ret_val == 0 && *result < 0) {
+ m_ret_val = *result;
+ }
+ }
+ };
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
diff --git a/src/librbd/operation/SnapshotLimitRequest.cc b/src/librbd/operation/SnapshotLimitRequest.cc
new file mode 100644
index 00000000..d47dc3a8
--- /dev/null
+++ b/src/librbd/operation/SnapshotLimitRequest.cc
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotLimitRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotLimitRequest: "
+
+namespace librbd {
+namespace operation {
+
+ // Builds a request that sets the image's snapshot limit to `limit`.
+ template <typename I>
+ SnapshotLimitRequest<I>::SnapshotLimitRequest(
+     I &image_ctx, Context *on_finish, uint64_t limit)
+   : Request<I>(image_ctx, on_finish),
+     m_snap_limit(limit) {
+ }
+
+ // Entry point: the request consists of the single limit-snaps step.
+ template <typename I>
+ void SnapshotLimitRequest<I>::send_op() {
+   this->send_limit_snaps();
+ }
+
+template <typename I>
+bool SnapshotLimitRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+ // Submits the snapshot_set_limit class call against the image header while
+ // holding the md and snap read locks.  The caller must hold owner_lock
+ // (asserted).  Completion is routed through the base class callback
+ // completion.
+ template <typename I>
+ void SnapshotLimitRequest<I>::send_limit_snaps() {
+   I &image_ctx = this->m_image_ctx;
+   ceph_assert(image_ctx.owner_lock.is_locked());
+
+   ldout(image_ctx.cct, 5) << this << " " << __func__ << dendl;
+
+   // both locks are held until the operation has been submitted
+   RWLock::RLocker md_rlock(image_ctx.md_lock);
+   RWLock::RLocker snap_rlock(image_ctx.snap_lock);
+
+   librados::ObjectWriteOperation op;
+   cls_client::snapshot_set_limit(&op, m_snap_limit);
+
+   librados::AioCompletion *comp = this->create_callback_completion();
+   int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+   ceph_assert(r == 0);
+   comp->release();
+ }
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotLimitRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotLimitRequest.h b/src/librbd/operation/SnapshotLimitRequest.h
new file mode 100644
index 00000000..09622a45
--- /dev/null
+++ b/src/librbd/operation/SnapshotLimitRequest.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+ template <typename ImageCtxT = ImageCtx>
+ class SnapshotLimitRequest : public Request<ImageCtxT> { // single-step request that sets an image's snapshot limit
+ public:
+ SnapshotLimitRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t limit);
+
+ protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ journal::Event create_event(uint64_t op_tid) const override { // journal record carries op_tid and the requested limit
+ return journal::SnapLimitEvent(op_tid, m_snap_limit);
+ }
+
+ private:
+ uint64_t m_snap_limit; // limit value passed to the constructor
+
+ void send_limit_snaps(); // submits the header update that applies m_snap_limit
+ };
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotLimitRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H
diff --git a/src/librbd/operation/SnapshotProtectRequest.cc b/src/librbd/operation/SnapshotProtectRequest.cc
new file mode 100644
index 00000000..9a0375fc
--- /dev/null
+++ b/src/librbd/operation/SnapshotProtectRequest.cc
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotProtectRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotProtectRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+ // Streams a symbolic name for a SnapshotProtectRequest state.
+ //
+ // NOTE(review): `I` only appears inside the nested-name-specifier
+ // `SnapshotProtectRequest<I>::State`, which is a non-deduced context, so a
+ // plain `os << m_state` cannot select this overload by template argument
+ // deduction -- confirm whether it is ever actually used.
+ template <typename I>
+ std::ostream& operator<<(std::ostream& os,
+                          const typename SnapshotProtectRequest<I>::State& state) {
+   if (state == SnapshotProtectRequest<I>::STATE_PROTECT_SNAP) {
+     os << "PROTECT_SNAP";
+   }
+   return os;
+ }
+
+} // anonymous namespace
+
+ // Builds a request that protects the snapshot identified by
+ // (snap_namespace, snap_name); the state starts at STATE_PROTECT_SNAP.
+ template <typename I>
+ SnapshotProtectRequest<I>::SnapshotProtectRequest(
+     I &image_ctx, Context *on_finish,
+     const cls::rbd::SnapshotNamespace &snap_namespace,
+     const std::string &snap_name)
+   : Request<I>(image_ctx, on_finish),
+     m_snap_namespace(snap_namespace),
+     m_snap_name(snap_name),
+     m_state(STATE_PROTECT_SNAP) {
+ }
+
+ // Entry point: the request consists of the single protect-snap step.
+ template <typename I>
+ void SnapshotProtectRequest<I>::send_op() {
+   this->send_protect_snap();
+ }
+
+ // Logs the outcome and always reports the request as complete.  -EBUSY is
+ // reported at debug level 1 as "already protected"; any other negative
+ // result is logged as an error.
+ template <typename I>
+ bool SnapshotProtectRequest<I>::should_complete(int r) {
+   I &ictx = this->m_image_ctx;
+   ldout(ictx.cct, 5) << this << " " << __func__ << ": state=" << m_state
+                      << ", " << "r=" << r << dendl;
+
+   if (r == -EBUSY) {
+     ldout(ictx.cct, 1) << "snapshot is already protected" << dendl;
+   } else if (r < 0) {
+     lderr(ictx.cct) << "encountered error: " << cpp_strerror(r) << dendl;
+   }
+   return true;
+ }
+
+ // Runs the verification + submit helper.  If that helper fails before
+ // anything was submitted, the request is completed asynchronously with the
+ // error.  The caller must hold owner_lock (asserted).
+ template <typename I>
+ void SnapshotProtectRequest<I>::send_protect_snap() {
+   I &image_ctx = this->m_image_ctx;
+   ceph_assert(image_ctx.owner_lock.is_locked());
+
+   ldout(image_ctx.cct, 5) << this << " " << __func__ << dendl;
+
+   const int r = verify_and_send_protect_snap();
+   if (r < 0) {
+     this->async_complete(r);
+   }
+ }
+
+ // Validates the request under the md and snap read locks and, if valid,
+ // submits the header update that marks the snapshot as protected.
+ //
+ // Returns 0 once the update has been submitted, or a negative errno:
+ //   -ENOSYS  the image does not have the layering feature
+ //   -ENOENT  no snapshot with the given namespace/name
+ //   -EBUSY   the snapshot is already protected
+ //   other    whatever is_snap_protected() reported
+ template <typename I>
+ int SnapshotProtectRequest<I>::verify_and_send_protect_snap() {
+   I &ictx = this->m_image_ctx;
+   RWLock::RLocker md_rlock(ictx.md_lock);
+   RWLock::RLocker snap_rlock(ictx.snap_lock);
+
+   if ((ictx.features & RBD_FEATURE_LAYERING) == 0) {
+     lderr(ictx.cct) << "image must support layering" << dendl;
+     return -ENOSYS;
+   }
+
+   const uint64_t snap_id = ictx.get_snap_id(m_snap_namespace, m_snap_name);
+   if (snap_id == CEPH_NOSNAP) {
+     return -ENOENT;
+   }
+
+   bool is_protected;
+   int r = ictx.is_snap_protected(snap_id, &is_protected);
+   if (r < 0) {
+     return r;
+   }
+   if (is_protected) {
+     return -EBUSY;
+   }
+
+   librados::ObjectWriteOperation op;
+   cls_client::set_protection_status(&op, snap_id,
+                                     RBD_PROTECTION_STATUS_PROTECTED);
+
+   librados::AioCompletion *comp = this->create_callback_completion();
+   r = ictx.md_ctx.aio_operate(ictx.header_oid, comp, &op);
+   ceph_assert(r == 0);
+   comp->release();
+   return 0;
+ }
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotProtectRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotProtectRequest.h b/src/librbd/operation/SnapshotProtectRequest.h
new file mode 100644
index 00000000..bef80229
--- /dev/null
+++ b/src/librbd/operation/SnapshotProtectRequest.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+ template <typename ImageCtxT = ImageCtx>
+ class SnapshotProtectRequest : public Request<ImageCtxT> { // single-step request that marks a snapshot as protected
+ public:
+ /**
+ * Snap Protect goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v
+ * STATE_PROTECT_SNAP
+ *    |
+ *    v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+ enum State {
+ STATE_PROTECT_SNAP
+ };
+
+ SnapshotProtectRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name);
+
+ protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ journal::Event create_event(uint64_t op_tid) const override { // journal record carries op_tid, snapshot namespace and name
+ return journal::SnapProtectEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+ private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+ State m_state; // only ever STATE_PROTECT_SNAP; included in the should_complete() log line
+
+ void send_protect_snap();
+
+ int verify_and_send_protect_snap(); // returns 0 after submitting the header update, else a negative errno
+ };
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotProtectRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
diff --git a/src/librbd/operation/SnapshotRemoveRequest.cc b/src/librbd/operation/SnapshotRemoveRequest.cc
new file mode 100644
index 00000000..69fbc3e7
--- /dev/null
+++ b/src/librbd/operation/SnapshotRemoveRequest.cc
@@ -0,0 +1,469 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/DetachChildRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRemoveRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Captures the identity (namespace, name, id) of the snapshot to remove.
+template <typename I>
+SnapshotRemoveRequest<I>::SnapshotRemoveRequest(
+    I &image_ctx, Context *on_finish,
+    const cls::rbd::SnapshotNamespace &snap_namespace,
+    const std::string &snap_name, uint64_t snap_id)
+  : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace),
+    m_snap_name(snap_name), m_snap_id(snap_id) {}
+
+// Entry point (owner_lock held): fail fast if the snapshot is unknown.
+template <typename I>
+void SnapshotRemoveRequest<I>::send_op() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  {
+    RWLock::RLocker snap_locker(ictx.snap_lock);
+    RWLock::RLocker object_map_locker(ictx.object_map_lock);
+    if (ictx.snap_info.count(m_snap_id) == 0) {
+      lderr(cct) << "snapshot doesn't exist" << dendl;
+      this->async_complete(-ENOENT);
+      return;
+    }
+  }
+  trash_snap();
+}
+
+// Every completion is terminal for this request; failures are only logged.
+template <typename I>
+bool SnapshotRemoveRequest<I>::should_complete(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+  if (r < 0) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+// TRASH_SNAP: move the snapshot to the trash unless a later state applies.
+template <typename I>
+void SnapshotRemoveRequest<I>::trash_snap() {
+  I &ictx = this->m_image_ctx;
+  if (ictx.old_format) {
+    release_snap_id();
+    return;
+  }
+  if (cls::rbd::get_snap_namespace_type(m_snap_namespace) ==
+        cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) {
+    get_snap();
+    return;
+  }
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+  librados::ObjectWriteOperation trash_op;
+  cls_client::snapshot_trash_add(&trash_op, m_snap_id);
+  auto completion = create_rados_callback<
+    SnapshotRemoveRequest<I>,
+    &SnapshotRemoveRequest<I>::handle_trash_snap>(this);
+  int r = ictx.md_ctx.aio_operate(ictx.header_oid, completion, &trash_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Result of TRASH_SNAP: -EOPNOTSUPP skips to DETACH_CHILD, -EEXIST is benign.
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_trash_snap(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r == -EOPNOTSUPP) {
+    // trash / clone v2 is not supported
+    detach_child();
+    return;
+  }
+  if (r < 0 && r != -EEXIST) {
+    lderr(cct) << "failed to move snapshot to trash: " << cpp_strerror(r)
+               << dendl;
+    this->complete(r);
+    return;
+  }
+  m_trashed_snapshot = true;
+  get_snap();
+}
+
+// GET_SNAP: asynchronously read the snapshot record from the image header.
+template <typename I>
+void SnapshotRemoveRequest<I>::get_snap() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  librados::ObjectReadOperation read_op;
+  cls_client::snapshot_get_start(&read_op, m_snap_id);
+  m_out_bl.clear();
+  auto completion = create_rados_callback<
+    SnapshotRemoveRequest<I>,
+    &SnapshotRemoveRequest<I>::handle_get_snap>(this);
+  int r = ictx.md_ctx.aio_operate(ictx.header_oid, completion, &read_op,
+                                  &m_out_bl);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Result of GET_SNAP: decode the snapshot record and decide whether clone v2
+// children must be inspected (LIST_CHILDREN) before continuing.
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_get_snap(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r == 0) {
+    cls::rbd::SnapshotInfo snap_info;
+    auto it = m_out_bl.cbegin();
+    r = cls_client::snapshot_get_finish(&it, &snap_info);
+    // only trust child_count when the record decoded successfully
+    m_child_attached = (r == 0 && snap_info.child_count > 0);
+    if (m_child_attached) {
+      list_children();
+      return;
+    }
+  }
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve snapshot: " << cpp_strerror(r)
+               << dendl;
+    this->complete(r);
+    return;
+  }
+  detach_child();
+}
+
+// LIST_CHILDREN: asynchronously fetch the children attached to the snapshot.
+template <typename I>
+void SnapshotRemoveRequest<I>::list_children() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  m_child_images.clear();
+  m_out_bl.clear();
+  librados::ObjectReadOperation read_op;
+  cls_client::children_list_start(&read_op, m_snap_id);
+  auto completion = create_rados_callback<
+    SnapshotRemoveRequest<I>,
+    &SnapshotRemoveRequest<I>::handle_list_children>(this);
+  int r = ictx.md_ctx.aio_operate(ictx.header_oid, completion, &read_op,
+                                  &m_out_bl);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Result of LIST_CHILDREN: decode the child list (-ENOENT is tolerated) and
+// continue with DETACH_STALE_CHILD.
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_list_children(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    r = cls_client::children_list_finish(&bl_it, &m_child_images);
+  }
+
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    lderr(cct) << "failed to retrieve child: " << cpp_strerror(r) << dendl;
+    this->complete(r);
+    return;
+  }
+  detach_stale_child();
+}
+
+// DETACH_STALE_CHILD: detach the first listed child whose IoCtx lookup fails
+// with -ENOENT; handle_detach_stale_child() then re-lists the children.
+template <typename I>
+void SnapshotRemoveRequest<I>::detach_stale_child() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  for (auto& child : m_child_images) {
+    m_child_attached = true;
+    IoCtx child_ioctx;
+    int r = util::create_ioctx(ictx.md_ctx, "child image", child.pool_id,
+                               child.pool_namespace, &child_ioctx);
+    if (r >= 0) {
+      continue;  // IoCtx created: the child stays attached
+    } else if (r != -ENOENT) {
+      this->async_complete(r);
+      return;
+    }
+    librados::ObjectWriteOperation detach_op;
+    cls_client::child_detach(
+      &detach_op, m_snap_id,
+      {child.pool_id, child.pool_namespace, child.image_id});
+    auto completion = create_rados_callback<
+      SnapshotRemoveRequest<I>,
+      &SnapshotRemoveRequest<I>::handle_detach_stale_child>(this);
+    r = ictx.md_ctx.aio_operate(ictx.header_oid, completion, &detach_op);
+    ceph_assert(r == 0);
+    completion->release();
+    return;
+  }
+  detach_child();
+}
+
+// Result of DETACH_STALE_CHILD: on success (or -ENOENT) loop to LIST_CHILDREN.
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_detach_stale_child(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    lderr(cct) << "failed to detach stale child: " << cpp_strerror(r)
+               << dendl;
+    this->complete(r);
+    return;
+  }
+  m_child_attached = false;
+  list_children();
+}
+
+// DETACH_CHILD: when no other snapshot (nor HEAD) still references this
+// snapshot's parent spec, issue an image::DetachChildRequest.
+template <typename I>
+void SnapshotRemoveRequest<I>::detach_child() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  bool last_parent_ref = false;
+  {
+    RWLock::RLocker snap_locker(ictx.snap_lock);
+    RWLock::RLocker parent_locker(ictx.parent_lock);
+
+    cls::rbd::ParentImageSpec snap_pspec;
+    int r = ictx.get_parent_spec(m_snap_id, &snap_pspec);
+    if (r == -ENOENT) {
+      ldout(cct, 1) << "No such snapshot" << dendl;
+    } else if (r < 0) {
+      lderr(cct) << "failed to retrieve parent spec" << dendl;
+    }
+
+    if (r < 0) {
+      this->async_complete(r);
+      return;
+    }
+
+    // && short-circuits: the scan only runs when HEAD uses another parent
+    last_parent_ref = (ictx.parent_md.spec != snap_pspec &&
+                       scan_for_parents(snap_pspec) == -ENOENT);
+  }
+
+  if (!last_parent_ref) {
+    // HEAD image or other snapshots still associated with parent
+    remove_object_map();
+    return;
+  }
+
+  ldout(cct, 5) << dendl;
+  auto on_detached = create_context_callback<
+    SnapshotRemoveRequest<I>,
+    &SnapshotRemoveRequest<I>::handle_detach_child>(this);
+  auto detach_req = image::DetachChildRequest<I>::create(ictx, on_detached);
+  detach_req->send();
+}
+
+// Result of DETACH_CHILD: -ENOENT is tolerated; continue to REMOVE_OBJECT_MAP.
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_detach_child(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    lderr(cct) << "failed to detach child from parent: " << cpp_strerror(r)
+               << dendl;
+    this->complete(r);
+    return;
+  }
+  remove_object_map();
+}
+
+template <typename I> // REMOVE_OBJECT_MAP state
+void SnapshotRemoveRequest<I>::remove_object_map() {
+ I &image_ctx = this->m_image_ctx;
+ if (m_child_attached) {
+ // a child is still attached, so the snapshot itself is kept: complete
+ // with 0 if this request just trashed the snap, -EBUSY otherwise
+ this->complete(m_trashed_snapshot ? 0 : -EBUSY);
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ // locks below are acquired in owner -> snap -> object_map order
+ {
+ RWLock::RLocker owner_lock(image_ctx.owner_lock);
+ RWLock::WLocker snap_locker(image_ctx.snap_lock);
+ RWLock::RLocker object_map_locker(image_ctx.object_map_lock);
+ if (image_ctx.object_map != nullptr) {
+ ldout(cct, 5) << dendl;
+ // handle_remove_object_map() receives the result of the removal
+ auto ctx = create_context_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_remove_object_map>(this);
+ image_ctx.object_map->snapshot_remove(m_snap_id, ctx);
+ return;
+ }
+ }
+
+ // object map disabled
+ release_snap_id();
+}
+
+// Result of REMOVE_OBJECT_MAP: any error aborts, else go to RELEASE_SNAP_ID.
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_remove_object_map(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    release_snap_id();
+    return;
+  }
+
+  lderr(cct) << "failed to remove snapshot object map: " << cpp_strerror(r)
+             << dendl;
+  this->complete(r);
+}
+
+// RELEASE_SNAP_ID: release the self-managed snap id via data_ctx; skipped
+// (straight to REMOVE_SNAP) when data_ctx is not valid.
+template <typename I>
+void SnapshotRemoveRequest<I>::release_snap_id() {
+  I &ictx = this->m_image_ctx;
+  if (!ictx.data_ctx.is_valid()) {
+    remove_snap();
+    return;
+  }
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << "snap_name=" << m_snap_name << ", "
+                << "snap_id=" << m_snap_id << dendl;
+  auto completion = create_rados_callback<
+    SnapshotRemoveRequest<I>,
+    &SnapshotRemoveRequest<I>::handle_release_snap_id>(this);
+  ictx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, completion);
+  completion->release();
+}
+
+// Result of RELEASE_SNAP_ID: -ENOENT is tolerated; continue to REMOVE_SNAP.
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_release_snap_id(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    lderr(cct) << "failed to release snap id: " << cpp_strerror(r) << dendl;
+    this->complete(r);
+    return;
+  }
+  remove_snap();
+}
+
+// REMOVE_SNAP: delete the snapshot record from the image header, addressing
+// it by name for old-format images and by id otherwise.
+template <typename I>
+void SnapshotRemoveRequest<I>::remove_snap() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+
+  librados::ObjectWriteOperation remove_op;
+  if (!ictx.old_format) {
+    cls_client::snapshot_remove(&remove_op, m_snap_id);
+  } else {
+    cls_client::old_snapshot_remove(&remove_op, m_snap_name);
+  }
+  auto completion = create_rados_callback<
+    SnapshotRemoveRequest<I>,
+    &SnapshotRemoveRequest<I>::handle_remove_snap>(this);
+  int r = ictx.md_ctx.aio_operate(ictx.header_oid, completion, &remove_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Result of REMOVE_SNAP: on success run remove_snap_context(), complete(0).
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_remove_snap(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to remove snapshot: " << cpp_strerror(r) << dendl;
+    this->complete(r);
+    return;
+  }
+
+  remove_snap_context();
+  this->complete(0);
+}
+
+// Calls ImageCtx::rm_snap() for this snapshot with snap_lock held for write.
+template <typename I>
+void SnapshotRemoveRequest<I>::remove_snap_context() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << dendl;
+  RWLock::WLocker snap_locker(ictx.snap_lock);
+  ictx.rm_snap(m_snap_namespace, m_snap_name, m_snap_id);
+}
+
+// Returns 0 when another snapshot has `pspec` as its parent spec (or when
+// pspec.pool_id == -1), and -ENOENT when no other snapshot references it.
+// Caller must hold snap_lock and parent_lock.
+template <typename I>
+int SnapshotRemoveRequest<I>::scan_for_parents(
+    cls::rbd::ParentImageSpec &pspec) {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.snap_lock.is_locked());
+  ceph_assert(ictx.parent_lock.is_locked());
+
+  if (pspec.pool_id == -1) {
+    return 0;
+  }
+
+  for (auto &snap : ictx.snap_info) {
+    // skip our snap id (if checking base image, CEPH_NOSNAP won't match)
+    if (snap.first == m_snap_id) {
+      continue;
+    }
+    if (snap.second.parent.spec == pspec) {
+      return 0;
+    }
+  }
+  return -ENOENT;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotRemoveRequest.h b/src/librbd/operation/SnapshotRemoveRequest.h
new file mode 100644
index 00000000..a47bbc0a
--- /dev/null
+++ b/src/librbd/operation/SnapshotRemoveRequest.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "include/buffer.h"
+#include "librbd/Types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRemoveRequest : public Request<ImageCtxT> {
+public:
+ /** Removes one snapshot of an image; states run in the order shown:
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * TRASH_SNAP
+ * |
+ * v (skip if unsupported)
+ * GET_SNAP
+ * |
+ * v (skip if unnecessary)
+ * LIST_CHILDREN <-------------\
+ * | |
+ * v (skip if unnecessary) | (repeat as needed)
+ * DETACH_STALE_CHILD ---------/
+ * |
+ * v (skip if unnecessary)
+ * DETACH_CHILD
+ * |
+ * v (skip if disabled/in-use)
+ * REMOVE_OBJECT_MAP
+ * |
+ * v (skip if in-use)
+ * RELEASE_SNAP_ID
+ * |
+ * v (skip if in-use)
+ * REMOVE_SNAP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ // Heap-allocating factory; note that on_finish is the LAST argument here.
+ static SnapshotRemoveRequest *create(
+ ImageCtxT &image_ctx, const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name, uint64_t snap_id, Context *on_finish) {
+ return new SnapshotRemoveRequest(image_ctx, on_finish, snap_namespace,
+ snap_name, snap_id);
+ }
+ // Same inputs as create(), but on_finish is the SECOND argument.
+ SnapshotRemoveRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t snap_id);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ // Journal event built from the snapshot namespace and name (not the id).
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapRemoveEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ cls::rbd::ChildImageSpecs m_child_images; // filled by LIST_CHILDREN
+ std::string m_snap_name;
+ uint64_t m_snap_id;
+ bool m_trashed_snapshot = false; // set once TRASH_SNAP succeeded
+ bool m_child_attached = false; // a child still references the snapshot
+ // scratch buffer receiving the payload of the async header reads
+ ceph::bufferlist m_out_bl;
+ // one send/handle pair per state of the diagram above
+ void trash_snap();
+ void handle_trash_snap(int r);
+
+ void get_snap();
+ void handle_get_snap(int r);
+
+ void list_children();
+ void handle_list_children(int r);
+
+ void detach_stale_child();
+ void handle_detach_stale_child(int r);
+
+ void detach_child();
+ void handle_detach_child(int r);
+
+ void remove_object_map();
+ void handle_remove_object_map(int r);
+
+ void release_snap_id();
+ void handle_release_snap_id(int r);
+
+ void remove_snap();
+ void handle_remove_snap(int r);
+
+ void remove_snap_context();
+ int scan_for_parents(cls::rbd::ParentImageSpec &pspec);
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
diff --git a/src/librbd/operation/SnapshotRenameRequest.cc b/src/librbd/operation/SnapshotRenameRequest.cc
new file mode 100644
index 00000000..b25b991f
--- /dev/null
+++ b/src/librbd/operation/SnapshotRenameRequest.cc
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRenameRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRenameRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I> // writes the symbolic name of a rename State to os
+std::ostream& operator<<(std::ostream& os, // NOTE(review): I occurs only in a
+ const typename SnapshotRenameRequest<I>::State& state) { // non-deduced context
+ switch(state) { // so "os << state" cannot deduce I and select this overload
+ case SnapshotRenameRequest<I>::STATE_RENAME_SNAP:
+ os << "RENAME_SNAP";
+ break;
+ }
+ return os;
+}
+
+} // anonymous namespace
+
+// Records the snapshot id to rename and the new name to give it.
+template <typename I>
+SnapshotRenameRequest<I>::SnapshotRenameRequest(
+    I &image_ctx, Context *on_finish, uint64_t snap_id,
+    const std::string &snap_name)
+  : Request<I>(image_ctx, on_finish), m_snap_id(snap_id),
+    m_snap_name(snap_name), m_state(STATE_RENAME_SNAP) {
+}
+
+// Builds the journal event for the rename; the current name is looked up by
+// snap id (left empty if the id is unknown). Caller must hold snap_lock.
+template <typename I>
+journal::Event SnapshotRenameRequest<I>::create_event(uint64_t op_tid) const {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.snap_lock.is_locked());
+
+  auto found = ictx.snap_info.find(m_snap_id);
+  const bool known = (found != ictx.snap_info.end());
+  std::string old_name = known ? found->second.name : std::string();
+
+  return journal::SnapRenameEvent(op_tid, m_snap_id, old_name,
+                                  m_snap_name);
+}
+
+template <typename I> // entry point of the rename request
+void SnapshotRenameRequest<I>::send_op() {
+ send_rename_snap(); // forwards straight to the RENAME_SNAP step
+}
+
+// Logs the outcome (-EEXIST via ldout rather than lderr); always completes.
+template <typename I>
+bool SnapshotRenameRequest<I>::should_complete(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+
+  if (r == -EEXIST) {
+    ldout(cct, 1) << "snapshot already exists" << dendl;
+  } else if (r < 0) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+
+  return true;
+}
+
+// RENAME_SNAP: submit the rename to the image header (format-specific cls
+// call) while holding md_lock and snap_lock. Caller must hold owner_lock.
+template <typename I>
+void SnapshotRenameRequest<I>::send_rename_snap() {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+  RWLock::RLocker md_locker(ictx.md_lock);
+  RWLock::RLocker snap_locker(ictx.snap_lock);
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  librados::ObjectWriteOperation rename_op;
+  if (!ictx.old_format) {
+    cls_client::snapshot_rename(&rename_op, m_snap_id, m_snap_name);
+  } else {
+    cls_client::old_snapshot_rename(&rename_op, m_snap_id, m_snap_name);
+  }
+  auto *completion = this->create_callback_completion();
+  int r = ictx.md_ctx.aio_operate(ictx.header_oid, completion, &rename_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRenameRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotRenameRequest.h b/src/librbd/operation/SnapshotRenameRequest.h
new file mode 100644
index 00000000..697772e0
--- /dev/null
+++ b/src/librbd/operation/SnapshotRenameRequest.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRenameRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * Snap Rename goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_RENAME_SNAP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+ enum State {
+ STATE_RENAME_SNAP
+ };
+ // snap_id selects the snapshot; snap_name is the name it is renamed to.
+ SnapshotRenameRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t snap_id, const std::string &snap_name);
+ // Journal event carrying the snap id plus its current and new names.
+ journal::Event create_event(uint64_t op_tid) const override;
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+private:
+ uint64_t m_snap_id; // id of the snapshot being renamed
+ std::string m_snap_name; // new snapshot name
+ State m_state; // initialised to STATE_RENAME_SNAP by the constructor
+
+ void send_rename_snap();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRenameRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
diff --git a/src/librbd/operation/SnapshotRollbackRequest.cc b/src/librbd/operation/SnapshotRollbackRequest.cc
new file mode 100644
index 00000000..596570a3
--- /dev/null
+++ b/src/librbd/operation/SnapshotRollbackRequest.cc
@@ -0,0 +1,412 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRollbackRequest.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ImageRequestWQ.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "osdc/Striper.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRollbackRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+namespace {
+
+// Per-object throttle context: rolls a single data object back to m_snap_id.
+template <typename I>
+class C_RollbackObject : public C_AsyncObjectThrottle<I> {
+public:
+  C_RollbackObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                   uint64_t snap_id, uint64_t object_num,
+                   uint64_t head_num_objects,
+                   decltype(I::object_map) snap_object_map)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snap_id(snap_id),
+      m_object_num(object_num), m_head_num_objects(head_num_objects),
+      m_snap_object_map(snap_object_map) {
+  }
+
+  // Returns 1 (nothing sent) if neither object map may hold the object.
+  int send() override {
+    I &ictx = this->m_image_ctx;
+    CephContext *cct = ictx.cct;
+    ldout(cct, 20) << "C_RollbackObject: " << __func__ << ": object_num="
+                   << m_object_num << dendl;
+    {
+      RWLock::RLocker snap_locker(ictx.snap_lock);
+      const bool skippable =
+        m_object_num < m_head_num_objects &&
+        m_snap_object_map != nullptr &&
+        !ictx.object_map->object_may_exist(m_object_num) &&
+        !m_snap_object_map->object_may_exist(m_object_num);
+      if (skippable) {
+        return 1;
+      }
+    }
+
+    const std::string oid = ictx.get_object_name(m_object_num);
+    librados::ObjectWriteOperation rollback_op;
+    rollback_op.selfmanaged_snap_rollback(m_snap_id);
+    auto *completion = util::create_rados_callback(this);
+    ictx.data_ctx.aio_operate(oid, completion, &rollback_op);
+    completion->release();
+    return 0;
+  }
+
+private:
+  uint64_t m_snap_id;
+  uint64_t m_object_num;
+  uint64_t m_head_num_objects;
+  decltype(I::object_map) m_snap_object_map;
+};
+
+} // anonymous namespace
+
+// Stores the rollback target; both object-map pointers start out null and
+// are only populated later by the state machine.
+template <typename I>
+SnapshotRollbackRequest<I>::SnapshotRollbackRequest(
+    I &image_ctx, Context *on_finish,
+    const cls::rbd::SnapshotNamespace &snap_namespace,
+    const std::string &snap_name, uint64_t snap_id, uint64_t snap_size,
+    ProgressContext &prog_ctx)
+  : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace),
+    m_snap_name(snap_name), m_snap_id(snap_id),
+    m_snap_size(snap_size), m_prog_ctx(prog_ctx),
+    m_object_map(nullptr), m_snap_object_map(nullptr) {
+}
+
+// Unblocks writes if this request blocked them and frees owned object maps.
+template <typename I>
+SnapshotRollbackRequest<I>::~SnapshotRollbackRequest() {
+  if (m_blocking_writes) {
+    this->m_image_ctx.io_work_queue->unblock_writes();
+  }
+  delete m_object_map;
+  delete m_snap_object_map;
+}
+
+template <typename I> // entry point of the rollback request
+void SnapshotRollbackRequest<I>::send_op() {
+ send_block_writes(); // the state machine starts by blocking writes
+}
+
+// BLOCK_WRITES: flag that writes are blocked (the destructor undoes it).
+template <typename I>
+void SnapshotRollbackRequest<I>::send_block_writes() {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_blocking_writes = true;
+  Context *on_blocked = create_context_callback<
+    SnapshotRollbackRequest<I>,
+    &SnapshotRollbackRequest<I>::handle_block_writes>(this);
+  this->m_image_ctx.io_work_queue->block_writes(on_blocked);
+}
+
+// Result of BLOCK_WRITES: finish on error, otherwise go to RESIZE_IMAGE.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_block_writes(int *result) {
+  CephContext *cct = this->m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 5) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to block writes: " << cpp_strerror(r) << dendl;
+    return this->create_context_finisher(r);
+  }
+  send_resize_image();
+  return nullptr;
+}
+
+template <typename I> // RESIZE_IMAGE: resize HEAD to m_snap_size if they differ
+void SnapshotRollbackRequest<I>::send_resize_image() {
+ I &image_ctx = this->m_image_ctx;
+ // sample the HEAD image size under owner_lock + snap_lock
+ uint64_t current_size;
+ {
+ RWLock::RLocker owner_locker(image_ctx.owner_lock);
+ RWLock::RLocker snap_locker(image_ctx.snap_lock);
+ current_size = image_ctx.get_image_size(CEPH_NOSNAP);
+ }
+ // object count before any resize; later handed to each C_RollbackObject
+ m_head_num_objects = Striper::get_num_objects(image_ctx.layout, current_size);
+ // sizes already match: skip the resize entirely
+ if (current_size == m_snap_size) {
+ send_get_snap_object_map();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ // owner_lock is re-acquired here and held while the resize is sent
+ RWLock::RLocker owner_locker(image_ctx.owner_lock);
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_resize_image>(this);
+ ResizeRequest<I> *req = ResizeRequest<I>::create(image_ctx, ctx, m_snap_size,
+ true, m_no_op_prog_ctx, 0, true);
+ req->send();
+}
+
+// Result of RESIZE_IMAGE: finish on error, else go to GET_SNAP_OBJECT_MAP.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_resize_image(int *result) {
+  CephContext *cct = this->m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 5) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to resize image for rollback: "
+               << cpp_strerror(r) << dendl;
+    return this->create_context_finisher(r);
+  }
+  send_get_snap_object_map();
+  return nullptr;
+}
+
+// GET_SNAP_OBJECT_MAP: open the snapshot's object map into m_snap_object_map;
+// skipped if the image has no object map, the snapshot flags cannot be read,
+// or the snapshot's object map is flagged invalid.
+template <typename I>
+void SnapshotRollbackRequest<I>::send_get_snap_object_map() {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+
+  uint64_t snap_flags = 0;
+  bool use_object_map;
+  {
+    RWLock::RLocker owner_locker(ictx.owner_lock);
+    RWLock::RLocker snap_locker(ictx.snap_lock);
+    const bool enabled = (ictx.object_map != nullptr);
+    const bool flags_ok = (ictx.get_flags(m_snap_id, &snap_flags) >= 0);
+    use_object_map = enabled && flags_ok;
+  }
+
+  const bool invalid = ((snap_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0);
+  if (use_object_map && invalid) {
+    lderr(cct) << "warning: object-map is invalid for snapshot" << dendl;
+    use_object_map = false;
+  }
+  if (!use_object_map) {
+    send_rollback_object_map();
+    return;
+  }
+
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_snap_object_map = ictx.create_object_map(m_snap_id);
+
+  Context *on_open = create_context_callback<
+    SnapshotRollbackRequest<I>,
+    &SnapshotRollbackRequest<I>::handle_get_snap_object_map>(this);
+  m_snap_object_map->open(on_open);
+}
+
+// Result of GET_SNAP_OBJECT_MAP: a failed open is not fatal; the snapshot
+// object map is simply discarded before moving to ROLLBACK_OBJECT_MAP.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_get_snap_object_map(int *result) {
+  CephContext *cct = this->m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 5) << this << " " << __func__ << ": r=" << r << dendl;
+  if (r < 0) {
+    lderr(cct) << this << " " << __func__ << ": failed to open object map: "
+               << cpp_strerror(r) << dendl;
+    delete m_snap_object_map;
+    m_snap_object_map = nullptr;
+  }
+  send_rollback_object_map();
+  return nullptr;
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::send_rollback_object_map() {
+ I &image_ctx = this->m_image_ctx;
+ // ROLLBACK_OBJECT_MAP: roll the image's object map back to m_snap_id.
+ {
+ RWLock::RLocker owner_locker(image_ctx.owner_lock);
+ RWLock::RLocker snap_locker(image_ctx.snap_lock);
+ RWLock::WLocker object_map_lock(image_ctx.object_map_lock);
+ if (image_ctx.object_map != nullptr) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ // handle_rollback_object_map() receives the rollback result
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_rollback_object_map>(this);
+ image_ctx.object_map->rollback(m_snap_id, ctx);
+ return;
+ }
+ }
+ // no object map on the image: go straight to ROLLBACK_OBJECTS
+ send_rollback_objects();
+}
+
+// Result of ROLLBACK_OBJECT_MAP: on error, apply() runs with a null
+// m_object_map and the request finishes with that error.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_rollback_object_map(int *result) {
+  CephContext *cct = this->m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 5) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r >= 0) {
+    send_rollback_objects();
+    return nullptr;
+  }
+  lderr(cct) << this << " " << __func__ << ": failed to roll back object "
+             << "map: " << cpp_strerror(r) << dendl;
+  ceph_assert(m_object_map == nullptr);
+  apply();
+  return this->create_context_finisher(r);
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::send_rollback_objects() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ // ROLLBACK_OBJECTS: roll back objects [0, num_objects) through a throttle.
+ RWLock::RLocker owner_locker(image_ctx.owner_lock);
+ uint64_t num_objects;
+ {
+ RWLock::RLocker snap_locker(image_ctx.snap_lock);
+ num_objects = Striper::get_num_objects(image_ctx.layout,
+ image_ctx.get_current_size());
+ }
+ // factory: _1 -> throttle, _2 -> object number of each C_RollbackObject
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_rollback_objects>(this);
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_RollbackObject<I> >(),
+ boost::lambda::_1, &image_ctx, m_snap_id, boost::lambda::_2,
+ m_head_num_objects, m_snap_object_map));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, num_objects);
+ throttle->start_ops(
+ image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+// Result of ROLLBACK_OBJECTS: any error finishes the request (-ERESTART is
+// logged as an interruption); success moves on to REFRESH_OBJECT_MAP.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_rollback_objects(int *result) {
+  CephContext *cct = this->m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 5) << this << " " << __func__ << ": r=" << r << dendl;
+  if (r >= 0) {
+    return send_refresh_object_map();
+  }
+  if (r == -ERESTART) {
+    ldout(cct, 5) << "snapshot rollback operation interrupted" << dendl;
+  } else {
+    lderr(cct) << "failed to rollback objects: " << cpp_strerror(r) << dendl;
+  }
+  return this->create_context_finisher(r);
+}
+
+// REFRESH_OBJECT_MAP: open a fresh HEAD object map into m_object_map;
+// skipped (straight to INVALIDATE_CACHE) when the image has no object map.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::send_refresh_object_map() {
+  I &ictx = this->m_image_ctx;
+
+  bool has_object_map;
+  {
+    RWLock::RLocker owner_locker(ictx.owner_lock);
+    RWLock::RLocker snap_locker(ictx.snap_lock);
+    has_object_map = (ictx.object_map != nullptr);
+  }
+  if (!has_object_map) {
+    return send_invalidate_cache();
+  }
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_object_map = ictx.create_object_map(CEPH_NOSNAP);
+  Context *on_open = create_context_callback<
+    SnapshotRollbackRequest<I>,
+    &SnapshotRollbackRequest<I>::handle_refresh_object_map>(this);
+  m_object_map->open(on_open);
+  return nullptr;
+}
+
+// Result of REFRESH_OBJECT_MAP: on failure the new map is deleted, apply()
+// runs with a null m_object_map and the request finishes with the error.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_refresh_object_map(int *result) {
+  CephContext *cct = this->m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 5) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r >= 0) {
+    return send_invalidate_cache();
+  }
+  lderr(cct) << this << " " << __func__ << ": failed to open object map: "
+             << cpp_strerror(r) << dendl;
+  delete m_object_map;
+  m_object_map = nullptr;
+  apply();
+  return this->create_context_finisher(r);
+}
+
+// INVALIDATE_CACHE: apply() the refreshed object map, then ask the object
+// dispatcher to invalidate its cache while holding owner_lock.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::send_invalidate_cache() {
+  I &ictx = this->m_image_ctx;
+  apply();
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  RWLock::RLocker owner_lock(ictx.owner_lock);
+  Context *on_invalidated = create_context_callback<
+    SnapshotRollbackRequest<I>,
+    &SnapshotRollbackRequest<I>::handle_invalidate_cache>(this);
+  ictx.io_object_dispatcher->invalidate_cache(on_invalidated);
+  return nullptr;
+}
+
+// Final state: log a failed invalidation and finish with the given result.
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_invalidate_cache(int *result) {
+  CephContext *cct = this->m_image_ctx.cct;
+  const int r = *result;
+  ldout(cct, 5) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r) << dendl;
+  }
+  return this->create_context_finisher(r);
+}
+
+// Swaps m_object_map with the image's object map (only if the image has one).
+template <typename I>
+void SnapshotRollbackRequest<I>::apply() {
+  I &ictx = this->m_image_ctx;
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  RWLock::WLocker snap_locker(ictx.snap_lock);
+  if (ictx.object_map != nullptr) {
+    std::swap(m_object_map, ictx.object_map);
+  }
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotRollbackRequest.h b/src/librbd/operation/SnapshotRollbackRequest.h
new file mode 100644
index 00000000..e58a618f
--- /dev/null
+++ b/src/librbd/operation/SnapshotRollbackRequest.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/journal/Types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ProgressContext;
+
+namespace operation {
+
+// Operation request for rolling an image back to a snapshot; the steps it
+// performs are laid out in the state diagram below.
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRollbackRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * Snap Rollback goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start> ---------\
+ *                  |
+ *                  v
+ *         STATE_BLOCK_WRITES
+ *                  |
+ *                  v
+ *         STATE_RESIZE_IMAGE (skip if resize not
+ *                  |          required)
+ *                  v
+ *         STATE_GET_SNAP_OBJECT_MAP (skip if object
+ *                  |                 map disabled)
+ *                  v
+ *         STATE_ROLLBACK_OBJECT_MAP (skip if object
+ *                  |                 map disabled)
+ *                  v
+ *         STATE_ROLLBACK_OBJECTS
+ *                  |
+ *                  v
+ *         STATE_REFRESH_OBJECT_MAP (skip if object
+ *                  |                map disabled)
+ *                  v
+ *         STATE_INVALIDATE_CACHE (skip if cache
+ *                  |              disabled)
+ *                  v
+ *              <finish>
+ *
+ * @endverbatim
+ *
+ * The _RESIZE_IMAGE state is skipped if the image doesn't need to be resized.
+ * The _GET_SNAP_OBJECT_MAP, _ROLLBACK_OBJECT_MAP and _REFRESH_OBJECT_MAP
+ * states are skipped if the object map isn't enabled.
+ * The _INVALIDATE_CACHE state is skipped if the cache isn't enabled.
+ */
+
+ SnapshotRollbackRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t snap_id,
+ uint64_t snap_size, ProgressContext &prog_ctx);
+ ~SnapshotRollbackRequest() override;
+
+protected:
+ void send_op() override;
+ // every completion routed through should_complete() finishes the request
+ bool should_complete(int r) override {
+ return true;
+ }
+
+ // journal event describing this rollback (snapshot namespace + name)
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapRollbackEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+ uint64_t m_snap_id;
+ uint64_t m_snap_size;
+ uint64_t m_head_num_objects;
+ ProgressContext &m_prog_ctx;
+
+ NoOpProgressContext m_no_op_prog_ctx;
+
+ bool m_blocking_writes = false;
+ // exchanged with image_ctx.object_map by apply()
+ decltype(ImageCtxT::object_map) m_object_map;
+ decltype(ImageCtxT::object_map) m_snap_object_map;
+
+ // one send_*/handle_* pair per state of the diagram above
+ void send_block_writes();
+ Context *handle_block_writes(int *result);
+
+ void send_resize_image();
+ Context *handle_resize_image(int *result);
+
+ void send_get_snap_object_map();
+ Context *handle_get_snap_object_map(int *result);
+
+ void send_rollback_object_map();
+ Context *handle_rollback_object_map(int *result);
+
+ void send_rollback_objects();
+ Context *handle_rollback_objects(int *result);
+
+ Context *send_refresh_object_map();
+ Context *handle_refresh_object_map(int *result);
+
+ Context *send_invalidate_cache();
+ Context *handle_invalidate_cache(int *result);
+
+ // swaps m_object_map with the image's object map when the image has one
+ void apply();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
diff --git a/src/librbd/operation/SnapshotUnprotectRequest.cc b/src/librbd/operation/SnapshotUnprotectRequest.cc
new file mode 100644
index 00000000..4a58dbc0
--- /dev/null
+++ b/src/librbd/operation/SnapshotUnprotectRequest.cc
@@ -0,0 +1,354 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotUnprotectRequest.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include <list>
+#include <set>
+#include <vector>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotUnprotectRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+typedef std::pair<int64_t, std::string> Pool;
+typedef std::vector<Pool> Pools;
+
+// Streams a human-readable name for a SnapshotUnprotectRequest state; values
+// outside the enum are printed as "UNKNOWN (<number>)".
+// NOTE(review): `I` only appears in a non-deduced context in the parameter
+// list, so a plain `os << state` may never select this overload -- confirm.
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+                         const typename SnapshotUnprotectRequest<I>::State& state) {
+  const char *label = nullptr;
+  switch(state) {
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_START:
+    label = "UNPROTECT_SNAP_START";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_SCAN_POOL_CHILDREN:
+    label = "SCAN_POOL_CHILDREN";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_FINISH:
+    label = "UNPROTECT_SNAP_FINISH";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_ROLLBACK:
+    label = "UNPROTECT_SNAP_ROLLBACK";
+    break;
+  default:
+    break;
+  }
+
+  if (label != nullptr) {
+    os << label;
+  } else {
+    os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+  }
+  return os;
+}
+
+// Per-pool task run through an AsyncObjectThrottle while unprotecting a
+// snapshot: it reads the RBD_CHILDREN object of one pool to look for clones
+// of the parent snapshot described by m_pspec.  Finding any child fails the
+// scan with -EBUSY.
+template <typename I>
+class C_ScanPoolChildren : public C_AsyncObjectThrottle<I> {
+public:
+  // pools[pool_idx] is the (id, name) pair of the pool to scan; both it and
+  // pspec are copied into the task.
+  C_ScanPoolChildren(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                     const cls::rbd::ParentImageSpec &pspec, const Pools &pools,
+                     size_t pool_idx)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_pspec(pspec),
+      m_pool(pools[pool_idx]) {
+  }
+
+  // Issues the asynchronous children lookup for this pool.
+  // Returns 0 once the read is in flight, 1 when the pool is skipped (it no
+  // longer exists, is a cache tier, or its ioctx cannot be created with
+  // -ENOENT) and a negative errno on failure.  owner_lock must be held.
+  int send() override {
+    I &image_ctx = this->m_image_ctx;
+    ceph_assert(image_ctx.owner_lock.is_locked());
+
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 10) << this << " scanning pool '" << m_pool.second << "'"
+                   << dendl;
+
+    librados::Rados rados(image_ctx.md_ctx);
+    int64_t base_tier;
+    int r = rados.pool_get_base_tier(m_pool.first, &base_tier);
+    if (r == -ENOENT) {
+      ldout(cct, 1) << "pool '" << m_pool.second << "' no longer exists"
+                    << dendl;
+      return 1;
+    } else if (r < 0) {
+      // include the errno so the failure can be diagnosed from the log
+      lderr(cct) << "error retrieving base tier for pool '"
+                 << m_pool.second << "': " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    if (m_pool.first != base_tier) {
+      // pool is a cache; skip it
+      return 1;
+    }
+
+    r = util::create_ioctx(image_ctx.md_ctx, "child image", m_pool.first, {},
+                           &m_pool_ioctx);
+    if (r == -ENOENT) {
+      return 1;
+    } else if (r < 0) {
+      return r;
+    }
+
+    librados::ObjectReadOperation op;
+    cls_client::get_children_start(&op, m_pspec);
+
+    librados::AioCompletion *rados_completion =
+      util::create_rados_callback(this);
+    r = m_pool_ioctx.aio_operate(RBD_CHILDREN, rados_completion, &op,
+                                 &m_children_bl);
+    ceph_assert(r == 0);
+    rados_completion->release();
+    return 0;
+  }
+
+protected:
+  // Completion of the children read.  -ENOENT is treated as "no children"
+  // (success); a successfully decoded children set means the snapshot is
+  // still in use and is reported as -EBUSY; other errors are passed through.
+  void finish(int r) override {
+    I &image_ctx = this->m_image_ctx;
+    CephContext *cct = image_ctx.cct;
+
+    if (r == 0) {
+      auto it = m_children_bl.cbegin();
+      r = cls_client::get_children_finish(&it, &m_children);
+    }
+
+    ldout(cct, 10) << this << " retrieved children: r=" << r << dendl;
+    if (r == -ENOENT) {
+      // no children -- proceed with unprotect
+      r = 0;
+    } else if (r < 0) {
+      lderr(cct) << "cannot get children for pool '" << m_pool.second << "': "
+                 << cpp_strerror(r) << dendl;
+    } else {
+      lderr(cct) << "cannot unprotect: at least " << m_children.size() << " "
+                 << "child(ren) [" << joinify(m_children.begin(),
+                                              m_children.end(),
+                                              std::string(",")) << "] "
+                 << "in pool '" << m_pool.second << "'" << dendl;
+      r = -EBUSY;
+    }
+    C_AsyncObjectThrottle<I>::finish(r);
+  }
+
+private:
+  cls::rbd::ParentImageSpec m_pspec;  // parent snapshot being unprotected
+  Pool m_pool;                        // (pool id, pool name) to scan
+
+  IoCtx m_pool_ioctx;
+  std::set<std::string> m_children;   // decoded child image ids
+  bufferlist m_children_bl;           // raw reply of the children read
+};
+
+} // anonymous namespace
+
+// Records the snapshot to unprotect; the request starts in
+// STATE_UNPROTECT_SNAP_START with no error recorded and the snapshot id not
+// yet resolved (CEPH_NOSNAP).
+template <typename I>
+SnapshotUnprotectRequest<I>::SnapshotUnprotectRequest(
+    I &image_ctx, Context *on_finish,
+    const cls::rbd::SnapshotNamespace &snap_namespace,
+    const std::string &snap_name)
+  : Request<I>(image_ctx, on_finish),
+    m_snap_namespace(snap_namespace),
+    m_snap_name(snap_name),
+    m_state(STATE_UNPROTECT_SNAP_START),
+    m_ret_val(0),
+    m_snap_id(CEPH_NOSNAP) {
+}
+
+// Entry point of the operation: starts with the first state.
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_op() {
+  this->send_unprotect_snap_start();
+}
+
+// Advances the state machine after a step completed with result `r`.
+// The first negative result is latched in m_ret_val; from then on the error
+// path (should_complete_error()) decides what happens next.  Returns true
+// once the request is finished.
+template <typename I>
+bool SnapshotUnprotectRequest<I>::should_complete(int r) {
+  I &ictx = this->m_image_ctx;
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  if (r < 0) {
+    if (r != -EINVAL) {
+      lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+    } else {
+      ldout(cct, 1) << "snapshot is already unprotected" << dendl;
+    }
+    if (m_ret_val == 0) {
+      m_ret_val = r;
+    }
+  }
+
+  // use a different state machine once an error is encountered
+  if (m_ret_val < 0) {
+    return should_complete_error();
+  }
+
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  switch (m_state) {
+  case STATE_UNPROTECT_SNAP_START:
+    send_scan_pool_children();
+    return false;
+  case STATE_SCAN_POOL_CHILDREN:
+    send_unprotect_snap_finish();
+    return false;
+  case STATE_UNPROTECT_SNAP_FINISH:
+    return true;
+  default:
+    ceph_abort();
+    return false;
+  }
+}
+
+template <typename I>
+bool SnapshotUnprotectRequest<I>::should_complete_error() {
+ I &image_ctx = this->m_image_ctx;
+ RWLock::RLocker owner_locker(image_ctx.owner_lock);
+ CephContext *cct = image_ctx.cct;
+ lderr(cct) << this << " " << __func__ << ": "
+ << "ret_val=" << m_ret_val << dendl;
+
+ bool finished = true;
+ if (m_state == STATE_SCAN_POOL_CHILDREN ||
+ m_state == STATE_UNPROTECT_SNAP_FINISH) {
+ send_unprotect_snap_rollback();
+ finished = false;
+ }
+ return finished;
+}
+
+// First state: validates the request and issues the UNPROTECTING update.
+// A validation failure is reported through async_complete().  owner_lock must
+// be held by the caller.
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_start() {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  const int r = verify_and_send_unprotect_snap_start();
+  if (r < 0) {
+    this->async_complete(r);
+  }
+}
+
+// Second state: lists every pool and starts one C_ScanPoolChildren task per
+// pool through an AsyncObjectThrottle, limited by the
+// rbd_concurrent_management_ops setting.  owner_lock must be held.
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_scan_pool_children() {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_SCAN_POOL_CHILDREN;
+
+  // search all pools for children depending on this snapshot
+  // TODO add async version of wait_for_latest_osdmap
+  librados::Rados cluster(ictx.md_ctx);
+  cluster.wait_for_latest_osdmap();
+
+  // protect against pools being renamed/deleted
+  std::list<Pool> listed_pools;
+  cluster.pool_list2(listed_pools);
+
+  cls::rbd::ParentImageSpec parent_spec(ictx.md_ctx.get_id(),
+                                        ictx.md_ctx.get_namespace(),
+                                        ictx.id, m_snap_id);
+  Pools pools(listed_pools.begin(), listed_pools.end());
+
+  Context *on_scanned = this->create_callback_context();
+  typename AsyncObjectThrottle<I>::ContextFactory factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_ScanPoolChildren<I> >(),
+      boost::lambda::_1, &ictx, parent_spec, pools, boost::lambda::_2));
+  auto *throttle = new AsyncObjectThrottle<I>(
+    nullptr, ictx, factory, on_scanned, nullptr, 0, pools.size());
+  throttle->start_ops(
+    ictx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+// Final state: asynchronously sets the snapshot's protection status to
+// UNPROTECTED on the image header.  owner_lock must be held.
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_finish() {
+  I &ictx = this->m_image_ctx;
+  ceph_assert(ictx.owner_lock.is_locked());
+
+  CephContext *cct = ictx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  m_state = STATE_UNPROTECT_SNAP_FINISH;
+
+  librados::ObjectWriteOperation write_op;
+  cls_client::set_protection_status(&write_op, m_snap_id,
+                                    RBD_PROTECTION_STATUS_UNPROTECTED);
+
+  librados::AioCompletion *completion = this->create_callback_completion();
+  const int r = ictx.md_ctx.aio_operate(ictx.header_oid, completion,
+                                        &write_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_rollback() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(image_ctx.owner_lock.is_locked());
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ m_state = STATE_UNPROTECT_SNAP_ROLLBACK;
+
+ librados::ObjectWriteOperation op;
+ cls_client::set_protection_status(&op, m_snap_id,
+ RBD_PROTECTION_STATUS_PROTECTED);
+
+ librados::AioCompletion *comp = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+// Validates the request under md_lock/snap_lock and, if the snapshot is
+// currently protected, asynchronously marks it as UNPROTECTING.
+// Returns 0 when the update was issued, otherwise a negative errno:
+// -ENOSYS without layering, -ENOENT for an unknown snapshot, -EINVAL when the
+// snapshot is already unprotected, or the error of is_snap_unprotected().
+template <typename I>
+int SnapshotUnprotectRequest<I>::verify_and_send_unprotect_snap_start() {
+  I &ictx = this->m_image_ctx;
+  RWLock::RLocker md_locker(ictx.md_lock);
+  RWLock::RLocker snap_locker(ictx.snap_lock);
+
+  CephContext *cct = ictx.cct;
+  const bool layering_enabled = (ictx.features & RBD_FEATURE_LAYERING) != 0;
+  if (!layering_enabled) {
+    lderr(cct) << "image must support layering" << dendl;
+    return -ENOSYS;
+  }
+
+  m_snap_id = ictx.get_snap_id(m_snap_namespace, m_snap_name);
+  if (m_snap_id == CEPH_NOSNAP) {
+    return -ENOENT;
+  }
+
+  bool is_unprotected;
+  int r = ictx.is_snap_unprotected(m_snap_id, &is_unprotected);
+  if (r < 0) {
+    return r;
+  }
+
+  if (is_unprotected) {
+    lderr(cct) << "snapshot is already unprotected" << dendl;
+    return -EINVAL;
+  }
+
+  librados::ObjectWriteOperation write_op;
+  cls_client::set_protection_status(&write_op, m_snap_id,
+                                    RBD_PROTECTION_STATUS_UNPROTECTING);
+
+  librados::AioCompletion *completion = this->create_callback_completion();
+  r = ictx.md_ctx.aio_operate(ictx.header_oid, completion, &write_op);
+  ceph_assert(r == 0);
+  completion->release();
+
+  // TODO legacy code threw a notification post UNPROTECTING update -- required?
+  return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotUnprotectRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotUnprotectRequest.h b/src/librbd/operation/SnapshotUnprotectRequest.h
new file mode 100644
index 00000000..19cc6d32
--- /dev/null
+++ b/src/librbd/operation/SnapshotUnprotectRequest.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+// Operation request that removes the protection status from a snapshot; the
+// steps are laid out in the state diagram below.
+template <typename ImageCtxT = ImageCtx>
+class SnapshotUnprotectRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * Snap Unprotect goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v
+ * STATE_UNPROTECT_SNAP_START
+ *    |
+ *    v
+ * STATE_SCAN_POOL_CHILDREN * * * * > STATE_UNPROTECT_SNAP_ROLLBACK
+ *    |                                  |
+ *    v                                  |
+ * STATE_UNPROTECT_SNAP_FINISH           |
+ *    |                                  |
+ *    v                                  |
+ * <finish> <----------------------------/
+ *
+ * @endverbatim
+ *
+ * If the unprotect operation needs to abort, the error path is followed
+ * to rollback the unprotect in-progress status on the image.
+ */
+ enum State {
+ STATE_UNPROTECT_SNAP_START,
+ STATE_SCAN_POOL_CHILDREN,
+ STATE_UNPROTECT_SNAP_FINISH,
+ STATE_UNPROTECT_SNAP_ROLLBACK
+ };
+
+ SnapshotUnprotectRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ // Reports the first recorded error (m_ret_val) instead of `r`, so the
+ // result of a later step does not mask the original failure; 0 otherwise.
+ int filter_return_code(int r) const override {
+ if (m_ret_val < 0) {
+ return m_ret_val;
+ }
+ return 0;
+ }
+
+ // journal event describing this unprotect (snapshot namespace + name)
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapUnprotectEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+ // current position in the state diagram above
+ State m_state;
+
+ // first negative result seen by should_complete(); 0 while no step failed
+ int m_ret_val;
+ // id resolved from the snapshot name; CEPH_NOSNAP until then
+ uint64_t m_snap_id;
+
+ bool should_complete_error();
+
+ void send_unprotect_snap_start();
+ void send_scan_pool_children();
+ void send_unprotect_snap_finish();
+ void send_unprotect_snap_rollback();
+
+ int verify_and_send_unprotect_snap_start();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotUnprotectRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
diff --git a/src/librbd/operation/SparsifyRequest.cc b/src/librbd/operation/SparsifyRequest.cc
new file mode 100644
index 00000000..53049f88
--- /dev/null
+++ b/src/librbd/operation/SparsifyRequest.cc
@@ -0,0 +1,519 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SparsifyRequest.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/err.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include "librbd/io/ObjectRequest.h"
+#include "osdc/Striper.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+// Decides whether the tail of an object consists only of zeroes and can be
+// cut off.
+//
+// extent_map  offset -> length of the object's data extents
+// bl          data buffer; it is consumed backwards from its end, one extent
+//             after another, i.e. it is treated as the extents' data packed
+//             back-to-back in offset order
+// sparse_size granularity in which zeroed chunks are dropped
+// new_end_ptr receives the new object end; only written when true is returned
+//
+// Returns true when the object may be trimmed: either there are no extents at
+// all (*new_end_ptr = 0) or a new end below the current end was found.
+bool may_be_trimmed(const std::map<uint64_t,uint64_t> &extent_map,
+ const bufferlist &bl, size_t sparse_size,
+ uint64_t *new_end_ptr) {
+ if (extent_map.empty()) {
+ *new_end_ptr = 0;
+ return true;
+ }
+
+ // current end of data = end of the last extent
+ uint64_t end = extent_map.rbegin()->first + extent_map.rbegin()->second;
+ uint64_t new_end = end;
+ uint64_t bl_off = bl.length();
+
+ // walk the extents from the last one towards the first
+ for (auto it = extent_map.rbegin(); it != extent_map.rend(); it++) {
+ auto off = it->first;
+ auto len = it->second;
+
+ new_end = p2roundup<uint64_t>(off + len, sparse_size);
+
+ uint64_t extent_left = len;
+ // the first (i.e. last-in-extent) chunk takes the remainder of len, every
+ // following chunk is a full sparse_size
+ uint64_t sub_len = len % sparse_size;
+ if (sub_len == 0) {
+ sub_len = sparse_size;
+ }
+ while (extent_left > 0) {
+ ceph_assert(bl_off >= sub_len);
+ bl_off -= sub_len;
+ bufferlist sub_bl;
+ sub_bl.substr_of(bl, bl_off, sub_len);
+ if (!sub_bl.is_zero()) {
+ // found data: nothing before this chunk can be dropped
+ break;
+ }
+ new_end -= sparse_size;
+ extent_left -= sub_len;
+ sub_len = sparse_size;
+ }
+ if (extent_left > 0) {
+ // the inner loop stopped on non-zero data, so stop scanning extents too
+ break;
+ }
+ }
+
+ if (new_end < end) {
+ *new_end_ptr = new_end;
+ return true;
+ }
+
+ return false;
+}
+
+} // anonymous namespace
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::SparsifyObject: " << this \
+ << " " << m_oid << " " << __func__ << ": "
+
+// Throttled per-object task of the sparsify operation.  It first tries the
+// cls sparsify method and, when that is not supported (-EOPNOTSUPP), falls
+// back to reading the object and trimming its zeroed tail itself.
+template <typename I>
+class C_SparsifyObject : public C_AsyncObjectThrottle<I> {
+public:
+
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v        (not supported)
+   * SPARSIFY * * * * * * * * * * * * > READ < * * * * * * * * * * (concurrent
+   *    |                                 |                      *  update is
+   *    | (object map disabled)           | (can trim)           *  detected)
+   *    |------------------------\        V                      *
+   *    |                        |      PRE UPDATE OBJECT MAP    *
+   *    | (object map enabled)   |        | (if needed)          *
+   *    v                        |        V                      *
+   * PRE UPDATE OBJECT MAP       |      TRIM * * * * * * * * * * *
+   *    |                        |        |
+   *    v                        |        V
+   * CHECK EXISTS                |      POST UPDATE OBJECT MAP
+   *    |                        |        | (if needed)
+   *    v                        |        |
+   * POST UPDATE OBJECT MAP      |        |
+   *    |                        |        |
+   *    v                        |        |
+   * <finish> <------------------/<-------/
+   *
+   * @endverbatim
+   *
+   */
+
+  C_SparsifyObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                   uint64_t object_no, size_t sparse_size)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_cct(image_ctx->cct),
+      m_object_no(object_no), m_sparse_size(sparse_size),
+      m_oid(image_ctx->get_object_name(object_no)) {
+  }
+
+  // Starts sparsifying the object.  Returns 0 when the request was issued,
+  // 1 when the object map says the object does not exist (nothing to do),
+  // -ENODEV without a data pool and -ERESTART when the exclusive lock was
+  // lost.  owner_lock must be held.
+  int send() override {
+    I &image_ctx = this->m_image_ctx;
+    ceph_assert(image_ctx.owner_lock.is_locked());
+
+    ldout(m_cct, 20) << dendl;
+
+    if (!image_ctx.data_ctx.is_valid()) {
+      lderr(m_cct) << "missing data pool" << dendl;
+      return -ENODEV;
+    }
+
+    if (image_ctx.exclusive_lock != nullptr &&
+        !image_ctx.exclusive_lock->is_lock_owner()) {
+      ldout(m_cct, 1) << "lost exclusive lock during sparsify" << dendl;
+      return -ERESTART;
+    }
+
+    {
+      RWLock::RLocker snap_locker(image_ctx.snap_lock);
+      if (image_ctx.object_map != nullptr &&
+          !image_ctx.object_map->object_may_exist(m_object_no)) {
+        // can skip because the object does not exist
+        return 1;
+      }
+
+      // an empty object may only be removed when it lies beyond the objects
+      // overlapping the parent image
+      RWLock::RLocker parent_locker(image_ctx.parent_lock);
+      uint64_t overlap_objects = 0;
+      uint64_t overlap;
+      int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &overlap);
+      if (r == 0 && overlap > 0) {
+        overlap_objects = Striper::get_num_objects(image_ctx.layout, overlap);
+      }
+      m_remove_empty = (m_object_no >= overlap_objects);
+    }
+
+    send_sparsify();
+    return 0;
+  }
+
+  // SPARSIFY: issue the cls sparsify method against the object.
+  void send_sparsify() {
+    I &image_ctx = this->m_image_ctx;
+    ldout(m_cct, 20) << dendl;
+
+    librados::ObjectWriteOperation op;
+    cls_client::sparsify(&op, m_sparse_size, m_remove_empty);
+    auto comp = create_rados_callback<
+      C_SparsifyObject, &C_SparsifyObject::handle_sparsify>(this);
+    int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op);
+    ceph_assert(r == 0);
+    comp->release();
+  }
+
+  void handle_sparsify(int r) {
+    ldout(m_cct, 20) << "r=" << r << dendl;
+
+    if (r == -EOPNOTSUPP) {
+      // cls method not available: switch to the read + trim path
+      m_trying_trim = true;
+      send_read();
+      return;
+    }
+
+    if (r == -ENOENT) {
+      finish_op(0);
+      return;
+    }
+
+    if (r < 0) {
+      lderr(m_cct) << "failed to sparsify: " << cpp_strerror(r) << dendl;
+      finish_op(r);
+      return;
+    }
+
+    send_pre_update_object_map();
+  }
+
+  // PRE UPDATE OBJECT MAP: mark the object as pending in the object map.
+  // The update is skipped when empty objects may not be removed or the
+  // object map feature is off, and on the trim path also when the object is
+  // only truncated (m_new_end != 0).  Note that on the skipped trim path
+  // send_trim() runs without m_finish_op_ctx being set.
+  void send_pre_update_object_map() {
+    I &image_ctx = this->m_image_ctx;
+
+    if (m_trying_trim) {
+      if (!m_remove_empty || m_new_end != 0 ||
+          !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+        send_trim();
+        return;
+      }
+    } else if (!m_remove_empty ||
+               !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+      finish_op(0);
+      return;
+    }
+
+    ldout(m_cct, 20) << dendl;
+
+    image_ctx.owner_lock.get_read();
+    image_ctx.snap_lock.get_read();
+    if (image_ctx.object_map == nullptr) {
+      // possible that exclusive lock was lost in background
+      lderr(m_cct) << "object map is not initialized" << dendl;
+
+      image_ctx.snap_lock.put_read();
+      image_ctx.owner_lock.put_read();
+      finish_op(-EINVAL);
+      return;
+    }
+
+    int r;
+    m_finish_op_ctx = image_ctx.exclusive_lock->start_op(&r);
+    if (m_finish_op_ctx == nullptr) {
+      lderr(m_cct) << "lost exclusive lock" << dendl;
+      image_ctx.snap_lock.put_read();
+      image_ctx.owner_lock.put_read();
+      finish_op(r);
+      return;
+    }
+
+    auto ctx = create_context_callback<
+      C_SparsifyObject<I>,
+      &C_SparsifyObject<I>::handle_pre_update_object_map>(this);
+
+    image_ctx.object_map_lock.get_write();
+    bool sent = image_ctx.object_map->template aio_update<
+      Context, &Context::complete>(CEPH_NOSNAP, m_object_no, OBJECT_PENDING,
+                                   OBJECT_EXISTS, {}, false, ctx);
+
+    // NOTE: state machine might complete before we reach here
+    image_ctx.object_map_lock.put_write();
+    image_ctx.snap_lock.put_read();
+    image_ctx.owner_lock.put_read();
+    if (!sent) {
+      finish_op(0);
+    }
+  }
+
+  void handle_pre_update_object_map(int r) {
+    ldout(m_cct, 20) << "r=" << r << dendl;
+
+    if (r < 0) {
+      lderr(m_cct) << "failed to update object map: " << cpp_strerror(r)
+                   << dendl;
+      finish_op(r);
+      return;
+    }
+
+    if (m_trying_trim) {
+      send_trim();
+    } else {
+      send_check_exists();
+    }
+  }
+
+  // CHECK EXISTS: stat the object to learn whether sparsify removed it.
+  void send_check_exists() {
+    I &image_ctx = this->m_image_ctx;
+
+    ldout(m_cct, 20) << dendl;
+
+    librados::ObjectReadOperation op;
+    op.stat(NULL, NULL, NULL);
+    m_bl.clear();
+    auto comp = create_rados_callback<
+      C_SparsifyObject, &C_SparsifyObject::handle_check_exists>(this);
+    int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl);
+    ceph_assert(r == 0);
+    comp->release();
+  }
+
+  void handle_check_exists(int r) {
+    ldout(m_cct, 20) << "r=" << r << dendl;
+
+    if (r < 0 && r != -ENOENT) {
+      lderr(m_cct) << "stat failed: " << cpp_strerror(r) << dendl;
+      finish_op(r);
+      return;
+    }
+
+    send_post_update_object_map(r == 0);
+  }
+
+  // POST UPDATE OBJECT MAP: move the object from pending to its final state
+  // (exists / nonexistent).
+  void send_post_update_object_map(bool exists) {
+    I &image_ctx = this->m_image_ctx;
+
+    ldout(m_cct, 20) << dendl;
+
+    auto ctx = create_context_callback<
+      C_SparsifyObject<I>,
+      &C_SparsifyObject<I>::handle_post_update_object_map>(this);
+    bool sent;
+    {
+      RWLock::RLocker owner_locker(image_ctx.owner_lock);
+      RWLock::RLocker snap_locker(image_ctx.snap_lock);
+
+      // ceph_assert (as used throughout this class) rather than plain assert
+      ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+      ceph_assert(image_ctx.object_map != nullptr);
+
+      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
+
+      sent = image_ctx.object_map->template aio_update<
+        Context, &Context::complete>(CEPH_NOSNAP, m_object_no,
+                                     exists ? OBJECT_EXISTS : OBJECT_NONEXISTENT,
+                                     OBJECT_PENDING, {}, false, ctx);
+    }
+    if (!sent) {
+      ctx->complete(0);
+    }
+  }
+
+  void handle_post_update_object_map(int r) {
+    ldout(m_cct, 20) << "r=" << r << dendl;
+
+    if (r < 0) {
+      lderr(m_cct) << "failed to update object map: " << cpp_strerror(r)
+                   << dendl;
+      finish_op(r);
+      return;
+    }
+
+    finish_op(0);
+  }
+
+  // READ: sparse-read the whole object so its zeroed tail can be detected.
+  void send_read() {
+    I &image_ctx = this->m_image_ctx;
+
+    ldout(m_cct, 20) << dendl;
+
+    librados::ObjectReadOperation op;
+    m_bl.clear();
+    op.sparse_read(0, image_ctx.layout.object_size, &m_extent_map, &m_bl,
+                   nullptr);
+    auto comp = create_rados_callback<
+      C_SparsifyObject, &C_SparsifyObject::handle_read>(this);
+    int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl);
+    ceph_assert(r == 0);
+    comp->release();
+  }
+
+  void handle_read(int r) {
+    ldout(m_cct, 20) << "r=" << r << dendl;
+
+    if (r < 0) {
+      if (r == -ENOENT) {
+        r = 0;
+      } else {
+        lderr(m_cct) << "failed to read object: " << cpp_strerror(r) << dendl;
+      }
+      finish_op(r);
+      return;
+    }
+
+    if (!may_be_trimmed(m_extent_map, m_bl, m_sparse_size, &m_new_end)) {
+      finish_op(0);
+      return;
+    }
+
+    send_pre_update_object_map();
+  }
+
+  // TRIM: truncate (or remove) the object, guarded by a cmpext that checks
+  // the range about to be dropped is still all zeroes.
+  void send_trim() {
+    I &image_ctx = this->m_image_ctx;
+
+    ldout(m_cct, 20) << dendl;
+
+    ceph_assert(m_new_end < image_ctx.layout.object_size);
+
+    librados::ObjectWriteOperation op;
+    m_bl.clear();
+    m_bl.append_zero(image_ctx.layout.object_size - m_new_end);
+    op.cmpext(m_new_end, m_bl, nullptr);
+    if (m_new_end == 0 && m_remove_empty) {
+      op.remove();
+    } else {
+      op.truncate(m_new_end);
+    }
+
+    auto comp = create_rados_callback<
+      C_SparsifyObject, &C_SparsifyObject::handle_trim>(this);
+    int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op);
+    ceph_assert(r == 0);
+    comp->release();
+  }
+
+  void handle_trim(int r) {
+    I &image_ctx = this->m_image_ctx;
+
+    ldout(m_cct, 20) << "r=" << r << dendl;
+
+    if (r <= -MAX_ERRNO) {
+      // the guard comparison failed (concurrent update, see diagram): retry
+      // from READ.  The exclusive-lock op is only started when the object
+      // map was pre-updated, so m_finish_op_ctx may legitimately be null
+      // here and must not be dereferenced unconditionally.
+      if (m_finish_op_ctx != nullptr) {
+        m_finish_op_ctx->complete(0);
+        m_finish_op_ctx = nullptr;
+      }
+      send_read();
+      return;
+    }
+
+    if (r < 0 && r != -ENOENT) {
+      lderr(m_cct) << "failed to trim: " << cpp_strerror(r) << dendl;
+      finish_op(r);
+      return;
+    }
+
+    if (!m_remove_empty || m_new_end != 0 ||
+        !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+      finish_op(0);
+      return;
+    }
+
+    send_post_update_object_map(false);
+  }
+
+  // Completes the task with `r`, first finishing the exclusive-lock op if one
+  // was started.
+  void finish_op(int r) {
+    ldout(m_cct, 20) << "r=" << r << dendl;
+
+    if (m_finish_op_ctx != nullptr) {
+      m_finish_op_ctx->complete(0);
+    }
+    this->complete(r);
+  }
+
+private:
+  CephContext *m_cct;
+  uint64_t m_object_no;
+  size_t m_sparse_size;
+  std::string m_oid;
+
+  bool m_remove_empty = false;   // object may be removed when fully zeroed
+  bool m_trying_trim = false;    // set once the read + trim fallback is used
+  bufferlist m_bl;
+  std::map<uint64_t,uint64_t> m_extent_map;
+  uint64_t m_new_end = 0;        // object end computed by may_be_trimmed()
+  Context *m_finish_op_ctx = nullptr;  // from exclusive_lock->start_op()
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::SparsifyRequest: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+bool SparsifyRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+// Entry point of the operation: starts sparsifying the image's objects.
+template <typename I>
+void SparsifyRequest<I>::send_op() {
+  this->sparsify_objects();
+}
+
+// Starts one C_SparsifyObject task per object of the image head through an
+// AsyncObjectThrottle, limited by the rbd_concurrent_management_ops setting;
+// handle_sparsify_objects() is registered as the completion callback.
+// owner_lock must be held by the caller.
+template <typename I>
+void SparsifyRequest<I>::sparsify_objects() {
+  I &image_ctx = this->m_image_ctx;
+  // checked once; the former second, plain assert() of the same condition was
+  // redundant
+  ceph_assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  uint64_t objects = 0;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    objects = image_ctx.get_object_count(CEPH_NOSNAP);
+  }
+
+  auto ctx = create_context_callback<
+    SparsifyRequest<I>,
+    &SparsifyRequest<I>::handle_sparsify_objects>(this);
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_SparsifyObject<I> >(),
+      boost::lambda::_1, &image_ctx, boost::lambda::_2, m_sparse_size));
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, objects);
+  throttle->start_ops(
+    image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+// Completion of the per-object throttle: an interruption (-ERESTART) is only
+// noted at debug level, any other failure is logged as an error; the request
+// is then completed with the error, or with 0 on success.
+template <typename I>
+void SparsifyRequest<I>::handle_sparsify_objects(int r) {
+  CephContext *cct = this->m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -ERESTART) {
+      ldout(cct, 5) << "sparsify operation interrupted" << dendl;
+    } else {
+      lderr(cct) << "sparsify encountered an error: " << cpp_strerror(r) << dendl;
+    }
+    this->complete(r);
+    return;
+  }
+
+  this->complete(0);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SparsifyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SparsifyRequest.h b/src/librbd/operation/SparsifyRequest.h
new file mode 100644
index 00000000..74f9eb72
--- /dev/null
+++ b/src/librbd/operation/SparsifyRequest.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "common/snap_types.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+// Operation request that sparsifies the objects of an image using chunks of
+// `sparse_size` bytes, reporting progress through `prog_ctx`.
+template <typename ImageCtxT = ImageCtx>
+class SparsifyRequest : public Request<ImageCtxT>
+{
+public:
+ // prog_ctx is stored by reference and therefore has to outlive the request
+ SparsifyRequest(ImageCtxT &image_ctx, size_t sparse_size, Context *on_finish,
+ ProgressContext &prog_ctx)
+ : Request<ImageCtxT>(image_ctx, on_finish), m_sparse_size(sparse_size),
+ m_prog_ctx(prog_ctx) {
+ }
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ bool can_affect_io() const override {
+ return true;
+ }
+ // aborts if ever called: no journal event is defined for sparsify here
+ journal::Event create_event(uint64_t op_tid) const override {
+ ceph_abort();
+ return journal::UnknownEvent();
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v
+ * SPARSIFY OBJECTS
+ *    |
+ *    v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ size_t m_sparse_size;
+ ProgressContext &m_prog_ctx;
+
+ void sparsify_objects();
+ void handle_sparsify_objects(int r);
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SparsifyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H
diff --git a/src/librbd/operation/TrimRequest.cc b/src/librbd/operation/TrimRequest.cc
new file mode 100644
index 00000000..15f6a5df
--- /dev/null
+++ b/src/librbd/operation/TrimRequest.cc
@@ -0,0 +1,377 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/TrimRequest.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "common/ContextCompletion.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+
+#include <boost/bind.hpp>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::TrimRequest: "
+
+namespace librbd {
+namespace operation {
+
+/**
+ * Throttled per-object context used while trimming: issues a discard that
+ * covers one whole object through the object dispatcher (logged as
+ * "removing (with copyup)"). The discard is sent with the snapshot context
+ * captured at construction and with object map updates disabled.
+ */
+template <typename I>
+class C_CopyupObject : public C_AsyncObjectThrottle<I> {
+public:
+  C_CopyupObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                 ::SnapContext snapc, uint64_t object_no)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snapc(snapc),
+      m_object_no(object_no)
+  {
+  }
+
+  /**
+   * Dispatch the discard for m_object_no.
+   *
+   * @return always 0; this context is handed to the dispatch spec as its
+   *         completion.
+   */
+  int send() override {
+    I &image_ctx = this->m_image_ctx;
+
+    // caller must hold the owner lock and (if enabled) the exclusive lock
+    ceph_assert(image_ctx.owner_lock.is_locked());
+    ceph_assert(image_ctx.exclusive_lock == nullptr ||
+                image_ctx.exclusive_lock->is_lock_owner());
+
+    std::string object_name = image_ctx.get_object_name(m_object_no);
+    ldout(image_ctx.cct, 10) << "removing (with copyup) " << object_name << dendl;
+
+    // discard [0, object_size) of the object
+    auto discard_spec = io::ObjectDispatchSpec::create_discard(
+      &image_ctx, io::OBJECT_DISPATCH_LAYER_NONE, object_name, m_object_no, 0,
+      image_ctx.layout.object_size, m_snapc,
+      io::OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE, 0, {}, this);
+    discard_spec->send();
+    return 0;
+  }
+
+private:
+  ::SnapContext m_snapc;
+  uint64_t m_object_no;
+};
+
+/**
+ * Throttled per-object context used while trimming: removes a single data
+ * object with an asynchronous RADOS remove, unless the object map says the
+ * object cannot exist.
+ */
+template <typename I>
+class C_RemoveObject : public C_AsyncObjectThrottle<I> {
+public:
+  C_RemoveObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                 uint64_t object_no)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no)
+  {
+  }
+
+  /**
+   * Issue the removal of m_object_no.
+   *
+   * @return 1 when the object map reports the object as non-existent (no
+   *         request is issued), 0 once the async remove has been queued.
+   */
+  int send() override {
+    I &image_ctx = this->m_image_ctx;
+
+    // caller must hold the owner lock and (if enabled) the exclusive lock
+    ceph_assert(image_ctx.owner_lock.is_locked());
+    ceph_assert(image_ctx.exclusive_lock == nullptr ||
+                image_ctx.exclusive_lock->is_lock_owner());
+
+    // consult the object map (under snap_lock) before touching the OSDs
+    bool known_absent;
+    {
+      RWLock::RLocker snap_locker(image_ctx.snap_lock);
+      known_absent = (image_ctx.object_map != nullptr &&
+                      !image_ctx.object_map->object_may_exist(m_object_no));
+    }
+    if (known_absent) {
+      return 1;
+    }
+
+    std::string object_name = image_ctx.get_object_name(m_object_no);
+    ldout(image_ctx.cct, 10) << "removing " << object_name << dendl;
+
+    librados::AioCompletion *remove_comp = util::create_rados_callback(this);
+    int r = image_ctx.data_ctx.aio_remove(object_name, remove_comp);
+    ceph_assert(r == 0);
+    remove_comp->release();
+    return 0;
+  }
+
+private:
+  uint64_t m_object_no;
+};
+
+/**
+ * Pre-compute the trim boundaries for shrinking an image from
+ * original_size to new_size:
+ *  - m_delete_off:   new size rounded up to a whole stripe period, capped
+ *                    at original_size
+ *  - m_delete_start: first object number past the retained stripe periods
+ *                    (also recorded as m_delete_start_min)
+ *  - m_num_objects:  number of objects covering original_size
+ */
+template <typename I>
+TrimRequest<I>::TrimRequest(I &image_ctx, Context *on_finish,
+                            uint64_t original_size, uint64_t new_size,
+                            ProgressContext &prog_ctx)
+  : AsyncRequest<I>(image_ctx, on_finish), m_new_size(new_size),
+    m_prog_ctx(prog_ctx)
+{
+  // number of stripe periods needed to hold the new size (rounded up)
+  const uint64_t stripe_period = image_ctx.get_stripe_period();
+  const uint64_t kept_periods =
+    (m_new_size + stripe_period - 1) / stripe_period;
+
+  m_delete_off = std::min(kept_periods * stripe_period, original_size);
+
+  // first object we can delete free and clear
+  m_delete_start = kept_periods * image_ctx.get_stripe_count();
+  m_delete_start_min = m_delete_start;
+  m_num_objects = Striper::get_num_objects(image_ctx.layout, original_size);
+
+  ldout(image_ctx.cct, 10) << this << " trim image " << original_size << " -> "
+                           << m_new_size << " periods " << kept_periods
+                           << " discard to offset " << m_delete_off
+                           << " delete objects " << m_delete_start
+                           << " to " << m_num_objects << dendl;
+}
+
+/**
+ * Advance the trim state machine after the step identified by m_state has
+ * completed with result r.
+ *
+ * @return true when the request is done (any negative r, or the FINISHED
+ *         state was reached); false when a follow-up step was started.
+ */
+template <typename I>
+bool TrimRequest<I>::should_complete(int r)
+{
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " should_complete: r=" << r << dendl;
+
+  // any failure terminates the request; an interruption is not an error
+  if (r < 0) {
+    if (r == -ERESTART) {
+      ldout(cct, 5) << "trim operation interrupted" << dendl;
+    } else {
+      lderr(cct) << "trim encountered an error: " << cpp_strerror(r) << dendl;
+    }
+    return true;
+  }
+
+  // the send_* helpers assert that the owner lock is held
+  RWLock::RLocker owner_lock(image_ctx.owner_lock);
+  bool finished = false;
+  switch (m_state) {
+  case STATE_PRE_TRIM:
+    ldout(cct, 5) << " PRE_TRIM" << dendl;
+    send_copyup_objects();
+    break;
+
+  case STATE_COPYUP_OBJECTS:
+    ldout(cct, 5) << " COPYUP_OBJECTS" << dendl;
+    send_remove_objects();
+    break;
+
+  case STATE_REMOVE_OBJECTS:
+    ldout(cct, 5) << " REMOVE_OBJECTS" << dendl;
+    send_post_trim();
+    break;
+
+  case STATE_POST_TRIM:
+    ldout(cct, 5) << " POST_TRIM" << dendl;
+    send_clean_boundary();
+    break;
+
+  case STATE_CLEAN_BOUNDARY:
+    ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl;
+    send_finish(0);
+    break;
+
+  case STATE_FINISHED:
+    ldout(cct, 5) << "FINISHED" << dendl;
+    finished = true;
+    break;
+
+  default:
+    lderr(cct) << "invalid state: " << m_state << dendl;
+    ceph_abort();
+    break;
+  }
+  return finished;
+}
+
+/**
+ * Entry point: start with the PRE_TRIM step, or finish with -ENODEV when
+ * the image's data IoCtx is not valid.
+ */
+template <typename I>
+void TrimRequest<I>::send() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  if (image_ctx.data_ctx.is_valid()) {
+    send_pre_trim();
+    return;
+  }
+
+  lderr(cct) << "missing data pool" << dendl;
+  send_finish(-ENODEV);
+}
+
+/**
+ * PRE_TRIM step: when an object map is present, mark the objects in
+ * [m_delete_start_min, m_num_objects) that are currently OBJECT_EXISTS as
+ * OBJECT_PENDING. Jumps straight to the boundary cleanup when there are no
+ * whole objects to delete, and to the copyup step when no object map
+ * update was issued.
+ */
+template<typename I>
+void TrimRequest<I>::send_pre_trim() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+
+  // nothing but (possibly) a partial boundary object to clean up
+  if (m_delete_start >= m_num_objects) {
+    send_clean_boundary();
+    return;
+  }
+
+  bool update_issued = false;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    if (image_ctx.object_map != nullptr) {
+      ldout(image_ctx.cct, 5) << this << " send_pre_trim: "
+                              << " delete_start_min=" << m_delete_start_min
+                              << " num_objects=" << m_num_objects << dendl;
+      m_state = STATE_PRE_TRIM;
+
+      ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+
+      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
+      update_issued =
+        image_ctx.object_map->template aio_update<AsyncRequest<I> >(
+          CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_PENDING,
+          OBJECT_EXISTS, {}, false, this);
+    }
+  }
+
+  // the state machine resumes from the update's completion callback
+  if (update_issued) {
+    return;
+  }
+
+  send_copyup_objects();
+}
+
+/**
+ * COPYUP_OBJECTS step: for every to-be-deleted object that still overlaps
+ * the parent image, issue a C_CopyupObject through an object throttle.
+ * The step is skipped (falling through to object removal) when the image
+ * has no snapshots or the parent overlap ends before m_delete_start.
+ * On entry to the throttle, m_delete_start is advanced to the end of the
+ * copyup range so the removal step starts after it.
+ */
+template<typename I>
+void TrimRequest<I>::send_copyup_objects() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+
+  // snapshot the state needed below while holding the snap/parent locks
+  ::SnapContext snapc;
+  bool has_snapshots;
+  uint64_t parent_overlap;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    RWLock::RLocker parent_locker(image_ctx.parent_lock);
+
+    snapc = image_ctx.snapc;
+    has_snapshots = !image_ctx.snaps.empty();
+    int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
+    ceph_assert(r == 0);
+  }
+
+  // copyup is only required for portion of image that overlaps parent
+  const uint64_t copyup_end = Striper::get_num_objects(image_ctx.layout,
+                                                       parent_overlap);
+
+  // TODO: protect against concurrent shrink and snap create?
+  // skip to remove if no copyup is required.
+  const bool copyup_required = has_snapshots && (copyup_end > m_delete_start);
+  if (!copyup_required) {
+    send_remove_objects();
+    return;
+  }
+
+  const uint64_t copyup_start = m_delete_start;
+  m_delete_start = copyup_end;
+
+  ldout(image_ctx.cct, 5) << this << " send_copyup_objects: "
+                          << " start object=" << copyup_start << ", "
+                          << " end object=" << copyup_end << dendl;
+  m_state = STATE_COPYUP_OBJECTS;
+
+  Context *on_copyup_done = this->create_callback_context();
+  // the factory's two placeholders feed the C_CopyupObject constructor's
+  // throttle and object number arguments
+  typename AsyncObjectThrottle<I>::ContextFactory copyup_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject<I> >(),
+      boost::lambda::_1, &image_ctx, snapc, boost::lambda::_2));
+  auto *object_throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, copyup_factory, on_copyup_done, &m_prog_ctx,
+    copyup_start, copyup_end);
+
+  const uint64_t max_concurrent_ops =
+    image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops");
+  object_throttle->start_ops(max_concurrent_ops);
+}
+
+/**
+ * REMOVE_OBJECTS step: issue a C_RemoveObject for every object number in
+ * [m_delete_start, m_num_objects) through an object throttle limited by
+ * the "rbd_concurrent_management_ops" setting.
+ */
+template <typename I>
+void TrimRequest<I>::send_remove_objects() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+
+  ldout(image_ctx.cct, 5) << this << " send_remove_objects: "
+                          << " delete_start=" << m_delete_start
+                          << " num_objects=" << m_num_objects << dendl;
+  m_state = STATE_REMOVE_OBJECTS;
+
+  Context *on_removed = this->create_callback_context();
+  // the factory's two placeholders feed the C_RemoveObject constructor's
+  // throttle and object number arguments
+  typename AsyncObjectThrottle<I>::ContextFactory remove_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject<I> >(),
+      boost::lambda::_1, &image_ctx, boost::lambda::_2));
+  auto *object_throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, remove_factory, on_removed, &m_prog_ctx, m_delete_start,
+    m_num_objects);
+
+  const uint64_t max_concurrent_ops =
+    image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops");
+  object_throttle->start_ops(max_concurrent_ops);
+}
+
+/**
+ * POST_TRIM step: when an object map is present, flip the objects in
+ * [m_delete_start_min, m_num_objects) from OBJECT_PENDING to
+ * OBJECT_NONEXISTENT. Continues with the boundary cleanup when no object
+ * map update was issued.
+ */
+template<typename I>
+void TrimRequest<I>::send_post_trim() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+
+  bool update_issued = false;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    if (image_ctx.object_map != nullptr) {
+      ldout(image_ctx.cct, 5) << this << " send_post_trim:"
+                              << " delete_start_min=" << m_delete_start_min
+                              << " num_objects=" << m_num_objects << dendl;
+      m_state = STATE_POST_TRIM;
+
+      ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+
+      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
+      update_issued =
+        image_ctx.object_map->template aio_update<AsyncRequest<I> >(
+          CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_NONEXISTENT,
+          OBJECT_PENDING, {}, false, this);
+    }
+  }
+
+  // the state machine resumes from the update's completion callback
+  if (update_issued) {
+    return;
+  }
+
+  send_clean_boundary();
+}
+
+/**
+ * CLEAN_BOUNDARY step: discard the byte range [m_new_size, m_delete_off),
+ * i.e. the partially retained stripe period at the new end of the image.
+ * Finishes immediately when that range is empty.
+ */
+template <typename I>
+void TrimRequest<I>::send_clean_boundary() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(image_ctx.owner_lock.is_locked());
+  CephContext *cct = image_ctx.cct;
+  if (m_delete_off <= m_new_size) {
+    send_finish(0);
+    return;
+  }
+
+  // should have been canceled prior to releasing lock
+  ceph_assert(image_ctx.exclusive_lock == nullptr ||
+              image_ctx.exclusive_lock->is_lock_owner());
+  const uint64_t boundary_len = m_delete_off - m_new_size;
+  ldout(image_ctx.cct, 5) << this << " send_clean_boundary: "
+                          << " delete_off=" << m_delete_off
+                          << " length=" << boundary_len << dendl;
+  m_state = STATE_CLEAN_BOUNDARY;
+
+  ::SnapContext snapc;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    snapc = image_ctx.snapc;
+  }
+
+  // discard the weird boundary
+  std::vector<ObjectExtent> boundary_extents;
+  Striper::file_to_extents(cct, image_ctx.format_string,
+                           &image_ctx.layout, m_new_size, boundary_len, 0,
+                           boundary_extents);
+
+  // one aggregate completion; each per-extent discard registers against it
+  ContextCompletion *aggregate =
+    new ContextCompletion(this->create_async_callback_context(), true);
+  for (auto &extent : boundary_extents) {
+    ldout(cct, 20) << " ex " << extent << dendl;
+    Context *extent_comp = new C_ContextCompletion(*aggregate);
+
+    if (extent.offset == 0) {
+      // treat as a full object delete on the boundary
+      extent.length = image_ctx.layout.object_size;
+    }
+
+    auto discard_spec = io::ObjectDispatchSpec::create_discard(
+      &image_ctx, io::OBJECT_DISPATCH_LAYER_NONE, extent.oid.name,
+      extent.objectno, extent.offset, extent.length, snapc, 0, 0, {},
+      extent_comp);
+    discard_spec->send();
+  }
+  aggregate->finish_adding_requests();
+}
+
+// Moves the state machine to STATE_FINISHED and completes the request
+// asynchronously with result r; should_complete() returns true once it is
+// re-entered in that state (or immediately for a negative r).
+template <typename I>
+void TrimRequest<I>::send_finish(int r) {
+ m_state = STATE_FINISHED;
+ this->async_complete(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::TrimRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/TrimRequest.h b/src/librbd/operation/TrimRequest.h
new file mode 100644
index 00000000..8526046c
--- /dev/null
+++ b/src/librbd/operation/TrimRequest.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
+
+#include "librbd/AsyncRequest.h"
+
+namespace librbd
+{
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+/**
+ * Asynchronous request that trims an image from original_size down to
+ * new_size by removing whole objects past the new end, discarding the
+ * partial boundary, and keeping the object map (if any) in sync.
+ */
+template <typename ImageCtxT = ImageCtx>
+class TrimRequest : public AsyncRequest<ImageCtxT>
+{
+public:
+ // Heap-allocates a request; the arguments are forwarded unchanged to the
+ // constructor.
+ static TrimRequest *create(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t original_size, uint64_t new_size,
+ ProgressContext &prog_ctx) {
+ return new TrimRequest(image_ctx, on_finish, original_size, new_size,
+ prog_ctx);
+ }
+
+ TrimRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t original_size, uint64_t new_size,
+ ProgressContext &prog_ctx);
+
+ void send() override;
+
+protected:
+ /**
+ * Trim goes through the following state machine to remove whole objects,
+ * clean partially trimmed objects, and update the object map:
+ *
+ * @verbatim
+ *
+ * <start> . . . . . . . . . . . . . . . . .
+ *    |                                    .
+ *    v (skip if not needed)               .
+ * STATE_PRE_TRIM                          .
+ *    |                                    .
+ *    v (skip if not needed)               .
+ * STATE_COPYUP_OBJECTS                    .
+ *    |                                    .
+ *    v (skip if not needed)               .
+ * STATE_REMOVE_OBJECTS                    .
+ *    |                                    .
+ *    v (skip if not needed)               .
+ * STATE_POST_TRIM                         .
+ *    |                                    .
+ *    v (skip if not needed)               .
+ * STATE_CLEAN_BOUNDARY                    .
+ *    |                                    .
+ *    v                                    .
+ * STATE_FINISHED < . . . . . . . . . . . .
+ *    |
+ *    v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ * The _COPYUP_OBJECTS state is skipped if the parent overlap does not
+ * extend into the objects being removed or the image does not have any
+ * snapshots.
+ * The _PRE_TRIM/_POST_TRIM states are skipped if the object map
+ * isn't enabled. The _REMOVE_OBJECTS state is skipped if no whole objects
+ * are removed. The _CLEAN_BOUNDARY state is skipped if no boundary
+ * objects are cleaned. The state machine will immediately transition
+ * to _FINISHED state if there are no bytes to trim.
+ */
+
+ enum State {
+ STATE_PRE_TRIM,
+ STATE_COPYUP_OBJECTS,
+ STATE_REMOVE_OBJECTS,
+ STATE_POST_TRIM,
+ STATE_CLEAN_BOUNDARY,
+ STATE_FINISHED
+ };
+
+ bool should_complete(int r) override;
+
+ // step that was most recently started; consulted by should_complete()
+ State m_state = STATE_PRE_TRIM;
+
+private:
+ // first object number still to be removed (advanced past the copyup range)
+ uint64_t m_delete_start;
+ // original value of m_delete_start; lower bound for object map updates
+ uint64_t m_delete_start_min = 0;
+ // number of objects covering the original image size
+ uint64_t m_num_objects;
+ // end offset (bytes) of the boundary range discarded by CLEAN_BOUNDARY
+ uint64_t m_delete_off;
+ // requested image size in bytes
+ uint64_t m_new_size;
+ // caller-owned progress context (held by reference)
+ ProgressContext &m_prog_ctx;
+
+ void send_pre_trim();
+ void send_copyup_objects();
+ void send_remove_objects();
+ void send_post_trim();
+
+ void send_clean_boundary();
+ void send_finish(int r);
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::TrimRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
diff --git a/src/librbd/trash/MoveRequest.cc b/src/librbd/trash/MoveRequest.cc
new file mode 100644
index 00000000..526e92ab
--- /dev/null
+++ b/src/librbd/trash/MoveRequest.cc
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/trash/MoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::trash::MoveRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace trash {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Entry point: starts the state machine with the TRASH_ADD step.
+template <typename I>
+void MoveRequest<I>::send() {
+ trash_add();
+}
+
+// TRASH_ADD step: asynchronously record (image id -> trash image spec) in
+// the RBD_TRASH object; completion is delivered to handle_trash_add().
+template <typename I>
+void MoveRequest<I>::trash_add() {
+  ldout(m_cct, 10) << dendl;
+
+  using ThisRequest = MoveRequest<I>;
+
+  librados::ObjectWriteOperation trash_op;
+  librbd::cls_client::trash_add(&trash_op, m_image_id, m_trash_image_spec);
+
+  auto completion = create_rados_callback<
+    ThisRequest, &ThisRequest::handle_trash_add>(this);
+  int r = m_io_ctx.aio_operate(RBD_TRASH, completion, &trash_op);
+  ceph_assert(r == 0);
+  // the in-flight operation keeps its own reference
+  completion->release();
+}
+
+// Completion of TRASH_ADD. -EEXIST is tolerated (the entry is left over
+// from an earlier, unfinished move); any other error ends the request.
+template <typename I>
+void MoveRequest<I>::handle_trash_add(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  const bool entry_already_present = (r == -EEXIST);
+  if (r < 0 && !entry_already_present) {
+    lderr(m_cct) << "failed to add image to trash: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  if (entry_already_present) {
+    ldout(m_cct, 10) << "previous unfinished deferred remove for image: "
+                     << m_image_id << dendl;
+  }
+
+  remove_id();
+}
+
+// REMOVE_ID step: asynchronously delete the image's id object (named after
+// the image name stored in the trash spec); completion is delivered to
+// handle_remove_id().
+template <typename I>
+void MoveRequest<I>::remove_id() {
+  ldout(m_cct, 10) << dendl;
+
+  using ThisRequest = MoveRequest<I>;
+
+  auto completion = create_rados_callback<
+    ThisRequest, &ThisRequest::handle_remove_id>(this);
+  int r = m_io_ctx.aio_remove(util::id_obj_name(m_trash_image_spec.name),
+                              completion);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Completion of REMOVE_ID. A missing id object (-ENOENT) counts as
+// success; any other error ends the request.
+template <typename I>
+void MoveRequest<I>::handle_remove_id(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  const bool id_object_gone = (r >= 0 || r == -ENOENT);
+  if (id_object_gone) {
+    directory_remove();
+    return;
+  }
+
+  lderr(m_cct) << "failed to remove image id object: " << cpp_strerror(r)
+               << dendl;
+  finish(r);
+}
+
+// DIRECTORY_REMOVE step: asynchronously drop the (name, id) pair from the
+// RBD_DIRECTORY object; completion is delivered to
+// handle_directory_remove().
+template <typename I>
+void MoveRequest<I>::directory_remove() {
+  ldout(m_cct, 10) << dendl;
+
+  using ThisRequest = MoveRequest<I>;
+
+  librados::ObjectWriteOperation dir_op;
+  librbd::cls_client::dir_remove_image(&dir_op, m_trash_image_spec.name,
+                                       m_image_id);
+
+  auto completion = create_rados_callback<
+    ThisRequest, &ThisRequest::handle_directory_remove>(this);
+  int r = m_io_ctx.aio_operate(RBD_DIRECTORY, completion, &dir_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+/**
+ * Completion of DIRECTORY_REMOVE, the last step of the move-to-trash
+ * sequence.
+ *
+ * A missing directory entry (-ENOENT) is treated as success, matching how
+ * handle_remove_id() treats a missing id object and how handle_trash_add()
+ * tolerates a left-over trash entry: re-running an interrupted move must
+ * not fail just because an earlier attempt already removed the entry.
+ * Previously -ENOENT was not logged as an error but was still passed on to
+ * the caller as the final result.
+ *
+ * @param r result of the directory remove operation
+ */
+template <typename I>
+void MoveRequest<I>::handle_directory_remove(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  // already removed: nothing left to do, report success
+  if (r == -ENOENT) {
+    r = 0;
+  }
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to remove image from directory: " << cpp_strerror(r)
+                 << dendl;
+  }
+
+  finish(r);
+}
+
+// Completes the caller's context with r and then destroys this request;
+// no member may be touched after this returns.
+template <typename I>
+void MoveRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace trash
+} // namespace librbd
+
+template class librbd::trash::MoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/trash/MoveRequest.h b/src/librbd/trash/MoveRequest.h
new file mode 100644
index 00000000..f0d470c1
--- /dev/null
+++ b/src/librbd/trash/MoveRequest.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
+#define CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
+
+#include "include/utime.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+
+struct CephContext;
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace trash {
+
+/**
+ * Self-deleting asynchronous request that moves an image into the trash:
+ * it adds the trash entry, removes the image id object and removes the
+ * image from the directory (see the state diagram in the private section).
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class MoveRequest {
+public:
+ // Heap-allocates a request; the request deletes itself in finish().
+ static MoveRequest* create(librados::IoCtx& io_ctx,
+ const std::string& image_id,
+ const cls::rbd::TrashImageSpec& trash_image_spec,
+ Context* on_finish) {
+ return new MoveRequest(io_ctx, image_id, trash_image_spec, on_finish);
+ }
+
+ // io_ctx is stored by reference and must outlive the request; image_id
+ // and trash_image_spec are copied. The CephContext is taken from io_ctx.
+ MoveRequest(librados::IoCtx& io_ctx, const std::string& image_id,
+ const cls::rbd::TrashImageSpec& trash_image_spec,
+ Context* on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id),
+ m_trash_image_spec(trash_image_spec), m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) {
+ }
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /*
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * TRASH_ADD
+ * |
+ * v
+ * REMOVE_ID
+ * |
+ * v
+ * DIRECTORY_REMOVE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_image_id;
+ cls::rbd::TrashImageSpec m_trash_image_spec;
+ // completed exactly once, from finish()
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+
+ void trash_add();
+ void handle_trash_add(int r);
+
+ void remove_id();
+ void handle_remove_id(int r);
+
+ void directory_remove();
+ void handle_directory_remove(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace trash
+} // namespace librbd
+
+extern template class librbd::trash::MoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
diff --git a/src/librbd/trash/RemoveRequest.cc b/src/librbd/trash/RemoveRequest.cc
new file mode 100644
index 00000000..77cbfd81
--- /dev/null
+++ b/src/librbd/trash/RemoveRequest.cc
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/trash/RemoveRequest.h"
+#include "common/WorkQueue.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::trash::RemoveRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace trash {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+// Entry point: starts with SET_STATE, which by default (see the member
+// initializers) attempts the NORMAL -> REMOVING transition.
+template <typename I>
+void RemoveRequest<I>::send() {
+ set_state();
+}
+
+// SET_STATE step: asynchronously switch the trash entry's state from
+// m_trash_expect_state to m_trash_set_state; completion is delivered to
+// handle_set_state().
+template <typename I>
+void RemoveRequest<I>::set_state() {
+  ldout(m_cct, 10) << dendl;
+
+  using ThisRequest = RemoveRequest<I>;
+
+  librados::ObjectWriteOperation state_op;
+  cls_client::trash_state_set(&state_op, m_image_id, m_trash_set_state,
+                              m_trash_expect_state);
+
+  auto completion = create_rados_callback<
+    ThisRequest, &ThisRequest::handle_set_state>(this);
+  int r = m_io_ctx.aio_operate(RBD_TRASH, completion, &state_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_set_state(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -EOPNOTSUPP) {
+ lderr(m_cct) << "error setting trash image state: " << cpp_strerror(r)
+ << dendl;
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ if (m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+ close_image();
+ } else {
+ finish(m_ret_val);
+ }
+ return;
+ }
+
+ if (m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+ remove_image();
+ } else {
+ ceph_assert(m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_NORMAL);
+ finish(m_ret_val < 0 ? m_ret_val : r);
+ };
+}
+
+// CLOSE_IMAGE step: close the caller-supplied image context, if any,
+// before finishing with the saved error. Without an image context the
+// request finishes right away.
+template <typename I>
+void RemoveRequest<I>::close_image() {
+  if (m_image_ctx != nullptr) {
+    ldout(m_cct, 10) << dendl;
+
+    using ThisRequest = RemoveRequest<I>;
+    auto on_closed = create_context_callback<
+      ThisRequest, &ThisRequest::handle_close_image>(this);
+    m_image_ctx->state->close(on_closed);
+    return;
+  }
+
+  finish(m_ret_val);
+}
+
+// Completion of CLOSE_IMAGE. A close failure is only logged; the image
+// context is destroyed either way and the request finishes with the error
+// saved in m_ret_val.
+template <typename I>
+void RemoveRequest<I>::handle_close_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(m_cct, 5) << "failed to close image:" << cpp_strerror(r) << dendl;
+ }
+
+ m_image_ctx->destroy();
+ m_image_ctx = nullptr;
+ finish(m_ret_val);
+}
+
+// REMOVE_IMAGE step: delegate to librbd::image::RemoveRequest, using the
+// already-open image context when one was supplied and the image id (with
+// an empty name) otherwise. Completion is delivered to
+// handle_remove_image().
+template <typename I>
+void RemoveRequest<I>::remove_image() {
+  ldout(m_cct, 10) << dendl;
+
+  using ThisRequest = RemoveRequest<I>;
+  auto ctx = create_context_callback<
+    ThisRequest, &ThisRequest::handle_remove_image>(this);
+
+  if (m_image_ctx == nullptr) {
+    auto by_id_req = librbd::image::RemoveRequest<I>::create(
+      m_io_ctx, "", m_image_id, m_force, true, m_prog_ctx, m_op_work_queue,
+      ctx);
+    by_id_req->send();
+    return;
+  }
+
+  auto by_ctx_req = librbd::image::RemoveRequest<I>::create(
+    m_io_ctx, m_image_ctx, m_force, true, m_prog_ctx, m_op_work_queue, ctx);
+  by_ctx_req->send();
+}
+
+// Completion of REMOVE_IMAGE. On success the image context pointer is
+// dropped and the trash entry is removed. On failure the error is saved
+// and the trash entry's state is rolled back from REMOVING to NORMAL.
+template <typename I>
+void RemoveRequest<I>::handle_remove_image(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    m_image_ctx = nullptr;
+    remove_trash_entry();
+    return;
+  }
+
+  ldout(m_cct, 5) << "failed to remove image:" << cpp_strerror(r) << dendl;
+
+  // remember the failure and undo the earlier state transition
+  m_ret_val = r;
+  m_trash_set_state = cls::rbd::TRASH_IMAGE_STATE_NORMAL;
+  m_trash_expect_state = cls::rbd::TRASH_IMAGE_STATE_REMOVING;
+  set_state();
+}
+
+// REMOVE_TRASH_ENTRY step: asynchronously delete the image's entry from
+// the RBD_TRASH object; completion is delivered to
+// handle_remove_trash_entry().
+template <typename I>
+void RemoveRequest<I>::remove_trash_entry() {
+  ldout(m_cct, 10) << dendl;
+
+  using ThisRequest = RemoveRequest<I>;
+
+  librados::ObjectWriteOperation trash_op;
+  cls_client::trash_remove(&trash_op, m_image_id);
+
+  auto completion = create_rados_callback<
+    ThisRequest, &ThisRequest::handle_remove_trash_entry>(this);
+  int r = m_io_ctx.aio_operate(RBD_TRASH, completion, &trash_op);
+  ceph_assert(r == 0);
+  completion->release();
+}
+
+// Completion of REMOVE_TRASH_ENTRY. Failures other than -ENOENT are
+// logged, but the request always finishes with 0 at this point.
+template <typename I>
+void RemoveRequest<I>::handle_remove_trash_entry(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  const bool unexpected_error = (r < 0 && r != -ENOENT);
+  if (unexpected_error) {
+    lderr(m_cct) << "error removing trash entry: " << cpp_strerror(r) << dendl;
+  }
+
+  finish(0);
+}
+
+// Completes the caller's context with r and then destroys this request;
+// no member may be touched after this returns.
+template <typename I>
+void RemoveRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace trash
+} // namespace librbd
+
+template class librbd::trash::RemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/trash/RemoveRequest.h b/src/librbd/trash/RemoveRequest.h
new file mode 100644
index 00000000..5f65a57c
--- /dev/null
+++ b/src/librbd/trash/RemoveRequest.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
+
+#include "include/utime.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+
+class CephContext;
+class Context;
+class ContextWQ;
+
+namespace librbd {
+
+class ProgressContext;
+
+struct ImageCtx;
+
+namespace trash {
+
+/**
+ * Self-deleting asynchronous request that permanently removes an image
+ * from the trash. The trash entry is first moved to the REMOVING state,
+ * then the image is removed and finally the trash entry itself; on
+ * failure the state is rolled back (see the diagram in the private
+ * section).
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class RemoveRequest {
+public:
+ // Creates a request that identifies the image by id only.
+ static RemoveRequest* create(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ ContextWQ *op_work_queue, bool force,
+ ProgressContext &prog_ctx, Context *on_finish) {
+ return new RemoveRequest(io_ctx, image_id, op_work_queue, force, prog_ctx,
+ on_finish);
+ }
+
+ // Creates a request for an image context supplied by the caller; the
+ // image id is read from image_ctx.
+ static RemoveRequest* create(librados::IoCtx &io_ctx, ImageCtxT *image_ctx,
+ ContextWQ *op_work_queue, bool force,
+ ProgressContext &prog_ctx, Context *on_finish) {
+ return new RemoveRequest(io_ctx, image_ctx, op_work_queue, force, prog_ctx,
+ on_finish);
+ }
+
+
+ // io_ctx and prog_ctx are stored by reference and must outlive the
+ // request; m_image_ctx keeps its default of nullptr.
+ RemoveRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+ ContextWQ *op_work_queue, bool force, ProgressContext &prog_ctx,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id), m_op_work_queue(op_work_queue),
+ m_force(force), m_prog_ctx(prog_ctx), m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) {
+ }
+
+ // m_image_id is initialized from m_image_ctx->id, which relies on
+ // m_image_ctx being declared (and therefore initialized) before it;
+ // image_ctx must not be null.
+ RemoveRequest(librados::IoCtx &io_ctx, ImageCtxT *image_ctx,
+ ContextWQ *op_work_queue, bool force, ProgressContext &prog_ctx,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_ctx(image_ctx), m_image_id(m_image_ctx->id),
+ m_op_work_queue(op_work_queue), m_force(force), m_prog_ctx(prog_ctx),
+ m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) {
+ }
+
+ // Starts the state machine.
+ void send();
+
+private:
+ /*
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * SET_STATE (removing) * * * * * * *> CLOSE_IMAGE
+ * | |
+ * v |
+ * REMOVE_IMAGE * * *> SET_STATE (normal) |
+ * | | |
+ * v | |
+ * REMOVE_TRASH_ENTRY | |
+ * | | |
+ * v | |
+ * <finish> <-------------/<---------------/
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ // optional image context supplied by the caller (nullptr when removing
+ // by id)
+ ImageCtxT *m_image_ctx = nullptr;
+ std::string m_image_id;
+ ContextWQ *m_op_work_queue;
+ bool m_force;
+ ProgressContext &m_prog_ctx;
+ // completed exactly once, from finish()
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+
+ // state transition requested by the next set_state() call:
+ // m_trash_expect_state -> m_trash_set_state
+ cls::rbd::TrashImageState m_trash_set_state =
+ cls::rbd::TRASH_IMAGE_STATE_REMOVING;
+ cls::rbd::TrashImageState m_trash_expect_state =
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL;
+ // first error encountered; reported once the request has wound down
+ int m_ret_val = 0;
+
+ void set_state();
+ void handle_set_state(int r);
+
+ void close_image();
+ void handle_close_image(int r);
+
+ void remove_image();
+ void handle_remove_image(int r);
+
+ void remove_trash_entry();
+ void handle_remove_trash_entry(int r);
+
+ void finish(int r);
+};
+
+} // namespace trash
+} // namespace librbd
+
+extern template class librbd::trash::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
diff --git a/src/librbd/trash_watcher/Types.cc b/src/librbd/trash_watcher/Types.cc
new file mode 100644
index 00000000..c95ea223
--- /dev/null
+++ b/src/librbd/trash_watcher/Types.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "librbd/trash_watcher/Types.h"
+#include "librbd/watcher/Utils.h"
+
+namespace librbd {
+namespace trash_watcher {
+
+namespace {
+
+// Variant visitor that dumps a payload to a Formatter: first the payload
+// type's NOTIFY_OP (as a "notify_op" string), then the payload's own
+// fields via its dump() method.
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ // NOTE(review): the constant is copied into a local before being passed
+ // to stringify(), presumably to avoid binding a reference to the
+ // in-class static const member -- confirm before simplifying.
+ NotifyOp notify_op = Payload::NOTIFY_OP;
+ m_formatter->dump_string("notify_op", stringify(notify_op));
+ payload.dump(m_formatter);
+ }
+
+private:
+ // not owned
+ ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+// Appends image_id followed by trash_image_spec to bl; decode() reads the
+// fields back in the same order.
+void ImageAddedPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(image_id, bl);
+ encode(trash_image_spec, bl);
+}
+
+// Reads image_id followed by trash_image_spec from iter. The version
+// argument is currently unused.
+void ImageAddedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(image_id, iter);
+ decode(trash_image_spec, iter);
+}
+
+// Dumps image_id as a string and trash_image_spec as a nested
+// "trash_image_spec" object section.
+void ImageAddedPayload::dump(Formatter *f) const {
+ f->dump_string("image_id", image_id);
+ f->open_object_section("trash_image_spec");
+ trash_image_spec.dump(f);
+ f->close_section();
+}
+
+// Appends image_id to bl.
+void ImageRemovedPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(image_id, bl);
+}
+
+// Reads image_id from iter. The version argument is currently unused.
+void ImageRemovedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(image_id, iter);
+}
+
+// Dumps image_id as a string.
+void ImageRemovedPayload::dump(Formatter *f) const {
+ f->dump_string("image_id", image_id);
+}
+
+// An unknown payload must never be sent: encoding one aborts.
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+// Intentionally a no-op: nothing is read from iter for an unrecognized op.
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+// Intentionally a no-op: an unknown payload has no fields to dump.
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+// Encodes the message as a version 1 (compat 1) structure whose body is
+// produced by applying EncodePayloadVisitor to the active payload.
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload);
+ ENCODE_FINISH(bl);
+}
+
+// Decodes a message: reads the 32-bit notify op, switches the payload
+// variant to the matching (default-constructed) payload type and then lets
+// DecodePayloadVisitor decode the payload body. Ops that are not
+// recognized are mapped to UnknownPayload.
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+  DECODE_START(1, iter);
+
+  uint32_t encoded_op;
+  decode(encoded_op, iter);
+
+  // select the correct payload variant based upon the encoded op
+  if (encoded_op == NOTIFY_OP_IMAGE_ADDED) {
+    payload = ImageAddedPayload();
+  } else if (encoded_op == NOTIFY_OP_IMAGE_REMOVED) {
+    payload = ImageRemovedPayload();
+  } else {
+    payload = UnknownPayload();
+  }
+
+  apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload);
+  DECODE_FINISH(iter);
+}
+
+// Dumps the active payload (notify op plus payload fields) to f.
+void NotifyMessage::dump(Formatter *f) const {
+ apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+// Appends one heap-allocated sample message per known payload type to o;
+// the raw pointers are handed to the caller.
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage{ImageAddedPayload{
+ "id", {cls::rbd::TRASH_IMAGE_SOURCE_USER, "name", {}, {}}}});
+ o.push_back(new NotifyMessage{ImageRemovedPayload{"id"}});
+}
+
+// Streams a human-readable name for a notify op; values outside the enum
+// are printed as "Unknown (<numeric value>)".
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+  switch (op) {
+  case NOTIFY_OP_IMAGE_ADDED:
+    return out << "ImageAdded";
+  case NOTIFY_OP_IMAGE_REMOVED:
+    return out << "ImageRemoved";
+  default:
+    return out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+  }
+}
+
+} // namespace trash_watcher
+} // namespace librbd
diff --git a/src/librbd/trash_watcher/Types.h b/src/librbd/trash_watcher/Types.h
new file mode 100644
index 00000000..22c2b437
--- /dev/null
+++ b/src/librbd/trash_watcher/Types.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
+#define CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <boost/variant.hpp>
+
+
+namespace librbd {
+namespace trash_watcher {
+
+// Operation codes identifying the payload carried by a trash watcher
+// notification.
+//
+// The underlying type is fixed to uint32_t, the width the op is cast to when
+// it is encoded, so that every 32-bit value -- including the
+// static_cast<NotifyOp>(-1) sentinel used by UnknownPayload -- is a valid
+// NotifyOp. Without a fixed underlying type the enumeration's value range is
+// only [0, 1]; converting -1 to it is undefined behavior and is rejected in
+// constant expressions by newer compilers.
+enum NotifyOp : uint32_t {
+  NOTIFY_OP_IMAGE_ADDED = 0,
+  NOTIFY_OP_IMAGE_REMOVED = 1
+};
+
+// Notification payload for NOTIFY_OP_IMAGE_ADDED: carries the id of the image
+// together with its trash image spec.
+struct ImageAddedPayload {
+ // op code written ahead of this payload by the encode visitor
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_ADDED;
+
+ std::string image_id;
+ cls::rbd::TrashImageSpec trash_image_spec;
+
+ // default construction is used when a message is being decoded
+ ImageAddedPayload() {
+ }
+ ImageAddedPayload(const std::string& image_id,
+ const cls::rbd::TrashImageSpec& trash_image_spec)
+ : image_id(image_id), trash_image_spec(trash_image_spec) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+// Notification payload for NOTIFY_OP_IMAGE_REMOVED: carries only the id of
+// the image.
+struct ImageRemovedPayload {
+ // op code written ahead of this payload by the encode visitor
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_REMOVED;
+
+ std::string image_id;
+
+ // default construction is used when a message is being decoded
+ ImageRemovedPayload() {
+ }
+ // NOTE(review): not marked explicit, so a std::string converts implicitly
+ // to this payload -- confirm whether callers rely on that
+ ImageRemovedPayload(const std::string& image_id)
+ : image_id(image_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+// Placeholder payload selected when a decoded op code matches none of the
+// known notify ops. It has no data members.
+struct UnknownPayload {
+ // sentinel op value (-1 cast to NotifyOp) that matches no named enumerator
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
+ UnknownPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+// Closed set of payload types a trash watcher notification can hold.
+using Payload = boost::variant<ImageAddedPayload,
+                               ImageRemovedPayload,
+                               UnknownPayload>;
+
+// Envelope for a trash watcher notification: wraps one Payload alternative
+// and provides the encode/decode/dump entry points for it.
+struct NotifyMessage {
+ // a default-constructed message holds an UnknownPayload; the constructor is
+ // not explicit, so a Payload converts implicitly to a message
+ NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+ }
+
+ Payload payload;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ // appends heap-allocated sample instances to the supplied list
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+} // namespace trash_watcher
+} // namespace librbd
+
+using librbd::trash_watcher::encode;
+using librbd::trash_watcher::decode;
+
+#endif // CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
diff --git a/src/librbd/watcher/Notifier.cc b/src/librbd/watcher/Notifier.cc
new file mode 100644
index 00000000..dfb95aec
--- /dev/null
+++ b/src/librbd/watcher/Notifier.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/Notifier.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/watcher/Types.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::watcher::Notifier: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace watcher {
+
+// Timeout value handed to IoCtx::aio_notify() for every notification sent by
+// this class.
+// NOTE(review): librados documents that argument as milliseconds -- confirm.
+const uint64_t Notifier::NOTIFY_TIMEOUT = 5000;
+
+// Completion context for a single in-flight notification. It only records
+// the owning notifier, the optional response destination (may be nullptr)
+// and the caller's completion callback.
+Notifier::C_AioNotify::C_AioNotify(Notifier *notifier, NotifyResponse *response,
+ Context *on_finish)
+ : notifier(notifier), response(response), on_finish(on_finish) {
+}
+
+// Invoked when the notify operation completes. If the caller asked for the
+// response, the reply buffer is decoded into it both on success and on
+// timeout; a malformed reply turns the result into -EBADMSG. The (possibly
+// updated) result is then forwarded to the notifier.
+void Notifier::C_AioNotify::finish(int r) {
+  const bool decode_reply = (r == 0 || r == -ETIMEDOUT);
+  if (response != nullptr && decode_reply) {
+    try {
+      auto reply_it = out_bl.cbegin();
+      decode(*response, reply_it);
+    } catch (const buffer::error &err) {
+      r = -EBADMSG;
+    }
+  }
+  notifier->handle_notify(r, on_finish);
+}
+
+// Bind the notifier to a work queue, an IoCtx (kept by reference) and the
+// object the notifications are sent to. The CephContext used for logging is
+// taken from the IoCtx. Members are initialized in declaration order.
+Notifier::Notifier(ContextWQ *work_queue, IoCtx &ioctx, const std::string &oid)
+  : m_work_queue(work_queue),
+    m_ioctx(ioctx),
+    m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
+    m_oid(oid),
+    m_aio_notify_lock(util::unique_lock_name(
+      "librbd::object_watcher::Notifier::m_aio_notify_lock", this)) {
+}
+
+// Destruction requires that no notifications are still in flight; this is
+// asserted while holding the notify lock.
+Notifier::~Notifier() {
+ Mutex::Locker aio_notify_locker(m_aio_notify_lock);
+ ceph_assert(m_pending_aio_notifies == 0);
+}
+
+// Arrange for on_finish to be queued (with result 0) once no notifications
+// are in flight: immediately if none are pending, otherwise when the last
+// pending notification completes.
+void Notifier::flush(Context *on_finish) {
+  Mutex::Locker locker(m_aio_notify_lock);
+  if (m_pending_aio_notifies > 0) {
+    // completed later from handle_notify()
+    m_aio_notify_flush_ctxs.push_back(on_finish);
+    return;
+  }
+
+  m_work_queue->queue(on_finish, 0);
+}
+
+// Send bl as an asynchronous notification on the watched object.
+//
+// response may be nullptr; when set it receives the decoded reply.
+// on_finish may be nullptr; when set it is queued with the notify result.
+void Notifier::notify(bufferlist &bl, NotifyResponse *response,
+                      Context *on_finish) {
+  {
+    // account for the new in-flight notification before issuing it
+    Mutex::Locker locker(m_aio_notify_lock);
+    ++m_pending_aio_notifies;
+
+    ldout(m_cct, 20) << "pending=" << m_pending_aio_notifies << dendl;
+  }
+
+  auto *notify_ctx = new C_AioNotify(this, response, on_finish);
+  auto *aio_comp = util::create_rados_callback(notify_ctx);
+  const int r = m_ioctx.aio_notify(m_oid, aio_comp, bl, NOTIFY_TIMEOUT,
+                                   &notify_ctx->out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion path for one notification: drop it from the in-flight count,
+// release any waiting flush callbacks if it was the last one, and finally
+// queue the caller's callback (if any) with the notify result.
+void Notifier::handle_notify(int r, Context *on_finish) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  Mutex::Locker locker(m_aio_notify_lock);
+  ceph_assert(m_pending_aio_notifies > 0);
+  --m_pending_aio_notifies;
+
+  ldout(m_cct, 20) << "pending=" << m_pending_aio_notifies << dendl;
+  if (m_pending_aio_notifies == 0) {
+    // take the waiters out of the member list, leaving it empty, and queue
+    // them in their original order
+    Contexts flush_ctxs;
+    flush_ctxs.swap(m_aio_notify_flush_ctxs);
+    for (auto it = flush_ctxs.begin(); it != flush_ctxs.end(); ++it) {
+      m_work_queue->queue(*it, 0);
+    }
+  }
+
+  if (on_finish == nullptr) {
+    return;
+  }
+  m_work_queue->queue(on_finish, r);
+}
+
+} // namespace watcher
+} // namespace librbd
diff --git a/src/librbd/watcher/Notifier.h b/src/librbd/watcher/Notifier.h
new file mode 100644
index 00000000..8b0ad37b
--- /dev/null
+++ b/src/librbd/watcher/Notifier.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_NOTIFIER_H
+#define CEPH_LIBRBD_WATCHER_NOTIFIER_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/Mutex.h"
+#include "common/WorkQueue.h"
+#include <list>
+
+namespace librbd {
+
+namespace watcher {
+
+struct NotifyResponse;
+
+// Sends asynchronous notifications on a single object and tracks how many
+// are still in flight so that callers can flush them.
+class Notifier {
+public:
+ // timeout value passed to every aio_notify call
+ static const uint64_t NOTIFY_TIMEOUT;
+
+ // ioctx is stored by reference, so it must outlive the notifier
+ Notifier(ContextWQ *work_queue, librados::IoCtx &ioctx,
+ const std::string &oid);
+ // asserts that no notifications are pending
+ ~Notifier();
+
+ // queue on_finish once no notifications are in flight
+ void flush(Context *on_finish);
+ // send bl as a notification; response and on_finish may be nullptr
+ void notify(bufferlist &bl, NotifyResponse *response, Context *on_finish);
+
+private:
+ typedef std::list<Context*> Contexts;
+
+ // per-notification completion context; owns the reply buffer
+ struct C_AioNotify : public Context {
+ Notifier *notifier;
+ NotifyResponse *response;
+ Context *on_finish;
+ bufferlist out_bl;
+
+ C_AioNotify(Notifier *notifier, NotifyResponse *response,
+ Context *on_finish);
+
+ void finish(int r) override;
+ };
+
+ ContextWQ *m_work_queue;
+ librados::IoCtx &m_ioctx;
+ CephContext *m_cct;
+ std::string m_oid;
+
+ // guards the pending counter and the list of waiting flush callbacks
+ Mutex m_aio_notify_lock;
+ size_t m_pending_aio_notifies = 0;
+ Contexts m_aio_notify_flush_ctxs;
+
+ // invoked by C_AioNotify::finish when a notification completes
+ void handle_notify(int r, Context *on_finish);
+
+};
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_NOTIFIER_H
diff --git a/src/librbd/watcher/RewatchRequest.cc b/src/librbd/watcher/RewatchRequest.cc
new file mode 100644
index 00000000..40c3dfe7
--- /dev/null
+++ b/src/librbd/watcher/RewatchRequest.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/RewatchRequest.h"
+#include "common/RWLock.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::watcher::RewatchRequest: " \
+ << this << " " << __func__ << " "
+
+namespace librbd {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+namespace watcher {
+
+using std::string;
+
+// Capture everything needed to re-establish a watch: the IoCtx and lock are
+// held by reference, the object id is copied, and watch_handle points at the
+// caller's handle, which this request updates.
+RewatchRequest::RewatchRequest(librados::IoCtx& ioctx, const string& oid,
+ RWLock &watch_lock,
+ librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish)
+ : m_ioctx(ioctx), m_oid(oid), m_watch_lock(watch_lock),
+ m_watch_ctx(watch_ctx), m_watch_handle(watch_handle),
+ m_on_finish(on_finish) {
+}
+
+// Start the request. The first step is always unwatch(), which asserts that
+// the watch lock is write-locked.
+void RewatchRequest::send() {
+ unwatch();
+}
+
+// Tear down the existing watch, if there is one, before re-registering.
+// Requires the watch lock to be write-locked. When no handle is registered
+// the request skips straight to rewatch().
+void RewatchRequest::unwatch() {
+  ceph_assert(m_watch_lock.is_wlocked());
+  if (*m_watch_handle == 0) {
+    rewatch();
+    return;
+  }
+
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << dendl;
+
+  // detach the handle from the caller's storage before issuing the unwatch
+  const uint64_t old_handle = *m_watch_handle;
+  *m_watch_handle = 0;
+
+  librados::AioCompletion *unwatch_comp = create_rados_callback<
+    RewatchRequest, &RewatchRequest::handle_unwatch>(this);
+  const int r = m_ioctx.aio_unwatch(old_handle, unwatch_comp);
+  ceph_assert(r == 0);
+  unwatch_comp->release();
+}
+
+// Completion of the unwatch step. A blacklisted client ends the request with
+// that error; any other failure is only logged and the request still goes on
+// to re-register the watch.
+void RewatchRequest::handle_unwatch(int r) {
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -EBLACKLISTED) {
+      lderr(cct) << "client blacklisted" << dendl;
+      finish(r);
+      return;
+    }
+    lderr(cct) << "failed to unwatch: " << cpp_strerror(r) << dendl;
+  }
+
+  rewatch();
+}
+
+// Issue the asynchronous watch on the object. The new handle is written to
+// m_rewatch_handle and only published to the caller in handle_rewatch().
+void RewatchRequest::rewatch() {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << dendl;
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ RewatchRequest, &RewatchRequest::handle_rewatch>(this);
+ int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_rewatch_handle, m_watch_ctx);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+// Completion of the watch step: publish the resulting handle to the caller
+// (0 if the watch failed) and finish the request with the watch result.
+void RewatchRequest::handle_rewatch(int r) {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+ if (r < 0) {
+ lderr(cct) << "failed to watch object: " << cpp_strerror(r)
+ << dendl;
+ // make sure a failed watch is reported to the caller as "no handle"
+ m_rewatch_handle = 0;
+ }
+
+ {
+ // the caller's handle is only written under the watch write lock
+ RWLock::WLocker watch_locker(m_watch_lock);
+ *m_watch_handle = m_rewatch_handle;
+ }
+
+ finish(r);
+}
+
+// Complete the caller's callback with r and destroy this request. The object
+// must not be touched after this call returns.
+void RewatchRequest::finish(int r) {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace watcher
+} // namespace librbd
+
diff --git a/src/librbd/watcher/RewatchRequest.h b/src/librbd/watcher/RewatchRequest.h
new file mode 100644
index 00000000..d4fc250a
--- /dev/null
+++ b/src/librbd/watcher/RewatchRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
+#define CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+
+struct Context;
+struct RWLock;
+
+namespace librbd {
+
+namespace watcher {
+
+// Self-deleting asynchronous request that drops an existing watch (if any)
+// and registers a new one on the same object, updating the caller's watch
+// handle with the result.
+class RewatchRequest {
+public:
+
+ // heap-allocates a request; it deletes itself when it finishes
+ static RewatchRequest *create(librados::IoCtx& ioctx, const std::string& oid,
+ RWLock &watch_lock,
+ librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish) {
+ return new RewatchRequest(ioctx, oid, watch_lock, watch_ctx, watch_handle,
+ on_finish);
+ }
+
+ RewatchRequest(librados::IoCtx& ioctx, const std::string& oid,
+ RWLock &watch_lock, librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish);
+
+ // start the request; the watch lock must be write-locked by the caller
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UNWATCH
+ * |
+ * | . . . .
+ * | . . (recoverable error)
+ * v v .
+ * REWATCH . . .
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx& m_ioctx;
+ std::string m_oid;
+ RWLock &m_watch_lock;
+ librados::WatchCtx2 *m_watch_ctx;
+ // caller-owned handle; zeroed during unwatch and set again after rewatch
+ uint64_t *m_watch_handle;
+ Context *m_on_finish;
+
+ // handle produced by the pending watch, published in handle_rewatch()
+ uint64_t m_rewatch_handle = 0;
+
+ void unwatch();
+ void handle_unwatch(int r);
+
+ void rewatch();
+ void handle_rewatch(int r);
+
+ void finish(int r);
+};
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
diff --git a/src/librbd/watcher/Types.cc b/src/librbd/watcher/Types.cc
new file mode 100644
index 00000000..8f1991d7
--- /dev/null
+++ b/src/librbd/watcher/Types.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/Types.h"
+#include "common/Formatter.h"
+
+namespace librbd {
+namespace watcher {
+
+// Serialize the client id as its gid followed by its handle.
+void ClientId::encode(bufferlist &bl) const {
+  ceph::encode(gid, bl);
+  ceph::encode(handle, bl);
+}
+
+// Restore the client id by reading the gid and then the handle, mirroring
+// the order used by encode().
+void ClientId::decode(bufferlist::const_iterator &iter) {
+  ceph::decode(gid, iter);
+  ceph::decode(handle, iter);
+}
+
+// Emit the client id through the formatter as unsigned "gid" and "handle"
+// fields.
+void ClientId::dump(Formatter *f) const {
+ f->dump_unsigned("gid", gid);
+ f->dump_unsigned("handle", handle);
+}
+
+// Serialize the response as the acks map followed by the timeouts list.
+void NotifyResponse::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(acks, bl);
+ encode(timeouts, bl);
+}
+
+// Restore the response by reading the acks map and then the timeouts list,
+// mirroring the order used by encode().
+void NotifyResponse::decode(bufferlist::const_iterator& iter) {
+ using ceph::decode;
+ decode(acks, iter);
+ decode(timeouts, iter);
+}
+
+// Stream a client id in the form "[gid,handle]".
+std::ostream &operator<<(std::ostream &out,
+                         const ClientId &client_id) {
+  return out << "[" << client_id.gid << "," << client_id.handle << "]";
+}
+
+} // namespace watcher
+} // namespace librbd
diff --git a/src/librbd/watcher/Types.h b/src/librbd/watcher/Types.h
new file mode 100644
index 00000000..d1517fb0
--- /dev/null
+++ b/src/librbd/watcher/Types.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_TYPES_H
+#define CEPH_LIBRBD_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+
+class Watcher;
+
+namespace watcher {
+
+// Identifies a watch/notify client by its gid and watch handle. A
+// default-constructed id (both fields zero) is treated as "not valid".
+struct ClientId {
+  uint64_t gid;
+  uint64_t handle;
+
+  ClientId() : gid(0), handle(0) {}
+  ClientId(uint64_t gid, uint64_t handle) : gid(gid), handle(handle) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  // true unless both fields still hold their default value of zero
+  inline bool is_valid() const {
+    return gid != 0 || handle != 0;
+  }
+
+  inline bool operator==(const ClientId &rhs) const {
+    return (gid == rhs.gid && handle == rhs.handle);
+  }
+  inline bool operator!=(const ClientId &rhs) const {
+    return (gid != rhs.gid || handle != rhs.handle);
+  }
+  // ordered by gid first, with handle breaking ties
+  inline bool operator<(const ClientId &rhs) const {
+    return (gid != rhs.gid) ? (gid < rhs.gid) : (handle < rhs.handle);
+  }
+};
+
+// Decoded reply to a notification: one reply buffer per client that
+// acknowledged it, plus the list of clients recorded as timed out.
+struct NotifyResponse {
+ std::map<ClientId, bufferlist> acks;
+ std::vector<ClientId> timeouts;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+};
+
+// Trait mapping an image context type to the watcher implementation to use;
+// the primary template always selects librbd::Watcher.
+template <typename ImageCtxT>
+struct Traits {
+  using Watcher = librbd::Watcher;
+};
+
+std::ostream &operator<<(std::ostream &out,
+ const ClientId &client);
+
+WRITE_CLASS_ENCODER(ClientId);
+WRITE_CLASS_ENCODER(NotifyResponse);
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_TYPES_H
diff --git a/src/librbd/watcher/Utils.h b/src/librbd/watcher/Utils.h
new file mode 100644
index 00000000..d2510aaf
--- /dev/null
+++ b/src/librbd/watcher/Utils.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_UTILS_H
+#define CEPH_LIBRBD_WATCHER_UTILS_H
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "include/Context.h"
+#include "librbd/Watcher.h"
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+namespace watcher {
+namespace util {
+
+// Variant visitor that hands a decoded payload to the watcher's
+// handle_payload() overload for that payload type, together with a freshly
+// allocated acknowledgement context.
+template <typename Watcher>
+struct HandlePayloadVisitor : public boost::static_visitor<void> {
+  Watcher *watcher;
+  uint64_t notify_id;
+  uint64_t handle;
+
+  HandlePayloadVisitor(Watcher *watcher_, uint64_t notify_id_,
+                       uint64_t handle_)
+    : watcher(watcher_), notify_id(notify_id_), handle(handle_) {
+  }
+
+  template <typename P>
+  inline void operator()(const P &payload) const {
+    using NotifyAck = typename Watcher::C_NotifyAck;
+
+    // the ack context is completed here only when handle_payload() returns
+    // true; otherwise it is left with the handler
+    NotifyAck *ack_ctx = new NotifyAck(watcher, notify_id, handle);
+    if (watcher->handle_payload(payload, ack_ctx)) {
+      ack_ctx->complete(0);
+    }
+  }
+};
+
+// Variant visitor that serializes a payload into a bufferlist: first the
+// payload type's NOTIFY_OP as a uint32_t, then the payload's own fields.
+class EncodePayloadVisitor : public boost::static_visitor<void> {
+public:
+  explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {}
+
+  template <typename P>
+  inline void operator()(const P &payload) const {
+    const uint32_t notify_op = static_cast<uint32_t>(P::NOTIFY_OP);
+    ceph::encode(notify_op, m_bl);
+    payload.encode(m_bl);
+  }
+
+private:
+  bufferlist &m_bl;
+};
+
+// Variant visitor that asks the currently selected payload alternative to
+// decode itself from the iterator, passing along the struct version.
+class DecodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ // iter is held by reference, so decoding advances the caller's iterator
+ DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {}
+
+ template <typename P>
+ inline void operator()(P &payload) const {
+ payload.decode(m_version, m_iter);
+ }
+
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+} // namespace util
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_UTILS_H