path: root/src/librbd
author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
commit    19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree      42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/librbd
parent    Initial commit. (diff)
Adding upstream version 16.2.11+ds.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/librbd')
-rw-r--r-- src/librbd/AsioEngine.cc | 55
-rw-r--r-- src/librbd/AsioEngine.h | 80
-rw-r--r-- src/librbd/AsyncObjectThrottle.cc | 109
-rw-r--r-- src/librbd/AsyncObjectThrottle.h | 79
-rw-r--r-- src/librbd/AsyncRequest.cc | 71
-rw-r--r-- src/librbd/AsyncRequest.h | 76
-rw-r--r-- src/librbd/BlockGuard.h | 176
-rw-r--r-- src/librbd/CMakeLists.txt | 355
-rw-r--r-- src/librbd/ConfigWatcher.cc | 116
-rw-r--r-- src/librbd/ConfigWatcher.h | 47
-rw-r--r-- src/librbd/DeepCopyRequest.cc | 361
-rw-r--r-- src/librbd/DeepCopyRequest.h | 138
-rw-r--r-- src/librbd/ExclusiveLock.cc | 388
-rw-r--r-- src/librbd/ExclusiveLock.h | 117
-rw-r--r-- src/librbd/Features.cc | 111
-rw-r--r-- src/librbd/Features.h | 16
-rw-r--r-- src/librbd/ImageCtx.cc | 965
-rw-r--r-- src/librbd/ImageCtx.h | 365
-rw-r--r-- src/librbd/ImageState.cc | 1040
-rw-r--r-- src/librbd/ImageState.h | 155
-rw-r--r-- src/librbd/ImageWatcher.cc | 1555
-rw-r--r-- src/librbd/ImageWatcher.h | 313
-rw-r--r-- src/librbd/Journal.cc | 1819
-rw-r--r-- src/librbd/Journal.h | 375
-rw-r--r-- src/librbd/LibrbdAdminSocketHook.cc | 91
-rw-r--r-- src/librbd/LibrbdAdminSocketHook.h | 34
-rw-r--r-- src/librbd/ManagedLock.cc | 854
-rw-r--r-- src/librbd/ManagedLock.h | 270
-rw-r--r-- src/librbd/MirroringWatcher.cc | 142
-rw-r--r-- src/librbd/MirroringWatcher.h | 67
-rw-r--r-- src/librbd/ObjectMap.cc | 380
-rw-r--r-- src/librbd/ObjectMap.h | 176
-rw-r--r-- src/librbd/Operations.cc | 1932
-rw-r--r-- src/librbd/Operations.h | 158
-rw-r--r-- src/librbd/PluginRegistry.cc | 101
-rw-r--r-- src/librbd/PluginRegistry.h | 51
-rw-r--r-- src/librbd/TaskFinisher.h | 179
-rw-r--r-- src/librbd/TrashWatcher.cc | 116
-rw-r--r-- src/librbd/TrashWatcher.h | 58
-rw-r--r-- src/librbd/Types.h | 142
-rw-r--r-- src/librbd/Utils.cc | 246
-rw-r--r-- src/librbd/Utils.h | 286
-rw-r--r-- src/librbd/WatchNotifyTypes.cc | 557
-rw-r--r-- src/librbd/WatchNotifyTypes.h | 532
-rw-r--r-- src/librbd/Watcher.cc | 370
-rw-r--r-- src/librbd/Watcher.h | 183
-rw-r--r-- src/librbd/api/Config.cc | 233
-rw-r--r-- src/librbd/api/Config.h | 37
-rw-r--r-- src/librbd/api/DiffIterate.cc | 376
-rw-r--r-- src/librbd/api/DiffIterate.h | 66
-rw-r--r-- src/librbd/api/Group.cc | 1290
-rw-r--r-- src/librbd/api/Group.h | 60
-rw-r--r-- src/librbd/api/Image.cc | 1002
-rw-r--r-- src/librbd/api/Image.h | 87
-rw-r--r-- src/librbd/api/Io.cc | 550
-rw-r--r-- src/librbd/api/Io.h | 65
-rw-r--r-- src/librbd/api/Migration.cc | 2126
-rw-r--r-- src/librbd/api/Migration.h | 113
-rw-r--r-- src/librbd/api/Mirror.cc | 2089
-rw-r--r-- src/librbd/api/Mirror.h | 126
-rw-r--r-- src/librbd/api/Namespace.cc | 235
-rw-r--r-- src/librbd/api/Namespace.h | 33
-rw-r--r-- src/librbd/api/Pool.cc | 375
-rw-r--r-- src/librbd/api/Pool.h | 38
-rw-r--r-- src/librbd/api/PoolMetadata.cc | 156
-rw-r--r-- src/librbd/api/PoolMetadata.h | 36
-rw-r--r-- src/librbd/api/Snapshot.cc | 444
-rw-r--r-- src/librbd/api/Snapshot.h | 67
-rw-r--r-- src/librbd/api/Trash.cc | 759
-rw-r--r-- src/librbd/api/Trash.h | 53
-rw-r--r-- src/librbd/api/Utils.cc | 84
-rw-r--r-- src/librbd/api/Utils.h | 28
-rw-r--r-- src/librbd/asio/ContextWQ.cc | 49
-rw-r--r-- src/librbd/asio/ContextWQ.h | 52
-rw-r--r-- src/librbd/asio/Utils.h | 33
-rw-r--r-- src/librbd/cache/ImageWriteback.cc | 149
-rw-r--r-- src/librbd/cache/ImageWriteback.h | 77
-rw-r--r-- src/librbd/cache/ObjectCacherObjectDispatch.cc | 467
-rw-r--r-- src/librbd/cache/ObjectCacherObjectDispatch.h | 132
-rw-r--r-- src/librbd/cache/ObjectCacherWriteback.cc | 290
-rw-r--r-- src/librbd/cache/ObjectCacherWriteback.h | 78
-rw-r--r-- src/librbd/cache/ParentCacheObjectDispatch.cc | 255
-rw-r--r-- src/librbd/cache/ParentCacheObjectDispatch.h | 161
-rw-r--r-- src/librbd/cache/TypeTraits.h | 26
-rw-r--r-- src/librbd/cache/Types.h | 28
-rw-r--r-- src/librbd/cache/Utils.h | 33
-rw-r--r-- src/librbd/cache/WriteAroundObjectDispatch.cc | 525
-rw-r--r-- src/librbd/cache/WriteAroundObjectDispatch.h | 212
-rw-r--r-- src/librbd/cache/WriteLogImageDispatch.cc | 217
-rw-r--r-- src/librbd/cache/WriteLogImageDispatch.h | 108
-rw-r--r-- src/librbd/cache/pwl/AbstractWriteLog.cc | 2195
-rw-r--r-- src/librbd/cache/pwl/AbstractWriteLog.h | 410
-rw-r--r-- src/librbd/cache/pwl/Builder.h | 61
-rw-r--r-- src/librbd/cache/pwl/DiscardRequest.cc | 164
-rw-r--r-- src/librbd/cache/pwl/DiscardRequest.h | 90
-rw-r--r-- src/librbd/cache/pwl/ImageCacheState.cc | 194
-rw-r--r-- src/librbd/cache/pwl/ImageCacheState.h | 86
-rw-r--r-- src/librbd/cache/pwl/InitRequest.cc | 226
-rw-r--r-- src/librbd/cache/pwl/InitRequest.h | 105
-rw-r--r-- src/librbd/cache/pwl/LogEntry.cc | 140
-rw-r--r-- src/librbd/cache/pwl/LogEntry.h | 280
-rw-r--r-- src/librbd/cache/pwl/LogMap.cc | 278
-rw-r--r-- src/librbd/cache/pwl/LogMap.h | 81
-rw-r--r-- src/librbd/cache/pwl/LogOperation.cc | 316
-rw-r--r-- src/librbd/cache/pwl/LogOperation.h | 224
-rw-r--r-- src/librbd/cache/pwl/ReadRequest.h | 45
-rw-r--r-- src/librbd/cache/pwl/Request.cc | 561
-rw-r--r-- src/librbd/cache/pwl/Request.h | 374
-rw-r--r-- src/librbd/cache/pwl/ShutdownRequest.cc | 161
-rw-r--r-- src/librbd/cache/pwl/ShutdownRequest.h | 95
-rw-r--r-- src/librbd/cache/pwl/SyncPoint.cc | 109
-rw-r--r-- src/librbd/cache/pwl/SyncPoint.h | 69
-rw-r--r-- src/librbd/cache/pwl/Types.cc | 185
-rw-r--r-- src/librbd/cache/pwl/Types.h | 444
-rw-r--r-- src/librbd/cache/pwl/rwl/Builder.h | 107
-rw-r--r-- src/librbd/cache/pwl/rwl/LogEntry.cc | 105
-rw-r--r-- src/librbd/cache/pwl/rwl/LogEntry.h | 68
-rw-r--r-- src/librbd/cache/pwl/rwl/LogOperation.cc | 39
-rw-r--r-- src/librbd/cache/pwl/rwl/LogOperation.h | 55
-rw-r--r-- src/librbd/cache/pwl/rwl/ReadRequest.cc | 70
-rw-r--r-- src/librbd/cache/pwl/rwl/ReadRequest.h | 34
-rw-r--r-- src/librbd/cache/pwl/rwl/Request.cc | 86
-rw-r--r-- src/librbd/cache/pwl/rwl/Request.h | 90
-rw-r--r-- src/librbd/cache/pwl/rwl/WriteLog.cc | 1014
-rw-r--r-- src/librbd/cache/pwl/rwl/WriteLog.h | 124
-rw-r--r-- src/librbd/cache/pwl/ssd/Builder.h | 108
-rw-r--r-- src/librbd/cache/pwl/ssd/LogEntry.cc | 63
-rw-r--r-- src/librbd/cache/pwl/ssd/LogEntry.h | 75
-rw-r--r-- src/librbd/cache/pwl/ssd/LogOperation.cc | 36
-rw-r--r-- src/librbd/cache/pwl/ssd/LogOperation.h | 35
-rw-r--r-- src/librbd/cache/pwl/ssd/ReadRequest.cc | 92
-rw-r--r-- src/librbd/cache/pwl/ssd/ReadRequest.h | 34
-rw-r--r-- src/librbd/cache/pwl/ssd/Request.cc | 63
-rw-r--r-- src/librbd/cache/pwl/ssd/Request.h | 92
-rw-r--r-- src/librbd/cache/pwl/ssd/Types.h | 51
-rw-r--r-- src/librbd/cache/pwl/ssd/WriteLog.cc | 1158
-rw-r--r-- src/librbd/cache/pwl/ssd/WriteLog.h | 156
-rw-r--r-- src/librbd/crypto/BlockCrypto.cc | 131
-rw-r--r-- src/librbd/crypto/BlockCrypto.h | 60
-rw-r--r-- src/librbd/crypto/CryptoContextPool.cc | 44
-rw-r--r-- src/librbd/crypto/CryptoContextPool.h | 71
-rw-r--r-- src/librbd/crypto/CryptoImageDispatch.cc | 28
-rw-r--r-- src/librbd/crypto/CryptoImageDispatch.h | 111
-rw-r--r-- src/librbd/crypto/CryptoInterface.h | 124
-rw-r--r-- src/librbd/crypto/CryptoObjectDispatch.cc | 661
-rw-r--r-- src/librbd/crypto/CryptoObjectDispatch.h | 115
-rw-r--r-- src/librbd/crypto/DataCryptor.h | 37
-rw-r--r-- src/librbd/crypto/EncryptionFormat.h | 30
-rw-r--r-- src/librbd/crypto/FormatRequest.cc | 119
-rw-r--r-- src/librbd/crypto/FormatRequest.h | 49
-rw-r--r-- src/librbd/crypto/LoadRequest.cc | 74
-rw-r--r-- src/librbd/crypto/LoadRequest.h | 44
-rw-r--r-- src/librbd/crypto/ShutDownCryptoRequest.cc | 102
-rw-r--r-- src/librbd/crypto/ShutDownCryptoRequest.h | 44
-rw-r--r-- src/librbd/crypto/Types.h | 18
-rw-r--r-- src/librbd/crypto/Utils.cc | 73
-rw-r--r-- src/librbd/crypto/Utils.h | 29
-rw-r--r-- src/librbd/crypto/luks/EncryptionFormat.cc | 48
-rw-r--r-- src/librbd/crypto/luks/EncryptionFormat.h | 67
-rw-r--r-- src/librbd/crypto/luks/FormatRequest.cc | 178
-rw-r--r-- src/librbd/crypto/luks/FormatRequest.h | 58
-rw-r--r-- src/librbd/crypto/luks/Header.cc | 256
-rw-r--r-- src/librbd/crypto/luks/Header.h | 51
-rw-r--r-- src/librbd/crypto/luks/LoadRequest.cc | 196
-rw-r--r-- src/librbd/crypto/luks/LoadRequest.h | 66
-rw-r--r-- src/librbd/crypto/openssl/DataCryptor.cc | 153
-rw-r--r-- src/librbd/crypto/openssl/DataCryptor.h | 49
-rw-r--r-- src/librbd/deep_copy/Handler.h | 50
-rw-r--r-- src/librbd/deep_copy/ImageCopyRequest.cc | 278
-rw-r--r-- src/librbd/deep_copy/ImageCopyRequest.h | 123
-rw-r--r-- src/librbd/deep_copy/MetadataCopyRequest.cc | 117
-rw-r--r-- src/librbd/deep_copy/MetadataCopyRequest.h | 78
-rw-r--r-- src/librbd/deep_copy/ObjectCopyRequest.cc | 845
-rw-r--r-- src/librbd/deep_copy/ObjectCopyRequest.h | 162
-rw-r--r-- src/librbd/deep_copy/SetHeadRequest.cc | 223
-rw-r--r-- src/librbd/deep_copy/SetHeadRequest.h | 87
-rw-r--r-- src/librbd/deep_copy/SnapshotCopyRequest.cc | 731
-rw-r--r-- src/librbd/deep_copy/SnapshotCopyRequest.h | 151
-rw-r--r-- src/librbd/deep_copy/SnapshotCreateRequest.cc | 187
-rw-r--r-- src/librbd/deep_copy/SnapshotCreateRequest.h | 98
-rw-r--r-- src/librbd/deep_copy/Types.h | 28
-rw-r--r-- src/librbd/deep_copy/Utils.cc | 61
-rw-r--r-- src/librbd/deep_copy/Utils.h | 29
-rw-r--r-- src/librbd/exclusive_lock/AutomaticPolicy.cc | 29
-rw-r--r-- src/librbd/exclusive_lock/AutomaticPolicy.h | 34
-rw-r--r-- src/librbd/exclusive_lock/ImageDispatch.cc | 318
-rw-r--r-- src/librbd/exclusive_lock/ImageDispatch.h | 134
-rw-r--r-- src/librbd/exclusive_lock/Policy.h | 31
-rw-r--r-- src/librbd/exclusive_lock/PostAcquireRequest.cc | 368
-rw-r--r-- src/librbd/exclusive_lock/PostAcquireRequest.h | 124
-rw-r--r-- src/librbd/exclusive_lock/PreAcquireRequest.cc | 95
-rw-r--r-- src/librbd/exclusive_lock/PreAcquireRequest.h | 75
-rw-r--r-- src/librbd/exclusive_lock/PreReleaseRequest.cc | 363
-rw-r--r-- src/librbd/exclusive_lock/PreReleaseRequest.h | 139
-rw-r--r-- src/librbd/exclusive_lock/StandardPolicy.cc | 29
-rw-r--r-- src/librbd/exclusive_lock/StandardPolicy.h | 37
-rw-r--r-- src/librbd/image/AttachChildRequest.cc | 261
-rw-r--r-- src/librbd/image/AttachChildRequest.h | 105
-rw-r--r-- src/librbd/image/AttachParentRequest.cc | 90
-rw-r--r-- src/librbd/image/AttachParentRequest.h | 79
-rw-r--r-- src/librbd/image/CloneRequest.cc | 607
-rw-r--r-- src/librbd/image/CloneRequest.h | 181
-rw-r--r-- src/librbd/image/CloseRequest.cc | 350
-rw-r--r-- src/librbd/image/CloseRequest.h | 127
-rw-r--r-- src/librbd/image/CreateRequest.cc | 835
-rw-r--r-- src/librbd/image/CreateRequest.h | 191
-rw-r--r-- src/librbd/image/DetachChildRequest.cc | 392
-rw-r--r-- src/librbd/image/DetachChildRequest.h | 119
-rw-r--r-- src/librbd/image/DetachParentRequest.cc | 81
-rw-r--r-- src/librbd/image/DetachParentRequest.h | 66
-rw-r--r-- src/librbd/image/GetMetadataRequest.cc | 121
-rw-r--r-- src/librbd/image/GetMetadataRequest.h | 83
-rw-r--r-- src/librbd/image/ListWatchersRequest.cc | 174
-rw-r--r-- src/librbd/image/ListWatchersRequest.h | 82
-rw-r--r-- src/librbd/image/OpenRequest.cc | 727
-rw-r--r-- src/librbd/image/OpenRequest.h | 149
-rw-r--r-- src/librbd/image/PreRemoveRequest.cc | 348
-rw-r--r-- src/librbd/image/PreRemoveRequest.h | 100
-rw-r--r-- src/librbd/image/RefreshParentRequest.cc | 244
-rw-r--r-- src/librbd/image/RefreshParentRequest.h | 109
-rw-r--r-- src/librbd/image/RefreshRequest.cc | 1575
-rw-r--r-- src/librbd/image/RefreshRequest.h | 275
-rw-r--r-- src/librbd/image/RemoveRequest.cc | 617
-rw-r--r-- src/librbd/image/RemoveRequest.h | 197
-rw-r--r-- src/librbd/image/SetFlagsRequest.cc | 78
-rw-r--r-- src/librbd/image/SetFlagsRequest.h | 61
-rw-r--r-- src/librbd/image/SetSnapRequest.cc | 368
-rw-r--r-- src/librbd/image/SetSnapRequest.h | 118
-rw-r--r-- src/librbd/image/TypeTraits.h | 21
-rw-r--r-- src/librbd/image/Types.h | 20
-rw-r--r-- src/librbd/image/ValidatePoolRequest.cc | 234
-rw-r--r-- src/librbd/image/ValidatePoolRequest.h | 93
-rw-r--r-- src/librbd/image_watcher/NotifyLockOwner.cc | 95
-rw-r--r-- src/librbd/image_watcher/NotifyLockOwner.h | 50
-rw-r--r-- src/librbd/internal.cc | 1750
-rw-r--r-- src/librbd/internal.h | 143
-rw-r--r-- src/librbd/io/AioCompletion.cc | 294
-rw-r--r-- src/librbd/io/AioCompletion.h | 203
-rw-r--r-- src/librbd/io/AsyncOperation.cc | 94
-rw-r--r-- src/librbd/io/AsyncOperation.h | 52
-rw-r--r-- src/librbd/io/CopyupRequest.cc | 774
-rw-r--r-- src/librbd/io/CopyupRequest.h | 143
-rw-r--r-- src/librbd/io/Dispatcher.h | 252
-rw-r--r-- src/librbd/io/DispatcherInterface.h | 37
-rw-r--r-- src/librbd/io/FlushTracker.cc | 126
-rw-r--r-- src/librbd/io/FlushTracker.h | 61
-rw-r--r-- src/librbd/io/ImageDispatch.cc | 184
-rw-r--r-- src/librbd/io/ImageDispatch.h | 96
-rw-r--r-- src/librbd/io/ImageDispatchInterface.h | 92
-rw-r--r-- src/librbd/io/ImageDispatchSpec.cc | 54
-rw-r--r-- src/librbd/io/ImageDispatchSpec.h | 243
-rw-r--r-- src/librbd/io/ImageDispatcher.cc | 311
-rw-r--r-- src/librbd/io/ImageDispatcher.h | 76
-rw-r--r-- src/librbd/io/ImageDispatcherInterface.h | 39
-rw-r--r-- src/librbd/io/ImageRequest.cc | 881
-rw-r--r-- src/librbd/io/ImageRequest.h | 386
-rw-r--r-- src/librbd/io/ObjectDispatch.cc | 161
-rw-r--r-- src/librbd/io/ObjectDispatch.h | 115
-rw-r--r-- src/librbd/io/ObjectDispatchInterface.h | 102
-rw-r--r-- src/librbd/io/ObjectDispatchSpec.cc | 47
-rw-r--r-- src/librbd/io/ObjectDispatchSpec.h | 295
-rw-r--r-- src/librbd/io/ObjectDispatcher.cc | 208
-rw-r--r-- src/librbd/io/ObjectDispatcher.h | 60
-rw-r--r-- src/librbd/io/ObjectDispatcherInterface.h | 35
-rw-r--r-- src/librbd/io/ObjectRequest.cc | 1075
-rw-r--r-- src/librbd/io/ObjectRequest.h | 502
-rw-r--r-- src/librbd/io/QosImageDispatch.cc | 305
-rw-r--r-- src/librbd/io/QosImageDispatch.h | 132
-rw-r--r-- src/librbd/io/QueueImageDispatch.cc | 155
-rw-r--r-- src/librbd/io/QueueImageDispatch.h | 111
-rw-r--r-- src/librbd/io/ReadResult.cc | 262
-rw-r--r-- src/librbd/io/ReadResult.h | 129
-rw-r--r-- src/librbd/io/RefreshImageDispatch.cc | 167
-rw-r--r-- src/librbd/io/RefreshImageDispatch.h | 102
-rw-r--r-- src/librbd/io/SimpleSchedulerObjectDispatch.cc | 564
-rw-r--r-- src/librbd/io/SimpleSchedulerObjectDispatch.h | 227
-rw-r--r-- src/librbd/io/TypeTraits.h | 20
-rw-r--r-- src/librbd/io/Types.cc | 38
-rw-r--r-- src/librbd/io/Types.h | 307
-rw-r--r-- src/librbd/io/Utils.cc | 239
-rw-r--r-- src/librbd/io/Utils.h | 79
-rw-r--r-- src/librbd/io/WriteBlockImageDispatch.cc | 271
-rw-r--r-- src/librbd/io/WriteBlockImageDispatch.h | 135
-rw-r--r-- src/librbd/journal/CreateRequest.cc | 234
-rw-r--r-- src/librbd/journal/CreateRequest.h | 106
-rw-r--r-- src/librbd/journal/DemoteRequest.cc | 255
-rw-r--r-- src/librbd/journal/DemoteRequest.h | 107
-rw-r--r-- src/librbd/journal/DisabledPolicy.h | 31
-rw-r--r-- src/librbd/journal/ObjectDispatch.cc | 258
-rw-r--r-- src/librbd/journal/ObjectDispatch.h | 124
-rw-r--r-- src/librbd/journal/OpenRequest.cc | 144
-rw-r--r-- src/librbd/journal/OpenRequest.h | 85
-rw-r--r-- src/librbd/journal/Policy.h | 25
-rw-r--r-- src/librbd/journal/PromoteRequest.cc | 237
-rw-r--r-- src/librbd/journal/PromoteRequest.h | 109
-rw-r--r-- src/librbd/journal/RemoveRequest.cc | 153
-rw-r--r-- src/librbd/journal/RemoveRequest.h | 81
-rw-r--r-- src/librbd/journal/Replay.cc | 1177
-rw-r--r-- src/librbd/journal/Replay.h | 205
-rw-r--r-- src/librbd/journal/ResetRequest.cc | 162
-rw-r--r-- src/librbd/journal/ResetRequest.h | 110
-rw-r--r-- src/librbd/journal/StandardPolicy.cc | 32
-rw-r--r-- src/librbd/journal/StandardPolicy.h | 38
-rw-r--r-- src/librbd/journal/TypeTraits.h | 29
-rw-r--r-- src/librbd/journal/Types.cc | 956
-rw-r--r-- src/librbd/journal/Types.h | 685
-rw-r--r-- src/librbd/journal/Utils.cc | 86
-rw-r--r-- src/librbd/journal/Utils.h | 80
-rw-r--r-- src/librbd/librbd.cc | 7398
-rw-r--r-- src/librbd/managed_lock/AcquireRequest.cc | 184
-rw-r--r-- src/librbd/managed_lock/AcquireRequest.h | 102
-rw-r--r-- src/librbd/managed_lock/BreakRequest.cc | 249
-rw-r--r-- src/librbd/managed_lock/BreakRequest.h | 120
-rw-r--r-- src/librbd/managed_lock/GetLockerRequest.cc | 131
-rw-r--r-- src/librbd/managed_lock/GetLockerRequest.h | 58
-rw-r--r-- src/librbd/managed_lock/ReacquireRequest.cc | 79
-rw-r--r-- src/librbd/managed_lock/ReacquireRequest.h | 69
-rw-r--r-- src/librbd/managed_lock/ReleaseRequest.cc | 95
-rw-r--r-- src/librbd/managed_lock/ReleaseRequest.h | 72
-rw-r--r-- src/librbd/managed_lock/Types.h | 46
-rw-r--r-- src/librbd/managed_lock/Utils.cc | 43
-rw-r--r-- src/librbd/managed_lock/Utils.h | 23
-rw-r--r-- src/librbd/migration/FileStream.cc | 232
-rw-r--r-- src/librbd/migration/FileStream.h | 68
-rw-r--r-- src/librbd/migration/FormatInterface.h | 53
-rw-r--r-- src/librbd/migration/HttpClient.cc | 946
-rw-r--r-- src/librbd/migration/HttpClient.h | 205
-rw-r--r-- src/librbd/migration/HttpProcessorInterface.h | 27
-rw-r--r-- src/librbd/migration/HttpStream.cc | 83
-rw-r--r-- src/librbd/migration/HttpStream.h | 68
-rw-r--r-- src/librbd/migration/ImageDispatch.cc | 157
-rw-r--r-- src/librbd/migration/ImageDispatch.h | 102
-rw-r--r-- src/librbd/migration/NativeFormat.cc | 309
-rw-r--r-- src/librbd/migration/NativeFormat.h | 82
-rw-r--r-- src/librbd/migration/OpenSourceImageRequest.cc | 249
-rw-r--r-- src/librbd/migration/OpenSourceImageRequest.h | 103
-rw-r--r-- src/librbd/migration/QCOW.h | 466
-rw-r--r-- src/librbd/migration/QCOWFormat.cc | 1542
-rw-r--r-- src/librbd/migration/QCOWFormat.h | 211
-rw-r--r-- src/librbd/migration/RawFormat.cc | 235
-rw-r--r-- src/librbd/migration/RawFormat.h | 78
-rw-r--r-- src/librbd/migration/RawSnapshot.cc | 220
-rw-r--r-- src/librbd/migration/RawSnapshot.h | 75
-rw-r--r-- src/librbd/migration/S3Stream.cc | 202
-rw-r--r-- src/librbd/migration/S3Stream.h | 78
-rw-r--r-- src/librbd/migration/SnapshotInterface.h | 48
-rw-r--r-- src/librbd/migration/SourceSpecBuilder.cc | 147
-rw-r--r-- src/librbd/migration/SourceSpecBuilder.h | 54
-rw-r--r-- src/librbd/migration/StreamInterface.h | 32
-rw-r--r-- src/librbd/migration/Types.h | 42
-rw-r--r-- src/librbd/migration/Utils.cc | 133
-rw-r--r-- src/librbd/migration/Utils.h | 30
-rw-r--r-- src/librbd/mirror/DemoteRequest.cc | 216
-rw-r--r-- src/librbd/mirror/DemoteRequest.h | 86
-rw-r--r-- src/librbd/mirror/DisableRequest.cc | 479
-rw-r--r-- src/librbd/mirror/DisableRequest.h | 143
-rw-r--r-- src/librbd/mirror/EnableRequest.cc | 329
-rw-r--r-- src/librbd/mirror/EnableRequest.h | 135
-rw-r--r-- src/librbd/mirror/GetInfoRequest.cc | 290
-rw-r--r-- src/librbd/mirror/GetInfoRequest.h | 123
-rw-r--r-- src/librbd/mirror/GetStatusRequest.cc | 116
-rw-r--r-- src/librbd/mirror/GetStatusRequest.h | 86
-rw-r--r-- src/librbd/mirror/GetUuidRequest.cc | 86
-rw-r--r-- src/librbd/mirror/GetUuidRequest.h | 69
-rw-r--r-- src/librbd/mirror/ImageRemoveRequest.cc | 98
-rw-r--r-- src/librbd/mirror/ImageRemoveRequest.h | 77
-rw-r--r-- src/librbd/mirror/ImageStateUpdateRequest.cc | 151
-rw-r--r-- src/librbd/mirror/ImageStateUpdateRequest.h | 92
-rw-r--r-- src/librbd/mirror/PromoteRequest.cc | 115
-rw-r--r-- src/librbd/mirror/PromoteRequest.h | 76
-rw-r--r-- src/librbd/mirror/Types.h | 21
-rw-r--r-- src/librbd/mirror/snapshot/CreateNonPrimaryRequest.cc | 273
-rw-r--r-- src/librbd/mirror/snapshot/CreateNonPrimaryRequest.h | 123
-rw-r--r-- src/librbd/mirror/snapshot/CreatePrimaryRequest.cc | 277
-rw-r--r-- src/librbd/mirror/snapshot/CreatePrimaryRequest.h | 106
-rw-r--r-- src/librbd/mirror/snapshot/DemoteRequest.cc | 110
-rw-r--r-- src/librbd/mirror/snapshot/DemoteRequest.h | 76
-rw-r--r-- src/librbd/mirror/snapshot/GetImageStateRequest.cc | 114
-rw-r--r-- src/librbd/mirror/snapshot/GetImageStateRequest.h | 76
-rw-r--r-- src/librbd/mirror/snapshot/ImageMeta.cc | 175
-rw-r--r-- src/librbd/mirror/snapshot/ImageMeta.h | 78
-rw-r--r-- src/librbd/mirror/snapshot/PromoteRequest.cc | 405
-rw-r--r-- src/librbd/mirror/snapshot/PromoteRequest.h | 151
-rw-r--r-- src/librbd/mirror/snapshot/RemoveImageStateRequest.cc | 131
-rw-r--r-- src/librbd/mirror/snapshot/RemoveImageStateRequest.h | 75
-rw-r--r-- src/librbd/mirror/snapshot/SetImageStateRequest.cc | 235
-rw-r--r-- src/librbd/mirror/snapshot/SetImageStateRequest.h | 96
-rw-r--r-- src/librbd/mirror/snapshot/Types.cc | 109
-rw-r--r-- src/librbd/mirror/snapshot/Types.h | 122
-rw-r--r-- src/librbd/mirror/snapshot/UnlinkPeerRequest.cc | 235
-rw-r--r-- src/librbd/mirror/snapshot/UnlinkPeerRequest.h | 95
-rw-r--r-- src/librbd/mirror/snapshot/Utils.cc | 186
-rw-r--r-- src/librbd/mirror/snapshot/Utils.h | 38
-rw-r--r-- src/librbd/mirror/snapshot/WriteImageStateRequest.cc | 120
-rw-r--r-- src/librbd/mirror/snapshot/WriteImageStateRequest.h | 73
-rw-r--r-- src/librbd/mirroring_watcher/Types.cc | 136
-rw-r--r-- src/librbd/mirroring_watcher/Types.h | 102
-rw-r--r-- src/librbd/object_map/CreateRequest.cc | 94
-rw-r--r-- src/librbd/object_map/CreateRequest.h | 59
-rw-r--r-- src/librbd/object_map/DiffRequest.cc | 258
-rw-r--r-- src/librbd/object_map/DiffRequest.h | 87
-rw-r--r-- src/librbd/object_map/InvalidateRequest.cc | 83
-rw-r--r-- src/librbd/object_map/InvalidateRequest.h | 45
-rw-r--r-- src/librbd/object_map/LockRequest.cc | 157
-rw-r--r-- src/librbd/object_map/LockRequest.h | 75
-rw-r--r-- src/librbd/object_map/RefreshRequest.cc | 311
-rw-r--r-- src/librbd/object_map/RefreshRequest.h | 102
-rw-r--r-- src/librbd/object_map/RemoveRequest.cc | 88
-rw-r--r-- src/librbd/object_map/RemoveRequest.h | 63
-rw-r--r-- src/librbd/object_map/Request.cc | 74
-rw-r--r-- src/librbd/object_map/Request.h | 66
-rw-r--r-- src/librbd/object_map/ResizeRequest.cc | 65
-rw-r--r-- src/librbd/object_map/ResizeRequest.h | 52
-rw-r--r-- src/librbd/object_map/SnapshotCreateRequest.cc | 147
-rw-r--r-- src/librbd/object_map/SnapshotCreateRequest.h | 80
-rw-r--r-- src/librbd/object_map/SnapshotRemoveRequest.cc | 227
-rw-r--r-- src/librbd/object_map/SnapshotRemoveRequest.h | 88
-rw-r--r-- src/librbd/object_map/SnapshotRollbackRequest.cc | 131
-rw-r--r-- src/librbd/object_map/SnapshotRollbackRequest.h | 74
-rw-r--r-- src/librbd/object_map/Types.h | 20
-rw-r--r-- src/librbd/object_map/UnlockRequest.cc | 66
-rw-r--r-- src/librbd/object_map/UnlockRequest.h | 47
-rw-r--r-- src/librbd/object_map/UpdateRequest.cc | 129
-rw-r--r-- src/librbd/object_map/UpdateRequest.h | 106
-rw-r--r-- src/librbd/operation/DisableFeaturesRequest.cc | 655
-rw-r--r-- src/librbd/operation/DisableFeaturesRequest.h | 171
-rw-r--r-- src/librbd/operation/EnableFeaturesRequest.cc | 494
-rw-r--r-- src/librbd/operation/EnableFeaturesRequest.h | 135
-rw-r--r-- src/librbd/operation/FlattenRequest.cc | 226
-rw-r--r-- src/librbd/operation/FlattenRequest.h | 73
-rw-r--r-- src/librbd/operation/MetadataRemoveRequest.cc | 60
-rw-r--r-- src/librbd/operation/MetadataRemoveRequest.h | 44
-rw-r--r-- src/librbd/operation/MetadataSetRequest.cc | 62
-rw-r--r-- src/librbd/operation/MetadataSetRequest.h | 47
-rw-r--r-- src/librbd/operation/MigrateRequest.cc | 238
-rw-r--r-- src/librbd/operation/MigrateRequest.h | 68
-rw-r--r-- src/librbd/operation/ObjectMapIterate.cc | 308
-rw-r--r-- src/librbd/operation/ObjectMapIterate.h | 65
-rw-r--r-- src/librbd/operation/RebuildObjectMapRequest.cc | 250
-rw-r--r-- src/librbd/operation/RebuildObjectMapRequest.h | 84
-rw-r--r-- src/librbd/operation/RenameRequest.cc | 257
-rw-r--r-- src/librbd/operation/RenameRequest.h | 95
-rw-r--r-- src/librbd/operation/Request.cc | 183
-rw-r--r-- src/librbd/operation/Request.h | 107
-rw-r--r-- src/librbd/operation/ResizeRequest.cc | 466
-rw-r--r-- src/librbd/operation/ResizeRequest.h | 156
-rw-r--r-- src/librbd/operation/SnapshotCreateRequest.cc | 449
-rw-r--r-- src/librbd/operation/SnapshotCreateRequest.h | 148
-rw-r--r-- src/librbd/operation/SnapshotLimitRequest.cc | 66
-rw-r--r-- src/librbd/operation/SnapshotLimitRequest.h | 44
-rw-r--r-- src/librbd/operation/SnapshotProtectRequest.cc | 118
-rw-r--r-- src/librbd/operation/SnapshotProtectRequest.h | 68
-rw-r--r-- src/librbd/operation/SnapshotRemoveRequest.cc | 505
-rw-r--r-- src/librbd/operation/SnapshotRemoveRequest.h | 128
-rw-r--r-- src/librbd/operation/SnapshotRenameRequest.cc | 102
-rw-r--r-- src/librbd/operation/SnapshotRenameRequest.h | 63
-rw-r--r-- src/librbd/operation/SnapshotRollbackRequest.cc | 424
-rw-r--r-- src/librbd/operation/SnapshotRollbackRequest.h | 122
-rw-r--r-- src/librbd/operation/SnapshotUnprotectRequest.cc | 353
-rw-r--r-- src/librbd/operation/SnapshotUnprotectRequest.h | 94
-rw-r--r-- src/librbd/operation/SparsifyRequest.cc | 514
-rw-r--r-- src/librbd/operation/SparsifyRequest.h | 64
-rw-r--r-- src/librbd/operation/TrimRequest.cc | 373
-rw-r--r-- src/librbd/operation/TrimRequest.h | 107
-rw-r--r-- src/librbd/plugin/Api.cc | 92
-rw-r--r-- src/librbd/plugin/Api.h | 84
-rw-r--r-- src/librbd/plugin/ParentCache.cc | 81
-rw-r--r-- src/librbd/plugin/ParentCache.h | 38
-rw-r--r-- src/librbd/plugin/Types.h | 45
-rw-r--r-- src/librbd/plugin/WriteLogImageCache.cc | 104
-rw-r--r-- src/librbd/plugin/WriteLogImageCache.h | 53
-rw-r--r-- src/librbd/trash/MoveRequest.cc | 126
-rw-r--r-- src/librbd/trash/MoveRequest.h | 87
-rw-r--r-- src/librbd/trash/RemoveRequest.cc | 170
-rw-r--r-- src/librbd/trash/RemoveRequest.h | 118
-rw-r--r-- src/librbd/trash_watcher/Types.cc | 130
-rw-r--r-- src/librbd/trash_watcher/Types.h | 97
-rw-r--r-- src/librbd/watcher/Notifier.cc | 99
-rw-r--r-- src/librbd/watcher/Notifier.h | 64
-rw-r--r-- src/librbd/watcher/RewatchRequest.cc | 108
-rw-r--r-- src/librbd/watcher/RewatchRequest.h | 75
-rw-r--r-- src/librbd/watcher/Types.cc | 45
-rw-r--r-- src/librbd/watcher/Types.h | 71
-rw-r--r-- src/librbd/watcher/Utils.h | 74
484 files changed, 109043 insertions, 0 deletions
diff --git a/src/librbd/AsioEngine.cc b/src/librbd/AsioEngine.cc
new file mode 100644
index 000000000..1a46b5904
--- /dev/null
+++ b/src/librbd/AsioEngine.cc
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/AsioEngine.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "include/neorados/RADOS.hpp"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "librbd/asio/ContextWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::AsioEngine: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+
+AsioEngine::AsioEngine(std::shared_ptr<librados::Rados> rados)
+ : m_rados_api(std::make_shared<neorados::RADOS>(
+ neorados::RADOS::make_with_librados(*rados))),
+ m_cct(m_rados_api->cct()),
+ m_io_context(m_rados_api->get_io_context()),
+ m_api_strand(std::make_unique<boost::asio::io_context::strand>(
+ m_io_context)),
+ m_context_wq(std::make_unique<asio::ContextWQ>(m_cct, m_io_context)) {
+ ldout(m_cct, 20) << dendl;
+
+ auto rados_threads = m_cct->_conf.get_val<uint64_t>("librados_thread_count");
+ auto rbd_threads = m_cct->_conf.get_val<uint64_t>("rbd_op_threads");
+ if (rbd_threads > rados_threads) {
+ // inherit the librados thread count -- but increase it if librbd wants to
+ // utilize more threads
+ m_cct->_conf.set_val("librados_thread_count", stringify(rbd_threads));
+ }
+}
+
+AsioEngine::AsioEngine(librados::IoCtx& io_ctx)
+ : AsioEngine(std::make_shared<librados::Rados>(io_ctx)) {
+}
+
+AsioEngine::~AsioEngine() {
+ ldout(m_cct, 20) << dendl;
+ m_api_strand.reset();
+}
+
+void AsioEngine::dispatch(Context* ctx, int r) {
+ dispatch([ctx, r]() { ctx->complete(r); });
+}
+
+void AsioEngine::post(Context* ctx, int r) {
+ post([ctx, r]() { ctx->complete(r); });
+}
+
+} // namespace librbd
diff --git a/src/librbd/AsioEngine.h b/src/librbd/AsioEngine.h
new file mode 100644
index 000000000..0f476d80b
--- /dev/null
+++ b/src/librbd/AsioEngine.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_ASIO_ENGINE_H
+#define CEPH_LIBRBD_ASIO_ENGINE_H
+
+#include "include/common_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include <memory>
+#include <boost/asio/dispatch.hpp>
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/io_context_strand.hpp>
+#include <boost/asio/post.hpp>
+
+struct Context;
+namespace neorados { struct RADOS; }
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+class AsioEngine {
+public:
+ explicit AsioEngine(std::shared_ptr<librados::Rados> rados);
+ explicit AsioEngine(librados::IoCtx& io_ctx);
+ ~AsioEngine();
+
+ AsioEngine(AsioEngine&&) = delete;
+ AsioEngine(const AsioEngine&) = delete;
+ AsioEngine& operator=(const AsioEngine&) = delete;
+
+ inline neorados::RADOS& get_rados_api() {
+ return *m_rados_api;
+ }
+
+ inline boost::asio::io_context& get_io_context() {
+ return m_io_context;
+ }
+ inline operator boost::asio::io_context&() {
+ return m_io_context;
+ }
+
+ using executor_type = boost::asio::io_context::executor_type;
+ inline executor_type get_executor() {
+ return m_io_context.get_executor();
+ }
+
+ inline boost::asio::io_context::strand& get_api_strand() {
+ // API client callbacks should never fire concurrently
+ return *m_api_strand;
+ }
+
+ inline asio::ContextWQ* get_work_queue() {
+ return m_context_wq.get();
+ }
+
+ template <typename T>
+ void dispatch(T&& t) {
+ boost::asio::dispatch(m_io_context, std::forward<T>(t));
+ }
+ void dispatch(Context* ctx, int r);
+
+ template <typename T>
+ void post(T&& t) {
+ boost::asio::post(m_io_context, std::forward<T>(t));
+ }
+ void post(Context* ctx, int r);
+
+private:
+ std::shared_ptr<neorados::RADOS> m_rados_api;
+ CephContext* m_cct;
+
+ boost::asio::io_context& m_io_context;
+ std::unique_ptr<boost::asio::io_context::strand> m_api_strand;
+ std::unique_ptr<asio::ContextWQ> m_context_wq;
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_ASIO_ENGINE_H
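A minimal usage sketch for the engine above (illustrative only, not part of
this commit; PrintContext and example() are made-up names). post() always
defers the handler to an io_context thread, while dispatch() may run it
inline when the caller is already executing on one of the engine's threads:

  // assumes an in-tree librbd build
  #include "librbd/AsioEngine.h"
  #include "include/Context.h"
  #include "include/rados/librados.hpp"
  #include <iostream>

  // trivial Context: finish() receives the completion code
  struct PrintContext : public Context {
    void finish(int r) override {
      std::cout << "completed with r=" << r << std::endl;
    }
  };

  void example(librados::IoCtx& io_ctx) {
    librbd::AsioEngine asio_engine(io_ctx);

    // Context overload: queues ctx->complete(0) onto the io_context
    asio_engine.post(new PrintContext(), 0);

    // template overload: any callable can be dispatched
    asio_engine.dispatch([]() {
      // may run inline if already on an engine thread
    });
  }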
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
new file mode 100644
index 000000000..6adba2166
--- /dev/null
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "librbd/AsyncObjectThrottle.h"
+#include "common/RWLock.h"
+#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+
+namespace librbd
+{
+
+template <typename T>
+AsyncObjectThrottle<T>::AsyncObjectThrottle(
+ const AsyncRequest<T>* async_request, T &image_ctx,
+ const ContextFactory& context_factory, Context *ctx,
+ ProgressContext *prog_ctx, uint64_t object_no, uint64_t end_object_no)
+ : m_lock(ceph::make_mutex(
+ util::unique_lock_name("librbd::AsyncThrottle::m_lock", this))),
+ m_async_request(async_request), m_image_ctx(image_ctx),
+ m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx),
+ m_object_no(object_no), m_end_object_no(end_object_no), m_current_ops(0),
+ m_ret(0)
+{
+}
+
+template <typename T>
+void AsyncObjectThrottle<T>::start_ops(uint64_t max_concurrent) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ bool complete;
+ {
+ std::lock_guard l{m_lock};
+ for (uint64_t i = 0; i < max_concurrent; ++i) {
+ start_next_op();
+ if (m_ret < 0 && m_current_ops == 0) {
+ break;
+ }
+ }
+ complete = (m_current_ops == 0);
+ }
+ if (complete) {
+ // avoid re-entrant callback
+ m_image_ctx.op_work_queue->queue(m_ctx, m_ret);
+ delete this;
+ }
+}
+
+template <typename T>
+void AsyncObjectThrottle<T>::finish_op(int r) {
+ bool complete;
+ {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::lock_guard locker{m_lock};
+ --m_current_ops;
+ if (r < 0 && r != -ENOENT && m_ret == 0) {
+ m_ret = r;
+ }
+
+ start_next_op();
+ complete = (m_current_ops == 0);
+ }
+ if (complete) {
+ m_ctx->complete(m_ret);
+ delete this;
+ }
+}
+
+template <typename T>
+void AsyncObjectThrottle<T>::start_next_op() {
+ bool done = false;
+ while (!done) {
+ if (m_async_request != NULL && m_async_request->is_canceled() &&
+ m_ret == 0) {
+ // allow in-flight ops to complete, but don't start new ops
+ m_ret = -ERESTART;
+ return;
+ } else if (m_ret != 0 || m_object_no >= m_end_object_no) {
+ return;
+ }
+
+ uint64_t ono = m_object_no++;
+ C_AsyncObjectThrottle<T> *ctx = m_context_factory(*this, ono);
+
+ int r = ctx->send();
+ if (r < 0) {
+ m_ret = r;
+ delete ctx;
+ return;
+ } else if (r > 0) {
+ // op completed immediately
+ delete ctx;
+ } else {
+ ++m_current_ops;
+ done = true;
+ }
+ if (m_prog_ctx != NULL) {
+ r = m_prog_ctx->update_progress(ono, m_end_object_no);
+ if (r < 0) {
+ m_ret = r;
+ }
+ }
+ }
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h
new file mode 100644
index 000000000..64397f9e4
--- /dev/null
+++ b/src/librbd/AsyncObjectThrottle.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
+#define CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+
+#include <boost/function.hpp>
+
+namespace librbd
+{
+template <typename ImageCtxT> class AsyncRequest;
+class ProgressContext;
+struct ImageCtx;
+
+class AsyncObjectThrottleFinisher {
+public:
+ virtual ~AsyncObjectThrottleFinisher() {};
+ virtual void finish_op(int r) = 0;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class C_AsyncObjectThrottle : public Context {
+public:
+ C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher,
+ ImageCtxT &image_ctx)
+ : m_image_ctx(image_ctx), m_finisher(finisher) {
+ }
+
+ virtual int send() = 0;
+
+protected:
+ ImageCtxT &m_image_ctx;
+
+ void finish(int r) override {
+ m_finisher.finish_op(r);
+ }
+
+private:
+ AsyncObjectThrottleFinisher &m_finisher;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class AsyncObjectThrottle : public AsyncObjectThrottleFinisher {
+public:
+ typedef boost::function<
+ C_AsyncObjectThrottle<ImageCtxT>* (AsyncObjectThrottle&,
+ uint64_t)> ContextFactory;
+
+ AsyncObjectThrottle(const AsyncRequest<ImageCtxT> *async_request,
+ ImageCtxT &image_ctx,
+ const ContextFactory& context_factory, Context *ctx,
+ ProgressContext *prog_ctx, uint64_t object_no,
+ uint64_t end_object_no);
+
+ void start_ops(uint64_t max_concurrent);
+ void finish_op(int r) override;
+
+private:
+ ceph::mutex m_lock;
+ const AsyncRequest<ImageCtxT> *m_async_request;
+ ImageCtxT &m_image_ctx;
+ ContextFactory m_context_factory;
+ Context *m_ctx;
+ ProgressContext *m_prog_ctx;
+ uint64_t m_object_no;
+ uint64_t m_end_object_no;
+ uint64_t m_current_ops;
+ int m_ret;
+
+ void start_next_op();
+};
+
+} // namespace librbd
+
+extern template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
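A hedged sketch of how the throttle is driven (illustrative only, not part
of this commit; C_TouchObject and touch_all_objects are made-up names). The
factory builds one C_AsyncObjectThrottle per object number; send() returns
0 for in-flight, >0 for completed-inline, or <0 on error, and the throttle
deletes itself once every op has finished:

  #include "librbd/AsyncObjectThrottle.h"
  #include "librbd/ImageCtx.h"
  #include <shared_mutex>

  template <typename I>
  struct C_TouchObject : public librbd::C_AsyncObjectThrottle<I> {
    uint64_t object_no;

    C_TouchObject(librbd::AsyncObjectThrottleFinisher& finisher,
                  I& image_ctx, uint64_t object_no)
      : librbd::C_AsyncObjectThrottle<I>(finisher, image_ctx),
        object_no(object_no) {
    }

    int send() override {
      // a real op would run asynchronously and later call complete(r),
      // which routes back into finish_op() on the throttle
      return 1;  // pretend the op completed inline
    }
  };

  template <typename I>
  void touch_all_objects(I& image_ctx, Context* on_finish,
                         uint64_t num_objects) {
    typename librbd::AsyncObjectThrottle<I>::ContextFactory factory =
      [&image_ctx](librbd::AsyncObjectThrottle<I>& throttle, uint64_t ono) {
        return new C_TouchObject<I>(throttle, image_ctx, ono);
      };

    // no parent AsyncRequest, no progress reporting
    auto throttle = new librbd::AsyncObjectThrottle<I>(
      nullptr, image_ctx, factory, on_finish, nullptr, 0, num_objects);

    // start_ops() asserts that the caller holds owner_lock
    std::shared_lock owner_locker{image_ctx.owner_lock};
    throttle->start_ops(8);  // at most 8 concurrent ops
  }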
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
new file mode 100644
index 000000000..c189613d0
--- /dev/null
+++ b/src/librbd/AsyncRequest.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+
+namespace librbd
+{
+
+template <typename T>
+AsyncRequest<T>::AsyncRequest(T &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish), m_canceled(false),
+ m_xlist_item(this) {
+ ceph_assert(m_on_finish != NULL);
+ start_request();
+}
+
+template <typename T>
+AsyncRequest<T>::~AsyncRequest() {
+}
+
+template <typename T>
+void AsyncRequest<T>::async_complete(int r) {
+ m_image_ctx.op_work_queue->queue(create_callback_context(), r);
+}
+
+template <typename T>
+librados::AioCompletion *AsyncRequest<T>::create_callback_completion() {
+ return util::create_rados_callback(this);
+}
+
+template <typename T>
+Context *AsyncRequest<T>::create_callback_context() {
+ return util::create_context_callback(this);
+}
+
+template <typename T>
+Context *AsyncRequest<T>::create_async_callback_context() {
+ return util::create_context_callback<AsyncRequest<T>,
+ &AsyncRequest<T>::async_complete>(this);
+}
+
+template <typename T>
+void AsyncRequest<T>::start_request() {
+ std::lock_guard async_ops_locker{m_image_ctx.async_ops_lock};
+ m_image_ctx.async_requests.push_back(&m_xlist_item);
+}
+
+template <typename T>
+void AsyncRequest<T>::finish_request() {
+ decltype(m_image_ctx.async_requests_waiters) waiters;
+ {
+ std::lock_guard async_ops_locker{m_image_ctx.async_ops_lock};
+ ceph_assert(m_xlist_item.remove_myself());
+
+ if (m_image_ctx.async_requests.empty()) {
+ waiters = std::move(m_image_ctx.async_requests_waiters);
+ }
+ }
+
+ for (auto ctx : waiters) {
+ ctx->complete(0);
+ }
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::AsyncRequest<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h
new file mode 100644
index 000000000..f74368dc6
--- /dev/null
+++ b/src/librbd/AsyncRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_ASYNC_REQUEST_H
+#define CEPH_LIBRBD_ASYNC_REQUEST_H
+
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "include/xlist.h"
+#include "include/compat.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+template <typename ImageCtxT = ImageCtx>
+class AsyncRequest
+{
+public:
+ AsyncRequest(ImageCtxT &image_ctx, Context *on_finish);
+ virtual ~AsyncRequest();
+
+ void complete(int r) {
+ if (should_complete(r)) {
+ r = filter_return_code(r);
+ finish_and_destroy(r);
+ }
+ }
+
+ virtual void send() = 0;
+
+ inline bool is_canceled() const {
+ return m_canceled;
+ }
+ inline void cancel() {
+ m_canceled = true;
+ }
+
+protected:
+ ImageCtxT &m_image_ctx;
+
+ librados::AioCompletion *create_callback_completion();
+ Context *create_callback_context();
+ Context *create_async_callback_context();
+
+ void async_complete(int r);
+
+ virtual bool should_complete(int r) = 0;
+ virtual int filter_return_code(int r) const {
+ return r;
+ }
+
+ // NOTE: temporary until converted to new state machine format
+ virtual void finish_and_destroy(int r) {
+ finish(r);
+ delete this;
+ }
+
+ virtual void finish(int r) {
+ finish_request();
+ m_on_finish->complete(r);
+ }
+
+private:
+ Context *m_on_finish;
+ bool m_canceled;
+ typename xlist<AsyncRequest<ImageCtxT> *>::item m_xlist_item;
+
+ void start_request();
+ void finish_request();
+};
+
+} // namespace librbd
+
+extern template class librbd::AsyncRequest<librbd::ImageCtx>;
+
+#endif //CEPH_LIBRBD_ASYNC_REQUEST_H
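A sketch of the subclass contract (illustrative only, not part of this
commit; NoOpRequest is a made-up name). complete(r) consults
should_complete() after every async step; once it returns true the request
runs the user callback and deletes itself:

  #include "librbd/AsyncRequest.h"
  #include "librbd/ImageCtx.h"
  #include "librbd/asio/ContextWQ.h"

  template <typename I>
  class NoOpRequest : public librbd::AsyncRequest<I> {
  public:
    NoOpRequest(I& image_ctx, Context* on_finish)
      : librbd::AsyncRequest<I>(image_ctx, on_finish) {
    }

    void send() override {
      // kick off a (stand-in) async step whose callback funnels into
      // complete(r) via the helper context
      Context* ctx = this->create_callback_context();
      this->m_image_ctx.op_work_queue->queue(ctx, 0);
    }

  protected:
    bool should_complete(int r) override {
      return true;  // single-step request: done after the first callback
    }
  };

  // usage -- the request manages its own lifetime:
  //   auto req = new NoOpRequest<librbd::ImageCtx>(*image_ctx, on_finish);
  //   req->send();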
diff --git a/src/librbd/BlockGuard.h b/src/librbd/BlockGuard.h
new file mode 100644
index 000000000..2474e3c02
--- /dev/null
+++ b/src/librbd/BlockGuard.h
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_BLOCK_GUARD_H
+#define CEPH_LIBRBD_IO_BLOCK_GUARD_H
+
+#include "include/int_types.h"
+#include "common/dout.h"
+#include "common/ceph_mutex.h"
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/set.hpp>
+#include <deque>
+#include <list>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::BlockGuard: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+struct BlockExtent {
+ // [block_start, block_end)
+ uint64_t block_start = 0;
+ uint64_t block_end = 0;
+
+ BlockExtent() {
+ }
+ BlockExtent(uint64_t block_start, uint64_t block_end)
+ : block_start(block_start), block_end(block_end) {
+ }
+
+ friend ostream& operator<< (ostream& os, const BlockExtent& block_extent) {
+ os << "[block_start = " << block_extent.block_start << ", "
+ << "block_end = " << block_extent.block_end << ")";
+ return os;
+ }
+};
+
+struct BlockGuardCell {
+};
+
+/**
+ * Helper class to restrict and order concurrent IO to the same block. The
+ * definition of a block is dependent upon the user of this class. It might
+ * represent a backing object, 512 byte sectors, etc.
+ */
+template <typename BlockOperation>
+class BlockGuard {
+private:
+ struct DetainedBlockExtent;
+
+public:
+ typedef std::list<BlockOperation> BlockOperations;
+
+ BlockGuard(CephContext *cct)
+ : m_cct(cct) {
+ }
+
+ BlockGuard(const BlockGuard&) = delete;
+ BlockGuard &operator=(const BlockGuard&) = delete;
+
+ /**
+ * Detain future IO for a range of blocks. the guard will keep
+ * ownership of the provided operation if the operation is blocked.
+ * @return 0 upon success and IO can be issued
+ * >0 if the IO is blocked,
+ * <0 upon error
+ */
+ int detain(const BlockExtent &block_extent, BlockOperation *block_operation,
+ BlockGuardCell **cell) {
+ std::lock_guard locker{m_lock};
+ ldout(m_cct, 20) << block_extent << ", "
+ << "free_slots=" << m_free_detained_block_extents.size()
+ << dendl;
+
+ DetainedBlockExtent *detained_block_extent;
+ auto it = m_detained_block_extents.find(block_extent);
+ if (it != m_detained_block_extents.end()) {
+ // request against an already detained block
+ detained_block_extent = &(*it);
+ if (block_operation != nullptr) {
+ detained_block_extent->block_operations.emplace_back(
+ std::move(*block_operation));
+ }
+
+ // alert the caller that the IO was detained
+ *cell = nullptr;
+ return detained_block_extent->block_operations.size();
+ } else {
+ if (!m_free_detained_block_extents.empty()) {
+ detained_block_extent = &m_free_detained_block_extents.front();
+ detained_block_extent->block_operations.clear();
+ m_free_detained_block_extents.pop_front();
+ } else {
+ ldout(m_cct, 20) << "no free detained block cells" << dendl;
+ m_detained_block_extent_pool.emplace_back();
+ detained_block_extent = &m_detained_block_extent_pool.back();
+ }
+
+ detained_block_extent->block_extent = block_extent;
+ m_detained_block_extents.insert(*detained_block_extent);
+ *cell = reinterpret_cast<BlockGuardCell*>(detained_block_extent);
+ return 0;
+ }
+ }
+
+ /**
+ * Release any detained IO operations from the provided cell.
+ */
+ void release(BlockGuardCell *cell, BlockOperations *block_operations) {
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(cell != nullptr);
+ auto &detained_block_extent = reinterpret_cast<DetainedBlockExtent &>(
+ *cell);
+ ldout(m_cct, 20) << detained_block_extent.block_extent << ", "
+ << "pending_ops="
+ << detained_block_extent.block_operations.size()
+ << dendl;
+
+ *block_operations = std::move(detained_block_extent.block_operations);
+ m_detained_block_extents.erase(detained_block_extent.block_extent);
+ m_free_detained_block_extents.push_back(detained_block_extent);
+ }
+
+private:
+ struct DetainedBlockExtent : public boost::intrusive::list_base_hook<>,
+ public boost::intrusive::set_base_hook<> {
+ BlockExtent block_extent;
+ BlockOperations block_operations;
+ };
+
+ struct DetainedBlockExtentKey {
+ typedef BlockExtent type;
+ const BlockExtent &operator()(const DetainedBlockExtent &value) {
+ return value.block_extent;
+ }
+ };
+
+ struct DetainedBlockExtentCompare {
+ bool operator()(const BlockExtent &lhs,
+ const BlockExtent &rhs) const {
+ // check for range overlap (lhs < rhs)
+ if (lhs.block_end <= rhs.block_start) {
+ return true;
+ }
+ return false;
+ }
+ };
+
+ typedef std::deque<DetainedBlockExtent> DetainedBlockExtentsPool;
+ typedef boost::intrusive::list<DetainedBlockExtent> DetainedBlockExtents;
+ typedef boost::intrusive::set<
+ DetainedBlockExtent,
+ boost::intrusive::compare<DetainedBlockExtentCompare>,
+ boost::intrusive::key_of_value<DetainedBlockExtentKey> >
+ BlockExtentToDetainedBlockExtents;
+
+ CephContext *m_cct;
+
+ ceph::mutex m_lock = ceph::make_mutex("librbd::BlockGuard::m_lock");
+ DetainedBlockExtentsPool m_detained_block_extent_pool;
+ DetainedBlockExtents m_free_detained_block_extents;
+ BlockExtentToDetainedBlockExtents m_detained_block_extents;
+
+};
+
+} // namespace librbd
+
+#undef dout_subsys
+#undef dout_prefix
+#define dout_prefix *_dout
+
+#endif // CEPH_LIBRBD_IO_BLOCK_GUARD_H
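A usage sketch for the guard (illustrative only, not part of this commit;
IOOperation and example() are made-up names). The first writer to a block
range gets a cell and proceeds; overlapping writers are queued against that
cell until release() hands them back:

  #include "librbd/BlockGuard.h"
  #include "common/ceph_context.h"
  #include <functional>

  using IOOperation = std::function<void()>;
  using IOBlockGuard = librbd::BlockGuard<IOOperation>;

  void example(CephContext* cct) {
    IOBlockGuard block_guard(cct);

    // first writer to blocks [0, 4): detain() returns 0 and yields a cell
    librbd::BlockGuardCell* cell = nullptr;
    IOOperation first_op = []() { /* issue the IO */ };
    int r = block_guard.detain({0, 4}, &first_op, &cell);
    // r == 0, cell != nullptr -- the IO can be issued now

    // overlapping writer to [2, 6): queued behind the existing cell;
    // detain() returns the number of queued ops and leaves cell2 null
    librbd::BlockGuardCell* cell2 = nullptr;
    IOOperation second_op = []() { /* deferred IO */ };
    r = block_guard.detain({2, 6}, &second_op, &cell2);
    // r == 1, cell2 == nullptr

    // when the first IO completes, collect and restart the blocked ops
    IOBlockGuard::BlockOperations blocked_ops;
    block_guard.release(cell, &blocked_ops);
    for (auto& op : blocked_ops) {
      op();
    }
  }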
diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt
new file mode 100644
index 000000000..a41d85d04
--- /dev/null
+++ b/src/librbd/CMakeLists.txt
@@ -0,0 +1,355 @@
+set(librbd_types_srcs
+ journal/Types.cc
+ mirroring_watcher/Types.cc
+ trash_watcher/Types.cc
+ watcher/Types.cc
+ WatchNotifyTypes.cc)
+
+if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE)
+ list(APPEND librbd_types_srcs cache/pwl/Types.cc)
+endif()
+
+add_library(rbd_types STATIC
+ ${librbd_types_srcs})
+
+if (WITH_RBD_RWL)
+ target_link_libraries(rbd_types
+ PUBLIC pmem::pmemobj)
+endif()
+
+set(librbd_internal_srcs
+ AsioEngine.cc
+ AsyncObjectThrottle.cc
+ AsyncRequest.cc
+ ConfigWatcher.cc
+ DeepCopyRequest.cc
+ ExclusiveLock.cc
+ ImageCtx.cc
+ ImageState.cc
+ ImageWatcher.cc
+ internal.cc
+ Journal.cc
+ LibrbdAdminSocketHook.cc
+ ManagedLock.cc
+ MirroringWatcher.cc
+ ObjectMap.cc
+ Operations.cc
+ PluginRegistry.cc
+ TrashWatcher.cc
+ Utils.cc
+ Watcher.cc
+ api/Config.cc
+ api/DiffIterate.cc
+ api/Group.cc
+ api/Image.cc
+ api/Io.cc
+ api/Migration.cc
+ api/Mirror.cc
+ api/Namespace.cc
+ api/Pool.cc
+ api/PoolMetadata.cc
+ api/Snapshot.cc
+ api/Trash.cc
+ api/Utils.cc
+ asio/ContextWQ.cc
+ cache/ImageWriteback.cc
+ cache/ObjectCacherObjectDispatch.cc
+ cache/ObjectCacherWriteback.cc
+ cache/WriteAroundObjectDispatch.cc
+ crypto/BlockCrypto.cc
+ crypto/CryptoContextPool.cc
+ crypto/CryptoImageDispatch.cc
+ crypto/CryptoObjectDispatch.cc
+ crypto/FormatRequest.cc
+ crypto/LoadRequest.cc
+ crypto/ShutDownCryptoRequest.cc
+ crypto/Utils.cc
+ crypto/openssl/DataCryptor.cc
+ deep_copy/ImageCopyRequest.cc
+ deep_copy/MetadataCopyRequest.cc
+ deep_copy/ObjectCopyRequest.cc
+ deep_copy/SetHeadRequest.cc
+ deep_copy/SnapshotCopyRequest.cc
+ deep_copy/SnapshotCreateRequest.cc
+ deep_copy/Utils.cc
+ exclusive_lock/AutomaticPolicy.cc
+ exclusive_lock/ImageDispatch.cc
+ exclusive_lock/PreAcquireRequest.cc
+ exclusive_lock/PostAcquireRequest.cc
+ exclusive_lock/PreReleaseRequest.cc
+ exclusive_lock/StandardPolicy.cc
+ image/AttachChildRequest.cc
+ image/AttachParentRequest.cc
+ image/CloneRequest.cc
+ image/CloseRequest.cc
+ image/CreateRequest.cc
+ image/DetachChildRequest.cc
+ image/DetachParentRequest.cc
+ image/GetMetadataRequest.cc
+ image/ListWatchersRequest.cc
+ image/OpenRequest.cc
+ image/PreRemoveRequest.cc
+ image/RefreshParentRequest.cc
+ image/RefreshRequest.cc
+ image/RemoveRequest.cc
+ image/SetFlagsRequest.cc
+ image/SetSnapRequest.cc
+ image/ValidatePoolRequest.cc
+ image_watcher/NotifyLockOwner.cc
+ io/AioCompletion.cc
+ io/AsyncOperation.cc
+ io/CopyupRequest.cc
+ io/FlushTracker.cc
+ io/ImageDispatch.cc
+ io/ImageDispatchSpec.cc
+ io/ImageDispatcher.cc
+ io/ImageRequest.cc
+ io/ObjectDispatch.cc
+ io/ObjectDispatchSpec.cc
+ io/ObjectDispatcher.cc
+ io/ObjectRequest.cc
+ io/QosImageDispatch.cc
+ io/QueueImageDispatch.cc
+ io/ReadResult.cc
+ io/RefreshImageDispatch.cc
+ io/SimpleSchedulerObjectDispatch.cc
+ io/Types.cc
+ io/Utils.cc
+ io/WriteBlockImageDispatch.cc
+ journal/CreateRequest.cc
+ journal/DemoteRequest.cc
+ journal/ObjectDispatch.cc
+ journal/OpenRequest.cc
+ journal/PromoteRequest.cc
+ journal/RemoveRequest.cc
+ journal/Replay.cc
+ journal/ResetRequest.cc
+ journal/StandardPolicy.cc
+ journal/Utils.cc
+ managed_lock/AcquireRequest.cc
+ managed_lock/BreakRequest.cc
+ managed_lock/GetLockerRequest.cc
+ managed_lock/ReacquireRequest.cc
+ managed_lock/ReleaseRequest.cc
+ managed_lock/Utils.cc
+ migration/FileStream.cc
+ migration/HttpClient.cc
+ migration/HttpStream.cc
+ migration/ImageDispatch.cc
+ migration/NativeFormat.cc
+ migration/OpenSourceImageRequest.cc
+ migration/QCOWFormat.cc
+ migration/RawFormat.cc
+ migration/RawSnapshot.cc
+ migration/S3Stream.cc
+ migration/SourceSpecBuilder.cc
+ migration/Utils.cc
+ mirror/DemoteRequest.cc
+ mirror/DisableRequest.cc
+ mirror/EnableRequest.cc
+ mirror/GetInfoRequest.cc
+ mirror/GetStatusRequest.cc
+ mirror/GetUuidRequest.cc
+ mirror/ImageRemoveRequest.cc
+ mirror/ImageStateUpdateRequest.cc
+ mirror/PromoteRequest.cc
+ mirror/snapshot/CreateNonPrimaryRequest.cc
+ mirror/snapshot/CreatePrimaryRequest.cc
+ mirror/snapshot/DemoteRequest.cc
+ mirror/snapshot/GetImageStateRequest.cc
+ mirror/snapshot/ImageMeta.cc
+ mirror/snapshot/PromoteRequest.cc
+ mirror/snapshot/RemoveImageStateRequest.cc
+ mirror/snapshot/SetImageStateRequest.cc
+ mirror/snapshot/Types.cc
+ mirror/snapshot/UnlinkPeerRequest.cc
+ mirror/snapshot/Utils.cc
+ mirror/snapshot/WriteImageStateRequest.cc
+ object_map/CreateRequest.cc
+ object_map/DiffRequest.cc
+ object_map/InvalidateRequest.cc
+ object_map/LockRequest.cc
+ object_map/RefreshRequest.cc
+ object_map/RemoveRequest.cc
+ object_map/Request.cc
+ object_map/ResizeRequest.cc
+ object_map/SnapshotCreateRequest.cc
+ object_map/SnapshotRemoveRequest.cc
+ object_map/SnapshotRollbackRequest.cc
+ object_map/UnlockRequest.cc
+ object_map/UpdateRequest.cc
+ operation/DisableFeaturesRequest.cc
+ operation/EnableFeaturesRequest.cc
+ operation/FlattenRequest.cc
+ operation/MetadataRemoveRequest.cc
+ operation/MetadataSetRequest.cc
+ operation/MigrateRequest.cc
+ operation/ObjectMapIterate.cc
+ operation/RebuildObjectMapRequest.cc
+ operation/RenameRequest.cc
+ operation/Request.cc
+ operation/ResizeRequest.cc
+ operation/SnapshotCreateRequest.cc
+ operation/SnapshotProtectRequest.cc
+ operation/SnapshotRemoveRequest.cc
+ operation/SnapshotRenameRequest.cc
+ operation/SnapshotRollbackRequest.cc
+ operation/SnapshotUnprotectRequest.cc
+ operation/SnapshotLimitRequest.cc
+ operation/SparsifyRequest.cc
+ operation/TrimRequest.cc
+ plugin/Api.cc
+ trash/MoveRequest.cc
+ trash/RemoveRequest.cc
+ watcher/Notifier.cc
+ watcher/RewatchRequest.cc
+ ${CMAKE_SOURCE_DIR}/src/common/ContextCompletion.cc)
+
+if(WITH_EVENTTRACE)
+ list(APPEND librbd_internal_srcs ../common/EventTrace.cc)
+endif()
+
+if(LINUX AND HAVE_LIBCRYPTSETUP)
+ list(APPEND librbd_internal_srcs
+ crypto/luks/EncryptionFormat.cc
+ crypto/luks/Header.cc
+ crypto/luks/FormatRequest.cc
+ crypto/luks/LoadRequest.cc)
+endif()
+
+add_library(rbd_api STATIC librbd.cc)
+add_library(rbd_internal STATIC
+ ${librbd_internal_srcs}
+ $<TARGET_OBJECTS:rados_snap_set_diff_obj>)
+if(WITH_LTTNG)
+ # librbd.cc includes tracing/librbd.h
+ add_dependencies(rbd_api librbd-tp)
+ # io/AioCompletion.cc includes tracing/librbd.h
+ add_dependencies(rbd_internal librbd-tp)
+endif()
+if(WITH_EVENTTRACE)
+ add_dependencies(rbd_internal eventtrace_tp)
+endif()
+target_link_libraries(rbd_internal PRIVATE
+ osdc rbd_types
+ OpenSSL::SSL)
+target_include_directories(rbd_internal PRIVATE ${OPENSSL_INCLUDE_DIR})
+if(LINUX AND HAVE_LIBCRYPTSETUP)
+ target_include_directories(rbd_internal PRIVATE ${LIBCRYPTSETUP_INCLUDE_DIR})
+ target_link_libraries(rbd_internal PRIVATE ${LIBCRYPTSETUP_LIBRARIES})
+endif()
+
+add_custom_target(librbd_plugins)
+set(librbd_plugins_dir ${CEPH_INSTALL_PKGLIBDIR}/librbd)
+
+set(rbd_plugin_parent_cache_srcs
+ cache/ParentCacheObjectDispatch.cc
+ plugin/ParentCache.cc)
+add_library(librbd_plugin_parent_cache SHARED
+ ${rbd_plugin_parent_cache_srcs})
+target_link_libraries(librbd_plugin_parent_cache PRIVATE
+ ceph_immutable_object_cache_lib ceph-common librbd
+ libneorados
+ librados)
+set_target_properties(librbd_plugin_parent_cache PROPERTIES
+ OUTPUT_NAME ceph_librbd_parent_cache
+ VERSION 1.0.0
+ SOVERSION 1)
+install(TARGETS librbd_plugin_parent_cache DESTINATION ${librbd_plugins_dir})
+add_dependencies(librbd_plugins librbd_plugin_parent_cache)
+
+if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE)
+ set(rbd_plugin_pwl_srcs
+ cache/WriteLogImageDispatch.cc
+ cache/pwl/AbstractWriteLog.cc
+ cache/pwl/DiscardRequest.cc
+ cache/pwl/ImageCacheState.cc
+ cache/pwl/InitRequest.cc
+ cache/pwl/LogEntry.cc
+ cache/pwl/LogMap.cc
+ cache/pwl/LogOperation.cc
+ cache/pwl/Request.cc
+ cache/pwl/ShutdownRequest.cc
+ cache/pwl/SyncPoint.cc
+ cache/pwl/Types.cc
+ plugin/WriteLogImageCache.cc)
+
+ if(WITH_RBD_SSD_CACHE)
+ set(rbd_plugin_pwl_srcs
+ ${rbd_plugin_pwl_srcs}
+ cache/pwl/ssd/LogEntry.cc
+ cache/pwl/ssd/LogOperation.cc
+ cache/pwl/ssd/ReadRequest.cc
+ cache/pwl/ssd/Request.cc
+ cache/pwl/ssd/WriteLog.cc)
+ endif()
+ if(WITH_RBD_RWL)
+ set(rbd_plugin_pwl_srcs
+ ${rbd_plugin_pwl_srcs}
+ cache/pwl/rwl/WriteLog.cc
+ cache/pwl/rwl/LogEntry.cc
+ cache/pwl/rwl/LogOperation.cc
+ cache/pwl/rwl/ReadRequest.cc
+ cache/pwl/rwl/Request.cc)
+ endif()
+
+ add_library(librbd_plugin_pwl_cache SHARED
+ ${rbd_plugin_pwl_srcs})
+ target_link_libraries(librbd_plugin_pwl_cache PRIVATE
+ blk
+ ceph-common
+ cls_rbd_client
+ libneorados
+ librados
+ StdFilesystem::filesystem)
+
+ if(WITH_RBD_RWL)
+ target_link_libraries(librbd_plugin_pwl_cache
+ PUBLIC pmem::pmemobj
+ PRIVATE pmem::pmem)
+ endif()
+
+ set_target_properties(librbd_plugin_pwl_cache PROPERTIES
+ OUTPUT_NAME ceph_librbd_pwl_cache
+ VERSION 1.0.0
+ SOVERSION 1)
+ install(TARGETS librbd_plugin_pwl_cache DESTINATION ${librbd_plugins_dir})
+ add_dependencies(librbd_plugins librbd_plugin_pwl_cache)
+endif()
+
+add_library(librbd ${CEPH_SHARED}
+ librbd.cc)
+if(WITH_LTTNG)
+ add_dependencies(librbd librbd-tp)
+endif()
+
+target_link_libraries(librbd PRIVATE
+ rbd_internal
+ rbd_types
+ journal
+ cls_rbd_client
+ cls_lock_client
+ cls_journal_client
+ libneorados
+ librados
+ ceph-common
+ pthread
+ ${CMAKE_DL_LIBS}
+ ${EXTRALIBS} ${GSSAPI_LIBRARIES})
+if(HAVE_UDEV)
+ target_link_libraries(librbd PRIVATE
+ udev)
+endif()
+if(ENABLE_SHARED)
+ set_target_properties(librbd PROPERTIES
+ OUTPUT_NAME rbd
+ VERSION 1.16.0
+ SOVERSION 1
+ CXX_VISIBILITY_PRESET hidden
+ VISIBILITY_INLINES_HIDDEN ON)
+ if(NOT APPLE AND NOT WIN32)
+ set_property(TARGET librbd APPEND_STRING PROPERTY
+ LINK_FLAGS " -Wl,--exclude-libs,ALL")
+ endif()
+endif(ENABLE_SHARED)
+install(TARGETS librbd DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/src/librbd/ConfigWatcher.cc b/src/librbd/ConfigWatcher.cc
new file mode 100644
index 000000000..0e4127804
--- /dev/null
+++ b/src/librbd/ConfigWatcher.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ConfigWatcher.h"
+#include "common/config_obs.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/api/Config.h"
+#include <deque>
+#include <string>
+#include <vector>
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ConfigWatcher: " \
+ << __func__ << ": "
+
+namespace librbd {
+
+template <typename I>
+struct ConfigWatcher<I>::Observer : public md_config_obs_t {
+ ConfigWatcher<I>* m_config_watcher;
+
+ std::deque<std::string> m_config_key_strs;
+ mutable std::vector<const char*> m_config_keys;
+
+ Observer(CephContext* cct, ConfigWatcher<I>* config_watcher)
+ : m_config_watcher(config_watcher) {
+ const std::string rbd_key_prefix("rbd_");
+ auto& schema = cct->_conf.get_schema();
+ for (auto& pair : schema) {
+ // watch all "rbd_" keys for simplicity
+ if (!boost::starts_with(pair.first, rbd_key_prefix)) {
+ continue;
+ }
+
+ m_config_key_strs.emplace_back(pair.first);
+ }
+
+ m_config_keys.reserve(m_config_key_strs.size());
+ for (auto& key : m_config_key_strs) {
+ m_config_keys.emplace_back(key.c_str());
+ }
+ m_config_keys.emplace_back(nullptr);
+ }
+
+ const char** get_tracked_conf_keys() const override {
+ ceph_assert(!m_config_keys.empty());
+ return &m_config_keys[0];
+ }
+
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override {
+ m_config_watcher->handle_global_config_change(changed);
+ }
+};
+
+template <typename I>
+ConfigWatcher<I>::ConfigWatcher(I& image_ctx)
+ : m_image_ctx(image_ctx) {
+}
+
+template <typename I>
+ConfigWatcher<I>::~ConfigWatcher() {
+ ceph_assert(m_observer == nullptr);
+}
+
+template <typename I>
+void ConfigWatcher<I>::init() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ m_observer = new Observer(cct, this);
+ cct->_conf.add_observer(m_observer);
+}
+
+template <typename I>
+void ConfigWatcher<I>::shut_down() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ ceph_assert(m_observer != nullptr);
+ cct->_conf.remove_observer(m_observer);
+
+ delete m_observer;
+ m_observer = nullptr;
+}
+
+template <typename I>
+void ConfigWatcher<I>::handle_global_config_change(
+ std::set<std::string> changed_keys) {
+
+ {
+ // ignore any global changes that are being overridden
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ for (auto& key : m_image_ctx.config_overrides) {
+ changed_keys.erase(key);
+ }
+ }
+ if (changed_keys.empty()) {
+ return;
+ }
+
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 10) << "changed_keys=" << changed_keys << dendl;
+
+ // refresh the image to pick up any global config overrides
+ m_image_ctx.state->handle_update_notification();
+}
+
+} // namespace librbd
+
+template class librbd::ConfigWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/ConfigWatcher.h b/src/librbd/ConfigWatcher.h
new file mode 100644
index 000000000..1f10c8cb8
--- /dev/null
+++ b/src/librbd/ConfigWatcher.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CONFIG_WATCHER_H
+#define CEPH_LIBRBD_CONFIG_WATCHER_H
+
+#include <set>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+template <typename ImageCtxT>
+class ConfigWatcher {
+public:
+ static ConfigWatcher* create(ImageCtxT& image_ctx) {
+ return new ConfigWatcher(image_ctx);
+ }
+
+ ConfigWatcher(ImageCtxT& image_ctx);
+ ~ConfigWatcher();
+
+ ConfigWatcher(const ConfigWatcher&) = delete;
+ ConfigWatcher& operator=(const ConfigWatcher&) = delete;
+
+ void init();
+ void shut_down();
+
+private:
+ struct Observer;
+
+ ImageCtxT& m_image_ctx;
+
+ Observer* m_observer = nullptr;
+
+ void handle_global_config_change(std::set<std::string> changed);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ConfigWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CONFIG_WATCHER_H
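For context, a standalone md_config_obs_t observer mirroring the pattern
ConfigWatcher::Observer uses above (illustrative only, not part of this
commit; RbdCacheObserver is a made-up name and "rbd_cache" stands in for
any tracked key):

  #include "common/config_obs.h"
  #include "common/ceph_context.h"
  #include <set>
  #include <string>

  class RbdCacheObserver : public md_config_obs_t {
  public:
    const char** get_tracked_conf_keys() const override {
      // must be a nullptr-terminated array that outlives the observer
      static const char* keys[] = {"rbd_cache", nullptr};
      return keys;
    }

    void handle_conf_change(const ConfigProxy& conf,
                            const std::set<std::string>& changed) override {
      if (changed.count("rbd_cache")) {
        // react to the new value, e.g. conf.get_val<bool>("rbd_cache")
      }
    }
  };

  // registration, exactly as ConfigWatcher::init()/shut_down() do it:
  //   cct->_conf.add_observer(observer);
  //   ...
  //   cct->_conf.remove_observer(observer);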
diff --git a/src/librbd/DeepCopyRequest.cc b/src/librbd/DeepCopyRequest.cc
new file mode 100644
index 000000000..af26ef0c9
--- /dev/null
+++ b/src/librbd/DeepCopyRequest.cc
@@ -0,0 +1,361 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "DeepCopyRequest.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/ImageCopyRequest.h"
+#include "librbd/deep_copy/MetadataCopyRequest.h"
+#include "librbd/deep_copy/SnapshotCopyRequest.h"
+#include "librbd/internal.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DeepCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+
+using namespace librbd::deep_copy;
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+using librbd::util::unique_lock_name;
+
+template <typename I>
+DeepCopyRequest<I>::DeepCopyRequest(I *src_image_ctx, I *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten,
+ const ObjectNumber &object_number,
+ asio::ContextWQ *work_queue,
+ SnapSeqs *snap_seqs,
+ deep_copy::Handler *handler,
+ Context *on_finish)
+ : RefCountedObject(dst_image_ctx->cct), m_src_image_ctx(src_image_ctx),
+ m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start),
+ m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start),
+ m_flatten(flatten), m_object_number(object_number),
+ m_work_queue(work_queue), m_snap_seqs(snap_seqs), m_handler(handler),
+ m_on_finish(on_finish), m_cct(dst_image_ctx->cct),
+ m_lock(ceph::make_mutex(unique_lock_name("DeepCopyRequest::m_lock", this))) {
+}
+
+template <typename I>
+DeepCopyRequest<I>::~DeepCopyRequest() {
+ ceph_assert(m_snapshot_copy_request == nullptr);
+ ceph_assert(m_image_copy_request == nullptr);
+}
+
+template <typename I>
+void DeepCopyRequest<I>::send() {
+ if (!m_src_image_ctx->data_ctx.is_valid()) {
+ lderr(m_cct) << "missing data pool for source image" << dendl;
+ finish(-ENODEV);
+ return;
+ }
+
+ if (!m_dst_image_ctx->data_ctx.is_valid()) {
+ lderr(m_cct) << "missing data pool for destination image" << dendl;
+ finish(-ENODEV);
+ return;
+ }
+
+ int r = validate_copy_points();
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ send_copy_snapshots();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::cancel() {
+ std::lock_guard locker{m_lock};
+
+ ldout(m_cct, 20) << dendl;
+
+ m_canceled = true;
+
+ if (m_snapshot_copy_request != nullptr) {
+ m_snapshot_copy_request->cancel();
+ }
+
+ if (m_image_copy_request != nullptr) {
+ m_image_copy_request->cancel();
+ }
+}
+
+template <typename I>
+void DeepCopyRequest<I>::send_copy_snapshots() {
+ m_lock.lock();
+ if (m_canceled) {
+ m_lock.unlock();
+ finish(-ECANCELED);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ Context *ctx = create_context_callback<
+ DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_snapshots>(this);
+ m_snapshot_copy_request = SnapshotCopyRequest<I>::create(
+ m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_src_snap_id_end,
+ m_dst_snap_id_start, m_flatten, m_work_queue, m_snap_seqs, ctx);
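+  // hold a reference so the request stays valid for a concurrent cancel()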
+ m_snapshot_copy_request->get();
+ m_lock.unlock();
+
+ m_snapshot_copy_request->send();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_snapshots(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ m_snapshot_copy_request->put();
+ m_snapshot_copy_request = nullptr;
+ if (r == 0 && m_canceled) {
+ r = -ECANCELED;
+ }
+ }
+
+ if (r == -ECANCELED) {
+ ldout(m_cct, 10) << "snapshot copy canceled" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to copy snapshot metadata: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
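+  // copying through HEAD: map CEPH_NOSNAP to itself so the image copy
+  // also covers the head revision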
+ if (m_src_snap_id_end == CEPH_NOSNAP) {
+ (*m_snap_seqs)[CEPH_NOSNAP] = CEPH_NOSNAP;
+ }
+
+ send_copy_image();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::send_copy_image() {
+ m_lock.lock();
+ if (m_canceled) {
+ m_lock.unlock();
+ finish(-ECANCELED);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ Context *ctx = create_context_callback<
+ DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_image>(this);
+ m_image_copy_request = ImageCopyRequest<I>::create(
+ m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_src_snap_id_end,
+ m_dst_snap_id_start, m_flatten, m_object_number, *m_snap_seqs, m_handler,
+ ctx);
+ m_image_copy_request->get();
+ m_lock.unlock();
+
+ m_image_copy_request->send();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_image(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ m_image_copy_request->put();
+ m_image_copy_request = nullptr;
+ if (r == 0 && m_canceled) {
+ r = -ECANCELED;
+ }
+ }
+
+ if (r == -ECANCELED) {
+ ldout(m_cct, 10) << "image copy canceled" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to copy image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_copy_object_map();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::send_copy_object_map() {
+ m_dst_image_ctx->owner_lock.lock_shared();
+ m_dst_image_ctx->image_lock.lock_shared();
+
+ if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP,
+ m_dst_image_ctx->image_lock)) {
+ m_dst_image_ctx->image_lock.unlock_shared();
+ m_dst_image_ctx->owner_lock.unlock_shared();
+ send_copy_metadata();
+ return;
+ }
+ if (m_src_snap_id_end == CEPH_NOSNAP) {
+ m_dst_image_ctx->image_lock.unlock_shared();
+ m_dst_image_ctx->owner_lock.unlock_shared();
+ send_refresh_object_map();
+ return;
+ }
+
+ ceph_assert(m_dst_image_ctx->object_map != nullptr);
+
+ ldout(m_cct, 20) << dendl;
+
+ Context *finish_op_ctx = nullptr;
+  int r = -EROFS;  // default error if the exclusive lock is not held at all
+ if (m_dst_image_ctx->exclusive_lock != nullptr) {
+ finish_op_ctx = m_dst_image_ctx->exclusive_lock->start_op(&r);
+ }
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ m_dst_image_ctx->image_lock.unlock_shared();
+ m_dst_image_ctx->owner_lock.unlock_shared();
+ finish(r);
+ return;
+ }
+
+ // rollback the object map (copy snapshot object map to HEAD)
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_copy_object_map(r);
+ finish_op_ctx->complete(0);
+ });
+ ceph_assert(m_snap_seqs->count(m_src_snap_id_end) > 0);
+ librados::snap_t copy_snap_id = (*m_snap_seqs)[m_src_snap_id_end];
+ m_dst_image_ctx->object_map->rollback(copy_snap_id, ctx);
+ m_dst_image_ctx->image_lock.unlock_shared();
+ m_dst_image_ctx->owner_lock.unlock_shared();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_object_map(int r) {
+ ldout(m_cct, 20) << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to roll back object map: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ send_refresh_object_map();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::send_refresh_object_map() {
+  int r = -EROFS;  // default error if the exclusive lock is not held at all
+ Context *finish_op_ctx = nullptr;
+ {
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ if (m_dst_image_ctx->exclusive_lock != nullptr) {
+ finish_op_ctx = m_dst_image_ctx->exclusive_lock->start_op(&r);
+ }
+ }
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_refresh_object_map(r);
+ finish_op_ctx->complete(0);
+ });
+ m_object_map = m_dst_image_ctx->create_object_map(CEPH_NOSNAP);
+ m_object_map->open(ctx);
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_refresh_object_map(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to open object map: " << cpp_strerror(r)
+ << dendl;
+ delete m_object_map;
+
+ finish(r);
+ return;
+ }
+
+ {
+ std::unique_lock image_locker{m_dst_image_ctx->image_lock};
+ std::swap(m_dst_image_ctx->object_map, m_object_map);
+ }
+ m_object_map->put();
+
+ send_copy_metadata();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::send_copy_metadata() {
+ ldout(m_cct, 20) << dendl;
+
+ Context *ctx = create_context_callback<
+ DeepCopyRequest<I>, &DeepCopyRequest<I>::handle_copy_metadata>(this);
+ auto request = MetadataCopyRequest<I>::create(m_src_image_ctx,
+ m_dst_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void DeepCopyRequest<I>::handle_copy_metadata(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+int DeepCopyRequest<I>::validate_copy_points() {
+ std::shared_lock image_locker{m_src_image_ctx->image_lock};
+
+ if (m_src_snap_id_start != 0 &&
+ m_src_image_ctx->snap_info.find(m_src_snap_id_start) ==
+ m_src_image_ctx->snap_info.end()) {
+ lderr(m_cct) << "invalid start snap_id " << m_src_snap_id_start << dendl;
+ return -EINVAL;
+ }
+
+ if (m_src_snap_id_end != CEPH_NOSNAP &&
+ m_src_image_ctx->snap_info.find(m_src_snap_id_end) ==
+ m_src_image_ctx->snap_info.end()) {
+ lderr(m_cct) << "invalid end snap_id " << m_src_snap_id_end << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void DeepCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ put();
+}
+
+} // namespace librbd
+
+template class librbd::DeepCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/DeepCopyRequest.h b/src/librbd/DeepCopyRequest.h
new file mode 100644
index 000000000..c8bd02299
--- /dev/null
+++ b/src/librbd/DeepCopyRequest.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_REQUEST_H
+
+#include "common/ceph_mutex.h"
+#include "common/RefCountedObj.h"
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include "librbd/deep_copy/Types.h"
+
+#include <map>
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+namespace asio { struct ContextWQ; }
+
+namespace deep_copy {
+
+template <typename> class ImageCopyRequest;
+template <typename> class SnapshotCopyRequest;
+struct Handler;
+
+} // namespace deep_copy
+
+template <typename ImageCtxT = ImageCtx>
+class DeepCopyRequest : public RefCountedObject {
+public:
+ static DeepCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten,
+ const deep_copy::ObjectNumber &object_number,
+ asio::ContextWQ *work_queue,
+ SnapSeqs *snap_seqs,
+ deep_copy::Handler *handler,
+ Context *on_finish) {
+ return new DeepCopyRequest(src_image_ctx, dst_image_ctx, src_snap_id_start,
+ src_snap_id_end, dst_snap_id_start, flatten,
+ object_number, work_queue, snap_seqs, handler,
+ on_finish);
+ }
+
+ DeepCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten, const deep_copy::ObjectNumber &object_number,
+ asio::ContextWQ *work_queue, SnapSeqs *snap_seqs,
+ deep_copy::Handler *handler, Context *on_finish);
+ ~DeepCopyRequest();
+
+ void send();
+ void cancel();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * COPY_SNAPSHOTS
+ * |
+ * v
+ * COPY_IMAGE . . . . . . . . . . . . . .
+ * | .
+ * v .
+ * COPY_OBJECT_MAP (skip if object .
+ * | map disabled) .
+ * v .
+ * REFRESH_OBJECT_MAP (skip if object . (image copy canceled)
+ * | map disabled) .
+ * v .
+ * COPY_METADATA .
+ * | .
+ * v .
+ * <finish> < . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ typedef std::vector<librados::snap_t> SnapIds;
+ typedef std::map<librados::snap_t, SnapIds> SnapMap;
+
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ librados::snap_t m_src_snap_id_start;
+ librados::snap_t m_src_snap_id_end;
+ librados::snap_t m_dst_snap_id_start;
+ bool m_flatten;
+ deep_copy::ObjectNumber m_object_number;
+ asio::ContextWQ *m_work_queue;
+ SnapSeqs *m_snap_seqs;
+ deep_copy::Handler *m_handler;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ ceph::mutex m_lock;
+ bool m_canceled = false;
+
+ deep_copy::SnapshotCopyRequest<ImageCtxT> *m_snapshot_copy_request = nullptr;
+ deep_copy::ImageCopyRequest<ImageCtxT> *m_image_copy_request = nullptr;
+ decltype(ImageCtxT::object_map) m_object_map = nullptr;
+
+ void send_copy_snapshots();
+ void handle_copy_snapshots(int r);
+
+ void send_copy_image();
+ void handle_copy_image(int r);
+
+ void send_copy_object_map();
+ void handle_copy_object_map(int r);
+
+ void send_refresh_object_map();
+ void handle_refresh_object_map(int r);
+
+ void send_copy_metadata();
+ void handle_copy_metadata(int r);
+
+ int validate_copy_points();
+
+ void finish(int r);
+};
+
+} // namespace librbd
+
+extern template class librbd::DeepCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_REQUEST_H
diff --git a/src/librbd/ExclusiveLock.cc b/src/librbd/ExclusiveLock.cc
new file mode 100644
index 000000000..76945d847
--- /dev/null
+++ b/src/librbd/ExclusiveLock.cc
@@ -0,0 +1,388 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/Utils.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ImageState.h"
+#include "librbd/exclusive_lock/ImageDispatch.h"
+#include "librbd/exclusive_lock/PreAcquireRequest.h"
+#include "librbd/exclusive_lock/PostAcquireRequest.h"
+#include "librbd/exclusive_lock/PreReleaseRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "common/ceph_mutex.h"
+#include "common/dout.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ExclusiveLock: " << this << " " \
+ << __func__
+
+namespace librbd {
+
+using namespace exclusive_lock;
+using librbd::util::create_context_callback;
+
+template <typename I>
+using ML = ManagedLock<I>;
+
+template <typename I>
+ExclusiveLock<I>::ExclusiveLock(I &image_ctx)
+ : RefCountedObject(image_ctx.cct),
+ ML<I>(image_ctx.md_ctx, *image_ctx.asio_engine, image_ctx.header_oid,
+ image_ctx.image_watcher, managed_lock::EXCLUSIVE,
+ image_ctx.config.template get_val<bool>("rbd_blocklist_on_break_lock"),
+ image_ctx.config.template get_val<uint64_t>("rbd_blocklist_expire_seconds")),
+ m_image_ctx(image_ctx) {
+ std::lock_guard locker{ML<I>::m_lock};
+ ML<I>::set_state_uninitialized();
+}
+
+template <typename I>
+bool ExclusiveLock<I>::accept_request(OperationRequestType request_type,
+ int *ret_val) const {
+ std::lock_guard locker{ML<I>::m_lock};
+
+ bool accept_request =
+ (!ML<I>::is_state_shutdown() && ML<I>::is_state_locked() &&
+ (m_request_blocked_count == 0 ||
+ m_image_ctx.get_exclusive_lock_policy()->accept_blocked_request(
+ request_type)));
+ if (ret_val != nullptr) {
+ *ret_val = accept_request ? 0 : m_request_blocked_ret_val;
+ }
+
+ ldout(m_image_ctx.cct, 20) << "=" << accept_request << " (request_type="
+ << request_type << ")" << dendl;
+ return accept_request;
+}
+
+template <typename I>
+bool ExclusiveLock<I>::accept_ops() const {
+ std::lock_guard locker{ML<I>::m_lock};
+ bool accept = accept_ops(ML<I>::m_lock);
+ ldout(m_image_ctx.cct, 20) << "=" << accept << dendl;
+ return accept;
+}
+
+template <typename I>
+bool ExclusiveLock<I>::accept_ops(const ceph::mutex &lock) const {
+ return (!ML<I>::is_state_shutdown() &&
+ (ML<I>::is_state_locked() || ML<I>::is_state_post_acquiring()));
+}
+
+template <typename I>
+void ExclusiveLock<I>::set_require_lock(bool init_shutdown,
+ io::Direction direction,
+ Context* on_finish) {
+ m_image_dispatch->set_require_lock(init_shutdown, direction, on_finish);
+}
+
+template <typename I>
+void ExclusiveLock<I>::unset_require_lock(io::Direction direction) {
+ m_image_dispatch->unset_require_lock(direction);
+}
+
+template <typename I>
+void ExclusiveLock<I>::block_requests(int r) {
+ std::lock_guard locker{ML<I>::m_lock};
+
+ m_request_blocked_count++;
+ if (m_request_blocked_ret_val == 0) {
+ m_request_blocked_ret_val = r;
+ }
+
+ ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+}
+
+template <typename I>
+void ExclusiveLock<I>::unblock_requests() {
+ std::lock_guard locker{ML<I>::m_lock};
+
+ ceph_assert(m_request_blocked_count > 0);
+ m_request_blocked_count--;
+ if (m_request_blocked_count == 0) {
+ m_request_blocked_ret_val = 0;
+ }
+
+ ldout(m_image_ctx.cct, 20) << dendl;
+}
+
+template <typename I>
+int ExclusiveLock<I>::get_unlocked_op_error() const {
+ if (m_image_ctx.image_watcher->is_blocklisted()) {
+ return -EBLOCKLISTED;
+ }
+ return -EROFS;
+}
+
+template <typename I>
+void ExclusiveLock<I>::init(uint64_t features, Context *on_init) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ on_init = create_context_callback<Context>(on_init, this);
+
+ ldout(m_image_ctx.cct, 10) << ": features=" << features << dendl;
+
+ {
+ std::lock_guard locker{ML<I>::m_lock};
+ ML<I>::set_state_initializing();
+ }
+
+ m_image_dispatch = exclusive_lock::ImageDispatch<I>::create(&m_image_ctx);
+ m_image_ctx.io_image_dispatcher->register_dispatch(m_image_dispatch);
+
+ on_init = new LambdaContext([this, on_init](int r) {
+ {
+ std::lock_guard locker{ML<I>::m_lock};
+ ML<I>::set_state_unlocked();
+ }
+
+ on_init->complete(r);
+ });
+
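+  // journaling, copy-on-read and a persistent writeback cache all need the
+  // exclusive lock for reads as well as writes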
+ bool pwl_enabled = cache::util::is_pwl_enabled(m_image_ctx);
+ if (m_image_ctx.clone_copy_on_read ||
+ (features & RBD_FEATURE_JOURNALING) != 0 ||
+ pwl_enabled) {
+ m_image_dispatch->set_require_lock(true, io::DIRECTION_BOTH, on_init);
+ } else {
+ m_image_dispatch->set_require_lock(true, io::DIRECTION_WRITE, on_init);
+ }
+}
+
+template <typename I>
+void ExclusiveLock<I>::shut_down(Context *on_shut_down) {
+ ldout(m_image_ctx.cct, 10) << dendl;
+
+ auto ref = ceph::ref_t<ExclusiveLock<I>>(this);
+ on_shut_down = create_context_callback<Context>(on_shut_down, this);
+
+ ML<I>::shut_down(on_shut_down);
+
+ // if stalled in request state machine -- abort
+ handle_peer_notification(0);
+}
+
+template <typename I>
+void ExclusiveLock<I>::handle_peer_notification(int r) {
+ std::lock_guard locker{ML<I>::m_lock};
+ if (!ML<I>::is_state_waiting_for_lock()) {
+ return;
+ }
+
+ ldout(m_image_ctx.cct, 10) << dendl;
+ ceph_assert(ML<I>::is_action_acquire_lock());
+
+ m_acquire_lock_peer_ret_val = r;
+ ML<I>::execute_next_action();
+}
+
+template <typename I>
+Context *ExclusiveLock<I>::start_op(int* ret_val) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ std::lock_guard locker{ML<I>::m_lock};
+
+ if (!accept_ops(ML<I>::m_lock)) {
+ *ret_val = get_unlocked_op_error();
+ return nullptr;
+ }
+
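+  // track the op so the lock is not released while it is still in flight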
+ m_async_op_tracker.start_op();
+ return new LambdaContext([this](int r) {
+ m_async_op_tracker.finish_op();
+ });
+}
+
+template <typename I>
+void ExclusiveLock<I>::shutdown_handler(int r, Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << dendl;
+
+ {
+ std::unique_lock owner_locker{m_image_ctx.owner_lock};
+ m_image_ctx.exclusive_lock = nullptr;
+ }
+
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ m_image_dispatch = nullptr;
+ m_image_ctx.image_watcher->flush(on_finish);
+ });
+ m_image_ctx.io_image_dispatcher->shut_down_dispatch(
+ m_image_dispatch->get_dispatch_layer(), on_finish);
+}
+
+template <typename I>
+void ExclusiveLock<I>::pre_acquire_lock_handler(Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << dendl;
+
+ int acquire_lock_peer_ret_val = 0;
+ {
+ std::lock_guard locker{ML<I>::m_lock};
+ std::swap(acquire_lock_peer_ret_val, m_acquire_lock_peer_ret_val);
+ }
+
+ if (acquire_lock_peer_ret_val == -EROFS) {
+ ldout(m_image_ctx.cct, 10) << ": peer nacked lock request" << dendl;
+ on_finish->complete(acquire_lock_peer_ret_val);
+ return;
+ }
+
+ PreAcquireRequest<I> *req = PreAcquireRequest<I>::create(m_image_ctx,
+ on_finish);
+ m_image_ctx.op_work_queue->queue(new LambdaContext([req](int r) {
+ req->send();
+ }));
+}
+
+template <typename I>
+void ExclusiveLock<I>::post_acquire_lock_handler(int r, Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << ": r=" << r << dendl;
+
+ if (r == -EROFS) {
+ // peer refused to release the exclusive lock
+ on_finish->complete(r);
+ return;
+ } else if (r < 0) {
+ ML<I>::m_lock.lock();
+ ceph_assert(ML<I>::is_state_acquiring());
+
+    // the PostAcquire state machine will not run, so complete the prepare
+    // step here
+ m_image_ctx.state->handle_prepare_lock_complete();
+
+ // if lock is in-use by another client, request the lock
+ if (ML<I>::is_action_acquire_lock() && (r == -EBUSY || r == -EAGAIN)) {
+ ML<I>::set_state_waiting_for_lock();
+ ML<I>::m_lock.unlock();
+
+ // request the lock from a peer
+ m_image_ctx.image_watcher->notify_request_lock();
+
+      // inform the ManagedLock that we have interrupted the state machine
+ r = -ECANCELED;
+ } else {
+ ML<I>::m_lock.unlock();
+
+ // clear error if peer owns lock
+ if (r == -EAGAIN) {
+ r = 0;
+ }
+ }
+
+ on_finish->complete(r);
+ return;
+ }
+
+ std::lock_guard locker{ML<I>::m_lock};
+ m_pre_post_callback = on_finish;
+ using EL = ExclusiveLock<I>;
+ PostAcquireRequest<I> *req = PostAcquireRequest<I>::create(m_image_ctx,
+ util::create_context_callback<EL, &EL::handle_post_acquiring_lock>(this),
+ util::create_context_callback<EL, &EL::handle_post_acquired_lock>(this));
+
+ m_image_ctx.op_work_queue->queue(new LambdaContext([req](int r) {
+ req->send();
+ }));
+}
+
+template <typename I>
+void ExclusiveLock<I>::handle_post_acquiring_lock(int r) {
+ ldout(m_image_ctx.cct, 10) << dendl;
+
+ std::lock_guard locker{ML<I>::m_lock};
+
+ ceph_assert(r == 0);
+
+ // lock is owned at this point
+ ML<I>::set_state_post_acquiring();
+}
+
+template <typename I>
+void ExclusiveLock<I>::handle_post_acquired_lock(int r) {
+ ldout(m_image_ctx.cct, 10) << ": r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ std::lock_guard locker{ML<I>::m_lock};
+ ceph_assert(ML<I>::is_state_acquiring() ||
+ ML<I>::is_state_post_acquiring());
+
+    ceph_assert(m_pre_post_callback != nullptr);
+ std::swap(m_pre_post_callback, on_finish);
+ }
+
+ if (r < 0) {
+ on_finish->complete(r);
+ return;
+ }
+
+ m_image_ctx.perfcounter->tset(l_librbd_lock_acquired_time,
+ ceph_clock_now());
+ m_image_ctx.image_watcher->notify_acquired_lock();
+ m_image_dispatch->unset_require_lock(io::DIRECTION_BOTH);
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void ExclusiveLock<I>::pre_release_lock_handler(bool shutting_down,
+ Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << dendl;
+ std::lock_guard locker{ML<I>::m_lock};
+
+ auto req = PreReleaseRequest<I>::create(
+ m_image_ctx, m_image_dispatch, shutting_down, m_async_op_tracker,
+ on_finish);
+ m_image_ctx.op_work_queue->queue(new LambdaContext([req](int r) {
+ req->send();
+ }));
+}
+
+template <typename I>
+void ExclusiveLock<I>::post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << ": r=" << r << " shutting_down="
+ << shutting_down << dendl;
+ if (!shutting_down) {
+ {
+ std::lock_guard locker{ML<I>::m_lock};
+ ceph_assert(ML<I>::is_state_pre_releasing() ||
+ ML<I>::is_state_releasing());
+ }
+
+ if (r >= 0) {
+ m_image_ctx.image_watcher->notify_released_lock();
+ }
+
+ on_finish->complete(r);
+ } else {
+ {
+ std::unique_lock owner_locker{m_image_ctx.owner_lock};
+ m_image_ctx.exclusive_lock = nullptr;
+ }
+
+ on_finish = new LambdaContext([this, r, on_finish](int) {
+ m_image_dispatch = nullptr;
+ m_image_ctx.image_watcher->notify_released_lock();
+ on_finish->complete(r);
+ });
+ m_image_ctx.io_image_dispatcher->shut_down_dispatch(
+ m_image_dispatch->get_dispatch_layer(), on_finish);
+ }
+}
+
+template <typename I>
+void ExclusiveLock<I>::post_reacquire_lock_handler(int r, Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << dendl;
+ if (r >= 0) {
+ m_image_ctx.image_watcher->notify_acquired_lock();
+ }
+
+ on_finish->complete(r);
+}
+
+} // namespace librbd
+
+template class librbd::ExclusiveLock<librbd::ImageCtx>;
diff --git a/src/librbd/ExclusiveLock.h b/src/librbd/ExclusiveLock.h
new file mode 100644
index 000000000..9915262f9
--- /dev/null
+++ b/src/librbd/ExclusiveLock.h
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_H
+
+#include "common/AsyncOpTracker.h"
+#include "librbd/ManagedLock.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/io/Types.h"
+#include "common/RefCountedObj.h"
+
+struct Context;
+
+namespace librbd {
+
+namespace exclusive_lock { template <typename> struct ImageDispatch; }
+
+template <typename ImageCtxT = ImageCtx>
+class ExclusiveLock : public RefCountedObject,
+ public ManagedLock<ImageCtxT> {
+public:
+ static ExclusiveLock *create(ImageCtxT &image_ctx) {
+ return new ExclusiveLock<ImageCtxT>(image_ctx);
+ }
+
+ ExclusiveLock(ImageCtxT &image_ctx);
+
+ bool accept_request(exclusive_lock::OperationRequestType request_type,
+ int *ret_val) const;
+ bool accept_ops() const;
+
+ void set_require_lock(bool init_shutdown, io::Direction direction,
+ Context* on_finish);
+ void unset_require_lock(io::Direction direction);
+
+ void block_requests(int r);
+ void unblock_requests();
+
+ void init(uint64_t features, Context *on_init);
+ void shut_down(Context *on_shutdown);
+
+ void handle_peer_notification(int r);
+
+ int get_unlocked_op_error() const;
+ Context *start_op(int* ret_val);
+
+protected:
+ void shutdown_handler(int r, Context *on_finish) override;
+ void pre_acquire_lock_handler(Context *on_finish) override;
+ void post_acquire_lock_handler(int r, Context *on_finish) override;
+ void pre_release_lock_handler(bool shutting_down,
+ Context *on_finish) override;
+ void post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish) override;
+ void post_reacquire_lock_handler(int r, Context *on_finish) override;
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * <start> * * > WAITING_FOR_REGISTER --------\
+ * | * (watch not registered) |
+ * | * |
+ * | * * > WAITING_FOR_PEER ------------\
+ * | * (request_lock busy) |
+ * | * |
+ * | * * * * * * * * * * * * * * |
+ * | * |
+ * v (init) (try_lock/request_lock) * |
+ * UNINITIALIZED -------> UNLOCKED ------------------------> ACQUIRING <--/
+ * ^ |
+ * | v
+ * RELEASING POST_ACQUIRING
+ * | |
+ * | |
+ * | (release_lock) v
+ * PRE_RELEASING <------------------------ LOCKED
+ *
+ * <LOCKED state>
+ * |
+ * v
+ * REACQUIRING -------------------------------------> <finish>
+ * . ^
+ * . |
+ * . . . > <RELEASE action> ---> <ACQUIRE action> ---/
+ *
+ * <UNLOCKED/LOCKED states>
+ * |
+ * |
+ * v
+ * PRE_SHUTTING_DOWN ---> SHUTTING_DOWN ---> SHUTDOWN ---> <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT& m_image_ctx;
+ exclusive_lock::ImageDispatch<ImageCtxT>* m_image_dispatch = nullptr;
+ Context *m_pre_post_callback = nullptr;
+
+ AsyncOpTracker m_async_op_tracker;
+
+ uint32_t m_request_blocked_count = 0;
+ int m_request_blocked_ret_val = 0;
+
+ int m_acquire_lock_peer_ret_val = 0;
+
+ bool accept_ops(const ceph::mutex &lock) const;
+
+ void handle_post_acquiring_lock(int r);
+ void handle_post_acquired_lock(int r);
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_H
diff --git a/src/librbd/Features.cc b/src/librbd/Features.cc
new file mode 100644
index 000000000..9da5b1dc4
--- /dev/null
+++ b/src/librbd/Features.cc
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/lexical_cast.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "librbd/Features.h"
+#include "include/rbd/features.h"
+
+#include <map>
+#include <vector>
+
+static const std::map<std::string, uint64_t> RBD_FEATURE_MAP = {
+ {RBD_FEATURE_NAME_LAYERING, RBD_FEATURE_LAYERING},
+ {RBD_FEATURE_NAME_STRIPINGV2, RBD_FEATURE_STRIPINGV2},
+ {RBD_FEATURE_NAME_EXCLUSIVE_LOCK, RBD_FEATURE_EXCLUSIVE_LOCK},
+ {RBD_FEATURE_NAME_OBJECT_MAP, RBD_FEATURE_OBJECT_MAP},
+ {RBD_FEATURE_NAME_FAST_DIFF, RBD_FEATURE_FAST_DIFF},
+ {RBD_FEATURE_NAME_DEEP_FLATTEN, RBD_FEATURE_DEEP_FLATTEN},
+ {RBD_FEATURE_NAME_JOURNALING, RBD_FEATURE_JOURNALING},
+ {RBD_FEATURE_NAME_DATA_POOL, RBD_FEATURE_DATA_POOL},
+ {RBD_FEATURE_NAME_OPERATIONS, RBD_FEATURE_OPERATIONS},
+ {RBD_FEATURE_NAME_MIGRATING, RBD_FEATURE_MIGRATING},
+ {RBD_FEATURE_NAME_NON_PRIMARY, RBD_FEATURE_NON_PRIMARY},
+ {RBD_FEATURE_NAME_DIRTY_CACHE, RBD_FEATURE_DIRTY_CACHE},
+};
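+// fails to compile when a feature bit above RBD_FEATURE_DIRTY_CACHE is added
+// to RBD_FEATURES_ALL, as a reminder to extend RBD_FEATURE_MAP above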
+static_assert((RBD_FEATURE_DIRTY_CACHE << 1) > RBD_FEATURES_ALL,
+ "new RBD feature added");
+
+
+namespace librbd {
+
+std::string rbd_features_to_string(uint64_t features,
+ std::ostream *err)
+{
+ std::string r;
+ for (auto& i : RBD_FEATURE_MAP) {
+ if (features & i.second) {
+ if (!r.empty()) {
+ r += ",";
+ }
+ r += i.first;
+ features &= ~i.second;
+ }
+ }
+ if (err && features) {
+ *err << "ignoring unknown feature mask 0x"
+ << std::hex << features << std::dec;
+ }
+ return r;
+}
+
+uint64_t rbd_features_from_string(const std::string& orig_value,
+ std::ostream *err)
+{
+ uint64_t features = 0;
+ std::string value = orig_value;
+ boost::trim(value);
+
+ // empty string means default features
+ if (!value.size()) {
+ return RBD_FEATURES_DEFAULT;
+ }
+
+ try {
+ // numeric?
+ features = boost::lexical_cast<uint64_t>(value);
+
+ // drop unrecognized bits
+ uint64_t unsupported_features = (features & ~RBD_FEATURES_ALL);
+ if (unsupported_features != 0ull) {
+ features &= RBD_FEATURES_ALL;
+ if (err) {
+ *err << "ignoring unknown feature mask 0x"
+ << std::hex << unsupported_features << std::dec;
+ }
+ }
+
+ uint64_t ignore_features_mask = (
+ RBD_FEATURES_INTERNAL | RBD_FEATURES_MUTABLE_INTERNAL);
+ uint64_t ignored_features = (features & ignore_features_mask);
+ if (ignored_features != 0ULL) {
+ features &= ~ignore_features_mask;
+ if (err) {
+ *err << "ignoring feature mask 0x" << std::hex << ignored_features;
+ }
+ }
+ } catch (boost::bad_lexical_cast&) {
+ // feature name list?
+ bool errors = false;
+ std::vector<std::string> feature_names;
+ boost::split(feature_names, value, boost::is_any_of(","));
+    for (auto feature_name : feature_names) {
+ boost::trim(feature_name);
+ auto feature_it = RBD_FEATURE_MAP.find(feature_name);
+ if (feature_it != RBD_FEATURE_MAP.end()) {
+        // bitwise OR so a repeated feature name cannot double-count its bit
+        features |= feature_it->second;
+ } else if (err) {
+ if (errors) {
+ *err << ", ";
+ } else {
+ errors = true;
+ }
+ *err << "ignoring unknown feature " << feature_name;
+ }
+ }
+ }
+ return features;
+}
+
+} // namespace librbd
diff --git a/src/librbd/Features.h b/src/librbd/Features.h
new file mode 100644
index 000000000..6a88827cf
--- /dev/null
+++ b/src/librbd/Features.h
@@ -0,0 +1,16 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <ostream>
+
+namespace librbd {
+
+ std::string rbd_features_to_string(uint64_t features,
+ std::ostream *err);
+ uint64_t rbd_features_from_string(const std::string& value,
+ std::ostream *err);
+
+} // librbd
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
new file mode 100644
index 000000000..058352949
--- /dev/null
+++ b/src/librbd/ImageCtx.cc
@@ -0,0 +1,965 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <errno.h>
+#include <boost/assign/list_of.hpp>
+#include <stddef.h>
+
+#include "include/neorados/RADOS.hpp"
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/Timer.h"
+
+#include "librbd/AsioEngine.h"
+#include "librbd/AsyncRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/internal.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/LibrbdAdminSocketHook.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Operations.h"
+#include "librbd/PluginRegistry.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/exclusive_lock/AutomaticPolicy.h"
+#include "librbd/exclusive_lock/StandardPolicy.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ImageDispatcher.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/io/QosImageDispatch.h"
+#include "librbd/journal/StandardPolicy.h"
+#include "librbd/operation/ResizeRequest.h"
+
+#include "osdc/Striper.h"
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageCtx: "
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+
+namespace librbd {
+
+namespace {
+
+class SafeTimerSingleton : public CommonSafeTimer<ceph::mutex> {
+public:
+ ceph::mutex lock = ceph::make_mutex("librbd::SafeTimerSingleton::lock");
+
+ explicit SafeTimerSingleton(CephContext *cct)
+ : SafeTimer(cct, lock, true) {
+ init();
+ }
+ ~SafeTimerSingleton() {
+ std::lock_guard locker{lock};
+ shutdown();
+ }
+};
+
+librados::IoCtx duplicate_io_ctx(librados::IoCtx& io_ctx) {
+ librados::IoCtx dup_io_ctx;
+ dup_io_ctx.dup(io_ctx);
+ return dup_io_ctx;
+}
+
+} // anonymous namespace
+
+ const string ImageCtx::METADATA_CONF_PREFIX = "conf_";
+
+ ImageCtx::ImageCtx(const string &image_name, const string &image_id,
+ const char *snap, IoCtx& p, bool ro)
+ : cct((CephContext*)p.cct()),
+ config(cct->_conf),
+ perfcounter(NULL),
+ snap_id(CEPH_NOSNAP),
+ snap_exists(true),
+ read_only(ro),
+ read_only_flags(ro ? IMAGE_READ_ONLY_FLAG_USER : 0U),
+ exclusive_locked(false),
+ name(image_name),
+ asio_engine(std::make_shared<AsioEngine>(p)),
+ rados_api(asio_engine->get_rados_api()),
+ data_ctx(duplicate_io_ctx(p)),
+ md_ctx(duplicate_io_ctx(p)),
+ image_watcher(NULL),
+ journal(NULL),
+ owner_lock(ceph::make_shared_mutex(util::unique_lock_name("librbd::ImageCtx::owner_lock", this))),
+ image_lock(ceph::make_shared_mutex(util::unique_lock_name("librbd::ImageCtx::image_lock", this))),
+ timestamp_lock(ceph::make_shared_mutex(util::unique_lock_name("librbd::ImageCtx::timestamp_lock", this))),
+ async_ops_lock(ceph::make_mutex(util::unique_lock_name("librbd::ImageCtx::async_ops_lock", this))),
+ copyup_list_lock(ceph::make_mutex(util::unique_lock_name("librbd::ImageCtx::copyup_list_lock", this))),
+ extra_read_flags(0),
+ old_format(false),
+ order(0), size(0), features(0),
+ format_string(NULL),
+ id(image_id), parent(NULL),
+ stripe_unit(0), stripe_count(0), flags(0),
+ readahead(),
+ total_bytes_read(0),
+ state(new ImageState<>(this)),
+ operations(new Operations<>(*this)),
+ exclusive_lock(nullptr), object_map(nullptr),
+ op_work_queue(asio_engine->get_work_queue()),
+ plugin_registry(new PluginRegistry<ImageCtx>(this)),
+ event_socket_completions(32),
+ asok_hook(nullptr),
+ trace_endpoint("librbd")
+ {
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "image_name=" << image_name << ", "
+ << "image_id=" << image_id << dendl;
+
+ if (snap)
+ snap_name = snap;
+
+ rebuild_data_io_context();
+
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&header, 0, sizeof(header));
+
+ io_image_dispatcher = new io::ImageDispatcher<ImageCtx>(this);
+ io_object_dispatcher = new io::ObjectDispatcher<ImageCtx>(this);
+
+ if (cct->_conf.get_val<bool>("rbd_auto_exclusive_lock_until_manual_request")) {
+ exclusive_lock_policy = new exclusive_lock::AutomaticPolicy(this);
+ } else {
+ exclusive_lock_policy = new exclusive_lock::StandardPolicy(this);
+ }
+ journal_policy = new journal::StandardPolicy(this);
+ }
+
+ ImageCtx::ImageCtx(const string &image_name, const string &image_id,
+ uint64_t snap_id, IoCtx& p, bool ro)
+ : ImageCtx(image_name, image_id, "", p, ro) {
+ open_snap_id = snap_id;
+ }
+
+ ImageCtx::~ImageCtx() {
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ ceph_assert(config_watcher == nullptr);
+ ceph_assert(image_watcher == NULL);
+ ceph_assert(exclusive_lock == NULL);
+ ceph_assert(object_map == NULL);
+ ceph_assert(journal == NULL);
+ ceph_assert(asok_hook == NULL);
+
+ if (perfcounter) {
+ perf_stop();
+ }
+ delete[] format_string;
+
+ md_ctx.aio_flush();
+ if (data_ctx.is_valid()) {
+ data_ctx.aio_flush();
+ }
+
+ delete io_object_dispatcher;
+ delete io_image_dispatcher;
+
+ delete journal_policy;
+ delete exclusive_lock_policy;
+ delete operations;
+ delete state;
+
+ delete plugin_registry;
+ }
+
+ void ImageCtx::init() {
+ ceph_assert(!header_oid.empty());
+ ceph_assert(old_format || !id.empty());
+
+ asok_hook = new LibrbdAdminSocketHook(this);
+
+ string pname = string("librbd-") + id + string("-") +
+ md_ctx.get_pool_name() + string("-") + name;
+ if (!snap_name.empty()) {
+ pname += "-";
+ pname += snap_name;
+ }
+
+ trace_endpoint.copy_name(pname);
+ perf_start(pname);
+
+ ceph_assert(image_watcher == NULL);
+ image_watcher = new ImageWatcher<>(*this);
+ }
+
+ void ImageCtx::shutdown() {
+ delete image_watcher;
+ image_watcher = nullptr;
+
+ delete asok_hook;
+ asok_hook = nullptr;
+ }
+
+ void ImageCtx::init_layout(int64_t pool_id)
+ {
+ if (stripe_unit == 0 || stripe_count == 0) {
+ stripe_unit = 1ull << order;
+ stripe_count = 1;
+ }
+
+ vector<uint64_t> alignments;
+ alignments.push_back(stripe_count << order); // object set (in file striping terminology)
+ alignments.push_back(stripe_unit * stripe_count); // stripe
+ alignments.push_back(stripe_unit); // stripe unit
+ readahead.set_alignments(alignments);
+
+ layout = file_layout_t();
+ layout.stripe_unit = stripe_unit;
+ layout.stripe_count = stripe_count;
+ layout.object_size = 1ull << order;
+ layout.pool_id = pool_id; // FIXME: pool id overflow?
+
+ delete[] format_string;
+ size_t len = object_prefix.length() + 16;
+ format_string = new char[len];
+ if (old_format) {
+ snprintf(format_string, len, "%s.%%012llx", object_prefix.c_str());
+ } else {
+ snprintf(format_string, len, "%s.%%016llx", object_prefix.c_str());
+ }
+
+ ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
+ << " stripe_count " << stripe_count
+ << " object_size " << layout.object_size
+ << " prefix " << object_prefix
+ << " format " << format_string
+ << dendl;
+ }
+
+ void ImageCtx::perf_start(string name) {
+ auto perf_prio = PerfCountersBuilder::PRIO_DEBUGONLY;
+ if (child == nullptr) {
+ // ensure top-level IO stats are exported for librbd daemons
+ perf_prio = PerfCountersBuilder::PRIO_USEFUL;
+ }
+
+ PerfCountersBuilder plb(cct, name, l_librbd_first, l_librbd_last);
+
+ plb.add_u64_counter(l_librbd_rd, "rd", "Reads", "r", perf_prio);
+ plb.add_u64_counter(l_librbd_rd_bytes, "rd_bytes", "Data size in reads",
+ "rb", perf_prio, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_rd_latency, "rd_latency", "Latency of reads",
+ "rl", perf_prio);
+ plb.add_u64_counter(l_librbd_wr, "wr", "Writes", "w", perf_prio);
+ plb.add_u64_counter(l_librbd_wr_bytes, "wr_bytes", "Written data",
+ "wb", perf_prio, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_wr_latency, "wr_latency", "Write latency",
+ "wl", perf_prio);
+ plb.add_u64_counter(l_librbd_discard, "discard", "Discards");
+ plb.add_u64_counter(l_librbd_discard_bytes, "discard_bytes", "Discarded data", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_discard_latency, "discard_latency", "Discard latency");
+ plb.add_u64_counter(l_librbd_flush, "flush", "Flushes");
+ plb.add_time_avg(l_librbd_flush_latency, "flush_latency", "Latency of flushes");
+ plb.add_u64_counter(l_librbd_ws, "ws", "WriteSames");
+ plb.add_u64_counter(l_librbd_ws_bytes, "ws_bytes", "WriteSame data", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_ws_latency, "ws_latency", "WriteSame latency");
+ plb.add_u64_counter(l_librbd_cmp, "cmp", "CompareAndWrites");
+ plb.add_u64_counter(l_librbd_cmp_bytes, "cmp_bytes", "Data size in cmps", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_librbd_cmp_latency, "cmp_latency", "Latency of cmps");
+ plb.add_u64_counter(l_librbd_snap_create, "snap_create", "Snap creations");
+ plb.add_u64_counter(l_librbd_snap_remove, "snap_remove", "Snap removals");
+ plb.add_u64_counter(l_librbd_snap_rollback, "snap_rollback", "Snap rollbacks");
+ plb.add_u64_counter(l_librbd_snap_rename, "snap_rename", "Snap rename");
+ plb.add_u64_counter(l_librbd_notify, "notify", "Updated header notifications");
+ plb.add_u64_counter(l_librbd_resize, "resize", "Resizes");
+ plb.add_u64_counter(l_librbd_readahead, "readahead", "Read ahead");
+ plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes", "Data size in read ahead", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_u64_counter(l_librbd_invalidate_cache, "invalidate_cache", "Cache invalidates");
+
+ plb.add_time(l_librbd_opened_time, "opened_time", "Opened time",
+ "ots", perf_prio);
+ plb.add_time(l_librbd_lock_acquired_time, "lock_acquired_time",
+ "Lock acquired time", "lats", perf_prio);
+
+ perfcounter = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perfcounter);
+
+ perfcounter->tset(l_librbd_opened_time, ceph_clock_now());
+ }
+
+ void ImageCtx::perf_stop() {
+ ceph_assert(perfcounter);
+ cct->get_perfcounters_collection()->remove(perfcounter);
+ delete perfcounter;
+ }
+
+ void ImageCtx::set_read_flag(unsigned flag) {
+ extra_read_flags |= flag;
+ }
+
+ int ImageCtx::get_read_flags(snap_t snap_id) {
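+    // a configured read-from-replica policy overrides the legacy
+    // per-snapshot balance/localize options below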
+ int flags = librados::OPERATION_NOFLAG | read_flags;
+ if (flags != 0)
+ return flags;
+
+ flags = librados::OPERATION_NOFLAG | extra_read_flags;
+ if (snap_id == LIBRADOS_SNAP_HEAD)
+ return flags;
+
+ if (config.get_val<bool>("rbd_balance_snap_reads"))
+ flags |= librados::OPERATION_BALANCE_READS;
+ else if (config.get_val<bool>("rbd_localize_snap_reads"))
+ flags |= librados::OPERATION_LOCALIZE_READS;
+ return flags;
+ }
+
+ int ImageCtx::snap_set(uint64_t in_snap_id) {
+ ceph_assert(ceph_mutex_is_wlocked(image_lock));
+ auto it = snap_info.find(in_snap_id);
+ if (in_snap_id != CEPH_NOSNAP && it != snap_info.end()) {
+ snap_id = in_snap_id;
+ snap_namespace = it->second.snap_namespace;
+ snap_name = it->second.name;
+ snap_exists = true;
+ if (data_ctx.is_valid()) {
+ data_ctx.snap_set_read(snap_id);
+ rebuild_data_io_context();
+ }
+ return 0;
+ }
+ return -ENOENT;
+ }
+
+ void ImageCtx::snap_unset()
+ {
+ ceph_assert(ceph_mutex_is_wlocked(image_lock));
+ snap_id = CEPH_NOSNAP;
+ snap_namespace = {};
+ snap_name = "";
+ snap_exists = true;
+ if (data_ctx.is_valid()) {
+ data_ctx.snap_set_read(snap_id);
+ rebuild_data_io_context();
+ }
+ }
+
+ snap_t ImageCtx::get_snap_id(const cls::rbd::SnapshotNamespace& in_snap_namespace,
+ const string& in_snap_name) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ auto it = snap_ids.find({in_snap_namespace, in_snap_name});
+ if (it != snap_ids.end()) {
+ return it->second;
+ }
+ return CEPH_NOSNAP;
+ }
+
+ const SnapInfo* ImageCtx::get_snap_info(snap_t in_snap_id) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ map<snap_t, SnapInfo>::const_iterator it =
+ snap_info.find(in_snap_id);
+ if (it != snap_info.end())
+ return &it->second;
+ return nullptr;
+ }
+
+ int ImageCtx::get_snap_name(snap_t in_snap_id,
+ string *out_snap_name) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ const SnapInfo *info = get_snap_info(in_snap_id);
+ if (info) {
+ *out_snap_name = info->name;
+ return 0;
+ }
+ return -ENOENT;
+ }
+
+ int ImageCtx::get_snap_namespace(snap_t in_snap_id,
+ cls::rbd::SnapshotNamespace *out_snap_namespace) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ const SnapInfo *info = get_snap_info(in_snap_id);
+ if (info) {
+ *out_snap_namespace = info->snap_namespace;
+ return 0;
+ }
+ return -ENOENT;
+ }
+
+ int ImageCtx::get_parent_spec(snap_t in_snap_id,
+ cls::rbd::ParentImageSpec *out_pspec) const
+ {
+ const SnapInfo *info = get_snap_info(in_snap_id);
+ if (info) {
+ *out_pspec = info->parent.spec;
+ return 0;
+ }
+ return -ENOENT;
+ }
+
+ uint64_t ImageCtx::get_current_size() const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ return size;
+ }
+
+ uint64_t ImageCtx::get_object_size() const
+ {
+ return 1ull << order;
+ }
+
+ string ImageCtx::get_object_name(uint64_t num) const {
+ return util::data_object_name(this, num);
+ }
+
+ uint64_t ImageCtx::get_stripe_unit() const
+ {
+ return stripe_unit;
+ }
+
+ uint64_t ImageCtx::get_stripe_count() const
+ {
+ return stripe_count;
+ }
+
+ uint64_t ImageCtx::get_stripe_period() const
+ {
+ return stripe_count * (1ull << order);
+ }
+
+ utime_t ImageCtx::get_create_timestamp() const
+ {
+ return create_timestamp;
+ }
+
+ utime_t ImageCtx::get_access_timestamp() const
+ {
+ return access_timestamp;
+ }
+
+ utime_t ImageCtx::get_modify_timestamp() const
+ {
+ return modify_timestamp;
+ }
+
+ void ImageCtx::set_access_timestamp(utime_t at)
+ {
+ ceph_assert(ceph_mutex_is_wlocked(timestamp_lock));
+ access_timestamp = at;
+ }
+
+ void ImageCtx::set_modify_timestamp(utime_t mt)
+ {
+ ceph_assert(ceph_mutex_is_locked(timestamp_lock));
+ modify_timestamp = mt;
+ }
+
+ int ImageCtx::is_snap_protected(snap_t in_snap_id,
+ bool *is_protected) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ const SnapInfo *info = get_snap_info(in_snap_id);
+ if (info) {
+ *is_protected =
+ (info->protection_status == RBD_PROTECTION_STATUS_PROTECTED);
+ return 0;
+ }
+ return -ENOENT;
+ }
+
+ int ImageCtx::is_snap_unprotected(snap_t in_snap_id,
+ bool *is_unprotected) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ const SnapInfo *info = get_snap_info(in_snap_id);
+ if (info) {
+ *is_unprotected =
+ (info->protection_status == RBD_PROTECTION_STATUS_UNPROTECTED);
+ return 0;
+ }
+ return -ENOENT;
+ }
+
+ void ImageCtx::add_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+ string in_snap_name,
+ snap_t id, uint64_t in_size,
+ const ParentImageInfo &parent,
+ uint8_t protection_status, uint64_t flags,
+ utime_t timestamp)
+ {
+ ceph_assert(ceph_mutex_is_wlocked(image_lock));
+ snaps.push_back(id);
+ SnapInfo info(in_snap_name, in_snap_namespace,
+ in_size, parent, protection_status, flags, timestamp);
+ snap_info.insert({id, info});
+ snap_ids.insert({{in_snap_namespace, in_snap_name}, id});
+ }
+
+ void ImageCtx::rm_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+ string in_snap_name,
+ snap_t id)
+ {
+ ceph_assert(ceph_mutex_is_wlocked(image_lock));
+ snaps.erase(std::remove(snaps.begin(), snaps.end(), id), snaps.end());
+ snap_info.erase(id);
+ snap_ids.erase({in_snap_namespace, in_snap_name});
+ }
+
+ uint64_t ImageCtx::get_image_size(snap_t in_snap_id) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ if (in_snap_id == CEPH_NOSNAP) {
+ if (!resize_reqs.empty() &&
+ resize_reqs.front()->shrinking()) {
+ return resize_reqs.front()->get_image_size();
+ }
+ return size;
+ }
+
+ const SnapInfo *info = get_snap_info(in_snap_id);
+ if (info) {
+ return info->size;
+ }
+ return 0;
+ }
+
+ uint64_t ImageCtx::get_effective_image_size(snap_t in_snap_id) const {
+ auto raw_size = get_image_size(in_snap_id);
+ if (raw_size == 0) {
+ return 0;
+ }
+
+ io::Extents extents = {{raw_size, 0}};
+ io_image_dispatcher->remap_extents(
+ extents, io::IMAGE_EXTENTS_MAP_TYPE_PHYSICAL_TO_LOGICAL);
+ return extents.front().first;
+ }
+
+ uint64_t ImageCtx::get_object_count(snap_t in_snap_id) const {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ uint64_t image_size = get_image_size(in_snap_id);
+ return Striper::get_num_objects(layout, image_size);
+ }
+
+ bool ImageCtx::test_features(uint64_t features) const
+ {
+ std::shared_lock l{image_lock};
+ return test_features(features, image_lock);
+ }
+
+ bool ImageCtx::test_features(uint64_t in_features,
+ const ceph::shared_mutex &in_image_lock) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ return ((features & in_features) == in_features);
+ }
+
+ bool ImageCtx::test_op_features(uint64_t in_op_features) const
+ {
+ std::shared_lock l{image_lock};
+ return test_op_features(in_op_features, image_lock);
+ }
+
+ bool ImageCtx::test_op_features(uint64_t in_op_features,
+ const ceph::shared_mutex &in_image_lock) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ return ((op_features & in_op_features) == in_op_features);
+ }
+
+ int ImageCtx::get_flags(librados::snap_t _snap_id, uint64_t *_flags) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ if (_snap_id == CEPH_NOSNAP) {
+ *_flags = flags;
+ return 0;
+ }
+ const SnapInfo *info = get_snap_info(_snap_id);
+ if (info) {
+ *_flags = info->flags;
+ return 0;
+ }
+ return -ENOENT;
+ }
+
+ int ImageCtx::test_flags(librados::snap_t in_snap_id,
+ uint64_t flags, bool *flags_set) const
+ {
+ std::shared_lock l{image_lock};
+ return test_flags(in_snap_id, flags, image_lock, flags_set);
+ }
+
+ int ImageCtx::test_flags(librados::snap_t in_snap_id,
+ uint64_t flags,
+ const ceph::shared_mutex &in_image_lock,
+ bool *flags_set) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ uint64_t snap_flags;
+ int r = get_flags(in_snap_id, &snap_flags);
+ if (r < 0) {
+ return r;
+ }
+ *flags_set = ((snap_flags & flags) == flags);
+ return 0;
+ }
+
+ int ImageCtx::update_flags(snap_t in_snap_id, uint64_t flag, bool enabled)
+ {
+ ceph_assert(ceph_mutex_is_wlocked(image_lock));
+ uint64_t *_flags;
+ if (in_snap_id == CEPH_NOSNAP) {
+ _flags = &flags;
+ } else {
+ map<snap_t, SnapInfo>::iterator it = snap_info.find(in_snap_id);
+ if (it == snap_info.end()) {
+ return -ENOENT;
+ }
+ _flags = &it->second.flags;
+ }
+
+ if (enabled) {
+ (*_flags) |= flag;
+ } else {
+ (*_flags) &= ~flag;
+ }
+ return 0;
+ }
+
+ const ParentImageInfo* ImageCtx::get_parent_info(snap_t in_snap_id) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ if (in_snap_id == CEPH_NOSNAP)
+ return &parent_md;
+ const SnapInfo *info = get_snap_info(in_snap_id);
+ if (info)
+ return &info->parent;
+ return NULL;
+ }
+
+ int64_t ImageCtx::get_parent_pool_id(snap_t in_snap_id) const
+ {
+ const auto info = get_parent_info(in_snap_id);
+ if (info)
+ return info->spec.pool_id;
+ return -1;
+ }
+
+ string ImageCtx::get_parent_image_id(snap_t in_snap_id) const
+ {
+ const auto info = get_parent_info(in_snap_id);
+ if (info)
+ return info->spec.image_id;
+ return "";
+ }
+
+ uint64_t ImageCtx::get_parent_snap_id(snap_t in_snap_id) const
+ {
+ const auto info = get_parent_info(in_snap_id);
+ if (info)
+ return info->spec.snap_id;
+ return CEPH_NOSNAP;
+ }
+
+ int ImageCtx::get_parent_overlap(snap_t in_snap_id, uint64_t *overlap) const
+ {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ const auto info = get_parent_info(in_snap_id);
+ if (info) {
+ *overlap = info->overlap;
+ return 0;
+ }
+ return -ENOENT;
+ }
+
+ void ImageCtx::register_watch(Context *on_finish) {
+ ceph_assert(image_watcher != NULL);
+ image_watcher->register_watch(on_finish);
+ }
+
+ uint64_t ImageCtx::prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
+ uint64_t overlap)
+ {
+ // drop extents completely beyond the overlap
+ while (!objectx.empty() && objectx.back().first >= overlap)
+ objectx.pop_back();
+
+ // trim final overlapping extent
+ if (!objectx.empty() && objectx.back().first + objectx.back().second > overlap)
+ objectx.back().second = overlap - objectx.back().first;
+
+ uint64_t len = 0;
+ for (vector<pair<uint64_t,uint64_t> >::iterator p = objectx.begin();
+ p != objectx.end();
+ ++p)
+ len += p->second;
+ ldout(cct, 10) << "prune_parent_extents image overlap " << overlap
+ << ", object overlap " << len
+ << " from image extents " << objectx << dendl;
+ return len;
+ }
+
+ void ImageCtx::cancel_async_requests() {
+ C_SaferCond ctx;
+ cancel_async_requests(&ctx);
+ ctx.wait();
+ }
+
+ void ImageCtx::cancel_async_requests(Context *on_finish) {
+ {
+ std::lock_guard async_ops_locker{async_ops_lock};
+ if (!async_requests.empty()) {
+ ldout(cct, 10) << "canceling async requests: count="
+ << async_requests.size() << dendl;
+ for (auto req : async_requests) {
+ ldout(cct, 10) << "canceling async request: " << req << dendl;
+ req->cancel();
+ }
+ async_requests_waiters.push_back(on_finish);
+ return;
+ }
+ }
+
+ on_finish->complete(0);
+ }
+
+ void ImageCtx::apply_metadata(const std::map<std::string, bufferlist> &meta,
+ bool thread_safe) {
+ ldout(cct, 20) << __func__ << dendl;
+
+ std::unique_lock image_locker(image_lock);
+
+ // reset settings back to global defaults
+ config_overrides.clear();
+ config.set_config_values(cct->_conf.get_config_values());
+
+ // extract config overrides
+ for (auto meta_pair : meta) {
+ if (!boost::starts_with(meta_pair.first, METADATA_CONF_PREFIX)) {
+ continue;
+ }
+
+ std::string key = meta_pair.first.substr(METADATA_CONF_PREFIX.size());
+ if (!boost::starts_with(key, "rbd_")) {
+ // ignore non-RBD configuration keys
+ // TODO use option schema to determine applicable subsystem
+ ldout(cct, 0) << __func__ << ": ignoring config " << key << dendl;
+ continue;
+ }
+
+ if (config.find_option(key) != nullptr) {
+ std::string val(meta_pair.second.c_str(), meta_pair.second.length());
+ int r = config.set_val(key, val);
+ if (r >= 0) {
+ ldout(cct, 20) << __func__ << ": " << key << "=" << val << dendl;
+ config_overrides.insert(key);
+ } else {
+ lderr(cct) << __func__ << ": failed to set config " << key << " "
+ << "with value " << val << ": " << cpp_strerror(r)
+ << dendl;
+ }
+ }
+ }
+
+ image_locker.unlock();
+
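+// assign the effective "rbd_<param>" config value to the matching variable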
+#define ASSIGN_OPTION(param, type) \
+ param = config.get_val<type>("rbd_"#param)
+
+ bool skip_partial_discard = true;
+ ASSIGN_OPTION(non_blocking_aio, bool);
+ ASSIGN_OPTION(cache, bool);
+ ASSIGN_OPTION(sparse_read_threshold_bytes, Option::size_t);
+ ASSIGN_OPTION(clone_copy_on_read, bool);
+ ASSIGN_OPTION(enable_alloc_hint, bool);
+ ASSIGN_OPTION(mirroring_replay_delay, uint64_t);
+ ASSIGN_OPTION(mtime_update_interval, uint64_t);
+ ASSIGN_OPTION(atime_update_interval, uint64_t);
+ ASSIGN_OPTION(skip_partial_discard, bool);
+ ASSIGN_OPTION(discard_granularity_bytes, uint64_t);
+ ASSIGN_OPTION(blkin_trace_all, bool);
+
+ auto cache_policy = config.get_val<std::string>("rbd_cache_policy");
+ if (cache_policy == "writethrough" || cache_policy == "writeback") {
+ ASSIGN_OPTION(readahead_max_bytes, Option::size_t);
+ ASSIGN_OPTION(readahead_disable_after_bytes, Option::size_t);
+ }
+
+#undef ASSIGN_OPTION
+
+ if (sparse_read_threshold_bytes == 0) {
+ sparse_read_threshold_bytes = get_object_size();
+ }
+
+ bool dirty_cache = test_features(RBD_FEATURE_DIRTY_CACHE);
+ if (!skip_partial_discard || dirty_cache) {
+ discard_granularity_bytes = 0;
+ }
+
+ alloc_hint_flags = 0;
+ auto compression_hint = config.get_val<std::string>("rbd_compression_hint");
+ if (compression_hint == "compressible") {
+ alloc_hint_flags |= librados::ALLOC_HINT_FLAG_COMPRESSIBLE;
+ } else if (compression_hint == "incompressible") {
+ alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+ }
+
+ librados::Rados rados(md_ctx);
+ int8_t require_osd_release;
+ int r = rados.get_min_compatible_osd(&require_osd_release);
+ if (r == 0 && require_osd_release >= CEPH_RELEASE_OCTOPUS) {
+ read_flags = 0;
+ auto read_policy = config.get_val<std::string>("rbd_read_from_replica_policy");
+ if (read_policy == "balance") {
+ read_flags |= librados::OPERATION_BALANCE_READS;
+ } else if (read_policy == "localize") {
+ read_flags |= librados::OPERATION_LOCALIZE_READS;
+ }
+ }
+
+ io_image_dispatcher->apply_qos_schedule_tick_min(
+ config.get_val<uint64_t>("rbd_qos_schedule_tick_min"));
+
+ io_image_dispatcher->apply_qos_limit(
+ io::IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_iops_limit"),
+ config.get_val<uint64_t>("rbd_qos_iops_burst"),
+ config.get_val<uint64_t>("rbd_qos_iops_burst_seconds"));
+ io_image_dispatcher->apply_qos_limit(
+ io::IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_bps_limit"),
+ config.get_val<uint64_t>("rbd_qos_bps_burst"),
+ config.get_val<uint64_t>("rbd_qos_bps_burst_seconds"));
+ io_image_dispatcher->apply_qos_limit(
+ io::IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_read_iops_limit"),
+ config.get_val<uint64_t>("rbd_qos_read_iops_burst"),
+ config.get_val<uint64_t>("rbd_qos_read_iops_burst_seconds"));
+ io_image_dispatcher->apply_qos_limit(
+ io::IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_write_iops_limit"),
+ config.get_val<uint64_t>("rbd_qos_write_iops_burst"),
+ config.get_val<uint64_t>("rbd_qos_write_iops_burst_seconds"));
+ io_image_dispatcher->apply_qos_limit(
+ io::IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_read_bps_limit"),
+ config.get_val<uint64_t>("rbd_qos_read_bps_burst"),
+ config.get_val<uint64_t>("rbd_qos_read_bps_burst_seconds"));
+ io_image_dispatcher->apply_qos_limit(
+ io::IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE,
+ config.get_val<uint64_t>("rbd_qos_write_bps_limit"),
+ config.get_val<uint64_t>("rbd_qos_write_bps_burst"),
+ config.get_val<uint64_t>("rbd_qos_write_bps_burst_seconds"));
+
+ if (!disable_zero_copy &&
+ config.get_val<bool>("rbd_disable_zero_copy_writes")) {
+ ldout(cct, 5) << this << ": disabling zero-copy writes" << dendl;
+ disable_zero_copy = true;
+ }
+ }
+
+ ExclusiveLock<ImageCtx> *ImageCtx::create_exclusive_lock() {
+ return new ExclusiveLock<ImageCtx>(*this);
+ }
+
+ ObjectMap<ImageCtx> *ImageCtx::create_object_map(uint64_t snap_id) {
+ return new ObjectMap<ImageCtx>(*this, snap_id);
+ }
+
+ Journal<ImageCtx> *ImageCtx::create_journal() {
+ return new Journal<ImageCtx>(*this);
+ }
+
+ void ImageCtx::set_image_name(const std::string &image_name) {
+ // update the name so rename can be invoked repeatedly
+ std::shared_lock owner_locker{owner_lock};
+ std::unique_lock image_locker{image_lock};
+ name = image_name;
+ if (old_format) {
+ header_oid = util::old_header_name(image_name);
+ }
+ }
+
+ void ImageCtx::notify_update() {
+ state->handle_update_notification();
+ ImageWatcher<>::notify_header_update(md_ctx, header_oid);
+ }
+
+ void ImageCtx::notify_update(Context *on_finish) {
+ state->handle_update_notification();
+ image_watcher->notify_header_update(on_finish);
+ }
+
+ exclusive_lock::Policy *ImageCtx::get_exclusive_lock_policy() const {
+ ceph_assert(ceph_mutex_is_locked(owner_lock));
+ ceph_assert(exclusive_lock_policy != nullptr);
+ return exclusive_lock_policy;
+ }
+
+ void ImageCtx::set_exclusive_lock_policy(exclusive_lock::Policy *policy) {
+ ceph_assert(ceph_mutex_is_wlocked(owner_lock));
+ ceph_assert(policy != nullptr);
+ delete exclusive_lock_policy;
+ exclusive_lock_policy = policy;
+ }
+
+ journal::Policy *ImageCtx::get_journal_policy() const {
+ ceph_assert(ceph_mutex_is_locked(image_lock));
+ ceph_assert(journal_policy != nullptr);
+ return journal_policy;
+ }
+
+ void ImageCtx::set_journal_policy(journal::Policy *policy) {
+ ceph_assert(ceph_mutex_is_wlocked(image_lock));
+ ceph_assert(policy != nullptr);
+ delete journal_policy;
+ journal_policy = policy;
+ }
+
+ void ImageCtx::rebuild_data_io_context() {
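+  // mirror the legacy data_ctx settings (read snapshot, write snap context,
+  // pool full-try) into a freshly allocated neorados IOContext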
+ auto ctx = std::make_shared<neorados::IOContext>(
+ data_ctx.get_id(), data_ctx.get_namespace());
+ if (snap_id != CEPH_NOSNAP) {
+ ctx->read_snap(snap_id);
+ }
+ if (!snapc.snaps.empty()) {
+ ctx->write_snap_context(
+ {{snapc.seq, {snapc.snaps.begin(), snapc.snaps.end()}}});
+ }
+ if (data_ctx.get_pool_full_try()) {
+ ctx->full_try(true);
+ }
+
+  // atomically reset the data IOContext to the new version
+ atomic_store(&data_io_context, ctx);
+ }
+
+ IOContext ImageCtx::get_data_io_context() const {
+ return atomic_load(&data_io_context);
+ }
+
+ IOContext ImageCtx::duplicate_data_io_context() const {
+ auto ctx = get_data_io_context();
+ return std::make_shared<neorados::IOContext>(*ctx);
+ }
+
+ void ImageCtx::get_timer_instance(CephContext *cct, SafeTimer **timer,
+ ceph::mutex **timer_lock) {
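+  // every image sharing this CephContext reuses one SafeTimer instance,
+  // registered under the legacy "librbd::journal::safe_timer" singleton key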
+ auto safe_timer_singleton =
+ &cct->lookup_or_create_singleton_object<SafeTimerSingleton>(
+ "librbd::journal::safe_timer", false, cct);
+ *timer = safe_timer_singleton;
+ *timer_lock = &safe_timer_singleton->lock;
+ }
+}
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
new file mode 100644
index 000000000..043b26efe
--- /dev/null
+++ b/src/librbd/ImageCtx.h
@@ -0,0 +1,365 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_IMAGECTX_H
+#define CEPH_LIBRBD_IMAGECTX_H
+
+#include "include/int_types.h"
+
+#include <atomic>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "common/allocator.h"
+#include "common/Timer.h"
+#include "common/ceph_mutex.h"
+#include "common/config_proxy.h"
+#include "common/event_socket.h"
+#include "common/Readahead.h"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+
+#include "include/common_fwd.h"
+#include "include/buffer_fwd.h"
+#include "include/rbd/librbd.hpp"
+#include "include/rbd_types.h"
+#include "include/types.h"
+#include "include/xlist.h"
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsyncRequest.h"
+#include "librbd/Types.h"
+
+#include <boost/lockfree/policies.hpp>
+#include <boost/lockfree/queue.hpp>
+
+namespace neorados {
+class IOContext;
+class RADOS;
+} // namespace neorados
+
+namespace librbd {
+
+ struct AsioEngine;
+ template <typename> class ConfigWatcher;
+ template <typename> class ExclusiveLock;
+ template <typename> class ImageState;
+ template <typename> class ImageWatcher;
+ template <typename> class Journal;
+ class LibrbdAdminSocketHook;
+ template <typename> class ObjectMap;
+ template <typename> class Operations;
+ template <typename> class PluginRegistry;
+
+ namespace asio { struct ContextWQ; }
+ namespace crypto { class CryptoInterface; }
+ namespace exclusive_lock { struct Policy; }
+ namespace io {
+ class AioCompletion;
+ class AsyncOperation;
+ template <typename> class CopyupRequest;
+ struct ImageDispatcherInterface;
+ struct ObjectDispatcherInterface;
+ }
+ namespace journal { struct Policy; }
+
+ namespace operation {
+ template <typename> class ResizeRequest;
+ }
+
+ struct ImageCtx {
+ typedef std::pair<cls::rbd::SnapshotNamespace, std::string> SnapKey;
+ struct SnapKeyComparator {
+ inline bool operator()(const SnapKey& lhs, const SnapKey& rhs) const {
+ // only compare by namespace type and name
+ if (lhs.first.which() != rhs.first.which()) {
+ return lhs.first.which() < rhs.first.which();
+ }
+ return lhs.second < rhs.second;
+ }
+ };
+
+ static const string METADATA_CONF_PREFIX;
+
+ CephContext *cct;
+ ConfigProxy config;
+ std::set<std::string> config_overrides;
+
+ PerfCounters *perfcounter;
+ struct rbd_obj_header_ondisk header;
+ ::SnapContext snapc;
+ std::vector<librados::snap_t> snaps; // this mirrors snapc.snaps, but is in
+ // a format librados can understand
+ std::map<librados::snap_t, SnapInfo> snap_info;
+ std::map<SnapKey, librados::snap_t, SnapKeyComparator> snap_ids;
+ uint64_t open_snap_id = CEPH_NOSNAP;
+ uint64_t snap_id;
+ bool snap_exists; // false if our snap_id was deleted
+    // whether the image was opened read-only; cannot be changed after opening
+ bool read_only;
+ uint32_t read_only_flags = 0U;
+ uint32_t read_only_mask = ~0U;
+
+ std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> lockers;
+ bool exclusive_locked;
+ std::string lock_tag;
+
+ std::string name;
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string snap_name;
+
+ std::shared_ptr<AsioEngine> asio_engine;
+
+ // New ASIO-style RADOS API
+ neorados::RADOS& rados_api;
+
+ // Legacy RADOS API
+ librados::IoCtx data_ctx;
+ librados::IoCtx md_ctx;
+
+ ConfigWatcher<ImageCtx> *config_watcher = nullptr;
+ ImageWatcher<ImageCtx> *image_watcher;
+ Journal<ImageCtx> *journal;
+
+ /**
+ * Lock ordering:
+ *
+ * owner_lock, image_lock
+ * async_op_lock, timestamp_lock
+ */
+ ceph::shared_mutex owner_lock; // protects exclusive lock leadership updates
+    mutable ceph::shared_mutex image_lock; // protects snapshot-related member variables,
+                // features (and associated helper classes), and flags; it also
+                // protects access to the mutable image metadata that
+                // isn't guarded by the other locks below, and blocks writes
+                // when held exclusively, so snapshots can be consistent.
+ // Fields guarded include:
+ // total_bytes_read
+ // exclusive_locked
+ // lock_tag
+ // lockers
+ // object_map
+ // parent_md and parent
+
+ ceph::shared_mutex timestamp_lock; // protects (create/access/modify)_timestamp
+ ceph::mutex async_ops_lock; // protects async_ops and async_requests
+ ceph::mutex copyup_list_lock; // protects copyup_waiting_list
+
+ unsigned extra_read_flags; // librados::OPERATION_*
+
+ bool old_format;
+ uint8_t order;
+ uint64_t size;
+ uint64_t features;
+ std::string object_prefix;
+ char *format_string;
+ std::string header_oid;
+ std::string id; // only used for new-format images
+ ParentImageInfo parent_md;
+ ImageCtx *parent;
+ ImageCtx *child = nullptr;
+ MigrationInfo migration_info;
+ cls::rbd::GroupSpec group_spec;
+ uint64_t stripe_unit, stripe_count;
+ uint64_t flags;
+ uint64_t op_features = 0;
+ bool operations_disabled = false;
+ utime_t create_timestamp;
+ utime_t access_timestamp;
+ utime_t modify_timestamp;
+
+ file_layout_t layout;
+
+ Readahead readahead;
+ std::atomic<uint64_t> total_bytes_read = {0};
+
+ std::map<uint64_t, io::CopyupRequest<ImageCtx>*> copyup_list;
+
+ xlist<io::AsyncOperation*> async_ops;
+ xlist<AsyncRequest<>*> async_requests;
+ std::list<Context*> async_requests_waiters;
+
+ ImageState<ImageCtx> *state;
+ Operations<ImageCtx> *operations;
+
+ ExclusiveLock<ImageCtx> *exclusive_lock;
+ ObjectMap<ImageCtx> *object_map;
+
+ xlist<operation::ResizeRequest<ImageCtx>*> resize_reqs;
+
+ io::ImageDispatcherInterface *io_image_dispatcher = nullptr;
+ io::ObjectDispatcherInterface *io_object_dispatcher = nullptr;
+
+ asio::ContextWQ *op_work_queue;
+
+ PluginRegistry<ImageCtx>* plugin_registry;
+
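+    // AIO completions pending delivery through the image's event socket;
+    // lock-free so completing I/O never blocks on a slow consumer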
+ typedef boost::lockfree::queue<
+ io::AioCompletion*,
+ boost::lockfree::allocator<ceph::allocator<void>>> Completions;
+
+ Completions event_socket_completions;
+ EventSocket event_socket;
+
+ bool ignore_migrating = false;
+ bool disable_zero_copy = false;
+ bool enable_sparse_copyup = false;
+
+ /// Cached latency-sensitive configuration settings
+ bool non_blocking_aio;
+ bool cache;
+ uint64_t sparse_read_threshold_bytes;
+ uint64_t readahead_max_bytes = 0;
+ uint64_t readahead_disable_after_bytes = 0;
+ bool clone_copy_on_read;
+ bool enable_alloc_hint;
+ uint32_t alloc_hint_flags = 0U;
+ uint32_t read_flags = 0U; // librados::OPERATION_*
+ uint32_t discard_granularity_bytes = 0;
+ bool blkin_trace_all;
+ uint64_t mirroring_replay_delay;
+ uint64_t mtime_update_interval;
+ uint64_t atime_update_interval;
+
+ LibrbdAdminSocketHook *asok_hook;
+
+ exclusive_lock::Policy *exclusive_lock_policy = nullptr;
+ journal::Policy *journal_policy = nullptr;
+
+ ZTracer::Endpoint trace_endpoint;
+
+ crypto::CryptoInterface* crypto = nullptr;
+
+ // unit test mock helpers
+ static ImageCtx* create(const std::string &image_name,
+ const std::string &image_id,
+ const char *snap, IoCtx& p, bool read_only) {
+ return new ImageCtx(image_name, image_id, snap, p, read_only);
+ }
+ static ImageCtx* create(const std::string &image_name,
+ const std::string &image_id,
+ librados::snap_t snap_id, IoCtx& p,
+ bool read_only) {
+ return new ImageCtx(image_name, image_id, snap_id, p, read_only);
+ }
+
+ /**
+ * Either image_name or image_id must be set.
+ * If id is not known, pass the empty std::string,
+ * and init() will look it up.
+ */
+ ImageCtx(const std::string &image_name, const std::string &image_id,
+ const char *snap, IoCtx& p, bool read_only);
+ ImageCtx(const std::string &image_name, const std::string &image_id,
+ librados::snap_t snap_id, IoCtx& p, bool read_only);
+ ~ImageCtx();
+ void init();
+ void shutdown();
+ void init_layout(int64_t pool_id);
+ void perf_start(std::string name);
+ void perf_stop();
+ void set_read_flag(unsigned flag);
+ int get_read_flags(librados::snap_t snap_id);
+ int snap_set(uint64_t snap_id);
+ void snap_unset();
+ librados::snap_t get_snap_id(const cls::rbd::SnapshotNamespace& in_snap_namespace,
+ const std::string& in_snap_name) const;
+ const SnapInfo* get_snap_info(librados::snap_t in_snap_id) const;
+ int get_snap_name(librados::snap_t in_snap_id,
+ std::string *out_snap_name) const;
+ int get_snap_namespace(librados::snap_t in_snap_id,
+ cls::rbd::SnapshotNamespace *out_snap_namespace) const;
+ int get_parent_spec(librados::snap_t in_snap_id,
+ cls::rbd::ParentImageSpec *pspec) const;
+ int is_snap_protected(librados::snap_t in_snap_id,
+ bool *is_protected) const;
+ int is_snap_unprotected(librados::snap_t in_snap_id,
+ bool *is_unprotected) const;
+
+ uint64_t get_current_size() const;
+ uint64_t get_object_size() const;
+ string get_object_name(uint64_t num) const;
+ uint64_t get_stripe_unit() const;
+ uint64_t get_stripe_count() const;
+ uint64_t get_stripe_period() const;
+ utime_t get_create_timestamp() const;
+ utime_t get_access_timestamp() const;
+ utime_t get_modify_timestamp() const;
+
+ void set_access_timestamp(utime_t at);
+ void set_modify_timestamp(utime_t at);
+
+ void add_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+ std::string in_snap_name,
+ librados::snap_t id,
+ uint64_t in_size, const ParentImageInfo &parent,
+ uint8_t protection_status, uint64_t flags, utime_t timestamp);
+ void rm_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+ std::string in_snap_name,
+ librados::snap_t id);
+ uint64_t get_image_size(librados::snap_t in_snap_id) const;
+ uint64_t get_effective_image_size(librados::snap_t in_snap_id) const;
+ uint64_t get_object_count(librados::snap_t in_snap_id) const;
+ bool test_features(uint64_t test_features) const;
+ bool test_features(uint64_t test_features,
+ const ceph::shared_mutex &in_image_lock) const;
+ bool test_op_features(uint64_t op_features) const;
+ bool test_op_features(uint64_t op_features,
+ const ceph::shared_mutex &in_image_lock) const;
+ int get_flags(librados::snap_t in_snap_id, uint64_t *flags) const;
+ int test_flags(librados::snap_t in_snap_id,
+ uint64_t test_flags, bool *flags_set) const;
+ int test_flags(librados::snap_t in_snap_id,
+ uint64_t test_flags, const ceph::shared_mutex &in_image_lock,
+ bool *flags_set) const;
+ int update_flags(librados::snap_t in_snap_id, uint64_t flag, bool enabled);
+
+ const ParentImageInfo* get_parent_info(librados::snap_t in_snap_id) const;
+ int64_t get_parent_pool_id(librados::snap_t in_snap_id) const;
+ std::string get_parent_image_id(librados::snap_t in_snap_id) const;
+ uint64_t get_parent_snap_id(librados::snap_t in_snap_id) const;
+ int get_parent_overlap(librados::snap_t in_snap_id,
+ uint64_t *overlap) const;
+ void register_watch(Context *on_finish);
+ uint64_t prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
+ uint64_t overlap);
+
+ void cancel_async_requests();
+ void cancel_async_requests(Context *on_finish);
+
+ void apply_metadata(const std::map<std::string, bufferlist> &meta,
+ bool thread_safe);
+
+ ExclusiveLock<ImageCtx> *create_exclusive_lock();
+ ObjectMap<ImageCtx> *create_object_map(uint64_t snap_id);
+ Journal<ImageCtx> *create_journal();
+
+ void set_image_name(const std::string &name);
+
+ void notify_update();
+ void notify_update(Context *on_finish);
+
+ exclusive_lock::Policy *get_exclusive_lock_policy() const;
+ void set_exclusive_lock_policy(exclusive_lock::Policy *policy);
+
+ journal::Policy *get_journal_policy() const;
+ void set_journal_policy(journal::Policy *policy);
+
+ void rebuild_data_io_context();
+ IOContext get_data_io_context() const;
+ IOContext duplicate_data_io_context() const;
+
+ static void get_timer_instance(CephContext *cct, SafeTimer **timer,
+ ceph::mutex **timer_lock);
+
+ private:
+ std::shared_ptr<neorados::IOContext> data_io_context;
+ };
+}
+
+#endif
diff --git a/src/librbd/ImageState.cc b/src/librbd/ImageState.cc
new file mode 100644
index 000000000..a81a8373d
--- /dev/null
+++ b/src/librbd/ImageState.cc
@@ -0,0 +1,1040 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ImageState.h"
+#include "include/rbd/librbd.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "common/WorkQueue.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/TaskFinisher.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/OpenRequest.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/image/SetSnapRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageState: " << this << " "
+
+namespace librbd {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+class ImageUpdateWatchers {
+public:
+
+ explicit ImageUpdateWatchers(CephContext *cct) : m_cct(cct),
+ m_lock(ceph::make_mutex(util::unique_lock_name("librbd::ImageUpdateWatchers::m_lock", this))) {
+ }
+
+ ~ImageUpdateWatchers() {
+ ceph_assert(m_watchers.empty());
+ ceph_assert(m_in_flight.empty());
+ ceph_assert(m_pending_unregister.empty());
+ ceph_assert(m_on_shut_down_finish == nullptr);
+
+ destroy_work_queue();
+ }
+
+ void flush(Context *on_finish) {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+ {
+ std::lock_guard locker{m_lock};
+ if (!m_in_flight.empty()) {
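+        // queue the flush behind the in-flight notifications; the
+        // single-threaded work queue completes them in order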
+ Context *ctx = new LambdaContext(
+ [this, on_finish](int r) {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+ << ": completing flush" << dendl;
+ on_finish->complete(r);
+ });
+ m_work_queue->queue(ctx, 0);
+ return;
+ }
+ }
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+ << ": completing flush" << dendl;
+ on_finish->complete(0);
+ }
+
+ void shut_down(Context *on_finish) {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_on_shut_down_finish == nullptr);
+ m_watchers.clear();
+ if (!m_in_flight.empty()) {
+ m_on_shut_down_finish = on_finish;
+ return;
+ }
+ }
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+ << ": completing shut down" << dendl;
+ on_finish->complete(0);
+ }
+
+ void register_watcher(UpdateWatchCtx *watcher, uint64_t *handle) {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": watcher="
+ << watcher << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_on_shut_down_finish == nullptr);
+
+ create_work_queue();
+
+ *handle = m_next_handle++;
+ m_watchers.insert(std::make_pair(*handle, watcher));
+ }
+
+ void unregister_watcher(uint64_t handle, Context *on_finish) {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+ << handle << dendl;
+ int r = 0;
+ {
+ std::lock_guard locker{m_lock};
+ auto it = m_watchers.find(handle);
+ if (it == m_watchers.end()) {
+ r = -ENOENT;
+ } else {
+ if (m_in_flight.find(handle) != m_in_flight.end()) {
+ ceph_assert(m_pending_unregister.find(handle) == m_pending_unregister.end());
+ m_pending_unregister[handle] = on_finish;
+ on_finish = nullptr;
+ }
+ m_watchers.erase(it);
+ }
+ }
+
+ if (on_finish) {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+ << ": completing unregister" << dendl;
+ on_finish->complete(r);
+ }
+ }
+
+ void notify() {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+
+ std::lock_guard locker{m_lock};
+ for (auto it : m_watchers) {
+ send_notify(it.first, it.second);
+ }
+ }
+
+ void send_notify(uint64_t handle, UpdateWatchCtx *watcher) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+ << handle << ", watcher=" << watcher << dendl;
+
+ m_in_flight.insert(handle);
+
+ Context *ctx = new LambdaContext(
+ [this, handle, watcher](int r) {
+ handle_notify(handle, watcher);
+ });
+
+ m_work_queue->queue(ctx, 0);
+ }
+
+ void handle_notify(uint64_t handle, UpdateWatchCtx *watcher) {
+
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+ << handle << ", watcher=" << watcher << dendl;
+
+ watcher->handle_notify();
+
+ Context *on_unregister_finish = nullptr;
+ Context *on_shut_down_finish = nullptr;
+
+ {
+ std::lock_guard locker{m_lock};
+
+ auto in_flight_it = m_in_flight.find(handle);
+ ceph_assert(in_flight_it != m_in_flight.end());
+ m_in_flight.erase(in_flight_it);
+
+      // If there are no more in-flight notifications for this watcher
+      // and it is pending unregister, complete it now.
+ if (m_in_flight.find(handle) == m_in_flight.end()) {
+ auto it = m_pending_unregister.find(handle);
+ if (it != m_pending_unregister.end()) {
+ on_unregister_finish = it->second;
+ m_pending_unregister.erase(it);
+ }
+ }
+
+ if (m_in_flight.empty()) {
+ ceph_assert(m_pending_unregister.empty());
+ if (m_on_shut_down_finish != nullptr) {
+ std::swap(m_on_shut_down_finish, on_shut_down_finish);
+ }
+ }
+ }
+
+ if (on_unregister_finish != nullptr) {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+ << ": completing unregister" << dendl;
+ on_unregister_finish->complete(0);
+ }
+
+ if (on_shut_down_finish != nullptr) {
+ ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+ << ": completing shut down" << dendl;
+ on_shut_down_finish->complete(0);
+ }
+ }
+
+private:
+ class ThreadPoolSingleton : public ThreadPool {
+ public:
+ explicit ThreadPoolSingleton(CephContext *cct)
+ : ThreadPool(cct, "librbd::ImageUpdateWatchers::thread_pool", "tp_librbd",
+ 1) {
+ start();
+ }
+ ~ThreadPoolSingleton() override {
+ stop();
+ }
+ };
+
+ CephContext *m_cct;
+ ceph::mutex m_lock;
+ ContextWQ *m_work_queue = nullptr;
+ std::map<uint64_t, UpdateWatchCtx*> m_watchers;
+ uint64_t m_next_handle = 0;
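+  // one entry per queued notification, so a handle may appear multiple times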
+ std::multiset<uint64_t> m_in_flight;
+ std::map<uint64_t, Context*> m_pending_unregister;
+ Context *m_on_shut_down_finish = nullptr;
+
+ void create_work_queue() {
+ if (m_work_queue != nullptr) {
+ return;
+ }
+ auto& thread_pool = m_cct->lookup_or_create_singleton_object<
+ ThreadPoolSingleton>("librbd::ImageUpdateWatchers::thread_pool",
+ false, m_cct);
+ m_work_queue = new ContextWQ("librbd::ImageUpdateWatchers::work_queue",
+ ceph::make_timespan(
+ m_cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout")),
+ &thread_pool);
+ }
+
+ void destroy_work_queue() {
+ if (m_work_queue == nullptr) {
+ return;
+ }
+ m_work_queue->drain();
+ delete m_work_queue;
+ }
+};
+
+class QuiesceWatchers {
+public:
+ explicit QuiesceWatchers(CephContext *cct, asio::ContextWQ* work_queue)
+ : m_cct(cct),
+ m_work_queue(work_queue),
+ m_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::QuiesceWatchers::m_lock", this))) {
+ }
+
+ ~QuiesceWatchers() {
+ ceph_assert(m_pending_unregister.empty());
+ ceph_assert(m_on_notify == nullptr);
+ }
+
+ void register_watcher(QuiesceWatchCtx *watcher, uint64_t *handle) {
+ ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << ": watcher="
+ << watcher << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ *handle = m_next_handle++;
+ m_watchers[*handle] = watcher;
+ }
+
+ void unregister_watcher(uint64_t handle, Context *on_finish) {
+ int r = 0;
+ {
+ std::lock_guard locker{m_lock};
+ auto it = m_watchers.find(handle);
+ if (it == m_watchers.end()) {
+ r = -ENOENT;
+ } else {
+ if (m_on_notify != nullptr) {
+ ceph_assert(!m_pending_unregister.count(handle));
+ m_pending_unregister[handle] = on_finish;
+ on_finish = nullptr;
+ }
+ m_watchers.erase(it);
+ }
+ }
+
+ if (on_finish) {
+ ldout(m_cct, 20) << "QuiesceWatchers::" << __func__
+ << ": completing unregister " << handle << dendl;
+ on_finish->complete(r);
+ }
+ }
+
+ void notify_quiesce(Context *on_finish) {
+ std::lock_guard locker{m_lock};
+ if (m_blocked) {
+ ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << ": queue" << dendl;
+ m_pending_notify.push_back(on_finish);
+ return;
+ }
+
+ notify(QUIESCE, on_finish);
+ }
+
+ void notify_unquiesce(Context *on_finish) {
+ std::lock_guard locker{m_lock};
+
+ notify(UNQUIESCE, on_finish);
+ }
+
+ void quiesce_complete(uint64_t handle, int r) {
+ Context *on_notify = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_on_notify != nullptr);
+ ceph_assert(m_handle_quiesce_cnt > 0);
+
+ m_handle_quiesce_cnt--;
+
+ if (r < 0) {
+ ldout(m_cct, 10) << "QuiesceWatchers::" << __func__ << ": watcher "
+ << handle << " failed" << dendl;
+ m_failed_watchers.insert(handle);
+ m_ret_val = r;
+ }
+
+ if (m_handle_quiesce_cnt > 0) {
+ return;
+ }
+
+ std::swap(on_notify, m_on_notify);
+ r = m_ret_val;
+ }
+
+ on_notify->complete(r);
+ }
+
+private:
+ enum EventType {QUIESCE, UNQUIESCE};
+
+ CephContext *m_cct;
+ asio::ContextWQ *m_work_queue;
+
+ ceph::mutex m_lock;
+ std::map<uint64_t, QuiesceWatchCtx*> m_watchers;
+ uint64_t m_next_handle = 0;
+ Context *m_on_notify = nullptr;
+ std::list<Context *> m_pending_notify;
+ std::map<uint64_t, Context*> m_pending_unregister;
+ uint64_t m_handle_quiesce_cnt = 0;
+ std::set<uint64_t> m_failed_watchers;
+ bool m_blocked = false;
+ int m_ret_val = 0;
+
+ void notify(EventType event_type, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ if (m_watchers.empty()) {
+ m_work_queue->queue(on_finish);
+ return;
+ }
+
+ ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << " event: "
+ << event_type << dendl;
+
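+    // QUIESCE completes through quiesce_complete() once every watcher acks;
+    // UNQUIESCE completes when the gather below fires handle_notify_unquiesce()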
+ Context *ctx = nullptr;
+ if (event_type == QUIESCE) {
+ ceph_assert(!m_blocked);
+ ceph_assert(m_handle_quiesce_cnt == 0);
+
+ m_blocked = true;
+ m_handle_quiesce_cnt = m_watchers.size();
+ m_failed_watchers.clear();
+ m_ret_val = 0;
+ } else {
+ ceph_assert(event_type == UNQUIESCE);
+ ceph_assert(m_blocked);
+
+ ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ QuiesceWatchers, &QuiesceWatchers::handle_notify_unquiesce>(this));
+ }
+ auto gather_ctx = new C_Gather(m_cct, ctx);
+
+ ceph_assert(m_on_notify == nullptr);
+
+ m_on_notify = on_finish;
+
+ for (auto &[handle, watcher] : m_watchers) {
+ send_notify(handle, watcher, event_type, gather_ctx->new_sub());
+ }
+
+ gather_ctx->activate();
+ }
+
+ void send_notify(uint64_t handle, QuiesceWatchCtx *watcher,
+ EventType event_type, Context *on_finish) {
+ auto ctx = new LambdaContext(
+ [this, handle, watcher, event_type, on_finish](int) {
+ ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << ": handle="
+ << handle << ", event_type=" << event_type << dendl;
+ switch (event_type) {
+ case QUIESCE:
+ watcher->handle_quiesce();
+ break;
+ case UNQUIESCE:
+ {
+ std::lock_guard locker{m_lock};
+
+ if (m_failed_watchers.count(handle)) {
+ ldout(m_cct, 20) << "QuiesceWatchers::" << __func__
+ << ": skip for failed watcher" << dendl;
+ break;
+ }
+ }
+ watcher->handle_unquiesce();
+ break;
+ default:
+ ceph_abort_msgf("invalid event_type %d", event_type);
+ }
+
+ on_finish->complete(0);
+ });
+
+ m_work_queue->queue(ctx);
+ }
+
+ void handle_notify_unquiesce(int r) {
+ ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << ": r=" << r
+ << dendl;
+
+ ceph_assert(r == 0);
+
+ std::unique_lock locker{m_lock};
+
+ if (!m_pending_unregister.empty()) {
+ std::map<uint64_t, Context*> pending_unregister;
+ std::swap(pending_unregister, m_pending_unregister);
+ locker.unlock();
+ for (auto &it : pending_unregister) {
+ ldout(m_cct, 20) << "QuiesceWatchers::" << __func__
+ << ": completing unregister " << it.first << dendl;
+ it.second->complete(0);
+ }
+ locker.lock();
+ }
+
+ Context *on_notify = nullptr;
+ std::swap(on_notify, m_on_notify);
+
+ ceph_assert(m_blocked);
+ m_blocked = false;
+
+ if (!m_pending_notify.empty()) {
+ auto on_finish = m_pending_notify.front();
+ m_pending_notify.pop_front();
+ notify(QUIESCE, on_finish);
+ }
+
+ locker.unlock();
+ on_notify->complete(0);
+ }
+};
+
+template <typename I>
+ImageState<I>::ImageState(I *image_ctx)
+ : m_image_ctx(image_ctx), m_state(STATE_UNINITIALIZED),
+ m_lock(ceph::make_mutex(util::unique_lock_name("librbd::ImageState::m_lock", this))),
+ m_last_refresh(0), m_refresh_seq(0),
+ m_update_watchers(new ImageUpdateWatchers(image_ctx->cct)),
+ m_quiesce_watchers(new QuiesceWatchers(
+ image_ctx->cct, image_ctx->asio_engine->get_work_queue())) {
+}
+
+template <typename I>
+ImageState<I>::~ImageState() {
+ ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED);
+ delete m_update_watchers;
+ delete m_quiesce_watchers;
+}
+
+template <typename I>
+int ImageState<I>::open(uint64_t flags) {
+ C_SaferCond ctx;
+ open(flags, &ctx);
+
+ int r = ctx.wait();
+ return r;
+}
+
+template <typename I>
+void ImageState<I>::open(uint64_t flags, Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_lock.lock();
+ ceph_assert(m_state == STATE_UNINITIALIZED);
+ m_open_flags = flags;
+
+ Action action(ACTION_TYPE_OPEN);
+ action.refresh_seq = m_refresh_seq;
+
+ execute_action_unlock(action, on_finish);
+}
+
+template <typename I>
+int ImageState<I>::close() {
+ C_SaferCond ctx;
+ close(&ctx);
+
+ int r = ctx.wait();
+ return r;
+}
+
+template <typename I>
+void ImageState<I>::close(Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_lock.lock();
+ ceph_assert(!is_closed());
+
+ Action action(ACTION_TYPE_CLOSE);
+ action.refresh_seq = m_refresh_seq;
+ execute_action_unlock(action, on_finish);
+}
+
+template <typename I>
+void ImageState<I>::handle_update_notification() {
+ std::lock_guard locker{m_lock};
+ ++m_refresh_seq;
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": refresh_seq = " << m_refresh_seq << ", "
+ << "last_refresh = " << m_last_refresh << dendl;
+
+ switch (m_state) {
+ case STATE_UNINITIALIZED:
+ case STATE_CLOSED:
+ case STATE_OPENING:
+ case STATE_CLOSING:
+ ldout(cct, 5) << "dropping update notification to watchers" << dendl;
+ return;
+ default:
+ break;
+ }
+
+ m_update_watchers->notify();
+}
+
+template <typename I>
+bool ImageState<I>::is_refresh_required() const {
+ std::lock_guard locker{m_lock};
+ return (m_last_refresh != m_refresh_seq || find_pending_refresh() != nullptr);
+}
+
+template <typename I>
+int ImageState<I>::refresh() {
+ C_SaferCond refresh_ctx;
+ refresh(&refresh_ctx);
+ return refresh_ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::refresh(Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_lock.lock();
+ if (is_closed()) {
+ m_lock.unlock();
+ on_finish->complete(-ESHUTDOWN);
+ return;
+ }
+
+ Action action(ACTION_TYPE_REFRESH);
+ action.refresh_seq = m_refresh_seq;
+ execute_action_unlock(action, on_finish);
+}
+
+template <typename I>
+int ImageState<I>::refresh_if_required() {
+ C_SaferCond ctx;
+ {
+ m_lock.lock();
+ Action action(ACTION_TYPE_REFRESH);
+ action.refresh_seq = m_refresh_seq;
+
+ auto refresh_action = find_pending_refresh();
+ if (refresh_action != nullptr) {
+ // if a refresh is in-flight, delay until it is finished
+ action = *refresh_action;
+ } else if (m_last_refresh == m_refresh_seq) {
+ m_lock.unlock();
+ return 0;
+ } else if (is_closed()) {
+ m_lock.unlock();
+ return -ESHUTDOWN;
+ }
+
+ execute_action_unlock(action, &ctx);
+ }
+
+ return ctx.wait();
+}
+
+template <typename I>
+const typename ImageState<I>::Action *
+ImageState<I>::find_pending_refresh() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
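+  // scan newest-to-oldest so callers piggyback on the most recently queued
+  // refresh rather than one that may already be executing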
+ auto it = std::find_if(m_actions_contexts.rbegin(),
+ m_actions_contexts.rend(),
+ [](const ActionContexts& action_contexts) {
+ return (action_contexts.first == ACTION_TYPE_REFRESH);
+ });
+ if (it != m_actions_contexts.rend()) {
+ return &it->first;
+ }
+ return nullptr;
+}
+
+template <typename I>
+void ImageState<I>::snap_set(uint64_t snap_id, Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": snap_id=" << snap_id << dendl;
+
+ Action action(ACTION_TYPE_SET_SNAP);
+ action.snap_id = snap_id;
+
+ m_lock.lock();
+ execute_action_unlock(action, on_finish);
+}
+
+template <typename I>
+void ImageState<I>::prepare_lock(Context *on_ready) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << dendl;
+
+ m_lock.lock();
+ if (is_closed()) {
+ m_lock.unlock();
+ on_ready->complete(-ESHUTDOWN);
+ return;
+ }
+
+ Action action(ACTION_TYPE_LOCK);
+ action.on_ready = on_ready;
+ execute_action_unlock(action, nullptr);
+}
+
+template <typename I>
+void ImageState<I>::handle_prepare_lock_complete() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << dendl;
+
+ m_lock.lock();
+ if (m_state != STATE_PREPARING_LOCK) {
+ m_lock.unlock();
+ return;
+ }
+
+ complete_action_unlock(STATE_OPEN, 0);
+}
+
+template <typename I>
+int ImageState<I>::register_update_watcher(UpdateWatchCtx *watcher,
+ uint64_t *handle) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_update_watchers->register_watcher(watcher, handle);
+
+ ldout(cct, 20) << __func__ << ": handle=" << *handle << dendl;
+ return 0;
+}
+
+template <typename I>
+void ImageState<I>::unregister_update_watcher(uint64_t handle,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": handle=" << handle << dendl;
+
+ m_update_watchers->unregister_watcher(handle, on_finish);
+}
+
+template <typename I>
+int ImageState<I>::unregister_update_watcher(uint64_t handle) {
+ C_SaferCond ctx;
+ unregister_update_watcher(handle, &ctx);
+ return ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::flush_update_watchers(Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_update_watchers->flush(on_finish);
+}
+
+template <typename I>
+void ImageState<I>::shut_down_update_watchers(Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_update_watchers->shut_down(on_finish);
+}
+
+template <typename I>
+bool ImageState<I>::is_transition_state() const {
+ switch (m_state) {
+ case STATE_UNINITIALIZED:
+ case STATE_OPEN:
+ case STATE_CLOSED:
+ return false;
+ case STATE_OPENING:
+ case STATE_CLOSING:
+ case STATE_REFRESHING:
+ case STATE_SETTING_SNAP:
+ case STATE_PREPARING_LOCK:
+ break;
+ }
+ return true;
+}
+
+template <typename I>
+bool ImageState<I>::is_closed() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ return ((m_state == STATE_CLOSED) ||
+ (!m_actions_contexts.empty() &&
+ m_actions_contexts.back().first.action_type == ACTION_TYPE_CLOSE));
+}
+
+template <typename I>
+void ImageState<I>::append_context(const Action &action, Context *context) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ ActionContexts *action_contexts = nullptr;
+ for (auto &action_ctxs : m_actions_contexts) {
+ if (action == action_ctxs.first) {
+ action_contexts = &action_ctxs;
+ break;
+ }
+ }
+
+ if (action_contexts == nullptr) {
+ m_actions_contexts.push_back({action, {}});
+ action_contexts = &m_actions_contexts.back();
+ }
+
+ if (context != nullptr) {
+ action_contexts->second.push_back(context);
+ }
+}
+
+template <typename I>
+void ImageState<I>::execute_next_action_unlock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_actions_contexts.empty());
+ switch (m_actions_contexts.front().first.action_type) {
+ case ACTION_TYPE_OPEN:
+ send_open_unlock();
+ return;
+ case ACTION_TYPE_CLOSE:
+ send_close_unlock();
+ return;
+ case ACTION_TYPE_REFRESH:
+ send_refresh_unlock();
+ return;
+ case ACTION_TYPE_SET_SNAP:
+ send_set_snap_unlock();
+ return;
+ case ACTION_TYPE_LOCK:
+ send_prepare_lock_unlock();
+ return;
+ }
+ ceph_abort();
+}
+
+template <typename I>
+void ImageState<I>::execute_action_unlock(const Action &action,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ append_context(action, on_finish);
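+  // if another action is already transitioning, the new action simply waits
+  // in the queue; complete_action_unlock() kicks it once the state settles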
+ if (!is_transition_state()) {
+ execute_next_action_unlock();
+ } else {
+ m_lock.unlock();
+ }
+}
+
+template <typename I>
+void ImageState<I>::complete_action_unlock(State next_state, int r) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_actions_contexts.empty());
+
+ ActionContexts action_contexts(std::move(m_actions_contexts.front()));
+ m_actions_contexts.pop_front();
+
+ m_state = next_state;
+ m_lock.unlock();
+
+ if (next_state == STATE_CLOSED ||
+ (next_state == STATE_UNINITIALIZED && r < 0)) {
+ // the ImageCtx must be deleted outside the scope of its callback threads
+ auto ctx = new LambdaContext(
+ [image_ctx=m_image_ctx, contexts=std::move(action_contexts.second)]
+ (int r) {
+ delete image_ctx;
+ for (auto ctx : contexts) {
+ ctx->complete(r);
+ }
+ });
+ TaskFinisherSingleton::get_singleton(m_image_ctx->cct).queue(ctx, r);
+ } else {
+ for (auto ctx : action_contexts.second) {
+ if (next_state == STATE_OPEN) {
+        // the open callback couldn't be wrapped with an async wrapper up
+        // front in case the image failed to open
+ ctx = create_async_context_callback(*m_image_ctx, ctx);
+ }
+ ctx->complete(r);
+ }
+
+ m_lock.lock();
+ if (!is_transition_state() && !m_actions_contexts.empty()) {
+ execute_next_action_unlock();
+ } else {
+ m_lock.unlock();
+ }
+ }
+}
+
+template <typename I>
+void ImageState<I>::send_open_unlock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_state = STATE_OPENING;
+
+ Context *ctx = create_context_callback<
+ ImageState<I>, &ImageState<I>::handle_open>(this);
+ image::OpenRequest<I> *req = image::OpenRequest<I>::create(
+ m_image_ctx, m_open_flags, ctx);
+
+ m_lock.unlock();
+ req->send();
+}
+
+template <typename I>
+void ImageState<I>::handle_open(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+ }
+
+ m_lock.lock();
+ complete_action_unlock(r < 0 ? STATE_UNINITIALIZED : STATE_OPEN, r);
+}
+
+template <typename I>
+void ImageState<I>::send_close_unlock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_state = STATE_CLOSING;
+
+ Context *ctx = create_context_callback<
+ ImageState<I>, &ImageState<I>::handle_close>(this);
+ image::CloseRequest<I> *req = image::CloseRequest<I>::create(
+ m_image_ctx, ctx);
+
+ m_lock.unlock();
+ req->send();
+}
+
+template <typename I>
+void ImageState<I>::handle_close(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "error occurred while closing image: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ m_lock.lock();
+ complete_action_unlock(STATE_CLOSED, r);
+}
+
+template <typename I>
+void ImageState<I>::send_refresh_unlock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_state = STATE_REFRESHING;
+ ceph_assert(!m_actions_contexts.empty());
+ auto &action_context = m_actions_contexts.front().first;
+ ceph_assert(action_context.action_type == ACTION_TYPE_REFRESH);
+
+ Context *ctx = create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ ImageState<I>, &ImageState<I>::handle_refresh>(this));
+ image::RefreshRequest<I> *req = image::RefreshRequest<I>::create(
+ *m_image_ctx, false, false, ctx);
+
+ m_lock.unlock();
+ req->send();
+}
+
+template <typename I>
+void ImageState<I>::handle_refresh(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ m_lock.lock();
+ ceph_assert(!m_actions_contexts.empty());
+
+ ActionContexts &action_contexts(m_actions_contexts.front());
+ ceph_assert(action_contexts.first.action_type == ACTION_TYPE_REFRESH);
+ ceph_assert(m_last_refresh <= action_contexts.first.refresh_seq);
+
+ if (r == -ERESTART) {
+ ldout(cct, 5) << "incomplete refresh: not updating sequence" << dendl;
+ r = 0;
+ } else {
+ m_last_refresh = action_contexts.first.refresh_seq;
+ }
+
+ complete_action_unlock(STATE_OPEN, r);
+}
+
+template <typename I>
+void ImageState<I>::send_set_snap_unlock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ m_state = STATE_SETTING_SNAP;
+
+ ceph_assert(!m_actions_contexts.empty());
+ ActionContexts &action_contexts(m_actions_contexts.front());
+ ceph_assert(action_contexts.first.action_type == ACTION_TYPE_SET_SNAP);
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "snap_id=" << action_contexts.first.snap_id << dendl;
+
+ Context *ctx = create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ ImageState<I>, &ImageState<I>::handle_set_snap>(this));
+ image::SetSnapRequest<I> *req = image::SetSnapRequest<I>::create(
+ *m_image_ctx, action_contexts.first.snap_id, ctx);
+
+ m_lock.unlock();
+ req->send();
+}
+
+template <typename I>
+void ImageState<I>::handle_set_snap(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << " r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to set snapshot: " << cpp_strerror(r) << dendl;
+ }
+
+ m_lock.lock();
+ complete_action_unlock(STATE_OPEN, r);
+}
+
+template <typename I>
+void ImageState<I>::send_prepare_lock_unlock() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ m_state = STATE_PREPARING_LOCK;
+
+ ceph_assert(!m_actions_contexts.empty());
+ ActionContexts &action_contexts(m_actions_contexts.front());
+ ceph_assert(action_contexts.first.action_type == ACTION_TYPE_LOCK);
+
+ Context *on_ready = action_contexts.first.on_ready;
+ m_lock.unlock();
+
+ if (on_ready == nullptr) {
+ complete_action_unlock(STATE_OPEN, 0);
+ return;
+ }
+
+  // wake up the lock handler now that it's safe to proceed
+ on_ready->complete(0);
+}
+
+template <typename I>
+int ImageState<I>::register_quiesce_watcher(QuiesceWatchCtx *watcher,
+ uint64_t *handle) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_quiesce_watchers->register_watcher(watcher, handle);
+
+ ldout(cct, 20) << __func__ << ": handle=" << *handle << dendl;
+ return 0;
+}
+
+template <typename I>
+int ImageState<I>::unregister_quiesce_watcher(uint64_t handle) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": handle=" << handle << dendl;
+
+ C_SaferCond ctx;
+ m_quiesce_watchers->unregister_watcher(handle, &ctx);
+ return ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::notify_quiesce(Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_quiesce_watchers->notify_quiesce(on_finish);
+}
+
+template <typename I>
+void ImageState<I>::notify_unquiesce(Context *on_finish) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ m_quiesce_watchers->notify_unquiesce(on_finish);
+}
+
+template <typename I>
+void ImageState<I>::quiesce_complete(uint64_t handle, int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": handle=" << handle << " r=" << r << dendl;
+ m_quiesce_watchers->quiesce_complete(handle, r);
+}
+
+} // namespace librbd
+
+template class librbd::ImageState<librbd::ImageCtx>;
diff --git a/src/librbd/ImageState.h b/src/librbd/ImageState.h
new file mode 100644
index 000000000..5107c1a17
--- /dev/null
+++ b/src/librbd/ImageState.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_STATE_H
+#define CEPH_LIBRBD_IMAGE_STATE_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include <list>
+#include <string>
+#include <utility>
+#include "cls/rbd/cls_rbd_types.h"
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+class QuiesceWatchCtx;
+class QuiesceWatchers;
+class ImageCtx;
+class ImageUpdateWatchers;
+class UpdateWatchCtx;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageState {
+public:
+ ImageState(ImageCtxT *image_ctx);
+ ~ImageState();
+
+ int open(uint64_t flags);
+ void open(uint64_t flags, Context *on_finish);
+
+ int close();
+ void close(Context *on_finish);
+
+ void handle_update_notification();
+
+ bool is_refresh_required() const;
+
+ int refresh();
+ int refresh_if_required();
+ void refresh(Context *on_finish);
+
+ void snap_set(uint64_t snap_id, Context *on_finish);
+
+ void prepare_lock(Context *on_ready);
+ void handle_prepare_lock_complete();
+
+ int register_update_watcher(UpdateWatchCtx *watcher, uint64_t *handle);
+ void unregister_update_watcher(uint64_t handle, Context *on_finish);
+ int unregister_update_watcher(uint64_t handle);
+ void flush_update_watchers(Context *on_finish);
+ void shut_down_update_watchers(Context *on_finish);
+
+ int register_quiesce_watcher(QuiesceWatchCtx *watcher, uint64_t *handle);
+ int unregister_quiesce_watcher(uint64_t handle);
+ void notify_quiesce(Context *on_finish);
+ void notify_unquiesce(Context *on_finish);
+ void quiesce_complete(uint64_t handle, int r);
+
+private:
+ enum State {
+ STATE_UNINITIALIZED,
+ STATE_OPEN,
+ STATE_CLOSED,
+ STATE_OPENING,
+ STATE_CLOSING,
+ STATE_REFRESHING,
+ STATE_SETTING_SNAP,
+ STATE_PREPARING_LOCK
+ };
+
+ enum ActionType {
+ ACTION_TYPE_OPEN,
+ ACTION_TYPE_CLOSE,
+ ACTION_TYPE_REFRESH,
+ ACTION_TYPE_SET_SNAP,
+ ACTION_TYPE_LOCK
+ };
+
+ struct Action {
+ ActionType action_type;
+ uint64_t refresh_seq = 0;
+ uint64_t snap_id = CEPH_NOSNAP;
+ Context *on_ready = nullptr;
+
+ Action(ActionType action_type) : action_type(action_type) {
+ }
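+    // equal actions coalesce onto one queued entry: OPEN/CLOSE always match,
+    // LOCK never does, REFRESH/SET_SNAP match on their parameters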
+ inline bool operator==(const Action &action) const {
+ if (action_type != action.action_type) {
+ return false;
+ }
+ switch (action_type) {
+ case ACTION_TYPE_REFRESH:
+ return (refresh_seq == action.refresh_seq);
+ case ACTION_TYPE_SET_SNAP:
+ return (snap_id == action.snap_id);
+ case ACTION_TYPE_LOCK:
+ return false;
+ default:
+ return true;
+ }
+ }
+ };
+
+ typedef std::list<Context *> Contexts;
+ typedef std::pair<Action, Contexts> ActionContexts;
+ typedef std::list<ActionContexts> ActionsContexts;
+
+ ImageCtxT *m_image_ctx;
+ State m_state;
+
+ mutable ceph::mutex m_lock;
+ ActionsContexts m_actions_contexts;
+
+ uint64_t m_last_refresh;
+ uint64_t m_refresh_seq;
+
+ ImageUpdateWatchers *m_update_watchers;
+ QuiesceWatchers *m_quiesce_watchers;
+
+ uint64_t m_open_flags;
+
+ bool is_transition_state() const;
+ bool is_closed() const;
+
+ const Action *find_pending_refresh() const;
+
+ void append_context(const Action &action, Context *context);
+ void execute_next_action_unlock();
+ void execute_action_unlock(const Action &action, Context *context);
+ void complete_action_unlock(State next_state, int r);
+
+ void send_open_unlock();
+ void handle_open(int r);
+
+ void send_close_unlock();
+ void handle_close(int r);
+
+ void send_refresh_unlock();
+ void handle_refresh(int r);
+
+ void send_set_snap_unlock();
+ void handle_set_snap(int r);
+
+ void send_prepare_lock_unlock();
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ImageState<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_STATE_H
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
new file mode 100644
index 000000000..789d7694a
--- /dev/null
+++ b/src/librbd/ImageWatcher.cc
@@ -0,0 +1,1555 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ImageWatcher.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/TaskFinisher.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/image_watcher/NotifyLockOwner.h"
+#include "librbd/io/AioCompletion.h"
+#include "include/encoding.h"
+#include "common/errno.h"
+#include <boost/bind/bind.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageWatcher: "
+
+namespace librbd {
+
+using namespace image_watcher;
+using namespace watch_notify;
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+using ceph::encode;
+using ceph::decode;
+
+using namespace boost::placeholders;
+
+static const double RETRY_DELAY_SECONDS = 1.0;
+
+template <typename I>
+struct ImageWatcher<I>::C_ProcessPayload : public Context {
+ ImageWatcher *image_watcher;
+ uint64_t notify_id;
+ uint64_t handle;
+ std::unique_ptr<watch_notify::Payload> payload;
+
+ C_ProcessPayload(ImageWatcher *image_watcher, uint64_t notify_id,
+ uint64_t handle,
+ std::unique_ptr<watch_notify::Payload> &&payload)
+ : image_watcher(image_watcher), notify_id(notify_id), handle(handle),
+ payload(std::move(payload)) {
+ }
+
+ void finish(int r) override {
+ image_watcher->m_async_op_tracker.start_op();
+ if (image_watcher->notifications_blocked()) {
+ // requests are blocked -- just ack the notification
+ bufferlist bl;
+ image_watcher->acknowledge_notify(notify_id, handle, bl);
+ } else {
+ image_watcher->process_payload(notify_id, handle, payload.get());
+ }
+ image_watcher->m_async_op_tracker.finish_op();
+ }
+};
+
+template <typename I>
+ImageWatcher<I>::ImageWatcher(I &image_ctx)
+ : Watcher(image_ctx.md_ctx, image_ctx.op_work_queue, image_ctx.header_oid),
+ m_image_ctx(image_ctx),
+ m_task_finisher(new TaskFinisher<Task>(*m_image_ctx.cct)),
+ m_async_request_lock(ceph::make_shared_mutex(
+ util::unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this))),
+ m_owner_client_id_lock(ceph::make_mutex(
+ util::unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this)))
+{
+}
+
+template <typename I>
+ImageWatcher<I>::~ImageWatcher()
+{
+ delete m_task_finisher;
+}
+
+template <typename I>
+void ImageWatcher<I>::unregister_watch(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " unregistering image watcher" << dendl;
+
+ cancel_async_requests();
+
+ // flush the task finisher queue before completing
+ on_finish = create_async_context_callback(m_task_finisher, on_finish);
+
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ cancel_quiesce_requests();
+ m_task_finisher->cancel_all();
+ m_async_op_tracker.wait_for_ops(on_finish);
+ });
+ Watcher::unregister_watch(on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::block_notifies(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ cancel_async_requests();
+ on_finish->complete(r);
+ });
+ Watcher::block_notifies(on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::schedule_async_progress(const AsyncRequestId &request,
+ uint64_t offset, uint64_t total) {
+ auto ctx = new LambdaContext([this, request, offset, total](int r) {
+ if (r != -ECANCELED) {
+ notify_async_progress(request, offset, total);
+ }
+ });
+ m_task_finisher->queue(Task(TASK_CODE_ASYNC_PROGRESS, request), ctx);
+}
+
+template <typename I>
+int ImageWatcher<I>::notify_async_progress(const AsyncRequestId &request,
+ uint64_t offset, uint64_t total) {
+ ldout(m_image_ctx.cct, 20) << this << " remote async request progress: "
+ << request << " @ " << offset
+ << "/" << total << dendl;
+
+ send_notify(new AsyncProgressPayload(request, offset, total));
+ return 0;
+}
+
+template <typename I>
+void ImageWatcher<I>::schedule_async_complete(const AsyncRequestId &request,
+ int r) {
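+  // paired with finish_op() in handle_async_complete() once the
+  // notification completes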
+ m_async_op_tracker.start_op();
+ auto ctx = new LambdaContext([this, request, ret_val=r](int r) {
+ if (r != -ECANCELED) {
+ notify_async_complete(request, ret_val);
+ }
+ });
+ m_task_finisher->queue(ctx);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_async_complete(const AsyncRequestId &request,
+ int r) {
+ ldout(m_image_ctx.cct, 20) << this << " remote async request finished: "
+ << request << " = " << r << dendl;
+
+ send_notify(new AsyncCompletePayload(request, r),
+ new LambdaContext(boost::bind(&ImageWatcher<I>::handle_async_complete,
+ this, request, r, _1)));
+}
+
+template <typename I>
+void ImageWatcher<I>::handle_async_complete(const AsyncRequestId &request,
+ int r, int ret_val) {
+ ldout(m_image_ctx.cct, 20) << this << " " << __func__ << ": "
+ << "request=" << request << ", r=" << ret_val
+ << dendl;
+ if (ret_val < 0) {
+ lderr(m_image_ctx.cct) << this << " failed to notify async complete: "
+ << cpp_strerror(ret_val) << dendl;
+ if (ret_val == -ETIMEDOUT && !is_unregistered()) {
+ schedule_async_complete(request, r);
+ m_async_op_tracker.finish_op();
+ return;
+ }
+ }
+
+ std::unique_lock async_request_locker{m_async_request_lock};
+ mark_async_request_complete(request, r);
+ m_async_op_tracker.finish_op();
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_flatten(uint64_t request_id,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
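+  // forward the operation to the exclusive lock owner and track completion
+  // as an async request (the notify_* helpers below follow the same pattern)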
+ notify_async_request(async_request_id, new FlattenPayload(async_request_id),
+ prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_resize(uint64_t request_id, uint64_t size,
+ bool allow_shrink,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ new ResizePayload(async_request_id, size, allow_shrink),
+ prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_snap_create(uint64_t request_id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t flags,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ new SnapCreatePayload(async_request_id, snap_namespace,
+ snap_name, flags),
+ prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_snap_rename(uint64_t request_id,
+ const snapid_t &src_snap_id,
+ const std::string &dst_snap_name,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(
+ async_request_id,
+ new SnapRenamePayload(async_request_id, src_snap_id, dst_snap_name),
+ m_no_op_prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_snap_remove(
+ uint64_t request_id, const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(
+ async_request_id,
+ new SnapRemovePayload(async_request_id, snap_namespace, snap_name),
+ m_no_op_prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_snap_protect(
+ uint64_t request_id, const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(
+ async_request_id,
+ new SnapProtectPayload(async_request_id, snap_namespace, snap_name),
+ m_no_op_prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_snap_unprotect(
+ uint64_t request_id, const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(
+ async_request_id,
+ new SnapUnprotectPayload(async_request_id, snap_namespace, snap_name),
+ m_no_op_prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_rebuild_object_map(uint64_t request_id,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ new RebuildObjectMapPayload(async_request_id),
+ prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_rename(uint64_t request_id,
+ const std::string &image_name,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ new RenamePayload(async_request_id, image_name),
+ m_no_op_prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_update_features(uint64_t request_id,
+ uint64_t features, bool enabled,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ new UpdateFeaturesPayload(async_request_id, features, enabled),
+ m_no_op_prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_migrate(uint64_t request_id,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id, new MigratePayload(async_request_id),
+ prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_sparsify(uint64_t request_id, size_t sparse_size,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(async_request_id,
+ new SparsifyPayload(async_request_id, sparse_size),
+ prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_header_update(Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << this << ": " << __func__ << dendl;
+
+ // supports legacy (empty buffer) clients
+ send_notify(new HeaderUpdatePayload(), on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_header_update(librados::IoCtx &io_ctx,
+ const std::string &oid) {
+ // supports legacy (empty buffer) clients
+ bufferlist bl;
+ encode(NotifyMessage(new HeaderUpdatePayload()), bl);
+ io_ctx.notify2(oid, bl, watcher::Notifier::NOTIFY_TIMEOUT, nullptr);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_quiesce(uint64_t *request_id,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ *request_id = util::reserve_async_request_id();
+
+ ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": request_id="
+ << *request_id << dendl;
+
+ AsyncRequestId async_request_id(get_client_id(), *request_id);
+
+ auto attempts = m_image_ctx.config.template get_val<uint64_t>(
+ "rbd_quiesce_notification_attempts");
+
+ notify_quiesce(async_request_id, attempts, prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_quiesce(const AsyncRequestId &async_request_id,
+ size_t attempts, ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": async_request_id="
+ << async_request_id << " attempts=" << attempts
+ << dendl;
+
+ ceph_assert(attempts > 0);
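+ // one notify per attempt: on -ETIMEDOUT retry with the remaining budget,
+ // reporting progress against the configured total number of attempts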
+ auto notify_response = new watcher::NotifyResponse();
+ auto on_notify = new LambdaContext(
+ [notify_response=std::unique_ptr<watcher::NotifyResponse>(notify_response),
+ this, async_request_id, &prog_ctx, on_finish, attempts=attempts-1](int r) {
+ auto total_attempts = m_image_ctx.config.template get_val<uint64_t>(
+ "rbd_quiesce_notification_attempts");
+ if (total_attempts < attempts) {
+ total_attempts = attempts;
+ }
+ prog_ctx.update_progress(total_attempts - attempts, total_attempts);
+
+ if (r == -ETIMEDOUT) {
+ ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": async_request_id="
+ << async_request_id << " timed out" << dendl;
+ if (attempts > 0) {
+ notify_quiesce(async_request_id, attempts, prog_ctx, on_finish);
+ return;
+ }
+ } else if (r == 0) {
+ for (auto &[client_id, bl] : notify_response->acks) {
+ if (bl.length() == 0) {
+ continue;
+ }
+ try {
+ auto iter = bl.cbegin();
+
+ ResponseMessage response_message;
+ using ceph::decode;
+ decode(response_message, iter);
+
+ if (response_message.result != -EOPNOTSUPP) {
+ r = response_message.result;
+ }
+ } catch (const buffer::error &err) {
+ r = -EINVAL;
+ }
+ if (r < 0) {
+ break;
+ }
+ }
+ }
+ if (r < 0) {
+ lderr(m_image_ctx.cct) << this << " failed to notify quiesce: "
+ << cpp_strerror(r) << dendl;
+ }
+ on_finish->complete(r);
+ });
+
+ bufferlist bl;
+ encode(NotifyMessage(new QuiescePayload(async_request_id)), bl);
+ Watcher::send_notify(bl, notify_response, on_notify);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_unquiesce(uint64_t request_id, Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": request_id="
+ << request_id << dendl;
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ send_notify(new UnquiescePayload(async_request_id), on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_metadata_set(uint64_t request_id,
+ const std::string &key,
+ const std::string &value,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(
+ async_request_id,
+ new MetadataUpdatePayload(async_request_id, key,
+ std::optional<std::string>{value}),
+ m_no_op_prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_metadata_remove(uint64_t request_id,
+ const std::string &key,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock &&
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+
+ AsyncRequestId async_request_id(get_client_id(), request_id);
+
+ notify_async_request(
+ async_request_id,
+ new MetadataUpdatePayload(async_request_id, key, std::nullopt),
+ m_no_op_prog_ctx, on_finish);
+}
+
+template <typename I>
+void ImageWatcher<I>::schedule_cancel_async_requests() {
+ auto ctx = new LambdaContext([this](int r) {
+ if (r != -ECANCELED) {
+ cancel_async_requests();
+ }
+ });
+ m_task_finisher->queue(TASK_CODE_CANCEL_ASYNC_REQUESTS, ctx);
+}
+
+template <typename I>
+void ImageWatcher<I>::cancel_async_requests() {
+ std::unique_lock l{m_async_request_lock};
+ for (auto iter = m_async_requests.begin(); iter != m_async_requests.end(); ) {
+ if (iter->second.second == nullptr) {
+ // Quiesce notify request. Skip.
+ iter++;
+ } else {
+ iter->second.first->complete(-ERESTART);
+ iter = m_async_requests.erase(iter);
+ }
+ }
+}
+
+template <typename I>
+void ImageWatcher<I>::set_owner_client_id(const ClientId& client_id) {
+ ceph_assert(ceph_mutex_is_locked(m_owner_client_id_lock));
+ m_owner_client_id = client_id;
+ ldout(m_image_ctx.cct, 10) << this << " current lock owner: "
+ << m_owner_client_id << dendl;
+}
+
+template <typename I>
+ClientId ImageWatcher<I>::get_client_id() {
+ std::shared_lock l{this->m_watch_lock};
+ return ClientId(m_image_ctx.md_ctx.get_instance_id(), this->m_watch_handle);
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_acquired_lock() {
+ ldout(m_image_ctx.cct, 10) << this << " notify acquired lock" << dendl;
+
+ ClientId client_id = get_client_id();
+ {
+ std::lock_guard owner_client_id_locker{m_owner_client_id_lock};
+ set_owner_client_id(client_id);
+ }
+
+ send_notify(new AcquiredLockPayload(client_id));
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_released_lock() {
+ ldout(m_image_ctx.cct, 10) << this << " notify released lock" << dendl;
+
+ {
+ std::lock_guard owner_client_id_locker{m_owner_client_id_lock};
+ set_owner_client_id(ClientId());
+ }
+
+ send_notify(new ReleasedLockPayload(get_client_id()));
+}
+
+template <typename I>
+void ImageWatcher<I>::schedule_request_lock(bool use_timer, int timer_delay) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ // see notify_request_lock()
+ if (m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner()) {
+ return;
+ }
+
+ std::shared_lock watch_locker{this->m_watch_lock};
+ if (this->is_registered(this->m_watch_lock)) {
+ ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl;
+
+ auto ctx = new LambdaContext([this](int r) {
+ if (r != -ECANCELED) {
+ notify_request_lock();
+ }
+ });
+
+ if (use_timer) {
+ if (timer_delay < 0) {
+ timer_delay = RETRY_DELAY_SECONDS;
+ }
+ m_task_finisher->add_event_after(TASK_CODE_REQUEST_LOCK,
+ timer_delay, ctx);
+ } else {
+ m_task_finisher->queue(TASK_CODE_REQUEST_LOCK, ctx);
+ }
+ }
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_request_lock() {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+
+ // ExclusiveLock state machine can be dynamically disabled or
+ // race with task cancel
+ if (m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner()) {
+ return;
+ }
+
+ ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl;
+
+ notify_lock_owner(new RequestLockPayload(get_client_id(), false),
+ create_context_callback<
+ ImageWatcher, &ImageWatcher<I>::handle_request_lock>(this));
+}
+
+template <typename I>
+void ImageWatcher<I>::handle_request_lock(int r) {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+
+ // ExclusiveLock state machine cannot transition -- but can be
+ // dynamically disabled
+ if (m_image_ctx.exclusive_lock == nullptr) {
+ return;
+ }
+
+ if (r == -ETIMEDOUT) {
+ ldout(m_image_ctx.cct, 5) << this << " timed out requesting lock: retrying"
+ << dendl;
+
+ // treat this as a dead client -- so retest acquiring the lock
+ m_image_ctx.exclusive_lock->handle_peer_notification(0);
+ } else if (r == -EROFS) {
+ ldout(m_image_ctx.cct, 5) << this << " peer will not release lock" << dendl;
+ m_image_ctx.exclusive_lock->handle_peer_notification(r);
+ } else if (r < 0) {
+ lderr(m_image_ctx.cct) << this << " error requesting lock: "
+ << cpp_strerror(r) << dendl;
+ schedule_request_lock(true);
+ } else {
+ // lock owner acked -- but resend if we don't see them release the lock
+ int retry_timeout = m_image_ctx.cct->_conf.template get_val<int64_t>(
+ "client_notify_timeout");
+ ldout(m_image_ctx.cct, 15) << this << " will retry in " << retry_timeout
+ << " seconds" << dendl;
+ schedule_request_lock(true, retry_timeout);
+ }
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_lock_owner(Payload *payload, Context *on_finish) {
+ ceph_assert(on_finish != nullptr);
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ bufferlist bl;
+ encode(NotifyMessage(payload), bl);
+
+ NotifyLockOwner *notify_lock_owner = NotifyLockOwner::create(
+ m_image_ctx, this->m_notifier, std::move(bl), on_finish);
+ notify_lock_owner->send();
+}
+
+template <typename I>
+bool ImageWatcher<I>::is_new_request(const AsyncRequestId &id) const {
+ ceph_assert(ceph_mutex_is_locked(m_async_request_lock));
+
+ return m_async_pending.count(id) == 0 && m_async_complete.count(id) == 0;
+}
+
+template <typename I>
+bool ImageWatcher<I>::mark_async_request_complete(const AsyncRequestId &id,
+ int r) {
+ ceph_assert(ceph_mutex_is_locked(m_async_request_lock));
+
+ bool found = m_async_pending.erase(id);
+
+ auto now = ceph_clock_now();
+
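+ // prune completed-request records whose retention window has expired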
+ auto it = m_async_complete_expiration.begin();
+ while (it != m_async_complete_expiration.end() && it->first < now) {
+ m_async_complete.erase(it->second);
+ it = m_async_complete_expiration.erase(it);
+ }
+
+ if (!m_async_complete.insert({id, r}).second) {
+ for (it = m_async_complete_expiration.begin();
+ it != m_async_complete_expiration.end(); it++) {
+ if (it->second == id) {
+ m_async_complete_expiration.erase(it);
+ break;
+ }
+ }
+ }
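+ // retain the result for 10 minutes so duplicate notifications can be
+ // answered without re-executing the request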
+ auto expiration_time = now;
+ expiration_time += 600;
+ m_async_complete_expiration.insert({expiration_time, id});
+
+ return found;
+}
+
+template <typename I>
+Context *ImageWatcher<I>::remove_async_request(const AsyncRequestId &id) {
+ std::unique_lock async_request_locker{m_async_request_lock};
+
+ return remove_async_request(id, m_async_request_lock);
+}
+
+template <typename I>
+Context *ImageWatcher<I>::remove_async_request(const AsyncRequestId &id,
+ ceph::shared_mutex &lock) {
+ ceph_assert(ceph_mutex_is_locked(lock));
+
+ auto it = m_async_requests.find(id);
+ if (it != m_async_requests.end()) {
+ Context *on_complete = it->second.first;
+ m_async_requests.erase(it);
+ return on_complete;
+ }
+ return nullptr;
+}
+
+template <typename I>
+void ImageWatcher<I>::schedule_async_request_timed_out(const AsyncRequestId &id) {
+ ldout(m_image_ctx.cct, 20) << "scheduling async request time out: " << id
+ << dendl;
+
+ auto ctx = new LambdaContext([this, id](int r) {
+ if (r != -ECANCELED) {
+ async_request_timed_out(id);
+ }
+ });
+
+ Task task(TASK_CODE_ASYNC_REQUEST, id);
+ m_task_finisher->cancel(task);
+
+ m_task_finisher->add_event_after(
+ task, m_image_ctx.config.template get_val<uint64_t>("rbd_request_timed_out_seconds"),
+ ctx);
+}
+
+template <typename I>
+void ImageWatcher<I>::async_request_timed_out(const AsyncRequestId &id) {
+ Context *on_complete = remove_async_request(id);
+ if (on_complete != nullptr) {
+ ldout(m_image_ctx.cct, 5) << "async request timed out: " << id << dendl;
+ m_image_ctx.op_work_queue->queue(on_complete, -ETIMEDOUT);
+ }
+}
+
+template <typename I>
+void ImageWatcher<I>::notify_async_request(
+ const AsyncRequestId &async_request_id, Payload *payload,
+ ProgressContext& prog_ctx, Context *on_finish) {
+ ceph_assert(on_finish != nullptr);
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ ldout(m_image_ctx.cct, 10) << this << " async request: " << async_request_id
+ << dendl;
+
+ Context *on_notify = new LambdaContext([this, async_request_id](int r) {
+ if (r < 0) {
+ // notification failed -- don't expect updates
+ Context *on_complete = remove_async_request(async_request_id);
+ if (on_complete != nullptr) {
+ on_complete->complete(r);
+ }
+ }
+ });
+
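+ // on completion, cancel the pending timeout task before notifying the caller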
+ Context *on_complete = new LambdaContext(
+ [this, async_request_id, on_finish](int r) {
+ m_task_finisher->cancel(Task(TASK_CODE_ASYNC_REQUEST, async_request_id));
+ on_finish->complete(r);
+ });
+
+ {
+ std::unique_lock async_request_locker{m_async_request_lock};
+ m_async_requests[async_request_id] = AsyncRequest(on_complete, &prog_ctx);
+ }
+
+ schedule_async_request_timed_out(async_request_id);
+ notify_lock_owner(payload, on_notify);
+}
+
+template <typename I>
+int ImageWatcher<I>::prepare_async_request(const AsyncRequestId& async_request_id,
+ bool* new_request, Context** ctx,
+ ProgressContext** prog_ctx) {
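+ // the request originated from this client instance (e.g. the lock moved);
+ // reply -ERESTART so the requester retries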
+ if (async_request_id.client_id == get_client_id()) {
+ return -ERESTART;
+ } else {
+ std::unique_lock l{m_async_request_lock};
+ if (is_new_request(async_request_id)) {
+ m_async_pending.insert(async_request_id);
+ *new_request = true;
+ *prog_ctx = new RemoteProgressContext(*this, async_request_id);
+ *ctx = new RemoteContext(*this, async_request_id, *prog_ctx);
+ } else {
+ *new_request = false;
+ auto it = m_async_complete.find(async_request_id);
+ if (it != m_async_complete.end()) {
+ int r = it->second;
+ // reset complete request expiration time
+ mark_async_request_complete(async_request_id, r);
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+template <typename I>
+Context *ImageWatcher<I>::prepare_quiesce_request(
+ const AsyncRequestId &request, C_NotifyAck *ack_ctx) {
+ std::unique_lock locker{m_async_request_lock};
+
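+ // NOTIFY_TIMEOUT is in milliseconds; allow two notify intervals (converted
+ // to seconds) before the automatic unquiesce fires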
+ auto timeout = 2 * watcher::Notifier::NOTIFY_TIMEOUT / 1000;
+
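+ // duplicate quiesce notification: attach the new ack context or replay the
+ // recorded result if the request already completed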
+ if (!is_new_request(request)) {
+ auto it = m_async_requests.find(request);
+ if (it != m_async_requests.end()) {
+ delete it->second.first;
+ it->second.first = ack_ctx;
+ } else {
+ auto it = m_async_complete.find(request);
+ ceph_assert(it != m_async_complete.end());
+ m_task_finisher->queue(new C_ResponseMessage(ack_ctx), it->second);
+ // reset complete request expiration time
+ mark_async_request_complete(request, it->second);
+ }
+ locker.unlock();
+
+ m_task_finisher->reschedule_event_after(Task(TASK_CODE_QUIESCE, request),
+ timeout);
+ return nullptr;
+ }
+
+ m_async_pending.insert(request);
+ m_async_requests[request] = AsyncRequest(ack_ctx, nullptr);
+ m_async_op_tracker.start_op();
+
+ return new LambdaContext(
+ [this, request, timeout](int r) {
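+ // arm a safety timer: if no unquiesce request arrives within the timeout,
+ // automatically unquiesce the image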
+ auto unquiesce_ctx = new LambdaContext(
+ [this, request](int r) {
+ if (r == 0) {
+ ldout(m_image_ctx.cct, 10) << this << " quiesce request "
+ << request << " timed out" << dendl;
+ }
+
+ auto on_finish = new LambdaContext(
+ [this](int r) {
+ m_async_op_tracker.finish_op();
+ });
+
+ m_image_ctx.state->notify_unquiesce(on_finish);
+ });
+
+ m_task_finisher->add_event_after(Task(TASK_CODE_QUIESCE, request),
+ timeout, unquiesce_ctx);
+
+ std::unique_lock async_request_locker{m_async_request_lock};
+ mark_async_request_complete(request, r);
+ auto ctx = remove_async_request(request, m_async_request_lock);
+ async_request_locker.unlock();
+ if (ctx != nullptr) {
+ ctx = new C_ResponseMessage(static_cast<C_NotifyAck *>(ctx));
+ ctx->complete(r);
+ } else {
+ m_task_finisher->cancel(Task(TASK_CODE_QUIESCE, request));
+ }
+ });
+}
+
+template <typename I>
+void ImageWatcher<I>::prepare_unquiesce_request(const AsyncRequestId &request) {
+ {
+ std::unique_lock async_request_locker{m_async_request_lock};
+ auto it = m_async_complete.find(request);
+ if (it == m_async_complete.end()) {
+ ldout(m_image_ctx.cct, 20) << this << " " << request
+ << ": not found in complete" << dendl;
+ return;
+ }
+ // reset complete request expiration time
+ mark_async_request_complete(request, it->second);
+ }
+
+ bool canceled = m_task_finisher->cancel(Task(TASK_CODE_QUIESCE, request));
+ if (!canceled) {
+ ldout(m_image_ctx.cct, 20) << this << " " << request
+ << ": timer task not found" << dendl;
+ }
+}
+
+template <typename I>
+void ImageWatcher<I>::cancel_quiesce_requests() {
+ std::unique_lock l{m_async_request_lock};
+ for (auto it = m_async_requests.begin(); it != m_async_requests.end(); ) {
+ if (it->second.second == nullptr) {
+ // Quiesce notify request.
+ mark_async_request_complete(it->first, 0);
+ delete it->second.first;
+ it = m_async_requests.erase(it);
+ } else {
+ it++;
+ }
+ }
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_operation_request(
+ const AsyncRequestId& async_request_id,
+ exclusive_lock::OperationRequestType request_type, Operation operation,
+ std::function<void(ProgressContext &prog_ctx, Context*)> execute,
+ C_NotifyAck *ack_ctx) {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+
+ if (m_image_ctx.exclusive_lock != nullptr) {
+ int r = 0;
+ if (m_image_ctx.exclusive_lock->accept_request(request_type, &r)) {
+ bool new_request;
+ Context *ctx;
+ ProgressContext *prog_ctx;
+ bool complete;
+ if (async_request_id) {
+ r = prepare_async_request(async_request_id, &new_request, &ctx,
+ &prog_ctx);
+ encode(ResponseMessage(r), ack_ctx->out);
+ complete = true;
+ } else {
+ new_request = true;
+ ctx = new C_ResponseMessage(ack_ctx);
+ prog_ctx = &m_no_op_prog_ctx;
+ complete = false;
+ }
+ if (r == 0 && new_request) {
+ ctx = new LambdaContext(
+ [this, operation, ctx](int r) {
+ m_image_ctx.operations->finish_op(operation, r);
+ ctx->complete(r);
+ });
+ ctx = new LambdaContext(
+ [this, execute, prog_ctx, ctx](int r) {
+ if (r < 0) {
+ ctx->complete(r);
+ return;
+ }
+ std::shared_lock l{m_image_ctx.owner_lock};
+ execute(*prog_ctx, ctx);
+ });
+ m_image_ctx.operations->start_op(operation, ctx);
+ }
+ return complete;
+ } else if (r < 0) {
+ encode(ResponseMessage(r), ack_ctx->out);
+ }
+ }
+ return true;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const HeaderUpdatePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " image header updated" << dendl;
+
+ m_image_ctx.state->handle_update_notification();
+ m_image_ctx.perfcounter->inc(l_librbd_notify);
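+ // ack_ctx is nullptr when invoked internally (e.g. after a rewatch), in
+ // which case there is no notification to acknowledge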
+ if (ack_ctx != nullptr) {
+ m_image_ctx.state->flush_update_watchers(new C_ResponseMessage(ack_ctx));
+ return false;
+ }
+ return true;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const AcquiredLockPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " image exclusively locked announcement"
+ << dendl;
+
+ bool cancel_async_requests = true;
+ if (payload.client_id.is_valid()) {
+ std::lock_guard owner_client_id_locker{m_owner_client_id_lock};
+ if (payload.client_id == m_owner_client_id) {
+ cancel_async_requests = false;
+ }
+ set_owner_client_id(payload.client_id);
+ }
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ if (m_image_ctx.exclusive_lock != nullptr) {
+ // potentially wake up the exclusive lock state machine now that
+ // a lock owner has advertised itself
+ m_image_ctx.exclusive_lock->handle_peer_notification(0);
+ }
+ if (cancel_async_requests &&
+ (m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->is_lock_owner())) {
+ schedule_cancel_async_requests();
+ }
+ return true;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const ReleasedLockPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " exclusive lock released" << dendl;
+
+ bool cancel_async_requests = true;
+ if (payload.client_id.is_valid()) {
+ std::lock_guard l{m_owner_client_id_lock};
+ if (payload.client_id != m_owner_client_id) {
+ ldout(m_image_ctx.cct, 10) << this << " unexpected owner: "
+ << payload.client_id << " != "
+ << m_owner_client_id << dendl;
+ cancel_async_requests = false;
+ } else {
+ set_owner_client_id(ClientId());
+ }
+ }
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ if (cancel_async_requests &&
+ (m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->is_lock_owner())) {
+ schedule_cancel_async_requests();
+ }
+
+ // alert the exclusive lock state machine that the lock is available
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ !m_image_ctx.exclusive_lock->is_lock_owner()) {
+ m_task_finisher->cancel(TASK_CODE_REQUEST_LOCK);
+ m_image_ctx.exclusive_lock->handle_peer_notification(0);
+ }
+ return true;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const RequestLockPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " exclusive lock requested" << dendl;
+ if (payload.client_id == get_client_id()) {
+ return true;
+ }
+
+ std::shared_lock l{m_image_ctx.owner_lock};
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ m_image_ctx.exclusive_lock->is_lock_owner()) {
+ int r = 0;
+ bool accept_request = m_image_ctx.exclusive_lock->accept_request(
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r);
+
+ if (accept_request) {
+ ceph_assert(r == 0);
+ std::lock_guard owner_client_id_locker{m_owner_client_id_lock};
+ if (!m_owner_client_id.is_valid()) {
+ return true;
+ }
+
+ ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
+ << dendl;
+ r = m_image_ctx.get_exclusive_lock_policy()->lock_requested(
+ payload.force);
+ }
+ encode(ResponseMessage(r), ack_ctx->out);
+ }
+ return true;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const AsyncProgressPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ std::shared_lock l{m_async_request_lock};
+ std::map<AsyncRequestId, AsyncRequest>::iterator req_it =
+ m_async_requests.find(payload.async_request_id);
+ if (req_it != m_async_requests.end()) {
+ ldout(m_image_ctx.cct, 20) << this << " request progress: "
+ << payload.async_request_id << " @ "
+ << payload.offset << "/" << payload.total
+ << dendl;
+ schedule_async_request_timed_out(payload.async_request_id);
+ req_it->second.second->update_progress(payload.offset, payload.total);
+ }
+ return true;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const AsyncCompletePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ Context *on_complete = remove_async_request(payload.async_request_id);
+ if (on_complete != nullptr) {
+ ldout(m_image_ctx.cct, 10) << this << " request finished: "
+ << payload.async_request_id << "="
+ << payload.result << dendl;
+ on_complete->complete(payload.result);
+ }
+ return true;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const FlattenPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote flatten request: "
+ << payload.async_request_id << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_FLATTEN, std::bind(&Operations<I>::execute_flatten,
+ m_image_ctx.operations,
+ std::placeholders::_1,
+ std::placeholders::_2),
+ ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const ResizePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote resize request: "
+ << payload.async_request_id << " "
+ << payload.size << " "
+ << payload.allow_shrink << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_RESIZE, std::bind(&Operations<I>::execute_resize,
+ m_image_ctx.operations, payload.size,
+ payload.allow_shrink, std::placeholders::_1,
+ std::placeholders::_2, 0), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapCreatePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote snap_create request: "
+ << payload.async_request_id << " "
+ << payload.snap_namespace << " "
+ << payload.snap_name << " "
+ << payload.flags << dendl;
+
+ auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL;
+
+ // rbd-mirror needs to accept forced promotion orphan snap create requests
+ auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &payload.snap_namespace);
+ if (mirror_ns != nullptr && mirror_ns->is_orphan()) {
+ request_type = exclusive_lock::OPERATION_REQUEST_TYPE_FORCE_PROMOTION;
+ }
+
+ return handle_operation_request(
+ payload.async_request_id, request_type,
+ OPERATION_SNAP_CREATE, std::bind(&Operations<I>::execute_snap_create,
+ m_image_ctx.operations,
+ payload.snap_namespace,
+ payload.snap_name, std::placeholders::_2,
+ 0, payload.flags, std::placeholders::_1),
+ ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapRenamePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote snap_rename request: "
+ << payload.async_request_id << " "
+ << payload.snap_id << " to "
+ << payload.snap_name << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_SNAP_RENAME, std::bind(&Operations<I>::execute_snap_rename,
+ m_image_ctx.operations, payload.snap_id,
+ payload.snap_name,
+ std::placeholders::_2), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapRemovePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote snap_remove request: "
+ << payload.snap_name << dendl;
+
+ auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL;
+ if (cls::rbd::get_snap_namespace_type(payload.snap_namespace) ==
+ cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) {
+ request_type = exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE;
+ }
+
+ return handle_operation_request(
+ payload.async_request_id, request_type, OPERATION_SNAP_REMOVE,
+ std::bind(&Operations<I>::execute_snap_remove, m_image_ctx.operations,
+ payload.snap_namespace, payload.snap_name,
+ std::placeholders::_2), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapProtectPayload& payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote snap_protect request: "
+ << payload.async_request_id << " "
+ << payload.snap_name << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_SNAP_PROTECT, std::bind(&Operations<I>::execute_snap_protect,
+ m_image_ctx.operations,
+ payload.snap_namespace,
+ payload.snap_name,
+ std::placeholders::_2), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SnapUnprotectPayload& payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote snap_unprotect request: "
+ << payload.async_request_id << " "
+ << payload.snap_name << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_SNAP_UNPROTECT, std::bind(&Operations<I>::execute_snap_unprotect,
+ m_image_ctx.operations,
+ payload.snap_namespace,
+ payload.snap_name,
+ std::placeholders::_2), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const RebuildObjectMapPayload& payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote rebuild object map request: "
+ << payload.async_request_id << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_REBUILD_OBJECT_MAP,
+ std::bind(&Operations<I>::execute_rebuild_object_map,
+ m_image_ctx.operations, std::placeholders::_1,
+ std::placeholders::_2), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const RenamePayload& payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote rename request: "
+ << payload.async_request_id << " "
+ << payload.image_name << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_RENAME, std::bind(&Operations<I>::execute_rename,
+ m_image_ctx.operations, payload.image_name,
+ std::placeholders::_2), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const UpdateFeaturesPayload& payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote update_features request: "
+ << payload.async_request_id << " "
+ << payload.features << " "
+ << (payload.enabled ? "enabled" : "disabled")
+ << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_UPDATE_FEATURES,
+ std::bind(&Operations<I>::execute_update_features, m_image_ctx.operations,
+ payload.features, payload.enabled, std::placeholders::_2, 0),
+ ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const MigratePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote migrate request: "
+ << payload.async_request_id << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_MIGRATE, std::bind(&Operations<I>::execute_migrate,
+ m_image_ctx.operations,
+ std::placeholders::_1,
+ std::placeholders::_2), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const SparsifyPayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " remote sparsify request: "
+ << payload.async_request_id << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_SPARSIFY, std::bind(&Operations<I>::execute_sparsify,
+ m_image_ctx.operations,
+ payload.sparse_size, std::placeholders::_1,
+ std::placeholders::_2), ack_ctx);
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const MetadataUpdatePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ if (payload.value) {
+ ldout(m_image_ctx.cct, 10) << this << " remote metadata_set request: "
+ << payload.async_request_id << " "
+ << "key=" << payload.key << ", value="
+ << *payload.value << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_METADATA_UPDATE,
+ std::bind(&Operations<I>::execute_metadata_set,
+ m_image_ctx.operations, payload.key, *payload.value,
+ std::placeholders::_2),
+ ack_ctx);
+ } else {
+ ldout(m_image_ctx.cct, 10) << this << " remote metadata_remove request: "
+ << payload.async_request_id << " "
+ << "key=" << payload.key << dendl;
+
+ return handle_operation_request(
+ payload.async_request_id,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ OPERATION_METADATA_UPDATE,
+ std::bind(&Operations<I>::execute_metadata_remove,
+ m_image_ctx.operations, payload.key, std::placeholders::_2),
+ ack_ctx);
+ }
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const QuiescePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ auto on_finish = prepare_quiesce_request(payload.async_request_id, ack_ctx);
+ if (on_finish == nullptr) {
+ ldout(m_image_ctx.cct, 10) << this << " duplicate quiesce request: "
+ << payload.async_request_id << dendl;
+ return false;
+ }
+
+ ldout(m_image_ctx.cct, 10) << this << " quiesce request: "
+ << payload.async_request_id << dendl;
+ m_image_ctx.state->notify_quiesce(on_finish);
+ return false;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const UnquiescePayload &payload,
+ C_NotifyAck *ack_ctx) {
+ ldout(m_image_ctx.cct, 10) << this << " unquiesce request: "
+ << payload.async_request_id << dendl;
+
+ prepare_unquiesce_request(payload.async_request_id);
+ return true;
+}
+
+template <typename I>
+bool ImageWatcher<I>::handle_payload(const UnknownPayload &payload,
+ C_NotifyAck *ack_ctx) {
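+ // unrecognized op from a newer client: if this client owns the exclusive
+ // lock, reply -EOPNOTSUPP so the sender can fall back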
+ std::shared_lock l{m_image_ctx.owner_lock};
+ if (m_image_ctx.exclusive_lock != nullptr) {
+ int r;
+ if (m_image_ctx.exclusive_lock->accept_request(
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r) || r < 0) {
+ encode(ResponseMessage(-EOPNOTSUPP), ack_ctx->out);
+ }
+ }
+ return true;
+}
+
+template <typename I>
+void ImageWatcher<I>::process_payload(uint64_t notify_id, uint64_t handle,
+ Payload *payload) {
+ auto ctx = new Watcher::C_NotifyAck(this, notify_id, handle);
+ bool complete;
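+ // handlers return true when the ack can be sent immediately, false when
+ // they take ownership of ctx and complete it asynchronously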
+
+ switch (payload->get_notify_op()) {
+ case NOTIFY_OP_ACQUIRED_LOCK:
+ complete = handle_payload(*(static_cast<AcquiredLockPayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_RELEASED_LOCK:
+ complete = handle_payload(*(static_cast<ReleasedLockPayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_REQUEST_LOCK:
+ complete = handle_payload(*(static_cast<RequestLockPayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_HEADER_UPDATE:
+ complete = handle_payload(*(static_cast<HeaderUpdatePayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_ASYNC_PROGRESS:
+ complete = handle_payload(*(static_cast<AsyncProgressPayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_ASYNC_COMPLETE:
+ complete = handle_payload(*(static_cast<AsyncCompletePayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_FLATTEN:
+ complete = handle_payload(*(static_cast<FlattenPayload *>(payload)), ctx);
+ break;
+ case NOTIFY_OP_RESIZE:
+ complete = handle_payload(*(static_cast<ResizePayload *>(payload)), ctx);
+ break;
+ case NOTIFY_OP_SNAP_CREATE:
+ complete = handle_payload(*(static_cast<SnapCreatePayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_SNAP_REMOVE:
+ complete = handle_payload(*(static_cast<SnapRemovePayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_SNAP_RENAME:
+ complete = handle_payload(*(static_cast<SnapRenamePayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_SNAP_PROTECT:
+ complete = handle_payload(*(static_cast<SnapProtectPayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_SNAP_UNPROTECT:
+ complete = handle_payload(*(static_cast<SnapUnprotectPayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_REBUILD_OBJECT_MAP:
+ complete = handle_payload(*(static_cast<RebuildObjectMapPayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_RENAME:
+ complete = handle_payload(*(static_cast<RenamePayload *>(payload)), ctx);
+ break;
+ case NOTIFY_OP_UPDATE_FEATURES:
+ complete = handle_payload(*(static_cast<UpdateFeaturesPayload *>(payload)),
+ ctx);
+ break;
+ case NOTIFY_OP_MIGRATE:
+ complete = handle_payload(*(static_cast<MigratePayload *>(payload)), ctx);
+ break;
+ case NOTIFY_OP_SPARSIFY:
+ complete = handle_payload(*(static_cast<SparsifyPayload *>(payload)), ctx);
+ break;
+ case NOTIFY_OP_QUIESCE:
+ complete = handle_payload(*(static_cast<QuiescePayload *>(payload)), ctx);
+ break;
+ case NOTIFY_OP_UNQUIESCE:
+ complete = handle_payload(*(static_cast<UnquiescePayload *>(payload)), ctx);
+ break;
+ case NOTIFY_OP_METADATA_UPDATE:
+ complete = handle_payload(*(static_cast<MetadataUpdatePayload *>(payload)), ctx);
+ break;
+ default:
+ ceph_assert(payload->get_notify_op() == static_cast<NotifyOp>(-1));
+ complete = handle_payload(*(static_cast<UnknownPayload *>(payload)), ctx);
+ }
+
+ if (complete) {
+ ctx->complete(0);
+ }
+}
+
+template <typename I>
+void ImageWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) {
+ NotifyMessage notify_message;
+ if (bl.length() == 0) {
+ // legacy notification for header updates
+ notify_message = NotifyMessage(new HeaderUpdatePayload());
+ } else {
+ try {
+ auto iter = bl.cbegin();
+ decode(notify_message, iter);
+ } catch (const buffer::error &err) {
+ lderr(m_image_ctx.cct) << this << " error decoding image notification: "
+ << err.what() << dendl;
+ return;
+ }
+ }
+
+ // if an image refresh is required, refresh before processing the request
+ if (notify_message.check_for_refresh() &&
+ m_image_ctx.state->is_refresh_required()) {
+
+ m_image_ctx.state->refresh(
+ new C_ProcessPayload(this, notify_id, handle,
+ std::move(notify_message.payload)));
+ } else {
+ process_payload(notify_id, handle, notify_message.payload.get());
+ }
+}
+
+template <typename I>
+void ImageWatcher<I>::handle_error(uint64_t handle, int err) {
+ lderr(m_image_ctx.cct) << this << " image watch failed: " << handle << ", "
+ << cpp_strerror(err) << dendl;
+
+ {
+ std::lock_guard l{m_owner_client_id_lock};
+ set_owner_client_id(ClientId());
+ }
+
+ Watcher::handle_error(handle, err);
+}
+
+template <typename I>
+void ImageWatcher<I>::handle_rewatch_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ if (m_image_ctx.exclusive_lock != nullptr) {
+ // update the lock cookie with the new watch handle
+ m_image_ctx.exclusive_lock->reacquire_lock(nullptr);
+ }
+ }
+
+ // image may have been updated while we didn't have an active watch
+ handle_payload(HeaderUpdatePayload(), nullptr);
+}
+
+template <typename I>
+void ImageWatcher<I>::send_notify(Payload *payload, Context *ctx) {
+ bufferlist bl;
+
+ encode(NotifyMessage(payload), bl);
+ Watcher::send_notify(bl, nullptr, ctx);
+}
+
+template <typename I>
+void ImageWatcher<I>::RemoteContext::finish(int r) {
+ m_image_watcher.schedule_async_complete(m_async_request_id, r);
+}
+
+template <typename I>
+void ImageWatcher<I>::C_ResponseMessage::finish(int r) {
+ CephContext *cct = notify_ack->cct;
+ ldout(cct, 10) << this << " C_ResponseMessage: r=" << r << dendl;
+
+ encode(ResponseMessage(r), notify_ack->out);
+ notify_ack->complete(0);
+}
+
+} // namespace librbd
+
+template class librbd::ImageWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h
new file mode 100644
index 000000000..cda9a246e
--- /dev/null
+++ b/src/librbd/ImageWatcher.h
@@ -0,0 +1,313 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_WATCHER_H
+#define CEPH_LIBRBD_IMAGE_WATCHER_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/rbd/librbd.hpp"
+#include "librbd/Operations.h"
+#include "librbd/Watcher.h"
+#include "librbd/WatchNotifyTypes.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/internal.h"
+#include <functional>
+#include <set>
+#include <string>
+#include <utility>
+
+class entity_name_t;
+
+namespace librbd {
+
+class ImageCtx;
+template <typename> class TaskFinisher;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageWatcher : public Watcher {
+public:
+ ImageWatcher(ImageCtxT& image_ctx);
+ ~ImageWatcher() override;
+
+ void unregister_watch(Context *on_finish) override;
+ void block_notifies(Context *on_finish) override;
+
+ void notify_flatten(uint64_t request_id, ProgressContext &prog_ctx,
+ Context *on_finish);
+ void notify_resize(uint64_t request_id, uint64_t size, bool allow_shrink,
+ ProgressContext &prog_ctx, Context *on_finish);
+ void notify_snap_create(uint64_t request_id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t flags,
+ ProgressContext &prog_ctx,
+ Context *on_finish);
+ void notify_snap_rename(uint64_t request_id,
+ const snapid_t &src_snap_id,
+ const std::string &dst_snap_name,
+ Context *on_finish);
+ void notify_snap_remove(uint64_t request_id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+ void notify_snap_protect(uint64_t request_id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+ void notify_snap_unprotect(uint64_t request_id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+ void notify_rebuild_object_map(uint64_t request_id,
+ ProgressContext &prog_ctx, Context *on_finish);
+ void notify_rename(uint64_t request_id,
+ const std::string &image_name, Context *on_finish);
+
+ void notify_update_features(uint64_t request_id,
+ uint64_t features, bool enabled,
+ Context *on_finish);
+
+ void notify_migrate(uint64_t request_id, ProgressContext &prog_ctx,
+ Context *on_finish);
+
+ void notify_sparsify(uint64_t request_id, size_t sparse_size,
+ ProgressContext &prog_ctx, Context *on_finish);
+
+ void notify_acquired_lock();
+ void notify_released_lock();
+ void notify_request_lock();
+
+ void notify_header_update(Context *on_finish);
+ static void notify_header_update(librados::IoCtx &io_ctx,
+ const std::string &oid);
+
+ void notify_quiesce(uint64_t *request_id, ProgressContext &prog_ctx,
+ Context *on_finish);
+ void notify_unquiesce(uint64_t request_id, Context *on_finish);
+
+ void notify_metadata_set(uint64_t request_id,
+ const std::string &key, const std::string &value,
+ Context *on_finish);
+ void notify_metadata_remove(uint64_t request_id,
+ const std::string &key, Context *on_finish);
+
+private:
+ enum TaskCode {
+ TASK_CODE_REQUEST_LOCK,
+ TASK_CODE_CANCEL_ASYNC_REQUESTS,
+ TASK_CODE_REREGISTER_WATCH,
+ TASK_CODE_ASYNC_REQUEST,
+ TASK_CODE_ASYNC_PROGRESS,
+ TASK_CODE_QUIESCE,
+ };
+
+ typedef std::pair<Context *, ProgressContext *> AsyncRequest;
+
+ class Task {
+ public:
+ Task(TaskCode task_code) : m_task_code(task_code) {}
+ Task(TaskCode task_code, const watch_notify::AsyncRequestId &id)
+ : m_task_code(task_code), m_async_request_id(id) {}
+
+ inline bool operator<(const Task& rhs) const {
+ if (m_task_code != rhs.m_task_code) {
+ return m_task_code < rhs.m_task_code;
+ } else if ((m_task_code == TASK_CODE_ASYNC_REQUEST ||
+ m_task_code == TASK_CODE_ASYNC_PROGRESS ||
+ m_task_code == TASK_CODE_QUIESCE) &&
+ m_async_request_id != rhs.m_async_request_id) {
+ return m_async_request_id < rhs.m_async_request_id;
+ }
+ return false;
+ }
+ private:
+ TaskCode m_task_code;
+ watch_notify::AsyncRequestId m_async_request_id;
+ };
+
+ class RemoteProgressContext : public ProgressContext {
+ public:
+ RemoteProgressContext(ImageWatcher &image_watcher,
+ const watch_notify::AsyncRequestId &id)
+ : m_image_watcher(image_watcher), m_async_request_id(id)
+ {
+ }
+
+ int update_progress(uint64_t offset, uint64_t total) override {
+ m_image_watcher.schedule_async_progress(m_async_request_id, offset,
+ total);
+ return 0;
+ }
+
+ private:
+ ImageWatcher &m_image_watcher;
+ watch_notify::AsyncRequestId m_async_request_id;
+ };
+
+ class RemoteContext : public Context {
+ public:
+ RemoteContext(ImageWatcher &image_watcher,
+ const watch_notify::AsyncRequestId &id,
+ ProgressContext *prog_ctx)
+ : m_image_watcher(image_watcher), m_async_request_id(id),
+ m_prog_ctx(prog_ctx)
+ {
+ }
+
+ ~RemoteContext() override {
+ delete m_prog_ctx;
+ }
+
+ void finish(int r) override;
+
+ private:
+ ImageWatcher &m_image_watcher;
+ watch_notify::AsyncRequestId m_async_request_id;
+ ProgressContext *m_prog_ctx;
+ };
+
+ struct C_ProcessPayload;
+ struct C_ResponseMessage : public Context {
+ C_NotifyAck *notify_ack;
+
+ C_ResponseMessage(C_NotifyAck *notify_ack) : notify_ack(notify_ack) {
+ }
+ void finish(int r) override;
+ };
+
+ ImageCtxT &m_image_ctx;
+
+ TaskFinisher<Task> *m_task_finisher;
+
+ ceph::shared_mutex m_async_request_lock;
+ std::map<watch_notify::AsyncRequestId, AsyncRequest> m_async_requests;
+ std::set<watch_notify::AsyncRequestId> m_async_pending;
+ std::map<watch_notify::AsyncRequestId, int> m_async_complete;
+ std::set<std::pair<utime_t,
+ watch_notify::AsyncRequestId>> m_async_complete_expiration;
+
+ ceph::mutex m_owner_client_id_lock;
+ watch_notify::ClientId m_owner_client_id;
+
+ AsyncOpTracker m_async_op_tracker;
+
+ NoOpProgressContext m_no_op_prog_ctx;
+
+ void handle_register_watch(int r);
+
+ void schedule_cancel_async_requests();
+ void cancel_async_requests();
+
+ void set_owner_client_id(const watch_notify::ClientId &client_id);
+ watch_notify::ClientId get_client_id();
+
+ void handle_request_lock(int r);
+ void schedule_request_lock(bool use_timer, int timer_delay = -1);
+
+ void notify_lock_owner(watch_notify::Payload *payload, Context *on_finish);
+
+ bool is_new_request(const watch_notify::AsyncRequestId &id) const;
+ bool mark_async_request_complete(const watch_notify::AsyncRequestId &id,
+ int r);
+ Context *remove_async_request(const watch_notify::AsyncRequestId &id);
+ Context *remove_async_request(const watch_notify::AsyncRequestId &id,
+ ceph::shared_mutex &lock);
+ void schedule_async_request_timed_out(const watch_notify::AsyncRequestId &id);
+ void async_request_timed_out(const watch_notify::AsyncRequestId &id);
+ void notify_async_request(const watch_notify::AsyncRequestId &id,
+ watch_notify::Payload *payload,
+ ProgressContext& prog_ctx,
+ Context *on_finish);
+
+ void schedule_async_progress(const watch_notify::AsyncRequestId &id,
+ uint64_t offset, uint64_t total);
+ int notify_async_progress(const watch_notify::AsyncRequestId &id,
+ uint64_t offset, uint64_t total);
+ void schedule_async_complete(const watch_notify::AsyncRequestId &id, int r);
+ void notify_async_complete(const watch_notify::AsyncRequestId &id, int r);
+ void handle_async_complete(const watch_notify::AsyncRequestId &request, int r,
+ int ret_val);
+
+ int prepare_async_request(const watch_notify::AsyncRequestId& id,
+ bool* new_request, Context** ctx,
+ ProgressContext** prog_ctx);
+
+ Context *prepare_quiesce_request(const watch_notify::AsyncRequestId &request,
+ C_NotifyAck *ack_ctx);
+ void prepare_unquiesce_request(const watch_notify::AsyncRequestId &request);
+ void cancel_quiesce_requests();
+
+ void notify_quiesce(const watch_notify::AsyncRequestId &async_request_id,
+ size_t attempts, ProgressContext &prog_ctx,
+ Context *on_finish);
+
+ bool handle_operation_request(
+ const watch_notify::AsyncRequestId& async_request_id,
+ exclusive_lock::OperationRequestType request_type, Operation operation,
+ std::function<void(ProgressContext &prog_ctx, Context*)> execute,
+ C_NotifyAck *ack_ctx);
+
+ bool handle_payload(const watch_notify::HeaderUpdatePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::AcquiredLockPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::ReleasedLockPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::RequestLockPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::AsyncProgressPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::AsyncCompletePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::FlattenPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::ResizePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapCreatePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapRenamePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapRemovePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapProtectPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SnapUnprotectPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::RebuildObjectMapPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::RenamePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::UpdateFeaturesPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::MigratePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::SparsifyPayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::QuiescePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::UnquiescePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::MetadataUpdatePayload& payload,
+ C_NotifyAck *ctx);
+ bool handle_payload(const watch_notify::UnknownPayload& payload,
+ C_NotifyAck *ctx);
+ void process_payload(uint64_t notify_id, uint64_t handle,
+ watch_notify::Payload *payload);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+ void handle_error(uint64_t cookie, int err) override;
+ void handle_rewatch_complete(int r) override;
+
+ void send_notify(watch_notify::Payload *payload, Context *ctx = nullptr);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ImageWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_WATCHER_H
diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc
new file mode 100644
index 000000000..0cd38b22a
--- /dev/null
+++ b/src/librbd/Journal.cc
@@ -0,0 +1,1819 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/Journal.h"
+#include "include/rados/librados.hpp"
+#include "common/AsyncOpTracker.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/Journaler.h"
+#include "journal/Policy.h"
+#include "journal/ReplayEntry.h"
+#include "journal/Settings.h"
+#include "journal/Utils.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/DemoteRequest.h"
+#include "librbd/journal/ObjectDispatch.h"
+#include "librbd/journal/OpenRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/journal/ResetRequest.h"
+#include "librbd/journal/Replay.h"
+#include "librbd/journal/PromoteRequest.h"
+
+#include <boost/scope_exit.hpp>
+#include <utility>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Journal: "
+
+namespace librbd {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using journal::util::C_DecodeTag;
+using journal::util::C_DecodeTags;
+
+namespace {
+
+// TODO: once journaler is 100% async and converted to ASIO, remove separate
+// threads and reuse librbd's AsioEngine
+class ThreadPoolSingleton : public ThreadPool {
+public:
+ ContextWQ *work_queue;
+
+ explicit ThreadPoolSingleton(CephContext *cct)
+ : ThreadPool(cct, "librbd::Journal", "tp_librbd_journ", 1),
+ work_queue(new ContextWQ("librbd::journal::work_queue",
+ ceph::make_timespan(
+ cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout")),
+ this)) {
+ start();
+ }
+ ~ThreadPoolSingleton() override {
+ work_queue->drain();
+ delete work_queue;
+
+ stop();
+ }
+};
+
+template <typename I>
+struct C_IsTagOwner : public Context {
+ librados::IoCtx &io_ctx;
+ std::string image_id;
+ bool *is_tag_owner;
+ asio::ContextWQ *op_work_queue;
+ Context *on_finish;
+
+ CephContext *cct = nullptr;
+ Journaler *journaler;
+ cls::journal::Client client;
+ journal::ImageClientMeta client_meta;
+ uint64_t tag_tid = 0;
+ journal::TagData tag_data;
+
+ C_IsTagOwner(librados::IoCtx &io_ctx, const std::string &image_id,
+ bool *is_tag_owner, asio::ContextWQ *op_work_queue,
+ Context *on_finish)
+ : io_ctx(io_ctx), image_id(image_id), is_tag_owner(is_tag_owner),
+ op_work_queue(op_work_queue), on_finish(on_finish),
+ cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+ journaler(new Journaler(io_ctx, image_id, Journal<>::IMAGE_CLIENT_ID,
+ {}, nullptr)) {
+ }
+
+ void finish(int r) override {
+ ldout(cct, 20) << this << " C_IsTagOwner::" << __func__ << ": r=" << r
+ << dendl;
+ if (r < 0) {
+ lderr(cct) << this << " C_IsTagOwner::" << __func__ << ": "
+ << "failed to get tag owner: " << cpp_strerror(r) << dendl;
+ } else {
+ *is_tag_owner = (tag_data.mirror_uuid == Journal<>::LOCAL_MIRROR_UUID);
+ }
+
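+ // defer completion and journaler destruction to the work queue -- this
+ // callback may be running in the journaler's own callback context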
+ Journaler *journaler = this->journaler;
+ Context *on_finish = this->on_finish;
+ auto ctx = new LambdaContext(
+ [journaler, on_finish](int r) {
+ on_finish->complete(r);
+ delete journaler;
+ });
+ op_work_queue->queue(ctx, r);
+ }
+};
+
+struct C_GetTagOwner : public Context {
+ std::string *mirror_uuid;
+ Context *on_finish;
+
+ Journaler journaler;
+ cls::journal::Client client;
+ journal::ImageClientMeta client_meta;
+ uint64_t tag_tid = 0;
+ journal::TagData tag_data;
+
+ C_GetTagOwner(librados::IoCtx &io_ctx, const std::string &image_id,
+ std::string *mirror_uuid, Context *on_finish)
+ : mirror_uuid(mirror_uuid), on_finish(on_finish),
+ journaler(io_ctx, image_id, Journal<>::IMAGE_CLIENT_ID, {}, nullptr) {
+ }
+
+ void finish(int r) override {
+ if (r >= 0) {
+ *mirror_uuid = tag_data.mirror_uuid;
+ }
+ on_finish->complete(r);
+ }
+};
+
+template <typename J>
+struct GetTagsRequest {
+ CephContext *cct;
+ J *journaler;
+ cls::journal::Client *client;
+ journal::ImageClientMeta *client_meta;
+ uint64_t *tag_tid;
+ journal::TagData *tag_data;
+ Context *on_finish;
+
+ ceph::mutex lock = ceph::make_mutex("lock");
+
+ GetTagsRequest(CephContext *cct, J *journaler, cls::journal::Client *client,
+ journal::ImageClientMeta *client_meta, uint64_t *tag_tid,
+ journal::TagData *tag_data, Context *on_finish)
+ : cct(cct), journaler(journaler), client(client), client_meta(client_meta),
+ tag_tid(tag_tid), tag_data(tag_data), on_finish(on_finish) {
+ }
+
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_CLIENT * * * * * * * * * * * *
+ * | *
+ * v *
+ * GET_TAGS * * * * * * * * * * * * * (error)
+ * | *
+ * v *
+ * <finish> * * * * * * * * * * * * *
+ *
+ * @endverbatim
+ */
+
+ void send() {
+ send_get_client();
+ }
+
+ void send_get_client() {
+ ldout(cct, 20) << __func__ << dendl;
+
+ auto ctx = new LambdaContext(
+ [this](int r) {
+ handle_get_client(r);
+ });
+ journaler->get_client(Journal<ImageCtx>::IMAGE_CLIENT_ID, client, ctx);
+ }
+
+ void handle_get_client(int r) {
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ complete(r);
+ return;
+ }
+
+ librbd::journal::ClientData client_data;
+ auto bl_it = client->data.cbegin();
+ try {
+ decode(client_data, bl_it);
+ } catch (const buffer::error &err) {
+ lderr(cct) << this << " OpenJournalerRequest::" << __func__ << ": "
+ << "failed to decode client data" << dendl;
+ complete(-EBADMSG);
+ return;
+ }
+
+ journal::ImageClientMeta *image_client_meta =
+ boost::get<journal::ImageClientMeta>(&client_data.client_meta);
+ if (image_client_meta == nullptr) {
+ lderr(cct) << this << " OpenJournalerRequest::" << __func__ << ": "
+ << "failed to get client meta" << dendl;
+ complete(-EINVAL);
+ return;
+ }
+ *client_meta = *image_client_meta;
+
+ send_get_tags();
+ }
+
+ void send_get_tags() {
+ ldout(cct, 20) << __func__ << dendl;
+
+ auto ctx = new LambdaContext(
+ [this](int r) {
+ handle_get_tags(r);
+ });
+ C_DecodeTags *tags_ctx = new C_DecodeTags(cct, &lock, tag_tid, tag_data,
+ ctx);
+ journaler->get_tags(client_meta->tag_class, &tags_ctx->tags, tags_ctx);
+ }
+
+ void handle_get_tags(int r) {
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ complete(r);
+ }
+
+ void complete(int r) {
+ on_finish->complete(r);
+ delete this;
+ }
+};
+
+template <typename J>
+void get_tags(CephContext *cct, J *journaler,
+ cls::journal::Client *client,
+ journal::ImageClientMeta *client_meta,
+ uint64_t *tag_tid, journal::TagData *tag_data,
+ Context *on_finish) {
+ ldout(cct, 20) << __func__ << dendl;
+
+ GetTagsRequest<J> *req =
+ new GetTagsRequest<J>(cct, journaler, client, client_meta, tag_tid,
+ tag_data, on_finish);
+ req->send();
+}
+
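+// synchronously allocate a journal tag in the given tag class, encoding the
+// mirror uuid and predecessor into the tag data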
+template <typename J>
+int allocate_journaler_tag(CephContext *cct, J *journaler,
+ uint64_t tag_class,
+ const journal::TagPredecessor &predecessor,
+ const std::string &mirror_uuid,
+ cls::journal::Tag *new_tag) {
+ journal::TagData tag_data;
+ tag_data.mirror_uuid = mirror_uuid;
+ tag_data.predecessor = predecessor;
+
+ bufferlist tag_bl;
+ encode(tag_data, tag_bl);
+
+ C_SaferCond allocate_tag_ctx;
+ journaler->allocate_tag(tag_class, tag_bl, new_tag, &allocate_tag_ctx);
+
+ int r = allocate_tag_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << __func__ << ": "
+ << "failed to allocate tag: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+} // anonymous namespace
+
+// client id for local image
+template <typename I>
+const std::string Journal<I>::IMAGE_CLIENT_ID("");
+
+// mirror uuid to use for local images
+template <typename I>
+const std::string Journal<I>::LOCAL_MIRROR_UUID("");
+
+// mirror uuid to use for orphaned (demoted) images
+template <typename I>
+const std::string Journal<I>::ORPHAN_MIRROR_UUID("<orphan>");
+
+template <typename I>
+std::ostream &operator<<(std::ostream &os,
+ const typename Journal<I>::State &state) {
+ switch (state) {
+ case Journal<I>::STATE_UNINITIALIZED:
+ os << "Uninitialized";
+ break;
+ case Journal<I>::STATE_INITIALIZING:
+ os << "Initializing";
+ break;
+ case Journal<I>::STATE_REPLAYING:
+ os << "Replaying";
+ break;
+ case Journal<I>::STATE_FLUSHING_RESTART:
+ os << "FlushingRestart";
+ break;
+ case Journal<I>::STATE_RESTARTING_REPLAY:
+ os << "RestartingReplay";
+ break;
+ case Journal<I>::STATE_FLUSHING_REPLAY:
+ os << "FlushingReplay";
+ break;
+ case Journal<I>::STATE_READY:
+ os << "Ready";
+ break;
+ case Journal<I>::STATE_STOPPING:
+ os << "Stopping";
+ break;
+ case Journal<I>::STATE_CLOSING:
+ os << "Closing";
+ break;
+ case Journal<I>::STATE_CLOSED:
+ os << "Closed";
+ break;
+ default:
+ os << "Unknown (" << static_cast<uint32_t>(state) << ")";
+ break;
+ }
+ return os;
+}
+
+
+template <typename I>
+void Journal<I>::MetadataListener::handle_update(::journal::JournalMetadata *) {
+ auto ctx = new LambdaContext([this](int r) {
+ journal->handle_metadata_updated();
+ });
+ journal->m_work_queue->queue(ctx, 0);
+}
+
+
+template <typename I>
+void Journal<I>::get_work_queue(CephContext *cct, ContextWQ **work_queue) {
+ auto thread_pool_singleton =
+ &cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
+ "librbd::journal::thread_pool", false, cct);
+ *work_queue = thread_pool_singleton->work_queue;
+}
+
+template <typename I>
+Journal<I>::Journal(I &image_ctx)
+ : RefCountedObject(image_ctx.cct),
+ m_image_ctx(image_ctx), m_journaler(NULL),
+ m_state(STATE_UNINITIALIZED),
+ m_error_result(0), m_replay_handler(this), m_close_pending(false),
+ m_event_tid(0),
+ m_blocking_writes(false), m_journal_replay(NULL),
+ m_metadata_listener(this) {
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << ": ictx=" << &m_image_ctx << dendl;
+
+ get_work_queue(cct, &m_work_queue);
+ ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
+}
+
+template <typename I>
+Journal<I>::~Journal() {
+ if (m_work_queue != nullptr) {
+ m_work_queue->drain();
+ }
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED);
+ ceph_assert(m_journaler == NULL);
+ ceph_assert(m_journal_replay == NULL);
+ ceph_assert(m_wait_for_state_contexts.empty());
+}
+
+template <typename I>
+bool Journal<I>::is_journal_supported(I &image_ctx) {
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+ return ((image_ctx.features & RBD_FEATURE_JOURNALING) &&
+ !image_ctx.read_only && image_ctx.snap_id == CEPH_NOSNAP);
+}
+
+template <typename I>
+int Journal<I>::create(librados::IoCtx &io_ctx, const std::string &image_id,
+ uint8_t order, uint8_t splay_width,
+ const std::string &object_pool) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+ ContextWQ *work_queue;
+ get_work_queue(cct, &work_queue);
+
+ C_SaferCond cond;
+ journal::TagData tag_data(LOCAL_MIRROR_UUID);
+ journal::CreateRequest<I> *req = journal::CreateRequest<I>::create(
+ io_ctx, image_id, order, splay_width, object_pool, cls::journal::Tag::TAG_CLASS_NEW,
+ tag_data, IMAGE_CLIENT_ID, work_queue, &cond);
+ req->send();
+
+ return cond.wait();
+}
+
+template <typename I>
+int Journal<I>::remove(librados::IoCtx &io_ctx, const std::string &image_id) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+ ContextWQ *work_queue;
+ get_work_queue(cct, &work_queue);
+
+ C_SaferCond cond;
+ journal::RemoveRequest<I> *req = journal::RemoveRequest<I>::create(
+ io_ctx, image_id, IMAGE_CLIENT_ID, work_queue, &cond);
+ req->send();
+
+ return cond.wait();
+}
+
+template <typename I>
+int Journal<I>::reset(librados::IoCtx &io_ctx, const std::string &image_id) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+ ContextWQ *work_queue;
+ get_work_queue(cct, &work_queue);
+
+ C_SaferCond cond;
+ auto req = journal::ResetRequest<I>::create(io_ctx, image_id, IMAGE_CLIENT_ID,
+ Journal<>::LOCAL_MIRROR_UUID,
+ work_queue, &cond);
+ req->send();
+
+ return cond.wait();
+}
+
+template <typename I>
+void Journal<I>::is_tag_owner(I *image_ctx, bool *owner,
+ Context *on_finish) {
+ Journal<I>::is_tag_owner(image_ctx->md_ctx, image_ctx->id, owner,
+ image_ctx->op_work_queue, on_finish);
+}
+
+template <typename I>
+void Journal<I>::is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
+ bool *is_tag_owner,
+ asio::ContextWQ *op_work_queue,
+ Context *on_finish) {
+ CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+ ldout(cct, 20) << __func__ << dendl;
+
+ C_IsTagOwner<I> *is_tag_owner_ctx = new C_IsTagOwner<I>(
+ io_ctx, image_id, is_tag_owner, op_work_queue, on_finish);
+ get_tags(cct, is_tag_owner_ctx->journaler, &is_tag_owner_ctx->client,
+ &is_tag_owner_ctx->client_meta, &is_tag_owner_ctx->tag_tid,
+ &is_tag_owner_ctx->tag_data, is_tag_owner_ctx);
+}
+
+template <typename I>
+void Journal<I>::get_tag_owner(IoCtx& io_ctx, std::string& image_id,
+ std::string *mirror_uuid,
+ asio::ContextWQ *op_work_queue,
+ Context *on_finish) {
+ CephContext *cct = static_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << __func__ << dendl;
+
+ auto ctx = new C_GetTagOwner(io_ctx, image_id, mirror_uuid, on_finish);
+ get_tags(cct, &ctx->journaler, &ctx->client, &ctx->client_meta, &ctx->tag_tid,
+ &ctx->tag_data, create_async_context_callback(op_work_queue, ctx));
+}
+
+template <typename I>
+int Journal<I>::request_resync(I *image_ctx) {
+ CephContext *cct = image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Journaler journaler(image_ctx->md_ctx, image_ctx->id, IMAGE_CLIENT_ID, {},
+ nullptr);
+
+ ceph::mutex lock = ceph::make_mutex("lock");
+ journal::ImageClientMeta client_meta;
+ uint64_t tag_tid;
+ journal::TagData tag_data;
+
+ C_SaferCond open_ctx;
+ auto open_req = journal::OpenRequest<I>::create(image_ctx, &journaler, &lock,
+ &client_meta, &tag_tid,
+ &tag_data, &open_ctx);
+ open_req->send();
+
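+  // guarantee the journaler is shut down on every return path below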
+ BOOST_SCOPE_EXIT_ALL(&journaler) {
+ journaler.shut_down();
+ };
+
+ int r = open_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ client_meta.resync_requested = true;
+
+ journal::ClientData client_data(client_meta);
+ bufferlist client_data_bl;
+ encode(client_data, client_data_bl);
+
+ C_SaferCond update_client_ctx;
+ journaler.update_client(client_data_bl, &update_client_ctx);
+
+ r = update_client_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << __func__ << ": "
+ << "failed to update client: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+void Journal<I>::promote(I *image_ctx, Context *on_finish) {
+ CephContext *cct = image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ auto promote_req = journal::PromoteRequest<I>::create(image_ctx, false,
+ on_finish);
+ promote_req->send();
+}
+
+template <typename I>
+void Journal<I>::demote(I *image_ctx, Context *on_finish) {
+ CephContext *cct = image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ auto req = journal::DemoteRequest<I>::create(*image_ctx, on_finish);
+ req->send();
+}
+
+template <typename I>
+bool Journal<I>::is_journal_ready() const {
+ std::lock_guard locker{m_lock};
+ return (m_state == STATE_READY);
+}
+
+template <typename I>
+bool Journal<I>::is_journal_replaying() const {
+ std::lock_guard locker{m_lock};
+ return is_journal_replaying(m_lock);
+}
+
+template <typename I>
+bool Journal<I>::is_journal_replaying(const ceph::mutex &) const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return (m_state == STATE_REPLAYING ||
+ m_state == STATE_FLUSHING_REPLAY ||
+ m_state == STATE_FLUSHING_RESTART ||
+ m_state == STATE_RESTARTING_REPLAY);
+}
+
+template <typename I>
+bool Journal<I>::is_journal_appending() const {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ std::lock_guard locker{m_lock};
+ return (m_state == STATE_READY &&
+ !m_image_ctx.get_journal_policy()->append_disabled());
+}
+
+template <typename I>
+void Journal<I>::wait_for_journal_ready(Context *on_ready) {
+ on_ready = create_async_context_callback(m_image_ctx, on_ready);
+
+ std::lock_guard locker{m_lock};
+ if (m_state == STATE_READY) {
+ on_ready->complete(m_error_result);
+ } else {
+ wait_for_steady_state(on_ready);
+ }
+}
+
+template <typename I>
+void Journal<I>::open(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ on_finish = create_context_callback<Context>(on_finish, this);
+
+ on_finish = create_async_context_callback(m_image_ctx, on_finish);
+
+ // inject our handler into the object dispatcher chain
+ m_image_ctx.io_object_dispatcher->register_dispatch(
+ journal::ObjectDispatch<I>::create(&m_image_ctx, this));
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_UNINITIALIZED);
+ wait_for_steady_state(on_finish);
+ create_journaler();
+}
+
+template <typename I>
+void Journal<I>::close(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ on_finish = create_context_callback<Context>(on_finish, this);
+
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ // remove our handler from object dispatcher chain - preserve error
+ auto ctx = new LambdaContext([on_finish, r](int _) {
+ on_finish->complete(r);
+ });
+ m_image_ctx.io_object_dispatcher->shut_down_dispatch(
+ io::OBJECT_DISPATCH_LAYER_JOURNAL, ctx);
+ });
+ on_finish = create_async_context_callback(m_image_ctx, on_finish);
+
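+  // wait for any in-flight listener notification to finish, then invoke
+  // handle_close() on a snapshot of the listeners outside of the lock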
+ std::unique_lock locker{m_lock};
+ m_listener_cond.wait(locker, [this] { return !m_listener_notify; });
+
+ Listeners listeners(m_listeners);
+ m_listener_notify = true;
+ locker.unlock();
+ for (auto listener : listeners) {
+ listener->handle_close();
+ }
+
+ locker.lock();
+ m_listener_notify = false;
+ m_listener_cond.notify_all();
+
+ ceph_assert(m_state != STATE_UNINITIALIZED);
+ if (m_state == STATE_CLOSED) {
+ on_finish->complete(m_error_result);
+ return;
+ }
+
+ if (m_state == STATE_READY) {
+ stop_recording();
+ }
+
+ m_close_pending = true;
+ wait_for_steady_state(on_finish);
+}
+
+template <typename I>
+bool Journal<I>::is_tag_owner() const {
+ std::lock_guard locker{m_lock};
+ return is_tag_owner(m_lock);
+}
+
+template <typename I>
+bool Journal<I>::is_tag_owner(const ceph::mutex &) const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return (m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID);
+}
+
+template <typename I>
+uint64_t Journal<I>::get_tag_tid() const {
+ std::lock_guard locker{m_lock};
+ return m_tag_tid;
+}
+
+template <typename I>
+journal::TagData Journal<I>::get_tag_data() const {
+ std::lock_guard locker{m_lock};
+ return m_tag_data;
+}
+
+template <typename I>
+void Journal<I>::allocate_local_tag(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ journal::TagPredecessor predecessor;
+ predecessor.mirror_uuid = LOCAL_MIRROR_UUID;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_journaler != nullptr && is_tag_owner(m_lock));
+
+ cls::journal::Client client;
+ int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client);
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+ m_image_ctx.op_work_queue->queue(on_finish, r);
+ return;
+ }
+
+ // since we are primary, populate the predecessor with our known commit
+ // position
+ ceph_assert(m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID);
+ if (!client.commit_position.object_positions.empty()) {
+ auto position = client.commit_position.object_positions.front();
+ predecessor.commit_valid = true;
+ predecessor.tag_tid = position.tag_tid;
+ predecessor.entry_tid = position.entry_tid;
+ }
+ }
+
+ allocate_tag(LOCAL_MIRROR_UUID, predecessor, on_finish);
+}
+
+template <typename I>
+void Journal<I>::allocate_tag(const std::string &mirror_uuid,
+ const journal::TagPredecessor &predecessor,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": mirror_uuid=" << mirror_uuid
+ << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_journaler != nullptr);
+
+ journal::TagData tag_data;
+ tag_data.mirror_uuid = mirror_uuid;
+ tag_data.predecessor = predecessor;
+
+ bufferlist tag_bl;
+ encode(tag_data, tag_bl);
+
+ C_DecodeTag *decode_tag_ctx = new C_DecodeTag(cct, &m_lock, &m_tag_tid,
+ &m_tag_data, on_finish);
+ m_journaler->allocate_tag(m_tag_class, tag_bl, &decode_tag_ctx->tag,
+ decode_tag_ctx);
+}
+
+template <typename I>
+void Journal<I>::flush_commit_position(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_journaler != nullptr);
+ m_journaler->flush_commit_position(on_finish);
+}
+
+template <typename I>
+void Journal<I>::user_flushed() {
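+  // unlocked fast-path check; m_state is re-validated under m_lock below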
+ if (m_state == STATE_READY && !m_user_flushed.exchange(true) &&
+ m_image_ctx.config.template get_val<bool>("rbd_journal_object_writethrough_until_flush")) {
+ std::lock_guard locker{m_lock};
+ if (m_state == STATE_READY) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ ceph_assert(m_journaler != nullptr);
+ m_journaler->set_append_batch_options(
+ m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_flush_interval"),
+ m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_object_flush_bytes"),
+ m_image_ctx.config.template get_val<double>("rbd_journal_object_flush_age"));
+ } else {
+ m_user_flushed = false;
+ }
+ }
+}
+
+template <typename I>
+uint64_t Journal<I>::append_write_event(uint64_t offset, size_t length,
+ const bufferlist &bl,
+ bool flush_entry) {
+ ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size());
+ uint64_t max_write_data_size =
+ m_max_append_size - journal::AioWriteEvent::get_fixed_size();
+
+ // ensure that the write event fits within the journal entry
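+  // (e.g. if max_write_data_size is 4 MiB, a 10 MiB write is recorded as
+  // three AioWriteEvent entries covering 4, 4 and 2 MiB)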
+ Bufferlists bufferlists;
+ uint64_t bytes_remaining = length;
+ uint64_t event_offset = 0;
+ do {
+ uint64_t event_length = std::min(bytes_remaining, max_write_data_size);
+
+ bufferlist event_bl;
+ event_bl.substr_of(bl, event_offset, event_length);
+ journal::EventEntry event_entry(journal::AioWriteEvent(offset + event_offset,
+ event_length,
+ event_bl),
+ ceph_clock_now());
+
+ bufferlists.emplace_back();
+ encode(event_entry, bufferlists.back());
+
+ event_offset += event_length;
+ bytes_remaining -= event_length;
+ } while (bytes_remaining > 0);
+
+ return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists, offset,
+ length, flush_entry, 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_io_event(journal::EventEntry &&event_entry,
+ uint64_t offset, size_t length,
+ bool flush_entry, int filter_ret_val) {
+ bufferlist bl;
+ event_entry.timestamp = ceph_clock_now();
+ encode(event_entry, bl);
+ return append_io_events(event_entry.get_event_type(), {bl}, offset, length,
+ flush_entry, filter_ret_val);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_io_events(journal::EventType event_type,
+ const Bufferlists &bufferlists,
+ uint64_t offset, size_t length,
+ bool flush_entry, int filter_ret_val) {
+ ceph_assert(!bufferlists.empty());
+
+ uint64_t tid;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_READY);
+
+ tid = ++m_event_tid;
+ ceph_assert(tid != 0);
+ }
+
+ Futures futures;
+ for (auto &bl : bufferlists) {
+ ceph_assert(bl.length() <= m_max_append_size);
+ futures.push_back(m_journaler->append(m_tag_tid, bl));
+ }
+
+ {
+ std::lock_guard event_locker{m_event_lock};
+ m_events[tid] = Event(futures, offset, length, filter_ret_val);
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "event=" << event_type << ", "
+ << "offset=" << offset << ", "
+ << "length=" << length << ", "
+ << "flush=" << flush_entry << ", tid=" << tid << dendl;
+
+ Context *on_safe = create_async_context_callback(
+ m_image_ctx, new C_IOEventSafe(this, tid));
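+  // flushing forces the entry to be written out immediately; otherwise the
+  // safe callback fires once the batched append commits on its own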
+ if (flush_entry) {
+ futures.back().flush(on_safe);
+ } else {
+ futures.back().wait(on_safe);
+ }
+
+ return tid;
+}
+
+template <typename I>
+void Journal<I>::commit_io_event(uint64_t tid, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+ "r=" << r << dendl;
+
+ std::lock_guard event_locker{m_event_lock};
+ typename Events::iterator it = m_events.find(tid);
+ if (it == m_events.end()) {
+ return;
+ }
+ complete_event(it, r);
+}
+
+template <typename I>
+void Journal<I>::commit_io_event_extent(uint64_t tid, uint64_t offset,
+ uint64_t length, int r) {
+ ceph_assert(length > 0);
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+ << "offset=" << offset << ", "
+ << "length=" << length << ", "
+ << "r=" << r << dendl;
+
+ std::lock_guard event_locker{m_event_lock};
+ typename Events::iterator it = m_events.find(tid);
+ if (it == m_events.end()) {
+ return;
+ }
+
+ Event &event = it->second;
+ if (event.ret_val == 0 && r < 0) {
+ event.ret_val = r;
+ }
+
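+  // subtract the newly committed extent from the event's pending extents;
+  // the event completes only once all of its extents have been committed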
+ ExtentInterval extent;
+ extent.insert(offset, length);
+
+ ExtentInterval intersect;
+ intersect.intersection_of(extent, event.pending_extents);
+
+ event.pending_extents.subtract(intersect);
+ if (!event.pending_extents.empty()) {
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "pending extents: " << event.pending_extents << dendl;
+ return;
+ }
+ complete_event(it, event.ret_val);
+}
+
+template <typename I>
+void Journal<I>::append_op_event(uint64_t op_tid,
+ journal::EventEntry &&event_entry,
+ Context *on_safe) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ bufferlist bl;
+ event_entry.timestamp = ceph_clock_now();
+ encode(event_entry, bl);
+
+ Future future;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_READY);
+
+ future = m_journaler->append(m_tag_tid, bl);
+
+ // delay committing op event to ensure consistent replay
+ ceph_assert(m_op_futures.count(op_tid) == 0);
+ m_op_futures[op_tid] = future;
+ }
+
+ on_safe = create_async_context_callback(m_image_ctx, on_safe);
+ on_safe = new LambdaContext([this, on_safe](int r) {
+    // flush the commit position so IO committed prior to this op is
+    // reflected on disk before the op-safe callback fires
+ m_journaler->flush_commit_position(on_safe);
+ });
+ future.flush(on_safe);
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "op_tid=" << op_tid << ", "
+ << "event=" << event_entry.get_event_type() << dendl;
+}
+
+template <typename I>
+void Journal<I>::commit_op_event(uint64_t op_tid, int r, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": op_tid=" << op_tid << ", "
+ << "r=" << r << dendl;
+
+ journal::EventEntry event_entry((journal::OpFinishEvent(op_tid, r)),
+ ceph_clock_now());
+
+ bufferlist bl;
+ encode(event_entry, bl);
+
+ Future op_start_future;
+ Future op_finish_future;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_READY);
+
+ // ready to commit op event
+ auto it = m_op_futures.find(op_tid);
+ ceph_assert(it != m_op_futures.end());
+ op_start_future = it->second;
+ m_op_futures.erase(it);
+
+ op_finish_future = m_journaler->append(m_tag_tid, bl);
+ }
+
+ op_finish_future.flush(create_async_context_callback(
+ m_image_ctx, new C_OpEventSafe(this, op_tid, op_start_future,
+ op_finish_future, on_safe)));
+}
+
+template <typename I>
+void Journal<I>::replay_op_ready(uint64_t op_tid, Context *on_resume) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": op_tid=" << op_tid << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_journal_replay != nullptr);
+ m_journal_replay->replay_op_ready(op_tid, on_resume);
+ }
+}
+
+template <typename I>
+void Journal<I>::flush_event(uint64_t tid, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+ << "on_safe=" << on_safe << dendl;
+
+ on_safe = create_context_callback<Context>(on_safe, this);
+
+ Future future;
+ {
+ std::lock_guard event_locker{m_event_lock};
+ future = wait_event(m_lock, tid, on_safe);
+ }
+
+ if (future.is_valid()) {
+ future.flush(nullptr);
+ }
+}
+
+template <typename I>
+void Journal<I>::wait_event(uint64_t tid, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+ << "on_safe=" << on_safe << dendl;
+
+ on_safe = create_context_callback<Context>(on_safe, this);
+
+ std::lock_guard event_locker{m_event_lock};
+ wait_event(m_lock, tid, on_safe);
+}
+
+template <typename I>
+typename Journal<I>::Future Journal<I>::wait_event(ceph::mutex &lock, uint64_t tid,
+ Context *on_safe) {
+ ceph_assert(ceph_mutex_is_locked(m_event_lock));
+ CephContext *cct = m_image_ctx.cct;
+
+ typename Events::iterator it = m_events.find(tid);
+ ceph_assert(it != m_events.end());
+
+ Event &event = it->second;
+ if (event.safe) {
+ // journal entry already safe
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "journal entry already safe" << dendl;
+ m_image_ctx.op_work_queue->queue(on_safe, event.ret_val);
+ return Future();
+ }
+
+ event.on_safe_contexts.push_back(create_async_context_callback(m_image_ctx,
+ on_safe));
+ return event.futures.back();
+}
+
+template <typename I>
+void Journal<I>::start_external_replay(journal::Replay<I> **journal_replay,
+ Context *on_start) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_READY);
+ ceph_assert(m_journal_replay == nullptr);
+
+ on_start = util::create_async_context_callback(m_image_ctx, on_start);
+ on_start = new LambdaContext(
+ [this, journal_replay, on_start](int r) {
+ handle_start_external_replay(r, journal_replay, on_start);
+ });
+
+ // safely flush all in-flight events before starting external replay
+ m_journaler->stop_append(util::create_async_context_callback(m_image_ctx,
+ on_start));
+}
+
+template <typename I>
+void Journal<I>::handle_start_external_replay(int r,
+ journal::Replay<I> **journal_replay,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_READY);
+ ceph_assert(m_journal_replay == nullptr);
+
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to stop recording: " << cpp_strerror(r) << dendl;
+ *journal_replay = nullptr;
+
+    // get back to a sane state
+ start_append();
+ on_finish->complete(r);
+ return;
+ }
+
+ transition_state(STATE_REPLAYING, 0);
+ m_journal_replay = journal::Replay<I>::create(m_image_ctx);
+ *journal_replay = m_journal_replay;
+ on_finish->complete(0);
+}
+
+template <typename I>
+void Journal<I>::stop_external_replay() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_journal_replay != nullptr);
+ ceph_assert(m_state == STATE_REPLAYING);
+
+ delete m_journal_replay;
+ m_journal_replay = nullptr;
+
+ if (m_close_pending) {
+ destroy_journaler(0);
+ return;
+ }
+
+ start_append();
+}
+
+template <typename I>
+void Journal<I>::create_journaler() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_RESTARTING_REPLAY);
+ ceph_assert(m_journaler == NULL);
+
+ transition_state(STATE_INITIALIZING, 0);
+ ::journal::Settings settings;
+ settings.commit_interval =
+ m_image_ctx.config.template get_val<double>("rbd_journal_commit_age");
+ settings.max_payload_bytes =
+ m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_max_payload_bytes");
+ settings.max_concurrent_object_sets =
+ m_image_ctx.config.template get_val<uint64_t>("rbd_journal_max_concurrent_object_sets");
+ // TODO: a configurable filter to exclude certain peers from being
+ // disconnected.
+ settings.ignored_laggy_clients = {IMAGE_CLIENT_ID};
+
+ m_journaler = new Journaler(m_work_queue, m_timer, m_timer_lock,
+ m_image_ctx.md_ctx, m_image_ctx.id,
+ IMAGE_CLIENT_ID, settings, nullptr);
+ m_journaler->add_listener(&m_metadata_listener);
+
+ Context *ctx = create_async_context_callback(
+ m_image_ctx, create_context_callback<
+ Journal<I>, &Journal<I>::handle_open>(this));
+ auto open_req = journal::OpenRequest<I>::create(&m_image_ctx, m_journaler,
+ &m_lock, &m_client_meta,
+ &m_tag_tid, &m_tag_data, ctx);
+ open_req->send();
+}
+
+template <typename I>
+void Journal<I>::destroy_journaler(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ delete m_journal_replay;
+ m_journal_replay = NULL;
+
+ m_journaler->remove_listener(&m_metadata_listener);
+
+ transition_state(STATE_CLOSING, r);
+
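+  // teardown sequence: wait for in-flight async ops, shut down the
+  // journaler, then finish via handle_journal_destroyed()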
+ Context *ctx = create_async_context_callback(
+ m_image_ctx, create_context_callback<
+ Journal<I>, &Journal<I>::handle_journal_destroyed>(this));
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ std::lock_guard locker{m_lock};
+ m_journaler->shut_down(ctx);
+ });
+ ctx = create_async_context_callback(m_image_ctx, ctx);
+ m_async_journal_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void Journal<I>::recreate_journaler(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_state == STATE_FLUSHING_RESTART ||
+ m_state == STATE_FLUSHING_REPLAY);
+
+ delete m_journal_replay;
+ m_journal_replay = NULL;
+
+ m_journaler->remove_listener(&m_metadata_listener);
+
+ transition_state(STATE_RESTARTING_REPLAY, r);
+ m_journaler->shut_down(create_async_context_callback(
+ m_image_ctx, create_context_callback<
+ Journal<I>, &Journal<I>::handle_journal_destroyed>(this)));
+}
+
+template <typename I>
+void Journal<I>::complete_event(typename Events::iterator it, int r) {
+ ceph_assert(ceph_mutex_is_locked(m_event_lock));
+ ceph_assert(m_state == STATE_READY);
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": tid=" << it->first << " "
+ << "r=" << r << dendl;
+
+ Event &event = it->second;
+ if (r < 0 && r == event.filter_ret_val) {
+ // ignore allowed error codes
+ r = 0;
+ }
+ if (r < 0) {
+    // the event was recorded in the journal but the disk update failed;
+    // the IO event cannot be committed and must be replayed
+ ceph_assert(event.safe);
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to commit IO to disk, replay required: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ event.committed_io = true;
+ if (event.safe) {
+ if (r >= 0) {
+ for (auto &future : event.futures) {
+ m_journaler->committed(future);
+ }
+ }
+ m_events.erase(it);
+ }
+}
+
+template <typename I>
+void Journal<I>::start_append() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ m_journaler->start_append(
+ m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_max_in_flight_appends"));
+ if (!m_image_ctx.config.template get_val<bool>("rbd_journal_object_writethrough_until_flush")) {
+ m_journaler->set_append_batch_options(
+ m_image_ctx.config.template get_val<uint64_t>("rbd_journal_object_flush_interval"),
+ m_image_ctx.config.template get_val<Option::size_t>("rbd_journal_object_flush_bytes"),
+ m_image_ctx.config.template get_val<double>("rbd_journal_object_flush_age"));
+ }
+
+ transition_state(STATE_READY, 0);
+}
+
+template <typename I>
+void Journal<I>::handle_open(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_INITIALIZING);
+
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to initialize journal: " << cpp_strerror(r)
+ << dendl;
+ destroy_journaler(r);
+ return;
+ }
+
+ m_tag_class = m_client_meta.tag_class;
+ m_max_append_size = m_journaler->get_max_append_size();
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "tag_class=" << m_tag_class << ", "
+ << "max_append_size=" << m_max_append_size << dendl;
+
+ transition_state(STATE_REPLAYING, 0);
+ m_journal_replay = journal::Replay<I>::create(m_image_ctx);
+ m_journaler->start_replay(&m_replay_handler);
+}
+
+template <typename I>
+void Journal<I>::handle_replay_ready() {
+ CephContext *cct = m_image_ctx.cct;
+ ReplayEntry replay_entry;
+ {
+ std::lock_guard locker{m_lock};
+ if (m_state != STATE_REPLAYING) {
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+ if (!m_journaler->try_pop_front(&replay_entry)) {
+ return;
+ }
+
+ // only one entry should be in-flight at a time
+ ceph_assert(!m_processing_entry);
+ m_processing_entry = true;
+ }
+
+ m_async_journal_op_tracker.start_op();
+
+ bufferlist data = replay_entry.get_data();
+ auto it = data.cbegin();
+
+ journal::EventEntry event_entry;
+ int r = m_journal_replay->decode(&it, &event_entry);
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to decode journal event entry" << dendl;
+ handle_replay_process_safe(replay_entry, r);
+ return;
+ }
+
+ Context *on_ready = create_context_callback<
+ Journal<I>, &Journal<I>::handle_replay_process_ready>(this);
+ Context *on_commit = new C_ReplayProcessSafe(this, std::move(replay_entry));
+ m_journal_replay->process(event_entry, on_ready, on_commit);
+}
+
+template <typename I>
+void Journal<I>::handle_replay_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+
+ bool cancel_ops = false;
+ {
+ std::lock_guard locker{m_lock};
+ if (m_state != STATE_REPLAYING) {
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+ if (r < 0) {
+ cancel_ops = true;
+ transition_state(STATE_FLUSHING_RESTART, r);
+ } else {
+ // state might change back to FLUSHING_RESTART on flush error
+ transition_state(STATE_FLUSHING_REPLAY, 0);
+ }
+ }
+
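+  // the chained contexts below execute in reverse construction order:
+  // stop replay, shut down the replay handler, wait for in-flight ops,
+  // flush the commit position, then dispatch to the flush handler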
+ Context *ctx = new LambdaContext([this, cct](int r) {
+ ldout(cct, 20) << this << " handle_replay_complete: "
+ << "handle shut down replay" << dendl;
+
+ State state;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_FLUSHING_RESTART ||
+ m_state == STATE_FLUSHING_REPLAY);
+ state = m_state;
+ }
+
+ if (state == STATE_FLUSHING_RESTART) {
+ handle_flushing_restart(0);
+ } else {
+ handle_flushing_replay();
+ }
+ });
+ ctx = new LambdaContext([this, ctx](int r) {
+ // ensure the commit position is flushed to disk
+ m_journaler->flush_commit_position(ctx);
+ });
+ ctx = create_async_context_callback(m_image_ctx, ctx);
+ ctx = new LambdaContext([this, ctx](int r) {
+ m_async_journal_op_tracker.wait_for_ops(ctx);
+ });
+ ctx = new LambdaContext([this, cct, cancel_ops, ctx](int r) {
+ ldout(cct, 20) << this << " handle_replay_complete: "
+ << "shut down replay" << dendl;
+ m_journal_replay->shut_down(cancel_ops, ctx);
+ });
+
+ m_journaler->stop_replay(ctx);
+}
+
+template <typename I>
+void Journal<I>::handle_replay_process_ready(int r) {
+ // journal::Replay is ready for more events -- attempt to pop another
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ ceph_assert(r == 0);
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_processing_entry);
+ m_processing_entry = false;
+ }
+ handle_replay_ready();
+}
+
+template <typename I>
+void Journal<I>::handle_replay_process_safe(ReplayEntry replay_entry, int r) {
+ CephContext *cct = m_image_ctx.cct;
+
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_state == STATE_REPLAYING ||
+ m_state == STATE_FLUSHING_RESTART ||
+ m_state == STATE_FLUSHING_REPLAY);
+
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+ if (r < 0) {
+ if (r != -ECANCELED) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to commit journal event to disk: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ if (m_state == STATE_REPLAYING) {
+ // abort the replay if we have an error
+ transition_state(STATE_FLUSHING_RESTART, r);
+ locker.unlock();
+
+ // stop replay, shut down, and restart
+ Context* ctx = create_context_callback<
+ Journal<I>, &Journal<I>::handle_flushing_restart>(this);
+ ctx = new LambdaContext([this, ctx](int r) {
+ // ensure the commit position is flushed to disk
+ m_journaler->flush_commit_position(ctx);
+ });
+ ctx = new LambdaContext([this, cct, ctx](int r) {
+ ldout(cct, 20) << this << " handle_replay_process_safe: "
+ << "shut down replay" << dendl;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_FLUSHING_RESTART);
+ }
+
+ m_journal_replay->shut_down(true, ctx);
+ });
+ m_journaler->stop_replay(ctx);
+ m_async_journal_op_tracker.finish_op();
+ return;
+ } else if (m_state == STATE_FLUSHING_REPLAY) {
+      // end-of-replay flush in progress -- we need to restart replay
+ transition_state(STATE_FLUSHING_RESTART, r);
+ locker.unlock();
+ m_async_journal_op_tracker.finish_op();
+ return;
+ }
+ } else {
+ // only commit the entry if written successfully
+ m_journaler->committed(replay_entry);
+ }
+ locker.unlock();
+ m_async_journal_op_tracker.finish_op();
+}
+
+template <typename I>
+void Journal<I>::handle_flushing_restart(int r) {
+ std::lock_guard locker{m_lock};
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ ceph_assert(r == 0);
+ ceph_assert(m_state == STATE_FLUSHING_RESTART);
+ if (m_close_pending) {
+ destroy_journaler(r);
+ return;
+ }
+
+ recreate_journaler(r);
+}
+
+template <typename I>
+void Journal<I>::handle_flushing_replay() {
+ std::lock_guard locker{m_lock};
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ ceph_assert(m_state == STATE_FLUSHING_REPLAY ||
+ m_state == STATE_FLUSHING_RESTART);
+ if (m_close_pending) {
+ destroy_journaler(0);
+ return;
+ } else if (m_state == STATE_FLUSHING_RESTART) {
+    // failed to replay one or more events -- restart
+ recreate_journaler(0);
+ return;
+ }
+
+ delete m_journal_replay;
+ m_journal_replay = NULL;
+
+ m_error_result = 0;
+ start_append();
+}
+
+template <typename I>
+void Journal<I>::handle_recording_stopped(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_STOPPING);
+
+ destroy_journaler(r);
+}
+
+template <typename I>
+void Journal<I>::handle_journal_destroyed(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+    lderr(cct) << this << " " << __func__ << ": "
+               << "error detected while closing journal: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+ delete m_journaler;
+ m_journaler = nullptr;
+
+ ceph_assert(m_state == STATE_CLOSING || m_state == STATE_RESTARTING_REPLAY);
+ if (m_state == STATE_RESTARTING_REPLAY) {
+ create_journaler();
+ return;
+ }
+
+ transition_state(STATE_CLOSED, r);
+}
+
+template <typename I>
+void Journal<I>::handle_io_event_safe(int r, uint64_t tid) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", "
+ << "tid=" << tid << dendl;
+
+ // journal will be flushed before closing
+ ceph_assert(m_state == STATE_READY || m_state == STATE_STOPPING);
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to commit IO event: " << cpp_strerror(r) << dendl;
+ }
+
+ Contexts on_safe_contexts;
+ {
+ std::lock_guard event_locker{m_event_lock};
+ typename Events::iterator it = m_events.find(tid);
+ ceph_assert(it != m_events.end());
+
+ Event &event = it->second;
+ on_safe_contexts.swap(event.on_safe_contexts);
+
+ if (r < 0 || event.committed_io) {
+ // failed journal write so IO won't be sent -- or IO extent was
+ // overwritten by future IO operations so this was a no-op IO event
+ event.ret_val = r;
+ for (auto &future : event.futures) {
+ m_journaler->committed(future);
+ }
+ }
+
+ if (event.committed_io) {
+ m_events.erase(it);
+ } else {
+ event.safe = true;
+ }
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "completing tid=" << tid << dendl;
+
+ // alert the cache about the journal event status
+ for (Contexts::iterator it = on_safe_contexts.begin();
+ it != on_safe_contexts.end(); ++it) {
+ (*it)->complete(r);
+ }
+}
+
+template <typename I>
+void Journal<I>::handle_op_event_safe(int r, uint64_t tid,
+ const Future &op_start_future,
+ const Future &op_finish_future,
+ Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", "
+ << "tid=" << tid << dendl;
+
+ // journal will be flushed before closing
+ ceph_assert(m_state == STATE_READY || m_state == STATE_STOPPING);
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to commit op event: " << cpp_strerror(r) << dendl;
+ }
+
+ m_journaler->committed(op_start_future);
+ m_journaler->committed(op_finish_future);
+
+ // reduce the replay window after committing an op event
+ m_journaler->flush_commit_position(on_safe);
+}
+
+template <typename I>
+void Journal<I>::stop_recording() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_journaler != NULL);
+
+ ceph_assert(m_state == STATE_READY);
+ transition_state(STATE_STOPPING, 0);
+
+ m_journaler->stop_append(util::create_async_context_callback(
+ m_image_ctx, create_context_callback<
+ Journal<I>, &Journal<I>::handle_recording_stopped>(this)));
+}
+
+template <typename I>
+void Journal<I>::transition_state(State state, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": new state=" << state << dendl;
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ m_state = state;
+
+ if (m_error_result == 0 && r < 0) {
+ m_error_result = r;
+ }
+
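+  // reaching a steady state (READY or CLOSED) completes all contexts
+  // queued via wait_for_steady_state()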
+ if (is_steady_state()) {
+ auto wait_for_state_contexts(std::move(m_wait_for_state_contexts));
+ m_wait_for_state_contexts.clear();
+
+ for (auto ctx : wait_for_state_contexts) {
+ ctx->complete(m_error_result);
+ }
+ }
+}
+
+template <typename I>
+bool Journal<I>::is_steady_state() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ switch (m_state) {
+ case STATE_READY:
+ case STATE_CLOSED:
+ return true;
+ case STATE_UNINITIALIZED:
+ case STATE_INITIALIZING:
+ case STATE_REPLAYING:
+ case STATE_FLUSHING_RESTART:
+ case STATE_RESTARTING_REPLAY:
+ case STATE_FLUSHING_REPLAY:
+ case STATE_STOPPING:
+ case STATE_CLOSING:
+ break;
+ }
+ return false;
+}
+
+template <typename I>
+void Journal<I>::wait_for_steady_state(Context *on_state) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!is_steady_state());
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": on_state=" << on_state
+ << dendl;
+ m_wait_for_state_contexts.push_back(on_state);
+}
+
+template <typename I>
+int Journal<I>::is_resync_requested(bool *do_resync) {
+ std::lock_guard l{m_lock};
+ return check_resync_requested(do_resync);
+}
+
+template <typename I>
+int Journal<I>::check_resync_requested(bool *do_resync) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(do_resync != nullptr);
+
+ cls::journal::Client client;
+ int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client);
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ librbd::journal::ClientData client_data;
+ auto bl_it = client.data.cbegin();
+ try {
+ decode(client_data, bl_it);
+ } catch (const buffer::error &err) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to decode client data: " << err.what() << dendl;
+ return -EINVAL;
+ }
+
+ journal::ImageClientMeta *image_client_meta =
+ boost::get<journal::ImageClientMeta>(&client_data.client_meta);
+ if (image_client_meta == nullptr) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to access image client meta struct" << dendl;
+ return -EINVAL;
+ }
+
+ *do_resync = image_client_meta->resync_requested;
+
+ return 0;
+}
+
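+// pins the async op tracker while an asynchronous tag refresh, triggered by
+// a journal metadata update, is in flight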
+struct C_RefreshTags : public Context {
+ AsyncOpTracker &async_op_tracker;
+ Context *on_finish = nullptr;
+
+ ceph::mutex lock =
+ ceph::make_mutex("librbd::Journal::C_RefreshTags::lock");
+ uint64_t tag_tid = 0;
+ journal::TagData tag_data;
+
+ explicit C_RefreshTags(AsyncOpTracker &async_op_tracker)
+ : async_op_tracker(async_op_tracker) {
+ async_op_tracker.start_op();
+ }
+ ~C_RefreshTags() override {
+ async_op_tracker.finish_op();
+ }
+
+ void finish(int r) override {
+ on_finish->complete(r);
+ }
+};
+
+template <typename I>
+void Journal<I>::handle_metadata_updated() {
+ CephContext *cct = m_image_ctx.cct;
+ std::lock_guard locker{m_lock};
+
+ if (m_state != STATE_READY && !is_journal_replaying(m_lock)) {
+ return;
+ } else if (is_tag_owner(m_lock)) {
+ ldout(cct, 20) << this << " " << __func__ << ": primary image" << dendl;
+ return;
+ } else if (m_listeners.empty()) {
+ ldout(cct, 20) << this << " " << __func__ << ": no listeners" << dendl;
+ return;
+ }
+
+ uint64_t refresh_sequence = ++m_refresh_sequence;
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "refresh_sequence=" << refresh_sequence << dendl;
+
+ // pull the most recent tags from the journal, decode, and
+ // update the internal tag state
+ C_RefreshTags *refresh_ctx = new C_RefreshTags(m_async_journal_op_tracker);
+ refresh_ctx->on_finish = new LambdaContext(
+ [this, refresh_sequence, refresh_ctx](int r) {
+ handle_refresh_metadata(refresh_sequence, refresh_ctx->tag_tid,
+ refresh_ctx->tag_data, r);
+ });
+ C_DecodeTags *decode_tags_ctx = new C_DecodeTags(
+ cct, &refresh_ctx->lock, &refresh_ctx->tag_tid,
+ &refresh_ctx->tag_data, refresh_ctx);
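+  // list tags starting after the predecessor tag so the current tag is
+  // included in the refresh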
+ m_journaler->get_tags(m_tag_tid == 0 ? 0 : m_tag_tid - 1, m_tag_class,
+ &decode_tags_ctx->tags, decode_tags_ctx);
+}
+
+template <typename I>
+void Journal<I>::handle_refresh_metadata(uint64_t refresh_sequence,
+ uint64_t tag_tid,
+ journal::TagData tag_data, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ std::unique_lock locker{m_lock};
+
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": failed to refresh metadata: "
+ << cpp_strerror(r) << dendl;
+ return;
+ } else if (m_state != STATE_READY && !is_journal_replaying(m_lock)) {
+ return;
+ } else if (refresh_sequence != m_refresh_sequence) {
+ // another, more up-to-date refresh is in-flight
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "refresh_sequence=" << refresh_sequence << ", "
+ << "tag_tid=" << tag_tid << ", "
+ << "tag_data=" << tag_data << dendl;
+ m_listener_cond.wait(locker, [this] { return !m_listener_notify; });
+
+ bool was_tag_owner = is_tag_owner(m_lock);
+ if (m_tag_tid < tag_tid) {
+ m_tag_tid = tag_tid;
+ m_tag_data = tag_data;
+ }
+ bool promoted_to_primary = (!was_tag_owner && is_tag_owner(m_lock));
+
+ bool resync_requested = false;
+ r = check_resync_requested(&resync_requested);
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to check if a resync was requested" << dendl;
+ return;
+ }
+
+ Listeners listeners(m_listeners);
+ m_listener_notify = true;
+ locker.unlock();
+
+ if (promoted_to_primary) {
+ for (auto listener : listeners) {
+ listener->handle_promoted();
+ }
+ } else if (resync_requested) {
+ for (auto listener : listeners) {
+ listener->handle_resync();
+ }
+ }
+
+ locker.lock();
+ m_listener_notify = false;
+ m_listener_cond.notify_all();
+}
+
+template <typename I>
+void Journal<I>::add_listener(journal::Listener *listener) {
+ std::lock_guard locker{m_lock};
+ m_listeners.insert(listener);
+}
+
+template <typename I>
+void Journal<I>::remove_listener(journal::Listener *listener) {
+ std::unique_lock locker{m_lock};
+ m_listener_cond.wait(locker, [this] { return !m_listener_notify; });
+ m_listeners.erase(listener);
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::Journal<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h
new file mode 100644
index 000000000..406c0e34c
--- /dev/null
+++ b/src/librbd/Journal.h
@@ -0,0 +1,375 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_H
+#define CEPH_LIBRBD_JOURNAL_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "include/interval_set.h"
+#include "include/rados/librados_fwd.hpp"
+#include "common/AsyncOpTracker.h"
+#include "common/Cond.h"
+#include "common/Timer.h"
+#include "common/RefCountedObj.h"
+#include "journal/Future.h"
+#include "journal/JournalMetadataListener.h"
+#include "journal/ReplayEntry.h"
+#include "journal/ReplayHandler.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <atomic>
+#include <unordered_map>
+
+class ContextWQ;
+namespace journal { class Journaler; }
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace journal { template <typename> class Replay; }
+
+template <typename ImageCtxT = ImageCtx>
+class Journal : public RefCountedObject {
+public:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UNINITIALIZED ---> INITIALIZING ---> REPLAYING ------> FLUSHING ---> READY
+ * | * . ^ * . * |
+ * | * . | * . * |
+ * | * . | (error) * . . . . . . . * |
+ * | * . | * . * |
+ * | * . | v . * |
+ * | * . | FLUSHING_RESTART . * |
+ * | * . | | . * |
+ * | * . | | . * |
+ * | * . | v . * v
+ * | * . | RESTARTING < * * * * * STOPPING
+ * | * . | | . |
+ * | * . | | . |
+ * | * * * * * * . \-------------/ . |
+ * | * (error) . . |
+ * | * . . . . . . . . . . . . . . . . |
+ * | * . . |
+ * | v v v |
+ * | CLOSED <----- CLOSING <---------------------------------------/
+ * | |
+ * | v
+ * \---> <finish>
+ *
+ * @endverbatim
+ */
+ enum State {
+ STATE_UNINITIALIZED,
+ STATE_INITIALIZING,
+ STATE_REPLAYING,
+ STATE_FLUSHING_RESTART,
+ STATE_RESTARTING_REPLAY,
+ STATE_FLUSHING_REPLAY,
+ STATE_READY,
+ STATE_STOPPING,
+ STATE_CLOSING,
+ STATE_CLOSED
+ };
+
+ static const std::string IMAGE_CLIENT_ID;
+ static const std::string LOCAL_MIRROR_UUID;
+ static const std::string ORPHAN_MIRROR_UUID;
+
+ Journal(ImageCtxT &image_ctx);
+ ~Journal();
+
+ static void get_work_queue(CephContext *cct, ContextWQ **work_queue);
+
+ static bool is_journal_supported(ImageCtxT &image_ctx);
+ static int create(librados::IoCtx &io_ctx, const std::string &image_id,
+ uint8_t order, uint8_t splay_width,
+ const std::string &object_pool);
+ static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
+ static int reset(librados::IoCtx &io_ctx, const std::string &image_id);
+
+ static void is_tag_owner(ImageCtxT *image_ctx, bool *is_tag_owner,
+ Context *on_finish);
+ static void is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
+ bool *is_tag_owner, asio::ContextWQ *op_work_queue,
+ Context *on_finish);
+ static void get_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
+ std::string *mirror_uuid,
+ asio::ContextWQ *op_work_queue, Context *on_finish);
+ static int request_resync(ImageCtxT *image_ctx);
+ static void promote(ImageCtxT *image_ctx, Context *on_finish);
+ static void demote(ImageCtxT *image_ctx, Context *on_finish);
+
+ bool is_journal_ready() const;
+ bool is_journal_replaying() const;
+ bool is_journal_appending() const;
+
+ void wait_for_journal_ready(Context *on_ready);
+
+ void open(Context *on_finish);
+ void close(Context *on_finish);
+
+ bool is_tag_owner() const;
+ uint64_t get_tag_tid() const;
+ journal::TagData get_tag_data() const;
+
+ void allocate_local_tag(Context *on_finish);
+ void allocate_tag(const std::string &mirror_uuid,
+ const journal::TagPredecessor &predecessor,
+ Context *on_finish);
+
+ void flush_commit_position(Context *on_finish);
+
+ void user_flushed();
+
+ uint64_t append_write_event(uint64_t offset, size_t length,
+ const bufferlist &bl,
+ bool flush_entry);
+ uint64_t append_io_event(journal::EventEntry &&event_entry,
+ uint64_t offset, size_t length,
+ bool flush_entry, int filter_ret_val);
+ void commit_io_event(uint64_t tid, int r);
+ void commit_io_event_extent(uint64_t tid, uint64_t offset, uint64_t length,
+ int r);
+
+ void append_op_event(uint64_t op_tid, journal::EventEntry &&event_entry,
+ Context *on_safe);
+ void commit_op_event(uint64_t tid, int r, Context *on_safe);
+ void replay_op_ready(uint64_t op_tid, Context *on_resume);
+
+ void flush_event(uint64_t tid, Context *on_safe);
+ void wait_event(uint64_t tid, Context *on_safe);
+
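+  // op tids start at 1; tid 0 is reserved as invalid and the assert guards
+  // against wrap-around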
+ uint64_t allocate_op_tid() {
+ uint64_t op_tid = ++m_op_tid;
+ ceph_assert(op_tid != 0);
+ return op_tid;
+ }
+
+ void start_external_replay(journal::Replay<ImageCtxT> **journal_replay,
+ Context *on_start);
+ void stop_external_replay();
+
+ void add_listener(journal::Listener *listener);
+ void remove_listener(journal::Listener *listener);
+
+ int is_resync_requested(bool *do_resync);
+
+ inline ContextWQ *get_work_queue() {
+ return m_work_queue;
+ }
+
+private:
+ ImageCtxT &m_image_ctx;
+
+ // mock unit testing support
+ typedef journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+ typedef typename TypeTraits::Future Future;
+ typedef typename TypeTraits::ReplayEntry ReplayEntry;
+
+ typedef std::list<bufferlist> Bufferlists;
+ typedef std::list<Context *> Contexts;
+ typedef std::list<Future> Futures;
+ typedef interval_set<uint64_t> ExtentInterval;
+
+ struct Event {
+ Futures futures;
+ Contexts on_safe_contexts;
+ ExtentInterval pending_extents;
+ int filter_ret_val = 0;
+ bool committed_io = false;
+ bool safe = false;
+ int ret_val = 0;
+
+ Event() {
+ }
+ Event(const Futures &_futures, uint64_t offset, size_t length,
+ int filter_ret_val)
+ : futures(_futures), filter_ret_val(filter_ret_val) {
+ if (length > 0) {
+ pending_extents.insert(offset, length);
+ }
+ }
+ };
+
+ typedef std::unordered_map<uint64_t, Event> Events;
+ typedef std::unordered_map<uint64_t, Future> TidToFutures;
+
+ struct C_IOEventSafe : public Context {
+ Journal *journal;
+ uint64_t tid;
+
+ C_IOEventSafe(Journal *_journal, uint64_t _tid)
+ : journal(_journal), tid(_tid) {
+ }
+
+ void finish(int r) override {
+ journal->handle_io_event_safe(r, tid);
+ }
+ };
+
+ struct C_OpEventSafe : public Context {
+ Journal *journal;
+ uint64_t tid;
+ Future op_start_future;
+ Future op_finish_future;
+ Context *on_safe;
+
+ C_OpEventSafe(Journal *journal, uint64_t tid, const Future &op_start_future,
+ const Future &op_finish_future, Context *on_safe)
+ : journal(journal), tid(tid), op_start_future(op_start_future),
+ op_finish_future(op_finish_future), on_safe(on_safe) {
+ }
+
+ void finish(int r) override {
+ journal->handle_op_event_safe(r, tid, op_start_future, op_finish_future,
+ on_safe);
+ }
+ };
+
+ struct C_ReplayProcessSafe : public Context {
+ Journal *journal;
+ ReplayEntry replay_entry;
+
+ C_ReplayProcessSafe(Journal *journal, ReplayEntry &&replay_entry) :
+ journal(journal), replay_entry(std::move(replay_entry)) {
+ }
+ void finish(int r) override {
+ journal->handle_replay_process_safe(replay_entry, r);
+ }
+ };
+
+ struct ReplayHandler : public ::journal::ReplayHandler {
+ Journal *journal;
+ ReplayHandler(Journal *_journal) : journal(_journal) {
+ }
+
+ void handle_entries_available() override {
+ journal->handle_replay_ready();
+ }
+ void handle_complete(int r) override {
+ journal->handle_replay_complete(r);
+ }
+ };
+
+ ContextWQ *m_work_queue = nullptr;
+ SafeTimer *m_timer = nullptr;
+ ceph::mutex *m_timer_lock = nullptr;
+
+ Journaler *m_journaler;
+ mutable ceph::mutex m_lock = ceph::make_mutex("Journal<I>::m_lock");
+ State m_state;
+ uint64_t m_max_append_size = 0;
+ uint64_t m_tag_class = 0;
+ uint64_t m_tag_tid = 0;
+ journal::ImageClientMeta m_client_meta;
+ journal::TagData m_tag_data;
+
+ int m_error_result;
+ Contexts m_wait_for_state_contexts;
+
+ ReplayHandler m_replay_handler;
+ bool m_close_pending;
+
+ ceph::mutex m_event_lock = ceph::make_mutex("Journal<I>::m_event_lock");
+ uint64_t m_event_tid;
+ Events m_events;
+
+ std::atomic<bool> m_user_flushed = false;
+
+ std::atomic<uint64_t> m_op_tid = { 0 };
+ TidToFutures m_op_futures;
+
+ bool m_processing_entry = false;
+ bool m_blocking_writes;
+
+ journal::Replay<ImageCtxT> *m_journal_replay;
+
+ AsyncOpTracker m_async_journal_op_tracker;
+
+ struct MetadataListener : public ::journal::JournalMetadataListener {
+ Journal<ImageCtxT> *journal;
+
+ MetadataListener(Journal<ImageCtxT> *journal) : journal(journal) { }
+
+ void handle_update(::journal::JournalMetadata *) override;
+ } m_metadata_listener;
+
+ typedef std::set<journal::Listener *> Listeners;
+ Listeners m_listeners;
+ ceph::condition_variable m_listener_cond;
+ bool m_listener_notify = false;
+
+ uint64_t m_refresh_sequence = 0;
+
+ bool is_journal_replaying(const ceph::mutex &) const;
+ bool is_tag_owner(const ceph::mutex &) const;
+
+ uint64_t append_io_events(journal::EventType event_type,
+ const Bufferlists &bufferlists,
+ uint64_t offset, size_t length, bool flush_entry,
+ int filter_ret_val);
+ Future wait_event(ceph::mutex &lock, uint64_t tid, Context *on_safe);
+
+ void create_journaler();
+ void destroy_journaler(int r);
+ void recreate_journaler(int r);
+
+ void complete_event(typename Events::iterator it, int r);
+
+ void start_append();
+
+ void handle_open(int r);
+
+ void handle_replay_ready();
+ void handle_replay_complete(int r);
+ void handle_replay_process_ready(int r);
+ void handle_replay_process_safe(ReplayEntry replay_entry, int r);
+
+ void handle_start_external_replay(int r,
+ journal::Replay<ImageCtxT> **journal_replay,
+ Context *on_finish);
+
+ void handle_flushing_restart(int r);
+ void handle_flushing_replay();
+
+ void handle_recording_stopped(int r);
+
+ void handle_journal_destroyed(int r);
+
+ void handle_io_event_safe(int r, uint64_t tid);
+ void handle_op_event_safe(int r, uint64_t tid, const Future &op_start_future,
+ const Future &op_finish_future, Context *on_safe);
+
+ void stop_recording();
+
+ void transition_state(State state, int r);
+
+ bool is_steady_state() const;
+ void wait_for_steady_state(Context *on_state);
+
+ int check_resync_requested(bool *do_resync);
+
+ void handle_metadata_updated();
+ void handle_refresh_metadata(uint64_t refresh_sequence, uint64_t tag_tid,
+ journal::TagData tag_data, int r);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::Journal<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_H
diff --git a/src/librbd/LibrbdAdminSocketHook.cc b/src/librbd/LibrbdAdminSocketHook.cc
new file mode 100644
index 000000000..f91bda3f0
--- /dev/null
+++ b/src/librbd/LibrbdAdminSocketHook.cc
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+
+#include "librbd/ImageCtx.h"
+#include "librbd/LibrbdAdminSocketHook.h"
+#include "librbd/internal.h"
+#include "librbd/api/Io.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbdadminsocket: "
+
+namespace librbd {
+
+class LibrbdAdminSocketCommand {
+public:
+ virtual ~LibrbdAdminSocketCommand() {}
+ virtual int call(Formatter *f) = 0;
+};
+
+class FlushCacheCommand : public LibrbdAdminSocketCommand {
+public:
+ explicit FlushCacheCommand(ImageCtx *ictx) : ictx(ictx) {}
+
+ int call(Formatter *f) override {
+ return api::Io<>::flush(*ictx);
+ }
+
+private:
+ ImageCtx *ictx;
+};
+
+class InvalidateCacheCommand : public LibrbdAdminSocketCommand {
+public:
+ explicit InvalidateCacheCommand(ImageCtx *ictx) : ictx(ictx) {}
+
+ int call(Formatter *f) override {
+ return invalidate_cache(ictx);
+ }
+
+private:
+ ImageCtx *ictx;
+};
+
+LibrbdAdminSocketHook::LibrbdAdminSocketHook(ImageCtx *ictx) :
+ admin_socket(ictx->cct->get_admin_socket()) {
+
+ std::string command;
+ std::string imagename;
+ int r;
+
+ imagename = ictx->md_ctx.get_pool_name() + "/" + ictx->name;
+ command = "rbd cache flush " + imagename;
+
+ r = admin_socket->register_command(command, this,
+ "flush rbd image " + imagename +
+ " cache");
+ if (r == 0) {
+ commands[command] = new FlushCacheCommand(ictx);
+ }
+
+ command = "rbd cache invalidate " + imagename;
+ r = admin_socket->register_command(command, this,
+ "invalidate rbd image " + imagename +
+ " cache");
+ if (r == 0) {
+ commands[command] = new InvalidateCacheCommand(ictx);
+ }
+}
+
+LibrbdAdminSocketHook::~LibrbdAdminSocketHook() {
+ (void)admin_socket->unregister_commands(this);
+ for (auto& command : commands) {
+ delete command.second;
+ }
+}
+
+int LibrbdAdminSocketHook::call(std::string_view command,
+ const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& errss,
+ bufferlist& out) {
+ Commands::const_iterator i = commands.find(command);
+ ceph_assert(i != commands.end());
+ return i->second->call(f);
+}
+
+} // namespace librbd
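Extending the hook follows the same pattern as the two commands above: subclass LibrbdAdminSocketCommand and register the command string in the constructor, keeping the pointer only when register_command() returns 0. A hypothetical third command as a sketch; the class name and its output are illustrative, not librbd API:

    // Hypothetical command following the FlushCacheCommand pattern above.
    class ReportStatusCommand : public LibrbdAdminSocketCommand {
    public:
      explicit ReportStatusCommand(ImageCtx *ictx) : ictx(ictx) {}

      int call(Formatter *f) override {
        // a real command would emit structured output through the Formatter
        f->open_object_section("status");
        f->dump_string("image", ictx->name);
        f->close_section();
        return 0;
      }

    private:
      ImageCtx *ictx;
    };

Registration in LibrbdAdminSocketHook's constructor would mirror the flush/invalidate blocks above, inserting the new instance into the commands map only on successful registration.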
diff --git a/src/librbd/LibrbdAdminSocketHook.h b/src/librbd/LibrbdAdminSocketHook.h
new file mode 100644
index 000000000..d07a9280e
--- /dev/null
+++ b/src/librbd/LibrbdAdminSocketHook.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H
+#define CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H
+
+#include <map>
+
+#include "common/admin_socket.h"
+
+namespace librbd {
+
+ struct ImageCtx;
+ class LibrbdAdminSocketCommand;
+
+ class LibrbdAdminSocketHook : public AdminSocketHook {
+ public:
+ explicit LibrbdAdminSocketHook(ImageCtx *ictx);
+ ~LibrbdAdminSocketHook() override;
+
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& errss,
+ bufferlist& out) override;
+
+ private:
+ typedef std::map<std::string, LibrbdAdminSocketCommand*,
+ std::less<>> Commands;
+
+ AdminSocket *admin_socket;
+ Commands commands;
+ };
+}
+
+#endif
diff --git a/src/librbd/ManagedLock.cc b/src/librbd/ManagedLock.cc
new file mode 100644
index 000000000..53a0cf911
--- /dev/null
+++ b/src/librbd/ManagedLock.cc
@@ -0,0 +1,854 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ManagedLock.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Watcher.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/managed_lock/AcquireRequest.h"
+#include "librbd/managed_lock/BreakRequest.h"
+#include "librbd/managed_lock/GetLockerRequest.h"
+#include "librbd/managed_lock/ReleaseRequest.h"
+#include "librbd/managed_lock/ReacquireRequest.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/managed_lock/Utils.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ManagedLock: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+using std::string;
+using namespace managed_lock;
+
+namespace {
+
+template <typename R>
+struct C_SendLockRequest : public Context {
+ R* request;
+ explicit C_SendLockRequest(R* request) : request(request) {
+ }
+ void finish(int r) override {
+ request->send();
+ }
+};
+
+struct C_Tracked : public Context {
+ AsyncOpTracker &tracker;
+ Context *ctx;
+ C_Tracked(AsyncOpTracker &tracker, Context *ctx)
+ : tracker(tracker), ctx(ctx) {
+ tracker.start_op();
+ }
+ ~C_Tracked() override {
+ tracker.finish_op();
+ }
+ void finish(int r) override {
+ ctx->complete(r);
+ }
+};
+
+} // anonymous namespace
+
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+using managed_lock::util::decode_lock_cookie;
+using managed_lock::util::encode_lock_cookie;
+
+template <typename I>
+ManagedLock<I>::ManagedLock(librados::IoCtx &ioctx, AsioEngine& asio_engine,
+ const string& oid, Watcher *watcher, Mode mode,
+ bool blocklist_on_break_lock,
+ uint32_t blocklist_expire_seconds)
+ : m_lock(ceph::make_mutex(unique_lock_name("librbd::ManagedLock<I>::m_lock", this))),
+ m_ioctx(ioctx), m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
+ m_asio_engine(asio_engine),
+ m_work_queue(asio_engine.get_work_queue()),
+ m_oid(oid),
+ m_watcher(watcher),
+ m_mode(mode),
+ m_blocklist_on_break_lock(blocklist_on_break_lock),
+ m_blocklist_expire_seconds(blocklist_expire_seconds),
+ m_state(STATE_UNLOCKED) {
+}
+
+template <typename I>
+ManagedLock<I>::~ManagedLock() {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_SHUTDOWN || m_state == STATE_UNLOCKED ||
+ m_state == STATE_UNINITIALIZED);
+ if (m_state == STATE_UNINITIALIZED) {
+ // never initialized -- ensure any in-flight ops are complete
+ // since we wouldn't expect shut_down to be invoked
+ C_SaferCond ctx;
+ m_async_op_tracker.wait_for_ops(&ctx);
+ ctx.wait();
+ }
+ ceph_assert(m_async_op_tracker.empty());
+}
+
+template <typename I>
+bool ManagedLock<I>::is_lock_owner() const {
+ std::lock_guard locker{m_lock};
+
+ return is_lock_owner(m_lock);
+}
+
+template <typename I>
+bool ManagedLock<I>::is_lock_owner(ceph::mutex &lock) const {
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ bool lock_owner;
+
+ switch (m_state) {
+ case STATE_LOCKED:
+ case STATE_REACQUIRING:
+ case STATE_PRE_SHUTTING_DOWN:
+ case STATE_POST_ACQUIRING:
+ case STATE_PRE_RELEASING:
+ lock_owner = true;
+ break;
+ default:
+ lock_owner = false;
+ break;
+ }
+
+ ldout(m_cct, 20) << "=" << lock_owner << dendl;
+ return lock_owner;
+}
+
+template <typename I>
+void ManagedLock<I>::shut_down(Context *on_shut_down) {
+ ldout(m_cct, 10) << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(!is_state_shutdown());
+
+ if (m_state == STATE_WAITING_FOR_REGISTER) {
+ // abort stalled acquire lock state
+ ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl;
+ Action active_action = get_active_action();
+ ceph_assert(active_action == ACTION_TRY_LOCK ||
+ active_action == ACTION_ACQUIRE_LOCK);
+ complete_active_action(STATE_UNLOCKED, -ESHUTDOWN);
+ }
+
+ execute_action(ACTION_SHUT_DOWN, on_shut_down);
+}
+
+template <typename I>
+void ManagedLock<I>::acquire_lock(Context *on_acquired) {
+ int r = 0;
+ {
+ std::lock_guard locker{m_lock};
+ if (is_state_shutdown()) {
+ r = -ESHUTDOWN;
+ } else if (m_state != STATE_LOCKED || !m_actions_contexts.empty()) {
+ ldout(m_cct, 10) << dendl;
+ execute_action(ACTION_ACQUIRE_LOCK, on_acquired);
+ return;
+ }
+ }
+
+ if (on_acquired != nullptr) {
+ on_acquired->complete(r);
+ }
+}
+
+template <typename I>
+void ManagedLock<I>::try_acquire_lock(Context *on_acquired) {
+ int r = 0;
+ {
+ std::lock_guard locker{m_lock};
+ if (is_state_shutdown()) {
+ r = -ESHUTDOWN;
+ } else if (m_state != STATE_LOCKED || !m_actions_contexts.empty()) {
+ ldout(m_cct, 10) << dendl;
+ execute_action(ACTION_TRY_LOCK, on_acquired);
+ return;
+ }
+ }
+
+ if (on_acquired != nullptr) {
+ on_acquired->complete(r);
+ }
+}
+
+template <typename I>
+void ManagedLock<I>::release_lock(Context *on_released) {
+ int r = 0;
+ {
+ std::lock_guard locker{m_lock};
+ if (is_state_shutdown()) {
+ r = -ESHUTDOWN;
+ } else if (m_state != STATE_UNLOCKED || !m_actions_contexts.empty()) {
+ ldout(m_cct, 10) << dendl;
+ execute_action(ACTION_RELEASE_LOCK, on_released);
+ return;
+ }
+ }
+
+ if (on_released != nullptr) {
+ on_released->complete(r);
+ }
+}
+
+template <typename I>
+void ManagedLock<I>::reacquire_lock(Context *on_reacquired) {
+ {
+ std::lock_guard locker{m_lock};
+
+ if (m_state == STATE_WAITING_FOR_REGISTER) {
+ // restart the acquire lock process now that watch is valid
+ ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl;
+ Action active_action = get_active_action();
+ ceph_assert(active_action == ACTION_TRY_LOCK ||
+ active_action == ACTION_ACQUIRE_LOCK);
+ execute_next_action();
+ } else if (!is_state_shutdown() &&
+ (m_state == STATE_LOCKED ||
+ m_state == STATE_ACQUIRING ||
+ m_state == STATE_POST_ACQUIRING ||
+ m_state == STATE_WAITING_FOR_LOCK)) {
+ // interlock the lock operation with other state ops
+ ldout(m_cct, 10) << dendl;
+ execute_action(ACTION_REACQUIRE_LOCK, on_reacquired);
+ return;
+ }
+ }
+
+ // ignore request if shut down or not in a lock-related state
+ if (on_reacquired != nullptr) {
+ on_reacquired->complete(0);
+ }
+}
+
+template <typename I>
+void ManagedLock<I>::get_locker(managed_lock::Locker *locker,
+ Context *on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ int r;
+ {
+ std::lock_guard l{m_lock};
+ if (is_state_shutdown()) {
+ r = -ESHUTDOWN;
+ } else {
+ on_finish = new C_Tracked(m_async_op_tracker, on_finish);
+ auto req = managed_lock::GetLockerRequest<I>::create(
+ m_ioctx, m_oid, m_mode == EXCLUSIVE, locker, on_finish);
+ req->send();
+ return;
+ }
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+void ManagedLock<I>::break_lock(const managed_lock::Locker &locker,
+ bool force_break_lock, Context *on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ int r;
+ {
+ std::lock_guard l{m_lock};
+ if (is_state_shutdown()) {
+ r = -ESHUTDOWN;
+ } else if (is_lock_owner(m_lock)) {
+ r = -EBUSY;
+ } else {
+ on_finish = new C_Tracked(m_async_op_tracker, on_finish);
+ auto req = managed_lock::BreakRequest<I>::create(
+ m_ioctx, m_asio_engine, m_oid, locker, m_mode == EXCLUSIVE,
+ m_blocklist_on_break_lock, m_blocklist_expire_seconds, force_break_lock,
+ on_finish);
+ req->send();
+ return;
+ }
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+int ManagedLock<I>::assert_header_locked() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectReadOperation op;
+ {
+ std::lock_guard locker{m_lock};
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME,
+ (m_mode == EXCLUSIVE ? ClsLockType::EXCLUSIVE :
+ ClsLockType::SHARED),
+ m_cookie,
+ managed_lock::util::get_watcher_lock_tag());
+ }
+
+ int r = m_ioctx.operate(m_oid, &op, nullptr);
+ if (r < 0) {
+ if (r == -EBLOCKLISTED) {
+ ldout(m_cct, 5) << "client is not lock owner -- client blocklisted"
+ << dendl;
+ } else if (r == -ENOENT) {
+ ldout(m_cct, 5) << "client is not lock owner -- no lock detected"
+ << dendl;
+ } else if (r == -EBUSY) {
+ ldout(m_cct, 5) << "client is not lock owner -- owned by different client"
+ << dendl;
+ } else {
+ lderr(m_cct) << "failed to verify lock ownership: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void ManagedLock<I>::shutdown_handler(int r, Context *on_finish) {
+ on_finish->complete(r);
+}
+
+template <typename I>
+void ManagedLock<I>::pre_acquire_lock_handler(Context *on_finish) {
+ on_finish->complete(0);
+}
+
+template <typename I>
+void ManagedLock<I>::post_acquire_lock_handler(int r, Context *on_finish) {
+ on_finish->complete(r);
+}
+
+template <typename I>
+void ManagedLock<I>::pre_release_lock_handler(bool shutting_down,
+ Context *on_finish) {
+ on_finish->complete(0);
+}
+
+template <typename I>
+void ManagedLock<I>::post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish) {
+ on_finish->complete(r);
+}
+
+template <typename I>
+void ManagedLock<I>::post_reacquire_lock_handler(int r, Context *on_finish) {
+ on_finish->complete(r);
+}
+
+template <typename I>
+bool ManagedLock<I>::is_transition_state() const {
+ switch (m_state) {
+ case STATE_ACQUIRING:
+ case STATE_WAITING_FOR_REGISTER:
+ case STATE_REACQUIRING:
+ case STATE_RELEASING:
+ case STATE_PRE_SHUTTING_DOWN:
+ case STATE_SHUTTING_DOWN:
+ case STATE_INITIALIZING:
+ case STATE_WAITING_FOR_LOCK:
+ case STATE_POST_ACQUIRING:
+ case STATE_PRE_RELEASING:
+ return true;
+ case STATE_UNLOCKED:
+ case STATE_LOCKED:
+ case STATE_SHUTDOWN:
+ case STATE_UNINITIALIZED:
+ break;
+ }
+ return false;
+}
+
+template <typename I>
+void ManagedLock<I>::append_context(Action action, Context *ctx) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ for (auto &action_ctxs : m_actions_contexts) {
+ if (action == action_ctxs.first) {
+ if (ctx != nullptr) {
+ action_ctxs.second.push_back(ctx);
+ }
+ return;
+ }
+ }
+
+ Contexts contexts;
+ if (ctx != nullptr) {
+ contexts.push_back(ctx);
+ }
+ m_actions_contexts.push_back({action, std::move(contexts)});
+}
+
+template <typename I>
+void ManagedLock<I>::execute_action(Action action, Context *ctx) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ append_context(action, ctx);
+ if (!is_transition_state()) {
+ execute_next_action();
+ }
+}
+
+template <typename I>
+void ManagedLock<I>::execute_next_action() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_actions_contexts.empty());
+ switch (get_active_action()) {
+ case ACTION_ACQUIRE_LOCK:
+ case ACTION_TRY_LOCK:
+ send_acquire_lock();
+ break;
+ case ACTION_REACQUIRE_LOCK:
+ send_reacquire_lock();
+ break;
+ case ACTION_RELEASE_LOCK:
+ send_release_lock();
+ break;
+ case ACTION_SHUT_DOWN:
+ send_shutdown();
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+}
+
+template <typename I>
+typename ManagedLock<I>::Action ManagedLock<I>::get_active_action() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_actions_contexts.empty());
+ return m_actions_contexts.front().first;
+}
+
+template <typename I>
+void ManagedLock<I>::complete_active_action(State next_state, int r) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_actions_contexts.empty());
+
+ ActionContexts action_contexts(std::move(m_actions_contexts.front()));
+ m_actions_contexts.pop_front();
+ m_state = next_state;
+
+ m_lock.unlock();
+ for (auto ctx : action_contexts.second) {
+ ctx->complete(r);
+ }
+ m_lock.lock();
+
+ if (!is_transition_state() && !m_actions_contexts.empty()) {
+ execute_next_action();
+ }
+}
+
+template <typename I>
+bool ManagedLock<I>::is_state_shutdown() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ switch (m_state) {
+ case STATE_PRE_SHUTTING_DOWN:
+ case STATE_SHUTTING_DOWN:
+ case STATE_SHUTDOWN:
+ return true;
+ default:
+ break;
+ }
+
+ return (!m_actions_contexts.empty() &&
+ m_actions_contexts.back().first == ACTION_SHUT_DOWN);
+}
+
+template <typename I>
+void ManagedLock<I>::send_acquire_lock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ if (m_state == STATE_LOCKED) {
+ complete_active_action(STATE_LOCKED, 0);
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ uint64_t watch_handle = m_watcher->get_watch_handle();
+ if (watch_handle == 0) {
+ lderr(m_cct) << "watcher not registered - delaying request" << dendl;
+ m_state = STATE_WAITING_FOR_REGISTER;
+
+ // shut down might race w/ release/re-acquire of the lock
+ if (is_state_shutdown()) {
+ complete_active_action(STATE_UNLOCKED, -ESHUTDOWN);
+ }
+ return;
+ }
+
+ m_state = STATE_ACQUIRING;
+ m_cookie = encode_lock_cookie(watch_handle);
+
+ m_work_queue->queue(new LambdaContext([this](int r) {
+ pre_acquire_lock_handler(create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_pre_acquire_lock>(this));
+ }));
+}
+
+template <typename I>
+void ManagedLock<I>::handle_pre_acquire_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ handle_acquire_lock(r);
+ return;
+ }
+
+ using managed_lock::AcquireRequest;
+ AcquireRequest<I>* req = AcquireRequest<I>::create(
+ m_ioctx, m_watcher, m_asio_engine, m_oid, m_cookie, m_mode == EXCLUSIVE,
+ m_blocklist_on_break_lock, m_blocklist_expire_seconds,
+ create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_acquire_lock>(this));
+ m_work_queue->queue(new C_SendLockRequest<AcquireRequest<I>>(req), 0);
+}
+
+template <typename I>
+void ManagedLock<I>::handle_acquire_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -EBUSY || r == -EAGAIN || r == -EROFS) {
+ ldout(m_cct, 5) << "unable to acquire exclusive lock" << dendl;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to acquire exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ } else {
+ ldout(m_cct, 5) << "successfully acquired exclusive lock" << dendl;
+ }
+
+ m_post_next_state = (r < 0 ? STATE_UNLOCKED : STATE_LOCKED);
+
+ m_work_queue->queue(new LambdaContext([this, r](int ret) {
+ post_acquire_lock_handler(r, create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_post_acquire_lock>(this));
+ }));
+}
+
+template <typename I>
+void ManagedLock<I>::handle_post_acquire_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0 && m_post_next_state == STATE_LOCKED) {
+ // release_lock without calling pre and post handlers
+ revert_to_unlock_state(r);
+ } else if (r != -ECANCELED) {
+ // fail the lock request
+ complete_active_action(m_post_next_state, r);
+ }
+}
+
+template <typename I>
+void ManagedLock<I>::revert_to_unlock_state(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ using managed_lock::ReleaseRequest;
+ ReleaseRequest<I>* req = ReleaseRequest<I>::create(m_ioctx, m_watcher,
+ m_work_queue, m_oid, m_cookie,
+ new LambdaContext([this, r](int ret) {
+ std::lock_guard locker{m_lock};
+ ceph_assert(ret == 0);
+ complete_active_action(STATE_UNLOCKED, r);
+ }));
+ m_work_queue->queue(new C_SendLockRequest<ReleaseRequest<I>>(req));
+}
+
+template <typename I>
+void ManagedLock<I>::send_reacquire_lock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ if (m_state != STATE_LOCKED) {
+ complete_active_action(m_state, 0);
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+ m_state = STATE_REACQUIRING;
+
+ uint64_t watch_handle = m_watcher->get_watch_handle();
+ if (watch_handle == 0) {
+ // watch (re)failed while recovering
+ lderr(m_cct) << "aborting reacquire due to invalid watch handle"
+ << dendl;
+
+ // treat double-watch failure as a lost lock and invoke the
+ // release/acquire handlers
+ release_acquire_lock();
+ complete_active_action(STATE_LOCKED, 0);
+ return;
+ }
+
+ m_new_cookie = encode_lock_cookie(watch_handle);
+ if (m_cookie == m_new_cookie && m_blocklist_on_break_lock) {
+ ldout(m_cct, 10) << "skipping reacquire since cookie still valid"
+ << dendl;
+ auto ctx = create_context_callback<
+ ManagedLock, &ManagedLock<I>::handle_no_op_reacquire_lock>(this);
+ post_reacquire_lock_handler(0, ctx);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ ManagedLock, &ManagedLock<I>::handle_reacquire_lock>(this);
+ ctx = new LambdaContext([this, ctx](int r) {
+ post_reacquire_lock_handler(r, ctx);
+ });
+
+ using managed_lock::ReacquireRequest;
+ ReacquireRequest<I>* req = ReacquireRequest<I>::create(m_ioctx, m_oid,
+ m_cookie, m_new_cookie, m_mode == EXCLUSIVE, ctx);
+ m_work_queue->queue(new C_SendLockRequest<ReacquireRequest<I>>(req));
+}
+
+template <typename I>
+void ManagedLock<I>::handle_reacquire_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_REACQUIRING);
+
+ if (r < 0) {
+ if (r == -EOPNOTSUPP) {
+ ldout(m_cct, 10) << "updating lock is not supported" << dendl;
+ } else {
+ lderr(m_cct) << "failed to update lock cookie: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ release_acquire_lock();
+ } else {
+ m_cookie = m_new_cookie;
+ }
+
+ complete_active_action(STATE_LOCKED, 0);
+}
+
+template <typename I>
+void ManagedLock<I>::handle_no_op_reacquire_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+ ceph_assert(m_state == STATE_REACQUIRING);
+ ceph_assert(r >= 0);
+ complete_active_action(STATE_LOCKED, 0);
+}
+
+template <typename I>
+void ManagedLock<I>::release_acquire_lock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ if (!is_state_shutdown()) {
+ // queue a release and re-acquire of the lock since cookie cannot
+ // be updated on older OSDs
+ execute_action(ACTION_RELEASE_LOCK, nullptr);
+
+ ceph_assert(!m_actions_contexts.empty());
+ ActionContexts &action_contexts(m_actions_contexts.front());
+
+ // reacquire completes when the request lock completes
+ Contexts contexts;
+ std::swap(contexts, action_contexts.second);
+ if (contexts.empty()) {
+ execute_action(ACTION_ACQUIRE_LOCK, nullptr);
+ } else {
+ for (auto ctx : contexts) {
+ execute_action(ACTION_ACQUIRE_LOCK, ctx);
+ }
+ }
+ }
+}
+
+template <typename I>
+void ManagedLock<I>::send_release_lock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ if (m_state == STATE_UNLOCKED) {
+ complete_active_action(STATE_UNLOCKED, 0);
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+ m_state = STATE_PRE_RELEASING;
+
+ m_work_queue->queue(new LambdaContext([this](int r) {
+ pre_release_lock_handler(false, create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_pre_release_lock>(this));
+ }));
+}
+
+template <typename I>
+void ManagedLock<I>::handle_pre_release_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_PRE_RELEASING);
+ m_state = STATE_RELEASING;
+ }
+
+ if (r < 0) {
+ handle_release_lock(r);
+ return;
+ }
+
+ using managed_lock::ReleaseRequest;
+ ReleaseRequest<I>* req = ReleaseRequest<I>::create(m_ioctx, m_watcher,
+ m_work_queue, m_oid, m_cookie,
+ create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_release_lock>(this));
+ m_work_queue->queue(new C_SendLockRequest<ReleaseRequest<I>>(req), 0);
+}
+
+template <typename I>
+void ManagedLock<I>::handle_release_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_state == STATE_RELEASING);
+
+ if (r >= 0 || r == -EBLOCKLISTED || r == -ENOENT) {
+ m_cookie = "";
+ m_post_next_state = STATE_UNLOCKED;
+ } else {
+ m_post_next_state = STATE_LOCKED;
+ }
+
+ m_work_queue->queue(new LambdaContext([this, r](int ret) {
+ post_release_lock_handler(false, r, create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_post_release_lock>(this));
+ }));
+}
+
+template <typename I>
+void ManagedLock<I>::handle_post_release_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+ complete_active_action(m_post_next_state, r);
+}
+
+template <typename I>
+void ManagedLock<I>::send_shutdown() {
+ ldout(m_cct, 10) << dendl;
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ if (m_state == STATE_UNLOCKED) {
+ m_state = STATE_SHUTTING_DOWN;
+ m_work_queue->queue(new LambdaContext([this](int r) {
+ shutdown_handler(r, create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_shutdown>(this));
+ }));
+ return;
+ }
+
+ ceph_assert(m_state == STATE_LOCKED);
+ m_state = STATE_PRE_SHUTTING_DOWN;
+
+ m_lock.unlock();
+ m_work_queue->queue(new C_ShutDownRelease(this), 0);
+ m_lock.lock();
+}
+
+template <typename I>
+void ManagedLock<I>::handle_shutdown(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ wait_for_tracked_ops(r);
+}
+
+template <typename I>
+void ManagedLock<I>::send_shutdown_release() {
+ ldout(m_cct, 10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_work_queue->queue(new LambdaContext([this](int r) {
+ pre_release_lock_handler(true, create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_shutdown_pre_release>(this));
+ }));
+}
+
+template <typename I>
+void ManagedLock<I>::handle_shutdown_pre_release(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ std::string cookie;
+ {
+ std::lock_guard locker{m_lock};
+ cookie = m_cookie;
+
+ ceph_assert(m_state == STATE_PRE_SHUTTING_DOWN);
+ m_state = STATE_SHUTTING_DOWN;
+ }
+
+ using managed_lock::ReleaseRequest;
+ ReleaseRequest<I>* req = ReleaseRequest<I>::create(m_ioctx, m_watcher,
+ m_work_queue, m_oid, cookie,
+ new LambdaContext([this, r](int l) {
+ int rst = r < 0 ? r : l;
+ post_release_lock_handler(true, rst, create_context_callback<
+ ManagedLock<I>, &ManagedLock<I>::handle_shutdown_post_release>(this));
+ }));
+ req->send();
+}
+
+template <typename I>
+void ManagedLock<I>::handle_shutdown_post_release(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ wait_for_tracked_ops(r);
+}
+
+template <typename I>
+void ManagedLock<I>::wait_for_tracked_ops(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ Context *ctx = new LambdaContext([this, r](int ret) {
+ complete_shutdown(r);
+ });
+
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void ManagedLock<I>::complete_shutdown(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to shut down lock: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ ActionContexts action_contexts;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_actions_contexts.size() == 1);
+
+ action_contexts = std::move(m_actions_contexts.front());
+ m_actions_contexts.pop_front();
+ m_state = STATE_SHUTDOWN;
+ }
+
+ // expect to be destroyed after firing callback
+ for (auto ctx : action_contexts.second) {
+ ctx->complete(r);
+ }
+}
+
+} // namespace librbd
+
+template class librbd::ManagedLock<librbd::ImageCtx>;
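A minimal usage sketch for the class above: drive the asynchronous acquire/release API synchronously with C_SaferCond (from common/Cond.h, already included in this file). The wrapper function is illustrative; the surrounding setup (IoCtx, AsioEngine, Watcher) is assumed to exist.

    // Synchronous wrapper sketch around ManagedLock's async API.
    int acquire_and_release(librbd::ManagedLock<librbd::ImageCtx> *lock) {
      C_SaferCond acquire_ctx;
      lock->acquire_lock(&acquire_ctx);
      int r = acquire_ctx.wait();            // -ESHUTDOWN if shutting down
      if (r < 0) {
        return r;
      }
      ceph_assert(lock->is_lock_owner());

      C_SaferCond release_ctx;
      lock->release_lock(&release_ctx);
      return release_ctx.wait();
    }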
diff --git a/src/librbd/ManagedLock.h b/src/librbd/ManagedLock.h
new file mode 100644
index 000000000..09fc413c0
--- /dev/null
+++ b/src/librbd/ManagedLock.h
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_H
+#define CEPH_LIBRBD_MANAGED_LOCK_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/AsyncOpTracker.h"
+#include "cls/lock/cls_lock_types.h"
+#include "librbd/watcher/Types.h"
+#include "librbd/managed_lock/Types.h"
+#include <list>
+#include <string>
+#include <utility>
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+namespace managed_lock { struct Locker; }
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ManagedLock {
+private:
+ typedef watcher::Traits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Watcher Watcher;
+
+public:
+ static ManagedLock *create(librados::IoCtx& ioctx,
+ AsioEngine& asio_engine,
+ const std::string& oid, Watcher *watcher,
+ managed_lock::Mode mode,
+ bool blocklist_on_break_lock,
+ uint32_t blocklist_expire_seconds) {
+ return new ManagedLock(ioctx, asio_engine, oid, watcher, mode,
+ blocklist_on_break_lock, blocklist_expire_seconds);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ ManagedLock(librados::IoCtx& ioctx, AsioEngine& asio_engine,
+ const std::string& oid, Watcher *watcher,
+ managed_lock::Mode mode, bool blocklist_on_break_lock,
+ uint32_t blocklist_expire_seconds);
+ virtual ~ManagedLock();
+
+ bool is_lock_owner() const;
+
+ void shut_down(Context *on_shutdown);
+ void acquire_lock(Context *on_acquired);
+ void try_acquire_lock(Context *on_acquired);
+ void release_lock(Context *on_released);
+ void reacquire_lock(Context *on_reacquired);
+ void get_locker(managed_lock::Locker *locker, Context *on_finish);
+ void break_lock(const managed_lock::Locker &locker, bool force_break_lock,
+ Context *on_finish);
+
+ int assert_header_locked();
+
+ bool is_shutdown() const {
+ std::lock_guard l{m_lock};
+ return is_state_shutdown();
+ }
+
+protected:
+ mutable ceph::mutex m_lock;
+
+ inline void set_state_uninitialized() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_state == STATE_UNLOCKED);
+ m_state = STATE_UNINITIALIZED;
+ }
+ inline void set_state_initializing() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_state == STATE_UNINITIALIZED);
+ m_state = STATE_INITIALIZING;
+ }
+ inline void set_state_unlocked() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_state == STATE_INITIALIZING || m_state == STATE_RELEASING);
+ m_state = STATE_UNLOCKED;
+ }
+ inline void set_state_waiting_for_lock() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_state == STATE_ACQUIRING);
+ m_state = STATE_WAITING_FOR_LOCK;
+ }
+ inline void set_state_post_acquiring() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_state == STATE_ACQUIRING);
+ m_state = STATE_POST_ACQUIRING;
+ }
+
+ bool is_state_shutdown() const;
+ inline bool is_state_acquiring() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return m_state == STATE_ACQUIRING;
+ }
+ inline bool is_state_post_acquiring() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return m_state == STATE_POST_ACQUIRING;
+ }
+ inline bool is_state_releasing() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return m_state == STATE_RELEASING;
+ }
+ inline bool is_state_pre_releasing() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return m_state == STATE_PRE_RELEASING;
+ }
+ inline bool is_state_locked() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return m_state == STATE_LOCKED;
+ }
+ inline bool is_state_waiting_for_lock() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return m_state == STATE_WAITING_FOR_LOCK;
+ }
+
+ inline bool is_action_acquire_lock() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return get_active_action() == ACTION_ACQUIRE_LOCK;
+ }
+
+ virtual void shutdown_handler(int r, Context *on_finish);
+ virtual void pre_acquire_lock_handler(Context *on_finish);
+ virtual void post_acquire_lock_handler(int r, Context *on_finish);
+ virtual void pre_release_lock_handler(bool shutting_down,
+ Context *on_finish);
+ virtual void post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish);
+ virtual void post_reacquire_lock_handler(int r, Context *on_finish);
+
+ void execute_next_action();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |
+ * v (acquire_lock)
+ * UNLOCKED -----------------------------------------> ACQUIRING
+ * ^ |
+ * | |
+ * RELEASING |
+ * | |
+ * | |
+ * | (release_lock) v
+ * PRE_RELEASING <----------------------------------------- LOCKED
+ *
+ * <LOCKED state>
+ * |
+ * v
+ * REACQUIRING -------------------------------------> <finish>
+ * . ^
+ * . |
+ * . . . > <RELEASE action> ---> <ACQUIRE action> ---/
+ *
+ * <UNLOCKED/LOCKED states>
+ * |
+ * |
+ * v
+ * PRE_SHUTTING_DOWN ---> SHUTTING_DOWN ---> SHUTDOWN ---> <finish>
+ *
+ * @endverbatim
+ */
+ enum State {
+ STATE_UNINITIALIZED,
+ STATE_INITIALIZING,
+ STATE_UNLOCKED,
+ STATE_LOCKED,
+ STATE_ACQUIRING,
+ STATE_POST_ACQUIRING,
+ STATE_WAITING_FOR_REGISTER,
+ STATE_WAITING_FOR_LOCK,
+ STATE_REACQUIRING,
+ STATE_PRE_RELEASING,
+ STATE_RELEASING,
+ STATE_PRE_SHUTTING_DOWN,
+ STATE_SHUTTING_DOWN,
+ STATE_SHUTDOWN,
+ };
+
+ enum Action {
+ ACTION_TRY_LOCK,
+ ACTION_ACQUIRE_LOCK,
+ ACTION_REACQUIRE_LOCK,
+ ACTION_RELEASE_LOCK,
+ ACTION_SHUT_DOWN
+ };
+
+ typedef std::list<Context *> Contexts;
+ typedef std::pair<Action, Contexts> ActionContexts;
+ typedef std::list<ActionContexts> ActionsContexts;
+
+ struct C_ShutDownRelease : public Context {
+ ManagedLock *lock;
+ explicit C_ShutDownRelease(ManagedLock *lock)
+ : lock(lock) {
+ }
+ void finish(int r) override {
+ lock->send_shutdown_release();
+ }
+ };
+
+ librados::IoCtx& m_ioctx;
+ CephContext *m_cct;
+ AsioEngine& m_asio_engine;
+ asio::ContextWQ* m_work_queue;
+ std::string m_oid;
+ Watcher *m_watcher;
+ managed_lock::Mode m_mode;
+ bool m_blocklist_on_break_lock;
+ uint32_t m_blocklist_expire_seconds;
+
+ std::string m_cookie;
+ std::string m_new_cookie;
+
+ State m_state;
+ State m_post_next_state;
+
+ ActionsContexts m_actions_contexts;
+ AsyncOpTracker m_async_op_tracker;
+
+ bool is_lock_owner(ceph::mutex &lock) const;
+ bool is_transition_state() const;
+
+ void append_context(Action action, Context *ctx);
+ void execute_action(Action action, Context *ctx);
+
+ Action get_active_action() const;
+ void complete_active_action(State next_state, int r);
+
+ void send_acquire_lock();
+ void handle_pre_acquire_lock(int r);
+ void handle_acquire_lock(int r);
+ void handle_no_op_reacquire_lock(int r);
+
+ void handle_post_acquire_lock(int r);
+ void revert_to_unlock_state(int r);
+
+ void send_reacquire_lock();
+ void handle_reacquire_lock(int r);
+ void release_acquire_lock();
+
+ void send_release_lock();
+ void handle_pre_release_lock(int r);
+ void handle_release_lock(int r);
+ void handle_post_release_lock(int r);
+
+ void send_shutdown();
+ void handle_shutdown(int r);
+ void send_shutdown_release();
+ void handle_shutdown_pre_release(int r);
+ void handle_shutdown_post_release(int r);
+ void wait_for_tracked_ops(int r);
+ void complete_shutdown(int r);
+};
+
+} // namespace librbd
+
+extern template class librbd::ManagedLock<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_H
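The protected virtual handlers declared above are the intended customization points; in-tree, ExclusiveLock builds on them to interlock image I/O with lock transitions. A hypothetical subclass sketch (MyLock and its comments are illustrative):

    // Hypothetical subclass: hook the pre/post acquire callbacks.
    template <typename I>
    class MyLock : public librbd::ManagedLock<I> {
    public:
      using librbd::ManagedLock<I>::ManagedLock;

    protected:
      void pre_acquire_lock_handler(Context *on_finish) override {
        // prepare local state before the on-disk lock is requested
        on_finish->complete(0);
      }

      void post_acquire_lock_handler(int r, Context *on_finish) override {
        // r < 0 fails the acquire; see handle_post_acquire_lock() in
        // ManagedLock.cc for how -ECANCELED is special-cased
        on_finish->complete(r);
      }
    };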
diff --git a/src/librbd/MirroringWatcher.cc b/src/librbd/MirroringWatcher.cc
new file mode 100644
index 000000000..c0cda5fa1
--- /dev/null
+++ b/src/librbd/MirroringWatcher.cc
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/MirroringWatcher.h"
+#include "include/rbd_types.h"
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "librbd/Utils.h"
+#include "librbd/watcher/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MirroringWatcher: "
+
+namespace librbd {
+
+using namespace mirroring_watcher;
+using namespace watcher;
+
+using librbd::util::create_rados_callback;
+
+namespace {
+
+static const uint64_t NOTIFY_TIMEOUT_MS = 5000;
+
+} // anonymous namespace
+
+template <typename I>
+MirroringWatcher<I>::MirroringWatcher(librados::IoCtx &io_ctx,
+ asio::ContextWQ *work_queue)
+ : Watcher(io_ctx, work_queue, RBD_MIRRORING) {
+}
+
+template <typename I>
+int MirroringWatcher<I>::notify_mode_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorMode mirror_mode) {
+ C_SaferCond ctx;
+ notify_mode_updated(io_ctx, mirror_mode, &ctx);
+ return ctx.wait();
+}
+
+template <typename I>
+void MirroringWatcher<I>::notify_mode_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorMode mirror_mode,
+ Context *on_finish) {
+ CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ bufferlist bl;
+ encode(NotifyMessage{ModeUpdatedPayload{mirror_mode}}, bl);
+
+ librados::AioCompletion *comp = create_rados_callback(on_finish);
+ int r = io_ctx.aio_notify(RBD_MIRRORING, comp, bl, NOTIFY_TIMEOUT_MS,
+ nullptr);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+int MirroringWatcher<I>::notify_image_updated(
+ librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state,
+ const std::string &image_id, const std::string &global_image_id) {
+ C_SaferCond ctx;
+ notify_image_updated(io_ctx, mirror_image_state, image_id, global_image_id,
+ &ctx);
+ return ctx.wait();
+}
+
+template <typename I>
+void MirroringWatcher<I>::notify_image_updated(
+ librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state,
+ const std::string &image_id, const std::string &global_image_id,
+ Context *on_finish) {
+
+ CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ bufferlist bl;
+ encode(NotifyMessage{ImageUpdatedPayload{
+ mirror_image_state, image_id, global_image_id}}, bl);
+
+ librados::AioCompletion *comp = create_rados_callback(on_finish);
+ int r = io_ctx.aio_notify(RBD_MIRRORING, comp, bl, NOTIFY_TIMEOUT_MS,
+ nullptr);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void MirroringWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) {
+ CephContext *cct = this->m_cct;
+ ldout(cct, 15) << ": notify_id=" << notify_id << ", "
+ << "handle=" << handle << dendl;
+
+ NotifyMessage notify_message;
+ try {
+ auto iter = bl.cbegin();
+ decode(notify_message, iter);
+ } catch (const buffer::error &err) {
+ lderr(cct) << ": error decoding image notification: " << err.what()
+ << dendl;
+ Context *ctx = new C_NotifyAck(this, notify_id, handle);
+ ctx->complete(0);
+ return;
+ }
+
+ apply_visitor(watcher::util::HandlePayloadVisitor<MirroringWatcher<I>>(
+ this, notify_id, handle), notify_message.payload);
+}
+
+template <typename I>
+bool MirroringWatcher<I>::handle_payload(const ModeUpdatedPayload &payload,
+ Context *on_notify_ack) {
+ CephContext *cct = this->m_cct;
+ ldout(cct, 20) << ": mode updated: " << payload.mirror_mode << dendl;
+ handle_mode_updated(payload.mirror_mode);
+ return true;
+}
+
+template <typename I>
+bool MirroringWatcher<I>::handle_payload(const ImageUpdatedPayload &payload,
+ Context *on_notify_ack) {
+ CephContext *cct = this->m_cct;
+ ldout(cct, 20) << ": image state updated" << dendl;
+ handle_image_updated(payload.mirror_image_state, payload.image_id,
+ payload.global_image_id);
+ return true;
+}
+
+template <typename I>
+bool MirroringWatcher<I>::handle_payload(const UnknownPayload &payload,
+ Context *on_notify_ack) {
+ return true;
+}
+
+} // namespace librbd
+
+template class librbd::MirroringWatcher<librbd::ImageCtx>;
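A usage sketch for the static notifiers above: broadcast a pool-level mirror mode change and wait for the notification round-trip. The wrapper function is illustrative; cls::rbd::MIRROR_MODE_POOL is one of the standard MirrorMode values from cls_rbd_types.h.

    // Broadcast a mirroring mode change to all watchers of RBD_MIRRORING.
    int broadcast_pool_mode(librados::IoCtx &io_ctx) {
      return librbd::MirroringWatcher<>::notify_mode_updated(
          io_ctx, cls::rbd::MIRROR_MODE_POOL);
    }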
diff --git a/src/librbd/MirroringWatcher.h b/src/librbd/MirroringWatcher.h
new file mode 100644
index 000000000..e13762e9b
--- /dev/null
+++ b/src/librbd/MirroringWatcher.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_H
+#define CEPH_LIBRBD_MIRRORING_WATCHER_H
+
+#include "include/int_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Watcher.h"
+#include "librbd/mirroring_watcher/Types.h"
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+namespace watcher {
+namespace util {
+template <typename> struct HandlePayloadVisitor;
+}
+}
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class MirroringWatcher : public Watcher {
+ friend struct watcher::util::HandlePayloadVisitor<MirroringWatcher<ImageCtxT>>;
+
+public:
+ MirroringWatcher(librados::IoCtx &io_ctx, asio::ContextWQ *work_queue);
+
+ static int notify_mode_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorMode mirror_mode);
+ static void notify_mode_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorMode mirror_mode,
+ Context *on_finish);
+
+ static int notify_image_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorImageState mirror_image_state,
+ const std::string &image_id,
+ const std::string &global_image_id);
+ static void notify_image_updated(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorImageState mirror_image_state,
+ const std::string &image_id,
+ const std::string &global_image_id,
+ Context *on_finish);
+
+ virtual void handle_mode_updated(cls::rbd::MirrorMode mirror_mode) = 0;
+ virtual void handle_image_updated(cls::rbd::MirrorImageState state,
+ const std::string &image_id,
+ const std::string &global_image_id) = 0;
+
+private:
+ bool handle_payload(const mirroring_watcher::ModeUpdatedPayload &payload,
+ Context *on_notify_ack);
+ bool handle_payload(const mirroring_watcher::ImageUpdatedPayload &payload,
+ Context *on_notify_ack);
+ bool handle_payload(const mirroring_watcher::UnknownPayload &payload,
+ Context *on_notify_ack);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+};
+
+} // namespace librbd
+
+extern template class librbd::MirroringWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRRORING_WATCHER_H
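Consumers implement the two pure virtuals declared above to observe pool and image mirroring changes (rbd-mirror's pool watcher does this in-tree). A hypothetical concrete watcher as a sketch:

    // Hypothetical concrete watcher implementing the two required hooks.
    struct MyMirroringWatcher : public librbd::MirroringWatcher<> {
      using librbd::MirroringWatcher<>::MirroringWatcher;

      void handle_mode_updated(cls::rbd::MirrorMode mirror_mode) override {
        // react to pool-level mirroring mode changes
      }

      void handle_image_updated(cls::rbd::MirrorImageState state,
                                const std::string &image_id,
                                const std::string &global_image_id) override {
        // react to per-image mirroring state changes
      }
    };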
diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc
new file mode 100644
index 000000000..65e3fc4a4
--- /dev/null
+++ b/src/librbd/ObjectMap.cc
@@ -0,0 +1,380 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ObjectMap.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/object_map/RefreshRequest.h"
+#include "librbd/object_map/ResizeRequest.h"
+#include "librbd/object_map/SnapshotCreateRequest.h"
+#include "librbd/object_map/SnapshotRemoveRequest.h"
+#include "librbd/object_map/SnapshotRollbackRequest.h"
+#include "librbd/object_map/UnlockRequest.h"
+#include "librbd/object_map/UpdateRequest.h"
+#include "librbd/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+
+#include "include/rados/librados.hpp"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/stringify.h"
+#include "osdc/Striper.h"
+#include <sstream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ObjectMap: " << this << " " << __func__ \
+ << ": "
+
+namespace librbd {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+ObjectMap<I>::ObjectMap(I &image_ctx, uint64_t snap_id)
+ : RefCountedObject(image_ctx.cct),
+ m_image_ctx(image_ctx), m_snap_id(snap_id),
+ m_lock(ceph::make_shared_mutex(util::unique_lock_name("librbd::ObjectMap::lock", this))),
+ m_update_guard(new UpdateGuard(m_image_ctx.cct)) {
+}
+
+template <typename I>
+ObjectMap<I>::~ObjectMap() {
+ delete m_update_guard;
+}
+
+template <typename I>
+int ObjectMap<I>::aio_remove(librados::IoCtx &io_ctx, const std::string &image_id,
+ librados::AioCompletion *c) {
+ return io_ctx.aio_remove(object_map_name(image_id, CEPH_NOSNAP), c);
+}
+
+template <typename I>
+std::string ObjectMap<I>::object_map_name(const std::string &image_id,
+ uint64_t snap_id) {
+ std::string oid(RBD_OBJECT_MAP_PREFIX + image_id);
+ if (snap_id != CEPH_NOSNAP) {
+ std::stringstream snap_suffix;
+ snap_suffix << "." << std::setfill('0') << std::setw(16) << std::hex
+ << snap_id;
+ oid += snap_suffix.str();
+ }
+ return oid;
+}
+
+template <typename I>
+bool ObjectMap<I>::is_compatible(const file_layout_t& layout, uint64_t size) {
+ uint64_t object_count = Striper::get_num_objects(layout, size);
+ return (object_count <= cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT);
+}
+
+template <typename I>
+uint8_t ObjectMap<I>::operator[](uint64_t object_no) const
+{
+ std::shared_lock locker{m_lock};
+ ceph_assert(object_no < m_object_map.size());
+ return m_object_map[object_no];
+}
+
+template <typename I>
+bool ObjectMap<I>::object_may_exist(uint64_t object_no) const
+{
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+
+ // Fall back to default logic if object map is disabled or invalid
+ if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+ m_image_ctx.image_lock)) {
+ return true;
+ }
+
+ bool flags_set;
+ int r = m_image_ctx.test_flags(m_image_ctx.snap_id,
+ RBD_FLAG_OBJECT_MAP_INVALID,
+ m_image_ctx.image_lock, &flags_set);
+ if (r < 0 || flags_set) {
+ return true;
+ }
+
+ uint8_t state = (*this)[object_no];
+ bool exists = (state != OBJECT_NONEXISTENT);
+ ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r=" << exists
+ << dendl;
+ return exists;
+}
+
+template <typename I>
+bool ObjectMap<I>::object_may_not_exist(uint64_t object_no) const
+{
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+
+ // Fall back to default logic if object map is disabled or invalid
+ if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+ m_image_ctx.image_lock)) {
+ return true;
+ }
+
+ bool flags_set;
+ int r = m_image_ctx.test_flags(m_image_ctx.snap_id,
+ RBD_FLAG_OBJECT_MAP_INVALID,
+ m_image_ctx.image_lock, &flags_set);
+ if (r < 0 || flags_set) {
+ return true;
+ }
+
+ uint8_t state = (*this)[object_no];
+ bool nonexistent = (state != OBJECT_EXISTS && state != OBJECT_EXISTS_CLEAN);
+ ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r="
+ << nonexistent << dendl;
+ return nonexistent;
+}
+
+template <typename I>
+bool ObjectMap<I>::update_required(const ceph::BitVector<2>::Iterator& it,
+ uint8_t new_state) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ uint8_t state = *it;
+ if ((state == new_state) ||
+ (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
+ (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) {
+ return false;
+ }
+ return true;
+}
+
+template <typename I>
+void ObjectMap<I>::open(Context *on_finish) {
+ Context *ctx = create_context_callback<Context>(on_finish, this);
+
+ auto req = object_map::RefreshRequest<I>::create(
+ m_image_ctx, &m_lock, &m_object_map, m_snap_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void ObjectMap<I>::close(Context *on_finish) {
+ Context *ctx = create_context_callback<Context>(on_finish, this);
+
+ if (m_snap_id != CEPH_NOSNAP) {
+ m_image_ctx.op_work_queue->queue(ctx, 0);
+ return;
+ }
+
+ ctx = new LambdaContext([this, ctx](int r) {
+ auto req = object_map::UnlockRequest<I>::create(m_image_ctx, ctx);
+ req->send();
+ });
+
+ // ensure the block guard for aio updates is empty before unlocking
+ // the object map
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+bool ObjectMap<I>::set_object_map(ceph::BitVector<2> &target_object_map) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+ m_image_ctx.image_lock));
+ std::unique_lock locker{m_lock};
+ m_object_map = target_object_map;
+ return true;
+}
+
+template <typename I>
+void ObjectMap<I>::rollback(uint64_t snap_id, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+
+ std::unique_lock locker{m_lock};
+ Context *ctx = create_context_callback<Context>(on_finish, this);
+
+ object_map::SnapshotRollbackRequest *req =
+ new object_map::SnapshotRollbackRequest(m_image_ctx, snap_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void ObjectMap<I>::snapshot_add(uint64_t snap_id, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
+ ceph_assert(snap_id != CEPH_NOSNAP);
+
+ Context *ctx = create_context_callback<Context>(on_finish, this);
+
+ object_map::SnapshotCreateRequest *req =
+ new object_map::SnapshotCreateRequest(m_image_ctx, &m_lock, &m_object_map,
+ snap_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void ObjectMap<I>::snapshot_remove(uint64_t snap_id, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_wlocked(m_image_ctx.image_lock));
+ ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
+ ceph_assert(snap_id != CEPH_NOSNAP);
+
+ Context *ctx = create_context_callback<Context>(on_finish, this);
+
+ object_map::SnapshotRemoveRequest *req =
+ new object_map::SnapshotRemoveRequest(m_image_ctx, &m_lock, &m_object_map,
+ snap_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void ObjectMap<I>::aio_save(Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+ m_image_ctx.image_lock));
+ std::shared_lock locker{m_lock};
+
+ librados::ObjectWriteOperation op;
+ if (m_snap_id == CEPH_NOSNAP) {
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "");
+ }
+ cls_client::object_map_save(&op, m_object_map);
+
+ Context *ctx = create_context_callback<Context>(on_finish, this);
+
+ std::string oid(object_map_name(m_image_ctx.id, m_snap_id));
+ librados::AioCompletion *comp = util::create_rados_callback(ctx);
+
+ int r = m_image_ctx.md_ctx.aio_operate(oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ObjectMap<I>::aio_resize(uint64_t new_size, uint8_t default_object_state,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+ m_image_ctx.image_lock));
+ ceph_assert(m_image_ctx.image_watcher != nullptr);
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ Context *ctx = create_context_callback<Context>(on_finish, this);
+
+ object_map::ResizeRequest *req = new object_map::ResizeRequest(
+ m_image_ctx, &m_lock, &m_object_map, m_snap_id, new_size,
+ default_object_state, ctx);
+ req->send();
+}
+
+template <typename I>
+void ObjectMap<I>::detained_aio_update(UpdateOperation &&op) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ ceph_assert(ceph_mutex_is_wlocked(m_lock));
+
+ BlockGuardCell *cell;
+ int r = m_update_guard->detain({op.start_object_no, op.end_object_no},
+ &op, &cell);
+ if (r < 0) {
+ lderr(cct) << "failed to detain object map update: " << cpp_strerror(r)
+ << dendl;
+ m_image_ctx.op_work_queue->queue(op.on_finish, r);
+ m_async_op_tracker.finish_op();
+ return;
+ } else if (r > 0) {
+ ldout(cct, 20) << "detaining object map update due to in-flight update: "
+ << "start=" << op.start_object_no << ", "
+ << "end=" << op.end_object_no << ", "
+ << (op.current_state ?
+ stringify(static_cast<uint32_t>(*op.current_state)) :
+ "")
+ << "->" << static_cast<uint32_t>(op.new_state) << dendl;
+ return;
+ }
+
+ ldout(cct, 20) << "in-flight update cell: " << cell << dendl;
+ Context *on_finish = op.on_finish;
+ Context *ctx = new LambdaContext([this, cell, on_finish](int r) {
+ handle_detained_aio_update(cell, r, on_finish);
+ });
+ aio_update(CEPH_NOSNAP, op.start_object_no, op.end_object_no, op.new_state,
+ op.current_state, op.parent_trace, op.ignore_enoent, ctx);
+}
+
+template <typename I>
+void ObjectMap<I>::handle_detained_aio_update(BlockGuardCell *cell, int r,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "cell=" << cell << ", r=" << r << dendl;
+
+ typename UpdateGuard::BlockOperations block_ops;
+ m_update_guard->release(cell, &block_ops);
+
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ std::unique_lock locker{m_lock};
+ for (auto &op : block_ops) {
+ detained_aio_update(std::move(op));
+ }
+ }
+
+ on_finish->complete(r);
+ m_async_op_tracker.finish_op();
+}
+
+template <typename I>
+void ObjectMap<I>::aio_update(uint64_t snap_id, uint64_t start_object_no,
+ uint64_t end_object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace,
+ bool ignore_enoent, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
+ ceph_assert(m_image_ctx.image_watcher != nullptr);
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+ ceph_assert(start_object_no < end_object_no);
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "start=" << start_object_no << ", "
+ << "end=" << end_object_no << ", "
+ << (current_state ?
+ stringify(static_cast<uint32_t>(*current_state)) : "")
+ << "->" << static_cast<uint32_t>(new_state) << dendl;
+ if (snap_id == CEPH_NOSNAP) {
+ ceph_assert(ceph_mutex_is_wlocked(m_lock));
+ end_object_no = std::min(end_object_no, m_object_map.size());
+ if (start_object_no >= end_object_no) {
+ ldout(cct, 20) << "skipping update of invalid object map" << dendl;
+ m_image_ctx.op_work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ auto it = m_object_map.begin() + start_object_no;
+ auto end_it = m_object_map.begin() + end_object_no;
+ for (; it != end_it; ++it) {
+ if (update_required(it, new_state)) {
+ break;
+ }
+ }
+ if (it == end_it) {
+ ldout(cct, 20) << "object map update not required" << dendl;
+ m_image_ctx.op_work_queue->queue(on_finish, 0);
+ return;
+ }
+ }
+
+ auto req = object_map::UpdateRequest<I>::create(
+ m_image_ctx, &m_lock, &m_object_map, snap_id, start_object_no,
+ end_object_no, new_state, current_state, parent_trace, ignore_enoent,
+ on_finish);
+ req->send();
+}
+
+} // namespace librbd
+
+template class librbd::ObjectMap<librbd::ImageCtx>;
+
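object_map_name() above composes the RADOS object name from a fixed prefix plus the image id, and for snapshot maps appends a dot and the snap id as 16 zero-padded lowercase hex digits. Worked examples, assuming RBD_OBJECT_MAP_PREFIX expands to "rbd_object_map." (its conventional value; the macro is defined in include/rbd_types.h, not shown here):

    // object_map_name("abc123", CEPH_NOSNAP) -> "rbd_object_map.abc123"
    // object_map_name("abc123", 5)           -> "rbd_object_map.abc123.0000000000000005"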
diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h
new file mode 100644
index 000000000..7577d267f
--- /dev/null
+++ b/src/librbd/ObjectMap.h
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_H
+#define CEPH_LIBRBD_OBJECT_MAP_H
+
+#include "include/int_types.h"
+#include "include/fs_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/object_map_types.h"
+#include "common/AsyncOpTracker.h"
+#include "common/bit_vector.hpp"
+#include "common/RWLock.h"
+#include "common/RefCountedObj.h"
+#include "librbd/Utils.h"
+#include <boost/optional.hpp>
+
+class Context;
+namespace ZTracer { struct Trace; }
+
+namespace librbd {
+
+template <typename Op> class BlockGuard;
+struct BlockGuardCell;
+class ImageCtx;
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectMap : public RefCountedObject {
+public:
+ static ObjectMap *create(ImageCtxT &image_ctx, uint64_t snap_id) {
+ return new ObjectMap(image_ctx, snap_id);
+ }
+
+ ObjectMap(ImageCtxT &image_ctx, uint64_t snap_id);
+ ~ObjectMap();
+
+ static int aio_remove(librados::IoCtx &io_ctx, const std::string &image_id,
+ librados::AioCompletion *c);
+ static std::string object_map_name(const std::string &image_id,
+ uint64_t snap_id);
+
+ static bool is_compatible(const file_layout_t& layout, uint64_t size);
+
+ uint8_t operator[](uint64_t object_no) const;
+ inline uint64_t size() const {
+ std::shared_lock locker{m_lock};
+ return m_object_map.size();
+ }
+
+ inline void set_state(uint64_t object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state) {
+ std::unique_lock locker{m_lock};
+ ceph_assert(object_no < m_object_map.size());
+ if (current_state && m_object_map[object_no] != *current_state) {
+ return;
+ }
+ m_object_map[object_no] = new_state;
+ }
+
+ void open(Context *on_finish);
+ void close(Context *on_finish);
+ bool set_object_map(ceph::BitVector<2> &target_object_map);
+ bool object_may_exist(uint64_t object_no) const;
+ bool object_may_not_exist(uint64_t object_no) const;
+
+ void aio_save(Context *on_finish);
+ void aio_resize(uint64_t new_size, uint8_t default_object_state,
+ Context *on_finish);
+
+ template <typename T, void(T::*MF)(int) = &T::complete>
+ bool aio_update(uint64_t snap_id, uint64_t start_object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace, bool ignore_enoent,
+ T *callback_object) {
+ return aio_update<T, MF>(snap_id, start_object_no, start_object_no + 1,
+ new_state, current_state, parent_trace,
+ ignore_enoent, callback_object);
+ }
+
+ template <typename T, void(T::*MF)(int) = &T::complete>
+ bool aio_update(uint64_t snap_id, uint64_t start_object_no,
+ uint64_t end_object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace, bool ignore_enoent,
+ T *callback_object) {
+ ceph_assert(start_object_no < end_object_no);
+ std::unique_lock locker{m_lock};
+
+ if (snap_id == CEPH_NOSNAP) {
+ end_object_no = std::min(end_object_no, m_object_map.size());
+ if (start_object_no >= end_object_no) {
+ return false;
+ }
+
+ auto it = m_object_map.begin() + start_object_no;
+ auto end_it = m_object_map.begin() + end_object_no;
+ for (; it != end_it; ++it) {
+ if (update_required(it, new_state)) {
+ break;
+ }
+ }
+
+ if (it == end_it) {
+ return false;
+ }
+
+ m_async_op_tracker.start_op();
+ UpdateOperation update_operation(start_object_no, end_object_no,
+ new_state, current_state, parent_trace,
+ ignore_enoent,
+ util::create_context_callback<T, MF>(
+ callback_object));
+ detained_aio_update(std::move(update_operation));
+ } else {
+ aio_update(snap_id, start_object_no, end_object_no, new_state,
+ current_state, parent_trace, ignore_enoent,
+ util::create_context_callback<T, MF>(callback_object));
+ }
+ return true;
+ }
+
+ void rollback(uint64_t snap_id, Context *on_finish);
+ void snapshot_add(uint64_t snap_id, Context *on_finish);
+ void snapshot_remove(uint64_t snap_id, Context *on_finish);
+
+private:
+ struct UpdateOperation {
+ uint64_t start_object_no;
+ uint64_t end_object_no;
+ uint8_t new_state;
+ boost::optional<uint8_t> current_state;
+ ZTracer::Trace parent_trace;
+ bool ignore_enoent;
+ Context *on_finish;
+
+ UpdateOperation(uint64_t start_object_no, uint64_t end_object_no,
+ uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace,
+ bool ignore_enoent, Context *on_finish)
+ : start_object_no(start_object_no), end_object_no(end_object_no),
+ new_state(new_state), current_state(current_state),
+ parent_trace(parent_trace), ignore_enoent(ignore_enoent),
+ on_finish(on_finish) {
+ }
+ };
+
+ typedef BlockGuard<UpdateOperation> UpdateGuard;
+
+ ImageCtxT &m_image_ctx;
+ uint64_t m_snap_id;
+
+ mutable ceph::shared_mutex m_lock;
+ ceph::BitVector<2> m_object_map;
+
+ AsyncOpTracker m_async_op_tracker;
+ UpdateGuard *m_update_guard = nullptr;
+
+ void detained_aio_update(UpdateOperation &&update_operation);
+ void handle_detained_aio_update(BlockGuardCell *cell, int r,
+ Context *on_finish);
+
+ void aio_update(uint64_t snap_id, uint64_t start_object_no,
+ uint64_t end_object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace, bool ignore_enoent,
+ Context *on_finish);
+ bool update_required(const ceph::BitVector<2>::Iterator &it,
+ uint8_t new_state);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ObjectMap<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_H
diff --git a/src/librbd/Operations.cc b/src/librbd/Operations.cc
new file mode 100644
index 000000000..30bb7efb3
--- /dev/null
+++ b/src/librbd/Operations.cc
@@ -0,0 +1,1932 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Operations.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "osdc/Striper.h"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/DisabledPolicy.h"
+#include "librbd/journal/StandardPolicy.h"
+#include "librbd/operation/DisableFeaturesRequest.h"
+#include "librbd/operation/EnableFeaturesRequest.h"
+#include "librbd/operation/FlattenRequest.h"
+#include "librbd/operation/MetadataRemoveRequest.h"
+#include "librbd/operation/MetadataSetRequest.h"
+#include "librbd/operation/MigrateRequest.h"
+#include "librbd/operation/ObjectMapIterate.h"
+#include "librbd/operation/RebuildObjectMapRequest.h"
+#include "librbd/operation/RenameRequest.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/operation/SnapshotCreateRequest.h"
+#include "librbd/operation/SnapshotProtectRequest.h"
+#include "librbd/operation/SnapshotRemoveRequest.h"
+#include "librbd/operation/SnapshotRenameRequest.h"
+#include "librbd/operation/SnapshotRollbackRequest.h"
+#include "librbd/operation/SnapshotUnprotectRequest.h"
+#include "librbd/operation/SnapshotLimitRequest.h"
+#include "librbd/operation/SparsifyRequest.h"
+#include <set>
+#include <boost/bind/bind.hpp>
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Operations: "
+
+namespace librbd {
+
+using namespace boost::placeholders;
+
+namespace {
+
+std::ostream &operator<<(std::ostream &out, const Operation &op) {
+ switch (op) {
+ case OPERATION_CHECK_OBJECT_MAP:
+ out << "check object map";
+ break;
+ case OPERATION_FLATTEN:
+ out << "flatten";
+ break;
+ case OPERATION_METADATA_UPDATE:
+ out << "metadata update";
+ break;
+ case OPERATION_MIGRATE:
+ out << "migrate";
+ break;
+ case OPERATION_REBUILD_OBJECT_MAP:
+ out << "rebuild object map";
+ break;
+ case OPERATION_RENAME:
+ out << "rename";
+ break;
+ case OPERATION_RESIZE:
+ out << "resize";
+ break;
+ case OPERATION_SNAP_CREATE:
+ out << "snap create";
+ break;
+ case OPERATION_SNAP_PROTECT:
+ out << "snap protect";
+ break;
+ case OPERATION_SNAP_REMOVE:
+ out << "snap remove";
+ break;
+ case OPERATION_SNAP_RENAME:
+ out << "snap rename";
+ break;
+ case OPERATION_SNAP_ROLLBACK:
+ out << "snap rollback";
+ break;
+ case OPERATION_SNAP_UNPROTECT:
+ out << "snap unprotect";
+ break;
+ case OPERATION_SPARSIFY:
+ out << "sparsify";
+ break;
+ case OPERATION_UPDATE_FEATURES:
+ out << "update features";
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ return out;
+}
+
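+// completion wrapper: on success of the wrapped operation, send a header
+// update notification to peers before completing the original context;
+// notification failures are deliberately tolerated (see below)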
+template <typename I>
+struct C_NotifyUpdate : public Context {
+ I &image_ctx;
+ Context *on_finish;
+ bool notified = false;
+
+ C_NotifyUpdate(I &image_ctx, Context *on_finish)
+ : image_ctx(image_ctx), on_finish(on_finish) {
+ }
+
+ void complete(int r) override {
+ CephContext *cct = image_ctx.cct;
+ if (notified) {
+ if (r == -ETIMEDOUT) {
+ // don't fail the op if a peer fails to get the update notification
+        lderr(cct) << "update notification timed out" << dendl;
+ r = 0;
+ } else if (r == -ENOENT) {
+ // don't fail if header is missing (e.g. v1 image rename)
+ ldout(cct, 5) << "update notification on missing header" << dendl;
+ r = 0;
+ } else if (r < 0) {
+ lderr(cct) << "update notification failed: " << cpp_strerror(r)
+ << dendl;
+ }
+ Context::complete(r);
+ return;
+ }
+
+ if (r < 0) {
+ // op failed -- no need to send update notification
+ Context::complete(r);
+ return;
+ }
+
+ notified = true;
+ image_ctx.notify_update(this);
+ }
+ void finish(int r) override {
+ on_finish->complete(r);
+ }
+};
+
+template <typename I>
+struct C_InvokeAsyncRequest : public Context {
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * . . . . . . | . . . . . . . . . . . . . . . . . .
+ * . . | . .
+ * . v v v .
+ * . REFRESH_IMAGE (skip if not needed) .
+ * . | .
+ * . v .
+ * . ACQUIRE_LOCK (skip if exclusive lock .
+ * . | disabled or has lock) .
+ * . | .
+ * . /--------/ \--------\ . . . . . . . . . . . . .
+ * . | | .
+ * . v v .
+ * LOCAL_REQUEST REMOTE_REQUEST
+ * | |
+ * | |
+ * \--------\ /--------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ I &image_ctx;
+ Operation operation;
+ exclusive_lock::OperationRequestType request_type;
+ bool permit_snapshot;
+ boost::function<void(Context*)> local;
+ boost::function<void(Context*)> remote;
+ std::set<int> filter_error_codes;
+ Context *on_finish;
+ bool request_lock = false;
+
+ C_InvokeAsyncRequest(I &image_ctx, Operation operation,
+ exclusive_lock::OperationRequestType request_type,
+ bool permit_snapshot,
+ const boost::function<void(Context*)>& local,
+ const boost::function<void(Context*)>& remote,
+ const std::set<int> &filter_error_codes,
+ Context *on_finish)
+ : image_ctx(image_ctx), operation(operation), request_type(request_type),
+ permit_snapshot(permit_snapshot), local(local), remote(remote),
+ filter_error_codes(filter_error_codes), on_finish(on_finish) {
+ }
+
+ void send() {
+ send_refresh_image();
+ }
+
+ void send_refresh_image() {
+ if (!image_ctx.state->is_refresh_required()) {
+ send_acquire_exclusive_lock();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = util::create_context_callback<
+ C_InvokeAsyncRequest<I>,
+ &C_InvokeAsyncRequest<I>::handle_refresh_image>(this);
+ image_ctx.state->refresh(ctx);
+ }
+
+ void handle_refresh_image(int r) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl;
+ complete(r);
+ return;
+ }
+
+ send_acquire_exclusive_lock();
+ }
+
+ void send_acquire_exclusive_lock() {
+ // context can complete before owner_lock is unlocked
+ ceph::shared_mutex &owner_lock(image_ctx.owner_lock);
+ owner_lock.lock_shared();
+ image_ctx.image_lock.lock_shared();
+ if (image_ctx.read_only ||
+ (!permit_snapshot && image_ctx.snap_id != CEPH_NOSNAP)) {
+ image_ctx.image_lock.unlock_shared();
+ owner_lock.unlock_shared();
+ complete(-EROFS);
+ return;
+ }
+ image_ctx.image_lock.unlock_shared();
+
+ if (image_ctx.exclusive_lock == nullptr) {
+ send_local_request();
+ owner_lock.unlock_shared();
+ return;
+ } else if (image_ctx.image_watcher == nullptr) {
+ owner_lock.unlock_shared();
+ complete(-EROFS);
+ return;
+ }
+
+ if (image_ctx.exclusive_lock->is_lock_owner() &&
+ image_ctx.exclusive_lock->accept_request(request_type, nullptr)) {
+ send_local_request();
+ owner_lock.unlock_shared();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = util::create_async_context_callback(
+ image_ctx, util::create_context_callback<
+ C_InvokeAsyncRequest<I>,
+ &C_InvokeAsyncRequest<I>::handle_acquire_exclusive_lock>(
+ this, image_ctx.exclusive_lock));
+
+ if (request_lock) {
+ // current lock owner doesn't support op -- try to perform
+ // the action locally
+ request_lock = false;
+ image_ctx.exclusive_lock->acquire_lock(ctx);
+ } else {
+ image_ctx.exclusive_lock->try_acquire_lock(ctx);
+ }
+ owner_lock.unlock_shared();
+ }
+
+ void handle_acquire_exclusive_lock(int r) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ complete(r == -EBLOCKLISTED ? -EBLOCKLISTED : -EROFS);
+ return;
+ }
+
+ // context can complete before owner_lock is unlocked
+ ceph::shared_mutex &owner_lock(image_ctx.owner_lock);
+ owner_lock.lock_shared();
+ if (image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner()) {
+ send_local_request();
+ owner_lock.unlock_shared();
+ return;
+ }
+
+ send_remote_request();
+ owner_lock.unlock_shared();
+ }
+
+ void send_remote_request() {
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = util::create_async_context_callback(
+ image_ctx, util::create_context_callback<
+ C_InvokeAsyncRequest<I>,
+ &C_InvokeAsyncRequest<I>::handle_remote_request>(this));
+ remote(ctx);
+ }
+
+ void handle_remote_request(int r) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ ldout(cct, 5) << operation << " not supported by current lock owner"
+ << dendl;
+ request_lock = true;
+ send_refresh_image();
+ return;
+ } else if (r != -ETIMEDOUT && r != -ERESTART) {
+ image_ctx.state->handle_update_notification();
+
+ complete(r);
+ return;
+ }
+
+ ldout(cct, 5) << operation << " timed out notifying lock owner" << dendl;
+ send_refresh_image();
+ }
+
+ void send_local_request() {
+ auto ctx = new LambdaContext(
+ [this](int r) {
+ if (r == -ERESTART) {
+ image_ctx.operations->finish_op(operation, r);
+ send_refresh_image();
+ return;
+ }
+ execute_local_request();
+ });
+
+ image_ctx.operations->start_op(operation, ctx);
+ }
+
+ void execute_local_request() {
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = util::create_async_context_callback(
+ image_ctx, util::create_context_callback<
+ C_InvokeAsyncRequest<I>,
+ &C_InvokeAsyncRequest<I>::handle_local_request>(this));
+ local(ctx);
+ }
+
+ void handle_local_request(int r) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": r=" << r << dendl;
+
+ image_ctx.operations->finish_op(operation, r);
+
+ if (r == -ERESTART) {
+ send_refresh_image();
+ return;
+ }
+ complete(r);
+ }
+
+ void finish(int r) override {
+ if (filter_error_codes.count(r) != 0) {
+ r = 0;
+ }
+ on_finish->complete(r);
+ }
+};
+
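+// mismatch handler for object map checking: returns true when the recorded
+// object state needs to be repaired. A recorded EXISTS/EXISTS_CLEAN object
+// that was actually NONEXISTENT/PENDING is tolerated (returns false),
+// presumably because over-reporting existence is harmless for reads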
+template <typename I>
+bool needs_invalidate(I& image_ctx, uint64_t object_no,
+ uint8_t current_state, uint8_t new_state) {
+  if ((current_state == OBJECT_EXISTS ||
+       current_state == OBJECT_EXISTS_CLEAN) &&
+      (new_state == OBJECT_NONEXISTENT ||
+       new_state == OBJECT_PENDING)) {
+ return false;
+ }
+ return true;
+}
+
+} // anonymous namespace
+
+template <typename I>
+Operations<I>::Operations(I &image_ctx)
+ : m_image_ctx(image_ctx),
+ m_queue_lock(ceph::make_mutex(
+ util::unique_lock_name("librbd::Operations::m_queue_lock",
+ this))) {
+}
+
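+// start_op()/finish_op() serialize operations of the same type: the first
+// request for a given Operation runs immediately while later ones are
+// queued and released one at a time by finish_op(). While running under
+// the exclusive lock, the continuation also pins the lock via
+// ExclusiveLock::start_op() so it cannot be released mid-operation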
+template <typename I>
+void Operations<I>::start_op(Operation op, Context *ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": " << op << " " << ctx << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ bool requires_lock = m_image_ctx.exclusive_lock != nullptr;
+
+ ctx = util::create_async_context_callback(
+ m_image_ctx, new LambdaContext(
+ [this, op, requires_lock, ctx](int r) {
+ Context *finish_op_ctx = nullptr;
+ if (requires_lock && r == 0) {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ auto exclusive_lock = m_image_ctx.exclusive_lock;
+
+ if (exclusive_lock == nullptr ||
+ (finish_op_ctx = exclusive_lock->start_op(&r)) == nullptr) {
+ ldout(m_image_ctx.cct, 20) << "lock owner lost, restarting"
+ << dendl;
+ r = -ERESTART;
+ }
+ }
+
+ ldout(m_image_ctx.cct, 20) << "start " << op << " " << ctx << dendl;
+ ctx->complete(r);
+ if (finish_op_ctx != nullptr) {
+ finish_op_ctx->complete(0);
+ }
+ }));
+
+ std::unique_lock locker{m_queue_lock};
+ if (!m_in_flight_ops.insert(op).second) {
+ ldout(cct, 20) << __func__ << ": " << op << " in flight" << dendl;
+ m_queued_ops[op].push_back(ctx);
+ return;
+ }
+
+ ctx->complete(0);
+}
+
+template <typename I>
+void Operations<I>::finish_op(Operation op, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << __func__ << ": " << op << " r=" << r << dendl;
+
+ std::unique_lock locker{m_queue_lock};
+ auto &queue = m_queued_ops[op];
+ if (queue.empty()) {
+ m_in_flight_ops.erase(op);
+ return;
+ }
+
+ auto ctx = queue.front();
+ queue.pop_front();
+  // propagate -ERESTART through the rest of the queue
+ ctx->complete(r == -ERESTART ? r : 0);
+}
+
+template <typename I>
+int Operations<I>::flatten(ProgressContext &prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "flatten" << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ if (m_image_ctx.parent_md.spec.pool_id == -1) {
+ lderr(cct) << "image has no parent" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_FLATTEN,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false,
+ boost::bind(&Operations<I>::execute_flatten, this,
+ boost::ref(prog_ctx), _1),
+ boost::bind(&ImageWatcher<I>::notify_flatten,
+ m_image_ctx.image_watcher, request_id,
+ boost::ref(prog_ctx), _1));
+
+ if (r < 0 && r != -EINVAL) {
+ return r;
+ }
+ ldout(cct, 20) << "flatten finished" << dendl;
+ return 0;
+}
+
+template <typename I>
+void Operations<I>::execute_flatten(ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "flatten" << dendl;
+
+ if (m_image_ctx.read_only || m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+
+ // can't flatten a non-clone
+ if (m_image_ctx.parent_md.spec.pool_id == -1) {
+ lderr(cct) << "image has no parent" << dendl;
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ if (m_image_ctx.snap_id != CEPH_NOSNAP) {
+ lderr(cct) << "snapshots cannot be flattened" << dendl;
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ uint64_t overlap;
+ int r = m_image_ctx.get_parent_overlap(CEPH_NOSNAP, &overlap);
+ ceph_assert(r == 0);
+ ceph_assert(overlap <= m_image_ctx.size);
+
+ uint64_t overlap_objects = Striper::get_num_objects(m_image_ctx.layout,
+ overlap);
+
+ m_image_ctx.image_lock.unlock_shared();
+
+ operation::FlattenRequest<I> *req = new operation::FlattenRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), overlap_objects,
+ prog_ctx);
+ req->send();
+}
+
+template <typename I>
+int Operations<I>::rebuild_object_map(ProgressContext &prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "rebuild_object_map" << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_REBUILD_OBJECT_MAP,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+ boost::bind(&Operations<I>::execute_rebuild_object_map,
+ this, boost::ref(prog_ctx), _1),
+ boost::bind(&ImageWatcher<I>::notify_rebuild_object_map,
+ m_image_ctx.image_watcher, request_id,
+ boost::ref(prog_ctx), _1));
+
+  if (r < 0) {
+    return r;
+  }
+  ldout(cct, 10) << "rebuild object map finished" << dendl;
+  return 0;
+}
+
+template <typename I>
+void Operations<I>::execute_rebuild_object_map(ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ if (m_image_ctx.read_only || m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+ lderr(cct) << "image must support object-map feature" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ operation::RebuildObjectMapRequest<I> *req =
+ new operation::RebuildObjectMapRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), prog_ctx);
+ req->send();
+}
+
+template <typename I>
+int Operations<I>::check_object_map(ProgressContext &prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ r = invoke_async_request(OPERATION_CHECK_OBJECT_MAP,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+ boost::bind(&Operations<I>::check_object_map, this,
+ boost::ref(prog_ctx), _1),
+ [this](Context *c) {
+ m_image_ctx.op_work_queue->queue(c, -EOPNOTSUPP);
+ });
+
+ return r;
+}
+
+template <typename I>
+void Operations<I>::object_map_iterate(ProgressContext &prog_ctx,
+ operation::ObjectIterateWork<I> handle_mismatch,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ operation::ObjectMapIterateRequest<I> *req =
+ new operation::ObjectMapIterateRequest<I>(m_image_ctx, on_finish,
+ prog_ctx, handle_mismatch);
+ req->send();
+}
+
+template <typename I>
+void Operations<I>::check_object_map(ProgressContext &prog_ctx,
+ Context *on_finish) {
+ object_map_iterate(prog_ctx, needs_invalidate, on_finish);
+}
+
+template <typename I>
+int Operations<I>::rename(const char *dstname) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": dest_name=" << dstname
+ << dendl;
+
+ int r = librbd::detect_format(m_image_ctx.md_ctx, dstname, NULL, NULL);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error checking for existing image called "
+               << dstname << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (r == 0) {
+ lderr(cct) << "rbd image " << dstname << " already exists" << dendl;
+ return -EEXIST;
+ }
+
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_RENAME,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ true,
+ boost::bind(&Operations<I>::execute_rename, this,
+ dstname, _1),
+ boost::bind(&ImageWatcher<I>::notify_rename,
+ m_image_ctx.image_watcher, request_id,
+ dstname, _1));
+ if (r < 0 && r != -EEXIST) {
+ return r;
+ }
+
+ m_image_ctx.set_image_name(dstname);
+ return 0;
+}
+
+template <typename I>
+void Operations<I>::execute_rename(const std::string &dest_name,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+ }
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": dest_name=" << dest_name
+ << dendl;
+
+ if (m_image_ctx.old_format) {
+ m_image_ctx.image_lock.lock_shared();
+ if (m_image_ctx.name == dest_name) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EEXIST);
+ return;
+ }
+ m_image_ctx.image_lock.unlock_shared();
+
+    // the contexts below run in reverse order of construction: unregister
+    // the watch, perform the rename, re-register the watch, then send the
+    // update notification
+ on_finish = new C_NotifyUpdate<I>(m_image_ctx, on_finish);
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ if (m_image_ctx.old_format) {
+ m_image_ctx.image_watcher->set_oid(m_image_ctx.header_oid);
+ }
+ m_image_ctx.image_watcher->register_watch(on_finish);
+ });
+ on_finish = new LambdaContext([this, dest_name, on_finish](int r) {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ operation::RenameRequest<I> *req = new operation::RenameRequest<I>(
+ m_image_ctx, on_finish, dest_name);
+ req->send();
+ });
+ m_image_ctx.image_watcher->unregister_watch(on_finish);
+ return;
+ }
+ operation::RenameRequest<I> *req = new operation::RenameRequest<I>(
+ m_image_ctx, on_finish, dest_name);
+ req->send();
+}
+
+template <typename I>
+int Operations<I>::resize(uint64_t size, bool allow_shrink, ProgressContext& prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+
+ m_image_ctx.image_lock.lock_shared();
+ ldout(cct, 5) << this << " " << __func__ << ": "
+ << "size=" << m_image_ctx.size << ", "
+ << "new_size=" << size << dendl;
+ m_image_ctx.image_lock.unlock_shared();
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP) &&
+ !ObjectMap<>::is_compatible(m_image_ctx.layout, size)) {
+ lderr(cct) << "New size not compatible with object map" << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_RESIZE,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false,
+ boost::bind(&Operations<I>::execute_resize, this,
+ size, allow_shrink, boost::ref(prog_ctx), _1, 0),
+ boost::bind(&ImageWatcher<I>::notify_resize,
+ m_image_ctx.image_watcher, request_id,
+ size, allow_shrink, boost::ref(prog_ctx), _1));
+
+ m_image_ctx.perfcounter->inc(l_librbd_resize);
+ ldout(cct, 2) << "resize finished" << dendl;
+ return r;
+}
+
+template <typename I>
+void Operations<I>::execute_resize(uint64_t size, bool allow_shrink, ProgressContext &prog_ctx,
+ Context *on_finish,
+ uint64_t journal_op_tid) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ CephContext *cct = m_image_ctx.cct;
+ m_image_ctx.image_lock.lock_shared();
+ ldout(cct, 5) << this << " " << __func__ << ": "
+ << "size=" << m_image_ctx.size << ", "
+ << "new_size=" << size << dendl;
+
+ if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only ||
+ m_image_ctx.operations_disabled) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EROFS);
+ return;
+ } else if (m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+ m_image_ctx.image_lock) &&
+ !ObjectMap<>::is_compatible(m_image_ctx.layout, size)) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ m_image_ctx.image_lock.unlock_shared();
+
+ operation::ResizeRequest<I> *req = new operation::ResizeRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), size, allow_shrink,
+ prog_ctx, journal_op_tid, false);
+ req->send();
+}
+
+template <typename I>
+int Operations<I>::snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string& snap_name, uint64_t flags,
+ ProgressContext &prog_ctx) {
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond ctx;
+ snap_create(snap_namespace, snap_name, flags, prog_ctx, &ctx);
+ r = ctx.wait();
+
+ if (r < 0) {
+ return r;
+ }
+
+ m_image_ctx.perfcounter->inc(l_librbd_snap_create);
+ return r;
+}
+
+template <typename I>
+void Operations<I>::snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string& snap_name, uint64_t flags,
+ ProgressContext &prog_ctx, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ if (m_image_ctx.read_only) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+ if (m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EEXIST);
+ return;
+ }
+ m_image_ctx.image_lock.unlock_shared();
+
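+  // -EEXIST from the async request is filtered to success below, so a
+  // racing creator of the same snapshot name is treated as idempotent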
+ uint64_t request_id = util::reserve_async_request_id();
+ C_InvokeAsyncRequest<I> *req = new C_InvokeAsyncRequest<I>(
+ m_image_ctx, OPERATION_SNAP_CREATE,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true,
+ boost::bind(&Operations<I>::execute_snap_create, this, snap_namespace, snap_name,
+ _1, 0, flags, boost::ref(prog_ctx)),
+ boost::bind(&ImageWatcher<I>::notify_snap_create, m_image_ctx.image_watcher,
+ request_id, snap_namespace, snap_name, flags,
+ boost::ref(prog_ctx), _1),
+ {-EEXIST}, on_finish);
+ req->send();
+}
+
+template <typename I>
+void Operations<I>::execute_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish,
+ uint64_t journal_op_tid,
+ uint64_t flags,
+ ProgressContext &prog_ctx) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+ if (m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EEXIST);
+ return;
+ }
+ m_image_ctx.image_lock.unlock_shared();
+
+ operation::SnapshotCreateRequest<I> *req =
+ new operation::SnapshotCreateRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+ snap_namespace, snap_name, journal_op_tid, flags, prog_ctx);
+ req->send();
+}
+
+template <typename I>
+int Operations<I>::snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name,
+ ProgressContext& prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ C_SaferCond cond_ctx;
+ {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ {
+ // need to drop image_lock before invalidating cache
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ if (!m_image_ctx.snap_exists) {
+ return -ENOENT;
+ }
+
+ if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name);
+ if (snap_id == CEPH_NOSNAP) {
+        lderr(cct) << "no such snapshot found" << dendl;
+ return -ENOENT;
+ }
+ }
+
+ r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false);
+ if (r < 0) {
+ return r;
+ }
+
+ Context *ctx = new LambdaContext(
+ [this, ctx=&cond_ctx](int r) {
+ m_image_ctx.operations->finish_op(OPERATION_SNAP_ROLLBACK, r);
+ ctx->complete(r);
+ });
+ ctx = new LambdaContext(
+ [this, snap_namespace, snap_name, &prog_ctx, ctx](int r) {
+ if (r < 0) {
+ ctx->complete(r);
+ return;
+ }
+ std::shared_lock l{m_image_ctx.owner_lock};
+ execute_snap_rollback(snap_namespace, snap_name, prog_ctx, ctx);
+ });
+
+ m_image_ctx.operations->start_op(OPERATION_SNAP_ROLLBACK, ctx);
+ }
+
+ r = cond_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ m_image_ctx.perfcounter->inc(l_librbd_snap_rollback);
+ return r;
+}
+
+template <typename I>
+void Operations<I>::execute_snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ ProgressContext& prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+ uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name);
+ if (snap_id == CEPH_NOSNAP) {
+    lderr(cct) << "no such snapshot found" << dendl;
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-ENOENT);
+ return;
+ }
+
+ uint64_t new_size = m_image_ctx.get_image_size(snap_id);
+ m_image_ctx.image_lock.unlock_shared();
+
+ // async mode used for journal replay
+ operation::SnapshotRollbackRequest<I> *request =
+ new operation::SnapshotRollbackRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), snap_namespace, snap_name,
+ snap_id, new_size, prog_ctx);
+ request->send();
+}
+
+template <typename I>
+int Operations<I>::snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name) {
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond ctx;
+ snap_remove(snap_namespace, snap_name, &ctx);
+ r = ctx.wait();
+
+ if (r < 0) {
+ return r;
+ }
+
+ m_image_ctx.perfcounter->inc(l_librbd_snap_remove);
+ return 0;
+}
+
+template <typename I>
+void Operations<I>::snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ if (m_image_ctx.read_only) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ // quickly filter out duplicate ops
+ m_image_ctx.image_lock.lock_shared();
+ if (m_image_ctx.get_snap_id(snap_namespace, snap_name) == CEPH_NOSNAP) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-ENOENT);
+ return;
+ }
+
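+  // fast-diff and journaling require the exclusive lock owner to perform
+  // the removal (the object map and journal must be updated by the owner),
+  // so proxy the request in that case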
+ bool proxy_op = ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 ||
+ (m_image_ctx.features & RBD_FEATURE_JOURNALING) != 0);
+ m_image_ctx.image_lock.unlock_shared();
+
+ if (proxy_op) {
+ uint64_t request_id = util::reserve_async_request_id();
+ auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL;
+ if (cls::rbd::get_snap_namespace_type(snap_namespace) ==
+ cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) {
+ request_type = exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE;
+ }
+ C_InvokeAsyncRequest<I> *req = new C_InvokeAsyncRequest<I>(
+ m_image_ctx, OPERATION_SNAP_REMOVE, request_type, true,
+ boost::bind(&Operations<I>::execute_snap_remove, this, snap_namespace,
+ snap_name, _1),
+ boost::bind(&ImageWatcher<I>::notify_snap_remove,
+ m_image_ctx.image_watcher, request_id, snap_namespace,
+ snap_name, _1),
+ {-ENOENT}, on_finish);
+ req->send();
+ } else {
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ execute_snap_remove(snap_namespace, snap_name, on_finish);
+ }
+}
+
+template <typename I>
+void Operations<I>::execute_snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+    ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+                m_image_ctx.exclusive_lock->is_lock_owner());
+  }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+ uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name);
+ if (snap_id == CEPH_NOSNAP) {
+    lderr(cct) << "no such snapshot found" << dendl;
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-ENOENT);
+ return;
+ }
+
+ bool is_protected;
+ int r = m_image_ctx.is_snap_protected(snap_id, &is_protected);
+ if (r < 0) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(r);
+ return;
+ } else if (is_protected) {
+    lderr(cct) << "snapshot is protected" << dendl;
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EBUSY);
+ return;
+ }
+ m_image_ctx.image_lock.unlock_shared();
+
+ operation::SnapshotRemoveRequest<I> *req =
+ new operation::SnapshotRemoveRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+ snap_namespace, snap_name, snap_id);
+ req->send();
+}
+
+template <typename I>
+int Operations<I>::snap_rename(const char *srcname, const char *dstname) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": "
+ << "snap_name=" << srcname << ", "
+ << "new_snap_name=" << dstname << dendl;
+
+ snapid_t snap_id;
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ {
+ std::shared_lock l{m_image_ctx.image_lock};
+ snap_id = m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(), srcname);
+ if (snap_id == CEPH_NOSNAP) {
+ return -ENOENT;
+ }
+ if (m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(), dstname) != CEPH_NOSNAP) {
+ return -EEXIST;
+ }
+ }
+
+ if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_SNAP_RENAME,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ true,
+ boost::bind(&Operations<I>::execute_snap_rename,
+ this, snap_id, dstname, _1),
+ boost::bind(&ImageWatcher<I>::notify_snap_rename,
+ m_image_ctx.image_watcher, request_id,
+ snap_id, dstname, _1));
+ if (r < 0 && r != -EEXIST) {
+ return r;
+ }
+ } else {
+ C_SaferCond cond_ctx;
+ {
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ execute_snap_rename(snap_id, dstname, &cond_ctx);
+ }
+
+ r = cond_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ m_image_ctx.perfcounter->inc(l_librbd_snap_rename);
+ return 0;
+}
+
+template <typename I>
+void Operations<I>::execute_snap_rename(const uint64_t src_snap_id,
+ const std::string &dest_snap_name,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ if ((m_image_ctx.features & RBD_FEATURE_JOURNALING) != 0) {
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+ }
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+ if (m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(),
+ dest_snap_name) != CEPH_NOSNAP) {
+    // renaming is only supported for snapshots in the user namespace
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EEXIST);
+ return;
+ }
+ m_image_ctx.image_lock.unlock_shared();
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": "
+ << "snap_id=" << src_snap_id << ", "
+ << "new_snap_name=" << dest_snap_name << dendl;
+
+ operation::SnapshotRenameRequest<I> *req =
+ new operation::SnapshotRenameRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), src_snap_id,
+ dest_snap_name);
+ req->send();
+}
+
+template <typename I>
+int Operations<I>::snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ if (!m_image_ctx.test_features(RBD_FEATURE_LAYERING)) {
+ lderr(cct) << "image must support layering" << dendl;
+ return -ENOSYS;
+ }
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ bool is_protected;
+ r = m_image_ctx.is_snap_protected(m_image_ctx.get_snap_id(snap_namespace, snap_name),
+ &is_protected);
+ if (r < 0) {
+ return r;
+ }
+
+ if (is_protected) {
+ return -EBUSY;
+ }
+ }
+
+ if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_SNAP_PROTECT,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ true,
+ boost::bind(&Operations<I>::execute_snap_protect,
+ this, snap_namespace, snap_name, _1),
+ boost::bind(&ImageWatcher<I>::notify_snap_protect,
+ m_image_ctx.image_watcher, request_id,
+ snap_namespace, snap_name, _1));
+ if (r < 0 && r != -EBUSY) {
+ return r;
+ }
+ } else {
+ C_SaferCond cond_ctx;
+ {
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ execute_snap_protect(snap_namespace, snap_name, &cond_ctx);
+ }
+
+ r = cond_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ }
+ return 0;
+}
+
+template <typename I>
+void Operations<I>::execute_snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+ }
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+ bool is_protected;
+ int r = m_image_ctx.is_snap_protected(m_image_ctx.get_snap_id(snap_namespace, snap_name),
+ &is_protected);
+ if (r < 0) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(r);
+ return;
+ } else if (is_protected) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EBUSY);
+ return;
+ }
+ m_image_ctx.image_lock.unlock_shared();
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ operation::SnapshotProtectRequest<I> *request =
+ new operation::SnapshotProtectRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), snap_namespace, snap_name);
+ request->send();
+}
+
+template <typename I>
+int Operations<I>::snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ bool is_unprotected;
+ r = m_image_ctx.is_snap_unprotected(m_image_ctx.get_snap_id(snap_namespace, snap_name),
+ &is_unprotected);
+ if (r < 0) {
+ return r;
+ }
+
+ if (is_unprotected) {
+ return -EINVAL;
+ }
+ }
+
+ if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_SNAP_UNPROTECT,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ true,
+ boost::bind(&Operations<I>::execute_snap_unprotect,
+ this, snap_namespace, snap_name, _1),
+ boost::bind(&ImageWatcher<I>::notify_snap_unprotect,
+ m_image_ctx.image_watcher, request_id,
+ snap_namespace, snap_name, _1));
+ if (r < 0 && r != -EINVAL) {
+ return r;
+ }
+ } else {
+ C_SaferCond cond_ctx;
+ {
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ execute_snap_unprotect(snap_namespace, snap_name, &cond_ctx);
+ }
+
+ r = cond_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ }
+ return 0;
+}
+
+template <typename I>
+void Operations<I>::execute_snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+ }
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+ bool is_unprotected;
+ int r = m_image_ctx.is_snap_unprotected(m_image_ctx.get_snap_id(snap_namespace, snap_name),
+ &is_unprotected);
+ if (r < 0) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(r);
+ return;
+ } else if (is_unprotected) {
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ m_image_ctx.image_lock.unlock_shared();
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name
+ << dendl;
+
+ operation::SnapshotUnprotectRequest<I> *request =
+ new operation::SnapshotUnprotectRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), snap_namespace, snap_name);
+ request->send();
+}
+
+template <typename I>
+int Operations<I>::snap_set_limit(uint64_t limit) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": limit=" << limit << dendl;
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond limit_ctx;
+ {
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ true);
+ if (r < 0) {
+ return r;
+ }
+
+ execute_snap_set_limit(limit, &limit_ctx);
+ }
+
+ r = limit_ctx.wait();
+ return r;
+}
+
+template <typename I>
+void Operations<I>::execute_snap_set_limit(const uint64_t limit,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": limit=" << limit
+ << dendl;
+
+ operation::SnapshotLimitRequest<I> *request =
+ new operation::SnapshotLimitRequest<I>(m_image_ctx, on_finish, limit);
+ request->send();
+}
+
+template <typename I>
+int Operations<I>::update_features(uint64_t features, bool enabled) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": features=" << features
+ << ", enabled=" << enabled << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ } else if (m_image_ctx.old_format) {
+ lderr(cct) << "old-format images do not support features" << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t disable_mask = (RBD_FEATURES_MUTABLE |
+ RBD_FEATURES_DISABLE_ONLY);
+ if ((enabled && (features & RBD_FEATURES_MUTABLE) != features) ||
+ (!enabled && (features & disable_mask) != features) ||
+ ((features & ~RBD_FEATURES_MUTABLE_INTERNAL) != features)) {
+ lderr(cct) << "cannot update immutable features" << dendl;
+ return -EINVAL;
+ }
+
+ bool set_object_map = (features & RBD_FEATURE_OBJECT_MAP) == RBD_FEATURE_OBJECT_MAP;
+ bool set_fast_diff = (features & RBD_FEATURE_FAST_DIFF) == RBD_FEATURE_FAST_DIFF;
+ bool exist_fast_diff = (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0;
+ bool exist_object_map = (m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0;
+
+ if ((enabled && ((set_object_map && !exist_fast_diff) || (set_fast_diff && !exist_object_map)))
+ || (!enabled && (set_object_map && exist_fast_diff))) {
+ features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF);
+ }
+
+ if (features == 0) {
+ lderr(cct) << "update requires at least one feature" << dendl;
+ return -EINVAL;
+ }
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ if (enabled && (features & m_image_ctx.features) != 0) {
+ lderr(cct) << "one or more requested features are already enabled"
+ << dendl;
+ return -EINVAL;
+ }
+ if (!enabled && (features & ~m_image_ctx.features) != 0) {
+ lderr(cct) << "one or more requested features are already disabled"
+ << dendl;
+ return -EINVAL;
+ }
+ }
+
+ // if disabling journaling, avoid attempting to open the journal
+ // when acquiring the exclusive lock in case the journal is corrupt
+ bool disabling_journal = false;
+ if (!enabled && ((features & RBD_FEATURE_JOURNALING) != 0)) {
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ m_image_ctx.set_journal_policy(new journal::DisabledPolicy());
+ disabling_journal = true;
+ }
+ BOOST_SCOPE_EXIT_ALL( (this)(disabling_journal) ) {
+ if (disabling_journal) {
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ m_image_ctx.set_journal_policy(
+ new journal::StandardPolicy<I>(&m_image_ctx));
+ }
+ };
+
+ // The journal options are not passed to the lock owner in the
+ // update features request. Therefore, if journaling is being
+ // enabled, the lock should be locally acquired instead of
+ // attempting to send the request to the peer.
+ if (enabled && (features & RBD_FEATURE_JOURNALING) != 0) {
+ C_SaferCond cond_ctx;
+ {
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ true);
+ if (r < 0) {
+ return r;
+ }
+
+ execute_update_features(features, enabled, &cond_ctx, 0);
+ }
+
+ r = cond_ctx.wait();
+ } else {
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_UPDATE_FEATURES,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false,
+ boost::bind(&Operations<I>::execute_update_features,
+ this, features, enabled, _1, 0),
+ boost::bind(&ImageWatcher<I>::notify_update_features,
+ m_image_ctx.image_watcher, request_id,
+ features, enabled, _1));
+ }
+ ldout(cct, 2) << "update_features finished" << dendl;
+ return r;
+}
+
+template <typename I>
+void Operations<I>::execute_update_features(uint64_t features, bool enabled,
+ Context *on_finish,
+ uint64_t journal_op_tid) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": features=" << features
+ << ", enabled=" << enabled << dendl;
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ if (enabled) {
+ operation::EnableFeaturesRequest<I> *req =
+ new operation::EnableFeaturesRequest<I>(
+ m_image_ctx, on_finish, journal_op_tid, features);
+ req->send();
+ } else {
+ operation::DisableFeaturesRequest<I> *req =
+ new operation::DisableFeaturesRequest<I>(
+ m_image_ctx, on_finish, journal_op_tid, features, false);
+ req->send();
+ }
+}
+
+template <typename I>
+int Operations<I>::metadata_set(const std::string &key,
+ const std::string &value) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": key=" << key << ", value="
+ << value << dendl;
+
+ std::string config_key;
+ bool config_override = util::is_metadata_config_override(key, &config_key);
+ if (config_override) {
+ // validate config setting
+ if (!librbd::api::Config<I>::is_option_name(&m_image_ctx, config_key)) {
+ lderr(cct) << "validation for " << key
+                 << " failed: image-level override not allowed" << dendl;
+ return -EINVAL;
+ }
+ int r = ConfigProxy{false}.set_val(config_key.c_str(), value);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_METADATA_UPDATE,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false,
+ boost::bind(&Operations<I>::execute_metadata_set,
+ this, key, value, _1),
+ boost::bind(&ImageWatcher<I>::notify_metadata_set,
+ m_image_ctx.image_watcher, request_id,
+ key, value, _1));
+
+ if (config_override && r >= 0) {
+ // apply new config key immediately
+ r = m_image_ctx.state->refresh_if_required();
+ }
+
+ ldout(cct, 20) << "metadata_set finished" << dendl;
+ return r;
+}
+
+template <typename I>
+void Operations<I>::execute_metadata_set(const std::string &key,
+ const std::string &value,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": key=" << key << ", value="
+ << value << dendl;
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ operation::MetadataSetRequest<I> *request =
+ new operation::MetadataSetRequest<I>(m_image_ctx,
+ new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+ key, value);
+ request->send();
+}
+
+template <typename I>
+int Operations<I>::metadata_remove(const std::string &key) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": key=" << key << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ std::string value;
+ r = cls_client::metadata_get(&m_image_ctx.md_ctx, m_image_ctx.header_oid, key, &value);
+ if(r < 0)
+ return r;
+
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_METADATA_UPDATE,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false,
+ boost::bind(&Operations<I>::execute_metadata_remove,
+ this, key, _1),
+ boost::bind(&ImageWatcher<I>::notify_metadata_remove,
+ m_image_ctx.image_watcher, request_id,
+ key, _1));
+ if (r == -ENOENT) {
+ r = 0;
+ }
+
+ std::string config_key;
+ if (util::is_metadata_config_override(key, &config_key) && r >= 0) {
+ // apply new config key immediately
+ r = m_image_ctx.state->refresh_if_required();
+ }
+
+ ldout(cct, 20) << "metadata_remove finished" << dendl;
+ return r;
+}
+
+template <typename I>
+void Operations<I>::execute_metadata_remove(const std::string &key,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": key=" << key << dendl;
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ operation::MetadataRemoveRequest<I> *request =
+ new operation::MetadataRemoveRequest<I>(
+ m_image_ctx,
+ new C_NotifyUpdate<I>(m_image_ctx, on_finish), key);
+ request->send();
+}
+
+template <typename I>
+int Operations<I>::migrate(ProgressContext &prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "migrate" << dendl;
+
+ int r = m_image_ctx.state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_image_ctx.read_only) {
+ return -EROFS;
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ if (m_image_ctx.migration_info.empty()) {
+ lderr(cct) << "image has no migrating parent" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ uint64_t request_id = util::reserve_async_request_id();
+ r = invoke_async_request(OPERATION_MIGRATE,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false,
+ boost::bind(&Operations<I>::execute_migrate, this,
+ boost::ref(prog_ctx), _1),
+ boost::bind(&ImageWatcher<I>::notify_migrate,
+ m_image_ctx.image_watcher, request_id,
+ boost::ref(prog_ctx), _1));
+
+ if (r < 0 && r != -EINVAL) {
+ return r;
+ }
+ ldout(cct, 20) << "migrate finished" << dendl;
+ return 0;
+}
+
+template <typename I>
+void Operations<I>::execute_migrate(ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "migrate" << dendl;
+
+ if (m_image_ctx.read_only || m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.lock_shared();
+
+ if (m_image_ctx.migration_info.empty()) {
+ lderr(cct) << "image has no migrating parent" << dendl;
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ if (m_image_ctx.snap_id != CEPH_NOSNAP) {
+ lderr(cct) << "snapshots cannot be migrated" << dendl;
+ m_image_ctx.image_lock.unlock_shared();
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ m_image_ctx.image_lock.unlock_shared();
+
+ operation::MigrateRequest<I> *req = new operation::MigrateRequest<I>(
+ m_image_ctx, new C_NotifyUpdate<I>(m_image_ctx, on_finish), prog_ctx);
+ req->send();
+}
+
+template <typename I>
+int Operations<I>::sparsify(size_t sparse_size, ProgressContext &prog_ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "sparsify" << dendl;
+
+ if (sparse_size < 4096 || sparse_size > m_image_ctx.get_object_size() ||
+ (sparse_size & (sparse_size - 1)) != 0) {
+ lderr(cct) << "sparse size should be power of two not less than 4096"
+ << " and not larger image object size" << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t request_id = util::reserve_async_request_id();
+ int r = invoke_async_request(OPERATION_SPARSIFY,
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL,
+ false,
+ boost::bind(&Operations<I>::execute_sparsify,
+ this, sparse_size,
+ boost::ref(prog_ctx), _1),
+ boost::bind(&ImageWatcher<I>::notify_sparsify,
+ m_image_ctx.image_watcher,
+ request_id, sparse_size,
+ boost::ref(prog_ctx), _1));
+ if (r < 0 && r != -EINVAL) {
+ return r;
+ }
+  ldout(cct, 20) << "sparsify finished" << dendl;
+ return 0;
+}
+
+template <typename I>
+void Operations<I>::execute_sparsify(size_t sparse_size,
+ ProgressContext &prog_ctx,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "sparsify" << dendl;
+
+ if (m_image_ctx.operations_disabled) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+
+ auto req = new operation::SparsifyRequest<I>(
+ m_image_ctx, sparse_size, new C_NotifyUpdate<I>(m_image_ctx, on_finish),
+ prog_ctx);
+ req->send();
+}
+
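+// temporarily drop the caller's shared owner_lock to (try to) acquire the
+// exclusive lock, re-taking the shared lock before returning. External
+// lock requests are blocked for the duration of the attempt so a remote
+// client cannot grab the lock mid-acquisition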
+template <typename I>
+int Operations<I>::prepare_image_update(
+ exclusive_lock::OperationRequestType request_type, bool request_lock) {
+ ceph_assert(ceph_mutex_is_rlocked(m_image_ctx.owner_lock));
+ if (m_image_ctx.image_watcher == nullptr) {
+ return -EROFS;
+ }
+
+ // need to upgrade to a write lock
+ C_SaferCond ctx;
+ m_image_ctx.owner_lock.unlock_shared();
+ bool attempting_lock = false;
+ {
+ std::unique_lock owner_locker{m_image_ctx.owner_lock};
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ (!m_image_ctx.exclusive_lock->is_lock_owner() ||
+ !m_image_ctx.exclusive_lock->accept_request(request_type, nullptr))) {
+
+ attempting_lock = true;
+ m_image_ctx.exclusive_lock->block_requests(0);
+
+ if (request_lock) {
+ m_image_ctx.exclusive_lock->acquire_lock(&ctx);
+ } else {
+ m_image_ctx.exclusive_lock->try_acquire_lock(&ctx);
+ }
+ }
+ }
+
+ int r = 0;
+ if (attempting_lock) {
+ r = ctx.wait();
+ }
+
+ m_image_ctx.owner_lock.lock_shared();
+ if (attempting_lock && m_image_ctx.exclusive_lock != nullptr) {
+ m_image_ctx.exclusive_lock->unblock_requests();
+ }
+
+ if (r == -EAGAIN || r == -EBUSY) {
+ r = 0;
+ }
+ if (r < 0) {
+ return r;
+ } else if (m_image_ctx.exclusive_lock != nullptr &&
+ !m_image_ctx.exclusive_lock->is_lock_owner()) {
+ return m_image_ctx.exclusive_lock->get_unlocked_op_error();
+ }
+
+ return 0;
+}
+
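+// synchronous wrapper around C_InvokeAsyncRequest: executes the operation
+// locally when this client owns (or can acquire) the exclusive lock,
+// otherwise proxies it to the current lock owner, blocking until the
+// request completes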
+template <typename I>
+int Operations<I>::invoke_async_request(
+ Operation op, exclusive_lock::OperationRequestType request_type,
+ bool permit_snapshot, const boost::function<void(Context*)>& local_request,
+ const boost::function<void(Context*)>& remote_request) {
+ C_SaferCond ctx;
+ C_InvokeAsyncRequest<I> *req = new C_InvokeAsyncRequest<I>(m_image_ctx, op,
+ request_type,
+ permit_snapshot,
+ local_request,
+ remote_request,
+ {}, &ctx);
+ req->send();
+ return ctx.wait();
+}
+
+} // namespace librbd
+
+template class librbd::Operations<librbd::ImageCtx>;
diff --git a/src/librbd/Operations.h b/src/librbd/Operations.h
new file mode 100644
index 000000000..52d1484e7
--- /dev/null
+++ b/src/librbd/Operations.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATIONS_H
+#define CEPH_LIBRBD_OPERATIONS_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/int_types.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/operation/ObjectMapIterate.h"
+#include <atomic>
+#include <string>
+#include <list>
+#include <map>
+#include <set>
+#include <boost/function.hpp>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+enum Operation {
+ OPERATION_CHECK_OBJECT_MAP,
+ OPERATION_FLATTEN,
+ OPERATION_METADATA_UPDATE,
+ OPERATION_MIGRATE,
+ OPERATION_REBUILD_OBJECT_MAP,
+ OPERATION_RENAME,
+ OPERATION_RESIZE,
+ OPERATION_SNAP_CREATE,
+ OPERATION_SNAP_PROTECT,
+ OPERATION_SNAP_REMOVE,
+ OPERATION_SNAP_RENAME,
+ OPERATION_SNAP_ROLLBACK,
+ OPERATION_SNAP_UNPROTECT,
+ OPERATION_SPARSIFY,
+ OPERATION_UPDATE_FEATURES,
+};
+
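+// entry point for all maintenance operations on an image: each public
+// method typically has a synchronous form plus an execute_*() form that
+// assumes the caller holds owner_lock (shared) and, where required, is
+// the exclusive lock owner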
+template <typename ImageCtxT = ImageCtx>
+class Operations {
+public:
+ Operations(ImageCtxT &image_ctx);
+
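+  // serializes operations of the same type: start_op() defers the context
+  // while another instance of op is in flight; finish_op() completes the
+  // next queued context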
+ void start_op(enum Operation op, Context *ctx);
+ void finish_op(enum Operation op, int r);
+
+ int flatten(ProgressContext &prog_ctx);
+ void execute_flatten(ProgressContext &prog_ctx, Context *on_finish);
+
+ int rebuild_object_map(ProgressContext &prog_ctx);
+ void execute_rebuild_object_map(ProgressContext &prog_ctx,
+ Context *on_finish);
+
+ int check_object_map(ProgressContext &prog_ctx);
+ void check_object_map(ProgressContext &prog_ctx, Context *on_finish);
+
+ void object_map_iterate(ProgressContext &prog_ctx,
+ operation::ObjectIterateWork<ImageCtxT> handle_mismatch,
+ Context* on_finish);
+
+ int rename(const char *dstname);
+ void execute_rename(const std::string &dest_name, Context *on_finish);
+
+ int resize(uint64_t size, bool allow_shrink, ProgressContext& prog_ctx);
+ void execute_resize(uint64_t size, bool allow_shrink, ProgressContext &prog_ctx,
+ Context *on_finish, uint64_t journal_op_tid);
+
+ int snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string& snap_name, uint64_t flags,
+ ProgressContext& prog_ctx);
+ void snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string& snap_name, uint64_t flags,
+ ProgressContext& prog_ctx, Context *on_finish);
+ void execute_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name, Context *on_finish,
+ uint64_t journal_op_tid, uint64_t flags,
+ ProgressContext &prog_ctx);
+
+ int snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name,
+ ProgressContext& prog_ctx);
+ void execute_snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ ProgressContext& prog_ctx, Context *on_finish);
+
+ int snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name);
+ void snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name,
+ Context *on_finish);
+ void execute_snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+
+ int snap_rename(const char *srcname, const char *dstname);
+ void execute_snap_rename(const uint64_t src_snap_id,
+ const std::string &dest_snap_name,
+ Context *on_finish);
+
+ int snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name);
+ void execute_snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+
+ int snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name);
+ void execute_snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ Context *on_finish);
+
+ int snap_set_limit(uint64_t limit);
+ void execute_snap_set_limit(uint64_t limit, Context *on_finish);
+
+ int update_features(uint64_t features, bool enabled);
+ void execute_update_features(uint64_t features, bool enabled,
+ Context *on_finish, uint64_t journal_op_tid);
+
+ int metadata_set(const std::string &key, const std::string &value);
+ void execute_metadata_set(const std::string &key, const std::string &value,
+ Context *on_finish);
+
+ int metadata_remove(const std::string &key);
+ void execute_metadata_remove(const std::string &key, Context *on_finish);
+
+ int migrate(ProgressContext &prog_ctx);
+ void execute_migrate(ProgressContext &prog_ctx, Context *on_finish);
+
+ int sparsify(size_t sparse_size, ProgressContext &prog_ctx);
+ void execute_sparsify(size_t sparse_size, ProgressContext &prog_ctx,
+ Context *on_finish);
+
+ int prepare_image_update(exclusive_lock::OperationRequestType request_type,
+ bool request_lock);
+
+private:
+ ImageCtxT &m_image_ctx;
+
+ mutable ceph::mutex m_queue_lock;
+ std::set<Operation> m_in_flight_ops;
+ std::map<Operation, std::list<Context *>> m_queued_ops;
+
+ int invoke_async_request(Operation op,
+ exclusive_lock::OperationRequestType request_type,
+ bool permit_snapshot,
+ const boost::function<void(Context*)>& local,
+ const boost::function<void(Context*)>& remote);
+};
+
+} // namespace librbd
+
+extern template class librbd::Operations<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATIONS_H
diff --git a/src/librbd/PluginRegistry.cc b/src/librbd/PluginRegistry.cc
new file mode 100644
index 000000000..6ddf0a414
--- /dev/null
+++ b/src/librbd/PluginRegistry.cc
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/PluginRegistry.h"
+#include "include/Context.h"
+#include "common/dout.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/plugin/Api.h"
+#include <boost/tokenizer.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::PluginRegistry: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+
+template <typename I>
+PluginRegistry<I>::PluginRegistry(I* image_ctx)
+ : m_image_ctx(image_ctx), m_plugin_api(std::make_unique<plugin::Api<I>>()),
+ m_image_writeback(std::make_unique<cache::ImageWriteback<I>>(*image_ctx)) {
+}
+
+template <typename I>
+PluginRegistry<I>::~PluginRegistry() {
+}
+
+template <typename I>
+void PluginRegistry<I>::init(const std::string& plugins, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ auto plugin_registry = cct->get_plugin_registry();
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+
+ boost::tokenizer<boost::escaped_list_separator<char>> tokenizer(plugins);
+ for (auto token : tokenizer) {
+ ldout(cct, 5) << "attempting to load plugin: " << token << dendl;
+
+ auto ctx = gather_ctx->new_sub();
+
+ auto plugin = dynamic_cast<plugin::Interface<I>*>(
+ plugin_registry->get_with_load("librbd", "librbd_" + token));
+ if (plugin == nullptr) {
+ lderr(cct) << "failed to load plugin: " << token << dendl;
+ ctx->complete(-ENOSYS);
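+      // a failed load aborts initialization of the remaining plugins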
+ break;
+ }
+
+ plugin->init(
+ m_image_ctx, *m_plugin_api, *m_image_writeback, m_plugin_hook_points, ctx);
+ }
+
+ gather_ctx->activate();
+}
+
+template <typename I>
+void PluginRegistry<I>::acquired_exclusive_lock(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+
+ for (auto &hook : m_plugin_hook_points) {
+ auto ctx = gather_ctx->new_sub();
+ hook->acquired_exclusive_lock(ctx);
+ }
+ gather_ctx->activate();
+}
+
+template <typename I>
+void PluginRegistry<I>::prerelease_exclusive_lock(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+
+ for (auto &hook : m_plugin_hook_points) {
+ auto ctx = gather_ctx->new_sub();
+ hook->prerelease_exclusive_lock(ctx);
+ }
+ gather_ctx->activate();
+}
+
+template <typename I>
+void PluginRegistry<I>::discard(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+
+ for (auto &hook : m_plugin_hook_points) {
+ auto ctx = gather_ctx->new_sub();
+ hook->discard(ctx);
+ }
+ gather_ctx->activate();
+}
+
+} // namespace librbd
+
+template class librbd::PluginRegistry<librbd::ImageCtx>;
diff --git a/src/librbd/PluginRegistry.h b/src/librbd/PluginRegistry.h
new file mode 100644
index 000000000..92e183ce1
--- /dev/null
+++ b/src/librbd/PluginRegistry.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_PLUGIN_REGISTRY_H
+#define CEPH_LIBRBD_PLUGIN_REGISTRY_H
+
+#include "librbd/plugin/Types.h"
+#include <memory>
+#include <string>
+#include <list>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+class ImageWritebackInterface;
+}
+
+namespace plugin { template <typename> struct Api; }
+
+template <typename ImageCtxT>
+class PluginRegistry {
+public:
+ PluginRegistry(ImageCtxT* image_ctx);
+ ~PluginRegistry();
+
+ void init(const std::string& plugins, Context* on_finish);
+
+ void acquired_exclusive_lock(Context* on_finish);
+ void prerelease_exclusive_lock(Context* on_finish);
+ void discard(Context* on_finish);
+
+private:
+ ImageCtxT* m_image_ctx;
+ std::unique_ptr<plugin::Api<ImageCtxT>> m_plugin_api;
+ std::unique_ptr<cache::ImageWritebackInterface> m_image_writeback;
+
+ std::string m_plugins;
+
+ plugin::PluginHookPoints m_plugin_hook_points;
+
+};
+
+} // namespace librbd
+
+extern template class librbd::PluginRegistry<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_PLUGIN_REGISTRY_H
diff --git a/src/librbd/TaskFinisher.h b/src/librbd/TaskFinisher.h
new file mode 100644
index 000000000..65e7da4a6
--- /dev/null
+++ b/src/librbd/TaskFinisher.h
@@ -0,0 +1,179 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef LIBRBD_TASK_FINISHER_H
+#define LIBRBD_TASK_FINISHER_H
+
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include "common/ceph_context.h"
+#include "common/Finisher.h"
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include <map>
+#include <utility>
+
+
+namespace librbd {
+
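+// per-CephContext singleton providing the timer and finisher threads shared
+// by all TaskFinisher instances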
+struct TaskFinisherSingleton {
+ ceph::mutex m_lock = ceph::make_mutex("librbd::TaskFinisher::m_lock");
+ SafeTimer *m_safe_timer;
+ Finisher *m_finisher;
+
+ static TaskFinisherSingleton& get_singleton(CephContext* cct) {
+ return cct->lookup_or_create_singleton_object<
+ TaskFinisherSingleton>("librbd::TaskFinisherSingleton", false, cct);
+ }
+
+ explicit TaskFinisherSingleton(CephContext *cct) {
+ m_safe_timer = new SafeTimer(cct, m_lock, false);
+ m_safe_timer->init();
+ m_finisher = new Finisher(cct, "librbd::TaskFinisher::m_finisher", "taskfin_librbd");
+ m_finisher->start();
+ }
+ virtual ~TaskFinisherSingleton() {
+ {
+ std::lock_guard l{m_lock};
+ m_safe_timer->shutdown();
+ delete m_safe_timer;
+ }
+ m_finisher->wait_for_empty();
+ m_finisher->stop();
+ delete m_finisher;
+ }
+
+ void queue(Context* ctx, int r) {
+ m_finisher->queue(ctx, r);
+ }
+};
+
+
+template <typename Task>
+class TaskFinisher {
+public:
+ TaskFinisher(CephContext &cct) : m_cct(cct) {
+ auto& singleton = TaskFinisherSingleton::get_singleton(&cct);
+ m_lock = &singleton.m_lock;
+ m_safe_timer = singleton.m_safe_timer;
+ m_finisher = singleton.m_finisher;
+ }
+
+ bool cancel(const Task& task) {
+ std::lock_guard l{*m_lock};
+ typename TaskContexts::iterator it = m_task_contexts.find(task);
+ if (it == m_task_contexts.end()) {
+ return false;
+ }
+ it->second.first->complete(-ECANCELED);
+ m_safe_timer->cancel_event(it->second.second);
+ m_task_contexts.erase(it);
+ return true;
+ }
+
+ void cancel_all() {
+ std::lock_guard l{*m_lock};
+ for (auto &[task, pair] : m_task_contexts) {
+ pair.first->complete(-ECANCELED);
+ m_safe_timer->cancel_event(pair.second);
+ }
+ m_task_contexts.clear();
+ }
+
+ bool add_event_after(const Task& task, double seconds, Context *ctx) {
+ std::lock_guard l{*m_lock};
+ if (m_task_contexts.count(task) != 0) {
+ // task already scheduled on finisher or timer
+ delete ctx;
+ return false;
+ }
+ C_Task *timer_ctx = new C_Task(this, task);
+ m_task_contexts[task] = std::make_pair(ctx, timer_ctx);
+
+ m_safe_timer->add_event_after(seconds, timer_ctx);
+ return true;
+ }
+
+ bool reschedule_event_after(const Task& task, double seconds) {
+ std::lock_guard l{*m_lock};
+ auto it = m_task_contexts.find(task);
+ if (it == m_task_contexts.end()) {
+ return false;
+ }
+ bool canceled = m_safe_timer->cancel_event(it->second.second);
+ if (!canceled) {
+ return false;
+ }
+ auto timer_ctx = new C_Task(this, task);
+ it->second.second = timer_ctx;
+ m_safe_timer->add_event_after(seconds, timer_ctx);
+ return true;
+ }
+
+ void queue(Context *ctx, int r = 0) {
+ m_finisher->queue(ctx, r);
+ }
+
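+  // queue the task for immediate execution, cancelling any pending timer
+  // event for it; returns false if the task is already on the finisher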
+ bool queue(const Task& task, Context *ctx) {
+ std::lock_guard l{*m_lock};
+ typename TaskContexts::iterator it = m_task_contexts.find(task);
+ if (it != m_task_contexts.end()) {
+ if (it->second.second != NULL &&
+ m_safe_timer->cancel_event(it->second.second)) {
+ it->second.first->complete(-ECANCELED);
+ } else {
+ // task already scheduled on the finisher
+ ctx->complete(-ECANCELED);
+ return false;
+ }
+ }
+  m_task_contexts[task] = std::make_pair(ctx, static_cast<Context *>(nullptr));
+
+ m_finisher->queue(new C_Task(this, task));
+ return true;
+ }
+
+private:
+ class C_Task : public Context {
+ public:
+ C_Task(TaskFinisher *task_finisher, const Task& task)
+ : m_task_finisher(task_finisher), m_task(task)
+ {
+ }
+ protected:
+ void finish(int r) override {
+ m_task_finisher->complete(m_task);
+ }
+ private:
+ TaskFinisher *m_task_finisher;
+ Task m_task;
+ };
+
+ CephContext &m_cct;
+
+ ceph::mutex *m_lock;
+ Finisher *m_finisher;
+ SafeTimer *m_safe_timer;
+
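+  // task -> (queued callback, pending timer event or null)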
+ typedef std::map<Task, std::pair<Context *, Context *> > TaskContexts;
+ TaskContexts m_task_contexts;
+
+ void complete(const Task& task) {
+ Context *ctx = NULL;
+ {
+ std::lock_guard l{*m_lock};
+ typename TaskContexts::iterator it = m_task_contexts.find(task);
+ if (it != m_task_contexts.end()) {
+ ctx = it->second.first;
+ m_task_contexts.erase(it);
+ }
+ }
+
+ if (ctx != NULL) {
+ ctx->complete(0);
+ }
+ }
+};
+
+} // namespace librbd
+
+#endif // LIBRBD_TASK_FINISHER_H
diff --git a/src/librbd/TrashWatcher.cc b/src/librbd/TrashWatcher.cc
new file mode 100644
index 000000000..75d588205
--- /dev/null
+++ b/src/librbd/TrashWatcher.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/TrashWatcher.h"
+#include "include/rbd_types.h"
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/watcher/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::TrashWatcher: " << __func__ << ": "
+
+namespace librbd {
+
+using namespace trash_watcher;
+using namespace watcher;
+
+using librbd::util::create_rados_callback;
+
+namespace {
+
+static const uint64_t NOTIFY_TIMEOUT_MS = 5000;
+
+} // anonymous namespace
+
+template <typename I>
+TrashWatcher<I>::TrashWatcher(librados::IoCtx &io_ctx,
+ asio::ContextWQ *work_queue)
+ : Watcher(io_ctx, work_queue, RBD_TRASH) {
+}
+
+template <typename I>
+void TrashWatcher<I>::notify_image_added(
+ librados::IoCtx &io_ctx, const std::string& image_id,
+ const cls::rbd::TrashImageSpec& trash_image_spec, Context *on_finish) {
+ CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ bufferlist bl;
+ encode(NotifyMessage{ImageAddedPayload{image_id, trash_image_spec}}, bl);
+
+ librados::AioCompletion *comp = create_rados_callback(on_finish);
+ int r = io_ctx.aio_notify(RBD_TRASH, comp, bl, NOTIFY_TIMEOUT_MS, nullptr);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void TrashWatcher<I>::notify_image_removed(librados::IoCtx &io_ctx,
+ const std::string& image_id,
+ Context *on_finish) {
+ CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ bufferlist bl;
+ encode(NotifyMessage{ImageRemovedPayload{image_id}}, bl);
+
+ librados::AioCompletion *comp = create_rados_callback(on_finish);
+ int r = io_ctx.aio_notify(RBD_TRASH, comp, bl, NOTIFY_TIMEOUT_MS, nullptr);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) {
+ CephContext *cct = this->m_cct;
+ ldout(cct, 15) << "notify_id=" << notify_id << ", "
+ << "handle=" << handle << dendl;
+
+
+ NotifyMessage notify_message;
+ try {
+ auto iter = bl.cbegin();
+ decode(notify_message, iter);
+ } catch (const buffer::error &err) {
+ lderr(cct) << "error decoding image notification: " << err.what()
+ << dendl;
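+    // acknowledge the notification even though the payload is unusable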
+ Context *ctx = new C_NotifyAck(this, notify_id, handle);
+ ctx->complete(0);
+ return;
+ }
+
+ apply_visitor(watcher::util::HandlePayloadVisitor<TrashWatcher<I>>(
+ this, notify_id, handle), notify_message.payload);
+}
+
+template <typename I>
+bool TrashWatcher<I>::handle_payload(const ImageAddedPayload &payload,
+ Context *on_notify_ack) {
+ CephContext *cct = this->m_cct;
+ ldout(cct, 20) << dendl;
+ handle_image_added(payload.image_id, payload.trash_image_spec);
+ return true;
+}
+
+template <typename I>
+bool TrashWatcher<I>::handle_payload(const ImageRemovedPayload &payload,
+ Context *on_notify_ack) {
+ CephContext *cct = this->m_cct;
+ ldout(cct, 20) << dendl;
+ handle_image_removed(payload.image_id);
+ return true;
+}
+
+template <typename I>
+bool TrashWatcher<I>::handle_payload(const UnknownPayload &payload,
+ Context *on_notify_ack) {
+ return true;
+}
+
+} // namespace librbd
+
+template class librbd::TrashWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/TrashWatcher.h b/src/librbd/TrashWatcher.h
new file mode 100644
index 000000000..684eaf4f5
--- /dev/null
+++ b/src/librbd/TrashWatcher.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_WATCHER_H
+#define CEPH_LIBRBD_TRASH_WATCHER_H
+
+#include "include/int_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Watcher.h"
+#include "librbd/trash_watcher/Types.h"
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+namespace watcher {
+namespace util {
+template <typename> struct HandlePayloadVisitor;
+} // namespace util
+} // namespace watcher
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class TrashWatcher : public Watcher {
+ friend struct watcher::util::HandlePayloadVisitor<TrashWatcher<ImageCtxT>>;
+public:
+ TrashWatcher(librados::IoCtx &io_ctx, asio::ContextWQ *work_queue);
+
+ static void notify_image_added(librados::IoCtx &io_ctx,
+ const std::string& image_id,
+ const cls::rbd::TrashImageSpec& spec,
+ Context *on_finish);
+ static void notify_image_removed(librados::IoCtx &io_ctx,
+ const std::string& image_id,
+ Context *on_finish);
+
+protected:
+ virtual void handle_image_added(const std::string &image_id,
+ const cls::rbd::TrashImageSpec& spec) = 0;
+ virtual void handle_image_removed(const std::string &image_id) = 0;
+
+private:
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+
+ bool handle_payload(const trash_watcher::ImageAddedPayload &payload,
+ Context *on_notify_ack);
+ bool handle_payload(const trash_watcher::ImageRemovedPayload &payload,
+ Context *on_notify_ack);
+ bool handle_payload(const trash_watcher::UnknownPayload &payload,
+ Context *on_notify_ack);
+};
+
+} // namespace librbd
+
+extern template class librbd::TrashWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_TRASH_WATCHER_H
diff --git a/src/librbd/Types.h b/src/librbd/Types.h
new file mode 100644
index 000000000..f1c7d6c5d
--- /dev/null
+++ b/src/librbd/Types.h
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_TYPES_H
+#define LIBRBD_TYPES_H
+
+#include "include/types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "deep_copy/Types.h"
+#include <map>
+#include <memory>
+#include <string>
+
+namespace neorados { class IOContext; }
+
+namespace librbd {
+
+// Performance counters
+enum {
+ l_librbd_first = 26000,
+
+ l_librbd_rd, // read ops
+ l_librbd_rd_bytes, // bytes read
+ l_librbd_rd_latency, // average latency
+ l_librbd_wr,
+ l_librbd_wr_bytes,
+ l_librbd_wr_latency,
+ l_librbd_discard,
+ l_librbd_discard_bytes,
+ l_librbd_discard_latency,
+ l_librbd_flush,
+ l_librbd_flush_latency,
+
+ l_librbd_ws,
+ l_librbd_ws_bytes,
+ l_librbd_ws_latency,
+
+ l_librbd_cmp,
+ l_librbd_cmp_bytes,
+ l_librbd_cmp_latency,
+
+ l_librbd_snap_create,
+ l_librbd_snap_remove,
+ l_librbd_snap_rollback,
+ l_librbd_snap_rename,
+
+ l_librbd_notify,
+ l_librbd_resize,
+
+ l_librbd_readahead,
+ l_librbd_readahead_bytes,
+
+ l_librbd_invalidate_cache,
+
+ l_librbd_opened_time,
+ l_librbd_lock_acquired_time,
+
+ l_librbd_last,
+};
+
+typedef std::shared_ptr<neorados::IOContext> IOContext;
+
+typedef std::map<uint64_t, uint64_t> SnapSeqs;
+
+/// Full information about an image's parent.
+struct ParentImageInfo {
+ /// Identification of the parent.
+ cls::rbd::ParentImageSpec spec;
+
+ /** @brief Where the portion of data shared with the child image ends.
+ * Since images can be resized multiple times, the portion of data shared
+ * with the child image is not necessarily min(parent size, child size).
+ * If the child image is first shrunk and then enlarged, the common portion
+ * will be shorter. */
+ uint64_t overlap = 0;
+};
+
+struct SnapInfo {
+ std::string name;
+ cls::rbd::SnapshotNamespace snap_namespace;
+ uint64_t size;
+ ParentImageInfo parent;
+ uint8_t protection_status;
+ uint64_t flags;
+ utime_t timestamp;
+ SnapInfo(std::string _name,
+ const cls::rbd::SnapshotNamespace &_snap_namespace,
+ uint64_t _size, const ParentImageInfo &_parent,
+ uint8_t _protection_status, uint64_t _flags, utime_t _timestamp)
+ : name(_name), snap_namespace(_snap_namespace), size(_size),
+ parent(_parent), protection_status(_protection_status), flags(_flags),
+ timestamp(_timestamp) {
+ }
+};
+
+enum {
+ OPEN_FLAG_SKIP_OPEN_PARENT = 1 << 0,
+ OPEN_FLAG_OLD_FORMAT = 1 << 1,
+ OPEN_FLAG_IGNORE_MIGRATING = 1 << 2
+};
+
+enum ImageReadOnlyFlag {
+ IMAGE_READ_ONLY_FLAG_USER = 1 << 0,
+ IMAGE_READ_ONLY_FLAG_NON_PRIMARY = 1 << 1,
+};
+
+enum SnapCreateFlag {
+ SNAP_CREATE_FLAG_SKIP_OBJECT_MAP = 1 << 0,
+ SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE = 1 << 1,
+ SNAP_CREATE_FLAG_IGNORE_NOTIFY_QUIESCE_ERROR = 1 << 2,
+};
+
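+/// Details of an in-progress migration; empty() when no migration is active.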
+struct MigrationInfo {
+ int64_t pool_id = -1;
+ std::string pool_namespace;
+ std::string image_name;
+ std::string image_id;
+ std::string source_spec;
+ deep_copy::SnapMap snap_map;
+ uint64_t overlap = 0;
+ bool flatten = false;
+
+ MigrationInfo() {
+ }
+ MigrationInfo(int64_t pool_id, const std::string& pool_namespace,
+ const std::string& image_name, const std::string& image_id,
+ const std::string& source_spec,
+ const deep_copy::SnapMap &snap_map, uint64_t overlap,
+ bool flatten)
+ : pool_id(pool_id), pool_namespace(pool_namespace), image_name(image_name),
+ image_id(image_id), source_spec(source_spec), snap_map(snap_map),
+ overlap(overlap), flatten(flatten) {
+ }
+
+ bool empty() const {
+ return (pool_id == -1 && source_spec.empty());
+ }
+};
+
+} // namespace librbd
+
+#endif // LIBRBD_TYPES_H
diff --git a/src/librbd/Utils.cc b/src/librbd/Utils.cc
new file mode 100644
index 000000000..75fe7b1a7
--- /dev/null
+++ b/src/librbd/Utils.cc
@@ -0,0 +1,246 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "librbd/Utils.h"
+#include "include/random.h"
+#include "include/rbd_types.h"
+#include "include/stringify.h"
+#include "include/neorados/RADOS.hpp"
+#include "include/rbd/features.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Features.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <bitset>
+#include <random>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::util::" << __func__ << ": "
+
+namespace librbd {
+namespace util {
+namespace {
+
+const std::string CONFIG_KEY_URI_PREFIX{"config://"};
+
+} // anonymous namespace
+
+const std::string group_header_name(const std::string &group_id)
+{
+ return RBD_GROUP_HEADER_PREFIX + group_id;
+}
+
+const std::string id_obj_name(const std::string &name)
+{
+ return RBD_ID_PREFIX + name;
+}
+
+const std::string header_name(const std::string &image_id)
+{
+ return RBD_HEADER_PREFIX + image_id;
+}
+
+const std::string old_header_name(const std::string &image_name)
+{
+ return image_name + RBD_SUFFIX;
+}
+
+std::string unique_lock_name(const std::string &name, void *address) {
+ return name + " (" + stringify(address) + ")";
+}
+
+librados::AioCompletion *create_rados_callback(Context *on_finish) {
+ return create_rados_callback<Context, &Context::complete>(on_finish);
+}
+
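+// build an image id from the RADOS instance id plus 32 random bits, trimmed
+// to fit the fixed-size object name prefix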
+std::string generate_image_id(librados::IoCtx &ioctx) {
+ librados::Rados rados(ioctx);
+
+ uint64_t bid = rados.get_instance_id();
+ std::mt19937 generator{random_device_t{}()};
+ std::uniform_int_distribution<uint32_t> distribution{0, 0xFFFFFFFF};
+ uint32_t extra = distribution(generator);
+
+  std::ostringstream bid_ss;
+  bid_ss << std::hex << bid << extra;
+ std::string id = bid_ss.str();
+
+ // ensure the image id won't overflow the fixed block name size
+ if (id.length() > RBD_MAX_IMAGE_ID_LENGTH) {
+ id = id.substr(id.length() - RBD_MAX_IMAGE_ID_LENGTH);
+ }
+
+ return id;
+}
+
+uint64_t get_rbd_default_features(CephContext* cct)
+{
+ auto value = cct->_conf.get_val<std::string>("rbd_default_features");
+ return librbd::rbd_features_from_string(value, nullptr);
+}
+
+
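+// scans the next chunk (up to sparse_size) of bp: accumulates runs of
+// zero/non-zero data into *write_offset/*write_length and returns true when
+// a pending non-zero extent should be flushed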
+bool calc_sparse_extent(const bufferptr &bp,
+ size_t sparse_size,
+ uint64_t length,
+ size_t *write_offset,
+ size_t *write_length,
+ size_t *offset) {
+ size_t extent_size;
+ if (*offset + sparse_size > length) {
+ extent_size = length - *offset;
+ } else {
+ extent_size = sparse_size;
+ }
+
+ bufferptr extent(bp, *offset, extent_size);
+ *offset += extent_size;
+
+ bool extent_is_zero = extent.is_zero();
+ if (!extent_is_zero) {
+ *write_length += extent_size;
+ }
+ if (extent_is_zero && *write_length == 0) {
+ *write_offset += extent_size;
+ }
+
+ if ((extent_is_zero || *offset == length) && *write_length != 0) {
+ return true;
+ }
+ return false;
+}
+
+bool is_metadata_config_override(const std::string& metadata_key,
+ std::string* config_key) {
+ size_t prefix_len = librbd::ImageCtx::METADATA_CONF_PREFIX.size();
+ if (metadata_key.size() > prefix_len &&
+ metadata_key.compare(0, prefix_len,
+ librbd::ImageCtx::METADATA_CONF_PREFIX) == 0) {
+ *config_key = metadata_key.substr(prefix_len,
+ metadata_key.size() - prefix_len);
+ return true;
+ }
+ return false;
+}
+
+int create_ioctx(librados::IoCtx& src_io_ctx, const std::string& pool_desc,
+ int64_t pool_id,
+ const std::optional<std::string>& pool_namespace,
+ librados::IoCtx* dst_io_ctx) {
+ auto cct = (CephContext *)src_io_ctx.cct();
+
+ librados::Rados rados(src_io_ctx);
+ int r = rados.ioctx_create2(pool_id, *dst_io_ctx);
+ if (r == -ENOENT) {
+ ldout(cct, 1) << pool_desc << " pool " << pool_id << " no longer exists"
+ << dendl;
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "error accessing " << pool_desc << " pool " << pool_id
+ << dendl;
+ return r;
+ }
+
+ dst_io_ctx->set_namespace(
+ pool_namespace ? *pool_namespace : src_io_ctx.get_namespace());
+ if (src_io_ctx.get_pool_full_try()) {
+ dst_io_ctx->set_pool_full_try();
+ }
+ return 0;
+}
+
+int snap_create_flags_api_to_internal(CephContext *cct, uint32_t api_flags,
+ uint64_t *internal_flags) {
+ *internal_flags = 0;
+
+ if (api_flags & RBD_SNAP_CREATE_SKIP_QUIESCE) {
+ *internal_flags |= SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE;
+ api_flags &= ~RBD_SNAP_CREATE_SKIP_QUIESCE;
+ } else if (api_flags & RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR) {
+ *internal_flags |= SNAP_CREATE_FLAG_IGNORE_NOTIFY_QUIESCE_ERROR;
+ api_flags &= ~RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR;
+ }
+
+ if (api_flags != 0) {
+ lderr(cct) << "invalid snap create flags: "
+ << std::bitset<32>(api_flags) << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+uint32_t get_default_snap_create_flags(ImageCtx *ictx) {
+ auto mode = ictx->config.get_val<std::string>(
+ "rbd_default_snapshot_quiesce_mode");
+
+ if (mode == "required") {
+ return 0;
+ } else if (mode == "ignore-error") {
+ return RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR;
+ } else if (mode == "skip") {
+ return RBD_SNAP_CREATE_SKIP_QUIESCE;
+ } else {
+ ceph_abort_msg("invalid rbd_default_snapshot_quiesce_mode");
+ }
+}
+
+SnapContext get_snap_context(
+ const std::optional<
+ std::pair<std::uint64_t,
+ std::vector<std::uint64_t>>>& write_snap_context) {
+ SnapContext snapc;
+ if (write_snap_context) {
+ snapc = SnapContext{write_snap_context->first,
+ {write_snap_context->second.begin(),
+ write_snap_context->second.end()}};
+ }
+ return snapc;
+}
+
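+// process-wide monotonic id generator used to tag async request notifications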
+uint64_t reserve_async_request_id() {
+ static std::atomic<uint64_t> async_request_seq = 0;
+
+ return ++async_request_seq;
+}
+
+bool is_config_key_uri(const std::string& uri) {
+ return boost::starts_with(uri, CONFIG_KEY_URI_PREFIX);
+}
+
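+// resolve a config:// URI against the MON config-key store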
+int get_config_key(librados::Rados& rados, const std::string& uri,
+ std::string* value) {
+ auto cct = reinterpret_cast<CephContext*>(rados.cct());
+
+ if (!is_config_key_uri(uri)) {
+ return -EINVAL;
+ }
+
+ std::string key = uri.substr(CONFIG_KEY_URI_PREFIX.size());
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config-key get\", "
+ "\"key\": \"" + key + "\""
+ "}";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+ int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve MON config key " << key << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ *value = std::string(out_bl.c_str(), out_bl.length());
+ return 0;
+}
+
+} // namespace util
+} // namespace librbd
diff --git a/src/librbd/Utils.h b/src/librbd/Utils.h
new file mode 100644
index 000000000..dee91feee
--- /dev/null
+++ b/src/librbd/Utils.h
@@ -0,0 +1,286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_UTILS_H
+#define CEPH_LIBRBD_UTILS_H
+
+#include "include/rados/librados.hpp"
+#include "include/rbd_types.h"
+#include "include/ceph_assert.h"
+#include "include/Context.h"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+#include "common/RefCountedObj.h"
+
+#include <atomic>
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <vector>
+#include <stdio.h>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace util {
+namespace detail {
+
+template <typename T>
+void rados_callback(rados_completion_t c, void *arg) {
+ reinterpret_cast<T*>(arg)->complete(rados_aio_get_return_value(c));
+}
+
+template <typename T, void(T::*MF)(int)>
+void rados_callback(rados_completion_t c, void *arg) {
+ T *obj = reinterpret_cast<T*>(arg);
+ int r = rados_aio_get_return_value(c);
+ (obj->*MF)(r);
+}
+
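+// invokes a state-machine transition that returns the next completion
+// context (if any), optionally destroying the object when finished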
+template <typename T, Context*(T::*MF)(int*), bool destroy>
+void rados_state_callback(rados_completion_t c, void *arg) {
+ T *obj = reinterpret_cast<T*>(arg);
+ int r = rados_aio_get_return_value(c);
+ Context *on_finish = (obj->*MF)(&r);
+ if (on_finish != nullptr) {
+ on_finish->complete(r);
+ if (destroy) {
+ delete obj;
+ }
+ }
+}
+
+template <typename T, void (T::*MF)(int)>
+class C_CallbackAdapter : public Context {
+ T *obj;
+public:
+ C_CallbackAdapter(T *obj) : obj(obj) {
+ }
+
+protected:
+ void finish(int r) override {
+ (obj->*MF)(r);
+ }
+};
+
+template <typename T, void (T::*MF)(int)>
+class C_RefCallbackAdapter : public Context {
+ RefCountedPtr refptr;
+ Context *on_finish;
+
+public:
+ C_RefCallbackAdapter(T *obj, RefCountedPtr refptr)
+ : refptr(std::move(refptr)),
+ on_finish(new C_CallbackAdapter<T, MF>(obj)) {
+ }
+
+protected:
+ void finish(int r) override {
+ on_finish->complete(r);
+ }
+};
+
+template <typename T, Context*(T::*MF)(int*), bool destroy>
+class C_StateCallbackAdapter : public Context {
+ T *obj;
+public:
+ C_StateCallbackAdapter(T *obj) : obj(obj){
+ }
+
+protected:
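+  // complete() is overridden to run the state transition before Context
+  // deletes this adapter; finish() is intentionally a no-op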
+ void complete(int r) override {
+ Context *on_finish = (obj->*MF)(&r);
+ if (on_finish != nullptr) {
+ on_finish->complete(r);
+ if (destroy) {
+ delete obj;
+ }
+ }
+ Context::complete(r);
+ }
+ void finish(int r) override {
+ }
+};
+
+template <typename T, Context*(T::*MF)(int*)>
+class C_RefStateCallbackAdapter : public Context {
+ RefCountedPtr refptr;
+ Context *on_finish;
+
+public:
+ C_RefStateCallbackAdapter(T *obj, RefCountedPtr refptr)
+ : refptr(std::move(refptr)),
+ on_finish(new C_StateCallbackAdapter<T, MF, true>(obj)) {
+ }
+
+protected:
+ void finish(int r) override {
+ on_finish->complete(r);
+ }
+};
+
+template <typename WQ>
+struct C_AsyncCallback : public Context {
+ WQ *op_work_queue;
+ Context *on_finish;
+
+ C_AsyncCallback(WQ *op_work_queue, Context *on_finish)
+ : op_work_queue(op_work_queue), on_finish(on_finish) {
+ }
+ ~C_AsyncCallback() override {
+ delete on_finish;
+ }
+ void finish(int r) override {
+ op_work_queue->queue(on_finish, r);
+ on_finish = nullptr;
+ }
+};
+
+} // namespace detail
+
+std::string generate_image_id(librados::IoCtx &ioctx);
+
+template <typename T>
+inline std::string generate_image_id(librados::IoCtx &ioctx) {
+ return generate_image_id(ioctx);
+}
+
+const std::string group_header_name(const std::string &group_id);
+const std::string id_obj_name(const std::string &name);
+const std::string header_name(const std::string &image_id);
+const std::string old_header_name(const std::string &image_name);
+std::string unique_lock_name(const std::string &name, void *address);
+
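+// format the name of the RADOS data object backing the given object number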
+template <typename I>
+std::string data_object_name(I* image_ctx, uint64_t object_no) {
+ char buf[RBD_MAX_OBJ_NAME_SIZE];
+ size_t length = snprintf(buf, RBD_MAX_OBJ_NAME_SIZE,
+ image_ctx->format_string, object_no);
+ ceph_assert(length < RBD_MAX_OBJ_NAME_SIZE);
+
+ std::string oid;
+ oid.reserve(RBD_MAX_OBJ_NAME_SIZE);
+ oid.append(buf, length);
+ return oid;
+}
+
+librados::AioCompletion *create_rados_callback(Context *on_finish);
+
+template <typename T>
+librados::AioCompletion *create_rados_callback(T *obj) {
+ return librados::Rados::aio_create_completion(
+ obj, &detail::rados_callback<T>);
+}
+
+template <typename T, void(T::*MF)(int)>
+librados::AioCompletion *create_rados_callback(T *obj) {
+ return librados::Rados::aio_create_completion(
+ obj, &detail::rados_callback<T, MF>);
+}
+
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+librados::AioCompletion *create_rados_callback(T *obj) {
+ return librados::Rados::aio_create_completion(
+ obj, &detail::rados_state_callback<T, MF, destroy>);
+}
+
+template <typename T, void(T::*MF)(int) = &T::complete>
+Context *create_context_callback(T *obj) {
+ return new detail::C_CallbackAdapter<T, MF>(obj);
+}
+
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+Context *create_context_callback(T *obj) {
+ return new detail::C_StateCallbackAdapter<T, MF, destroy>(obj);
+}
+
+//for reference counting objects
+template <typename T, void(T::*MF)(int) = &T::complete>
+Context *create_context_callback(T *obj, RefCountedPtr refptr) {
+ return new detail::C_RefCallbackAdapter<T, MF>(obj, refptr);
+}
+
+template <typename T, Context*(T::*MF)(int*)>
+Context *create_context_callback(T *obj, RefCountedPtr refptr) {
+ return new detail::C_RefStateCallbackAdapter<T, MF>(obj, refptr);
+}
+
+//for objects that don't inherit from RefCountedObj, to handle unit tests
+template <typename T, void(T::*MF)(int) = &T::complete, typename R>
+typename std::enable_if<not std::is_base_of<RefCountedPtr, R>::value, Context*>::type
+create_context_callback(T *obj, R *refptr) {
+ return new detail::C_CallbackAdapter<T, MF>(obj);
+}
+
+template <typename T, Context*(T::*MF)(int*), typename R, bool destroy=true>
+typename std::enable_if<not std::is_base_of<RefCountedPtr, R>::value, Context*>::type
+create_context_callback(T *obj, R *refptr) {
+ return new detail::C_StateCallbackAdapter<T, MF, destroy>(obj);
+}
+
+template <typename I>
+Context *create_async_context_callback(I &image_ctx, Context *on_finish) {
+ // use async callback to acquire a clean lock context
+ return new detail::C_AsyncCallback<
+ typename std::decay<decltype(*image_ctx.op_work_queue)>::type>(
+ image_ctx.op_work_queue, on_finish);
+}
+
+template <typename WQ>
+Context *create_async_context_callback(WQ *work_queue, Context *on_finish) {
+ // use async callback to acquire a clean lock context
+ return new detail::C_AsyncCallback<WQ>(work_queue, on_finish);
+}
+
+// TODO: temporary until AioCompletion supports templated ImageCtx
+inline ImageCtx *get_image_ctx(ImageCtx *image_ctx) {
+ return image_ctx;
+}
+
+uint64_t get_rbd_default_features(CephContext* cct);
+
+bool calc_sparse_extent(const bufferptr &bp,
+ size_t sparse_size,
+ uint64_t length,
+ size_t *write_offset,
+ size_t *write_length,
+ size_t *offset);
+
+template <typename I>
+inline ZTracer::Trace create_trace(const I &image_ctx, const char *trace_name,
+ const ZTracer::Trace &parent_trace) {
+ if (parent_trace.valid()) {
+ return ZTracer::Trace(trace_name, &image_ctx.trace_endpoint, &parent_trace);
+ }
+ return ZTracer::Trace();
+}
+
+bool is_metadata_config_override(const std::string& metadata_key,
+ std::string* config_key);
+
+int create_ioctx(librados::IoCtx& src_io_ctx, const std::string& pool_desc,
+ int64_t pool_id,
+ const std::optional<std::string>& pool_namespace,
+ librados::IoCtx* dst_io_ctx);
+
+int snap_create_flags_api_to_internal(CephContext *cct, uint32_t api_flags,
+ uint64_t *internal_flags);
+
+uint32_t get_default_snap_create_flags(ImageCtx *ictx);
+
+SnapContext get_snap_context(
+ const std::optional<
+ std::pair<std::uint64_t,
+ std::vector<std::uint64_t>>>& write_snap_context);
+
+uint64_t reserve_async_request_id();
+
+bool is_config_key_uri(const std::string& uri);
+int get_config_key(librados::Rados& rados, const std::string& uri,
+ std::string* value);
+
+} // namespace util
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_UTILS_H
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
new file mode 100644
index 000000000..413983f3e
--- /dev/null
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -0,0 +1,557 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "librbd/WatchNotifyTypes.h"
+
+namespace librbd {
+namespace watch_notify {
+
+void AsyncRequestId::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(client_id, bl);
+ encode(request_id, bl);
+}
+
+void AsyncRequestId::decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(client_id, iter);
+ decode(request_id, iter);
+}
+
+void AsyncRequestId::dump(Formatter *f) const {
+ f->open_object_section("client_id");
+ client_id.dump(f);
+ f->close_section();
+ f->dump_unsigned("request_id", request_id);
+}
+
+void AcquiredLockPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(client_id, bl);
+}
+
+void AcquiredLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ if (version >= 2) {
+ decode(client_id, iter);
+ }
+}
+
+void AcquiredLockPayload::dump(Formatter *f) const {
+ f->open_object_section("client_id");
+ client_id.dump(f);
+ f->close_section();
+}
+
+void ReleasedLockPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(client_id, bl);
+}
+
+void ReleasedLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ if (version >= 2) {
+ decode(client_id, iter);
+ }
+}
+
+void ReleasedLockPayload::dump(Formatter *f) const {
+ f->open_object_section("client_id");
+ client_id.dump(f);
+ f->close_section();
+}
+
+void RequestLockPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(client_id, bl);
+ encode(force, bl);
+}
+
+void RequestLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ if (version >= 2) {
+ decode(client_id, iter);
+ }
+ if (version >= 3) {
+ decode(force, iter);
+ }
+}
+
+void RequestLockPayload::dump(Formatter *f) const {
+ f->open_object_section("client_id");
+ client_id.dump(f);
+ f->close_section();
+ f->dump_bool("force", force);
+}
+
+void HeaderUpdatePayload::encode(bufferlist &bl) const {
+}
+
+void HeaderUpdatePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void HeaderUpdatePayload::dump(Formatter *f) const {
+}
+
+void AsyncRequestPayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(async_request_id, bl);
+}
+
+void AsyncRequestPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(async_request_id, iter);
+}
+
+void AsyncRequestPayloadBase::dump(Formatter *f) const {
+ f->open_object_section("async_request_id");
+ async_request_id.dump(f);
+ f->close_section();
+}
+
+void AsyncProgressPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ AsyncRequestPayloadBase::encode(bl);
+ encode(offset, bl);
+ encode(total, bl);
+}
+
+void AsyncProgressPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ AsyncRequestPayloadBase::decode(version, iter);
+ decode(offset, iter);
+ decode(total, iter);
+}
+
+void AsyncProgressPayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("total", total);
+}
+
+void AsyncCompletePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ AsyncRequestPayloadBase::encode(bl);
+ encode(result, bl);
+}
+
+void AsyncCompletePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ AsyncRequestPayloadBase::decode(version, iter);
+ decode(result, iter);
+}
+
+void AsyncCompletePayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_int("result", result);
+}
+
+void ResizePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(size, bl);
+ AsyncRequestPayloadBase::encode(bl);
+ encode(allow_shrink, bl);
+}
+
+void ResizePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(size, iter);
+ AsyncRequestPayloadBase::decode(version, iter);
+
+ if (version >= 4) {
+ decode(allow_shrink, iter);
+ }
+}
+
+void ResizePayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_unsigned("size", size);
+ f->dump_bool("allow_shrink", allow_shrink);
+}
+
+void SnapPayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(snap_name, bl);
+ encode(snap_namespace, bl);
+ encode(async_request_id, bl);
+}
+
+void SnapPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(snap_name, iter);
+ if (version >= 6) {
+ decode(snap_namespace, iter);
+ }
+ if (version >= 7) {
+ decode(async_request_id, iter);
+ }
+}
+
+void SnapPayloadBase::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_string("snap_name", snap_name);
+ snap_namespace.dump(f);
+}
+
+void SnapCreatePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ SnapPayloadBase::encode(bl);
+ encode(flags, bl);
+}
+
+void SnapCreatePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ SnapPayloadBase::decode(version, iter);
+ if (version == 5) {
+ decode(snap_namespace, iter);
+ }
+ if (version >= 7) {
+ decode(flags, iter);
+ }
+}
+
+void SnapCreatePayload::dump(Formatter *f) const {
+ SnapPayloadBase::dump(f);
+ f->dump_unsigned("flags", flags);
+}
+
+void SnapRenamePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(snap_id, bl);
+ SnapPayloadBase::encode(bl);
+}
+
+void SnapRenamePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(snap_id, iter);
+ SnapPayloadBase::decode(version, iter);
+}
+
+void SnapRenamePayload::dump(Formatter *f) const {
+ SnapPayloadBase::dump(f);
+ f->dump_unsigned("src_snap_id", snap_id);
+}
+
+void RenamePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(image_name, bl);
+ encode(async_request_id, bl);
+}
+
+void RenamePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(image_name, iter);
+ if (version >= 7) {
+ decode(async_request_id, iter);
+ }
+}
+
+void RenamePayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_string("image_name", image_name);
+}
+
+void UpdateFeaturesPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(features, bl);
+ encode(enabled, bl);
+ encode(async_request_id, bl);
+}
+
+void UpdateFeaturesPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(features, iter);
+ decode(enabled, iter);
+ if (version >= 7) {
+ decode(async_request_id, iter);
+ }
+}
+
+void UpdateFeaturesPayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_unsigned("features", features);
+ f->dump_bool("enabled", enabled);
+}
+
+void SparsifyPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ AsyncRequestPayloadBase::encode(bl);
+ encode(sparse_size, bl);
+}
+
+void SparsifyPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ AsyncRequestPayloadBase::decode(version, iter);
+ decode(sparse_size, iter);
+}
+
+void SparsifyPayload::dump(Formatter *f) const {
+ AsyncRequestPayloadBase::dump(f);
+ f->dump_unsigned("sparse_size", sparse_size);
+}
+
+void MetadataUpdatePayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(key, bl);
+ encode(value, bl);
+ encode(async_request_id, bl);
+}
+
+void MetadataUpdatePayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(key, iter);
+ decode(value, iter);
+ if (version >= 7) {
+ decode(async_request_id, iter);
+ }
+}
+
+void MetadataUpdatePayload::dump(Formatter *f) const {
+  AsyncRequestPayloadBase::dump(f);
+  f->dump_string("key", key);
+  if (value) {
+    // value may be unset (e.g. for metadata removal notifications)
+    f->dump_string("value", *value);
+  }
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+bool NotifyMessage::check_for_refresh() const {
+ return payload->check_for_refresh();
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(7, 1, bl);
+ encode(static_cast<uint32_t>(payload->get_notify_op()), bl);
+ payload->encode(bl);
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(1, iter);
+
+ uint32_t notify_op;
+ decode(notify_op, iter);
+
+ // select the correct payload variant based upon the encoded op
+ switch (notify_op) {
+ case NOTIFY_OP_ACQUIRED_LOCK:
+ payload.reset(new AcquiredLockPayload());
+ break;
+ case NOTIFY_OP_RELEASED_LOCK:
+ payload.reset(new ReleasedLockPayload());
+ break;
+ case NOTIFY_OP_REQUEST_LOCK:
+ payload.reset(new RequestLockPayload());
+ break;
+ case NOTIFY_OP_HEADER_UPDATE:
+ payload.reset(new HeaderUpdatePayload());
+ break;
+ case NOTIFY_OP_ASYNC_PROGRESS:
+ payload.reset(new AsyncProgressPayload());
+ break;
+ case NOTIFY_OP_ASYNC_COMPLETE:
+ payload.reset(new AsyncCompletePayload());
+ break;
+ case NOTIFY_OP_FLATTEN:
+ payload.reset(new FlattenPayload());
+ break;
+ case NOTIFY_OP_RESIZE:
+ payload.reset(new ResizePayload());
+ break;
+ case NOTIFY_OP_SNAP_CREATE:
+ payload.reset(new SnapCreatePayload());
+ break;
+ case NOTIFY_OP_SNAP_REMOVE:
+ payload.reset(new SnapRemovePayload());
+ break;
+ case NOTIFY_OP_SNAP_RENAME:
+ payload.reset(new SnapRenamePayload());
+ break;
+ case NOTIFY_OP_SNAP_PROTECT:
+ payload.reset(new SnapProtectPayload());
+ break;
+ case NOTIFY_OP_SNAP_UNPROTECT:
+ payload.reset(new SnapUnprotectPayload());
+ break;
+ case NOTIFY_OP_REBUILD_OBJECT_MAP:
+ payload.reset(new RebuildObjectMapPayload());
+ break;
+ case NOTIFY_OP_RENAME:
+ payload.reset(new RenamePayload());
+ break;
+ case NOTIFY_OP_UPDATE_FEATURES:
+ payload.reset(new UpdateFeaturesPayload());
+ break;
+ case NOTIFY_OP_MIGRATE:
+ payload.reset(new MigratePayload());
+ break;
+ case NOTIFY_OP_SPARSIFY:
+ payload.reset(new SparsifyPayload());
+ break;
+ case NOTIFY_OP_QUIESCE:
+ payload.reset(new QuiescePayload());
+ break;
+ case NOTIFY_OP_UNQUIESCE:
+ payload.reset(new UnquiescePayload());
+ break;
+  case NOTIFY_OP_METADATA_UPDATE:
+    payload.reset(new MetadataUpdatePayload());
+    break;
+  default:
+    payload.reset(new UnknownPayload());
+    break;
+  }
+
+ payload->decode(struct_v, iter);
+ DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+ payload->dump(f);
+}
+
+NotifyOp NotifyMessage::get_notify_op() const {
+ return payload->get_notify_op();
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage(new AcquiredLockPayload(ClientId(1, 2))));
+ o.push_back(new NotifyMessage(new ReleasedLockPayload(ClientId(1, 2))));
+ o.push_back(new NotifyMessage(new RequestLockPayload(ClientId(1, 2), true)));
+ o.push_back(new NotifyMessage(new HeaderUpdatePayload()));
+ o.push_back(new NotifyMessage(new AsyncProgressPayload(AsyncRequestId(ClientId(0, 1), 2), 3, 4)));
+ o.push_back(new NotifyMessage(new AsyncCompletePayload(AsyncRequestId(ClientId(0, 1), 2), 3)));
+ o.push_back(new NotifyMessage(new FlattenPayload(AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(new ResizePayload(AsyncRequestId(ClientId(0, 1), 2), 123, true)));
+ o.push_back(new NotifyMessage(new SnapCreatePayload(AsyncRequestId(ClientId(0, 1), 2),
+ cls::rbd::UserSnapshotNamespace(),
+ "foo", 1)));
+ o.push_back(new NotifyMessage(new SnapRemovePayload(AsyncRequestId(ClientId(0, 1), 2),
+ cls::rbd::UserSnapshotNamespace(), "foo")));
+ o.push_back(new NotifyMessage(new SnapProtectPayload(AsyncRequestId(ClientId(0, 1), 2),
+ cls::rbd::UserSnapshotNamespace(), "foo")));
+ o.push_back(new NotifyMessage(new SnapUnprotectPayload(AsyncRequestId(ClientId(0, 1), 2),
+ cls::rbd::UserSnapshotNamespace(), "foo")));
+ o.push_back(new NotifyMessage(new RebuildObjectMapPayload(AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(new RenamePayload(AsyncRequestId(ClientId(0, 1), 2), "foo")));
+ o.push_back(new NotifyMessage(new UpdateFeaturesPayload(AsyncRequestId(ClientId(0, 1), 2),
+ 1, true)));
+ o.push_back(new NotifyMessage(new MigratePayload(AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(new SparsifyPayload(AsyncRequestId(ClientId(0, 1), 2), 1)));
+ o.push_back(new NotifyMessage(new QuiescePayload(AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(new UnquiescePayload(AsyncRequestId(ClientId(0, 1), 2))));
+ o.push_back(new NotifyMessage(new MetadataUpdatePayload(AsyncRequestId(ClientId(0, 1), 2),
+ "foo", std::optional<std::string>{"xyz"})));
+}
+
+void ResponseMessage::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(result, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ResponseMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(1, iter);
+ decode(result, iter);
+ DECODE_FINISH(iter);
+}
+
+void ResponseMessage::dump(Formatter *f) const {
+ f->dump_int("result", result);
+}
+
+void ResponseMessage::generate_test_instances(std::list<ResponseMessage *> &o) {
+ o.push_back(new ResponseMessage(1));
+}
+
+std::ostream &operator<<(std::ostream &out,
+ const librbd::watch_notify::NotifyOp &op) {
+ using namespace librbd::watch_notify;
+
+ switch (op) {
+ case NOTIFY_OP_ACQUIRED_LOCK:
+ out << "AcquiredLock";
+ break;
+ case NOTIFY_OP_RELEASED_LOCK:
+ out << "ReleasedLock";
+ break;
+ case NOTIFY_OP_REQUEST_LOCK:
+ out << "RequestLock";
+ break;
+ case NOTIFY_OP_HEADER_UPDATE:
+ out << "HeaderUpdate";
+ break;
+ case NOTIFY_OP_ASYNC_PROGRESS:
+ out << "AsyncProgress";
+ break;
+ case NOTIFY_OP_ASYNC_COMPLETE:
+ out << "AsyncComplete";
+ break;
+ case NOTIFY_OP_FLATTEN:
+ out << "Flatten";
+ break;
+ case NOTIFY_OP_RESIZE:
+ out << "Resize";
+ break;
+ case NOTIFY_OP_SNAP_CREATE:
+ out << "SnapCreate";
+ break;
+ case NOTIFY_OP_SNAP_REMOVE:
+ out << "SnapRemove";
+ break;
+ case NOTIFY_OP_SNAP_RENAME:
+ out << "SnapRename";
+ break;
+ case NOTIFY_OP_SNAP_PROTECT:
+ out << "SnapProtect";
+ break;
+ case NOTIFY_OP_SNAP_UNPROTECT:
+ out << "SnapUnprotect";
+ break;
+ case NOTIFY_OP_REBUILD_OBJECT_MAP:
+ out << "RebuildObjectMap";
+ break;
+ case NOTIFY_OP_RENAME:
+ out << "Rename";
+ break;
+ case NOTIFY_OP_UPDATE_FEATURES:
+ out << "UpdateFeatures";
+ break;
+ case NOTIFY_OP_MIGRATE:
+ out << "Migrate";
+ break;
+ case NOTIFY_OP_SPARSIFY:
+ out << "Sparsify";
+ break;
+ case NOTIFY_OP_QUIESCE:
+ out << "Quiesce";
+ break;
+ case NOTIFY_OP_UNQUIESCE:
+ out << "Unquiesce";
+ break;
+ case NOTIFY_OP_METADATA_UPDATE:
+ out << "MetadataUpdate";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+ break;
+ }
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out,
+ const librbd::watch_notify::AsyncRequestId &request) {
+ out << "[" << request.client_id.gid << "," << request.client_id.handle << ","
+ << request.request_id << "]";
+ return out;
+}
+} // namespace watch_notify
+} // namespace librbd
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
new file mode 100644
index 000000000..ca0b40f28
--- /dev/null
+++ b/src/librbd/WatchNotifyTypes.h
@@ -0,0 +1,532 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef LIBRBD_WATCH_NOTIFY_TYPES_H
+#define LIBRBD_WATCH_NOTIFY_TYPES_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "librbd/watcher/Types.h"
+#include <iosfwd>
+#include <list>
+#include <memory>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace ceph {
+class Formatter;
+}
+
+namespace librbd {
+namespace watch_notify {
+
+using librbd::watcher::ClientId;
+
+WRITE_CLASS_ENCODER(ClientId);
+
+struct AsyncRequestId {
+ ClientId client_id;
+ uint64_t request_id;
+
+ AsyncRequestId() : request_id() {}
+ AsyncRequestId(const ClientId &client_id_, uint64_t request_id_)
+ : client_id(client_id_), request_id(request_id_) {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ inline bool operator<(const AsyncRequestId &rhs) const {
+ if (client_id != rhs.client_id) {
+ return client_id < rhs.client_id;
+ } else {
+ return request_id < rhs.request_id;
+ }
+ }
+ inline bool operator!=(const AsyncRequestId &rhs) const {
+ return (client_id != rhs.client_id || request_id != rhs.request_id);
+ }
+ inline operator bool() const {
+ return (*this != AsyncRequestId());
+ }
+};
+
+enum NotifyOp {
+ NOTIFY_OP_ACQUIRED_LOCK = 0,
+ NOTIFY_OP_RELEASED_LOCK = 1,
+ NOTIFY_OP_REQUEST_LOCK = 2,
+ NOTIFY_OP_HEADER_UPDATE = 3,
+ NOTIFY_OP_ASYNC_PROGRESS = 4,
+ NOTIFY_OP_ASYNC_COMPLETE = 5,
+ NOTIFY_OP_FLATTEN = 6,
+ NOTIFY_OP_RESIZE = 7,
+ NOTIFY_OP_SNAP_CREATE = 8,
+ NOTIFY_OP_SNAP_REMOVE = 9,
+ NOTIFY_OP_REBUILD_OBJECT_MAP = 10,
+ NOTIFY_OP_SNAP_RENAME = 11,
+ NOTIFY_OP_SNAP_PROTECT = 12,
+ NOTIFY_OP_SNAP_UNPROTECT = 13,
+ NOTIFY_OP_RENAME = 14,
+ NOTIFY_OP_UPDATE_FEATURES = 15,
+ NOTIFY_OP_MIGRATE = 16,
+ NOTIFY_OP_SPARSIFY = 17,
+ NOTIFY_OP_QUIESCE = 18,
+ NOTIFY_OP_UNQUIESCE = 19,
+ NOTIFY_OP_METADATA_UPDATE = 20,
+};
+
+struct Payload {
+ virtual ~Payload() {}
+
+ virtual NotifyOp get_notify_op() const = 0;
+ virtual bool check_for_refresh() const = 0;
+
+ virtual void encode(bufferlist &bl) const = 0;
+ virtual void decode(__u8 version, bufferlist::const_iterator &iter) = 0;
+ virtual void dump(Formatter *f) const = 0;
+};
+
+struct AcquiredLockPayload : public Payload {
+ ClientId client_id;
+
+ AcquiredLockPayload() {}
+ AcquiredLockPayload(const ClientId &client_id) : client_id(client_id) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_ACQUIRED_LOCK;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct ReleasedLockPayload : public Payload {
+ ClientId client_id;
+
+ ReleasedLockPayload() {}
+ ReleasedLockPayload(const ClientId &client_id) : client_id(client_id) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_RELEASED_LOCK;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct RequestLockPayload : public Payload {
+ ClientId client_id;
+ bool force = false;
+
+ RequestLockPayload() {}
+ RequestLockPayload(const ClientId &client_id, bool force)
+ : client_id(client_id), force(force) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_REQUEST_LOCK;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct HeaderUpdatePayload : public Payload {
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_HEADER_UPDATE;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct AsyncRequestPayloadBase : public Payload {
+public:
+ AsyncRequestId async_request_id;
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+
+protected:
+ AsyncRequestPayloadBase() {}
+ AsyncRequestPayloadBase(const AsyncRequestId &id) : async_request_id(id) {}
+};
+
+struct AsyncProgressPayload : public AsyncRequestPayloadBase {
+ uint64_t offset = 0;
+ uint64_t total = 0;
+
+ AsyncProgressPayload() {}
+ AsyncProgressPayload(const AsyncRequestId &id, uint64_t offset, uint64_t total)
+ : AsyncRequestPayloadBase(id), offset(offset), total(total) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_ASYNC_PROGRESS;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct AsyncCompletePayload : public AsyncRequestPayloadBase {
+ int result = 0;
+
+ AsyncCompletePayload() {}
+ AsyncCompletePayload(const AsyncRequestId &id, int r)
+ : AsyncRequestPayloadBase(id), result(r) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_ASYNC_COMPLETE;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct FlattenPayload : public AsyncRequestPayloadBase {
+ FlattenPayload() {}
+ FlattenPayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_FLATTEN;
+ }
+ bool check_for_refresh() const override {
+ return true;
+ }
+};
+
+struct ResizePayload : public AsyncRequestPayloadBase {
+ uint64_t size = 0;
+ bool allow_shrink = true;
+
+ ResizePayload() {}
+ ResizePayload(const AsyncRequestId &id, uint64_t size, bool allow_shrink)
+ : AsyncRequestPayloadBase(id), size(size), allow_shrink(allow_shrink) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_RESIZE;
+ }
+ bool check_for_refresh() const override {
+ return true;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct SnapPayloadBase : public AsyncRequestPayloadBase {
+public:
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string snap_name;
+
+ bool check_for_refresh() const override {
+ return true;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+
+protected:
+ SnapPayloadBase() {}
+ SnapPayloadBase(const AsyncRequestId &id,
+ const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &name)
+ : AsyncRequestPayloadBase(id), snap_namespace(snap_namespace),
+ snap_name(name) {
+ }
+};
+
+struct SnapCreatePayload : public SnapPayloadBase {
+ uint64_t flags = 0;
+
+ SnapCreatePayload() {}
+ SnapCreatePayload(const AsyncRequestId &id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &name, uint64_t flags)
+ : SnapPayloadBase(id, snap_namespace, name), flags(flags) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_SNAP_CREATE;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct SnapRenamePayload : public SnapPayloadBase {
+ uint64_t snap_id = 0;
+
+ SnapRenamePayload() {}
+ SnapRenamePayload(const AsyncRequestId &id,
+ const uint64_t &src_snap_id,
+ const std::string &dst_name)
+ : SnapPayloadBase(id, cls::rbd::UserSnapshotNamespace(), dst_name),
+ snap_id(src_snap_id) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_SNAP_RENAME;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct SnapRemovePayload : public SnapPayloadBase {
+ SnapRemovePayload() {}
+ SnapRemovePayload(const AsyncRequestId &id,
+ const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &name)
+ : SnapPayloadBase(id, snap_namespace, name) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_SNAP_REMOVE;
+ }
+};
+
+struct SnapProtectPayload : public SnapPayloadBase {
+ SnapProtectPayload() {}
+ SnapProtectPayload(const AsyncRequestId &id,
+ const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &name)
+ : SnapPayloadBase(id, snap_namespace, name) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_SNAP_PROTECT;
+ }
+};
+
+struct SnapUnprotectPayload : public SnapPayloadBase {
+ SnapUnprotectPayload() {}
+ SnapUnprotectPayload(const AsyncRequestId &id,
+ const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &name)
+ : SnapPayloadBase(id, snap_namespace, name) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_SNAP_UNPROTECT;
+ }
+};
+
+struct RebuildObjectMapPayload : public AsyncRequestPayloadBase {
+ RebuildObjectMapPayload() {}
+ RebuildObjectMapPayload(const AsyncRequestId &id)
+ : AsyncRequestPayloadBase(id) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_REBUILD_OBJECT_MAP;
+ }
+ bool check_for_refresh() const override {
+ return true;
+ }
+};
+
+struct RenamePayload : public AsyncRequestPayloadBase {
+ std::string image_name;
+
+ RenamePayload() {}
+ RenamePayload(const AsyncRequestId &id, const std::string& _image_name)
+ : AsyncRequestPayloadBase(id), image_name(_image_name) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_RENAME;
+ }
+ bool check_for_refresh() const override {
+ return true;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct UpdateFeaturesPayload : public AsyncRequestPayloadBase {
+ uint64_t features = 0;
+ bool enabled = false;
+
+ UpdateFeaturesPayload() {}
+ UpdateFeaturesPayload(const AsyncRequestId &id, uint64_t features,
+ bool enabled)
+ : AsyncRequestPayloadBase(id), features(features), enabled(enabled) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_UPDATE_FEATURES;
+ }
+ bool check_for_refresh() const override {
+ return true;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct MigratePayload : public AsyncRequestPayloadBase {
+ MigratePayload() {}
+ MigratePayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_MIGRATE;
+ }
+ bool check_for_refresh() const override {
+ return true;
+ }
+};
+
+struct SparsifyPayload : public AsyncRequestPayloadBase {
+ size_t sparse_size = 0;
+
+ SparsifyPayload() {}
+ SparsifyPayload(const AsyncRequestId &id, size_t sparse_size)
+ : AsyncRequestPayloadBase(id), sparse_size(sparse_size) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_SPARSIFY;
+ }
+ bool check_for_refresh() const override {
+ return true;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct QuiescePayload : public AsyncRequestPayloadBase {
+ QuiescePayload() {}
+ QuiescePayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_QUIESCE;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+};
+
+struct UnquiescePayload : public AsyncRequestPayloadBase {
+ UnquiescePayload() {}
+ UnquiescePayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_UNQUIESCE;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+};
+
+struct MetadataUpdatePayload : public AsyncRequestPayloadBase {
+ std::string key;
+ std::optional<std::string> value;
+ MetadataUpdatePayload() {}
+ MetadataUpdatePayload(const AsyncRequestId &id, std::string key,
+ std::optional<std::string> value)
+ : AsyncRequestPayloadBase(id), key(key), value(value) {
+ }
+
+ NotifyOp get_notify_op() const override {
+ return NOTIFY_OP_METADATA_UPDATE;
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct UnknownPayload : public Payload {
+ NotifyOp get_notify_op() const override {
+ return static_cast<NotifyOp>(-1);
+ }
+ bool check_for_refresh() const override {
+ return false;
+ }
+
+ void encode(bufferlist &bl) const override;
+ void decode(__u8 version, bufferlist::const_iterator &iter) override;
+ void dump(Formatter *f) const override;
+};
+
+struct NotifyMessage {
+ NotifyMessage() : payload(new UnknownPayload()) {}
+ NotifyMessage(Payload *payload) : payload(payload) {}
+
+ std::unique_ptr<Payload> payload;
+
+ bool check_for_refresh() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+ NotifyOp get_notify_op() const;
+
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
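+
+// minimal round-trip sketch (illustrative; relies on the encode/decode
+// helpers generated by WRITE_CLASS_ENCODER below):
+//
+// bufferlist bl;
+// encode(NotifyMessage(new HeaderUpdatePayload()), bl);
+// NotifyMessage msg;
+// auto it = bl.cbegin();
+// decode(msg, it);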
+
+struct ResponseMessage {
+ ResponseMessage() : result(0) {}
+ ResponseMessage(int result_) : result(result_) {}
+
+ int result;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<ResponseMessage *> &o);
+};
+
+std::ostream &operator<<(std::ostream &out,
+ const NotifyOp &op);
+std::ostream &operator<<(std::ostream &out,
+ const AsyncRequestId &request);
+
+WRITE_CLASS_ENCODER(AsyncRequestId);
+WRITE_CLASS_ENCODER(NotifyMessage);
+WRITE_CLASS_ENCODER(ResponseMessage);
+
+} // namespace watch_notify
+} // namespace librbd
+
+
+#endif // LIBRBD_WATCH_NOTIFY_TYPES_H
diff --git a/src/librbd/Watcher.cc b/src/librbd/Watcher.cc
new file mode 100644
index 000000000..c215d6df7
--- /dev/null
+++ b/src/librbd/Watcher.cc
@@ -0,0 +1,370 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/Watcher.h"
+#include "librbd/watcher/RewatchRequest.h"
+#include "librbd/Utils.h"
+#include "librbd/TaskFinisher.h"
+#include "librbd/asio/ContextWQ.h"
+#include "include/encoding.h"
+#include "common/errno.h"
+#include <boost/bind/bind.hpp>
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+
+using namespace boost::placeholders;
+
+using namespace watcher;
+
+using util::create_context_callback;
+using util::create_rados_callback;
+using std::string;
+
+namespace {
+
+struct C_UnwatchAndFlush : public Context {
+ librados::Rados rados;
+ Context *on_finish;
+ bool flushing = false;
+ int ret_val = 0;
+
+ C_UnwatchAndFlush(librados::IoCtx &io_ctx, Context *on_finish)
+ : rados(io_ctx), on_finish(on_finish) {
+ }
+
+ void complete(int r) override {
+ if (ret_val == 0 && r < 0) {
+ ret_val = r;
+ }
+
+ if (!flushing) {
+ flushing = true;
+
+ librados::AioCompletion *aio_comp = create_rados_callback(this);
+ r = rados.aio_watch_flush(aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+ return;
+ }
+
+ // ensure our reference to the RadosClient is released prior
+ // to completing the callback to avoid racing an explicit
+ // librados shutdown
+ Context *ctx = on_finish;
+ r = ret_val;
+ delete this;
+
+ ctx->complete(r);
+ }
+
+ void finish(int r) override {
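+ // never called: complete() is fully overridden to run the two-phase
+ // unwatch-then-flush sequence and manages its own deletion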
+ }
+};
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Watcher::C_NotifyAck " << this << " " \
+ << __func__ << ": "
+
+Watcher::C_NotifyAck::C_NotifyAck(Watcher *watcher, uint64_t notify_id,
+ uint64_t handle)
+ : watcher(watcher), cct(watcher->m_cct), notify_id(notify_id),
+ handle(handle) {
+ ldout(cct, 10) << "id=" << notify_id << ", " << "handle=" << handle << dendl;
+}
+
+void Watcher::C_NotifyAck::finish(int r) {
+ ldout(cct, 10) << "r=" << r << dendl;
+ ceph_assert(r == 0);
+ watcher->acknowledge_notify(notify_id, handle, out);
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Watcher: " << this << " " << __func__ \
+ << ": "
+
+Watcher::Watcher(librados::IoCtx& ioctx, asio::ContextWQ *work_queue,
+ const string& oid)
+ : m_ioctx(ioctx), m_work_queue(work_queue), m_oid(oid),
+ m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
+ m_watch_lock(ceph::make_shared_mutex(
+ util::unique_lock_name("librbd::Watcher::m_watch_lock", this))),
+ m_watch_handle(0), m_notifier(work_queue, ioctx, oid),
+ m_watch_state(WATCH_STATE_IDLE), m_watch_ctx(*this) {
+}
+
+Watcher::~Watcher() {
+ std::shared_lock l{m_watch_lock};
+ ceph_assert(is_unregistered(m_watch_lock));
+}
+
+void Watcher::register_watch(Context *on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ std::unique_lock watch_locker{m_watch_lock};
+ ceph_assert(is_unregistered(m_watch_lock));
+ m_watch_state = WATCH_STATE_REGISTERING;
+ m_watch_blocklisted = false;
+
+ librados::AioCompletion *aio_comp = create_rados_callback(
+ new C_RegisterWatch(this, on_finish));
+ int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_watch_handle, &m_watch_ctx);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void Watcher::handle_register_watch(int r, Context *on_finish) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::unique_lock watch_locker{m_watch_lock};
+ ceph_assert(m_watch_state == WATCH_STATE_REGISTERING);
+
+ m_watch_state = WATCH_STATE_IDLE;
+ if (r < 0) {
+ lderr(m_cct) << "failed to register watch: " << cpp_strerror(r)
+ << dendl;
+ m_watch_handle = 0;
+ }
+
+ if (m_unregister_watch_ctx != nullptr) {
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == 0 && m_watch_error) {
+ lderr(m_cct) << "re-registering watch after error" << dendl;
+ m_watch_state = WATCH_STATE_REWATCHING;
+ watch_error = true;
+ } else {
+ m_watch_blocklisted = (r == -EBLOCKLISTED);
+ }
+ }
+
+ on_finish->complete(r);
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
+ }
+}
+
+void Watcher::unregister_watch(Context *on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ {
+ std::unique_lock watch_locker{m_watch_lock};
+ if (m_watch_state != WATCH_STATE_IDLE) {
+ ldout(m_cct, 10) << "delaying unregister until register completed"
+ << dendl;
+
+ ceph_assert(m_unregister_watch_ctx == nullptr);
+ m_unregister_watch_ctx = new LambdaContext([this, on_finish](int r) {
+ unregister_watch(on_finish);
+ });
+ return;
+ } else if (is_registered(m_watch_lock)) {
+ librados::AioCompletion *aio_comp = create_rados_callback(
+ new C_UnwatchAndFlush(m_ioctx, on_finish));
+ int r = m_ioctx.aio_unwatch(m_watch_handle, aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+
+ m_watch_handle = 0;
+ m_watch_blocklisted = false;
+ return;
+ }
+ }
+
+ on_finish->complete(0);
+}
+
+bool Watcher::notifications_blocked() const {
+ std::shared_lock locker{m_watch_lock};
+
+ bool blocked = (m_blocked_count > 0);
+ ldout(m_cct, 5) << "blocked=" << blocked << dendl;
+ return blocked;
+}
+
+void Watcher::block_notifies(Context *on_finish) {
+ {
+ std::unique_lock locker{m_watch_lock};
+ ++m_blocked_count;
+ ldout(m_cct, 5) << "blocked_count=" << m_blocked_count << dendl;
+ }
+ m_async_op_tracker.wait_for_ops(on_finish);
+}
+
+void Watcher::unblock_notifies() {
+ std::unique_lock locker{m_watch_lock};
+ ceph_assert(m_blocked_count > 0);
+ --m_blocked_count;
+ ldout(m_cct, 5) << "blocked_count=" << m_blocked_count << dendl;
+}
+
+void Watcher::flush(Context *on_finish) {
+ m_notifier.flush(on_finish);
+}
+
+std::string Watcher::get_oid() const {
+ std::shared_lock locker{m_watch_lock};
+ return m_oid;
+}
+
+void Watcher::set_oid(const string& oid) {
+ std::unique_lock watch_locker{m_watch_lock};
+ ceph_assert(is_unregistered(m_watch_lock));
+
+ m_oid = oid;
+}
+
+void Watcher::handle_error(uint64_t handle, int err) {
+ lderr(m_cct) << "handle=" << handle << ": " << cpp_strerror(err) << dendl;
+
+ std::unique_lock watch_locker{m_watch_lock};
+ m_watch_error = true;
+
+ if (is_registered(m_watch_lock)) {
+ m_watch_state = WATCH_STATE_REWATCHING;
+ if (err == -EBLOCKLISTED) {
+ m_watch_blocklisted = true;
+ }
+
+ auto ctx = new LambdaContext(
+ boost::bind(&Watcher::rewatch, this));
+ m_work_queue->queue(ctx);
+ }
+}
+
+void Watcher::acknowledge_notify(uint64_t notify_id, uint64_t handle,
+ bufferlist &out) {
+ m_ioctx.notify_ack(m_oid, notify_id, handle, out);
+}
+
+void Watcher::rewatch() {
+ ldout(m_cct, 10) << dendl;
+
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::unique_lock watch_locker{m_watch_lock};
+ ceph_assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) {
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else {
+ m_watch_error = false;
+ auto ctx = create_context_callback<
+ Watcher, &Watcher::handle_rewatch>(this);
+ auto req = RewatchRequest::create(m_ioctx, m_oid, m_watch_lock,
+ &m_watch_ctx, &m_watch_handle, ctx);
+ req->send();
+ return;
+ }
+ }
+
+ unregister_watch_ctx->complete(0);
+}
+
+void Watcher::handle_rewatch(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::unique_lock watch_locker{m_watch_lock};
+ ceph_assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+ m_watch_blocklisted = false;
+ if (m_unregister_watch_ctx != nullptr) {
+ ldout(m_cct, 10) << "image is closing, skip rewatch" << dendl;
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLOCKLISTED) {
+ lderr(m_cct) << "client blocklisted" << dendl;
+ m_watch_blocklisted = true;
+ } else if (r == -ENOENT) {
+ ldout(m_cct, 5) << "object does not exist" << dendl;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to rewatch: " << cpp_strerror(r) << dendl;
+ watch_error = true;
+ } else if (m_watch_error) {
+ lderr(m_cct) << "re-registering watch after error" << dendl;
+ watch_error = true;
+ }
+ }
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ return;
+ } else if (watch_error) {
+ rewatch();
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ Watcher, &Watcher::handle_rewatch_callback>(this);
+ m_work_queue->queue(ctx, r);
+}
+
+void Watcher::handle_rewatch_callback(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+ handle_rewatch_complete(r);
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::unique_lock watch_locker{m_watch_lock};
+ ceph_assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) {
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLOCKLISTED || r == -ENOENT) {
+ m_watch_state = WATCH_STATE_IDLE;
+ } else if (r < 0 || m_watch_error) {
+ watch_error = true;
+ } else {
+ m_watch_state = WATCH_STATE_IDLE;
+ }
+ }
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
+ }
+}
+
+void Watcher::send_notify(bufferlist& payload,
+ watcher::NotifyResponse *response,
+ Context *on_finish) {
+ m_notifier.notify(payload, response, on_finish);
+}
+
+void Watcher::WatchCtx::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) {
+ // if notifications are blocked, acknowledge the notification without
+ // bubbling it up to the derived class
+ watcher.m_async_op_tracker.start_op();
+ if (watcher.notifications_blocked()) {
+ bufferlist bl;
+ watcher.acknowledge_notify(notify_id, handle, bl);
+ } else {
+ watcher.handle_notify(notify_id, handle, notifier_id, bl);
+ }
+ watcher.m_async_op_tracker.finish_op();
+}
+
+void Watcher::WatchCtx::handle_error(uint64_t handle, int err) {
+ watcher.handle_error(handle, err);
+}
+
+} // namespace librbd
diff --git a/src/librbd/Watcher.h b/src/librbd/Watcher.h
new file mode 100644
index 000000000..96ecda7d0
--- /dev/null
+++ b/src/librbd/Watcher.h
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_H
+#define CEPH_LIBRBD_WATCHER_H
+
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "common/RWLock.h"
+#include "include/rados/librados.hpp"
+#include "librbd/watcher/Notifier.h"
+#include "librbd/watcher/Types.h"
+#include <string>
+#include <utility>
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+namespace watcher { struct NotifyResponse; }
+
+class Watcher {
+public:
+ struct C_NotifyAck : public Context {
+ Watcher *watcher;
+ CephContext *cct;
+ uint64_t notify_id;
+ uint64_t handle;
+ bufferlist out;
+
+ C_NotifyAck(Watcher *watcher, uint64_t notify_id, uint64_t handle);
+ void finish(int r) override;
+ };
+
+ Watcher(librados::IoCtx& ioctx, asio::ContextWQ *work_queue,
+ const std::string& oid);
+ virtual ~Watcher();
+
+ void register_watch(Context *on_finish);
+ virtual void unregister_watch(Context *on_finish);
+ void flush(Context *on_finish);
+
+ bool notifications_blocked() const;
+ virtual void block_notifies(Context *on_finish);
+ void unblock_notifies();
+
+ std::string get_oid() const;
+ void set_oid(const std::string& oid);
+
+ uint64_t get_watch_handle() const {
+ std::shared_lock watch_locker{m_watch_lock};
+ return m_watch_handle;
+ }
+
+ bool is_registered() const {
+ std::shared_lock locker{m_watch_lock};
+ return is_registered(m_watch_lock);
+ }
+ bool is_unregistered() const {
+ std::shared_lock locker{m_watch_lock};
+ return is_unregistered(m_watch_lock);
+ }
+ bool is_blocklisted() const {
+ std::shared_lock locker{m_watch_lock};
+ return m_watch_blocklisted;
+ }
+
+protected:
+ enum WatchState {
+ WATCH_STATE_IDLE,
+ WATCH_STATE_REGISTERING,
+ WATCH_STATE_REWATCHING
+ };
+
+ librados::IoCtx& m_ioctx;
+ asio::ContextWQ *m_work_queue;
+ std::string m_oid;
+ CephContext *m_cct;
+ mutable ceph::shared_mutex m_watch_lock;
+ uint64_t m_watch_handle;
+ watcher::Notifier m_notifier;
+
+ WatchState m_watch_state;
+ bool m_watch_blocklisted = false;
+
+ AsyncOpTracker m_async_op_tracker;
+
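+ // the unused shared_mutex parameter is a proof-of-lock token: callers
+ // must already hold m_watch_lock (shared or exclusive) when querying
+ // the watch state
+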
+ bool is_registered(const ceph::shared_mutex&) const {
+ return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle != 0);
+ }
+ bool is_unregistered(const ceph::shared_mutex&) const {
+ return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle == 0);
+ }
+
+ void send_notify(bufferlist &payload,
+ watcher::NotifyResponse *response = nullptr,
+ Context *on_finish = nullptr);
+
+ virtual void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) = 0;
+
+ virtual void handle_error(uint64_t cookie, int err);
+
+ void acknowledge_notify(uint64_t notify_id, uint64_t handle,
+ bufferlist &out);
+
+ virtual void handle_rewatch_complete(int r) { }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UNREGISTERED
+ * |
+ * | (register_watch)
+ * |
+ * REGISTERING
+ * |
+ * v (watch error)
+ * REGISTERED * * * * * * * > ERROR
+ * | ^ |
+ * | | | (rewatch)
+ * | | v
+ * | | REWATCHING
+ * | | |
+ * | | |
+ * | \---------------------/
+ * |
+ * | (unregister_watch)
+ * |
+ * v
+ * UNREGISTERED
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ struct WatchCtx : public librados::WatchCtx2 {
+ Watcher &watcher;
+
+ WatchCtx(Watcher &parent) : watcher(parent) {}
+
+ void handle_notify(uint64_t notify_id,
+ uint64_t handle,
+ uint64_t notifier_id,
+ bufferlist& bl) override;
+ void handle_error(uint64_t handle, int err) override;
+ };
+
+ struct C_RegisterWatch : public Context {
+ Watcher *watcher;
+ Context *on_finish;
+
+ C_RegisterWatch(Watcher *watcher, Context *on_finish)
+ : watcher(watcher), on_finish(on_finish) {
+ }
+ void finish(int r) override {
+ watcher->handle_register_watch(r, on_finish);
+ }
+ };
+
+ WatchCtx m_watch_ctx;
+ Context *m_unregister_watch_ctx = nullptr;
+
+ bool m_watch_error = false;
+
+ uint32_t m_blocked_count = 0;
+
+ void handle_register_watch(int r, Context *on_finish);
+
+ void rewatch();
+ void handle_rewatch(int r);
+ void handle_rewatch_callback(int r);
+
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_H
diff --git a/src/librbd/api/Config.cc b/src/librbd/api/Config.cc
new file mode 100644
index 000000000..8148607e3
--- /dev/null
+++ b/src/librbd/api/Config.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Config.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/api/PoolMetadata.h"
+#include "librbd/image/GetMetadataRequest.h"
+#include <algorithm>
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Config: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+const uint32_t MAX_KEYS = 64;
+
+typedef std::map<std::string_view, std::pair<std::string, config_source_t>> Parent;
+
+static std::set<std::string_view> EXCLUDE_OPTIONS {
+ "rbd_auto_exclusive_lock_until_manual_request",
+ "rbd_default_format",
+ "rbd_default_pool",
+ "rbd_discard_on_zeroed_write_same",
+ "rbd_op_thread_timeout",
+ "rbd_op_threads",
+ "rbd_tracing",
+ "rbd_validate_names",
+ "rbd_validate_pool",
+ "rbd_mirror_pool_replayers_refresh_interval",
+ "rbd_config_pool_override_update_timestamp"
+ };
+static std::set<std::string_view> EXCLUDE_IMAGE_OPTIONS {
+ "rbd_default_clone_format",
+ "rbd_default_data_pool",
+ "rbd_default_features",
+ "rbd_default_format",
+ "rbd_default_order",
+ "rbd_default_stripe_count",
+ "rbd_default_stripe_unit",
+ "rbd_journal_order",
+ "rbd_journal_pool",
+ "rbd_journal_splay_width"
+ };
+
+struct Options : Parent {
+ librados::IoCtx m_io_ctx;
+
+ Options(librados::IoCtx& io_ctx, bool image_apply_only_options) {
+ m_io_ctx.dup(io_ctx);
+ m_io_ctx.set_namespace("");
+
+ CephContext *cct = reinterpret_cast<CephContext *>(m_io_ctx.cct());
+
+ const std::string rbd_key_prefix("rbd_");
+ const std::string rbd_mirror_key_prefix("rbd_mirror_");
+ auto& schema = cct->_conf.get_schema();
+ for (auto& pair : schema) {
+ if (!boost::starts_with(pair.first, rbd_key_prefix)) {
+ continue;
+ } else if (EXCLUDE_OPTIONS.count(pair.first) != 0) {
+ continue;
+ } else if (image_apply_only_options &&
+ EXCLUDE_IMAGE_OPTIONS.count(pair.first) != 0) {
+ continue;
+ } else if (image_apply_only_options &&
+ boost::starts_with(pair.first, rbd_mirror_key_prefix)) {
+ continue;
+ }
+
+ insert({pair.first, {}});
+ }
+ }
+
+ int init() {
+ CephContext *cct = (CephContext *)m_io_ctx.cct();
+
+ for (auto& [k,v] : *this) {
+ int r = cct->_conf.get_val(k, &v.first);
+ ceph_assert(r == 0);
+ v.second = RBD_CONFIG_SOURCE_CONFIG;
+ }
+
+ std::string last_key = ImageCtx::METADATA_CONF_PREFIX;
+ bool more_results = true;
+
+ while (more_results) {
+ std::map<std::string, bufferlist> pairs;
+
+ int r = librbd::api::PoolMetadata<>::list(m_io_ctx, last_key, MAX_KEYS,
+ &pairs);
+ if (r < 0) {
+ return r;
+ }
+
+ if (pairs.empty()) {
+ break;
+ }
+
+ more_results = (pairs.size() == MAX_KEYS);
+ last_key = pairs.rbegin()->first;
+
+ for (auto kv : pairs) {
+ std::string key;
+ if (!util::is_metadata_config_override(kv.first, &key)) {
+ more_results = false;
+ break;
+ }
+ auto it = find(key);
+ if (it != end()) {
+ it->second = {{kv.second.c_str(), kv.second.length()},
+ RBD_CONFIG_SOURCE_POOL};
+ }
+ }
+ }
+ return 0;
+ }
+};
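+
+// pool-level overrides live in pool metadata under the
+// ImageCtx::METADATA_CONF_PREFIX key prefix; Options::init() layers
+// them over the compiled-in defaults collected from the config schema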
+
+} // anonymous namespace
+
+template <typename I>
+bool Config<I>::is_option_name(librados::IoCtx& io_ctx,
+ const std::string &name) {
+ Options opts(io_ctx, false);
+
+ return (opts.find(name) != opts.end());
+}
+
+template <typename I>
+int Config<I>::list(librados::IoCtx& io_ctx,
+ std::vector<config_option_t> *options) {
+ Options opts(io_ctx, false);
+
+ int r = opts.init();
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto& [k,v] : opts) {
+ options->push_back({std::string{k}, v.first, v.second});
+ }
+
+ return 0;
+}
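+
+// illustrative call (assumes an open librados::IoCtx io_ctx):
+//
+// std::vector<librbd::config_option_t> options;
+// int r = librbd::api::Config<>::list(io_ctx, &options);
+//
+// each returned entry carries the option name, its effective value and
+// the source (config, pool or image) it was resolved from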
+
+template <typename I>
+bool Config<I>::is_option_name(I *image_ctx, const std::string &name) {
+ Options opts(image_ctx->md_ctx, true);
+
+ return (opts.find(name) != opts.end());
+}
+
+template <typename I>
+int Config<I>::list(I *image_ctx, std::vector<config_option_t> *options) {
+ CephContext *cct = image_ctx->cct;
+ Options opts(image_ctx->md_ctx, true);
+
+ int r = opts.init();
+ if (r < 0) {
+ return r;
+ }
+
+ std::map<std::string, bufferlist> pairs;
+ C_SaferCond ctx;
+ auto req = image::GetMetadataRequest<I>::create(
+ image_ctx->md_ctx, image_ctx->header_oid, true,
+ ImageCtx::METADATA_CONF_PREFIX, ImageCtx::METADATA_CONF_PREFIX, 0U, &pairs,
+ &ctx);
+ req->send();
+
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed reading image metadata: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ for (auto kv : pairs) {
+ std::string key;
+ if (!util::is_metadata_config_override(kv.first, &key)) {
+ break;
+ }
+ auto it = opts.find(key);
+ if (it != opts.end()) {
+ it->second = {{kv.second.c_str(), kv.second.length()},
+ RBD_CONFIG_SOURCE_IMAGE};
+ }
+ }
+
+ for (auto& [k,v] : opts) {
+ options->push_back({std::string{k}, v.first, v.second});
+ }
+
+ return 0;
+}
+
+template <typename I>
+void Config<I>::apply_pool_overrides(librados::IoCtx& io_ctx,
+ ConfigProxy* config) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ Options opts(io_ctx, false);
+ int r = opts.init();
+ if (r < 0) {
+ lderr(cct) << "failed to read pool config overrides: " << cpp_strerror(r)
+ << dendl;
+ return;
+ }
+
+ for (auto& [k,v] : opts) {
+ if (v.second == RBD_CONFIG_SOURCE_POOL) {
+ r = config->set_val(k, v.first);
+ if (r < 0) {
+ lderr(cct) << "failed to override pool config " << k << "="
+ << v.first << ": " << cpp_strerror(r) << dendl;
+ }
+ }
+ }
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Config<librbd::ImageCtx>;
diff --git a/src/librbd/api/Config.h b/src/librbd/api/Config.h
new file mode 100644
index 000000000..83225d287
--- /dev/null
+++ b/src/librbd/api/Config.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_CONFIG_H
+#define CEPH_LIBRBD_API_CONFIG_H
+
+#include "common/config_fwd.h"
+#include "include/common_fwd.h"
+#include "include/rbd/librbd.hpp"
+#include "include/rados/librados_fwd.hpp"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class Config {
+public:
+ static bool is_option_name(librados::IoCtx& io_ctx, const std::string &name);
+ static int list(librados::IoCtx& io_ctx,
+ std::vector<config_option_t> *options);
+
+ static bool is_option_name(ImageCtxT *image_ctx, const std::string &name);
+ static int list(ImageCtxT *image_ctx, std::vector<config_option_t> *options);
+
+ static void apply_pool_overrides(librados::IoCtx& io_ctx,
+ ConfigProxy* config);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Config<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_CONFIG_H
diff --git a/src/librbd/api/DiffIterate.cc b/src/librbd/api/DiffIterate.cc
new file mode 100644
index 000000000..042f5eafb
--- /dev/null
+++ b/src/librbd/api/DiffIterate.cc
@@ -0,0 +1,376 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/DiffIterate.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/internal.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/object_map/DiffRequest.h"
+#include "include/rados/librados.hpp"
+#include "include/interval_set.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "common/Throttle.h"
+#include "osdc/Striper.h"
+#include <boost/tuple/tuple.hpp>
+#include <list>
+#include <map>
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DiffIterate: "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+struct DiffContext {
+ DiffIterate<>::Callback callback;
+ void *callback_arg;
+ bool whole_object;
+ bool include_parent;
+ uint64_t from_snap_id;
+ uint64_t end_snap_id;
+ OrderedThrottle throttle;
+
+ template <typename I>
+ DiffContext(I &image_ctx, DiffIterate<>::Callback callback,
+ void *callback_arg, bool _whole_object, bool _include_parent,
+ uint64_t _from_snap_id, uint64_t _end_snap_id)
+ : callback(callback), callback_arg(callback_arg),
+ whole_object(_whole_object), include_parent(_include_parent),
+ from_snap_id(_from_snap_id), end_snap_id(_end_snap_id),
+ throttle(image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"), true) {
+ }
+};
+
+template <typename I>
+class C_DiffObject : public Context {
+public:
+ C_DiffObject(I &image_ctx, DiffContext &diff_context, uint64_t image_offset,
+ uint64_t image_length)
+ : m_image_ctx(image_ctx), m_cct(image_ctx.cct),
+ m_diff_context(diff_context), m_image_offset(image_offset),
+ m_image_length(image_length) {
+ }
+
+ void send() {
+ Context* ctx = m_diff_context.throttle.start_op(this);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ ctx, util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_GENERIC);
+ int list_snaps_flags = 0;
+ if (!m_diff_context.include_parent || m_diff_context.from_snap_id != 0) {
+ list_snaps_flags |= io::LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT;
+ }
+ if (m_diff_context.whole_object) {
+ list_snaps_flags |= io::LIST_SNAPS_FLAG_WHOLE_OBJECT;
+ }
+ auto req = io::ImageDispatchSpec::create_list_snaps(
+ m_image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START,
+ aio_comp, {{m_image_offset, m_image_length}},
+ {m_diff_context.from_snap_id, m_diff_context.end_snap_id},
+ list_snaps_flags, &m_snapshot_delta, {});
+ req->send();
+ }
+
+protected:
+ typedef boost::tuple<uint64_t, size_t, bool> Diff;
+ typedef std::list<Diff> Diffs;
+
+ void finish(int r) override {
+ CephContext *cct = m_cct;
+
+ if (r < 0) {
+ ldout(cct, 20) << "list_snaps failed: " << m_image_offset << "~"
+ << m_image_length << ": " << cpp_strerror(r) << dendl;
+ }
+
+ Diffs diffs;
+ ldout(cct, 20) << "image extent " << m_image_offset << "~"
+ << m_image_length << ": list_snaps complete" << dendl;
+
+ compute_diffs(&diffs);
+ for (Diffs::const_iterator d = diffs.begin(); d != diffs.end(); ++d) {
+ r = m_diff_context.callback(d->get<0>(), d->get<1>(), d->get<2>(),
+ m_diff_context.callback_arg);
+ if (r < 0) {
+ break;
+ }
+ }
+ m_diff_context.throttle.end_op(r);
+ }
+
+private:
+ I& m_image_ctx;
+ CephContext *m_cct;
+ DiffContext &m_diff_context;
+ uint64_t m_image_offset;
+ uint64_t m_image_length;
+
+ io::SnapshotDelta m_snapshot_delta;
+
+ void compute_diffs(Diffs *diffs) {
+ CephContext *cct = m_cct;
+
+ // merge per-snapshot deltas into an aggregate
+ io::SparseExtents aggregate_snapshot_extents;
+ for (auto& [key, snapshot_extents] : m_snapshot_delta) {
+ for (auto& snapshot_extent : snapshot_extents) {
+ auto state = snapshot_extent.get_val().state;
+
+ // ignore DNE object (and parent)
+ if ((state == io::SPARSE_EXTENT_STATE_DNE) ||
+ (key == io::INITIAL_WRITE_READ_SNAP_IDS &&
+ state == io::SPARSE_EXTENT_STATE_ZEROED)) {
+ continue;
+ }
+
+ aggregate_snapshot_extents.insert(
+ snapshot_extent.get_off(), snapshot_extent.get_len(),
+ {state, snapshot_extent.get_len()});
+ }
+ }
+
+ // build delta callback set
+ for (auto& snapshot_extent : aggregate_snapshot_extents) {
+ ldout(cct, 20) << "off=" << snapshot_extent.get_off() << ", "
+ << "len=" << snapshot_extent.get_len() << ", "
+ << "state=" << snapshot_extent.get_val().state << dendl;
+ diffs->emplace_back(
+ snapshot_extent.get_off(), snapshot_extent.get_len(),
+ snapshot_extent.get_val().state == io::SPARSE_EXTENT_STATE_DATA);
+ }
+ }
+};
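+
+// each C_DiffObject issues a single throttled list-snaps request for a
+// chunk of the image and folds the returned per-snapshot deltas into
+// plain (offset, length, exists) callback invocations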
+
+int simple_diff_cb(uint64_t off, size_t len, int exists, void *arg) {
+ // it's possible for a discard to create a hole in the parent image -- ignore
+ if (exists) {
+ interval_set<uint64_t> *diff = static_cast<interval_set<uint64_t> *>(arg);
+ diff->insert(off, len);
+ }
+ return 0;
+}
+
+} // anonymous namespace
+
+template <typename I>
+int DiffIterate<I>::diff_iterate(I *ictx,
+ const cls::rbd::SnapshotNamespace& from_snap_namespace,
+ const char *fromsnapname,
+ uint64_t off, uint64_t len,
+ bool include_parent, bool whole_object,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg)
+{
+ ldout(ictx->cct, 20) << "diff_iterate " << ictx << " off = " << off
+ << " len = " << len << dendl;
+
+ if (!ictx->data_ctx.is_valid()) {
+ return -ENODEV;
+ }
+
+ // ensure previous writes are visible to listsnaps
+ C_SaferCond flush_ctx;
+ {
+ std::shared_lock owner_locker{ictx->owner_lock};
+ auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, ictx,
+ io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ *ictx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START,
+ aio_comp, io::FLUSH_SOURCE_INTERNAL, {});
+ req->send();
+ }
+ int r = flush_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ ictx->image_lock.lock_shared();
+ r = clip_io(ictx, off, &len);
+ ictx->image_lock.unlock_shared();
+ if (r < 0) {
+ return r;
+ }
+
+ DiffIterate command(*ictx, from_snap_namespace, fromsnapname, off, len,
+ include_parent, whole_object, cb, arg);
+ r = command.execute();
+ return r;
+}
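+
+// caller-side sketch via the public librbd API (illustrative; assumes
+// an open librbd::Image `image` of size `size`):
+//
+// int print_cb(uint64_t off, size_t len, int exists, void *) {
+//   std::cout << off << "~" << len << " exists=" << exists << std::endl;
+//   return 0;
+// }
+//
+// image.diff_iterate2(nullptr, 0, size, true, false, print_cb, nullptr);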
+
+template <typename I>
+int DiffIterate<I>::execute() {
+ CephContext* cct = m_image_ctx.cct;
+
+ ceph_assert(m_image_ctx.data_ctx.is_valid());
+
+ librados::snap_t from_snap_id = 0;
+ librados::snap_t end_snap_id;
+ uint64_t from_size = 0;
+ uint64_t end_size;
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ if (m_from_snap_name) {
+ from_snap_id = m_image_ctx.get_snap_id(m_from_snap_namespace,
+ m_from_snap_name);
+ from_size = m_image_ctx.get_image_size(from_snap_id);
+ }
+ end_snap_id = m_image_ctx.snap_id;
+ end_size = m_image_ctx.get_image_size(end_snap_id);
+ }
+
+ if (from_snap_id == CEPH_NOSNAP) {
+ return -ENOENT;
+ }
+ if (from_snap_id == end_snap_id) {
+ // no diff.
+ return 0;
+ }
+ if (from_snap_id >= end_snap_id) {
+ return -EINVAL;
+ }
+
+ int r;
+ bool fast_diff_enabled = false;
+ BitVector<2> object_diff_state;
+ interval_set<uint64_t> parent_diff;
+ if (m_whole_object) {
+ C_SaferCond ctx;
+ auto req = object_map::DiffRequest<I>::create(&m_image_ctx, from_snap_id,
+ end_snap_id,
+ &object_diff_state, &ctx);
+ req->send();
+
+ r = ctx.wait();
+ if (r < 0) {
+ ldout(cct, 5) << "fast diff disabled" << dendl;
+ } else {
+ ldout(cct, 5) << "fast diff enabled" << dendl;
+ fast_diff_enabled = true;
+
+ // check parent overlap only if we are comparing to the beginning of time
+ if (m_include_parent && from_snap_id == 0) {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ uint64_t overlap = 0;
+ m_image_ctx.get_parent_overlap(m_image_ctx.snap_id, &overlap);
+ if (m_image_ctx.parent && overlap > 0) {
+ ldout(cct, 10) << " first getting parent diff" << dendl;
+ DiffIterate diff_parent(*m_image_ctx.parent, {}, nullptr, 0, overlap,
+ true, true, &simple_diff_cb, &parent_diff);
+ r = diff_parent.execute();
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+ }
+ }
+
+ ldout(cct, 5) << "diff_iterate from " << from_snap_id << " to "
+ << end_snap_id << " size from " << from_size
+ << " to " << end_size << dendl;
+ DiffContext diff_context(m_image_ctx, m_callback, m_callback_arg,
+ m_whole_object, m_include_parent, from_snap_id,
+ end_snap_id);
+
+ uint64_t period = m_image_ctx.get_stripe_period();
+ uint64_t off = m_offset;
+ uint64_t left = m_length;
+
+ while (left > 0) {
+ uint64_t period_off = off - (off % period);
+ uint64_t read_len = min(period_off + period - off, left);
+
+ if (fast_diff_enabled) {
+ // map to extents
+ map<object_t,vector<ObjectExtent> > object_extents;
+ Striper::file_to_extents(cct, m_image_ctx.format_string,
+ &m_image_ctx.layout, off, read_len, 0,
+ object_extents, 0);
+
+ // get diff info for each object and merge adjacent stripe units
+ // into an aggregate (this also sorts them)
+ io::SparseExtents aggregate_sparse_extents;
+ for (auto& [object, extents] : object_extents) {
+ const uint64_t object_no = extents.front().objectno;
+ uint8_t diff_state = object_diff_state[object_no];
+ ldout(cct, 20) << "object " << object << ": diff_state="
+ << (int)diff_state << dendl;
+
+ if (diff_state == object_map::DIFF_STATE_HOLE &&
+ from_snap_id == 0 && !parent_diff.empty()) {
+ // no data in child object -- report parent diff instead
+ for (auto& oe : extents) {
+ for (auto& be : oe.buffer_extents) {
+ interval_set<uint64_t> o;
+ o.insert(off + be.first, be.second);
+ o.intersection_of(parent_diff);
+ ldout(cct, 20) << " reporting parent overlap " << o << dendl;
+ for (auto e = o.begin(); e != o.end(); ++e) {
+ aggregate_sparse_extents.insert(e.get_start(), e.get_len(),
+ {io::SPARSE_EXTENT_STATE_DATA,
+ e.get_len()});
+ }
+ }
+ }
+ } else if (diff_state == object_map::DIFF_STATE_HOLE_UPDATED ||
+ diff_state == object_map::DIFF_STATE_DATA_UPDATED) {
+ auto state = (diff_state == object_map::DIFF_STATE_HOLE_UPDATED ?
+ io::SPARSE_EXTENT_STATE_ZEROED : io::SPARSE_EXTENT_STATE_DATA);
+ for (auto& oe : extents) {
+ for (auto& be : oe.buffer_extents) {
+ aggregate_sparse_extents.insert(off + be.first, be.second,
+ {state, be.second});
+ }
+ }
+ }
+ }
+
+ for (const auto& se : aggregate_sparse_extents) {
+ ldout(cct, 20) << "off=" << se.get_off() << ", len=" << se.get_len()
+ << ", state=" << se.get_val().state << dendl;
+ r = m_callback(se.get_off(), se.get_len(),
+ se.get_val().state == io::SPARSE_EXTENT_STATE_DATA,
+ m_callback_arg);
+ if (r < 0) {
+ return r;
+ }
+ }
+ } else {
+ auto diff_object = new C_DiffObject<I>(m_image_ctx, diff_context, off,
+ read_len);
+ diff_object->send();
+
+ if (diff_context.throttle.pending_error()) {
+ r = diff_context.throttle.wait_for_ret();
+ return r;
+ }
+ }
+
+ left -= read_len;
+ off += read_len;
+ }
+
+ r = diff_context.throttle.wait_for_ret();
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::DiffIterate<librbd::ImageCtx>;
diff --git a/src/librbd/api/DiffIterate.h b/src/librbd/api/DiffIterate.h
new file mode 100644
index 000000000..e6074d9cb
--- /dev/null
+++ b/src/librbd/api/DiffIterate.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_DIFF_ITERATE_H
+#define CEPH_LIBRBD_API_DIFF_ITERATE_H
+
+#include "include/int_types.h"
+#include "common/bit_vector.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class DiffIterate {
+public:
+ typedef int (*Callback)(uint64_t, size_t, int, void *);
+
+ static int diff_iterate(ImageCtxT *ictx,
+ const cls::rbd::SnapshotNamespace& from_snap_namespace,
+ const char *fromsnapname,
+ uint64_t off, uint64_t len, bool include_parent,
+ bool whole_object,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg);
+
+private:
+ ImageCtxT &m_image_ctx;
+ cls::rbd::SnapshotNamespace m_from_snap_namespace;
+ const char* m_from_snap_name;
+ uint64_t m_offset;
+ uint64_t m_length;
+ bool m_include_parent;
+ bool m_whole_object;
+ Callback m_callback;
+ void *m_callback_arg;
+
+ DiffIterate(ImageCtxT &image_ctx,
+ const cls::rbd::SnapshotNamespace& from_snap_namespace,
+ const char *from_snap_name, uint64_t off, uint64_t len,
+ bool include_parent, bool whole_object, Callback callback,
+ void *callback_arg)
+ : m_image_ctx(image_ctx), m_from_snap_namespace(from_snap_namespace),
+ m_from_snap_name(from_snap_name), m_offset(off),
+ m_length(len), m_include_parent(include_parent),
+ m_whole_object(whole_object), m_callback(callback),
+ m_callback_arg(callback_arg)
+ {
+ }
+
+ int execute();
+
+ int diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id,
+ BitVector<2>* object_diff_state);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::DiffIterate<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_DIFF_ITERATE_H
diff --git a/src/librbd/api/Group.cc b/src/librbd/api/Group.cc
new file mode 100644
index 000000000..72a99cf47
--- /dev/null
+++ b/src/librbd/api/Group.cc
@@ -0,0 +1,1290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Cond.h"
+#include "common/errno.h"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/api/Group.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/internal.h"
+#include "librbd/io/AioCompletion.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Group: " << __func__ << ": "
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+// list binds to list() here, so std::list is explicitly used below
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+using librados::Rados;
+
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+template <typename I>
+snap_t get_group_snap_id(I* ictx,
+ const cls::rbd::SnapshotNamespace& in_snap_namespace) {
+ ceph_assert(ceph_mutex_is_locked(ictx->image_lock));
+ auto it = ictx->snap_ids.lower_bound({cls::rbd::GroupSnapshotNamespace{},
+ ""});
+ for (; it != ictx->snap_ids.end(); ++it) {
+ if (it->first.first == in_snap_namespace) {
+ return it->second;
+ } else if (boost::get<cls::rbd::GroupSnapshotNamespace>(&it->first.first) ==
+ nullptr) {
+ break;
+ }
+ }
+ return CEPH_NOSNAP;
+}
+
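+// despite its name this is not an RFC 4122 UUID: it concatenates the
+// hex rados instance id with a random 32-bit suffix, which is unique
+// enough to serve as a group id in the group directory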
+string generate_uuid(librados::IoCtx& io_ctx)
+{
+ Rados rados(io_ctx);
+ uint64_t bid = rados.get_instance_id();
+
+ uint32_t extra = rand() % 0xFFFFFFFF;
+ ostringstream bid_ss;
+ bid_ss << std::hex << bid << std::hex << extra;
+ return bid_ss.str();
+}
+
+int group_snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+ std::vector<cls::rbd::GroupSnapshot> *cls_snaps)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+
+ string group_id;
+ vector<string> ind_snap_names;
+
+ int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+ group_name, &group_id);
+ if (r < 0) {
+ lderr(cct) << "error reading group id object: "
+ << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ string group_header_oid = util::group_header_name(group_id);
+
+ const int max_read = 1024;
+ cls::rbd::GroupSnapshot snap_last;
+
+ for (;;) {
+ vector<cls::rbd::GroupSnapshot> snaps_page;
+
+ r = cls_client::group_snap_list(&group_ioctx, group_header_oid,
+ snap_last, max_read, &snaps_page);
+
+ if (r < 0) {
+ lderr(cct) << "error reading snap list from group: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ cls_snaps->insert(cls_snaps->end(), snaps_page.begin(), snaps_page.end());
+ if (snaps_page.size() < max_read) {
+ break;
+ }
+ snap_last = *snaps_page.rbegin();
+ }
+
+ return 0;
+}
+
+std::string calc_ind_image_snap_name(uint64_t pool_id,
+ const std::string &group_id,
+ const std::string &snap_id)
+{
+ std::stringstream ind_snap_name_stream;
+ ind_snap_name_stream << ".group." << std::hex << pool_id << "_"
+ << group_id << "_" << snap_id;
+ return ind_snap_name_stream.str();
+}
+
+int group_image_list(librados::IoCtx& group_ioctx, const char *group_name,
+ std::vector<cls::rbd::GroupImageStatus> *image_ids)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+
+ string group_id;
+
+ int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+ group_name, &group_id);
+ if (r < 0) {
+ lderr(cct) << "error reading group id object: "
+ << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ string group_header_oid = util::group_header_name(group_id);
+
+ ldout(cct, 20) << "listing images in group name "
+ << group_name << " group id " << group_header_oid << dendl;
+ image_ids->clear();
+
+ const int max_read = 1024;
+ cls::rbd::GroupImageSpec start_last;
+ do {
+ std::vector<cls::rbd::GroupImageStatus> image_ids_page;
+
+ r = cls_client::group_image_list(&group_ioctx, group_header_oid,
+ start_last, max_read, &image_ids_page);
+
+ if (r < 0) {
+ lderr(cct) << "error reading image list from group: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ image_ids->insert(image_ids->end(),
+ image_ids_page.begin(), image_ids_page.end());
+
+ if (image_ids_page.size() > 0)
+ start_last = image_ids_page.rbegin()->spec;
+
+ r = image_ids_page.size();
+ } while (r == max_read);
+
+ return 0;
+}
+
+int group_image_remove(librados::IoCtx& group_ioctx, string group_id,
+ librados::IoCtx& image_ioctx, string image_id)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+
+ string group_header_oid = util::group_header_name(group_id);
+
+ string image_header_oid = util::header_name(image_id);
+
+ ldout(cct, 20) << "removing image " << image_id
+ << " image id " << image_header_oid << dendl;
+
+ cls::rbd::GroupSpec group_spec(group_id, group_ioctx.get_id());
+
+ cls::rbd::GroupImageStatus incomplete_st(image_id, image_ioctx.get_id(),
+ cls::rbd::GROUP_IMAGE_LINK_STATE_INCOMPLETE);
+
+ cls::rbd::GroupImageSpec spec(image_id, image_ioctx.get_id());
+
+ int r = cls_client::group_image_set(&group_ioctx, group_header_oid,
+ incomplete_st);
+
+ if (r < 0) {
+ lderr(cct) << "couldn't put image into removing state: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = cls_client::image_group_remove(&image_ioctx, image_header_oid,
+ group_spec);
+ if ((r < 0) && (r != -ENOENT)) {
+ lderr(cct) << "couldn't remove group reference from image"
+ << cpp_strerror(-r) << dendl;
+ return r;
+ } else if (r >= 0) {
+ ImageWatcher<>::notify_header_update(image_ioctx, image_header_oid);
+ }
+
+ r = cls_client::group_image_remove(&group_ioctx, group_header_oid, spec);
+ if (r < 0) {
+ lderr(cct) << "couldn't remove image from group"
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int group_snap_remove_by_record(librados::IoCtx& group_ioctx,
+ const cls::rbd::GroupSnapshot& group_snap,
+ const std::string& group_id,
+ const std::string& group_header_oid) {
+
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+ std::vector<C_SaferCond*> on_finishes;
+ int r, ret_code;
+
+ std::vector<librbd::ImageCtx*> ictxs;
+
+ cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id,
+ group_snap.id};
+
+ ldout(cct, 20) << "Removing snapshots" << dendl;
+ int snap_count = group_snap.snaps.size();
+
+ for (int i = 0; i < snap_count; ++i) {
+ librbd::IoCtx image_io_ctx;
+ r = util::create_ioctx(group_ioctx, "image", group_snap.snaps[i].pool, {},
+ &image_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageCtx* image_ctx = new ImageCtx("", group_snap.snaps[i].image_id,
+ nullptr, image_io_ctx, false);
+
+ C_SaferCond* on_finish = new C_SaferCond;
+
+ image_ctx->state->open(0, on_finish);
+
+ ictxs.push_back(image_ctx);
+ on_finishes.push_back(on_finish);
+ }
+
+ ret_code = 0;
+ for (int i = 0; i < snap_count; ++i) {
+ r = on_finishes[i]->wait();
+ delete on_finishes[i];
+ if (r < 0) {
+ ictxs[i] = nullptr;
+ ret_code = r;
+ }
+ }
+ if (ret_code != 0) {
+ goto finish;
+ }
+
+ ldout(cct, 20) << "Opened participating images. " <<
+ "Deleting snapshots themselves." << dendl;
+
+ for (int i = 0; i < snap_count; ++i) {
+ ImageCtx *ictx = ictxs[i];
+ on_finishes[i] = new C_SaferCond;
+
+ std::string snap_name;
+ ictx->image_lock.lock_shared();
+ snap_t snap_id = get_group_snap_id(ictx, ne);
+ r = ictx->get_snap_name(snap_id, &snap_name);
+ ictx->image_lock.unlock_shared();
+
+ if (r >= 0) {
+ ldout(cct, 20) << "removing individual snapshot from image " << ictx->name
+ << dendl;
+ ictx->operations->snap_remove(ne, snap_name, on_finishes[i]);
+ } else {
+ // We are ok to ignore missing image snapshots. The snapshot could have
+ // been inconsistent in the first place.
+ on_finishes[i]->complete(0);
+ }
+ }
+
+ for (int i = 0; i < snap_count; ++i) {
+ r = on_finishes[i]->wait();
+ delete on_finishes[i];
+ if (r < 0 && r != -ENOENT) {
+ // if previous attempts to remove this snapshot failed then the image's
+ // snapshot may not exist
+ lderr(cct) << "Failed deleting image snapshot. Ret code: " << r << dendl;
+ ret_code = r;
+ }
+ }
+
+ if (ret_code != 0) {
+ goto finish;
+ }
+
+ ldout(cct, 20) << "Removed images snapshots removing snapshot record."
+ << dendl;
+
+ r = cls_client::group_snap_remove(&group_ioctx, group_header_oid,
+ group_snap.id);
+ if (r < 0) {
+ ret_code = r;
+ goto finish;
+ }
+
+finish:
+ for (int i = 0; i < snap_count; ++i) {
+ if (ictxs[i] != nullptr) {
+ ictxs[i]->state->close();
+ }
+ }
+ return ret_code;
+}
+
+int group_snap_rollback_by_record(librados::IoCtx& group_ioctx,
+ const cls::rbd::GroupSnapshot& group_snap,
+ const std::string& group_id,
+ const std::string& group_header_oid,
+ ProgressContext& pctx) {
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+ std::vector<C_SaferCond*> on_finishes;
+ int r, ret_code;
+
+ std::vector<librbd::ImageCtx*> ictxs;
+
+ cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id,
+ group_snap.id};
+
+ ldout(cct, 20) << "Rolling back snapshots" << dendl;
+ int snap_count = group_snap.snaps.size();
+
+ for (int i = 0; i < snap_count; ++i) {
+ librados::IoCtx image_io_ctx;
+ r = util::create_ioctx(group_ioctx, "image", group_snap.snaps[i].pool, {},
+ &image_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageCtx* image_ctx = new ImageCtx("", group_snap.snaps[i].image_id,
+ nullptr, image_io_ctx, false);
+
+ C_SaferCond* on_finish = new C_SaferCond;
+
+ image_ctx->state->open(0, on_finish);
+
+ ictxs.push_back(image_ctx);
+ on_finishes.push_back(on_finish);
+ }
+
+ ret_code = 0;
+ for (int i = 0; i < snap_count; ++i) {
+ r = on_finishes[i]->wait();
+ delete on_finishes[i];
+ if (r < 0) {
+ ictxs[i] = nullptr;
+ ret_code = r;
+ }
+ }
+ if (ret_code != 0) {
+ goto finish;
+ }
+
+ ldout(cct, 20) << "Requesting exclusive locks for images" << dendl;
+ for (auto ictx: ictxs) {
+ std::shared_lock owner_lock{ictx->owner_lock};
+ if (ictx->exclusive_lock != nullptr) {
+ ictx->exclusive_lock->block_requests(-EBUSY);
+ }
+ }
+ for (int i = 0; i < snap_count; ++i) {
+ ImageCtx *ictx = ictxs[i];
+ std::shared_lock owner_lock{ictx->owner_lock};
+
+ on_finishes[i] = new C_SaferCond;
+ if (ictx->exclusive_lock != nullptr) {
+ ictx->exclusive_lock->acquire_lock(on_finishes[i]);
+ }
+ }
+
+ ret_code = 0;
+ for (int i = 0; i < snap_count; ++i) {
+ r = 0;
+ ImageCtx *ictx = ictxs[i];
+ if (ictx->exclusive_lock != nullptr) {
+ r = on_finishes[i]->wait();
+ }
+ delete on_finishes[i];
+ if (r < 0) {
+ ret_code = r;
+ }
+ }
+ if (ret_code != 0) {
+ goto finish;
+ }
+
+ for (int i = 0; i < snap_count; ++i) {
+ ImageCtx *ictx = ictxs[i];
+ on_finishes[i] = new C_SaferCond;
+
+ std::shared_lock owner_locker{ictx->owner_lock};
+ std::string snap_name;
+ ictx->image_lock.lock_shared();
+ snap_t snap_id = get_group_snap_id(ictx, ne);
+ r = ictx->get_snap_name(snap_id, &snap_name);
+ ictx->image_lock.unlock_shared();
+
+ if (r >= 0) {
+ ldout(cct, 20) << "rolling back to individual snapshot for image " << ictx->name
+ << dendl;
+ ictx->operations->execute_snap_rollback(ne, snap_name, pctx, on_finishes[i]);
+ } else {
+ on_finishes[i]->complete(r);
+ }
+ }
+
+ for (int i = 0; i < snap_count; ++i) {
+ r = on_finishes[i]->wait();
+ delete on_finishes[i];
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "Failed rolling back group to snapshot. Ret code: " << r << dendl;
+ ret_code = r;
+ }
+ }
+
+finish:
+ for (int i = 0; i < snap_count; ++i) {
+ if (ictxs[i] != nullptr) {
+ ictxs[i]->state->close();
+ }
+ }
+ return ret_code;
+}
+
+template <typename I>
+void notify_unquiesce(std::vector<I*> &ictxs,
+ const std::vector<uint64_t> &requests) {
+ if (requests.empty()) {
+ return;
+ }
+
+ ceph_assert(requests.size() == ictxs.size());
+ int image_count = ictxs.size();
+ std::vector<C_SaferCond> on_finishes(image_count);
+
+ for (int i = 0; i < image_count; ++i) {
+ ImageCtx *ictx = ictxs[i];
+
+ ictx->image_watcher->notify_unquiesce(requests[i], &on_finishes[i]);
+ }
+
+ for (int i = 0; i < image_count; ++i) {
+ on_finishes[i].wait();
+ }
+}
+
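+// Send quiesce notifications to all images and record the per-image request
+// ids. If any image fails to quiesce, the images are unquiesced again before
+// the error is returned.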
+template <typename I>
+int notify_quiesce(std::vector<I*> &ictxs, ProgressContext &prog_ctx,
+ std::vector<uint64_t> *requests) {
+ int image_count = ictxs.size();
+ std::vector<C_SaferCond> on_finishes(image_count);
+
+ requests->resize(image_count);
+ for (int i = 0; i < image_count; ++i) {
+ auto ictx = ictxs[i];
+
+ ictx->image_watcher->notify_quiesce(&(*requests)[i], prog_ctx,
+ &on_finishes[i]);
+ }
+
+ int ret_code = 0;
+ for (int i = 0; i < image_count; ++i) {
+ int r = on_finishes[i].wait();
+ if (r < 0) {
+ ret_code = r;
+ }
+ }
+
+ if (ret_code != 0) {
+ notify_unquiesce(ictxs, *requests);
+ }
+
+ return ret_code;
+}
+
+} // anonymous namespace
+
+template <typename I>
+int Group<I>::image_remove_by_id(librados::IoCtx& group_ioctx,
+ const char *group_name,
+ librados::IoCtx& image_ioctx,
+ const char *image_id)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+ ldout(cct, 20) << "io_ctx=" << &group_ioctx
+ << " group name " << group_name << " image "
+ << &image_ioctx << " id " << image_id << dendl;
+
+ string group_id;
+
+ int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name,
+ &group_id);
+ if (r < 0) {
+ lderr(cct) << "error reading group id object: "
+ << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ldout(cct, 20) << "removing image from group name " << group_name
+ << " group id " << group_id << dendl;
+
+ return group_image_remove(group_ioctx, group_id, image_ioctx, string(image_id));
+}
+
+template <typename I>
+int Group<I>::create(librados::IoCtx& io_ctx, const char *group_name)
+{
+ CephContext *cct = (CephContext *)io_ctx.cct();
+
+ string id = generate_uuid(io_ctx);
+
+ ldout(cct, 2) << "adding group to directory..." << dendl;
+
+ int r = cls_client::group_dir_add(&io_ctx, RBD_GROUP_DIRECTORY, group_name,
+ id);
+ if (r < 0) {
+ lderr(cct) << "error adding group to directory: "
+ << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ string header_oid = util::group_header_name(id);
+
+ r = io_ctx.create(header_oid, true);
+ if (r < 0) {
+ lderr(cct) << "error creating group header: " << cpp_strerror(r) << dendl;
+ goto err_remove_from_dir;
+ }
+
+ return 0;
+
+err_remove_from_dir:
+ int remove_r = cls_client::group_dir_remove(&io_ctx, RBD_GROUP_DIRECTORY,
+ group_name, id);
+ if (remove_r < 0) {
+ lderr(cct) << "error cleaning up group from rbd_directory "
+ << "object after creation failed: " << cpp_strerror(remove_r)
+ << dendl;
+ }
+
+ return r;
+}
+
+template <typename I>
+int Group<I>::remove(librados::IoCtx& io_ctx, const char *group_name)
+{
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << "group_remove " << &io_ctx << " " << group_name << dendl;
+
+ std::string group_id;
+ int r = cls_client::dir_get_id(&io_ctx, RBD_GROUP_DIRECTORY,
+ std::string(group_name), &group_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error getting id of group" << dendl;
+ return r;
+ }
+ string group_header_oid = util::group_header_name(group_id);
+
+ std::vector<cls::rbd::GroupSnapshot> snaps;
+ r = group_snap_list(io_ctx, group_name, &snaps);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error listing group snapshots" << dendl;
+ return r;
+ }
+
+ for (auto &snap : snaps) {
+ r = group_snap_remove_by_record(io_ctx, snap, group_id, group_header_oid);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ std::vector<cls::rbd::GroupImageStatus> images;
+ r = group_image_list(io_ctx, group_name, &images);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error listing group images" << dendl;
+ return r;
+ }
+
+ for (auto image : images) {
+ IoCtx image_ioctx;
+ r = util::create_ioctx(io_ctx, "image", image.spec.pool_id, {},
+ &image_ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = group_image_remove(io_ctx, group_id, image_ioctx, image.spec.image_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error removing image from a group" << dendl;
+ return r;
+ }
+ }
+
+ string header_oid = util::group_header_name(group_id);
+
+ r = io_ctx.remove(header_oid);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error removing header: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = cls_client::group_dir_remove(&io_ctx, RBD_GROUP_DIRECTORY,
+ group_name, group_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error removing group from directory" << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Group<I>::list(IoCtx& io_ctx, vector<string> *names)
+{
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 20) << "io_ctx=" << &io_ctx << dendl;
+
+ int max_read = 1024;
+ string last_read = "";
+ int r;
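+ // Page through the group directory max_read entries at a time, resuming
+ // after the last name returned by the previous call.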
+ do {
+ map<string, string> groups;
+ r = cls_client::group_dir_list(&io_ctx, RBD_GROUP_DIRECTORY, last_read,
+ max_read, &groups);
+ if (r == -ENOENT) {
+ // Ignore a missing rbd group directory -- it simply means no groups
+ // have been created yet.
+ return 0;
+ }
+ if (r < 0) {
+ lderr(cct) << "error listing groups in directory: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ for (const auto& group : groups) {
+ names->push_back(group.first);
+ }
+ if (!groups.empty()) {
+ last_read = groups.rbegin()->first;
+ }
+ r = groups.size();
+ } while (r == max_read);
+
+ return 0;
+}
+
+template <typename I>
+int Group<I>::image_add(librados::IoCtx& group_ioctx, const char *group_name,
+ librados::IoCtx& image_ioctx, const char *image_name)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+ ldout(cct, 20) << "io_ctx=" << &group_ioctx
+ << " group name " << group_name << " image "
+ << &image_ioctx << " name " << image_name << dendl;
+
+ if (group_ioctx.get_namespace() != image_ioctx.get_namespace()) {
+ lderr(cct) << "group and image cannot be in different namespaces" << dendl;
+ return -EINVAL;
+ }
+
+ string group_id;
+
+ int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name,
+ &group_id);
+ if (r < 0) {
+ lderr(cct) << "error reading group id object: "
+ << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ string group_header_oid = util::group_header_name(group_id);
+
+ ldout(cct, 20) << "adding image to group name " << group_name
+ << " group id " << group_header_oid << dendl;
+
+ string image_id;
+
+ r = cls_client::dir_get_id(&image_ioctx, RBD_DIRECTORY, image_name,
+ &image_id);
+ if (r < 0) {
+ lderr(cct) << "error reading image id object: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ string image_header_oid = util::header_name(image_id);
+
+ ldout(cct, 20) << "adding image " << image_name
+ << " image id " << image_header_oid << dendl;
+
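+ // The link is established in two phases: the image is first recorded in
+ // the group as INCOMPLETE, a back-reference is then added to the image
+ // header, and only afterwards is the state promoted to ATTACHED, so a
+ // crash in between leaves a detectable partial link.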
+ cls::rbd::GroupImageStatus incomplete_st(
+ image_id, image_ioctx.get_id(),
+ cls::rbd::GROUP_IMAGE_LINK_STATE_INCOMPLETE);
+ cls::rbd::GroupImageStatus attached_st(
+ image_id, image_ioctx.get_id(), cls::rbd::GROUP_IMAGE_LINK_STATE_ATTACHED);
+
+ r = cls_client::group_image_set(&group_ioctx, group_header_oid,
+ incomplete_st);
+
+ cls::rbd::GroupSpec group_spec(group_id, group_ioctx.get_id());
+
+ if (r < 0) {
+ lderr(cct) << "error adding image reference to group: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = cls_client::image_group_add(&image_ioctx, image_header_oid, group_spec);
+ if (r < 0) {
+ lderr(cct) << "error adding group reference to image: "
+ << cpp_strerror(-r) << dendl;
+ cls::rbd::GroupImageSpec spec(image_id, image_ioctx.get_id());
+ cls_client::group_image_remove(&group_ioctx, group_header_oid, spec);
+ // Ignore errors in the clean up procedure.
+ return r;
+ }
+ ImageWatcher<>::notify_header_update(image_ioctx, image_header_oid);
+
+ r = cls_client::group_image_set(&group_ioctx, group_header_oid,
+ attached_st);
+
+ return r;
+}
+
+template <typename I>
+int Group<I>::image_remove(librados::IoCtx& group_ioctx, const char *group_name,
+ librados::IoCtx& image_ioctx, const char *image_name)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+ ldout(cct, 20) << "io_ctx=" << &group_ioctx
+ << " group name " << group_name << " image "
+ << &image_ioctx << " name " << image_name << dendl;
+
+ if (group_ioctx.get_namespace() != image_ioctx.get_namespace()) {
+ lderr(cct) << "group and image cannot be in different namespaces" << dendl;
+ return -EINVAL;
+ }
+
+ string group_id;
+
+ int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name,
+ &group_id);
+ if (r < 0) {
+ lderr(cct) << "error reading group id object: "
+ << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ldout(cct, 20) << "removing image from group name " << group_name
+ << " group id " << group_id << dendl;
+
+ string image_id;
+ r = cls_client::dir_get_id(&image_ioctx, RBD_DIRECTORY, image_name,
+ &image_id);
+ if (r < 0) {
+ lderr(cct) << "error reading image id object: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = group_image_remove(group_ioctx, group_id, image_ioctx, image_id);
+
+ return r;
+}
+
+template <typename I>
+int Group<I>::image_list(librados::IoCtx& group_ioctx,
+ const char *group_name,
+ std::vector<group_image_info_t>* images)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+ ldout(cct, 20) << "io_ctx=" << &group_ioctx
+ << " group name " << group_name << dendl;
+
+ std::vector<cls::rbd::GroupImageStatus> image_ids;
+
+ int r = group_image_list(group_ioctx, group_name, &image_ids);
+ if (r < 0) {
+ return r;
+ }
+
+ for (const auto& image_id : image_ids) {
+ IoCtx ioctx;
+ r = util::create_ioctx(group_ioctx, "image", image_id.spec.pool_id, {},
+ &ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string image_name;
+ r = cls_client::dir_get_name(&ioctx, RBD_DIRECTORY,
+ image_id.spec.image_id, &image_name);
+ if (r < 0) {
+ return r;
+ }
+
+ images->push_back(
+ group_image_info_t {
+ image_name,
+ ioctx.get_id(),
+ static_cast<group_image_state_t>(image_id.state)});
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Group<I>::rename(librados::IoCtx& io_ctx, const char *src_name,
+ const char *dest_name)
+{
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << "group_rename " << &io_ctx << " " << src_name
+ << " -> " << dest_name << dendl;
+
+ std::string group_id;
+ int r = cls_client::dir_get_id(&io_ctx, RBD_GROUP_DIRECTORY,
+ std::string(src_name), &group_id);
+ if (r < 0) {
+ if (r != -ENOENT)
+ lderr(cct) << "error getting id of group" << dendl;
+ return r;
+ }
+
+ r = cls_client::group_dir_rename(&io_ctx, RBD_GROUP_DIRECTORY,
+ src_name, dest_name, group_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error renaming group from directory" << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Group<I>::image_get_group(I *ictx, group_info_t *group_info)
+{
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ if (RBD_GROUP_INVALID_POOL != ictx->group_spec.pool_id) {
+ IoCtx ioctx;
+ r = util::create_ioctx(ictx->md_ctx, "group", ictx->group_spec.pool_id, {},
+ &ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string group_name;
+ r = cls_client::dir_get_name(&ioctx, RBD_GROUP_DIRECTORY,
+ ictx->group_spec.group_id, &group_name);
+ if (r < 0)
+ return r;
+ group_info->pool = ioctx.get_id();
+ group_info->name = group_name;
+ } else {
+ group_info->pool = RBD_GROUP_INVALID_POOL;
+ group_info->name = "";
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Group<I>::snap_create(librados::IoCtx& group_ioctx,
+ const char *group_name, const char *snap_name,
+ uint32_t flags) {
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+
+ string group_id;
+ cls::rbd::GroupSnapshot group_snap;
+ vector<cls::rbd::ImageSnapshotSpec> image_snaps;
+ std::string ind_snap_name;
+
+ std::vector<librbd::ImageCtx*> ictxs;
+ std::vector<C_SaferCond*> on_finishes;
+ std::vector<uint64_t> quiesce_requests;
+ NoOpProgressContext prog_ctx;
+ uint64_t internal_flags = 0;
+
+ int r = util::snap_create_flags_api_to_internal(cct, flags, &internal_flags);
+ if (r < 0) {
+ return r;
+ }
+ internal_flags &= ~(SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE |
+ SNAP_CREATE_FLAG_IGNORE_NOTIFY_QUIESCE_ERROR);
+
+ r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name,
+ &group_id);
+ if (r < 0) {
+ lderr(cct) << "error reading group id object: "
+ << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::vector<cls::rbd::GroupImageStatus> images;
+ r = group_image_list(group_ioctx, group_name, &images);
+ if (r < 0) {
+ return r;
+ }
+ int image_count = images.size();
+
+ ldout(cct, 20) << "Found " << image_count << " images in group" << dendl;
+
+ image_snaps = vector<cls::rbd::ImageSnapshotSpec>(image_count,
+ cls::rbd::ImageSnapshotSpec());
+
+ for (int i = 0; i < image_count; ++i) {
+ image_snaps[i].pool = images[i].spec.pool_id;
+ image_snaps[i].image_id = images[i].spec.image_id;
+ }
+
+ string group_header_oid = util::group_header_name(group_id);
+
+ group_snap.id = generate_uuid(group_ioctx);
+ group_snap.name = string(snap_name);
+ group_snap.state = cls::rbd::GROUP_SNAPSHOT_STATE_INCOMPLETE;
+ group_snap.snaps = image_snaps;
+
+ cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id,
+ group_snap.id};
+
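+ // Persist the group snapshot record in the INCOMPLETE state first; it is
+ // only flipped to COMPLETE once every member image snapshot has succeeded.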
+ r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap);
+ if (r == -EEXIST) {
+ lderr(cct) << "snapshot with this name already exists: "
+ << cpp_strerror(r)
+ << dendl;
+ }
+ int ret_code = 0;
+ if (r < 0) {
+ ret_code = r;
+ goto finish;
+ }
+
+ for (auto image: images) {
+ librbd::IoCtx image_io_ctx;
+ r = util::create_ioctx(group_ioctx, "image", image.spec.pool_id, {},
+ &image_io_ctx);
+ if (r < 0) {
+ ret_code = r;
+ goto finish;
+ }
+
+ ldout(cct, 20) << "Opening image with id " << image.spec.image_id << dendl;
+
+ librbd::ImageCtx* image_ctx = new ImageCtx("", image.spec.image_id.c_str(),
+ nullptr, image_io_ctx, false);
+
+ C_SaferCond* on_finish = new C_SaferCond;
+
+ image_ctx->state->open(0, on_finish);
+
+ ictxs.push_back(image_ctx);
+ on_finishes.push_back(on_finish);
+ }
+ ldout(cct, 20) << "Issued open request waiting for the completion" << dendl;
+ ret_code = 0;
+ for (int i = 0; i < image_count; ++i) {
+ ldout(cct, 20) << "Waiting for open completion: " << on_finishes[i]
+ << dendl;
+ r = on_finishes[i]->wait();
+ delete on_finishes[i];
+ if (r < 0) {
+ ictxs[i] = nullptr;
+ ret_code = r;
+ }
+ }
+ if (ret_code != 0) {
+ goto remove_record;
+ }
+
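+ // Quiesce all images (unless explicitly skipped) so that the individual
+ // snapshots are taken at a mutually consistent point in time.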
+ if ((flags & RBD_SNAP_CREATE_SKIP_QUIESCE) == 0) {
+ ldout(cct, 20) << "Sending quiesce notification" << dendl;
+ ret_code = notify_quiesce(ictxs, prog_ctx, &quiesce_requests);
+ if (ret_code != 0 && (flags & RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR) == 0) {
+ goto remove_record;
+ }
+ }
+
+ ldout(cct, 20) << "Requesting exclusive locks for images" << dendl;
+
+ for (auto ictx: ictxs) {
+ std::shared_lock owner_lock{ictx->owner_lock};
+ if (ictx->exclusive_lock != nullptr) {
+ ictx->exclusive_lock->block_requests(-EBUSY);
+ }
+ }
+ for (int i = 0; i < image_count; ++i) {
+ ImageCtx *ictx = ictxs[i];
+ std::shared_lock owner_lock{ictx->owner_lock};
+
+ on_finishes[i] = new C_SaferCond;
+ if (ictx->exclusive_lock != nullptr) {
+ ictx->exclusive_lock->acquire_lock(on_finishes[i]);
+ }
+ }
+
+ ret_code = 0;
+ for (int i = 0; i < image_count; ++i) {
+ r = 0;
+ ImageCtx *ictx = ictxs[i];
+ if (ictx->exclusive_lock != nullptr) {
+ r = on_finishes[i]->wait();
+ }
+ delete on_finishes[i];
+ if (r < 0) {
+ ret_code = r;
+ }
+ }
+ if (ret_code != 0) {
+ notify_unquiesce(ictxs, quiesce_requests);
+ goto remove_record;
+ }
+
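+ // Derive the name of each image's individual snapshot from the pool,
+ // group and group-snapshot ids so it can be located again later.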
+ ind_snap_name = calc_ind_image_snap_name(group_ioctx.get_id(), group_id,
+ group_snap.id);
+
+ for (int i = 0; i < image_count; ++i) {
+ ImageCtx *ictx = ictxs[i];
+
+ C_SaferCond* on_finish = new C_SaferCond;
+
+ std::shared_lock owner_locker{ictx->owner_lock};
+ ictx->operations->execute_snap_create(
+ ne, ind_snap_name.c_str(), on_finish, 0,
+ SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE, prog_ctx);
+
+ on_finishes[i] = on_finish;
+ }
+
+ ret_code = 0;
+ for (int i = 0; i < image_count; ++i) {
+ r = on_finishes[i]->wait();
+ delete on_finishes[i];
+ if (r < 0) {
+ ret_code = r;
+ } else {
+ ImageCtx *ictx = ictxs[i];
+ ictx->image_lock.lock_shared();
+ snap_t snap_id = get_group_snap_id(ictx, ne);
+ ictx->image_lock.unlock_shared();
+ if (snap_id == CEPH_NOSNAP) {
+ ldout(cct, 20) << "Couldn't find created snapshot with namespace: "
+ << ne << dendl;
+ ret_code = -ENOENT;
+ } else {
+ image_snaps[i].snap_id = snapid_t(snap_id);
+ image_snaps[i].pool = ictx->md_ctx.get_id();
+ image_snaps[i].image_id = ictx->id;
+ }
+ }
+ }
+ if (ret_code != 0) {
+ goto remove_image_snaps;
+ }
+
+ group_snap.snaps = image_snaps;
+ group_snap.state = cls::rbd::GROUP_SNAPSHOT_STATE_COMPLETE;
+
+ r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap);
+ if (r < 0) {
+ ret_code = r;
+ goto remove_image_snaps;
+ }
+
+ ldout(cct, 20) << "Sending unquiesce notification" << dendl;
+ notify_unquiesce(ictxs, quiesce_requests);
+
+ goto finish;
+
+remove_image_snaps:
+ notify_unquiesce(ictxs, quiesce_requests);
+
+ for (int i = 0; i < image_count; ++i) {
+ ImageCtx *ictx = ictxs[i];
+ ldout(cct, 20) << "Removing individual snapshot with name: " <<
+ ind_snap_name << dendl;
+
+ on_finishes[i] = new C_SaferCond;
+ std::string snap_name;
+ ictx->image_lock.lock_shared();
+ snap_t snap_id = get_group_snap_id(ictx, ne);
+ r = ictx->get_snap_name(snap_id, &snap_name);
+ ictx->image_lock.unlock_shared();
+ if (r >= 0) {
+ ictx->operations->snap_remove(ne, snap_name.c_str(), on_finishes[i]);
+ } else {
+ // Ignore missing image snapshots. The whole snapshot could have been
+ // inconsistent.
+ on_finishes[i]->complete(0);
+ }
+ }
+
+ for (int i = 0, n = on_finishes.size(); i < n; ++i) {
+ r = on_finishes[i]->wait();
+ delete on_finishes[i];
+ // if a previous attempt to remove this snapshot failed, the image's
+ // snapshot may no longer exist
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to clean up image snapshot: " << cpp_strerror(r)
+ << dendl;
+ // just report the error; don't abort the process
+ }
+ }
+
+remove_record:
+ r = cls_client::group_snap_remove(&group_ioctx, group_header_oid,
+ group_snap.id);
+ if (r < 0) {
+ lderr(cct) << "error while cleaning up group snapshot" << dendl;
+ // we ignore return value in clean up
+ }
+
+finish:
+ for (int i = 0, n = ictxs.size(); i < n; ++i) {
+ if (ictxs[i] != nullptr) {
+ ictxs[i]->state->close();
+ }
+ }
+ return ret_code;
+}
+
+template <typename I>
+int Group<I>::snap_remove(librados::IoCtx& group_ioctx, const char *group_name,
+ const char *snap_name)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+
+ string group_id;
+ int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+ group_name, &group_id);
+ if (r < 0) {
+ lderr(cct) << "error reading group id object: "
+ << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::vector<cls::rbd::GroupSnapshot> snaps;
+ r = group_snap_list(group_ioctx, group_name, &snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::GroupSnapshot *group_snap = nullptr;
+ for (auto &snap : snaps) {
+ if (snap.name == string(snap_name)) {
+ group_snap = &snap;
+ break;
+ }
+ }
+ if (group_snap == nullptr) {
+ return -ENOENT;
+ }
+
+ string group_header_oid = util::group_header_name(group_id);
+ r = group_snap_remove_by_record(group_ioctx, *group_snap, group_id,
+ group_header_oid);
+ return r;
+}
+
+template <typename I>
+int Group<I>::snap_rename(librados::IoCtx& group_ioctx, const char *group_name,
+ const char *old_snap_name,
+ const char *new_snap_name) {
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+ if (0 == strcmp(old_snap_name, new_snap_name))
+ return -EEXIST;
+
+ std::string group_id;
+ int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+ group_name, &group_id);
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "error reading group id object: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::vector<cls::rbd::GroupSnapshot> group_snaps;
+ r = group_snap_list(group_ioctx, group_name, &group_snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::GroupSnapshot group_snap;
+ for (auto &snap : group_snaps) {
+ if (snap.name == old_snap_name) {
+ group_snap = snap;
+ break;
+ }
+ }
+
+ if (group_snap.id.empty()) {
+ return -ENOENT;
+ }
+
+ std::string group_header_oid = util::group_header_name(group_id);
+ group_snap.name = new_snap_name;
+ r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Group<I>::snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_snap_info_t> *snaps)
+{
+ std::vector<cls::rbd::GroupSnapshot> cls_snaps;
+
+ int r = group_snap_list(group_ioctx, group_name, &cls_snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ for (const auto& snap : cls_snaps) {
+ snaps->push_back(
+ group_snap_info_t {
+ snap.name,
+ static_cast<group_snap_state_t>(snap.state)});
+ }
+ return 0;
+}
+
+template <typename I>
+int Group<I>::snap_rollback(librados::IoCtx& group_ioctx,
+ const char *group_name, const char *snap_name,
+ ProgressContext& pctx)
+{
+ CephContext *cct = (CephContext *)group_ioctx.cct();
+
+ string group_id;
+ int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY,
+ group_name, &group_id);
+ if (r < 0) {
+ lderr(cct) << "error reading group id object: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::vector<cls::rbd::GroupSnapshot> snaps;
+ r = group_snap_list(group_ioctx, group_name, &snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::GroupSnapshot *group_snap = nullptr;
+ for (auto &snap : snaps) {
+ if (snap.name == string(snap_name)) {
+ group_snap = &snap;
+ break;
+ }
+ }
+ if (group_snap == nullptr) {
+ return -ENOENT;
+ }
+
+ string group_header_oid = util::group_header_name(group_id);
+ r = group_snap_rollback_by_record(group_ioctx, *group_snap, group_id,
+ group_header_oid, pctx);
+ return r;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Group<librbd::ImageCtx>;
diff --git a/src/librbd/api/Group.h b/src/librbd/api/Group.h
new file mode 100644
index 000000000..9d3abcc59
--- /dev/null
+++ b/src/librbd/api/Group.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_GROUP_H
+#define CEPH_LIBRBD_API_GROUP_H
+
+#include "include/rbd/librbd.hpp"
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+#include <vector>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
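+// All operations are synchronous static helpers. A minimal, hypothetical
+// usage sketch (assuming an initialized librados::IoCtx named io_ctx):
+//
+//   int r = librbd::api::Group<>::create(io_ctx, "backup_group");
+//   if (r == 0) {
+//     r = librbd::api::Group<>::snap_create(io_ctx, "backup_group",
+//                                           "snap1", 0);
+//   }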
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Group {
+
+ static int create(librados::IoCtx& io_ctx, const char *group_name);
+ static int remove(librados::IoCtx& io_ctx, const char *group_name);
+ static int list(librados::IoCtx& io_ctx, std::vector<std::string> *names);
+ static int rename(librados::IoCtx& io_ctx, const char *src_group_name,
+ const char *dest_group_name);
+
+ static int image_add(librados::IoCtx& group_ioctx, const char *group_name,
+ librados::IoCtx& image_ioctx, const char *image_name);
+ static int image_remove(librados::IoCtx& group_ioctx, const char *group_name,
+ librados::IoCtx& image_ioctx, const char *image_name);
+ static int image_remove_by_id(librados::IoCtx& group_ioctx,
+ const char *group_name,
+ librados::IoCtx& image_ioctx,
+ const char *image_id);
+ static int image_list(librados::IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_image_info_t> *images);
+
+ static int image_get_group(ImageCtxT *ictx, group_info_t *group_info);
+
+ static int snap_create(librados::IoCtx& group_ioctx,
+ const char *group_name, const char *snap_name,
+ uint32_t flags);
+ static int snap_remove(librados::IoCtx& group_ioctx,
+ const char *group_name, const char *snap_name);
+ static int snap_rename(librados::IoCtx& group_ioctx, const char *group_name,
+ const char *old_snap_name, const char *new_snap_name);
+ static int snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_snap_info_t> *snaps);
+ static int snap_rollback(librados::IoCtx& group_ioctx,
+ const char *group_name, const char *snap_name,
+ ProgressContext& pctx);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Group<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_GROUP_H
diff --git a/src/librbd/api/Image.cc b/src/librbd/api/Image.cc
new file mode 100644
index 000000000..55d0e15b8
--- /dev/null
+++ b/src/librbd/api/Image.cc
@@ -0,0 +1,1002 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Image.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/DeepCopyRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Trash.h"
+#include "librbd/api/Utils.h"
+#include "librbd/crypto/FormatRequest.h"
+#include "librbd/crypto/LoadRequest.h"
+#include "librbd/deep_copy/Handler.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/image/PreRemoveRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Image: " << __func__ << ": "
+
+using librados::snap_t;
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+bool compare_by_pool(const librbd::linked_image_spec_t& lhs,
+ const librbd::linked_image_spec_t& rhs)
+{
+ if (lhs.pool_id != rhs.pool_id) {
+ return lhs.pool_id < rhs.pool_id;
+ } else if (lhs.pool_namespace != rhs.pool_namespace) {
+ return lhs.pool_namespace < rhs.pool_namespace;
+ }
+ return false;
+}
+
+bool compare(const librbd::linked_image_spec_t& lhs,
+ const librbd::linked_image_spec_t& rhs)
+{
+ if (lhs.pool_name != rhs.pool_name) {
+ return lhs.pool_name < rhs.pool_name;
+ } else if (lhs.pool_id != rhs.pool_id) {
+ return lhs.pool_id < rhs.pool_id;
+ } else if (lhs.pool_namespace != rhs.pool_namespace) {
+ return lhs.pool_namespace < rhs.pool_namespace;
+ } else if (lhs.image_name != rhs.image_name) {
+ return lhs.image_name < rhs.image_name;
+ } else if (lhs.image_id != rhs.image_id) {
+ return lhs.image_id < rhs.image_id;
+ }
+ return false;
+}
+
+template <typename I>
+int pre_remove_image(librados::IoCtx& io_ctx, const std::string& image_id) {
+ I *image_ctx = I::create("", image_id, nullptr, io_ctx, false);
+ int r = image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond ctx;
+ auto req = image::PreRemoveRequest<I>::create(image_ctx, false, &ctx);
+ req->send();
+
+ r = ctx.wait();
+ image_ctx->state->close();
+ return r;
+}
+
+} // anonymous namespace
+
+template <typename I>
+int64_t Image<I>::get_data_pool_id(I *ictx) {
+ if (ictx->data_ctx.is_valid()) {
+ return ictx->data_ctx.get_id();
+ }
+
+ int64_t pool_id;
+ int r = cls_client::get_data_pool(&ictx->md_ctx, ictx->header_oid, &pool_id);
+ if (r < 0) {
+ CephContext *cct = ictx->cct;
+ lderr(cct) << "error getting data pool ID: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return pool_id;
+}
+
+template <typename I>
+int Image<I>::get_op_features(I *ictx, uint64_t *op_features) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "image_ctx=" << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ std::shared_lock image_locker{ictx->image_lock};
+ *op_features = ictx->op_features;
+ return 0;
+}
+
+template <typename I>
+int Image<I>::list_images(librados::IoCtx& io_ctx,
+ std::vector<image_spec_t> *images) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 20) << "list " << &io_ctx << dendl;
+
+ int r;
+ images->clear();
+
+ if (io_ctx.get_namespace().empty()) {
+ bufferlist bl;
+ r = io_ctx.read(RBD_DIRECTORY, bl, 0, 0);
+ if (r == -ENOENT) {
+ return 0;
+ } else if (r < 0) {
+ lderr(cct) << "error listing v1 images: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // V1 format images are in a tmap
+ if (bl.length()) {
+ auto p = bl.cbegin();
+ bufferlist header;
+ std::map<std::string, bufferlist> m;
+ decode(header, p);
+ decode(m, p);
+ for (auto& it : m) {
+ images->push_back({.id ="", .name = it.first});
+ }
+ }
+ }
+
+ // V2 format images
+ std::map<std::string, std::string> image_names_to_ids;
+ r = list_images_v2(io_ctx, &image_names_to_ids);
+ if (r < 0) {
+ lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto& img_pair : image_names_to_ids) {
+ images->push_back({.id = img_pair.second,
+ .name = img_pair.first});
+ }
+
+ // include V2 images in a partially removed state
+ std::vector<librbd::trash_image_info_t> trash_images;
+ r = Trash<I>::list(io_ctx, trash_images, false);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ lderr(cct) << "error listing trash images: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto& trash_image : trash_images) {
+ if (trash_image.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {
+ images->push_back({.id = trash_image.id,
+ .name = trash_image.name});
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Image<I>::list_images_v2(librados::IoCtx& io_ctx, ImageNameToIds *images) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 20) << "io_ctx=" << &io_ctx << dendl;
+
+ // new format images are accessed by class methods
+ int r;
+ int max_read = 1024;
+ string last_read = "";
+ do {
+ map<string, string> images_page;
+ r = cls_client::dir_list(&io_ctx, RBD_DIRECTORY, last_read, max_read,
+ &images_page);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error listing image in directory: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ } else if (r == -ENOENT) {
+ break;
+ }
+ for (const auto& img : images_page) {
+ images->insert(img);
+ }
+ if (!images_page.empty()) {
+ last_read = images_page.rbegin()->first;
+ }
+ r = images_page.size();
+ } while (r == max_read);
+
+ return 0;
+}
+
+template <typename I>
+int Image<I>::get_parent(I *ictx,
+ librbd::linked_image_spec_t *parent_image,
+ librbd::snap_spec_t *parent_snap) {
+ auto cct = ictx->cct;
+ ldout(cct, 20) << "image_ctx=" << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ std::shared_lock image_locker{ictx->image_lock};
+
+ bool release_image_lock = false;
+ BOOST_SCOPE_EXIT_ALL(ictx, &release_image_lock) {
+ if (release_image_lock) {
+ ictx->parent->image_lock.unlock_shared();
+ }
+ };
+
+ // if a migration is in-progress, the true parent is the parent
+ // of the migration source image
+ auto parent = ictx->parent;
+ if (!ictx->migration_info.empty() && ictx->parent != nullptr) {
+ release_image_lock = true;
+ ictx->parent->image_lock.lock_shared();
+
+ parent = ictx->parent->parent;
+ }
+
+ if (parent == nullptr) {
+ return -ENOENT;
+ }
+
+ parent_image->pool_id = parent->md_ctx.get_id();
+ parent_image->pool_name = parent->md_ctx.get_pool_name();
+ parent_image->pool_namespace = parent->md_ctx.get_namespace();
+
+ std::shared_lock parent_image_locker{parent->image_lock};
+ parent_snap->id = parent->snap_id;
+ parent_snap->namespace_type = RBD_SNAP_NAMESPACE_TYPE_USER;
+ if (parent->snap_id != CEPH_NOSNAP) {
+ auto snap_info = parent->get_snap_info(parent->snap_id);
+ if (snap_info == nullptr) {
+ lderr(cct) << "error finding parent snap name: " << cpp_strerror(r)
+ << dendl;
+ return -ENOENT;
+ }
+
+ parent_snap->namespace_type = static_cast<snap_namespace_type_t>(
+ cls::rbd::get_snap_namespace_type(snap_info->snap_namespace));
+ parent_snap->name = snap_info->name;
+ }
+
+ parent_image->image_id = parent->id;
+ parent_image->image_name = parent->name;
+ parent_image->trash = true;
+
+ librbd::trash_image_info_t trash_info;
+ r = Trash<I>::get(parent->md_ctx, parent->id, &trash_info);
+ if (r == -ENOENT || r == -EOPNOTSUPP) {
+ parent_image->trash = false;
+ } else if (r < 0) {
+ lderr(cct) << "error looking up trash status: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Image<I>::list_children(I *ictx,
+ std::vector<librbd::linked_image_spec_t> *images) {
+ images->clear();
+ return list_descendants(ictx, 1, images);
+}
+
+template <typename I>
+int Image<I>::list_children(I *ictx,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ std::vector<librbd::linked_image_spec_t> *images) {
+ images->clear();
+ return list_descendants(ictx, parent_spec, 1, images);
+}
+
+template <typename I>
+int Image<I>::list_descendants(
+ librados::IoCtx& io_ctx, const std::string &image_id,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images) {
+ ImageCtx *ictx = new librbd::ImageCtx("", image_id, nullptr,
+ io_ctx, true);
+ CephContext *cct = ictx->cct;
+ int r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ return 0;
+ }
+ lderr(cct) << "failed to open descendant " << image_id
+ << " from pool " << io_ctx.get_pool_name() << ":"
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = list_descendants(ictx, max_level, images);
+
+ int r1 = ictx->state->close();
+ if (r1 < 0) {
+ lderr(cct) << "error when closing descendant " << image_id
+ << " from pool " << io_ctx.get_pool_name() << ":"
+ << cpp_strerror(r1) << dendl;
+ }
+
+ return r;
+}
+
+template <typename I>
+int Image<I>::list_descendants(
+ I *ictx, const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images) {
+ std::shared_lock l{ictx->image_lock};
+ std::vector<librados::snap_t> snap_ids;
+ if (ictx->snap_id != CEPH_NOSNAP) {
+ snap_ids.push_back(ictx->snap_id);
+ } else {
+ snap_ids = ictx->snaps;
+ }
+ for (auto snap_id : snap_ids) {
+ cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(),
+ ictx->md_ctx.get_namespace(),
+ ictx->id, snap_id};
+ int r = list_descendants(ictx, parent_spec, max_level, images);
+ if (r < 0) {
+ return r;
+ }
+ }
+ return 0;
+}
+
+template <typename I>
+int Image<I>::list_descendants(
+ I *ictx, const cls::rbd::ParentImageSpec &parent_spec,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images) {
+ auto child_max_level = max_level;
+ if (child_max_level) {
+ if (*child_max_level == 0) {
+ return 0;
+ }
+ (*child_max_level)--;
+ }
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ // no children for non-layered or old format image
+ if (!ictx->test_features(RBD_FEATURE_LAYERING, ictx->image_lock)) {
+ return 0;
+ }
+
+ librados::Rados rados(ictx->md_ctx);
+
+ // search all pools for clone v1 children dependent on this snapshot
+ std::list<std::pair<int64_t, std::string> > pools;
+ int r = rados.pool_list2(pools);
+ if (r < 0) {
+ lderr(cct) << "error listing pools: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (auto& it : pools) {
+ int64_t base_tier;
+ r = rados.pool_get_base_tier(it.first, &base_tier);
+ if (r == -ENOENT) {
+ ldout(cct, 1) << "pool " << it.second << " no longer exists" << dendl;
+ continue;
+ } else if (r < 0) {
+ lderr(cct) << "error retrieving base tier for pool " << it.second
+ << dendl;
+ return r;
+ }
+ if (it.first != base_tier) {
+ // pool is a cache; skip it
+ continue;
+ }
+
+ IoCtx ioctx;
+ r = librbd::util::create_ioctx(
+ ictx->md_ctx, "child image", it.first, {}, &ioctx);
+ if (r == -ENOENT) {
+ continue;
+ } else if (r < 0) {
+ return r;
+ }
+
+ std::set<std::string> image_ids;
+ r = cls_client::get_children(&ioctx, RBD_CHILDREN, parent_spec,
+ image_ids);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error reading list of children from pool " << it.second
+ << dendl;
+ return r;
+ }
+
+ for (auto& image_id : image_ids) {
+ images->push_back({
+ it.first, "", ictx->md_ctx.get_namespace(), image_id, "", false});
+ r = list_descendants(ioctx, image_id, child_max_level, images);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ // retrieve clone v2 children attached to this snapshot
+ IoCtx parent_io_ctx;
+ r = librbd::util::create_ioctx(
+ ictx->md_ctx, "parent image",parent_spec.pool_id,
+ parent_spec.pool_namespace, &parent_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::ChildImageSpecs child_images;
+ r = cls_client::children_list(
+ &parent_io_ctx, librbd::util::header_name(parent_spec.image_id),
+ parent_spec.snap_id, &child_images);
+ if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) {
+ lderr(cct) << "error retrieving children: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (auto& child_image : child_images) {
+ images->push_back({
+ child_image.pool_id, "", child_image.pool_namespace,
+ child_image.image_id, "", false});
+ if (!child_max_level || *child_max_level > 0) {
+ IoCtx ioctx;
+ r = librbd::util::create_ioctx(
+ ictx->md_ctx, "child image", child_image.pool_id,
+ child_image.pool_namespace, &ioctx);
+ if (r == -ENOENT) {
+ continue;
+ } else if (r < 0) {
+ return r;
+ }
+ r = list_descendants(ioctx, child_image.image_id, child_max_level,
+ images);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ // batch lookups by pool + namespace
+ std::sort(images->begin(), images->end(), compare_by_pool);
+
+ int64_t child_pool_id = -1;
+ librados::IoCtx child_io_ctx;
+ std::map<std::string, std::pair<std::string, bool>> child_image_id_to_info;
+ for (auto& image : *images) {
+ if (child_pool_id == -1 || child_pool_id != image.pool_id ||
+ child_io_ctx.get_namespace() != image.pool_namespace) {
+ r = librbd::util::create_ioctx(
+ ictx->md_ctx, "child image", image.pool_id, image.pool_namespace,
+ &child_io_ctx);
+ if (r == -ENOENT) {
+ image.pool_name = "";
+ image.image_name = "";
+ continue;
+ } else if (r < 0) {
+ return r;
+ }
+ child_pool_id = image.pool_id;
+
+ child_image_id_to_info.clear();
+
+ std::map<std::string, std::string> image_names_to_ids;
+ r = list_images_v2(child_io_ctx, &image_names_to_ids);
+ if (r < 0) {
+ lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (auto& [name, id] : image_names_to_ids) {
+ child_image_id_to_info.insert({id, {name, false}});
+ }
+
+ std::vector<librbd::trash_image_info_t> trash_images;
+ r = Trash<I>::list(child_io_ctx, trash_images, false);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ lderr(cct) << "error listing trash images: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ for (auto& it : trash_images) {
+ child_image_id_to_info.insert(
+ {it.id,
+ {it.name, it.source != RBD_TRASH_IMAGE_SOURCE_REMOVING}});
+ }
+ }
+
+ auto it = child_image_id_to_info.find(image.image_id);
+ if (it == child_image_id_to_info.end()) {
+ lderr(cct) << "error looking up name for image id "
+ << image.image_id << " in pool "
+ << child_io_ctx.get_pool_name()
+ << (image.pool_namespace.empty() ?
+ "" : "/" + image.pool_namespace) << dendl;
+ return -ENOENT;
+ }
+
+ image.pool_name = child_io_ctx.get_pool_name();
+ image.image_name = it->second.first;
+ image.trash = it->second.second;
+ }
+
+ // final sort by pool + image names
+ std::sort(images->begin(), images->end(), compare);
+ return 0;
+}
+
+template <typename I>
+int Image<I>::deep_copy(I *src, librados::IoCtx& dest_md_ctx,
+ const char *destname, ImageOptions& opts,
+ ProgressContext &prog_ctx) {
+ CephContext *cct = (CephContext *)dest_md_ctx.cct();
+ ldout(cct, 20) << src->name
+ << (src->snap_name.length() ? "@" + src->snap_name : "")
+ << " -> " << destname << " opts = " << opts << dendl;
+
+ uint64_t features;
+ uint64_t src_size;
+ {
+ std::shared_lock image_locker{src->image_lock};
+
+ if (!src->migration_info.empty()) {
+ lderr(cct) << "cannot deep copy migrating image" << dendl;
+ return -EBUSY;
+ }
+
+ features = src->features;
+ src_size = src->get_image_size(src->snap_id);
+ }
+ uint64_t format = 2;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+ }
+ if (format == 1) {
+ lderr(cct) << "old format not supported for destination image" << dendl;
+ return -EINVAL;
+ }
+ uint64_t stripe_unit = src->stripe_unit;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ }
+ uint64_t stripe_count = src->stripe_count;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ }
+ uint64_t order = src->order;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+ opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ }
+ if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+ }
+ if (features & ~RBD_FEATURES_ALL) {
+ lderr(cct) << "librbd does not support requested features" << dendl;
+ return -ENOSYS;
+ }
+
+ uint64_t flatten = 0;
+ if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
+ opts.unset(RBD_IMAGE_OPTION_FLATTEN);
+ }
+
+ cls::rbd::ParentImageSpec parent_spec;
+ if (flatten > 0) {
+ parent_spec.pool_id = -1;
+ } else {
+ std::shared_lock image_locker{src->image_lock};
+
+ // use oldest snapshot or HEAD for parent spec
+ if (!src->snap_info.empty()) {
+ parent_spec = src->snap_info.begin()->second.parent.spec;
+ } else {
+ parent_spec = src->parent_md.spec;
+ }
+ }
+
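+ // Without a parent the destination is created as a brand new image;
+ // otherwise it is cloned from the same parent so the copy stays linked.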
+ int r;
+ if (parent_spec.pool_id == -1) {
+ r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false);
+ } else {
+ librados::IoCtx parent_io_ctx;
+ r = librbd::util::create_ioctx(
+ src->md_ctx, "parent image", parent_spec.pool_id,
+ parent_spec.pool_namespace, &parent_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ ConfigProxy config{cct->_conf};
+ api::Config<I>::apply_pool_overrides(dest_md_ctx, &config);
+
+ C_SaferCond ctx;
+ std::string dest_id = librbd::util::generate_image_id(dest_md_ctx);
+ auto *req = image::CloneRequest<I>::create(
+ config, parent_io_ctx, parent_spec.image_id, "", {}, parent_spec.snap_id,
+ dest_md_ctx, destname, dest_id, opts, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL,
+ "", "", src->op_work_queue, &ctx);
+ req->send();
+ r = ctx.wait();
+ }
+ if (r < 0) {
+ lderr(cct) << "header creation failed" << dendl;
+ return r;
+ }
+ opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
+
+ auto dest = new I(destname, "", nullptr, dest_md_ctx, false);
+ r = dest->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "failed to read newly created header" << dendl;
+ return r;
+ }
+
+ C_SaferCond lock_ctx;
+ {
+ std::unique_lock locker{dest->owner_lock};
+
+ if (dest->exclusive_lock == nullptr ||
+ dest->exclusive_lock->is_lock_owner()) {
+ lock_ctx.complete(0);
+ } else {
+ dest->exclusive_lock->acquire_lock(&lock_ctx);
+ }
+ }
+
+ r = lock_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ dest->state->close();
+ return r;
+ }
+
+ r = deep_copy(src, dest, flatten > 0, prog_ctx);
+
+ int close_r = dest->state->close();
+ if (r == 0 && close_r < 0) {
+ r = close_r;
+ }
+ return r;
+}
+
+template <typename I>
+int Image<I>::deep_copy(I *src, I *dest, bool flatten,
+ ProgressContext &prog_ctx) {
+ // ensure previous writes are visible to dest
+ C_SaferCond flush_ctx;
+ {
+ std::shared_lock owner_locker{src->owner_lock};
+ auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, src,
+ io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ *src, io::IMAGE_DISPATCH_LAYER_INTERNAL_START,
+ aio_comp, io::FLUSH_SOURCE_INTERNAL, {});
+ req->send();
+ }
+ int r = flush_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+
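+ // Copy everything from the beginning of time (snap id 0) up to the
+ // currently set snapshot, or HEAD if no snapshot is set.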
+ librados::snap_t snap_id_start = 0;
+ librados::snap_t snap_id_end;
+ {
+ std::shared_lock image_locker{src->image_lock};
+ snap_id_end = src->snap_id;
+ }
+
+ AsioEngine asio_engine(src->md_ctx);
+
+ C_SaferCond cond;
+ SnapSeqs snap_seqs;
+ deep_copy::ProgressHandler progress_handler{&prog_ctx};
+ auto req = DeepCopyRequest<I>::create(
+ src, dest, snap_id_start, snap_id_end, 0U, flatten, boost::none,
+ asio_engine.get_work_queue(), &snap_seqs, &progress_handler, &cond);
+ req->send();
+ r = cond.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Image<I>::snap_set(I *ictx,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const char *snap_name) {
+ ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = "
+ << (snap_name ? snap_name : "NULL") << dendl;
+
+ // ignore return value, since we may be set to a non-existent
+ // snapshot and the user is trying to fix that
+ ictx->state->refresh_if_required();
+
+ uint64_t snap_id = CEPH_NOSNAP;
+ std::string name(snap_name == nullptr ? "" : snap_name);
+ if (!name.empty()) {
+ std::shared_lock image_locker{ictx->image_lock};
+ snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace{},
+ snap_name);
+ if (snap_id == CEPH_NOSNAP) {
+ return -ENOENT;
+ }
+ }
+
+ return snap_set(ictx, snap_id);
+}
+
+template <typename I>
+int Image<I>::snap_set(I *ictx, uint64_t snap_id) {
+ ldout(ictx->cct, 20) << "snap_set " << ictx << " "
+ << "snap_id=" << snap_id << dendl;
+
+ // ignore return value, since we may be set to a non-existent
+ // snapshot and the user is trying to fix that
+ ictx->state->refresh_if_required();
+
+ C_SaferCond ctx;
+ ictx->state->snap_set(snap_id, &ctx);
+ int r = ctx.wait();
+ if (r < 0) {
+ if (r != -ENOENT) {
+ lderr(ictx->cct) << "failed to " << (snap_id == CEPH_NOSNAP ? "un" : "")
+ << "set snapshot: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Image<I>::remove(IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext& prog_ctx)
+{
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << "name=" << image_name << dendl;
+
+ // look up the V2 image id based on the image name
+ std::string image_id;
+ int r = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY, image_name,
+ &image_id);
+ if (r == -ENOENT) {
+ // check if it already exists in trash from an aborted trash remove attempt
+ std::vector<trash_image_info_t> trash_entries;
+ r = Trash<I>::list(io_ctx, trash_entries, false);
+ if (r < 0) {
+ return r;
+ }
+ for (auto& entry : trash_entries) {
+ if (entry.name == image_name &&
+ entry.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {
+ return Trash<I>::remove(io_ctx, entry.id, true, prog_ctx);
+ }
+ }
+
+ // fall-through if we failed to locate the image in the V2 directory and
+ // trash
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve image id: " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ // attempt to move the image to the trash (and optionally immediately
+ // delete the image)
+ ConfigProxy config(cct->_conf);
+ Config<I>::apply_pool_overrides(io_ctx, &config);
+
+ rbd_trash_image_source_t trash_image_source =
+ RBD_TRASH_IMAGE_SOURCE_REMOVING;
+ uint64_t expire_seconds = 0;
+ if (config.get_val<bool>("rbd_move_to_trash_on_remove")) {
+ // keep the image in the trash upon remove requests
+ trash_image_source = RBD_TRASH_IMAGE_SOURCE_USER;
+ expire_seconds = config.get_val<uint64_t>(
+ "rbd_move_to_trash_on_remove_expire_seconds");
+ } else {
+ // attempt to pre-validate the removal before moving to trash and
+ // removing
+ r = pre_remove_image<I>(io_ctx, image_id);
+ if (r == -ECHILD) {
+ if (config.get_val<bool>("rbd_move_parent_to_trash_on_remove")) {
+ // keep the image in the trash until the last child is removed
+ trash_image_source = RBD_TRASH_IMAGE_SOURCE_USER_PARENT;
+ } else {
+ lderr(cct) << "image has snapshots - not removing" << dendl;
+ return -ENOTEMPTY;
+ }
+ } else if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+ }
+
+ r = Trash<I>::move(io_ctx, trash_image_source, image_name, image_id,
+ expire_seconds);
+ if (r >= 0) {
+ if (trash_image_source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {
+ // proceed with attempting to immediately remove the image
+ r = Trash<I>::remove(io_ctx, image_id, true, prog_ctx);
+
+ if (r == -ENOTEMPTY || r == -EBUSY || r == -EMLINK) {
+ // best-effort try to restore the image if the removal
+ // failed for possible expected reasons
+ Trash<I>::restore(io_ctx, {cls::rbd::TRASH_IMAGE_SOURCE_REMOVING},
+ image_id, image_name);
+ }
+ }
+ return r;
+ } else if (r < 0 && r != -EOPNOTSUPP) {
+ return r;
+ }
+
+ // fall-through if trash isn't supported
+ }
+
+ AsioEngine asio_engine(io_ctx);
+
+ // might be a V1 image format that cannot be moved to the trash
+ // and would not have been listed in the V2 directory -- or the OSDs
+ // are too old and don't support the trash feature
+ C_SaferCond cond;
+ auto req = librbd::image::RemoveRequest<I>::create(
+ io_ctx, image_name, "", false, false, prog_ctx,
+ asio_engine.get_work_queue(), &cond);
+ req->send();
+
+ return cond.wait();
+}
+
+template <typename I>
+int Image<I>::flatten_children(I *ictx, const char* snap_name,
+ ProgressContext& pctx) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "children flatten " << ictx->name << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ std::shared_lock l{ictx->image_lock};
+ snap_t snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(),
+ snap_name);
+
+ cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(),
+ ictx->md_ctx.get_namespace(),
+ ictx->id, snap_id};
+ std::vector<librbd::linked_image_spec_t> child_images;
+ r = list_children(ictx, parent_spec, &child_images);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t size = child_images.size();
+ if (size == 0) {
+ return 0;
+ }
+
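+ // Reuse a single IoCtx per pool/namespace while iterating; each child is
+ // flattened in turn and progress is reported per image.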
+ librados::IoCtx child_io_ctx;
+ int64_t child_pool_id = -1;
+ size_t i = 0;
+ for (auto &child_image : child_images) {
+ std::string pool = child_image.pool_name;
+ if (child_pool_id == -1 ||
+ child_pool_id != child_image.pool_id ||
+ child_io_ctx.get_namespace() != child_image.pool_namespace) {
+ r = librbd::util::create_ioctx(
+ ictx->md_ctx, "child image", child_image.pool_id,
+ child_image.pool_namespace, &child_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ child_pool_id = child_image.pool_id;
+ }
+
+ ImageCtx *imctx = new ImageCtx("", child_image.image_id, nullptr,
+ child_io_ctx, false);
+ r = imctx->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "error opening image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if ((imctx->features & RBD_FEATURE_DEEP_FLATTEN) == 0 &&
+ !imctx->snaps.empty()) {
+ lderr(cct) << "snapshot in-use by " << pool << "/" << imctx->name
+ << dendl;
+ imctx->state->close();
+ return -EBUSY;
+ }
+
+ librbd::NoOpProgressContext prog_ctx;
+ r = imctx->operations->flatten(prog_ctx);
+ if (r < 0) {
+ lderr(cct) << "error flattening image: " << pool << "/"
+ << (child_image.pool_namespace.empty() ?
+ "" : "/" + child_image.pool_namespace)
+ << child_image.image_name << cpp_strerror(r) << dendl;
+ imctx->state->close();
+ return r;
+ }
+
+ r = imctx->state->close();
+ if (r < 0) {
+ lderr(cct) << "failed to close image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ pctx.update_progress(++i, size);
+ ceph_assert(i <= size);
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Image<I>::encryption_format(I* ictx, encryption_format_t format,
+ encryption_options_t opts, size_t opts_size,
+ bool c_api) {
+ if (ictx->parent != nullptr) {
+ lderr(ictx->cct) << "cannot format a cloned image" << dendl;
+ return -ENOTSUP;
+ }
+
+ crypto::EncryptionFormat<I>* result_format;
+ auto r = util::create_encryption_format(
+ ictx->cct, format, opts, opts_size, c_api, &result_format);
+ if (r != 0) {
+ return r;
+ }
+
+ C_SaferCond cond;
+ auto req = librbd::crypto::FormatRequest<I>::create(
+ ictx, std::unique_ptr<crypto::EncryptionFormat<I>>(result_format),
+ &cond);
+ req->send();
+ return cond.wait();
+}
+
+template <typename I>
+int Image<I>::encryption_load(I* ictx, encryption_format_t format,
+ encryption_options_t opts, size_t opts_size,
+ bool c_api) {
+ crypto::EncryptionFormat<I>* result_format;
+ auto r = util::create_encryption_format(
+ ictx->cct, format, opts, opts_size, c_api, &result_format);
+ if (r != 0) {
+ return r;
+ }
+
+ C_SaferCond cond;
+ auto req = librbd::crypto::LoadRequest<I>::create(
+ ictx, std::unique_ptr<crypto::EncryptionFormat<I>>(result_format),
+ &cond);
+ req->send();
+ return cond.wait();
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Image<librbd::ImageCtx>;
diff --git a/src/librbd/api/Image.h b/src/librbd/api/Image.h
new file mode 100644
index 000000000..192f9b7a7
--- /dev/null
+++ b/src/librbd/api/Image.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_API_IMAGE_H
+#define LIBRBD_API_IMAGE_H
+
+#include "include/rbd/librbd.hpp"
+#include "include/rados/librados_fwd.hpp"
+#include "librbd/Types.h"
+#include <map>
+#include <set>
+#include <string>
+
+namespace librbd {
+
+class ImageOptions;
+class ProgressContext;
+
+struct ImageCtx;
+
+namespace api {
+
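+// A minimal, hypothetical usage sketch (assuming an initialized
+// librados::IoCtx named io_ctx):
+//
+//   std::vector<librbd::image_spec_t> specs;
+//   int r = librbd::api::Image<>::list_images(io_ctx, &specs);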
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Image {
+ typedef std::map<std::string, std::string> ImageNameToIds;
+
+ static int64_t get_data_pool_id(ImageCtxT *ictx);
+
+ static int get_op_features(ImageCtxT *ictx, uint64_t *op_features);
+
+ static int list_images(librados::IoCtx& io_ctx,
+ std::vector<image_spec_t> *images);
+ static int list_images_v2(librados::IoCtx& io_ctx,
+ ImageNameToIds *images);
+
+ static int get_parent(ImageCtxT *ictx,
+ librbd::linked_image_spec_t *parent_image,
+ librbd::snap_spec_t *parent_snap);
+
+ static int list_children(ImageCtxT *ictx,
+ std::vector<librbd::linked_image_spec_t> *images);
+ static int list_children(ImageCtxT *ictx,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ std::vector<librbd::linked_image_spec_t> *images);
+
+ static int list_descendants(IoCtx& io_ctx, const std::string &image_id,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images);
+ static int list_descendants(ImageCtxT *ictx,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images);
+ static int list_descendants(ImageCtxT *ictx,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ const std::optional<size_t> &max_level,
+ std::vector<librbd::linked_image_spec_t> *images);
+
+ static int deep_copy(ImageCtxT *ictx, librados::IoCtx& dest_md_ctx,
+ const char *destname, ImageOptions& opts,
+ ProgressContext &prog_ctx);
+ static int deep_copy(ImageCtxT *src, ImageCtxT *dest, bool flatten,
+ ProgressContext &prog_ctx);
+
+ static int snap_set(ImageCtxT *ictx,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const char *snap_name);
+ static int snap_set(ImageCtxT *ictx, uint64_t snap_id);
+
+ static int remove(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext& prog_ctx);
+
+ static int flatten_children(ImageCtxT *ictx, const char* snap_name,
+ ProgressContext& pctx);
+
+ static int encryption_format(ImageCtxT *ictx, encryption_format_t format,
+ encryption_options_t opts, size_t opts_size,
+ bool c_api);
+ static int encryption_load(ImageCtxT *ictx, encryption_format_t format,
+ encryption_options_t opts, size_t opts_size,
+ bool c_api);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Image<librbd::ImageCtx>;
+
+#endif // LIBRBD_API_IMAGE_H
diff --git a/src/librbd/api/Io.cc b/src/librbd/api/Io.cc
new file mode 100644
index 000000000..31b48b3f6
--- /dev/null
+++ b/src/librbd/api/Io.cc
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Io.h"
+#include "include/intarith.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "common/EventTrace.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/Types.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Io " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+template <typename I>
+bool is_valid_io(I& image_ctx, io::AioCompletion* aio_comp) {
+ auto cct = image_ctx.cct;
+
+ if (!image_ctx.data_ctx.is_valid()) {
+ lderr(cct) << "missing data pool" << dendl;
+
+ aio_comp->fail(-ENODEV);
+ return false;
+ }
+
+ return true;
+}
+
+} // anonymous namespace
+
+template <typename I>
+ssize_t Io<I>::read(
+ I &image_ctx, uint64_t off, uint64_t len, io::ReadResult &&read_result,
+ int op_flags) {
+ auto cct = image_ctx.cct;
+
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", "
+ << "len = " << len << dendl;
+
+ C_SaferCond ctx;
+ auto aio_comp = io::AioCompletion::create(&ctx);
+ aio_read(image_ctx, aio_comp, off, len, std::move(read_result), op_flags,
+ false);
+ return ctx.wait();
+}
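+
+// The synchronous wrappers in this file share one pattern: create an
+// io::AioCompletion wired to a C_SaferCond, invoke the asynchronous variant,
+// and block on the condition. A minimal caller-side sketch (hypothetical
+// `ictx` pointer; not part of this change):
+//
+//   C_SaferCond cond;
+//   auto comp = librbd::io::AioCompletion::create(&cond);
+//   librbd::api::Io<>::aio_flush(*ictx, comp, false);
+//   int r = cond.wait();  // 0 on success, negative errno on failure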
+
+template <typename I>
+ssize_t Io<I>::write(
+ I &image_ctx, uint64_t off, uint64_t len, bufferlist &&bl, int op_flags) {
+ auto cct = image_ctx.cct;
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", "
+ << "len = " << len << dendl;
+
+ image_ctx.image_lock.lock_shared();
+ int r = clip_io(util::get_image_ctx(&image_ctx), off, &len);
+ image_ctx.image_lock.unlock_shared();
+ if (r < 0) {
+ lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ C_SaferCond ctx;
+ auto aio_comp = io::AioCompletion::create(&ctx);
+ aio_write(image_ctx, aio_comp, off, len, std::move(bl), op_flags, false);
+
+ r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return len;
+}
+
+template <typename I>
+ssize_t Io<I>::discard(
+ I &image_ctx, uint64_t off, uint64_t len,
+ uint32_t discard_granularity_bytes) {
+ auto cct = image_ctx.cct;
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", "
+ << "len = " << len << dendl;
+
+ image_ctx.image_lock.lock_shared();
+ int r = clip_io(util::get_image_ctx(&image_ctx), off, &len);
+ image_ctx.image_lock.unlock_shared();
+ if (r < 0) {
+ lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ C_SaferCond ctx;
+ auto aio_comp = io::AioCompletion::create(&ctx);
+ aio_discard(image_ctx, aio_comp, off, len, discard_granularity_bytes, false);
+
+ r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return len;
+}
+
+template <typename I>
+ssize_t Io<I>::write_same(
+ I &image_ctx, uint64_t off, uint64_t len, bufferlist &&bl, int op_flags) {
+ auto cct = image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", "
+                 << "len = " << len << ", data_len = " << bl.length() << dendl;
+
+ image_ctx.image_lock.lock_shared();
+ int r = clip_io(util::get_image_ctx(&image_ctx), off, &len);
+ image_ctx.image_lock.unlock_shared();
+ if (r < 0) {
+ lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ C_SaferCond ctx;
+ auto aio_comp = io::AioCompletion::create(&ctx);
+ aio_write_same(image_ctx, aio_comp, off, len, std::move(bl), op_flags, false);
+
+ r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return len;
+}
+
+template <typename I>
+ssize_t Io<I>::write_zeroes(I& image_ctx, uint64_t off, uint64_t len,
+ int zero_flags, int op_flags) {
+ auto cct = image_ctx.cct;
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", "
+ << "len = " << len << dendl;
+
+ image_ctx.image_lock.lock_shared();
+ int r = clip_io(util::get_image_ctx(&image_ctx), off, &len);
+ image_ctx.image_lock.unlock_shared();
+ if (r < 0) {
+ lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ C_SaferCond ctx;
+ auto aio_comp = io::AioCompletion::create(&ctx);
+ aio_write_zeroes(image_ctx, aio_comp, off, len, zero_flags, op_flags, false);
+
+ r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return len;
+}
+
+template <typename I>
+ssize_t Io<I>::compare_and_write(
+ I &image_ctx, uint64_t off, uint64_t len, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_off, int op_flags) {
+ auto cct = image_ctx.cct;
+ ldout(cct, 20) << "compare_and_write ictx=" << &image_ctx << ", off="
+ << off << ", " << "len = " << len << dendl;
+
+ image_ctx.image_lock.lock_shared();
+ int r = clip_io(util::get_image_ctx(&image_ctx), off, &len);
+ image_ctx.image_lock.unlock_shared();
+ if (r < 0) {
+ lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ C_SaferCond ctx;
+ auto aio_comp = io::AioCompletion::create(&ctx);
+ aio_compare_and_write(image_ctx, aio_comp, off, len, std::move(cmp_bl),
+ std::move(bl), mismatch_off, op_flags, false);
+
+ r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return len;
+}
+
+template <typename I>
+int Io<I>::flush(I &image_ctx) {
+ auto cct = image_ctx.cct;
+ ldout(cct, 20) << "ictx=" << &image_ctx << dendl;
+
+ C_SaferCond ctx;
+ auto aio_comp = io::AioCompletion::create(&ctx);
+ aio_flush(image_ctx, aio_comp, false);
+
+ int r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void Io<I>::aio_read(I &image_ctx, io::AioCompletion *aio_comp, uint64_t off,
+ uint64_t len, io::ReadResult &&read_result, int op_flags,
+ bool native_async) {
+ auto cct = image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (image_ctx.blkin_trace_all) {
+ trace.init("io: read", &image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_READ);
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << ", off=" << off << ", "
+ << "len=" << len << ", " << "flags=" << op_flags << dendl;
+
+ if (native_async && image_ctx.event_socket.is_valid()) {
+ aio_comp->set_event_notify(true);
+ }
+
+ if (!is_valid_io(image_ctx, aio_comp)) {
+ return;
+ }
+
+ auto req = io::ImageDispatchSpec::create_read(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, {{off, len}},
+ std::move(read_result), image_ctx.get_data_io_context(), op_flags, 0,
+ trace);
+ req->send();
+}
+
+template <typename I>
+void Io<I>::aio_write(I &image_ctx, io::AioCompletion *aio_comp, uint64_t off,
+ uint64_t len, bufferlist &&bl, int op_flags,
+ bool native_async) {
+ auto cct = image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (image_ctx.blkin_trace_all) {
+ trace.init("io: write", &image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_WRITE);
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << ", off=" << off << ", "
+ << "len=" << len << ", flags=" << op_flags << dendl;
+
+ if (native_async && image_ctx.event_socket.is_valid()) {
+ aio_comp->set_event_notify(true);
+ }
+
+ if (!is_valid_io(image_ctx, aio_comp)) {
+ return;
+ }
+
+ auto req = io::ImageDispatchSpec::create_write(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, {{off, len}},
+ std::move(bl), image_ctx.get_data_io_context(), op_flags, trace);
+ req->send();
+}
+
+template <typename I>
+void Io<I>::aio_discard(I &image_ctx, io::AioCompletion *aio_comp, uint64_t off,
+ uint64_t len, uint32_t discard_granularity_bytes,
+ bool native_async) {
+ auto cct = image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (image_ctx.blkin_trace_all) {
+ trace.init("io: discard", &image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_DISCARD);
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << ", off=" << off << ", "
+ << "len=" << len << dendl;
+
+ if (native_async && image_ctx.event_socket.is_valid()) {
+ aio_comp->set_event_notify(true);
+ }
+
+ if (!is_valid_io(image_ctx, aio_comp)) {
+ return;
+ }
+
+ auto req = io::ImageDispatchSpec::create_discard(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, off, len,
+ discard_granularity_bytes, image_ctx.get_data_io_context(), trace);
+ req->send();
+}
+
+template <typename I>
+void Io<I>::aio_write_same(I &image_ctx, io::AioCompletion *aio_comp,
+ uint64_t off, uint64_t len, bufferlist &&bl,
+ int op_flags, bool native_async) {
+ auto cct = image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (image_ctx.blkin_trace_all) {
+ trace.init("io: writesame", &image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_WRITESAME);
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << ", off=" << off << ", "
+ << "len=" << len << ", data_len = " << bl.length() << ", "
+ << "flags=" << op_flags << dendl;
+
+ if (native_async && image_ctx.event_socket.is_valid()) {
+ aio_comp->set_event_notify(true);
+ }
+
+ if (!is_valid_io(image_ctx, aio_comp)) {
+ return;
+ }
+
+ auto req = io::ImageDispatchSpec::create_write_same(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, off, len,
+ std::move(bl), image_ctx.get_data_io_context(), op_flags, trace);
+ req->send();
+}
+
+template <typename I>
+void Io<I>::aio_write_zeroes(I& image_ctx, io::AioCompletion *aio_comp,
+ uint64_t off, uint64_t len, int zero_flags,
+ int op_flags, bool native_async) {
+ auto cct = image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (image_ctx.blkin_trace_all) {
+ trace.init("io: write_zeroes", &image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ auto io_type = io::AIO_TYPE_DISCARD;
+ if ((zero_flags & RBD_WRITE_ZEROES_FLAG_THICK_PROVISION) != 0) {
+ zero_flags &= ~RBD_WRITE_ZEROES_FLAG_THICK_PROVISION;
+ io_type = io::AIO_TYPE_WRITESAME;
+ }
+
+ aio_comp->init_time(util::get_image_ctx(&image_ctx), io_type);
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << ", off=" << off << ", "
+ << "len=" << len << dendl;
+
+ if (native_async && image_ctx.event_socket.is_valid()) {
+ aio_comp->set_event_notify(true);
+ }
+
+  // any flags still set at this point are unsupported
+ if (zero_flags != 0U) {
+ aio_comp->fail(-EINVAL);
+ return;
+ }
+
+ if (!is_valid_io(image_ctx, aio_comp)) {
+ return;
+ }
+
+ if (io_type == io::AIO_TYPE_WRITESAME) {
+    // write-same needs to be aligned to its buffer but librbd has never
+    // forced block alignment. Hide that requirement from the user by adding
+    // optional prepend/append writes around the aligned write-same span.
+ const uint64_t data_length = 512;
+ uint64_t write_same_offset = p2roundup(off, data_length);
+ uint64_t write_same_offset_end = p2align(off + len, data_length);
+ uint64_t write_same_length = 0;
+ if (write_same_offset_end > write_same_offset) {
+ write_same_length = write_same_offset_end - write_same_offset;
+ }
+
+ uint64_t prepend_offset = off;
+ uint64_t prepend_length = write_same_offset - off;
+ uint64_t append_offset = write_same_offset + write_same_length;
+ uint64_t append_length = len - prepend_length - write_same_length;
+ ldout(cct, 20) << "prepend_offset=" << prepend_offset << ", "
+ << "prepend_length=" << prepend_length << ", "
+ << "write_same_offset=" << write_same_offset << ", "
+ << "write_same_length=" << write_same_length << ", "
+ << "append_offset=" << append_offset << ", "
+ << "append_length=" << append_length << dendl;
+ ceph_assert(prepend_length + write_same_length + append_length == len);
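+    // worked example (illustrative numbers): off=100, len=2048 and
+    // data_length=512 give write_same_offset=p2roundup(100,512)=512,
+    // write_same_offset_end=p2align(2148,512)=2048, write_same_length=1536;
+    // prepend covers [100,512) (412 bytes), append covers [2048,2148)
+    // (100 bytes), and 412 + 1536 + 100 == 2048 == len.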
+
+ if (write_same_length <= data_length) {
+ // unaligned or small write-zeroes request -- use single write
+ bufferlist bl;
+ bl.append_zero(len);
+
+ aio_comp->aio_type = io::AIO_TYPE_WRITE;
+ auto req = io::ImageDispatchSpec::create_write(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, {{off, len}},
+ std::move(bl), image_ctx.get_data_io_context(), op_flags, trace);
+ req->send();
+ return;
+ } else if (prepend_length == 0 && append_length == 0) {
+ // fully aligned -- use a single write-same image request
+ bufferlist bl;
+ bl.append_zero(data_length);
+
+ auto req = io::ImageDispatchSpec::create_write_same(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, off, len,
+ std::move(bl), image_ctx.get_data_io_context(), op_flags, trace);
+ req->send();
+ return;
+ }
+
+ // to reach this point, we need at least one prepend/append write along with
+ // a write-same -- therefore we will need to wrap the provided AioCompletion
+ auto request_count = 1;
+ if (prepend_length > 0) {
+ ++request_count;
+ }
+ if (append_length > 0) {
+ ++request_count;
+ }
+
+ ceph_assert(request_count > 1);
+ aio_comp->start_op();
+ aio_comp->set_request_count(request_count);
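+    // each sub-request below wraps aio_comp in a C_AioRequest, so aio_comp
+    // completes back to the caller only after all request_count pieces finish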
+
+ if (prepend_length > 0) {
+ bufferlist bl;
+ bl.append_zero(prepend_length);
+
+ Context* prepend_ctx = new io::C_AioRequest(aio_comp);
+ auto prepend_aio_comp = io::AioCompletion::create_and_start(
+ prepend_ctx, &image_ctx, io::AIO_TYPE_WRITE);
+ auto prepend_req = io::ImageDispatchSpec::create_write(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, prepend_aio_comp,
+ {{prepend_offset, prepend_length}}, std::move(bl),
+ image_ctx.get_data_io_context(), op_flags, trace);
+ prepend_req->send();
+ }
+
+ if (append_length > 0) {
+ bufferlist bl;
+ bl.append_zero(append_length);
+
+ Context* append_ctx = new io::C_AioRequest(aio_comp);
+ auto append_aio_comp = io::AioCompletion::create_and_start(
+ append_ctx, &image_ctx, io::AIO_TYPE_WRITE);
+ auto append_req = io::ImageDispatchSpec::create_write(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, append_aio_comp,
+ {{append_offset, append_length}}, std::move(bl),
+ image_ctx.get_data_io_context(), op_flags, trace);
+ append_req->send();
+ }
+
+ bufferlist bl;
+ bl.append_zero(data_length);
+
+ Context* write_same_ctx = new io::C_AioRequest(aio_comp);
+ auto write_same_aio_comp = io::AioCompletion::create_and_start(
+ write_same_ctx, &image_ctx, io::AIO_TYPE_WRITESAME);
+ auto req = io::ImageDispatchSpec::create_write_same(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, write_same_aio_comp,
+ write_same_offset, write_same_length, std::move(bl),
+ image_ctx.get_data_io_context(), op_flags, trace);
+ req->send();
+ return;
+ }
+
+ // enable partial discard (zeroing) of objects
+ uint32_t discard_granularity_bytes = 0;
+
+ auto req = io::ImageDispatchSpec::create_discard(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, off, len,
+ discard_granularity_bytes, image_ctx.get_data_io_context(), trace);
+ req->send();
+}
+
+template <typename I>
+void Io<I>::aio_compare_and_write(I &image_ctx, io::AioCompletion *aio_comp,
+ uint64_t off, uint64_t len,
+ bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_off,
+ int op_flags, bool native_async) {
+ auto cct = image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (image_ctx.blkin_trace_all) {
+ trace.init("io: compare_and_write", &image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ aio_comp->init_time(util::get_image_ctx(&image_ctx),
+ io::AIO_TYPE_COMPARE_AND_WRITE);
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << ", off=" << off << ", "
+ << "len=" << len << dendl;
+
+ if (native_async && image_ctx.event_socket.is_valid()) {
+ aio_comp->set_event_notify(true);
+ }
+
+ if (!is_valid_io(image_ctx, aio_comp)) {
+ return;
+ }
+
+ auto req = io::ImageDispatchSpec::create_compare_and_write(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, {{off, len}},
+ std::move(cmp_bl), std::move(bl), mismatch_off,
+ image_ctx.get_data_io_context(), op_flags, trace);
+ req->send();
+}
+
+template <typename I>
+void Io<I>::aio_flush(I &image_ctx, io::AioCompletion *aio_comp,
+ bool native_async) {
+ auto cct = image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (image_ctx.blkin_trace_all) {
+ trace.init("io: flush", &image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_FLUSH);
+ ldout(cct, 20) << "ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << dendl;
+
+ if (native_async && image_ctx.event_socket.is_valid()) {
+ aio_comp->set_event_notify(true);
+ }
+
+ if (!is_valid_io(image_ctx, aio_comp)) {
+ return;
+ }
+
+ auto req = io::ImageDispatchSpec::create_flush(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp,
+ io::FLUSH_SOURCE_USER, trace);
+ req->send();
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Io<librbd::ImageCtx>;
diff --git a/src/librbd/api/Io.h b/src/librbd/api/Io.h
new file mode 100644
index 000000000..4e2ec5028
--- /dev/null
+++ b/src/librbd/api/Io.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_API_IO_H
+#define LIBRBD_API_IO_H
+
+#include "include/int_types.h"
+#include "librbd/io/ReadResult.h"
+
+namespace librbd {
+
+struct ImageCtx;
+namespace io { struct AioCompletion; }
+
+namespace api {
+
+template<typename ImageCtxT = ImageCtx>
+struct Io {
+ static ssize_t read(ImageCtxT &image_ctx, uint64_t off, uint64_t len,
+ io::ReadResult &&read_result, int op_flags);
+ static ssize_t write(ImageCtxT &image_ctx, uint64_t off, uint64_t len,
+ bufferlist &&bl, int op_flags);
+ static ssize_t discard(ImageCtxT &image_ctx, uint64_t off, uint64_t len,
+ uint32_t discard_granularity_bytes);
+ static ssize_t write_same(ImageCtxT &image_ctx, uint64_t off, uint64_t len,
+ bufferlist &&bl, int op_flags);
+ static ssize_t write_zeroes(ImageCtxT &image_ctx, uint64_t off, uint64_t len,
+ int zero_flags, int op_flags);
+ static ssize_t compare_and_write(ImageCtxT &image_ctx, uint64_t off,
+ uint64_t len, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_off,
+ int op_flags);
+ static int flush(ImageCtxT &image_ctx);
+
+ static void aio_read(ImageCtxT &image_ctx, io::AioCompletion *c, uint64_t off,
+ uint64_t len, io::ReadResult &&read_result, int op_flags,
+ bool native_async);
+ static void aio_write(ImageCtxT &image_ctx, io::AioCompletion *c,
+ uint64_t off, uint64_t len, bufferlist &&bl,
+ int op_flags, bool native_async);
+ static void aio_discard(ImageCtxT &image_ctx, io::AioCompletion *c,
+ uint64_t off, uint64_t len,
+ uint32_t discard_granularity_bytes,
+ bool native_async);
+ static void aio_write_same(ImageCtxT &image_ctx, io::AioCompletion *c,
+ uint64_t off, uint64_t len, bufferlist &&bl,
+ int op_flags, bool native_async);
+ static void aio_write_zeroes(ImageCtxT &image_ctx, io::AioCompletion *c,
+ uint64_t off, uint64_t len, int zero_flags,
+ int op_flags, bool native_async);
+ static void aio_compare_and_write(ImageCtxT &image_ctx, io::AioCompletion *c,
+ uint64_t off, uint64_t len,
+ bufferlist &&cmp_bl, bufferlist &&bl,
+ uint64_t *mismatch_off, int op_flags,
+ bool native_async);
+ static void aio_flush(ImageCtxT &image_ctx, io::AioCompletion *c,
+ bool native_async);
+};
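+
+// Hedged usage sketch (hypothetical; assumes an already-open
+// librbd::ImageCtx* ictx):
+//
+//   ceph::bufferlist bl;
+//   bl.append_zero(4096);
+//   const uint64_t len = bl.length();
+//   ssize_t n = librbd::api::Io<>::write(*ictx, 0, len, std::move(bl), 0);
+//   // n == len on success, negative errno on failure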
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Io<librbd::ImageCtx>;
+
+#endif // LIBRBD_API_IO_H
diff --git a/src/librbd/api/Migration.cc b/src/librbd/api/Migration.cc
new file mode 100644
index 000000000..957c872ac
--- /dev/null
+++ b/src/librbd/api/Migration.cc
@@ -0,0 +1,2126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Migration.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Group.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Snapshot.h"
+#include "librbd/api/Trash.h"
+#include "librbd/deep_copy/Handler.h"
+#include "librbd/deep_copy/ImageCopyRequest.h"
+#include "librbd/deep_copy/MetadataCopyRequest.h"
+#include "librbd/deep_copy/SnapshotCopyRequest.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/image/AttachChildRequest.h"
+#include "librbd/image/AttachParentRequest.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/CreateRequest.h"
+#include "librbd/image/DetachChildRequest.h"
+#include "librbd/image/DetachParentRequest.h"
+#include "librbd/image/ListWatchersRequest.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/image/Types.h"
+#include "librbd/internal.h"
+#include "librbd/migration/FormatInterface.h"
+#include "librbd/migration/OpenSourceImageRequest.h"
+#include "librbd/migration/NativeFormat.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Migration: " << __func__ << ": "
+
+namespace librbd {
+
+inline bool operator==(const linked_image_spec_t& lhs,
+                       const linked_image_spec_t& rhs) {
+  bool result = (lhs.pool_id == rhs.pool_id &&
+                 lhs.pool_namespace == rhs.pool_namespace &&
+                 lhs.image_id == rhs.image_id);
+ return result;
+}
+
+namespace api {
+
+using util::create_rados_callback;
+
+namespace {
+
+class MigrationProgressContext : public ProgressContext {
+public:
+ MigrationProgressContext(librados::IoCtx& io_ctx,
+ const std::string &header_oid,
+ cls::rbd::MigrationState state,
+ ProgressContext *prog_ctx)
+ : m_io_ctx(io_ctx), m_header_oid(header_oid), m_state(state),
+ m_prog_ctx(prog_ctx), m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+ m_lock(ceph::make_mutex(
+ util::unique_lock_name("librbd::api::MigrationProgressContext",
+ this))) {
+ ceph_assert(m_prog_ctx != nullptr);
+ }
+
+ ~MigrationProgressContext() {
+ wait_for_in_flight_updates();
+ }
+
+ int update_progress(uint64_t offset, uint64_t total) override {
+ ldout(m_cct, 20) << "offset=" << offset << ", total=" << total << dendl;
+
+ m_prog_ctx->update_progress(offset, total);
+
+ std::string description = stringify(offset * 100 / total) + "% complete";
+
+ send_state_description_update(description);
+
+ return 0;
+ }
+
+private:
+ librados::IoCtx& m_io_ctx;
+ std::string m_header_oid;
+ cls::rbd::MigrationState m_state;
+ ProgressContext *m_prog_ctx;
+
+ CephContext* m_cct;
+ mutable ceph::mutex m_lock;
+ ceph::condition_variable m_cond;
+ std::string m_state_description;
+ bool m_pending_update = false;
+ int m_in_flight_state_updates = 0;
+
+ void send_state_description_update(const std::string &description) {
+ std::lock_guard locker{m_lock};
+
+ if (description == m_state_description) {
+ return;
+ }
+
+ m_state_description = description;
+
+ if (m_in_flight_state_updates > 0) {
+ m_pending_update = true;
+ return;
+ }
+
+ set_state_description();
+ }
+
+ void set_state_description() {
+ ldout(m_cct, 20) << "state_description=" << m_state_description << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ librados::ObjectWriteOperation op;
+ cls_client::migration_set_state(&op, m_state, m_state_description);
+
+ using klass = MigrationProgressContext;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_set_state_description>(this);
+ int r = m_io_ctx.aio_operate(m_header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+
+ m_in_flight_state_updates++;
+ }
+
+ void handle_set_state_description(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_in_flight_state_updates--;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to update migration state: " << cpp_strerror(r)
+ << dendl;
+ } else if (m_pending_update) {
+ set_state_description();
+ m_pending_update = false;
+ } else {
+ m_cond.notify_all();
+ }
+ }
+
+ void wait_for_in_flight_updates() {
+ std::unique_lock locker{m_lock};
+
+ ldout(m_cct, 20) << "m_in_flight_state_updates="
+ << m_in_flight_state_updates << dendl;
+ m_pending_update = false;
+ m_cond.wait(locker, [this] { return m_in_flight_state_updates <= 0; });
+ }
+};
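+
+// MigrationProgressContext throttles migration-header updates: while one
+// aio_operate() is in flight, only the most recent state description is
+// remembered (m_pending_update) and written out once the previous update
+// completes, so a fast copy cannot flood the header object with updates.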
+
+int trash_search(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+ const std::string &image_name, std::string *image_id) {
+ std::vector<trash_image_info_t> entries;
+
+ int r = Trash<>::list(io_ctx, entries, false);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto &entry : entries) {
+ if (entry.source == source && entry.name == image_name) {
+ *image_id = entry.id;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
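+// open_images() accepts either the source or the destination image name: it
+// opens whatever that name resolves to (falling back to a trash search for a
+// migration-hidden source), reads the migration header to learn which side
+// was opened, then opens the paired image named by the spec. On any error,
+// the BOOST_SCOPE_EXIT guard closes every image opened so far.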
+template <typename I>
+int open_images(librados::IoCtx& io_ctx, const std::string &image_name,
+ I **src_image_ctx, I **dst_image_ctx,
+ cls::rbd::MigrationSpec* src_migration_spec,
+ cls::rbd::MigrationSpec* dst_migration_spec,
+ bool skip_open_dst_image) {
+ CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ *src_image_ctx = nullptr;
+ *dst_image_ctx = nullptr;
+
+ ldout(cct, 10) << "trying to open image by name " << io_ctx.get_pool_name()
+ << "/" << image_name << dendl;
+ auto image_ctx = I::create(image_name, "", nullptr, io_ctx, false);
+ int r = image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING);
+ if (r == -ENOENT) {
+    // presume the user passed the source image name, so search the trash
+    ldout(cct, 10) << "source image not found, searching the trash" << dendl;
+
+ std::string src_image_id;
+ r = trash_search(io_ctx, RBD_TRASH_IMAGE_SOURCE_MIGRATION, image_name,
+ &src_image_id);
+ if (r < 0) {
+ lderr(cct) << "failed to determine image id: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ldout(cct, 10) << "source image id from trash: " << src_image_id << dendl;
+ image_ctx = I::create(image_name, src_image_id, nullptr, io_ctx, false);
+ r = image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING);
+ }
+
+ if (r < 0) {
+ if (r != -ENOENT) {
+ lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ image_ctx = nullptr;
+ }
+
+ BOOST_SCOPE_EXIT_TPL(&r, &image_ctx, src_image_ctx, dst_image_ctx) {
+ if (r != 0) {
+ if (*src_image_ctx != nullptr) {
+ (*src_image_ctx)->state->close();
+ }
+ if (*dst_image_ctx != nullptr) {
+ (*dst_image_ctx)->state->close();
+ }
+ if (image_ctx != nullptr) {
+ image_ctx->state->close();
+ }
+ }
+ } BOOST_SCOPE_EXIT_END;
+
+  // the opened image is either the source or the destination
+ cls::rbd::MigrationSpec migration_spec;
+ r = cls_client::migration_get(&image_ctx->md_ctx, image_ctx->header_oid,
+ &migration_spec);
+ if (r < 0) {
+ lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ldout(cct, 10) << "migration spec: " << migration_spec << dendl;
+ if (migration_spec.header_type == cls::rbd::MIGRATION_HEADER_TYPE_SRC) {
+ ldout(cct, 10) << "the source image is opened" << dendl;
+ *src_image_ctx = image_ctx;
+ *src_migration_spec = migration_spec;
+ image_ctx = nullptr;
+ } else if (migration_spec.header_type ==
+ cls::rbd::MIGRATION_HEADER_TYPE_DST) {
+ ldout(cct, 10) << "the destination image is opened" << dendl;
+ std::string image_id = image_ctx->id;
+ image_ctx->state->close();
+ image_ctx = I::create(image_name, image_id, nullptr, io_ctx, false);
+
+ if (!skip_open_dst_image) {
+ ldout(cct, 10) << "re-opening the destination image" << dendl;
+ r = image_ctx->state->open(0);
+ if (r < 0) {
+ image_ctx = nullptr;
+ lderr(cct) << "failed to re-open destination image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ *dst_image_ctx = image_ctx;
+ *dst_migration_spec = migration_spec;
+ image_ctx = nullptr;
+ } else {
+ lderr(cct) << "unexpected migration header type: "
+ << migration_spec.header_type << dendl;
+ r = -EINVAL;
+ return r;
+ }
+
+ // attempt to open the other (paired) image
+ I** other_image_ctx = nullptr;
+ std::string other_image_type;
+ std::string other_image_name;
+ std::string other_image_id;
+ cls::rbd::MigrationSpec* other_migration_spec = nullptr;
+ librados::IoCtx other_io_ctx;
+
+ int flags = OPEN_FLAG_IGNORE_MIGRATING;
+ if (*src_image_ctx == nullptr &&
+ dst_migration_spec->source_spec.empty()) {
+ r = util::create_ioctx(io_ctx, "source image", migration_spec.pool_id,
+ migration_spec.pool_namespace, &other_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ other_image_type = "source";
+ other_image_ctx = src_image_ctx;
+ other_migration_spec = src_migration_spec;
+ other_image_name = migration_spec.image_name;
+ other_image_id = migration_spec.image_id;
+
+ if (other_image_id.empty()) {
+ ldout(cct, 20) << "trying to open v1 image by name "
+ << other_io_ctx.get_pool_name() << "/"
+ << other_image_name << dendl;
+ flags |= OPEN_FLAG_OLD_FORMAT;
+ } else {
+ ldout(cct, 20) << "trying to open v2 image by id "
+ << other_io_ctx.get_pool_name() << "/"
+ << other_image_id << dendl;
+ }
+
+ *src_image_ctx = I::create(other_image_name, other_image_id, nullptr,
+ other_io_ctx, false);
+ } else if (*dst_image_ctx == nullptr) {
+ r = util::create_ioctx(io_ctx, "destination image", migration_spec.pool_id,
+ migration_spec.pool_namespace, &other_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ other_image_name = migration_spec.image_name;
+ if (skip_open_dst_image) {
+ other_image_id = migration_spec.image_id;
+ } else {
+ other_image_type = "destination";
+ other_image_ctx = dst_image_ctx;
+ other_migration_spec = dst_migration_spec;
+ other_image_id = migration_spec.image_id;
+ }
+
+ *dst_image_ctx = I::create(other_image_name, other_image_id, nullptr,
+ other_io_ctx, false);
+ }
+
+ if (other_image_ctx != nullptr) {
+ r = (*other_image_ctx)->state->open(flags);
+ if (r < 0) {
+ lderr(cct) << "failed to open " << other_image_type << " image "
+ << other_io_ctx.get_pool_name()
+ << "/" << (other_image_id.empty() ?
+ other_image_name : other_image_id)
+ << ": " << cpp_strerror(r) << dendl;
+ *other_image_ctx = nullptr;
+ return r;
+ }
+
+ r = cls_client::migration_get(&(*other_image_ctx)->md_ctx,
+ (*other_image_ctx)->header_oid,
+ other_migration_spec);
+ if (r < 0) {
+ lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ldout(cct, 20) << other_image_type << " migration spec: "
+ << *other_migration_spec << dendl;
+ }
+
+ if (!skip_open_dst_image) {
+ // legacy clients will only store status in the source images
+ if (dst_migration_spec->source_spec.empty()) {
+ dst_migration_spec->state = migration_spec.state;
+ dst_migration_spec->state_description =
+ migration_spec.state_description;
+ }
+ }
+
+ return 0;
+}
+
+class SteppedProgressContext : public ProgressContext {
+public:
+ SteppedProgressContext(ProgressContext* progress_ctx, size_t total_steps)
+ : m_progress_ctx(progress_ctx), m_total_steps(total_steps) {
+ }
+
+ void next_step() {
+ ceph_assert(m_current_step < m_total_steps);
+ ++m_current_step;
+ }
+
+ int update_progress(uint64_t object_number,
+ uint64_t object_count) override {
+ return m_progress_ctx->update_progress(
+ object_number + (object_count * (m_current_step - 1)),
+ object_count * m_total_steps);
+ }
+
+private:
+ ProgressContext* m_progress_ctx;
+ size_t m_total_steps;
+ size_t m_current_step = 1;
+};
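+
+// Worked example (illustrative): with total_steps=2 and object_count=100,
+// step 1 reports object n as (n + 0)/200 of the overall range and step 2
+// reports it as (n + 100)/200, so both phases share one linear progress bar.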
+
+} // anonymous namespace
+
+template <typename I>
+int Migration<I>::prepare(librados::IoCtx& io_ctx,
+ const std::string &image_name,
+ librados::IoCtx& dest_io_ctx,
+ const std::string &dest_image_name_,
+ ImageOptions& opts) {
+ CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ std::string dest_image_name = dest_image_name_.empty() ? image_name :
+ dest_image_name_;
+
+ ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << " -> "
+ << dest_io_ctx.get_pool_name() << "/" << dest_image_name
+ << ", opts=" << opts << dendl;
+
+ auto src_image_ctx = I::create(image_name, "", nullptr, io_ctx, false);
+ int r = src_image_ctx->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ BOOST_SCOPE_EXIT_TPL(src_image_ctx) {
+ src_image_ctx->state->close();
+ } BOOST_SCOPE_EXIT_END;
+
+ std::list<obj_watch_t> watchers;
+ int flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE |
+ librbd::image::LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES;
+ C_SaferCond on_list_watchers;
+ auto list_watchers_request = librbd::image::ListWatchersRequest<I>::create(
+ *src_image_ctx, flags, &watchers, &on_list_watchers);
+ list_watchers_request->send();
+ r = on_list_watchers.wait();
+ if (r < 0) {
+ lderr(cct) << "failed listing watchers:" << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (!watchers.empty()) {
+ lderr(cct) << "image has watchers - not migrating" << dendl;
+ return -EBUSY;
+ }
+
+ uint64_t format = 2;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+ }
+ if (format != 2) {
+ lderr(cct) << "unsupported destination image format: " << format << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t features;
+ {
+ std::shared_lock image_locker{src_image_ctx->image_lock};
+ features = src_image_ctx->features;
+ }
+ opts.get(RBD_IMAGE_OPTION_FEATURES, &features);
+ if ((features & ~RBD_FEATURES_ALL) != 0) {
+ lderr(cct) << "librbd does not support requested features" << dendl;
+ return -ENOSYS;
+ }
+ opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+
+ uint64_t order = src_image_ctx->order;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+ opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ }
+ r = image::CreateRequest<I>::validate_order(cct, order);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t stripe_unit = src_image_ctx->stripe_unit;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ }
+ uint64_t stripe_count = src_image_ctx->stripe_count;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ }
+
+ uint64_t flatten = 0;
+ if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
+ opts.unset(RBD_IMAGE_OPTION_FLATTEN);
+ }
+
+ ldout(cct, 20) << "updated opts=" << opts << dendl;
+
+ auto dst_image_ctx = I::create(
+ dest_image_name, util::generate_image_id(dest_io_ctx), nullptr,
+ dest_io_ctx, false);
+ src_image_ctx->image_lock.lock_shared();
+ cls::rbd::MigrationSpec dst_migration_spec{
+ cls::rbd::MIGRATION_HEADER_TYPE_DST,
+ src_image_ctx->md_ctx.get_id(), src_image_ctx->md_ctx.get_namespace(),
+ src_image_ctx->name, src_image_ctx->id, "", {}, 0, false,
+ cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, flatten > 0,
+ cls::rbd::MIGRATION_STATE_PREPARING, ""};
+ src_image_ctx->image_lock.unlock_shared();
+
+ Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec,
+ opts, nullptr);
+ r = migration.prepare();
+
+ return r;
+}
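+
+// For orientation: this API backs the `rbd migration` CLI flow --
+// `rbd migration prepare`, then `rbd migration execute`, then
+// `rbd migration commit` (or `rbd migration abort` to roll back).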
+
+template <typename I>
+int Migration<I>::prepare_import(
+ const std::string& source_spec, librados::IoCtx& dest_io_ctx,
+ const std::string &dest_image_name, ImageOptions& opts) {
+ if (source_spec.empty() || !dest_io_ctx.is_valid() ||
+ dest_image_name.empty()) {
+ return -EINVAL;
+ }
+
+ auto cct = reinterpret_cast<CephContext *>(dest_io_ctx.cct());
+ ldout(cct, 10) << source_spec << " -> "
+ << dest_io_ctx.get_pool_name() << "/"
+ << dest_image_name << ", opts=" << opts << dendl;
+
+ I* src_image_ctx = nullptr;
+ C_SaferCond open_ctx;
+ auto req = migration::OpenSourceImageRequest<I>::create(
+ dest_io_ctx, nullptr, CEPH_NOSNAP,
+ {-1, "", "", "", source_spec, {}, 0, false}, &src_image_ctx, &open_ctx);
+ req->send();
+
+ int r = open_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to open source image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto asio_engine = src_image_ctx->asio_engine;
+ BOOST_SCOPE_EXIT_TPL(src_image_ctx) {
+ src_image_ctx->state->close();
+ } BOOST_SCOPE_EXIT_END;
+
+ uint64_t image_format = 2;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &image_format) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FORMAT, image_format);
+ }
+ if (image_format != 2) {
+ lderr(cct) << "unsupported destination image format: " << image_format
+ << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 20) << "updated opts=" << opts << dendl;
+
+  // use json-spirit to clean up the JSON formatting
+  json_spirit::mObject source_spec_object;
+  json_spirit::mValue json_root;
+  if (json_spirit::read(source_spec, json_root)) {
+ try {
+ source_spec_object = json_root.get_obj();
+ } catch (std::runtime_error&) {
+ lderr(cct) << "failed to clean source spec" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ auto dst_image_ctx = I::create(
+ dest_image_name, util::generate_image_id(dest_io_ctx), nullptr,
+ dest_io_ctx, false);
+ cls::rbd::MigrationSpec dst_migration_spec{
+ cls::rbd::MIGRATION_HEADER_TYPE_DST, -1, "", "", "",
+ json_spirit::write(source_spec_object), {},
+ 0, false, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, true,
+ cls::rbd::MIGRATION_STATE_PREPARING, ""};
+
+ Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec,
+ opts, nullptr);
+  r = migration.prepare_import();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::execute(librados::IoCtx& io_ctx,
+ const std::string &image_name,
+ ProgressContext &prog_ctx) {
+ CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl;
+
+ I *src_image_ctx;
+ I *dst_image_ctx;
+ cls::rbd::MigrationSpec src_migration_spec;
+ cls::rbd::MigrationSpec dst_migration_spec;
+ int r = open_images(io_ctx, image_name, &src_image_ctx, &dst_image_ctx,
+ &src_migration_spec, &dst_migration_spec, false);
+ if (r < 0) {
+ return r;
+ }
+
+ // ensure the destination loads the migration info
+ dst_image_ctx->ignore_migrating = false;
+ r = dst_image_ctx->state->refresh();
+ if (r < 0) {
+ lderr(cct) << "failed to refresh destination image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ BOOST_SCOPE_EXIT_TPL(src_image_ctx, dst_image_ctx) {
+ dst_image_ctx->state->close();
+ if (src_image_ctx != nullptr) {
+ src_image_ctx->state->close();
+ }
+ } BOOST_SCOPE_EXIT_END;
+
+ if (dst_migration_spec.state != cls::rbd::MIGRATION_STATE_PREPARED &&
+ dst_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTING) {
+ lderr(cct) << "current migration state is '" << dst_migration_spec.state
+ << "' (should be 'prepared')" << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 5) << "migrating ";
+ if (!dst_migration_spec.source_spec.empty()) {
+ *_dout << dst_migration_spec.source_spec;
+ } else {
+ *_dout << src_image_ctx->md_ctx.get_pool_name() << "/"
+ << src_image_ctx->name;
+ }
+ *_dout << " -> " << dst_image_ctx->md_ctx.get_pool_name() << "/"
+ << dst_image_ctx->name << dendl;
+
+ ImageOptions opts;
+ Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec,
+ opts, &prog_ctx);
+ r = migration.execute();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::abort(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext &prog_ctx) {
+ CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl;
+
+ I *src_image_ctx;
+ I *dst_image_ctx;
+ cls::rbd::MigrationSpec src_migration_spec;
+ cls::rbd::MigrationSpec dst_migration_spec;
+ int r = open_images(io_ctx, image_name, &src_image_ctx, &dst_image_ctx,
+ &src_migration_spec, &dst_migration_spec, true);
+ if (r < 0) {
+ return r;
+ }
+
+ ldout(cct, 5) << "canceling incomplete migration ";
+ if (!dst_migration_spec.source_spec.empty()) {
+ *_dout << dst_migration_spec.source_spec;
+ } else {
+ *_dout << src_image_ctx->md_ctx.get_pool_name() << "/"
+ << src_image_ctx->name;
+ }
+ *_dout << " -> " << dst_image_ctx->md_ctx.get_pool_name() << "/"
+ << dst_image_ctx->name << dendl;
+
+ ImageOptions opts;
+ Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec,
+ opts, &prog_ctx);
+ r = migration.abort();
+
+ if (src_image_ctx != nullptr) {
+ src_image_ctx->state->close();
+ }
+
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::commit(librados::IoCtx& io_ctx,
+ const std::string &image_name,
+ ProgressContext &prog_ctx) {
+ CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl;
+
+ I *src_image_ctx;
+ I *dst_image_ctx;
+ cls::rbd::MigrationSpec src_migration_spec;
+ cls::rbd::MigrationSpec dst_migration_spec;
+ int r = open_images(io_ctx, image_name, &src_image_ctx, &dst_image_ctx,
+ &src_migration_spec, &dst_migration_spec, false);
+ if (r < 0) {
+ return r;
+ }
+
+ if (dst_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTED) {
+ lderr(cct) << "current migration state is '" << dst_migration_spec.state
+ << "' (should be 'executed')" << dendl;
+ dst_image_ctx->state->close();
+ if (src_image_ctx != nullptr) {
+ src_image_ctx->state->close();
+ }
+ return -EINVAL;
+ }
+
+ // ensure the destination loads the migration info
+ dst_image_ctx->ignore_migrating = false;
+ r = dst_image_ctx->state->refresh();
+ if (r < 0) {
+ lderr(cct) << "failed to refresh destination image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ldout(cct, 5) << "migrating ";
+ if (!dst_migration_spec.source_spec.empty()) {
+ *_dout << dst_migration_spec.source_spec;
+ } else {
+ *_dout << src_image_ctx->md_ctx.get_pool_name() << "/"
+ << src_image_ctx->name;
+ }
+ *_dout << " -> " << dst_image_ctx->md_ctx.get_pool_name() << "/"
+ << dst_image_ctx->name << dendl;
+
+ ImageOptions opts;
+ Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec,
+ opts, &prog_ctx);
+ r = migration.commit();
+
+  // the image contexts are closed inside migration.commit()
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::status(librados::IoCtx& io_ctx,
+ const std::string &image_name,
+ image_migration_status_t *status) {
+ CephContext* cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl;
+
+ I *src_image_ctx;
+ I *dst_image_ctx;
+ cls::rbd::MigrationSpec src_migration_spec;
+ cls::rbd::MigrationSpec dst_migration_spec;
+ int r = open_images(io_ctx, image_name, &src_image_ctx, &dst_image_ctx,
+ &src_migration_spec, &dst_migration_spec, false);
+ if (r < 0) {
+ return r;
+ }
+
+ ldout(cct, 5) << "migrating ";
+ if (!dst_migration_spec.source_spec.empty()) {
+ *_dout << dst_migration_spec.source_spec;
+ } else {
+ *_dout << src_image_ctx->md_ctx.get_pool_name() << "/"
+ << src_image_ctx->name;
+ }
+ *_dout << " -> " << dst_image_ctx->md_ctx.get_pool_name() << "/"
+ << dst_image_ctx->name << dendl;
+
+ ImageOptions opts;
+ Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec,
+ opts, nullptr);
+ r = migration.status(status);
+
+ dst_image_ctx->state->close();
+ if (src_image_ctx != nullptr) {
+ src_image_ctx->state->close();
+ }
+
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::get_source_spec(I* image_ctx, std::string* source_spec) {
+ auto cct = image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ image_ctx->image_lock.lock_shared();
+ auto migration_info = image_ctx->migration_info;
+ image_ctx->image_lock.unlock_shared();
+
+ if (migration_info.empty()) {
+ // attempt to directly read the spec in case the state is EXECUTED
+ cls::rbd::MigrationSpec migration_spec;
+ int r = cls_client::migration_get(&image_ctx->md_ctx, image_ctx->header_oid,
+ &migration_spec);
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ migration_info = {
+ migration_spec.pool_id, migration_spec.pool_namespace,
+ migration_spec.image_name, migration_spec.image_id,
+ migration_spec.source_spec, {}, 0, false};
+ }
+
+ if (!migration_info.source_spec.empty()) {
+ *source_spec = migration_info.source_spec;
+ } else {
+ // legacy migration source
+ *source_spec = migration::NativeFormat<I>::build_source_spec(
+ migration_info.pool_id,
+ migration_info.pool_namespace,
+ migration_info.image_name,
+ migration_info.image_id);
+ }
+
+ return 0;
+}
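+
+// For the legacy branch above, NativeFormat<I>::build_source_spec() emits a
+// JSON spec roughly of this shape (illustrative values):
+//   {"type": "native", "pool_id": 2, "pool_namespace": "",
+//    "image_name": "src-image", "image_id": "1234abcd"}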
+
+template <typename I>
+Migration<I>::Migration(ImageCtx* src_image_ctx,
+ ImageCtx* dst_image_ctx,
+ const cls::rbd::MigrationSpec& dst_migration_spec,
+ ImageOptions& opts, ProgressContext *prog_ctx)
+ : m_cct(dst_image_ctx->cct),
+ m_src_image_ctx(src_image_ctx), m_dst_image_ctx(dst_image_ctx),
+ m_dst_io_ctx(dst_image_ctx->md_ctx), m_dst_image_name(dst_image_ctx->name),
+ m_dst_image_id(dst_image_ctx->id),
+ m_dst_header_oid(util::header_name(m_dst_image_id)),
+ m_image_options(opts), m_flatten(dst_migration_spec.flatten),
+ m_mirroring(dst_migration_spec.mirroring),
+ m_mirror_image_mode(dst_migration_spec.mirror_image_mode),
+ m_prog_ctx(prog_ctx),
+ m_src_migration_spec(cls::rbd::MIGRATION_HEADER_TYPE_SRC,
+ m_dst_io_ctx.get_id(), m_dst_io_ctx.get_namespace(),
+ m_dst_image_name, m_dst_image_id, "", {}, 0,
+ m_mirroring, m_mirror_image_mode, m_flatten,
+ dst_migration_spec.state,
+ dst_migration_spec.state_description),
+ m_dst_migration_spec(dst_migration_spec) {
+ m_dst_io_ctx.dup(dst_image_ctx->md_ctx);
+}
+
+template <typename I>
+int Migration<I>::prepare() {
+ ldout(m_cct, 10) << dendl;
+
+ BOOST_SCOPE_EXIT_TPL(&m_dst_image_ctx) {
+ if (m_dst_image_ctx != nullptr) {
+ m_dst_image_ctx->state->close();
+ }
+ } BOOST_SCOPE_EXIT_END;
+
+ int r = validate_src_snaps(m_src_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = disable_mirroring(m_src_image_ctx, &m_mirroring, &m_mirror_image_mode);
+ if (r < 0) {
+ return r;
+ }
+
+ r = unlink_src_image(m_src_image_ctx);
+ if (r < 0) {
+ enable_mirroring(m_src_image_ctx, m_mirroring, m_mirror_image_mode);
+ return r;
+ }
+
+ r = set_src_migration(m_src_image_ctx);
+ if (r < 0) {
+ relink_src_image(m_src_image_ctx);
+ enable_mirroring(m_src_image_ctx, m_mirroring, m_mirror_image_mode);
+ return r;
+ }
+
+ r = create_dst_image(&m_dst_image_ctx);
+ if (r < 0) {
+ abort();
+ return r;
+ }
+
+ ldout(m_cct, 10) << "succeeded" << dendl;
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::prepare_import() {
+ ldout(m_cct, 10) << dendl;
+
+ BOOST_SCOPE_EXIT_TPL(&m_dst_image_ctx) {
+ if (m_dst_image_ctx != nullptr) {
+ m_dst_image_ctx->state->close();
+ }
+ } BOOST_SCOPE_EXIT_END;
+
+ int r = create_dst_image(&m_dst_image_ctx);
+ if (r < 0) {
+ abort();
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::execute() {
+ ldout(m_cct, 10) << dendl;
+
+ int r = set_state(cls::rbd::MIGRATION_STATE_EXECUTING, "");
+ if (r < 0) {
+ return r;
+ }
+
+ {
+ MigrationProgressContext dst_prog_ctx(
+ m_dst_image_ctx->md_ctx, m_dst_image_ctx->header_oid,
+ cls::rbd::MIGRATION_STATE_EXECUTING, m_prog_ctx);
+ std::optional<MigrationProgressContext> src_prog_ctx;
+ if (m_src_image_ctx != nullptr) {
+ src_prog_ctx.emplace(m_src_image_ctx->md_ctx, m_src_image_ctx->header_oid,
+ cls::rbd::MIGRATION_STATE_EXECUTING, &dst_prog_ctx);
+ }
+
+ while (true) {
+ r = m_dst_image_ctx->operations->migrate(
+ *(src_prog_ctx ? &src_prog_ctx.value() : &dst_prog_ctx));
+ if (r == -EROFS) {
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ if (m_dst_image_ctx->exclusive_lock != nullptr &&
+ !m_dst_image_ctx->exclusive_lock->accept_ops()) {
+        ldout(m_cct, 5) << "lost exclusive lock, retrying migrate" << dendl;
+ continue;
+ }
+ }
+ break;
+ }
+ }
+
+ if (r < 0) {
+ lderr(m_cct) << "migration failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = set_state(cls::rbd::MIGRATION_STATE_EXECUTED, "");
+ if (r < 0) {
+ return r;
+ }
+
+ m_dst_image_ctx->notify_update();
+
+ ldout(m_cct, 10) << "succeeded" << dendl;
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::abort() {
+ ldout(m_cct, 10) << dendl;
+
+ int r;
+ if (m_src_image_ctx != nullptr) {
+ m_src_image_ctx->owner_lock.lock_shared();
+ if (m_src_image_ctx->exclusive_lock != nullptr &&
+ !m_src_image_ctx->exclusive_lock->is_lock_owner()) {
+ C_SaferCond ctx;
+ m_src_image_ctx->exclusive_lock->acquire_lock(&ctx);
+ m_src_image_ctx->owner_lock.unlock_shared();
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(m_cct) << "error acquiring exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ } else {
+ m_src_image_ctx->owner_lock.unlock_shared();
+ }
+ }
+
+ group_info_t group_info;
+ group_info.pool = -1;
+
+ r = m_dst_image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING);
+ if (r < 0) {
+ ldout(m_cct, 1) << "failed to open destination image: " << cpp_strerror(r)
+ << dendl;
+ m_dst_image_ctx = nullptr;
+ } else {
+ BOOST_SCOPE_EXIT_TPL(&m_dst_image_ctx) {
+ if (m_dst_image_ctx != nullptr) {
+ m_dst_image_ctx->state->close();
+ }
+ } BOOST_SCOPE_EXIT_END;
+
+ std::list<obj_watch_t> watchers;
+ int flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE |
+ librbd::image::LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES;
+ C_SaferCond on_list_watchers;
+ auto list_watchers_request = librbd::image::ListWatchersRequest<I>::create(
+ *m_dst_image_ctx, flags, &watchers, &on_list_watchers);
+ list_watchers_request->send();
+ r = on_list_watchers.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed listing watchers:" << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (!watchers.empty()) {
+ lderr(m_cct) << "image has watchers - cannot abort migration" << dendl;
+ return -EBUSY;
+ }
+
+ // ensure destination image is now read-only
+ r = set_state(cls::rbd::MIGRATION_STATE_ABORTING, "");
+ if (r < 0) {
+ return r;
+ }
+
+ SteppedProgressContext progress_ctx(
+ m_prog_ctx, (m_src_image_ctx != nullptr ? 2 : 1));
+ if (m_src_image_ctx != nullptr) {
+ // copy dst HEAD -> src HEAD
+ revert_data(m_dst_image_ctx, m_src_image_ctx, &progress_ctx);
+ progress_ctx.next_step();
+
+ ldout(m_cct, 10) << "relinking children" << dendl;
+ r = relink_children(m_dst_image_ctx, m_src_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ ldout(m_cct, 10) << "removing dst image snapshots" << dendl;
+ std::vector<librbd::snap_info_t> snaps;
+ r = Snapshot<I>::list(m_dst_image_ctx, snaps);
+ if (r < 0) {
+ lderr(m_cct) << "failed listing snapshots: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ for (auto &snap : snaps) {
+ librbd::NoOpProgressContext prog_ctx;
+ int r = Snapshot<I>::remove(m_dst_image_ctx, snap.name.c_str(),
+ RBD_SNAP_REMOVE_UNPROTECT, prog_ctx);
+ if (r < 0) {
+ lderr(m_cct) << "failed removing snapshot: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ ldout(m_cct, 10) << "removing group" << dendl;
+
+ r = remove_group(m_dst_image_ctx, &group_info);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+
+ ldout(m_cct, 10) << "removing dst image" << dendl;
+
+ ceph_assert(m_dst_image_ctx->ignore_migrating);
+
+ auto asio_engine = m_dst_image_ctx->asio_engine;
+ librados::IoCtx dst_io_ctx(m_dst_image_ctx->md_ctx);
+
+ C_SaferCond on_remove;
+ auto req = librbd::image::RemoveRequest<>::create(
+ dst_io_ctx, m_dst_image_ctx, false, false, progress_ctx,
+ asio_engine->get_work_queue(), &on_remove);
+ req->send();
+ r = on_remove.wait();
+
+ m_dst_image_ctx = nullptr;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed removing destination image '"
+ << dst_io_ctx.get_pool_name() << "/" << m_dst_image_name
+ << " (" << m_dst_image_id << ")': " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ if (m_src_image_ctx != nullptr) {
+ r = relink_src_image(m_src_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = add_group(m_src_image_ctx, group_info);
+ if (r < 0) {
+ return r;
+ }
+
+ r = remove_migration(m_src_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = enable_mirroring(m_src_image_ctx, m_mirroring, m_mirror_image_mode);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ ldout(m_cct, 10) << "succeeded" << dendl;
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::commit() {
+ ldout(m_cct, 10) << dendl;
+
+ BOOST_SCOPE_EXIT_TPL(&m_dst_image_ctx, &m_src_image_ctx) {
+ m_dst_image_ctx->state->close();
+ if (m_src_image_ctx != nullptr) {
+ m_src_image_ctx->state->close();
+ }
+ } BOOST_SCOPE_EXIT_END;
+
+ int r = remove_migration(m_dst_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_src_image_ctx != nullptr) {
+ r = remove_src_image(&m_src_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ r = enable_mirroring(m_dst_image_ctx, m_mirroring, m_mirror_image_mode);
+ if (r < 0) {
+ return r;
+ }
+
+ ldout(m_cct, 10) << "succeeded" << dendl;
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::status(image_migration_status_t *status) {
+ ldout(m_cct, 10) << dendl;
+
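+  // each image's MigrationSpec describes its peer, so the source_* fields
+  // come from the destination spec and the dest_* fields from the source spec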
+ status->source_pool_id = m_dst_migration_spec.pool_id;
+ status->source_pool_namespace = m_dst_migration_spec.pool_namespace;
+ status->source_image_name = m_dst_migration_spec.image_name;
+ status->source_image_id = m_dst_migration_spec.image_id;
+ status->dest_pool_id = m_src_migration_spec.pool_id;
+ status->dest_pool_namespace = m_src_migration_spec.pool_namespace;
+ status->dest_image_name = m_src_migration_spec.image_name;
+ status->dest_image_id = m_src_migration_spec.image_id;
+
+ switch (m_src_migration_spec.state) {
+ case cls::rbd::MIGRATION_STATE_ERROR:
+ status->state = RBD_IMAGE_MIGRATION_STATE_ERROR;
+ break;
+ case cls::rbd::MIGRATION_STATE_PREPARING:
+ status->state = RBD_IMAGE_MIGRATION_STATE_PREPARING;
+ break;
+ case cls::rbd::MIGRATION_STATE_PREPARED:
+ status->state = RBD_IMAGE_MIGRATION_STATE_PREPARED;
+ break;
+ case cls::rbd::MIGRATION_STATE_EXECUTING:
+ status->state = RBD_IMAGE_MIGRATION_STATE_EXECUTING;
+ break;
+ case cls::rbd::MIGRATION_STATE_EXECUTED:
+ status->state = RBD_IMAGE_MIGRATION_STATE_EXECUTED;
+ break;
+ default:
+ status->state = RBD_IMAGE_MIGRATION_STATE_UNKNOWN;
+ break;
+ }
+
+ status->state_description = m_src_migration_spec.state_description;
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::set_state(I* image_ctx, const std::string& image_description,
+ cls::rbd::MigrationState state,
+ const std::string &description) {
+ int r = cls_client::migration_set_state(&image_ctx->md_ctx,
+ image_ctx->header_oid,
+ state, description);
+ if (r < 0) {
+ lderr(m_cct) << "failed to set " << image_description << " "
+ << "migration header: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::set_state(cls::rbd::MigrationState state,
+ const std::string &description) {
+ int r;
+ if (m_src_image_ctx != nullptr) {
+ r = set_state(m_src_image_ctx, "source", state, description);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ r = set_state(m_dst_image_ctx, "destination", state, description);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::list_src_snaps(I* image_ctx,
+ std::vector<librbd::snap_info_t> *snaps) {
+ ldout(m_cct, 10) << dendl;
+
+ int r = Snapshot<I>::list(image_ctx, *snaps);
+ if (r < 0) {
+ lderr(m_cct) << "failed listing snapshots: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (auto &snap : *snaps) {
+ librbd::snap_namespace_type_t namespace_type;
+ r = Snapshot<I>::get_namespace_type(image_ctx, snap.id,
+ &namespace_type);
+ if (r < 0) {
+ lderr(m_cct) << "error getting snap namespace type: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (namespace_type != RBD_SNAP_NAMESPACE_TYPE_USER) {
+ if (namespace_type == RBD_SNAP_NAMESPACE_TYPE_TRASH) {
+ lderr(m_cct) << "image has snapshots with linked clones that must be "
+ << "deleted or flattened before the image can be migrated"
+ << dendl;
+ } else {
+ lderr(m_cct) << "image has non-user type snapshots "
+ << "that are not supported by migration" << dendl;
+ }
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::validate_src_snaps(I* image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ std::vector<librbd::snap_info_t> snaps;
+ int r = list_src_snaps(image_ctx, &snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t dst_features = 0;
+ r = m_image_options.get(RBD_IMAGE_OPTION_FEATURES, &dst_features);
+ ceph_assert(r == 0);
+
+ if (!image_ctx->test_features(RBD_FEATURE_LAYERING)) {
+ return 0;
+ }
+
+ for (auto &snap : snaps) {
+ std::shared_lock image_locker{image_ctx->image_lock};
+ cls::rbd::ParentImageSpec parent_spec{image_ctx->md_ctx.get_id(),
+ image_ctx->md_ctx.get_namespace(),
+ image_ctx->id, snap.id};
+ std::vector<librbd::linked_image_spec_t> child_images;
+ r = api::Image<I>::list_children(image_ctx, parent_spec,
+ &child_images);
+ if (r < 0) {
+ lderr(m_cct) << "failed listing children: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ if (!child_images.empty()) {
+ ldout(m_cct, 1) << image_ctx->name << "@" << snap.name
+ << " has children" << dendl;
+
+ if ((dst_features & RBD_FEATURE_LAYERING) == 0) {
+ lderr(m_cct) << "can't migrate to destination without layering feature: "
+ << "image has children" << dendl;
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::set_src_migration(I* image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ image_ctx->ignore_migrating = true;
+
+ int r = cls_client::migration_set(&image_ctx->md_ctx, image_ctx->header_oid,
+ m_src_migration_spec);
+ if (r < 0) {
+ lderr(m_cct) << "failed to set source migration header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ image_ctx->notify_update();
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::remove_migration(I *image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ int r;
+
+ r = cls_client::migration_remove(&image_ctx->md_ctx, image_ctx->header_oid);
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r < 0) {
+ lderr(m_cct) << "failed removing migration header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ image_ctx->notify_update();
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::unlink_src_image(I* image_ctx) {
+ if (image_ctx->old_format) {
+ return v1_unlink_src_image(image_ctx);
+ } else {
+ return v2_unlink_src_image(image_ctx);
+ }
+}
+
+template <typename I>
+int Migration<I>::v1_unlink_src_image(I* image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ std::shared_lock image_locker{image_ctx->image_lock};
+ int r = tmap_rm(image_ctx->md_ctx, image_ctx->name);
+ if (r < 0) {
+ lderr(m_cct) << "failed removing " << image_ctx->name << " from tmap: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::v2_unlink_src_image(I* image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ image_ctx->owner_lock.lock_shared();
+ if (image_ctx->exclusive_lock != nullptr &&
+ image_ctx->exclusive_lock->is_lock_owner()) {
+ C_SaferCond ctx;
+ image_ctx->exclusive_lock->release_lock(&ctx);
+ image_ctx->owner_lock.unlock_shared();
+ int r = ctx.wait();
+ if (r < 0) {
+ lderr(m_cct) << "error releasing exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ } else {
+ image_ctx->owner_lock.unlock_shared();
+ }
+
+ int r = Trash<I>::move(image_ctx->md_ctx, RBD_TRASH_IMAGE_SOURCE_MIGRATION,
+ image_ctx->name, 0);
+ if (r < 0) {
+ lderr(m_cct) << "failed moving image to trash: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::relink_src_image(I* image_ctx) {
+ if (image_ctx->old_format) {
+ return v1_relink_src_image(image_ctx);
+ } else {
+ return v2_relink_src_image(image_ctx);
+ }
+}
+
+template <typename I>
+int Migration<I>::v1_relink_src_image(I* image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ std::shared_lock image_locker{image_ctx->image_lock};
+ int r = tmap_set(image_ctx->md_ctx, image_ctx->name);
+ if (r < 0) {
+ lderr(m_cct) << "failed adding " << image_ctx->name << " to tmap: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::v2_relink_src_image(I* image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ std::shared_lock image_locker{image_ctx->image_lock};
+ int r = Trash<I>::restore(image_ctx->md_ctx,
+ {cls::rbd::TRASH_IMAGE_SOURCE_MIGRATION},
+ image_ctx->id, image_ctx->name);
+ if (r < 0) {
+ lderr(m_cct) << "failed restoring image from trash: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
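+// Create the destination image (a plain create, or a clone when the source
+// has a parent), deep copy the source snapshots and image metadata into it,
+// record the migration header on the destination, and move the images to
+// the PREPARED state.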
+template <typename I>
+int Migration<I>::create_dst_image(I** image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ uint64_t size;
+ cls::rbd::ParentImageSpec parent_spec;
+ {
+ std::shared_lock image_locker{m_src_image_ctx->image_lock};
+ size = m_src_image_ctx->size;
+
+ // use oldest snapshot or HEAD for parent spec
+ if (!m_src_image_ctx->snap_info.empty()) {
+ parent_spec = m_src_image_ctx->snap_info.begin()->second.parent.spec;
+ } else {
+ parent_spec = m_src_image_ctx->parent_md.spec;
+ }
+ }
+
+ ConfigProxy config{m_cct->_conf};
+ api::Config<I>::apply_pool_overrides(m_dst_io_ctx, &config);
+
+ uint64_t mirror_image_mode;
+ if (m_image_options.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE,
+ &mirror_image_mode) == 0) {
+ m_mirroring = true;
+ m_mirror_image_mode = static_cast<cls::rbd::MirrorImageMode>(
+ mirror_image_mode);
+ m_image_options.unset(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE);
+ }
+
+ int r;
+ C_SaferCond on_create;
+ librados::IoCtx parent_io_ctx;
+ if (parent_spec.pool_id == -1) {
+ auto *req = image::CreateRequest<I>::create(
+ config, m_dst_io_ctx, m_dst_image_name, m_dst_image_id, size,
+ m_image_options, image::CREATE_FLAG_SKIP_MIRROR_ENABLE,
+ cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", "",
+ m_src_image_ctx->op_work_queue, &on_create);
+ req->send();
+ } else {
+ r = util::create_ioctx(m_src_image_ctx->md_ctx, "parent image",
+ parent_spec.pool_id, parent_spec.pool_namespace,
+ &parent_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ auto *req = image::CloneRequest<I>::create(
+ config, parent_io_ctx, parent_spec.image_id, "", {}, parent_spec.snap_id,
+ m_dst_io_ctx, m_dst_image_name, m_dst_image_id, m_image_options,
+ cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", "",
+ m_src_image_ctx->op_work_queue, &on_create);
+ req->send();
+ }
+
+ r = on_create.wait();
+ if (r < 0) {
+ lderr(m_cct) << "header creation failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto dst_image_ctx = *image_ctx;
+ dst_image_ctx->id = m_dst_image_id;
+ *image_ctx = nullptr; // prevent prepare from cleaning up the ImageCtx
+
+ r = dst_image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING);
+ if (r < 0) {
+ lderr(m_cct) << "failed to open newly created header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ BOOST_SCOPE_EXIT_TPL(dst_image_ctx) {
+ dst_image_ctx->state->close();
+ } BOOST_SCOPE_EXIT_END;
+
+ {
+ std::shared_lock owner_locker{dst_image_ctx->owner_lock};
+ r = dst_image_ctx->operations->prepare_image_update(
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true);
+ if (r < 0) {
+ lderr(m_cct) << "cannot obtain exclusive lock" << dendl;
+ return r;
+ }
+ if (dst_image_ctx->exclusive_lock != nullptr) {
+ dst_image_ctx->exclusive_lock->block_requests(0);
+ }
+ }
+
+ SnapSeqs snap_seqs;
+
+ C_SaferCond on_snapshot_copy;
+ auto snapshot_copy_req = librbd::deep_copy::SnapshotCopyRequest<I>::create(
+ m_src_image_ctx, dst_image_ctx, 0, CEPH_NOSNAP, 0, m_flatten,
+ m_src_image_ctx->op_work_queue, &snap_seqs, &on_snapshot_copy);
+ snapshot_copy_req->send();
+ r = on_snapshot_copy.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to copy snapshots: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (!m_src_image_ctx->header_oid.empty()) {
+ C_SaferCond on_metadata_copy;
+ auto metadata_copy_req = librbd::deep_copy::MetadataCopyRequest<I>::create(
+ m_src_image_ctx, dst_image_ctx, &on_metadata_copy);
+ metadata_copy_req->send();
+ r = on_metadata_copy.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ m_dst_migration_spec.snap_seqs = snap_seqs;
+ m_dst_migration_spec.overlap = size;
+ m_dst_migration_spec.mirroring = m_mirroring;
+ m_dst_migration_spec.mirror_image_mode = m_mirror_image_mode;
+ m_dst_migration_spec.flatten = m_flatten;
+ r = cls_client::migration_set(&m_dst_io_ctx, m_dst_header_oid,
+ m_dst_migration_spec);
+ if (r < 0) {
+ lderr(m_cct) << "failed to set migration header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (m_dst_migration_spec.source_spec.empty()) {
+ r = update_group(m_src_image_ctx, dst_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = set_state(m_src_image_ctx, "source",
+ cls::rbd::MIGRATION_STATE_PREPARED, "");
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ r = set_state(dst_image_ctx, "destination",
+ cls::rbd::MIGRATION_STATE_PREPARED, "");
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_dst_migration_spec.source_spec.empty()) {
+ r = dst_image_ctx->state->refresh();
+ if (r < 0) {
+ lderr(m_cct) << "failed to refresh destination image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = relink_children(m_src_image_ctx, dst_image_ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::remove_group(I *image_ctx, group_info_t *group_info) {
+ int r = librbd::api::Group<I>::image_get_group(image_ctx, group_info);
+ if (r < 0) {
+ lderr(m_cct) << "failed to get image group: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (group_info->pool == -1) {
+ return -ENOENT;
+ }
+
+ ceph_assert(!image_ctx->id.empty());
+
+ ldout(m_cct, 10) << dendl;
+
+ IoCtx group_ioctx;
+ r = util::create_ioctx(image_ctx->md_ctx, "group", group_info->pool, {},
+ &group_ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = librbd::api::Group<I>::image_remove_by_id(group_ioctx,
+ group_info->name.c_str(),
+ image_ctx->md_ctx,
+ image_ctx->id.c_str());
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove image from group: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::add_group(I *image_ctx, group_info_t &group_info) {
+ if (group_info.pool == -1) {
+ return 0;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ IoCtx group_ioctx;
+ int r = util::create_ioctx(image_ctx->md_ctx, "group", group_info.pool, {},
+ &group_ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = librbd::api::Group<I>::image_add(group_ioctx, group_info.name.c_str(),
+ image_ctx->md_ctx,
+ image_ctx->name.c_str());
+ if (r < 0) {
+ lderr(m_cct) << "failed to add image to group: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::update_group(I *from_image_ctx, I *to_image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ group_info_t group_info;
+
+ int r = remove_group(from_image_ctx, &group_info);
+ if (r < 0) {
+ return r == -ENOENT ? 0 : r;
+ }
+
+ r = add_group(to_image_ctx, group_info);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::disable_mirroring(
+ I *image_ctx, bool *was_enabled,
+ cls::rbd::MirrorImageMode *mirror_image_mode) {
+ *was_enabled = false;
+
+ cls::rbd::MirrorImage mirror_image;
+ int r = cls_client::mirror_image_get(&image_ctx->md_ctx, image_ctx->id,
+ &mirror_image);
+ if (r == -ENOENT) {
+ ldout(m_cct, 10) << "mirroring is not enabled for this image" << dendl;
+ return 0;
+ }
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve mirror image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ *was_enabled = true;
+ *mirror_image_mode = mirror_image.mode;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ C_SaferCond ctx;
+ auto req = mirror::DisableRequest<I>::create(image_ctx, false, true, &ctx);
+ req->send();
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to disable mirroring: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ m_src_migration_spec.mirroring = true;
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::enable_mirroring(
+ I *image_ctx, bool was_enabled,
+ cls::rbd::MirrorImageMode mirror_image_mode) {
+ cls::rbd::MirrorMode mirror_mode;
+ int r = cls_client::mirror_mode_get(&image_ctx->md_ctx, &mirror_mode);
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+ ldout(m_cct, 10) << "mirroring is not enabled for destination pool"
+ << dendl;
+ return 0;
+ }
+ if (mirror_mode == cls::rbd::MIRROR_MODE_IMAGE && !was_enabled) {
+ ldout(m_cct, 10) << "mirroring is not enabled for image" << dendl;
+ return 0;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ C_SaferCond ctx;
+ auto req = mirror::EnableRequest<I>::create(
+ image_ctx, mirror_image_mode, "", false, &ctx);
+ req->send();
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+// When relinking children we should be careful as the operation may be
+// interrupted at any moment for some reason and we may end up in an
+// inconsistent state, which we have to be able to fix with "migration
+// abort". Below are all possible states during migration (P1 - source
+// parent, P2 - destination parent, C - child):
+//
+//   P1  P2    P1  P2    P1  P2    P1  P2
+//   ^\         \ ^       \ /^        /^
+//    \v         v/        v/        v/
+//     C          C         C         C
+//
+//     1          2         3         4
+//
+// (1) and (4) are the initial and the final consistent states. (2)
+// and (3) are intermediate inconsistent states that have to be fixed
+// by relink_children running in "migration abort" mode. For this, it
+// scans P2 for all attached children and relinks (fixes) states (3)
+// and (4) back to state (1). Then it scans P1 for the remaining
+// children and fixes state (2).
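+//
+// In relink_child the parent link is re-attached before the child link
+// (AttachParentRequest followed by AttachChildRequest), so an interruption
+// between those two steps is what produces the intermediate states above.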
+
+template <typename I>
+int Migration<I>::relink_children(I *from_image_ctx, I *to_image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ bool migration_abort = (to_image_ctx == m_src_image_ctx);
+
+ std::vector<librbd::snap_info_t> snaps;
+ int r = list_src_snaps(
+ migration_abort ? to_image_ctx : from_image_ctx, &snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto it = snaps.begin(); it != snaps.end(); it++) {
+ auto &snap = *it;
+ std::vector<librbd::linked_image_spec_t> src_child_images;
+
+ if (from_image_ctx != m_src_image_ctx) {
+ ceph_assert(migration_abort);
+
+      // We run list snaps against the src image to get only those snapshots
+      // that are migrated. If the "from" image is not the src image
+      // (the abort migration case), we need to remap snap ids.
+      // Also collect the list of children currently attached to the
+      // source, so we can make a proper decision later about relinking.
+
+ std::shared_lock src_image_locker{to_image_ctx->image_lock};
+ cls::rbd::ParentImageSpec src_parent_spec{to_image_ctx->md_ctx.get_id(),
+ to_image_ctx->md_ctx.get_namespace(),
+ to_image_ctx->id, snap.id};
+ r = api::Image<I>::list_children(to_image_ctx, src_parent_spec,
+ &src_child_images);
+ if (r < 0) {
+ lderr(m_cct) << "failed listing children: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::shared_lock image_locker{from_image_ctx->image_lock};
+ snap.id = from_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace(),
+ snap.name);
+ if (snap.id == CEPH_NOSNAP) {
+ ldout(m_cct, 5) << "skipping snapshot " << snap.name << dendl;
+ continue;
+ }
+ }
+
+ std::vector<librbd::linked_image_spec_t> child_images;
+ {
+ std::shared_lock image_locker{from_image_ctx->image_lock};
+ cls::rbd::ParentImageSpec parent_spec{from_image_ctx->md_ctx.get_id(),
+ from_image_ctx->md_ctx.get_namespace(),
+ from_image_ctx->id, snap.id};
+ r = api::Image<I>::list_children(from_image_ctx, parent_spec,
+ &child_images);
+ if (r < 0) {
+ lderr(m_cct) << "failed listing children: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ for (auto &child_image : child_images) {
+ r = relink_child(from_image_ctx, to_image_ctx, snap, child_image,
+ migration_abort, true);
+ if (r < 0) {
+ return r;
+ }
+
+ src_child_images.erase(std::remove(src_child_images.begin(),
+ src_child_images.end(), child_image),
+ src_child_images.end());
+ }
+
+ for (auto &child_image : src_child_images) {
+ r = relink_child(from_image_ctx, to_image_ctx, snap, child_image,
+ migration_abort, false);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Migration<I>::relink_child(I *from_image_ctx, I *to_image_ctx,
+ const librbd::snap_info_t &from_snap,
+ const librbd::linked_image_spec_t &child_image,
+ bool migration_abort, bool reattach_child) {
+ ldout(m_cct, 10) << from_snap.name << " " << child_image.pool_name << "/"
+ << child_image.pool_namespace << "/"
+ << child_image.image_name << " (migration_abort="
+ << migration_abort << ", reattach_child=" << reattach_child
+ << ")" << dendl;
+
+ librados::snap_t to_snap_id;
+ {
+ std::shared_lock image_locker{to_image_ctx->image_lock};
+ to_snap_id = to_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace(),
+ from_snap.name);
+ if (to_snap_id == CEPH_NOSNAP) {
+ lderr(m_cct) << "no snapshot " << from_snap.name << " on destination image"
+ << dendl;
+ return -ENOENT;
+ }
+ }
+
+ librados::IoCtx child_io_ctx;
+ int r = util::create_ioctx(to_image_ctx->md_ctx,
+ "child image " + child_image.image_name,
+ child_image.pool_id, child_image.pool_namespace,
+ &child_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ I *child_image_ctx = I::create("", child_image.image_id, nullptr,
+ child_io_ctx, false);
+ r = child_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+ if (r < 0) {
+ lderr(m_cct) << "failed to open child image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ BOOST_SCOPE_EXIT_TPL(child_image_ctx) {
+ child_image_ctx->state->close();
+ } BOOST_SCOPE_EXIT_END;
+
+ uint32_t clone_format = 1;
+ if (child_image_ctx->test_op_features(RBD_OPERATION_FEATURE_CLONE_CHILD)) {
+ clone_format = 2;
+ }
+
+ cls::rbd::ParentImageSpec parent_spec;
+ uint64_t parent_overlap;
+ {
+ std::shared_lock image_locker{child_image_ctx->image_lock};
+
+ // use oldest snapshot or HEAD for parent spec
+ if (!child_image_ctx->snap_info.empty()) {
+ parent_spec = child_image_ctx->snap_info.begin()->second.parent.spec;
+ parent_overlap = child_image_ctx->snap_info.begin()->second.parent.overlap;
+ } else {
+ parent_spec = child_image_ctx->parent_md.spec;
+ parent_overlap = child_image_ctx->parent_md.overlap;
+ }
+ }
+
+ if (migration_abort &&
+ parent_spec.pool_id == to_image_ctx->md_ctx.get_id() &&
+ parent_spec.pool_namespace == to_image_ctx->md_ctx.get_namespace() &&
+ parent_spec.image_id == to_image_ctx->id &&
+ parent_spec.snap_id == to_snap_id) {
+ ldout(m_cct, 10) << "no need for parent re-attach" << dendl;
+ } else {
+ if (parent_spec.pool_id != from_image_ctx->md_ctx.get_id() ||
+ parent_spec.pool_namespace != from_image_ctx->md_ctx.get_namespace() ||
+ parent_spec.image_id != from_image_ctx->id ||
+ parent_spec.snap_id != from_snap.id) {
+ lderr(m_cct) << "parent is not source image: " << parent_spec.pool_id
+ << "/" << parent_spec.pool_namespace << "/"
+ << parent_spec.image_id << "@" << parent_spec.snap_id
+ << dendl;
+ return -ESTALE;
+ }
+
+ parent_spec.pool_id = to_image_ctx->md_ctx.get_id();
+ parent_spec.pool_namespace = to_image_ctx->md_ctx.get_namespace();
+ parent_spec.image_id = to_image_ctx->id;
+ parent_spec.snap_id = to_snap_id;
+
+ C_SaferCond on_reattach_parent;
+ auto reattach_parent_req = image::AttachParentRequest<I>::create(
+ *child_image_ctx, parent_spec, parent_overlap, true, &on_reattach_parent);
+ reattach_parent_req->send();
+ r = on_reattach_parent.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to re-attach parent: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ if (reattach_child) {
+ C_SaferCond on_reattach_child;
+ auto reattach_child_req = image::AttachChildRequest<I>::create(
+ child_image_ctx, to_image_ctx, to_snap_id, from_image_ctx, from_snap.id,
+ clone_format, &on_reattach_child);
+ reattach_child_req->send();
+ r = on_reattach_child.wait();
+ if (r < 0) {
+ lderr(m_cct) << "failed to re-attach child: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ child_image_ctx->notify_update();
+
+ return 0;
+}
+
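+// Remove the source image on commit: delete the migrated snapshots first,
+// then the image itself, and finally purge any leftover entry from the
+// rbd_trash object.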
+template <typename I>
+int Migration<I>::remove_src_image(I** image_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ auto src_image_ctx = *image_ctx;
+
+ std::vector<librbd::snap_info_t> snaps;
+ int r = list_src_snaps(src_image_ctx, &snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto it = snaps.rbegin(); it != snaps.rend(); it++) {
+ auto &snap = *it;
+
+ librbd::NoOpProgressContext prog_ctx;
+ int r = Snapshot<I>::remove(src_image_ctx, snap.name.c_str(),
+ RBD_SNAP_REMOVE_UNPROTECT, prog_ctx);
+ if (r < 0) {
+ lderr(m_cct) << "failed removing source image snapshot '" << snap.name
+ << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ ceph_assert(src_image_ctx->ignore_migrating);
+
+ auto asio_engine = src_image_ctx->asio_engine;
+ auto src_image_id = src_image_ctx->id;
+ librados::IoCtx src_io_ctx(src_image_ctx->md_ctx);
+
+ C_SaferCond on_remove;
+ auto req = librbd::image::RemoveRequest<I>::create(
+ src_io_ctx, src_image_ctx, false, true, *m_prog_ctx,
+ asio_engine->get_work_queue(), &on_remove);
+ req->send();
+ r = on_remove.wait();
+
+ *image_ctx = nullptr;
+
+  // For an old-format image this will return -ENOENT due to the expected
+  // tmap_rm failure at the end.
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed removing source image: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (!src_image_id.empty()) {
+ r = cls_client::trash_remove(&src_io_ctx, src_image_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error removing image " << src_image_id
+ << " from rbd_trash object" << dendl;
+ }
+ }
+
+ return 0;
+}
+
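+// Used by "migration abort" to copy the data blocks written to the
+// migration destination back to the source image. The parameters are named
+// from the copy's perspective: src_image_ctx here is the migration
+// destination, whose migration header must be of DST type.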
+template <typename I>
+int Migration<I>::revert_data(I* src_image_ctx, I* dst_image_ctx,
+ ProgressContext* prog_ctx) {
+ ldout(m_cct, 10) << dendl;
+
+ cls::rbd::MigrationSpec migration_spec;
+ int r = cls_client::migration_get(&src_image_ctx->md_ctx,
+ src_image_ctx->header_oid,
+ &migration_spec);
+
+ if (r < 0) {
+ lderr(m_cct) << "failed retrieving migration header: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST) {
+ lderr(m_cct) << "unexpected migration header type: "
+ << migration_spec.header_type << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t src_snap_id_start = 0;
+ uint64_t src_snap_id_end = CEPH_NOSNAP;
+ uint64_t dst_snap_id_start = 0;
+ if (!migration_spec.snap_seqs.empty()) {
+ src_snap_id_start = migration_spec.snap_seqs.rbegin()->second;
+ }
+
+ // we only care about the HEAD revision so only add a single mapping to
+ // represent the most recent state
+ SnapSeqs snap_seqs;
+ snap_seqs[CEPH_NOSNAP] = CEPH_NOSNAP;
+
+ ldout(m_cct, 20) << "src_snap_id_start=" << src_snap_id_start << ", "
+ << "src_snap_id_end=" << src_snap_id_end << ", "
+ << "dst_snap_id_start=" << dst_snap_id_start << ", "
+ << "snap_seqs=" << snap_seqs << dendl;
+
+ C_SaferCond ctx;
+ deep_copy::ProgressHandler progress_handler(prog_ctx);
+ auto request = deep_copy::ImageCopyRequest<I>::create(
+ src_image_ctx, dst_image_ctx, src_snap_id_start, src_snap_id_end,
+ dst_snap_id_start, false, {}, snap_seqs, &progress_handler, &ctx);
+ request->send();
+
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(m_cct) << "error reverting destination image data blocks back to "
+ << "source image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Migration<librbd::ImageCtx>;
diff --git a/src/librbd/api/Migration.h b/src/librbd/api/Migration.h
new file mode 100644
index 000000000..dd70dcc23
--- /dev/null
+++ b/src/librbd/api/Migration.h
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_MIGRATION_H
+#define CEPH_LIBRBD_API_MIGRATION_H
+
+#include "include/int_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+
+#include <vector>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class Migration {
+public:
+ static int prepare(librados::IoCtx& io_ctx, const std::string &image_name,
+ librados::IoCtx& dest_io_ctx,
+ const std::string &dest_image_name, ImageOptions& opts);
+ static int prepare_import(const std::string& source_spec,
+ librados::IoCtx& dest_io_ctx,
+ const std::string &dest_image_name,
+ ImageOptions& opts);
+ static int execute(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext &prog_ctx);
+ static int abort(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext &prog_ctx);
+ static int commit(librados::IoCtx& io_ctx, const std::string &image_name,
+ ProgressContext &prog_ctx);
+ static int status(librados::IoCtx& io_ctx, const std::string &image_name,
+ image_migration_status_t *status);
+
+ static int get_source_spec(ImageCtxT* image_ctx, std::string* source_spec);
+
+private:
+ CephContext* m_cct;
+ ImageCtx* m_src_image_ctx;
+ ImageCtx* m_dst_image_ctx;
+ librados::IoCtx m_dst_io_ctx;
+ std::string m_dst_image_name;
+ std::string m_dst_image_id;
+ std::string m_dst_header_oid;
+ ImageOptions &m_image_options;
+ bool m_flatten;
+ bool m_mirroring;
+ cls::rbd::MirrorImageMode m_mirror_image_mode;
+ ProgressContext *m_prog_ctx;
+
+ cls::rbd::MigrationSpec m_src_migration_spec;
+ cls::rbd::MigrationSpec m_dst_migration_spec;
+
+ Migration(ImageCtx* src_image_ctx, ImageCtx* dst_image_ctx,
+ const cls::rbd::MigrationSpec& dst_migration_spec,
+ ImageOptions& opts, ProgressContext *prog_ctx);
+
+ int prepare();
+ int prepare_import();
+ int execute();
+ int abort();
+ int commit();
+ int status(image_migration_status_t *status);
+
+ int set_state(ImageCtxT* image_ctx, const std::string& image_description,
+ cls::rbd::MigrationState state,
+ const std::string &description);
+ int set_state(cls::rbd::MigrationState state, const std::string &description);
+
+ int list_src_snaps(ImageCtxT* image_ctx,
+ std::vector<librbd::snap_info_t> *snaps);
+ int validate_src_snaps(ImageCtxT* image_ctx);
+ int disable_mirroring(ImageCtxT* image_ctx, bool *was_enabled,
+ cls::rbd::MirrorImageMode *mirror_image_mode);
+ int enable_mirroring(ImageCtxT* image_ctx, bool was_enabled,
+ cls::rbd::MirrorImageMode mirror_image_mode);
+ int set_src_migration(ImageCtxT* image_ctx);
+ int unlink_src_image(ImageCtxT* image_ctx);
+ int relink_src_image(ImageCtxT* image_ctx);
+ int create_dst_image(ImageCtxT** image_ctx);
+ int remove_group(ImageCtxT* image_ctx, group_info_t *group_info);
+ int add_group(ImageCtxT* image_ctx, group_info_t &group_info);
+ int update_group(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx);
+ int remove_migration(ImageCtxT* image_ctx);
+ int relink_children(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx);
+ int remove_src_image(ImageCtxT** image_ctx);
+
+ int v1_set_src_migration(ImageCtxT* image_ctx);
+ int v2_set_src_migration(ImageCtxT* image_ctx);
+ int v1_unlink_src_image(ImageCtxT* image_ctx);
+ int v2_unlink_src_image(ImageCtxT* image_ctx);
+ int v1_relink_src_image(ImageCtxT* image_ctx);
+ int v2_relink_src_image(ImageCtxT* image_ctx);
+
+ int relink_child(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx,
+ const librbd::snap_info_t &src_snap,
+ const librbd::linked_image_spec_t &child_image,
+ bool migration_abort, bool reattach_child);
+
+ int revert_data(ImageCtxT* src_image_ctx, ImageCtxT* dst_image_ctx,
+ ProgressContext *prog_ctx);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Migration<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_MIGRATION_H
diff --git a/src/librbd/api/Mirror.cc b/src/librbd/api/Mirror.cc
new file mode 100644
index 000000000..bbaa4eff7
--- /dev/null
+++ b/src/librbd/api/Mirror.cc
@@ -0,0 +1,2089 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Mirror.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "common/ceph_json.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Namespace.h"
+#include "librbd/mirror/DemoteRequest.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "librbd/mirror/GetStatusRequest.h"
+#include "librbd/mirror/GetUuidRequest.h"
+#include "librbd/mirror/PromoteRequest.h"
+#include "librbd/mirror/Types.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/mirror/snapshot/CreatePrimaryRequest.h"
+#include "librbd/mirror/snapshot/ImageMeta.h"
+#include "librbd/mirror/snapshot/UnlinkPeerRequest.h"
+#include "librbd/mirror/snapshot/Utils.h"
+#include <boost/algorithm/string/trim.hpp>
+#include <boost/algorithm/string/replace.hpp>
+#include <boost/scope_exit.hpp>
+#include "json_spirit/json_spirit.h"
+
+#include <algorithm>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Mirror: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
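+// Helpers wrapping the monitor "config-key" commands. Requests are issued
+// as JSON mon commands of the form (key name illustrative):
+//
+//   {"prefix": "config-key get", "key": "<key>"}
+//
+// -EINVAL from the monitor is mapped to -EOPNOTSUPP so callers can detect
+// monitors that lack config-key support.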
+int get_config_key(librados::Rados& rados, const std::string& key,
+ std::string* value) {
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config-key get\", "
+ "\"key\": \"" + key + "\""
+ "}";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+
+ int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr);
+ if (r == -EINVAL) {
+ return -EOPNOTSUPP;
+ } else if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+
+ *value = out_bl.to_str();
+ return 0;
+}
+
+int set_config_key(librados::Rados& rados, const std::string& key,
+ const std::string& value) {
+ std::string cmd;
+ if (value.empty()) {
+ cmd = "{"
+ "\"prefix\": \"config-key rm\", "
+ "\"key\": \"" + key + "\""
+ "}";
+ } else {
+ cmd = "{"
+ "\"prefix\": \"config-key set\", "
+ "\"key\": \"" + key + "\", "
+ "\"val\": \"" + value + "\""
+ "}";
+ }
+ bufferlist in_bl;
+ bufferlist out_bl;
+
+ int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr);
+ if (r == -EINVAL) {
+ return -EOPNOTSUPP;
+ } else if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+std::string get_peer_config_key_name(int64_t pool_id,
+ const std::string& peer_uuid) {
+ return RBD_MIRROR_PEER_CONFIG_KEY_PREFIX + stringify(pool_id) + "/" +
+ peer_uuid;
+}
+
+int remove_peer_config_key(librados::IoCtx& io_ctx,
+ const std::string& peer_uuid) {
+ int64_t pool_id = io_ctx.get_id();
+ auto key = get_peer_config_key_name(pool_id, peer_uuid);
+
+ librados::Rados rados(io_ctx);
+ int r = set_config_key(rados, key, "");
+ if (r < 0 && r != -ENOENT && r != -EPERM) {
+ return r;
+ }
+ return 0;
+}
+
+std::string get_mon_host(CephContext* cct) {
+ std::string mon_host;
+ if (auto mon_addrs = cct->get_mon_addrs();
+ mon_addrs != nullptr && !mon_addrs->empty()) {
+ CachedStackStringStream css;
+ for (auto it = mon_addrs->begin(); it != mon_addrs->end(); ++it) {
+ if (it != mon_addrs->begin()) {
+ *css << ",";
+ }
+ *css << *it;
+ }
+ mon_host = css->str();
+ } else {
+ ldout(cct, 20) << "falling back to mon_host in conf" << dendl;
+ mon_host = cct->_conf.get_val<std::string>("mon_host");
+ }
+ ldout(cct, 20) << "mon_host=" << mon_host << dendl;
+ return mon_host;
+}
+
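+// Ensure the shared rbd-mirror peer CephX user exists: its client id is
+// persisted in a config-key (created on first use), the user itself is
+// created via an "auth get-or-create" mon command with the
+// rbd-mirror-peer/rbd profiles, and its secret key is extracted from the
+// JSON response.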
+int create_bootstrap_user(CephContext* cct, librados::Rados& rados,
+ std::string* peer_client_id, std::string* cephx_key) {
+ ldout(cct, 20) << dendl;
+
+ // retrieve peer CephX user from config-key
+ int r = get_config_key(rados, RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY,
+ peer_client_id);
+ if (r == -EACCES) {
+ ldout(cct, 5) << "insufficient permissions to get peer-client-id "
+ << "config-key" << dendl;
+ return r;
+ } else if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve peer client id key: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ } else if (r == -ENOENT || peer_client_id->empty()) {
+ ldout(cct, 20) << "creating new peer-client-id config-key" << dendl;
+
+ *peer_client_id = "rbd-mirror-peer";
+ r = set_config_key(rados, RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY,
+ *peer_client_id);
+ if (r == -EACCES) {
+ ldout(cct, 5) << "insufficient permissions to update peer-client-id "
+ << "config-key" << dendl;
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to update peer client id key: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ ldout(cct, 20) << "peer_client_id=" << *peer_client_id << dendl;
+
+ // create peer client user
+ std::string cmd =
+ R"({)" \
+ R"( "prefix": "auth get-or-create",)" \
+ R"( "entity": "client.)" + *peer_client_id + R"(",)" \
+ R"( "caps": [)" \
+ R"( "mon", "profile rbd-mirror-peer",)" \
+ R"( "osd", "profile rbd"],)" \
+ R"( "format": "json")" \
+ R"(})";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+
+ r = rados.mon_command(cmd, in_bl, &out_bl, nullptr);
+ if (r == -EINVAL) {
+ ldout(cct, 5) << "caps mismatch for existing user" << dendl;
+ return -EEXIST;
+ } else if (r == -EACCES) {
+ ldout(cct, 5) << "insufficient permissions to create user" << dendl;
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to create or update RBD mirroring bootstrap user: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // extract key from response
+ bool json_valid = false;
+ json_spirit::mValue json_root;
+  if (json_spirit::read(out_bl.to_str(), json_root)) {
+ try {
+ auto& json_obj = json_root.get_array()[0].get_obj();
+ *cephx_key = json_obj["key"].get_str();
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ if (!json_valid) {
+ lderr(cct) << "invalid auth keyring JSON received" << dendl;
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
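+// Register the remote cluster as this pool's single mirror peer (multiple
+// peers are rejected), update the site name if it changed, and store the
+// mon_host/key connection attributes for the peer.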
+int create_bootstrap_peer(CephContext* cct, librados::IoCtx& io_ctx,
+ mirror_peer_direction_t direction,
+ const std::string& site_name, const std::string& fsid,
+ const std::string& client_id, const std::string& key,
+ const std::string& mon_host,
+ const std::string& cluster1,
+ const std::string& cluster2) {
+ ldout(cct, 20) << dendl;
+
+ std::string peer_uuid;
+ std::vector<mirror_peer_site_t> peers;
+ int r = Mirror<>::peer_site_list(io_ctx, &peers);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list mirror peers: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (peers.empty()) {
+ r = Mirror<>::peer_site_add(io_ctx, &peer_uuid, direction, site_name,
+ "client." + client_id);
+ if (r < 0) {
+      lderr(cct) << "failed to add " << cluster1 << " peer to "
+                 << cluster2 << " cluster: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else if (peers[0].site_name != site_name &&
+ peers[0].site_name != fsid) {
+ // only support a single peer
+ lderr(cct) << "multiple peers are not currently supported" << dendl;
+ return -EINVAL;
+ } else {
+ peer_uuid = peers[0].uuid;
+
+ if (peers[0].site_name != site_name) {
+ r = Mirror<>::peer_site_set_name(io_ctx, peer_uuid, site_name);
+ if (r < 0) {
+ // non-fatal attempt to update site name
+ lderr(cct) << "failed to update peer site name" << dendl;
+ }
+ }
+ }
+
+ Mirror<>::Attributes attributes {
+ {"mon_host", mon_host},
+ {"key", key}};
+ r = Mirror<>::peer_site_set_attributes(io_ctx, peer_uuid, attributes);
+ if (r < 0) {
+ lderr(cct) << "failed to update " << cluster1 << " cluster connection "
+ << "attributes in " << cluster2 << " cluster: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
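+// Collect the ids of all mirrored images in the pool by paging through the
+// mirror image directory, 1024 entries at a time.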
+int list_mirror_images(librados::IoCtx& io_ctx,
+ std::set<std::string>& mirror_image_ids) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ std::string last_read = "";
+ int max_read = 1024;
+ int r;
+ do {
+ std::map<std::string, std::string> mirror_images;
+ r = cls_client::mirror_image_list(&io_ctx, last_read, max_read,
+ &mirror_images);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error listing mirrored image directory: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ for (auto it = mirror_images.begin(); it != mirror_images.end(); ++it) {
+ mirror_image_ids.insert(it->first);
+ }
+ if (!mirror_images.empty()) {
+ last_read = mirror_images.rbegin()->first;
+ }
+ r = mirror_images.size();
+ } while (r == max_read);
+
+ return 0;
+}
+
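+// Completion contexts that translate the internal cls mirror image and
+// promotion state into the public mirror_image_info_t /
+// mirror_image_global_status_t structures; -ENOENT is treated as
+// "mirroring disabled" rather than as an error.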
+struct C_ImageGetInfo : public Context {
+ mirror_image_info_t *mirror_image_info;
+ mirror_image_mode_t *mirror_image_mode;
+ Context *on_finish;
+
+ cls::rbd::MirrorImage mirror_image;
+ mirror::PromotionState promotion_state = mirror::PROMOTION_STATE_PRIMARY;
+ std::string primary_mirror_uuid;
+
+ C_ImageGetInfo(mirror_image_info_t *mirror_image_info,
+ mirror_image_mode_t *mirror_image_mode, Context *on_finish)
+ : mirror_image_info(mirror_image_info),
+ mirror_image_mode(mirror_image_mode), on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ if (r < 0 && r != -ENOENT) {
+ on_finish->complete(r);
+ return;
+ }
+
+ if (mirror_image_info != nullptr) {
+ mirror_image_info->global_id = mirror_image.global_image_id;
+ mirror_image_info->state = static_cast<rbd_mirror_image_state_t>(
+ mirror_image.state);
+ mirror_image_info->primary = (
+ promotion_state == mirror::PROMOTION_STATE_PRIMARY);
+ }
+
+ if (mirror_image_mode != nullptr) {
+ *mirror_image_mode =
+ static_cast<rbd_mirror_image_mode_t>(mirror_image.mode);
+ }
+
+ on_finish->complete(0);
+ }
+};
+
+struct C_ImageGetGlobalStatus : public C_ImageGetInfo {
+ std::string image_name;
+ mirror_image_global_status_t *mirror_image_global_status;
+
+ cls::rbd::MirrorImageStatus mirror_image_status_internal;
+
+ C_ImageGetGlobalStatus(
+ const std::string &image_name,
+ mirror_image_global_status_t *mirror_image_global_status,
+ Context *on_finish)
+ : C_ImageGetInfo(&mirror_image_global_status->info, nullptr, on_finish),
+ image_name(image_name),
+ mirror_image_global_status(mirror_image_global_status) {
+ }
+
+ void finish(int r) override {
+ if (r < 0 && r != -ENOENT) {
+ on_finish->complete(r);
+ return;
+ }
+
+ mirror_image_global_status->name = image_name;
+ mirror_image_global_status->site_statuses.clear();
+ mirror_image_global_status->site_statuses.reserve(
+ mirror_image_status_internal.mirror_image_site_statuses.size());
+ for (auto& site_status :
+ mirror_image_status_internal.mirror_image_site_statuses) {
+ mirror_image_global_status->site_statuses.push_back({
+ site_status.mirror_uuid,
+ static_cast<mirror_image_status_state_t>(site_status.state),
+ site_status.description, site_status.last_update.sec(),
+ site_status.up});
+ }
+ C_ImageGetInfo::finish(0);
+ }
+};
+
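+// Once the mirror info has been retrieved, verify the image uses
+// snapshot-based mirroring and create a new primary mirror snapshot.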
+template <typename I>
+struct C_ImageSnapshotCreate : public Context {
+ I *ictx;
+ uint64_t snap_create_flags;
+ uint64_t *snap_id;
+ Context *on_finish;
+
+ cls::rbd::MirrorImage mirror_image;
+ mirror::PromotionState promotion_state;
+ std::string primary_mirror_uuid;
+
+ C_ImageSnapshotCreate(I *ictx, uint64_t snap_create_flags, uint64_t *snap_id,
+ Context *on_finish)
+ : ictx(ictx), snap_create_flags(snap_create_flags), snap_id(snap_id),
+ on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ if (r < 0 && r != -ENOENT) {
+ on_finish->complete(r);
+ return;
+ }
+
+ if (mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT ||
+ mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ lderr(ictx->cct) << "snapshot based mirroring is not enabled" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto req = mirror::snapshot::CreatePrimaryRequest<I>::create(
+ ictx, mirror_image.global_image_id, CEPH_NOSNAP, snap_create_flags, 0U,
+ snap_id, on_finish);
+ req->send();
+ }
+};
+
+} // anonymous namespace
+
+template <typename I>
+int Mirror<I>::image_enable(I *ictx, mirror_image_mode_t mode,
+ bool relax_same_pool_parent_check) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << " mode=" << mode
+ << " relax_same_pool_parent_check="
+ << relax_same_pool_parent_check << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::MirrorMode mirror_mode;
+ r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode);
+ if (r < 0) {
+ lderr(cct) << "cannot enable mirroring: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+ lderr(cct) << "cannot enable mirroring in the current pool mirroring mode"
+ << dendl;
+ return -EINVAL;
+ }
+
+  // ensure mirroring is enabled on the parent image, if there is one
+ {
+ std::shared_lock image_locker{ictx->image_lock};
+ ImageCtx *parent = ictx->parent;
+ if (parent) {
+ if (parent->md_ctx.get_id() != ictx->md_ctx.get_id() ||
+ !relax_same_pool_parent_check) {
+ cls::rbd::MirrorImage mirror_image_internal;
+ r = cls_client::mirror_image_get(&(parent->md_ctx), parent->id,
+ &mirror_image_internal);
+ if (r == -ENOENT) {
+ lderr(cct) << "mirroring is not enabled for the parent" << dendl;
+ return -EINVAL;
+ }
+ }
+ }
+ }
+
+ if (mode == RBD_MIRROR_IMAGE_MODE_JOURNAL &&
+ !ictx->test_features(RBD_FEATURE_JOURNALING)) {
+ uint64_t features = RBD_FEATURE_JOURNALING;
+ if (!ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+ features |= RBD_FEATURE_EXCLUSIVE_LOCK;
+ }
+ r = ictx->operations->update_features(features, true);
+ if (r < 0) {
+ lderr(cct) << "cannot enable journaling: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ C_SaferCond ctx;
+ auto req = mirror::EnableRequest<ImageCtx>::create(
+ ictx, static_cast<cls::rbd::MirrorImageMode>(mode), "", false, &ctx);
+ req->send();
+
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "cannot enable mirroring: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
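+// Disable mirroring for an image: mark the mirror image DISABLING, refuse
+// with -EBUSY if any clone child still has mirroring enabled, and roll the
+// state back to ENABLED (via the scope guard) if a later step fails. For
+// snapshot-based images the non-primary read-only mask is lifted so the
+// mirror image-meta can be removed; for journal-based images the journaling
+// feature is disabled afterwards on a best-effort basis.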
+template <typename I>
+int Mirror<I>::image_disable(I *ictx, bool force) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::MirrorMode mirror_mode;
+ r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode);
+ if (r < 0) {
+ lderr(cct) << "cannot disable mirroring: failed to retrieve pool "
+ "mirroring mode: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+ lderr(cct) << "cannot disable mirroring in the current pool mirroring "
+ "mode" << dendl;
+ return -EINVAL;
+ }
+
+ // is mirroring enabled for the image?
+ cls::rbd::MirrorImage mirror_image_internal;
+ r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id,
+ &mirror_image_internal);
+ if (r == -ENOENT) {
+ // mirroring is not enabled for this image
+ ldout(cct, 20) << "ignoring disable command: mirroring is not enabled for "
+ << "this image" << dendl;
+ return 0;
+ } else if (r == -EOPNOTSUPP) {
+ ldout(cct, 5) << "mirroring not supported by OSD" << dendl;
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve mirror image metadata: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
+ r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id,
+ mirror_image_internal);
+ if (r < 0) {
+ lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ bool rollback = false;
+ BOOST_SCOPE_EXIT_ALL(ictx, &mirror_image_internal, &rollback) {
+ if (rollback) {
+ // restore the mask bit for treating the non-primary feature as read-only
+ ictx->image_lock.lock();
+ ictx->read_only_mask |= IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ ictx->image_lock.unlock();
+
+ ictx->state->handle_update_notification();
+
+ // attempt to restore the image state
+ CephContext *cct = ictx->cct;
+ mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_ENABLED;
+ int r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id,
+ mirror_image_internal);
+ if (r < 0) {
+ lderr(cct) << "failed to re-enable image mirroring: "
+ << cpp_strerror(r) << dendl;
+ }
+ }
+ };
+
+ std::unique_lock image_locker{ictx->image_lock};
+ map<librados::snap_t, SnapInfo> snap_info = ictx->snap_info;
+ for (auto &info : snap_info) {
+ cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(),
+ ictx->md_ctx.get_namespace(),
+ ictx->id, info.first};
+ std::vector<librbd::linked_image_spec_t> child_images;
+ r = Image<I>::list_children(ictx, parent_spec, &child_images);
+ if (r < 0) {
+ rollback = true;
+ return r;
+ }
+
+ if (child_images.empty()) {
+ continue;
+ }
+
+ librados::IoCtx child_io_ctx;
+ int64_t child_pool_id = -1;
+ for (auto &child_image : child_images){
+ std::string pool = child_image.pool_name;
+ if (child_pool_id == -1 ||
+ child_pool_id != child_image.pool_id ||
+ child_io_ctx.get_namespace() != child_image.pool_namespace) {
+ r = util::create_ioctx(ictx->md_ctx, "child image",
+ child_image.pool_id,
+ child_image.pool_namespace,
+ &child_io_ctx);
+ if (r < 0) {
+ rollback = true;
+ return r;
+ }
+
+ child_pool_id = child_image.pool_id;
+ }
+
+ cls::rbd::MirrorImage child_mirror_image_internal;
+ r = cls_client::mirror_image_get(&child_io_ctx, child_image.image_id,
+ &child_mirror_image_internal);
+ if (r != -ENOENT) {
+ rollback = true;
+        lderr(cct) << "mirroring is enabled on one or more children"
+                   << dendl;
+ return -EBUSY;
+ }
+ }
+ }
+ image_locker.unlock();
+
+ if (mirror_image_internal.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ // don't let the non-primary feature bit prevent image updates
+ ictx->image_lock.lock();
+ ictx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ ictx->image_lock.unlock();
+
+ r = ictx->state->refresh();
+ if (r < 0) {
+ rollback = true;
+ return r;
+ }
+
+ // remove any snapshot-based mirroring image-meta from image
+ std::string mirror_uuid;
+ r = uuid_get(ictx->md_ctx, &mirror_uuid);
+ if (r < 0) {
+ rollback = true;
+ return r;
+ }
+
+ r = ictx->operations->metadata_remove(
+ mirror::snapshot::util::get_image_meta_key(mirror_uuid));
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "cannot remove snapshot image-meta key: " << cpp_strerror(r)
+ << dendl;
+ rollback = true;
+ return r;
+ }
+ }
+
+ C_SaferCond ctx;
+ auto req = mirror::DisableRequest<ImageCtx>::create(ictx, force, true,
+ &ctx);
+ req->send();
+
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
+ rollback = true;
+ return r;
+ }
+
+ if (mirror_image_internal.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) {
+ r = ictx->operations->update_features(RBD_FEATURE_JOURNALING, false);
+ if (r < 0) {
+ lderr(cct) << "cannot disable journaling: " << cpp_strerror(r) << dendl;
+ // not fatal
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::image_promote(I *ictx, bool force) {
+ CephContext *cct = ictx->cct;
+
+ C_SaferCond ctx;
+ Mirror<I>::image_promote(ictx, force, &ctx);
+ int r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to promote image" << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void Mirror<I>::image_promote(I *ictx, bool force, Context *on_finish) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << ", "
+ << "force=" << force << dendl;
+
+ // don't let the non-primary feature bit prevent image updates
+ ictx->image_lock.lock();
+ ictx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ ictx->image_lock.unlock();
+
+ auto on_promote = new LambdaContext([ictx, on_finish](int r) {
+ ictx->image_lock.lock();
+ ictx->read_only_mask |= IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ ictx->image_lock.unlock();
+
+ ictx->state->handle_update_notification();
+ on_finish->complete(r);
+ });
+
+ auto on_refresh = new LambdaContext([ictx, force, on_promote](int r) {
+ if (r < 0) {
+ lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl;
+ on_promote->complete(r);
+ return;
+ }
+
+ auto req = mirror::PromoteRequest<>::create(*ictx, force, on_promote);
+ req->send();
+ });
+ ictx->state->refresh(on_refresh);
+}
+
+template <typename I>
+int Mirror<I>::image_demote(I *ictx) {
+ CephContext *cct = ictx->cct;
+
+ C_SaferCond ctx;
+ Mirror<I>::image_demote(ictx, &ctx);
+ int r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to demote image" << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void Mirror<I>::image_demote(I *ictx, Context *on_finish) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ auto on_cleanup = new LambdaContext([ictx, on_finish](int r) {
+ ictx->image_lock.lock();
+ ictx->read_only_mask |= IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ ictx->image_lock.unlock();
+
+ ictx->state->handle_update_notification();
+
+ on_finish->complete(r);
+ });
+ auto on_refresh = new LambdaContext([ictx, on_cleanup](int r) {
+ if (r < 0) {
+ lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl;
+ on_cleanup->complete(r);
+ return;
+ }
+
+ auto req = mirror::DemoteRequest<>::create(*ictx, on_cleanup);
+ req->send();
+ });
+
+ // ensure we can create a snapshot after setting the non-primary
+ // feature bit
+ ictx->image_lock.lock();
+ ictx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ ictx->image_lock.unlock();
+
+ ictx->state->refresh(on_refresh);
+}
+
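+// Request a resync of a non-primary image: journal-based images flag the
+// resync in the journal, while snapshot-based images set resync_requested
+// in the mirror image-meta. Primary images are rejected since they cannot
+// resync to themselves.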
+template <typename I>
+int Mirror<I>::image_resync(I *ictx) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::MirrorImage mirror_image;
+ mirror::PromotionState promotion_state;
+ std::string primary_mirror_uuid;
+ C_SaferCond get_info_ctx;
+ auto req = mirror::GetInfoRequest<I>::create(*ictx, &mirror_image,
+ &promotion_state,
+ &primary_mirror_uuid,
+ &get_info_ctx);
+ req->send();
+
+ r = get_info_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ if (promotion_state == mirror::PROMOTION_STATE_PRIMARY) {
+ lderr(cct) << "image is primary, cannot resync to itself" << dendl;
+ return -EINVAL;
+ }
+
+ if (mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) {
+ // flag the journal indicating that we want to rebuild the local image
+ r = Journal<I>::request_resync(ictx);
+ if (r < 0) {
+ lderr(cct) << "failed to request resync: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else if (mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ std::string mirror_uuid;
+ r = uuid_get(ictx->md_ctx, &mirror_uuid);
+ if (r < 0) {
+ return r;
+ }
+
+ mirror::snapshot::ImageMeta image_meta(ictx, mirror_uuid);
+
+ C_SaferCond load_meta_ctx;
+ image_meta.load(&load_meta_ctx);
+ r = load_meta_ctx.wait();
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to load mirror image-meta: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ image_meta.resync_requested = true;
+
+ C_SaferCond save_meta_ctx;
+ image_meta.save(&save_meta_ctx);
+ r = save_meta_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to request resync: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else {
+ lderr(cct) << "unknown mirror mode" << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void Mirror<I>::image_get_info(I *ictx, mirror_image_info_t *mirror_image_info,
+ Context *on_finish) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ auto on_refresh = new LambdaContext(
+ [ictx, mirror_image_info, on_finish](int r) {
+ if (r < 0) {
+ lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ auto ctx = new C_ImageGetInfo(mirror_image_info, nullptr, on_finish);
+ auto req = mirror::GetInfoRequest<I>::create(*ictx, &ctx->mirror_image,
+ &ctx->promotion_state,
+ &ctx->primary_mirror_uuid,
+ ctx);
+ req->send();
+ });
+
+ if (ictx->state->is_refresh_required()) {
+ ictx->state->refresh(on_refresh);
+ } else {
+ on_refresh->complete(0);
+ }
+}
+
+template <typename I>
+int Mirror<I>::image_get_info(I *ictx, mirror_image_info_t *mirror_image_info) {
+ C_SaferCond ctx;
+ image_get_info(ictx, mirror_image_info, &ctx);
+
+ int r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+void Mirror<I>::image_get_info(librados::IoCtx& io_ctx,
+ asio::ContextWQ *op_work_queue,
+ const std::string &image_id,
+ mirror_image_info_t *mirror_image_info,
+ Context *on_finish) {
+ auto cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "pool_id=" << io_ctx.get_id() << ", image_id=" << image_id
+ << dendl;
+
+ auto ctx = new C_ImageGetInfo(mirror_image_info, nullptr, on_finish);
+ auto req = mirror::GetInfoRequest<I>::create(io_ctx, op_work_queue, image_id,
+ &ctx->mirror_image,
+ &ctx->promotion_state,
+ &ctx->primary_mirror_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+int Mirror<I>::image_get_info(librados::IoCtx& io_ctx,
+ asio::ContextWQ *op_work_queue,
+ const std::string &image_id,
+ mirror_image_info_t *mirror_image_info) {
+ C_SaferCond ctx;
+ image_get_info(io_ctx, op_work_queue, image_id, mirror_image_info, &ctx);
+
+ int r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+void Mirror<I>::image_get_mode(I *ictx, mirror_image_mode_t *mode,
+ Context *on_finish) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ auto ctx = new C_ImageGetInfo(nullptr, mode, on_finish);
+ auto req = mirror::GetInfoRequest<I>::create(*ictx, &ctx->mirror_image,
+ &ctx->promotion_state,
+ &ctx->primary_mirror_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+int Mirror<I>::image_get_mode(I *ictx, mirror_image_mode_t *mode) {
+ C_SaferCond ctx;
+ image_get_mode(ictx, mode, &ctx);
+
+ int r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+void Mirror<I>::image_get_global_status(I *ictx,
+ mirror_image_global_status_t *status,
+ Context *on_finish) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ auto ctx = new C_ImageGetGlobalStatus(ictx->name, status, on_finish);
+ auto req = mirror::GetStatusRequest<I>::create(
+ *ictx, &ctx->mirror_image_status_internal, &ctx->mirror_image,
+ &ctx->promotion_state, ctx);
+ req->send();
+}
+
+template <typename I>
+int Mirror<I>::image_get_global_status(I *ictx,
+ mirror_image_global_status_t *status) {
+ C_SaferCond ctx;
+ image_get_global_status(ictx, status, &ctx);
+
+ int r = ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::image_get_instance_id(I *ictx, std::string *instance_id) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ cls::rbd::MirrorImage mirror_image;
+ int r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, &mirror_image);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ } else if (mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ lderr(cct) << "mirroring is not currently enabled" << dendl;
+ return -EINVAL;
+ }
+
+ entity_inst_t instance;
+ r = cls_client::mirror_image_instance_get(&ictx->md_ctx,
+ mirror_image.global_image_id,
+ &instance);
+ if (r < 0) {
+ if (r != -ENOENT && r != -ESTALE) {
+ lderr(cct) << "failed to get mirror image instance: " << cpp_strerror(r)
+ << dendl;
+ }
+ return r;
+ }
+
+ *instance_id = stringify(instance.name.num());
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::site_name_get(librados::Rados& rados, std::string* name) {
+ CephContext *cct = reinterpret_cast<CephContext *>(rados.cct());
+ ldout(cct, 20) << dendl;
+
+ int r = get_config_key(rados, RBD_MIRROR_SITE_NAME_CONFIG_KEY, name);
+ if (r == -EOPNOTSUPP) {
+ return r;
+ } else if (r == -ENOENT || name->empty()) {
+ // default to the cluster fsid
+ r = rados.cluster_fsid(name);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r)
+ << dendl;
+ }
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve site name: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::site_name_set(librados::Rados& rados, const std::string& name) {
+ CephContext *cct = reinterpret_cast<CephContext *>(rados.cct());
+
+ std::string site_name{name};
+ boost::algorithm::trim(site_name);
+ ldout(cct, 20) << "site_name=" << site_name << dendl;
+
+ int r = set_config_key(rados, RBD_MIRROR_SITE_NAME_CONFIG_KEY, name);
+ if (r == -EOPNOTSUPP) {
+ return r;
+ } else if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to update site name: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::mode_get(librados::IoCtx& io_ctx,
+ rbd_mirror_mode_t *mirror_mode) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ cls::rbd::MirrorMode mirror_mode_internal;
+ int r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode_internal);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ switch (mirror_mode_internal) {
+ case cls::rbd::MIRROR_MODE_DISABLED:
+ case cls::rbd::MIRROR_MODE_IMAGE:
+ case cls::rbd::MIRROR_MODE_POOL:
+ *mirror_mode = static_cast<rbd_mirror_mode_t>(mirror_mode_internal);
+ break;
+ default:
+ lderr(cct) << "unknown mirror mode ("
+ << static_cast<uint32_t>(mirror_mode_internal) << ")"
+ << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
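+// Change the pool mirroring mode. Disabling requires all peers to have been
+// removed first. Transitions pass through IMAGE mode: switching to POOL
+// enables journal-based mirroring on every image with the journaling
+// feature, while disabling iterates over all mirrored images (retrying on
+// -EBUSY) until mirroring is disabled on each of them.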
+template <typename I>
+int Mirror<I>::mode_set(librados::IoCtx& io_ctx,
+ rbd_mirror_mode_t mirror_mode) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ cls::rbd::MirrorMode next_mirror_mode;
+ switch (mirror_mode) {
+ case RBD_MIRROR_MODE_DISABLED:
+ case RBD_MIRROR_MODE_IMAGE:
+ case RBD_MIRROR_MODE_POOL:
+ next_mirror_mode = static_cast<cls::rbd::MirrorMode>(mirror_mode);
+ break;
+ default:
+ lderr(cct) << "unknown mirror mode ("
+ << static_cast<uint32_t>(mirror_mode) << ")" << dendl;
+ return -EINVAL;
+ }
+
+ int r;
+ if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+ // fail early if pool still has peers registered and attempting to disable
+ std::vector<cls::rbd::MirrorPeer> mirror_peers;
+ r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list peers: " << cpp_strerror(r) << dendl;
+ return r;
+ } else if (!mirror_peers.empty()) {
+ lderr(cct) << "mirror peers still registered" << dendl;
+ return -EBUSY;
+ }
+ }
+
+ cls::rbd::MirrorMode current_mirror_mode;
+ r = cls_client::mirror_mode_get(&io_ctx, &current_mirror_mode);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (current_mirror_mode == next_mirror_mode) {
+ return 0;
+ } else if (current_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+ r = cls_client::mirror_uuid_set(&io_ctx, uuid_gen.to_string());
+ if (r < 0) {
+ lderr(cct) << "failed to allocate mirroring uuid: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ if (current_mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+ r = cls_client::mirror_mode_set(&io_ctx, cls::rbd::MIRROR_MODE_IMAGE);
+ if (r < 0) {
+ lderr(cct) << "failed to set mirror mode to image: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = MirroringWatcher<>::notify_mode_updated(io_ctx,
+ cls::rbd::MIRROR_MODE_IMAGE);
+ if (r < 0) {
+ lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+ << dendl;
+ }
+ }
+
+ if (next_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) {
+ return 0;
+ }
+
+ if (next_mirror_mode == cls::rbd::MIRROR_MODE_POOL) {
+ map<string, string> images;
+ r = Image<I>::list_images_v2(io_ctx, &images);
+ if (r < 0) {
+ lderr(cct) << "failed listing images: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto& img_pair : images) {
+ uint64_t features;
+ uint64_t incompatible_features;
+ r = cls_client::get_features(&io_ctx, util::header_name(img_pair.second),
+ true, &features, &incompatible_features);
+ if (r < 0) {
+ lderr(cct) << "error getting features for image " << img_pair.first
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // enable mirroring only for journal-based images
+ if ((features & RBD_FEATURE_JOURNALING) != 0) {
+ I *img_ctx = I::create("", img_pair.second, nullptr, io_ctx, false);
+ r = img_ctx->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "error opening image "<< img_pair.first << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = image_enable(img_ctx, RBD_MIRROR_IMAGE_MODE_JOURNAL, true);
+ int close_r = img_ctx->state->close();
+ if (r < 0) {
+ lderr(cct) << "error enabling mirroring for image "
+ << img_pair.first << ": " << cpp_strerror(r) << dendl;
+ return r;
+ } else if (close_r < 0) {
+ lderr(cct) << "failed to close image " << img_pair.first << ": "
+ << cpp_strerror(close_r) << dendl;
+ return close_r;
+ }
+ }
+ }
+ } else if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+ while (true) {
+ bool retry_busy = false;
+ bool pending_busy = false;
+
+ std::set<std::string> image_ids;
+ r = list_mirror_images(io_ctx, image_ids);
+ if (r < 0) {
+ lderr(cct) << "failed listing images: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto& img_id : image_ids) {
+ if (current_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) {
+ cls::rbd::MirrorImage mirror_image;
+ r = cls_client::mirror_image_get(&io_ctx, img_id, &mirror_image);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirroring state for image id "
+ << img_id << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ lderr(cct) << "failed to disable mirror mode: there are still "
+ << "images with mirroring enabled" << dendl;
+ return -EINVAL;
+ }
+ } else {
+ I *img_ctx = I::create("", img_id, nullptr, io_ctx, false);
+ r = img_ctx->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "error opening image id "<< img_id << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = image_disable(img_ctx, false);
+ int close_r = img_ctx->state->close();
+ if (r == -EBUSY) {
+ pending_busy = true;
+ } else if (r < 0) {
+ lderr(cct) << "error disabling mirroring for image id " << img_id
+ << cpp_strerror(r) << dendl;
+ return r;
+ } else if (close_r < 0) {
+ lderr(cct) << "failed to close image id " << img_id << ": "
+ << cpp_strerror(close_r) << dendl;
+ return close_r;
+ } else if (pending_busy) {
+ // at least one mirrored image was successfully disabled, so we can
+ // retry any failures caused by busy parent/child relationships
+ retry_busy = true;
+ }
+ }
+ }
+
+ if (!retry_busy && pending_busy) {
+ lderr(cct) << "error disabling mirroring for one or more images"
+ << dendl;
+ return -EBUSY;
+ } else if (!retry_busy) {
+ break;
+ }
+ }
+ }
+
+ r = cls_client::mirror_mode_set(&io_ctx, next_mirror_mode);
+ if (r < 0) {
+ lderr(cct) << "failed to set mirror mode: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = MirroringWatcher<>::notify_mode_updated(io_ctx, next_mirror_mode);
+ if (r < 0) {
+ lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+ << dendl;
+ }
+ return 0;
+}
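+
+ // Caller-side sketch (illustrative only): disabling pool mirroring fails
+ // with -EBUSY while peers remain registered, so peers must be removed
+ // first.
+ //
+ //   int r = librbd::api::Mirror<>::mode_set(io_ctx,
+ //                                           RBD_MIRROR_MODE_DISABLED);
+ //   if (r == -EBUSY) {
+ //     // remove registered peers via peer_site_remove() and retry
+ //   }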
+
+template <typename I>
+int Mirror<I>::uuid_get(librados::IoCtx& io_ctx, std::string* mirror_uuid) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ C_SaferCond ctx;
+ uuid_get(io_ctx, mirror_uuid, &ctx);
+ int r = ctx.wait();
+ if (r < 0) {
+ if (r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirroring uuid: " << cpp_strerror(r)
+ << dendl;
+ }
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void Mirror<I>::uuid_get(librados::IoCtx& io_ctx, std::string* mirror_uuid,
+ Context* on_finish) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ auto req = mirror::GetUuidRequest<I>::create(io_ctx, mirror_uuid, on_finish);
+ req->send();
+}
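+
+ // The synchronous uuid_get() above is a thin wrapper around this
+ // asynchronous variant; an equivalent caller-side sketch (illustrative
+ // only):
+ //
+ //   C_SaferCond ctx;
+ //   std::string mirror_uuid;
+ //   librbd::api::Mirror<>::uuid_get(io_ctx, &mirror_uuid, &ctx);
+ //   int r = ctx.wait();  // -ENOENT if no mirror uuid has been allocated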
+
+template <typename I>
+int Mirror<I>::peer_bootstrap_create(librados::IoCtx& io_ctx,
+ std::string* token) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ auto mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ int r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirroring mode: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ } else if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+ return -EINVAL;
+ }
+
+ // retrieve the cluster fsid
+ std::string fsid;
+ librados::Rados rados(io_ctx);
+ r = rados.cluster_fsid(&fsid);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::string peer_client_id;
+ std::string cephx_key;
+ r = create_bootstrap_user(cct, rados, &peer_client_id, &cephx_key);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string mon_host = get_mon_host(cct);
+
+ // format the token response
+ bufferlist token_bl;
+ token_bl.append(
+ R"({)"
+ R"("fsid":")" + fsid + R"(",)" +
+ R"("client_id":")" + peer_client_id + R"(",)" +
+ R"("key":")" + cephx_key + R"(",)" +
+ R"("mon_host":")" +
+ boost::replace_all_copy(mon_host, "\"", "\\\"") + R"(")" +
+ R"(})");
+ ldout(cct, 20) << "token=" << token_bl.to_str() << dendl;
+
+ bufferlist base64_bl;
+ token_bl.encode_base64(base64_bl);
+ *token = base64_bl.to_str();
+
+ return 0;
+}
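+
+ // The returned token is base64-encoded JSON; decoded, it has the shape
+ // below (the values shown are placeholders, not real credentials):
+ //
+ //   {
+ //     "fsid": "<cluster fsid>",
+ //     "client_id": "<generated peer client id>",
+ //     "key": "<cephx key for that client>",
+ //     "mon_host": "<monitor address list>"
+ //   }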
+
+template <typename I>
+int Mirror<I>::peer_bootstrap_import(librados::IoCtx& io_ctx,
+ rbd_mirror_peer_direction_t direction,
+ const std::string& token) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ if (direction != RBD_MIRROR_PEER_DIRECTION_RX &&
+ direction != RBD_MIRROR_PEER_DIRECTION_RX_TX) {
+ lderr(cct) << "invalid mirror peer direction" << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist token_bl;
+ try {
+ bufferlist base64_bl;
+ base64_bl.append(token);
+ token_bl.decode_base64(base64_bl);
+ } catch (buffer::error& err) {
+ lderr(cct) << "failed to decode base64" << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 20) << "token=" << token_bl.to_str() << dendl;
+
+ bool json_valid = false;
+ std::string expected_remote_fsid;
+ std::string remote_client_id;
+ std::string remote_key;
+ std::string remote_mon_host;
+
+ json_spirit::mValue json_root;
+ if (json_spirit::read(token_bl.to_str(), json_root)) {
+ try {
+ auto& json_obj = json_root.get_obj();
+ expected_remote_fsid = json_obj["fsid"].get_str();
+ remote_client_id = json_obj["client_id"].get_str();
+ remote_key = json_obj["key"].get_str();
+ remote_mon_host = json_obj["mon_host"].get_str();
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ if (!json_valid) {
+ lderr(cct) << "invalid bootstrap token JSON received" << dendl;
+ return -EINVAL;
+ }
+
+ // sanity check import process
+ std::string local_fsid;
+ librados::Rados rados(io_ctx);
+ int r = rados.cluster_fsid(&local_fsid);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::string local_site_name;
+ r = site_name_get(rados, &local_site_name);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve cluster site name: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // attempt to connect to remote cluster
+ librados::Rados remote_rados;
+ remote_rados.init(remote_client_id.c_str());
+
+ auto remote_cct = reinterpret_cast<CephContext*>(remote_rados.cct());
+ remote_cct->_conf.set_val("mon_host", remote_mon_host);
+ remote_cct->_conf.set_val("key", remote_key);
+
+ r = remote_rados.connect();
+ if (r < 0) {
+ lderr(cct) << "failed to connect to peer cluster: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::string remote_fsid;
+ r = remote_rados.cluster_fsid(&remote_fsid);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve remote cluster fsid: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ } else if (local_fsid == remote_fsid) {
+ lderr(cct) << "cannot import token for local cluster" << dendl;
+ return -EINVAL;
+ } else if (expected_remote_fsid != remote_fsid) {
+ lderr(cct) << "unexpected remote cluster fsid" << dendl;
+ return -EINVAL;
+ }
+
+ std::string remote_site_name;
+ r = site_name_get(remote_rados, &remote_site_name);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve remote cluster site name: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ } else if (local_site_name == remote_site_name) {
+ lderr(cct) << "cannot import token for duplicate site name" << dendl;
+ return -EINVAL;
+ }
+
+ librados::IoCtx remote_io_ctx;
+ r = remote_rados.ioctx_create(io_ctx.get_pool_name().c_str(), remote_io_ctx);
+ if (r == -ENOENT) {
+ ldout(cct, 10) << "remote pool does not exist" << dendl;
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to open remote pool '" << io_ctx.get_pool_name()
+ << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto remote_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ r = cls_client::mirror_mode_get(&remote_io_ctx, &remote_mirror_mode);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve remote mirroring mode: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ } else if (remote_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+ return -ENOSYS;
+ }
+
+ auto local_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ r = cls_client::mirror_mode_get(&io_ctx, &local_mirror_mode);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve local mirroring mode: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ } else if (local_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+ // copy mirror mode from remote peer
+ r = mode_set(io_ctx, static_cast<rbd_mirror_mode_t>(remote_mirror_mode));
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (direction == RBD_MIRROR_PEER_DIRECTION_RX_TX) {
+ // create a local mirror peer user and export it to the remote cluster
+ std::string local_client_id;
+ std::string local_key;
+ r = create_bootstrap_user(cct, rados, &local_client_id, &local_key);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string local_mon_host = get_mon_host(cct);
+
+ // create local cluster peer in remote cluster
+ r = create_bootstrap_peer(cct, remote_io_ctx,
+ RBD_MIRROR_PEER_DIRECTION_RX_TX, local_site_name,
+ local_fsid, local_client_id, local_key,
+ local_mon_host, "local", "remote");
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ // create remote cluster peer in local cluster
+ r = create_bootstrap_peer(cct, io_ctx, direction, remote_site_name,
+ remote_fsid, remote_client_id, remote_key,
+ remote_mon_host, "remote", "local");
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
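+
+ // End-to-end bootstrap sketch (illustrative only; the token is transferred
+ // out of band between the two sites):
+ //
+ //   std::string token;
+ //   int r = librbd::api::Mirror<>::peer_bootstrap_create(site_a_io_ctx,
+ //                                                        &token);
+ //   // ... copy the token to site B ...
+ //   r = librbd::api::Mirror<>::peer_bootstrap_import(
+ //     site_b_io_ctx, RBD_MIRROR_PEER_DIRECTION_RX_TX, token);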
+
+template <typename I>
+int Mirror<I>::peer_site_add(librados::IoCtx& io_ctx, std::string *uuid,
+ mirror_peer_direction_t direction,
+ const std::string &site_name,
+ const std::string &client_name) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "name=" << site_name << ", "
+ << "client=" << client_name << dendl;
+
+ if (cct->_conf->cluster == site_name) {
+ lderr(cct) << "cannot add self as remote peer" << dendl;
+ return -EINVAL;
+ }
+
+ if (direction == RBD_MIRROR_PEER_DIRECTION_TX) {
+ lderr(cct) << "cannot add a tx-only mirror peer" << dendl;
+ return -EINVAL;
+ }
+
+ int r;
+ do {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+
+ *uuid = uuid_gen.to_string();
+ r = cls_client::mirror_peer_add(
+ &io_ctx, {*uuid, static_cast<cls::rbd::MirrorPeerDirection>(direction),
+ site_name, client_name, ""});
+ if (r == -ESTALE) {
+ ldout(cct, 5) << "duplicate UUID detected, retrying" << dendl;
+ } else if (r < 0) {
+ lderr(cct) << "failed to add mirror peer '" << site_name << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } while (r == -ESTALE);
+ return 0;
+}
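+
+ // Usage sketch (illustrative only; the site and client names are
+ // placeholders):
+ //
+ //   std::string peer_uuid;
+ //   int r = librbd::api::Mirror<>::peer_site_add(
+ //     io_ctx, &peer_uuid, RBD_MIRROR_PEER_DIRECTION_RX_TX, "site-b",
+ //     "client.rbd-mirror-peer");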
+
+template <typename I>
+int Mirror<I>::peer_site_remove(librados::IoCtx& io_ctx,
+ const std::string &uuid) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "uuid=" << uuid << dendl;
+
+ int r = remove_peer_config_key(io_ctx, uuid);
+ if (r < 0) {
+ lderr(cct) << "failed to remove peer attributes '" << uuid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = cls_client::mirror_peer_remove(&io_ctx, uuid);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to remove peer '" << uuid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ vector<string> names;
+ r = Namespace<I>::list(io_ctx, &names);
+ if (r < 0) {
+ return r;
+ }
+
+ names.push_back("");
+
+ librados::IoCtx ns_io_ctx;
+ ns_io_ctx.dup(io_ctx);
+
+ for (auto &name : names) {
+ ns_io_ctx.set_namespace(name);
+
+ std::set<std::string> image_ids;
+ r = list_mirror_images(ns_io_ctx, image_ids);
+ if (r < 0) {
+ lderr(cct) << "failed listing images in "
+ << (name.empty() ? "default" : name) << " namespace : "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto& image_id : image_ids) {
+ cls::rbd::MirrorImage mirror_image;
+ r = cls_client::mirror_image_get(&ns_io_ctx, image_id, &mirror_image);
+ if (r == -ENOENT) {
+ continue;
+ }
+ if (r < 0) {
+ lderr(cct) << "error getting mirror info for image " << image_id
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ continue;
+ }
+
+ // Snapshot-based mirroring: unlink the peer from mirror snapshots.
+ // TODO: optimize.
+
+ I *img_ctx = I::create("", image_id, nullptr, ns_io_ctx, false);
+ img_ctx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+
+ r = img_ctx->state->open(0);
+ if (r == -ENOENT) {
+ continue;
+ }
+ if (r < 0) {
+ lderr(cct) << "error opening image " << image_id << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::list<uint64_t> snap_ids;
+ {
+ std::shared_lock image_locker{img_ctx->image_lock};
+ for (auto &it : img_ctx->snap_info) {
+ auto info = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &it.second.snap_namespace);
+ if (info && info->mirror_peer_uuids.count(uuid)) {
+ snap_ids.push_back(it.first);
+ }
+ }
+ }
+ for (auto snap_id : snap_ids) {
+ C_SaferCond cond;
+ auto req = mirror::snapshot::UnlinkPeerRequest<I>::create(
+ img_ctx, snap_id, uuid, &cond);
+ req->send();
+ r = cond.wait();
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r < 0) {
+ break;
+ }
+ }
+
+ int close_r = img_ctx->state->close();
+ if (r < 0) {
+ lderr(cct) << "error unlinking peer for image " << image_id << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ } else if (close_r < 0) {
+ lderr(cct) << "failed to close image " << image_id << ": "
+ << cpp_strerror(close_r) << dendl;
+ return close_r;
+ }
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::peer_site_list(librados::IoCtx& io_ctx,
+ std::vector<mirror_peer_site_t> *peers) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << dendl;
+
+ std::vector<cls::rbd::MirrorPeer> mirror_peers;
+ int r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list peers: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ peers->clear();
+ peers->reserve(mirror_peers.size());
+ for (auto &mirror_peer : mirror_peers) {
+ mirror_peer_site_t peer;
+ peer.uuid = mirror_peer.uuid;
+ peer.direction = static_cast<mirror_peer_direction_t>(
+ mirror_peer.mirror_peer_direction);
+ peer.site_name = mirror_peer.site_name;
+ peer.mirror_uuid = mirror_peer.mirror_uuid;
+ peer.client_name = mirror_peer.client_name;
+ peer.last_seen = mirror_peer.last_seen.sec();
+ peers->push_back(peer);
+ }
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::peer_site_set_client(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ const std::string &client_name) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "uuid=" << uuid << ", "
+ << "client=" << client_name << dendl;
+
+ int r = cls_client::mirror_peer_set_client(&io_ctx, uuid, client_name);
+ if (r < 0) {
+ lderr(cct) << "failed to update client '" << uuid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::peer_site_set_name(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ const std::string &site_name) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "uuid=" << uuid << ", "
+ << "name=" << site_name << dendl;
+
+ if (cct->_conf->cluster == site_name) {
+ lderr(cct) << "cannot set self as remote peer" << dendl;
+ return -EINVAL;
+ }
+
+ int r = cls_client::mirror_peer_set_cluster(&io_ctx, uuid, site_name);
+ if (r < 0) {
+ lderr(cct) << "failed to update site '" << uuid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::peer_site_set_direction(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ mirror_peer_direction_t direction) {
+ cls::rbd::MirrorPeerDirection mirror_peer_direction = static_cast<
+ cls::rbd::MirrorPeerDirection>(direction);
+
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "uuid=" << uuid << ", "
+ << "direction=" << mirror_peer_direction << dendl;
+
+ int r = cls_client::mirror_peer_set_direction(&io_ctx, uuid,
+ mirror_peer_direction);
+ if (r < 0) {
+ lderr(cct) << "failed to update direction '" << uuid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::peer_site_get_attributes(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ Attributes* attributes) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "uuid=" << uuid << dendl;
+
+ attributes->clear();
+
+ librados::Rados rados(io_ctx);
+ std::string value;
+ int r = get_config_key(rados, get_peer_config_key_name(io_ctx.get_id(), uuid),
+ &value);
+ if (r == -ENOENT || value.empty()) {
+ return -ENOENT;
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve peer attributes: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ bool json_valid = false;
+ json_spirit::mValue json_root;
+ if (json_spirit::read(value, json_root)) {
+ try {
+ auto& json_obj = json_root.get_obj();
+ for (auto& pairs : json_obj) {
+ (*attributes)[pairs.first] = pairs.second.get_str();
+ }
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ if (!json_valid) {
+ lderr(cct) << "invalid peer attributes JSON received" << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::peer_site_set_attributes(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ const Attributes& attributes) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "uuid=" << uuid << ", "
+ << "attributes=" << attributes << dendl;
+
+ std::vector<mirror_peer_site_t> mirror_peers;
+ int r = peer_site_list(io_ctx, &mirror_peers);
+ if (r < 0) {
+ return r;
+ }
+
+ if (std::find_if(mirror_peers.begin(), mirror_peers.end(),
+ [&uuid](const librbd::mirror_peer_site_t& peer) {
+ return uuid == peer.uuid;
+ }) == mirror_peers.end()) {
+ ldout(cct, 5) << "mirror peer uuid " << uuid << " does not exist" << dendl;
+ return -ENOENT;
+ }
+
+ std::stringstream ss;
+ ss << "{";
+ for (auto& pair : attributes) {
+ ss << "\\\"" << pair.first << "\\\": "
+ << "\\\"" << pair.second << "\\\"";
+ if (&pair != &(*attributes.rbegin())) {
+ ss << ", ";
+ }
+ }
+ ss << "}";
+
+ librados::Rados rados(io_ctx);
+ r = set_config_key(rados, get_peer_config_key_name(io_ctx.get_id(), uuid),
+ ss.str());
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to update peer attributes: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
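+
+ // Attributes are serialized as a JSON object under a per-peer config key;
+ // a typical use (illustrative only; the attribute names are assumptions
+ // based on common rbd-mirror usage) records connection details:
+ //
+ //   librbd::api::Mirror<>::Attributes attributes{
+ //     {"mon_host", "192.168.0.1:6789"},
+ //     {"key", "<cephx key>"}};
+ //   int r = librbd::api::Mirror<>::peer_site_set_attributes(
+ //     io_ctx, peer_uuid, attributes);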
+
+template <typename I>
+int Mirror<I>::image_global_status_list(
+ librados::IoCtx& io_ctx, const std::string &start_id, size_t max,
+ IdToMirrorImageGlobalStatus *images) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ int r;
+
+ map<string, string> id_to_name;
+ {
+ map<string, string> name_to_id;
+ r = Image<I>::list_images_v2(io_ctx, &name_to_id);
+ if (r < 0) {
+ return r;
+ }
+ for (const auto& it : name_to_id) {
+ id_to_name[it.second] = it.first;
+ }
+ }
+
+ map<std::string, cls::rbd::MirrorImage> images_;
+ map<std::string, cls::rbd::MirrorImageStatus> statuses_;
+
+ r = librbd::cls_client::mirror_image_status_list(&io_ctx, start_id, max,
+ &images_, &statuses_);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list mirror image statuses: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ const std::string STATUS_NOT_FOUND("status not found");
+ for (auto it = images_.begin(); it != images_.end(); ++it) {
+ auto &image_id = it->first;
+ auto &info = it->second;
+ if (info.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLED) {
+ continue;
+ }
+
+ auto &image_name = id_to_name[image_id];
+ if (image_name.empty()) {
+ lderr(cct) << "failed to find image name for image " << image_id << ", "
+ << "using image id as name" << dendl;
+ image_name = image_id;
+ }
+
+ mirror_image_global_status_t& global_status = (*images)[image_id];
+ global_status.name = image_name;
+ global_status.info = mirror_image_info_t{
+ info.global_image_id,
+ static_cast<mirror_image_state_t>(info.state),
+ false}; // XXX: setting "primary" correctly would require an additional call.
+
+ bool found_local_site_status = false;
+ auto s_it = statuses_.find(image_id);
+ if (s_it != statuses_.end()) {
+ auto& status = s_it->second;
+
+ global_status.site_statuses.reserve(
+ status.mirror_image_site_statuses.size());
+ for (auto& site_status : status.mirror_image_site_statuses) {
+ if (site_status.mirror_uuid ==
+ cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID) {
+ found_local_site_status = true;
+ }
+
+ global_status.site_statuses.push_back(mirror_image_site_status_t{
+ site_status.mirror_uuid,
+ static_cast<mirror_image_status_state_t>(site_status.state),
+ site_status.state == cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN ?
+ STATUS_NOT_FOUND : site_status.description,
+ site_status.last_update.sec(), site_status.up});
+ }
+ }
+
+ if (!found_local_site_status) {
+ global_status.site_statuses.push_back(mirror_image_site_status_t{
+ cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID,
+ MIRROR_IMAGE_STATUS_STATE_UNKNOWN, STATUS_NOT_FOUND, 0, false});
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::image_status_summary(librados::IoCtx& io_ctx,
+ MirrorImageStatusStates *states) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+ std::vector<cls::rbd::MirrorPeer> mirror_peers;
+ int r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list mirror peers: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::map<cls::rbd::MirrorImageStatusState, int32_t> states_;
+ r = cls_client::mirror_image_status_get_summary(&io_ctx, mirror_peers,
+ &states_);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to get mirror status summary: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ for (auto &s : states_) {
+ (*states)[static_cast<mirror_image_status_state_t>(s.first)] = s.second;
+ }
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::image_instance_id_list(
+ librados::IoCtx& io_ctx, const std::string &start_image_id, size_t max,
+ std::map<std::string, std::string> *instance_ids) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ std::map<std::string, entity_inst_t> instances;
+
+ int r = librbd::cls_client::mirror_image_instance_list(
+ &io_ctx, start_image_id, max, &instances);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list mirror image instances: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ for (const auto& it : instances) {
+ (*instance_ids)[it.first] = stringify(it.second.name.num());
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Mirror<I>::image_info_list(
+ librados::IoCtx& io_ctx, mirror_image_mode_t *mode_filter,
+ const std::string &start_id, size_t max,
+ std::map<std::string, std::pair<mirror_image_mode_t,
+ mirror_image_info_t>> *entries) {
+ CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+ ldout(cct, 20) << "pool=" << io_ctx.get_pool_name() << ", mode_filter="
+ << (mode_filter ? stringify(*mode_filter) : "null")
+ << ", start_id=" << start_id << ", max=" << max << dendl;
+
+ std::string last_read = start_id;
+ entries->clear();
+
+ while (entries->size() < max) {
+ map<std::string, cls::rbd::MirrorImage> images;
+ map<std::string, cls::rbd::MirrorImageStatus> statuses;
+
+ int r = librbd::cls_client::mirror_image_status_list(&io_ctx, last_read,
+ max, &images,
+ &statuses);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list mirror image statuses: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (images.empty()) {
+ break;
+ }
+
+ AsioEngine asio_engine(io_ctx);
+
+ for (auto &it : images) {
+ auto &image_id = it.first;
+ auto &image = it.second;
+ auto mode = static_cast<mirror_image_mode_t>(image.mode);
+
+ if ((mode_filter && mode != *mode_filter) ||
+ image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ continue;
+ }
+
+ // need to call get_info for every image to retrieve promotion state
+
+ mirror_image_info_t info;
+ r = image_get_info(io_ctx, asio_engine.get_work_queue(), image_id, &info);
+ if (r < 0) {
+ continue;
+ }
+
+ (*entries)[image_id] = std::make_pair(mode, info);
+ if (entries->size() == max) {
+ break;
+ }
+ }
+
+ last_read = images.rbegin()->first;
+ }
+
+ return 0;
+}
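+
+ // Pagination sketch (illustrative only): pass the last returned image id
+ // as the next start_id; a short result set marks the end of the listing.
+ //
+ //   std::map<std::string, std::pair<mirror_image_mode_t,
+ //                                   mirror_image_info_t>> entries;
+ //   std::string start_id;
+ //   do {
+ //     int r = librbd::api::Mirror<>::image_info_list(
+ //       io_ctx, nullptr, start_id, 1024, &entries);
+ //     if (r < 0 || entries.empty()) {
+ //       break;
+ //     }
+ //     start_id = entries.rbegin()->first;
+ //   } while (entries.size() == 1024);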
+
+template <typename I>
+int Mirror<I>::image_snapshot_create(I *ictx, uint32_t flags,
+ uint64_t *snap_id) {
+ C_SaferCond ctx;
+ Mirror<I>::image_snapshot_create(ictx, flags, snap_id, &ctx);
+
+ return ctx.wait();
+}
+
+template <typename I>
+void Mirror<I>::image_snapshot_create(I *ictx, uint32_t flags,
+ uint64_t *snap_id, Context *on_finish) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "ictx=" << ictx << dendl;
+
+ uint64_t snap_create_flags = 0;
+ int r = util::snap_create_flags_api_to_internal(cct, flags,
+ &snap_create_flags);
+ if (r < 0) {
+ on_finish->complete(r);
+ return;
+ }
+
+ auto on_refresh = new LambdaContext(
+ [ictx, snap_create_flags, snap_id, on_finish](int r) {
+ if (r < 0) {
+ lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ auto ctx = new C_ImageSnapshotCreate<I>(ictx, snap_create_flags, snap_id,
+ on_finish);
+ auto req = mirror::GetInfoRequest<I>::create(*ictx, &ctx->mirror_image,
+ &ctx->promotion_state,
+ &ctx->primary_mirror_uuid,
+ ctx);
+ req->send();
+ });
+
+ if (ictx->state->is_refresh_required()) {
+ ictx->state->refresh(on_refresh);
+ } else {
+ on_refresh->complete(0);
+ }
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Mirror<librbd::ImageCtx>;
diff --git a/src/librbd/api/Mirror.h b/src/librbd/api/Mirror.h
new file mode 100644
index 000000000..b3a552b13
--- /dev/null
+++ b/src/librbd/api/Mirror.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_API_MIRROR_H
+#define LIBRBD_API_MIRROR_H
+
+#include "include/rbd/librbd.hpp"
+#include <map>
+#include <string>
+#include <vector>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Mirror {
+ typedef std::map<std::string, std::string> Attributes;
+ typedef std::map<std::string, mirror_image_global_status_t>
+ IdToMirrorImageGlobalStatus;
+ typedef std::map<mirror_image_status_state_t, int> MirrorImageStatusStates;
+
+ static int site_name_get(librados::Rados& rados, std::string* name);
+ static int site_name_set(librados::Rados& rados, const std::string& name);
+
+ static int mode_get(librados::IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode);
+ static int mode_set(librados::IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode);
+
+ static int uuid_get(librados::IoCtx& io_ctx, std::string* mirror_uuid);
+ static void uuid_get(librados::IoCtx& io_ctx, std::string* mirror_uuid,
+ Context* on_finish);
+
+ static int peer_bootstrap_create(librados::IoCtx& io_ctx, std::string* token);
+ static int peer_bootstrap_import(librados::IoCtx& io_ctx,
+ rbd_mirror_peer_direction_t direction,
+ const std::string& token);
+
+ static int peer_site_add(librados::IoCtx& io_ctx, std::string *uuid,
+ mirror_peer_direction_t direction,
+ const std::string &site_name,
+ const std::string &client_name);
+ static int peer_site_remove(librados::IoCtx& io_ctx, const std::string &uuid);
+ static int peer_site_list(librados::IoCtx& io_ctx,
+ std::vector<mirror_peer_site_t> *peers);
+ static int peer_site_set_client(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ const std::string &client_name);
+ static int peer_site_set_name(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ const std::string &site_name);
+ static int peer_site_set_direction(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ mirror_peer_direction_t direction);
+ static int peer_site_get_attributes(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ Attributes* attributes);
+ static int peer_site_set_attributes(librados::IoCtx& io_ctx,
+ const std::string &uuid,
+ const Attributes& attributes);
+
+ static int image_global_status_list(librados::IoCtx& io_ctx,
+ const std::string &start_id, size_t max,
+ IdToMirrorImageGlobalStatus *images);
+
+ static int image_status_summary(librados::IoCtx& io_ctx,
+ MirrorImageStatusStates *states);
+ static int image_instance_id_list(librados::IoCtx& io_ctx,
+ const std::string &start_image_id,
+ size_t max,
+ std::map<std::string, std::string> *ids);
+
+ static int image_info_list(
+ librados::IoCtx& io_ctx, mirror_image_mode_t *mode_filter,
+ const std::string &start_id, size_t max,
+ std::map<std::string, std::pair<mirror_image_mode_t,
+ mirror_image_info_t>> *entries);
+
+ static int image_enable(ImageCtxT *ictx, mirror_image_mode_t mode,
+ bool relax_same_pool_parent_check);
+ static int image_disable(ImageCtxT *ictx, bool force);
+ static int image_promote(ImageCtxT *ictx, bool force);
+ static void image_promote(ImageCtxT *ictx, bool force, Context *on_finish);
+ static int image_demote(ImageCtxT *ictx);
+ static void image_demote(ImageCtxT *ictx, Context *on_finish);
+ static int image_resync(ImageCtxT *ictx);
+ static int image_get_info(ImageCtxT *ictx,
+ mirror_image_info_t *mirror_image_info);
+ static void image_get_info(ImageCtxT *ictx,
+ mirror_image_info_t *mirror_image_info,
+ Context *on_finish);
+ static int image_get_info(librados::IoCtx& io_ctx,
+ asio::ContextWQ *op_work_queue,
+ const std::string &image_id,
+ mirror_image_info_t *mirror_image_info);
+ static void image_get_info(librados::IoCtx& io_ctx,
+ asio::ContextWQ *op_work_queue,
+ const std::string &image_id,
+ mirror_image_info_t *mirror_image_info,
+ Context *on_finish);
+ static int image_get_mode(ImageCtxT *ictx, mirror_image_mode_t *mode);
+ static void image_get_mode(ImageCtxT *ictx, mirror_image_mode_t *mode,
+ Context *on_finish);
+ static int image_get_global_status(ImageCtxT *ictx,
+ mirror_image_global_status_t *status);
+ static void image_get_global_status(ImageCtxT *ictx,
+ mirror_image_global_status_t *status,
+ Context *on_finish);
+ static int image_get_instance_id(ImageCtxT *ictx, std::string *instance_id);
+
+ static int image_snapshot_create(ImageCtxT *ictx, uint32_t flags,
+ uint64_t *snap_id);
+ static void image_snapshot_create(ImageCtxT *ictx, uint32_t flags,
+ uint64_t *snap_id, Context *on_finish);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Mirror<librbd::ImageCtx>;
+
+#endif // LIBRBD_API_MIRROR_H
diff --git a/src/librbd/api/Namespace.cc b/src/librbd/api/Namespace.cc
new file mode 100644
index 000000000..6c5ac7fda
--- /dev/null
+++ b/src/librbd/api/Namespace.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/api/Mirror.h"
+#include "librbd/api/Namespace.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Namespace: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+const std::list<std::string> POOL_OBJECTS {
+ RBD_CHILDREN,
+ RBD_GROUP_DIRECTORY,
+ RBD_INFO,
+ RBD_MIRRORING,
+ RBD_TASK,
+ RBD_TRASH,
+ RBD_DIRECTORY
+};
+
+} // anonymous namespace
+
+template <typename I>
+int Namespace<I>::create(librados::IoCtx& io_ctx, const std::string& name)
+{
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 5) << "name=" << name << dendl;
+
+ if (name.empty()) {
+ return -EINVAL;
+ }
+
+ librados::Rados rados(io_ctx);
+ int8_t require_osd_release;
+ int r = rados.get_min_compatible_osd(&require_osd_release);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve min OSD release: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (require_osd_release < CEPH_RELEASE_NAUTILUS) {
+ ldout(cct, 1) << "namespace support requires nautilus or later OSD"
+ << dendl;
+ return -ENOSYS;
+ }
+
+
+ librados::IoCtx default_ns_ctx;
+ default_ns_ctx.dup(io_ctx);
+ default_ns_ctx.set_namespace("");
+
+ r = cls_client::namespace_add(&default_ns_ctx, name);
+ if (r < 0) {
+ lderr(cct) << "failed to add namespace: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ librados::IoCtx ns_ctx;
+ ns_ctx.dup(io_ctx);
+ ns_ctx.set_namespace(name);
+
+ r = cls_client::dir_state_set(&ns_ctx, RBD_DIRECTORY,
+ cls::rbd::DIRECTORY_STATE_READY);
+ if (r < 0) {
+ lderr(cct) << "failed to initialize image directory: " << cpp_strerror(r)
+ << dendl;
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ int ret_val = cls_client::namespace_remove(&default_ns_ctx, name);
+ if (ret_val < 0) {
+ lderr(cct) << "failed to remove namespace: " << cpp_strerror(ret_val) << dendl;
+ }
+
+ return r;
+}
+
+template <typename I>
+int Namespace<I>::remove(librados::IoCtx& io_ctx, const std::string& name)
+{
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 5) << "name=" << name << dendl;
+
+ if (name.empty()) {
+ return -EINVAL;
+ }
+
+ librados::IoCtx default_ns_ctx;
+ default_ns_ctx.dup(io_ctx);
+ default_ns_ctx.set_namespace("");
+
+ librados::IoCtx ns_ctx;
+ ns_ctx.dup(io_ctx);
+ ns_ctx.set_namespace(name);
+
+ std::map<std::string, cls::rbd::TrashImageSpec> trash_entries;
+
+ librados::ObjectWriteOperation dir_op;
+ librbd::cls_client::dir_state_set(
+ &dir_op, cls::rbd::DIRECTORY_STATE_ADD_DISABLED);
+ dir_op.remove();
+
+ int r = ns_ctx.operate(RBD_DIRECTORY, &dir_op);
+ if (r == -EBUSY) {
+ ldout(cct, 5) << "image directory not empty" << dendl;
+ goto rollback;
+ } else if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to disable the namespace: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = cls_client::trash_list(&ns_ctx, "", 1, &trash_entries);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to list trash directory: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ } else if (!trash_entries.empty()) {
+ ldout(cct, 5) << "image trash not empty" << dendl;
+ goto rollback;
+ }
+
+ r = Mirror<I>::mode_set(ns_ctx, RBD_MIRROR_MODE_DISABLED);
+ if (r < 0) {
+ lderr(cct) << "failed to disable mirroring: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ for (auto& oid : POOL_OBJECTS) {
+ r = ns_ctx.remove(oid);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to remove object '" << oid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ r = cls_client::namespace_remove(&default_ns_ctx, name);
+ if (r < 0) {
+ lderr(cct) << "failed to remove namespace: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+
+rollback:
+
+ r = librbd::cls_client::dir_state_set(
+ &ns_ctx, RBD_DIRECTORY, cls::rbd::DIRECTORY_STATE_READY);
+ if (r < 0) {
+ lderr(cct) << "failed to restore directory state: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return -EBUSY;
+}
+
+template <typename I>
+int Namespace<I>::list(IoCtx& io_ctx, vector<string> *names)
+{
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 5) << dendl;
+
+ librados::IoCtx default_ns_ctx;
+ default_ns_ctx.dup(io_ctx);
+ default_ns_ctx.set_namespace("");
+
+ int r;
+ int max_read = 1024;
+ std::string last_read = "";
+ do {
+ std::list<std::string> name_list;
+ r = cls_client::namespace_list(&default_ns_ctx, last_read, max_read,
+ &name_list);
+ if (r == -ENOENT) {
+ return 0;
+ } else if (r < 0) {
+ lderr(cct) << "error listing namespaces: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ names->insert(names->end(), name_list.begin(), name_list.end());
+ if (!name_list.empty()) {
+ last_read = name_list.back();
+ }
+ r = name_list.size();
+ } while (r == max_read);
+
+ return 0;
+}
+
+template <typename I>
+int Namespace<I>::exists(librados::IoCtx& io_ctx, const std::string& name, bool *exists)
+{
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 5) << "name=" << name << dendl;
+
+ *exists = false;
+ if (name.empty()) {
+ return -EINVAL;
+ }
+
+ librados::IoCtx ns_ctx;
+ ns_ctx.dup(io_ctx);
+ ns_ctx.set_namespace(name);
+
+ int r = librbd::cls_client::dir_state_assert(&ns_ctx, RBD_DIRECTORY,
+ cls::rbd::DIRECTORY_STATE_READY);
+ if (r == 0) {
+ *exists = true;
+ } else if (r != -ENOENT) {
+ lderr(cct) << "error asserting namespace: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Namespace<librbd::ImageCtx>;
diff --git a/src/librbd/api/Namespace.h b/src/librbd/api/Namespace.h
new file mode 100644
index 000000000..220eb28f3
--- /dev/null
+++ b/src/librbd/api/Namespace.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_NAMESPACE_H
+#define CEPH_LIBRBD_API_NAMESPACE_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.hpp"
+#include <string>
+#include <vector>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Namespace {
+
+ static int create(librados::IoCtx& io_ctx, const std::string& name);
+ static int remove(librados::IoCtx& io_ctx, const std::string& name);
+ static int list(librados::IoCtx& io_ctx, std::vector<std::string>* names);
+ static int exists(librados::IoCtx& io_ctx, const std::string& name, bool *exists);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Namespace<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_NAMESPACE_H
diff --git a/src/librbd/api/Pool.cc b/src/librbd/api/Pool.cc
new file mode 100644
index 000000000..65d55328f
--- /dev/null
+++ b/src/librbd/api/Pool.cc
@@ -0,0 +1,375 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Pool.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "common/Throttle.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "osd/osd_types.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Trash.h"
+#include "librbd/image/ValidatePoolRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Pool::ImageStatRequest: " \
+ << __func__ << " " << this << ": " \
+ << "(id=" << m_image_id << "): "
+
+template <typename I>
+class ImageStatRequest {
+public:
+ ImageStatRequest(librados::IoCtx& io_ctx, SimpleThrottle& throttle,
+ const std::string& image_id, bool scan_snaps,
+ std::atomic<uint64_t>* bytes,
+ std::atomic<uint64_t>* max_bytes,
+ std::atomic<uint64_t>* snaps)
+ : m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+ m_io_ctx(io_ctx), m_throttle(throttle), m_image_id(image_id),
+ m_scan_snaps(scan_snaps), m_bytes(bytes), m_max_bytes(max_bytes),
+ m_snaps(snaps) {
+ m_throttle.start_op();
+ }
+
+ void send() {
+ get_head();
+ }
+
+protected:
+ void finish(int r) {
+ (*m_max_bytes) += m_max_size;
+ m_throttle.end_op(r);
+
+ delete this;
+ }
+
+private:
+ CephContext* m_cct;
+ librados::IoCtx& m_io_ctx;
+ SimpleThrottle& m_throttle;
+ const std::string& m_image_id;
+ bool m_scan_snaps;
+ std::atomic<uint64_t>* m_bytes;
+ std::atomic<uint64_t>* m_max_bytes;
+ std::atomic<uint64_t>* m_snaps;
+ bufferlist m_out_bl;
+
+ uint64_t m_max_size = 0;
+ ::SnapContext m_snapc;
+
+ void get_head() {
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::get_size_start(&op, CEPH_NOSNAP);
+ if (m_scan_snaps) {
+ cls_client::get_snapcontext_start(&op);
+ }
+
+ m_out_bl.clear();
+ auto aio_comp = util::create_rados_callback<
+ ImageStatRequest<I>, &ImageStatRequest<I>::handle_get_head>(this);
+ int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+ }
+
+ void handle_get_head(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ auto it = m_out_bl.cbegin();
+ if (r == 0) {
+ uint8_t order;
+ r = cls_client::get_size_finish(&it, &m_max_size, &order);
+ if (r == 0) {
+ (*m_bytes) += m_max_size;
+ }
+ }
+ if (m_scan_snaps && r == 0) {
+ r = cls_client::get_snapcontext_finish(&it, &m_snapc);
+ if (r == 0) {
+ (*m_snaps) += m_snapc.snaps.size();
+ }
+ }
+
+ if (r == -ENOENT) {
+ finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to stat image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (!m_snapc.is_valid()) {
+ lderr(m_cct) << "snap context is invalid" << dendl;
+ finish(-EIO);
+ return;
+ }
+
+ get_snaps();
+ }
+
+ void get_snaps() {
+ if (!m_scan_snaps || m_snapc.snaps.empty()) {
+ finish(0);
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+ librados::ObjectReadOperation op;
+ for (auto snap_seq : m_snapc.snaps) {
+ cls_client::get_size_start(&op, snap_seq);
+ }
+
+ m_out_bl.clear();
+ auto aio_comp = util::create_rados_callback<
+ ImageStatRequest<I>, &ImageStatRequest<I>::handle_get_snaps>(this);
+ int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+ }
+
+ void handle_get_snaps(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ auto it = m_out_bl.cbegin();
+ for ([[maybe_unused]] auto snap_seq : m_snapc.snaps) {
+ uint64_t size;
+ if (r == 0) {
+ uint8_t order;
+ r = cls_client::get_size_finish(&it, &size, &order);
+ }
+ if (r == 0 && m_max_size < size) {
+ m_max_size = size;
+ }
+ }
+
+ if (r == -ENOENT) {
+ ldout(m_cct, 15) << "out-of-sync metadata" << dendl;
+ get_head();
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to retrieve snap size: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ } else {
+ finish(0);
+ }
+ }
+
+};
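+
+ // ImageStatRequest is a self-deleting asynchronous request: construction
+ // calls m_throttle.start_op() and finish() reports the result via end_op()
+ // before deleting the object. Callers therefore allocate it with new and
+ // only call send(), as get_pool_stats() below does:
+ //
+ //   auto req = new ImageStatRequest<I>(io_ctx, throttle, image_id, true,
+ //                                      &bytes, &max_bytes, &snaps);
+ //   req->send();  // no further ownership; the request frees itself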
+
+template <typename I>
+void get_pool_stat_option_value(typename Pool<I>::StatOptions* stat_options,
+ rbd_pool_stat_option_t option,
+ uint64_t** value) {
+ auto it = stat_options->find(option);
+ if (it == stat_options->end()) {
+ *value = nullptr;
+ } else {
+ *value = it->second;
+ }
+}
+
+template <typename I>
+int get_pool_stats(librados::IoCtx& io_ctx, const ConfigProxy& config,
+ const std::vector<std::string>& image_ids, uint64_t* image_count,
+ uint64_t* provisioned_bytes, uint64_t* max_provisioned_bytes,
+ uint64_t* snapshot_count) {
+
+ bool scan_snaps = ((max_provisioned_bytes != nullptr) ||
+ (snapshot_count != nullptr));
+
+ SimpleThrottle throttle(
+ config.template get_val<uint64_t>("rbd_concurrent_management_ops"), true);
+ std::atomic<uint64_t> bytes{0};
+ std::atomic<uint64_t> max_bytes{0};
+ std::atomic<uint64_t> snaps{0};
+ for (auto& image_id : image_ids) {
+ if (throttle.pending_error()) {
+ break;
+ }
+
+ auto req = new ImageStatRequest<I>(io_ctx, throttle, image_id,
+ scan_snaps, &bytes, &max_bytes, &snaps);
+ req->send();
+ }
+
+ int r = throttle.wait_for_ret();
+ if (r < 0) {
+ return r;
+ }
+
+ if (image_count != nullptr) {
+ *image_count = image_ids.size();
+ }
+ if (provisioned_bytes != nullptr) {
+ *provisioned_bytes = bytes.load();
+ }
+ if (max_provisioned_bytes != nullptr) {
+ *max_provisioned_bytes = max_bytes.load();
+ }
+ if (snapshot_count != nullptr) {
+ *snapshot_count = snaps.load();
+ }
+
+ return 0;
+}
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Pool: " << __func__ << ": "
+
+template <typename I>
+int Pool<I>::init(librados::IoCtx& io_ctx, bool force) {
+ auto cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+ ldout(cct, 10) << dendl;
+
+ int r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RBD, force);
+ if (r < 0) {
+ return r;
+ }
+
+ ConfigProxy config{cct->_conf};
+ api::Config<I>::apply_pool_overrides(io_ctx, &config);
+ if (!config.get_val<bool>("rbd_validate_pool")) {
+ return 0;
+ }
+
+ C_SaferCond ctx;
+ auto req = image::ValidatePoolRequest<I>::create(io_ctx, &ctx);
+ req->send();
+
+ return ctx.wait();
+}
+
+template <typename I>
+int Pool<I>::add_stat_option(StatOptions* stat_options,
+ rbd_pool_stat_option_t option,
+ uint64_t* value) {
+ switch (option) {
+ case RBD_POOL_STAT_OPTION_IMAGES:
+ case RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES:
+ case RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES:
+ case RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS:
+ case RBD_POOL_STAT_OPTION_TRASH_IMAGES:
+ case RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES:
+ case RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES:
+ case RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS:
+ stat_options->emplace(option, value);
+ return 0;
+ default:
+ break;
+ }
+ return -ENOENT;
+}
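+
+ // Usage sketch (illustrative only): register output pointers for the
+ // stats of interest, then issue a single get_stats() call.
+ //
+ //   librbd::api::Pool<>::StatOptions stat_options;
+ //   uint64_t image_count = 0;
+ //   uint64_t provisioned_bytes = 0;
+ //   librbd::api::Pool<>::add_stat_option(
+ //     &stat_options, RBD_POOL_STAT_OPTION_IMAGES, &image_count);
+ //   librbd::api::Pool<>::add_stat_option(
+ //     &stat_options, RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES,
+ //     &provisioned_bytes);
+ //   int r = librbd::api::Pool<>::get_stats(io_ctx, &stat_options);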
+
+template <typename I>
+int Pool<I>::get_stats(librados::IoCtx& io_ctx, StatOptions* stat_options) {
+ auto cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+ ldout(cct, 10) << dendl;
+
+ ConfigProxy config{cct->_conf};
+ api::Config<I>::apply_pool_overrides(io_ctx, &config);
+
+ uint64_t* image_count;
+ uint64_t* provisioned_bytes;
+ uint64_t* max_provisioned_bytes;
+ uint64_t* snapshot_count;
+
+ std::vector<trash_image_info_t> trash_entries;
+ int r = Trash<I>::list(io_ctx, trash_entries, false);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ return r;
+ }
+
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_IMAGES, &image_count);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES,
+ &provisioned_bytes);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES,
+ &max_provisioned_bytes);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, &snapshot_count);
+ if (image_count != nullptr || provisioned_bytes != nullptr ||
+ max_provisioned_bytes != nullptr || snapshot_count != nullptr) {
+ typename Image<I>::ImageNameToIds images;
+ r = Image<I>::list_images_v2(io_ctx, &images);
+ if (r < 0) {
+ return r;
+ }
+
+ std::vector<std::string> image_ids;
+ image_ids.reserve(images.size() + trash_entries.size());
+ for (auto& it : images) {
+ image_ids.push_back(std::move(it.second));
+ }
+ for (auto& it : trash_entries) {
+ if (it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {
+ image_ids.push_back(std::move(it.id));
+ }
+ }
+
+ r = get_pool_stats<I>(io_ctx, config, image_ids, image_count,
+ provisioned_bytes, max_provisioned_bytes,
+ snapshot_count);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_TRASH_IMAGES, &image_count);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES,
+ &provisioned_bytes);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES,
+ &max_provisioned_bytes);
+ get_pool_stat_option_value<I>(
+ stat_options, RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS, &snapshot_count);
+ if (image_count != nullptr || provisioned_bytes != nullptr ||
+ max_provisioned_bytes != nullptr || snapshot_count != nullptr) {
+
+ std::vector<std::string> image_ids;
+ image_ids.reserve(trash_entries.size());
+ for (auto& it : trash_entries) {
+ if (it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) {
+ continue;
+ }
+ image_ids.push_back(std::move(it.id));
+ }
+
+ r = get_pool_stats<I>(io_ctx, config, image_ids, image_count,
+ provisioned_bytes, max_provisioned_bytes,
+ snapshot_count);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Pool<librbd::ImageCtx>;
diff --git a/src/librbd/api/Pool.h b/src/librbd/api/Pool.h
new file mode 100644
index 000000000..7b607ab6e
--- /dev/null
+++ b/src/librbd/api/Pool.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_POOL_H
+#define CEPH_LIBRBD_API_POOL_H
+
+#include "include/int_types.h"
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.h"
+#include <map>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class Pool {
+public:
+ typedef std::map<rbd_pool_stat_option_t, uint64_t*> StatOptions;
+
+ static int init(librados::IoCtx& io_ctx, bool force);
+
+ static int add_stat_option(StatOptions* stat_options,
+ rbd_pool_stat_option_t option,
+ uint64_t* value);
+
+ static int get_stats(librados::IoCtx& io_ctx, StatOptions* stat_options);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Pool<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_POOL_H
diff --git a/src/librbd/api/PoolMetadata.cc b/src/librbd/api/PoolMetadata.cc
new file mode 100644
index 000000000..33e3fb648
--- /dev/null
+++ b/src/librbd/api/PoolMetadata.cc
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/PoolMetadata.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/image/GetMetadataRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::PoolMetadata: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+void update_pool_timestamp(librados::IoCtx& io_ctx) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+
+ auto now = ceph_clock_now();
+ std::string cmd =
+ R"({)"
+ R"("prefix": "config set", )"
+ R"("who": "global", )"
+ R"("name": "rbd_config_pool_override_update_timestamp", )"
+ R"("value": ")" + stringify(now.sec()) + R"(")"
+ R"(})";
+
+ librados::Rados rados(io_ctx);
+ bufferlist in_bl;
+ std::string ss;
+ int r = rados.mon_command(cmd, in_bl, nullptr, &ss);
+ if (r < 0) {
+ lderr(cct) << "failed to notify clients of pool config update: "
+ << cpp_strerror(r) << dendl;
+ }
+}
+
+} // anonymous namespace
+
+template <typename I>
+int PoolMetadata<I>::get(librados::IoCtx& io_ctx,
+ const std::string &key, std::string *value) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+
+ int r = cls_client::metadata_get(&io_ctx, RBD_INFO, key, value);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed reading metadata " << key << ": " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return r;
+}
+
+template <typename I>
+int PoolMetadata<I>::set(librados::IoCtx& io_ctx, const std::string &key,
+ const std::string &value) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+
+ bool need_update_pool_timestamp = false;
+
+ std::string config_key;
+ if (util::is_metadata_config_override(key, &config_key)) {
+ if (!librbd::api::Config<I>::is_option_name(io_ctx, config_key)) {
+ lderr(cct) << "validation for " << key
+ << " failed: not allowed pool level override" << dendl;
+ return -EINVAL;
+ }
+ int r = ConfigProxy{false}.set_val(config_key.c_str(), value);
+ if (r < 0) {
+ lderr(cct) << "validation for " << key << " failed: " << cpp_strerror(r)
+ << dendl;
+ return -EINVAL;
+ }
+
+ need_update_pool_timestamp = true;
+ }
+
+ ceph::bufferlist bl;
+ bl.append(value);
+
+ int r = cls_client::metadata_set(&io_ctx, RBD_INFO, {{key, bl}});
+ if (r < 0) {
+ lderr(cct) << "failed setting metadata " << key << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (need_update_pool_timestamp) {
+ update_pool_timestamp(io_ctx);
+ }
+
+ return 0;
+}
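+
+ // Keys that carry the metadata config-override prefix are validated
+ // against the known librbd option names and double as pool-level config
+ // overrides; e.g. (illustrative only, assuming the "conf_" key prefix
+ // convention):
+ //
+ //   int r = librbd::api::PoolMetadata<>::set(io_ctx, "conf_rbd_cache",
+ //                                            "false");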
+
+template <typename I>
+int PoolMetadata<I>::remove(librados::IoCtx& io_ctx, const std::string &key) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+
+ std::string value;
+ int r = cls_client::metadata_get(&io_ctx, RBD_INFO, key, &value);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ ldout(cct, 1) << "metadata " << key << " does not exist" << dendl;
+ } else {
+ lderr(cct) << "failed reading metadata " << key << ": " << cpp_strerror(r)
+ << dendl;
+ }
+ return r;
+ }
+
+ r = cls_client::metadata_remove(&io_ctx, RBD_INFO, key);
+ if (r < 0) {
+ lderr(cct) << "failed removing metadata " << key << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::string config_key;
+ if (util::is_metadata_config_override(key, &config_key)) {
+ update_pool_timestamp(io_ctx);
+ }
+
+ return 0;
+}
+
+template <typename I>
+int PoolMetadata<I>::list(librados::IoCtx& io_ctx, const std::string &start,
+ uint64_t max,
+ std::map<std::string, ceph::bufferlist> *pairs) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+
+ pairs->clear();
+ C_SaferCond ctx;
+ auto req = image::GetMetadataRequest<I>::create(
+ io_ctx, RBD_INFO, false, "", start, max, pairs, &ctx);
+ req->send();
+
+ int r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed listing metadata: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::PoolMetadata<librbd::ImageCtx>;
diff --git a/src/librbd/api/PoolMetadata.h b/src/librbd/api/PoolMetadata.h
new file mode 100644
index 000000000..c0a817359
--- /dev/null
+++ b/src/librbd/api/PoolMetadata.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_POOL_METADATA_H
+#define CEPH_LIBRBD_API_POOL_METADATA_H
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+
+#include <map>
+#include <string>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PoolMetadata {
+public:
+ static int get(librados::IoCtx& io_ctx, const std::string &key,
+ std::string *value);
+ static int set(librados::IoCtx& io_ctx, const std::string &key,
+ const std::string &value);
+ static int remove(librados::IoCtx& io_ctx, const std::string &key);
+ static int list(librados::IoCtx& io_ctx, const std::string &start,
+ uint64_t max, std::map<std::string, ceph::bufferlist> *pairs);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::PoolMetadata<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_POOL_METADATA_H
diff --git a/src/librbd/api/Snapshot.cc b/src/librbd/api/Snapshot.cc
new file mode 100644
index 000000000..88f22694c
--- /dev/null
+++ b/src/librbd/api/Snapshot.cc
@@ -0,0 +1,444 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Snapshot.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/errno.h"
+#include "librbd/internal.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Image.h"
+#include "include/Context.h"
+#include "common/Cond.h"
+
+#include <boost/variant.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Snapshot: " << __func__ << ": "
+
+using librados::snap_t;
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+class GetGroupVisitor : public boost::static_visitor<int> {
+public:
+ CephContext* cct;
+ librados::IoCtx *image_ioctx;
+ snap_group_namespace_t *group_snap;
+
+ explicit GetGroupVisitor(CephContext* cct, librados::IoCtx *_image_ioctx,
+ snap_group_namespace_t *group_snap)
+ : cct(cct), image_ioctx(_image_ioctx), group_snap(group_snap) {};
+
+ template <typename T>
+ inline int operator()(const T&) const {
+ // ignore other than GroupSnapshotNamespace types.
+ return -EINVAL;
+ }
+
+ inline int operator()(
+ const cls::rbd::GroupSnapshotNamespace& snap_namespace) {
+ IoCtx group_ioctx;
+ int r = util::create_ioctx(*image_ioctx, "group", snap_namespace.group_pool,
+ {}, &group_ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::GroupSnapshot group_snapshot;
+
+ std::string group_name;
+ r = cls_client::dir_get_name(&group_ioctx, RBD_GROUP_DIRECTORY,
+ snap_namespace.group_id, &group_name);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve group name: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ string group_header_oid = util::group_header_name(snap_namespace.group_id);
+ r = cls_client::group_snap_get_by_id(&group_ioctx,
+ group_header_oid,
+ snap_namespace.group_snapshot_id,
+ &group_snapshot);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve group snapshot: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ group_snap->group_pool = group_ioctx.get_id();
+ group_snap->group_name = group_name;
+ group_snap->group_snap_name = group_snapshot.name;
+ return 0;
+ }
+};
+
+class GetTrashVisitor : public boost::static_visitor<int> {
+public:
+ std::string* original_name;
+
+ explicit GetTrashVisitor(std::string* original_name)
+ : original_name(original_name) {
+ }
+
+ template <typename T>
+ inline int operator()(const T&) const {
+ return -EINVAL;
+ }
+
+ inline int operator()(
+ const cls::rbd::TrashSnapshotNamespace& snap_namespace) {
+ *original_name = snap_namespace.original_name;
+ return 0;
+ }
+};
+
+class GetMirrorVisitor : public boost::static_visitor<int> {
+public:
+ snap_mirror_namespace_t *mirror_snap;
+
+ explicit GetMirrorVisitor(snap_mirror_namespace_t *mirror_snap)
+ : mirror_snap(mirror_snap) {
+ }
+
+ template <typename T>
+ inline int operator()(const T&) const {
+ return -EINVAL;
+ }
+
+ inline int operator()(
+ const cls::rbd::MirrorSnapshotNamespace& snap_namespace) {
+ mirror_snap->state = static_cast<snap_mirror_state_t>(snap_namespace.state);
+ mirror_snap->complete = snap_namespace.complete;
+ mirror_snap->mirror_peer_uuids = snap_namespace.mirror_peer_uuids;
+ mirror_snap->primary_mirror_uuid = snap_namespace.primary_mirror_uuid;
+ mirror_snap->primary_snap_id = snap_namespace.primary_snap_id;
+ mirror_snap->last_copied_object_number =
+ snap_namespace.last_copied_object_number;
+ return 0;
+ }
+};
+
+} // anonymous namespace
+
+template <typename I>
+int Snapshot<I>::get_group_namespace(I *ictx, uint64_t snap_id,
+ snap_group_namespace_t *group_snap) {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ std::shared_lock image_locker{ictx->image_lock};
+ auto snap_info = ictx->get_snap_info(snap_id);
+ if (snap_info == nullptr) {
+ return -ENOENT;
+ }
+
+ GetGroupVisitor ggv = GetGroupVisitor(ictx->cct, &ictx->md_ctx, group_snap);
+ r = boost::apply_visitor(ggv, snap_info->snap_namespace);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Snapshot<I>::get_trash_namespace(I *ictx, uint64_t snap_id,
+ std::string* original_name) {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ std::shared_lock image_locker{ictx->image_lock};
+ auto snap_info = ictx->get_snap_info(snap_id);
+ if (snap_info == nullptr) {
+ return -ENOENT;
+ }
+
+ auto visitor = GetTrashVisitor(original_name);
+ r = boost::apply_visitor(visitor, snap_info->snap_namespace);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Snapshot<I>::get_mirror_namespace(
+ I *ictx, uint64_t snap_id, snap_mirror_namespace_t *mirror_snap) {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ std::shared_lock image_locker{ictx->image_lock};
+ auto snap_info = ictx->get_snap_info(snap_id);
+ if (snap_info == nullptr) {
+ return -ENOENT;
+ }
+
+ auto gmv = GetMirrorVisitor(mirror_snap);
+ r = boost::apply_visitor(gmv, snap_info->snap_namespace);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Snapshot<I>::get_namespace_type(I *ictx, uint64_t snap_id,
+ snap_namespace_type_t *namespace_type) {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ std::shared_lock l{ictx->image_lock};
+ auto snap_info = ictx->get_snap_info(snap_id);
+ if (snap_info == nullptr) {
+ return -ENOENT;
+ }
+
+ *namespace_type = static_cast<snap_namespace_type_t>(
+ cls::rbd::get_snap_namespace_type(snap_info->snap_namespace));
+ return 0;
+}
+
+template <typename I>
+int Snapshot<I>::remove(I *ictx, uint64_t snap_id) {
+ ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_id << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ cls::rbd::SnapshotNamespace snapshot_namespace;
+ std::string snapshot_name;
+ {
+ std::shared_lock image_locker{ictx->image_lock};
+ auto it = ictx->snap_info.find(snap_id);
+ if (it == ictx->snap_info.end()) {
+ return -ENOENT;
+ }
+
+ snapshot_namespace = it->second.snap_namespace;
+ snapshot_name = it->second.name;
+ }
+
+ C_SaferCond ctx;
+ ictx->operations->snap_remove(snapshot_namespace, snapshot_name, &ctx);
+ r = ctx.wait();
+ return r;
+}
+
+template <typename I>
+int Snapshot<I>::get_name(I *ictx, uint64_t snap_id, std::string *snap_name) {
+ ldout(ictx->cct, 20) << "snap_get_name " << ictx << " " << snap_id << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ std::shared_lock image_locker{ictx->image_lock};
+ r = ictx->get_snap_name(snap_id, snap_name);
+
+ return r;
+}
+
+template <typename I>
+int Snapshot<I>::get_id(I *ictx, const std::string& snap_name, uint64_t *snap_id) {
+ ldout(ictx->cct, 20) << "snap_get_id " << ictx << " " << snap_name << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ std::shared_lock image_locker{ictx->image_lock};
+ *snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(), snap_name);
+ if (*snap_id == CEPH_NOSNAP)
+ return -ENOENT;
+
+ return 0;
+}
+
+template <typename I>
+int Snapshot<I>::list(I *ictx, vector<snap_info_t>& snaps) {
+ ldout(ictx->cct, 20) << "snap_list " << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ std::shared_lock l{ictx->image_lock};
+ for (auto &it : ictx->snap_info) {
+ snap_info_t info;
+ info.name = it.second.name;
+ info.id = it.first;
+ info.size = it.second.size;
+ snaps.push_back(info);
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Snapshot<I>::exists(I *ictx, const cls::rbd::SnapshotNamespace& snap_namespace,
+ const char *snap_name, bool *exists) {
+ ldout(ictx->cct, 20) << "snap_exists " << ictx << " " << snap_name << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ std::shared_lock l{ictx->image_lock};
+ *exists = ictx->get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP;
+ return 0;
+}
+
+template <typename I>
+int Snapshot<I>::create(I *ictx, const char *snap_name, uint32_t flags,
+ ProgressContext& pctx) {
+ ldout(ictx->cct, 20) << "snap_create " << ictx << " " << snap_name
+ << " flags: " << flags << dendl;
+
+ uint64_t internal_flags = 0;
+ int r = util::snap_create_flags_api_to_internal(ictx->cct, flags,
+ &internal_flags);
+ if (r < 0) {
+ return r;
+ }
+
+ return ictx->operations->snap_create(cls::rbd::UserSnapshotNamespace(),
+ snap_name, internal_flags, pctx);
+}
+
+template <typename I>
+int Snapshot<I>::remove(I *ictx, const char *snap_name, uint32_t flags,
+ ProgressContext& pctx) {
+ ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_name << " flags: " << flags << dendl;
+
+ int r = 0;
+
+ r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ if (flags & RBD_SNAP_REMOVE_FLATTEN) {
+ r = Image<I>::flatten_children(ictx, snap_name, pctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ bool protect;
+ r = is_protected(ictx, snap_name, &protect);
+ if (r < 0) {
+ return r;
+ }
+
+ if (protect && flags & RBD_SNAP_REMOVE_UNPROTECT) {
+ r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name);
+ if (r < 0) {
+ lderr(ictx->cct) << "failed to unprotect snapshot: " << snap_name << dendl;
+ return r;
+ }
+
+ r = is_protected(ictx, snap_name, &protect);
+ if (r < 0) {
+ return r;
+ }
+ if (protect) {
+ lderr(ictx->cct) << "snapshot is still protected after unprotection" << dendl;
+ ceph_abort();
+ }
+ }
+
+ C_SaferCond ctx;
+ ictx->operations->snap_remove(cls::rbd::UserSnapshotNamespace(), snap_name, &ctx);
+
+ r = ctx.wait();
+ return r;
+}
+
+template <typename I>
+int Snapshot<I>::get_timestamp(I *ictx, uint64_t snap_id, struct timespec *timestamp) {
+ auto snap_it = ictx->snap_info.find(snap_id);
+ ceph_assert(snap_it != ictx->snap_info.end());
+ utime_t time = snap_it->second.timestamp;
+ time.to_timespec(timestamp);
+ return 0;
+}
+
+template <typename I>
+int Snapshot<I>::get_limit(I *ictx, uint64_t *limit) {
+ int r = cls_client::snapshot_get_limit(&ictx->md_ctx, ictx->header_oid,
+ limit);
+ if (r == -EOPNOTSUPP) {
+ *limit = UINT64_MAX;
+ r = 0;
+ }
+ return r;
+}
+
+template <typename I>
+int Snapshot<I>::set_limit(I *ictx, uint64_t limit) {
+ return ictx->operations->snap_set_limit(limit);
+}
+
+template <typename I>
+int Snapshot<I>::is_protected(I *ictx, const char *snap_name, bool *protect) {
+ ldout(ictx->cct, 20) << "snap_is_protected " << ictx << " " << snap_name
+ << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ std::shared_lock l{ictx->image_lock};
+ snap_t snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(), snap_name);
+ if (snap_id == CEPH_NOSNAP)
+ return -ENOENT;
+ bool is_unprotected;
+ r = ictx->is_snap_unprotected(snap_id, &is_unprotected);
+ // consider both PROTECTED or UNPROTECTING to be 'protected',
+ // since in either state they can't be deleted
+ *protect = !is_unprotected;
+ return r;
+}
+
+template <typename I>
+int Snapshot<I>::get_namespace(I *ictx, const char *snap_name,
+ cls::rbd::SnapshotNamespace *snap_namespace) {
+ ldout(ictx->cct, 20) << "get_snap_namespace " << ictx << " " << snap_name
+ << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+ std::shared_lock l{ictx->image_lock};
+ snap_t snap_id = ictx->get_snap_id(*snap_namespace, snap_name);
+ if (snap_id == CEPH_NOSNAP)
+ return -ENOENT;
+ r = ictx->get_snap_namespace(snap_id, snap_namespace);
+ return r;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Snapshot<librbd::ImageCtx>;
diff --git a/src/librbd/api/Snapshot.h b/src/librbd/api/Snapshot.h
new file mode 100644
index 000000000..7e06a5a8d
--- /dev/null
+++ b/src/librbd/api/Snapshot.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_SNAPSHOT_H
+#define CEPH_LIBRBD_API_SNAPSHOT_H
+
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Snapshot {
+
+ static int get_group_namespace(ImageCtxT *ictx, uint64_t snap_id,
+ snap_group_namespace_t *group_snap);
+
+ static int get_trash_namespace(ImageCtxT *ictx, uint64_t snap_id,
+ std::string *original_name);
+
+ static int get_mirror_namespace(
+ ImageCtxT *ictx, uint64_t snap_id,
+ snap_mirror_namespace_t *mirror_snap);
+
+ static int get_namespace_type(ImageCtxT *ictx, uint64_t snap_id,
+ snap_namespace_type_t *namespace_type);
+
+ static int remove(ImageCtxT *ictx, uint64_t snap_id);
+
+ static int get_name(ImageCtxT *ictx, uint64_t snap_id, std::string *snap_name);
+
+ static int get_id(ImageCtxT *ictx, const std::string& snap_name, uint64_t *snap_id);
+
+ static int list(ImageCtxT *ictx, std::vector<snap_info_t>& snaps);
+
+ static int exists(ImageCtxT *ictx, const cls::rbd::SnapshotNamespace& snap_namespace,
+ const char *snap_name, bool *exists);
+
+ static int create(ImageCtxT *ictx, const char *snap_name, uint32_t flags,
+ ProgressContext& pctx);
+
+ static int remove(ImageCtxT *ictx, const char *snap_name, uint32_t flags, ProgressContext& pctx);
+
+ static int get_limit(ImageCtxT *ictx, uint64_t *limit);
+
+ static int set_limit(ImageCtxT *ictx, uint64_t limit);
+
+ static int get_timestamp(ImageCtxT *ictx, uint64_t snap_id, struct timespec *timestamp);
+
+ static int is_protected(ImageCtxT *ictx, const char *snap_name, bool *protect);
+
+ static int get_namespace(ImageCtxT *ictx, const char *snap_name,
+ cls::rbd::SnapshotNamespace *snap_namespace);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Snapshot<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_SNAPSHOT_H
diff --git a/src/librbd/api/Trash.cc b/src/librbd/api/Trash.cc
new file mode 100644
index 000000000..b66ac3057
--- /dev/null
+++ b/src/librbd/api/Trash.cc
@@ -0,0 +1,759 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Trash.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/TrashWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/api/DiffIterate.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "librbd/trash/MoveRequest.h"
+#include "librbd/trash/RemoveRequest.h"
+#include <json_spirit/json_spirit.h>
+#include "librbd/journal/DisabledPolicy.h"
+#include "librbd/image/ListWatchersRequest.h"
+#include <experimental/map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Trash: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+template <typename I>
+const typename Trash<I>::TrashImageSources Trash<I>::ALLOWED_RESTORE_SOURCES {
+ cls::rbd::TRASH_IMAGE_SOURCE_USER,
+ cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING,
+ cls::rbd::TRASH_IMAGE_SOURCE_USER_PARENT
+ };
+
+namespace {
+
+template <typename I>
+int disable_mirroring(I *ictx) {
+ ldout(ictx->cct, 10) << dendl;
+
+ C_SaferCond ctx;
+ auto req = mirror::DisableRequest<I>::create(ictx, false, true, &ctx);
+ req->send();
+ int r = ctx.wait();
+ if (r < 0) {
+ lderr(ictx->cct) << "failed to disable mirroring: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int enable_mirroring(IoCtx &io_ctx, const std::string &image_id) {
+ auto cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+
+ uint64_t features;
+ uint64_t incompatible_features;
+ int r = cls_client::get_features(&io_ctx, util::header_name(image_id), true,
+ &features, &incompatible_features);
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve features: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
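+  // only journal-based mirroring is transparently re-enabled on restore;
+  // without the journaling feature there is nothing to re-enable here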
+ if ((features & RBD_FEATURE_JOURNALING) == 0) {
+ return 0;
+ }
+
+ cls::rbd::MirrorMode mirror_mode;
+ r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (mirror_mode != cls::rbd::MIRROR_MODE_POOL) {
+    ldout(cct, 10) << "mirroring mode is not 'pool'" << dendl;
+ return 0;
+ }
+
+ ldout(cct, 10) << dendl;
+
+ AsioEngine asio_engine(io_ctx);
+
+ C_SaferCond ctx;
+ auto req = mirror::EnableRequest<I>::create(
+ io_ctx, image_id, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", false,
+ asio_engine.get_work_queue(), &ctx);
+ req->send();
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to enable mirroring: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int list_trash_image_specs(
+ librados::IoCtx &io_ctx,
+ std::map<std::string, cls::rbd::TrashImageSpec>* trash_image_specs,
+ bool exclude_user_remove_source) {
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << "list_trash_image_specs " << &io_ctx << dendl;
+
+  bool more_entries = false;
+ uint32_t max_read = 1024;
+ std::string last_read;
+ do {
+    std::map<std::string, cls::rbd::TrashImageSpec> trash_entries;
+ int r = cls_client::trash_list(&io_ctx, last_read, max_read,
+ &trash_entries);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error listing rbd trash entries: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ } else if (r == -ENOENT) {
+ break;
+ }
+
+ if (trash_entries.empty()) {
+ break;
+ }
+
+ for (const auto &entry : trash_entries) {
+ if (exclude_user_remove_source &&
+ entry.second.source == cls::rbd::TRASH_IMAGE_SOURCE_REMOVING) {
+ continue;
+ }
+
+ trash_image_specs->insert({entry.first, entry.second});
+ }
+
+ last_read = trash_entries.rbegin()->first;
+ more_entries = (trash_entries.size() >= max_read);
+ } while (more_entries);
+
+ return 0;
+}
+
+} // anonymous namespace
+
+template <typename I>
+int Trash<I>::move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+ const std::string &image_name, const std::string &image_id,
+ uint64_t delay) {
+ ceph_assert(!image_name.empty() && !image_id.empty());
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << &io_ctx << " name=" << image_name << ", id=" << image_id
+ << dendl;
+
+ auto ictx = new I("", image_id, nullptr, io_ctx, false);
+ int r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (r == 0) {
+ cls::rbd::MirrorImage mirror_image;
+ int mirror_r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id,
+ &mirror_image);
+ if (mirror_r == -ENOENT) {
+ ldout(ictx->cct, 10) << "mirroring is not enabled for this image"
+ << dendl;
+ } else if (mirror_r < 0) {
+ lderr(ictx->cct) << "failed to retrieve mirror image: "
+ << cpp_strerror(mirror_r) << dendl;
+ return mirror_r;
+ } else if (mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ // a remote rbd-mirror might own the exclusive-lock on this image
+ // and therefore we need to disable mirroring so that it closes the image
+ r = disable_mirroring<I>(ictx);
+ if (r < 0) {
+ ictx->state->close();
+ return r;
+ }
+ }
+
+ if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+ std::unique_lock image_locker{ictx->image_lock};
+ ictx->set_journal_policy(new journal::DisabledPolicy());
+ }
+
+ ictx->owner_lock.lock_shared();
+ if (ictx->exclusive_lock != nullptr) {
+ ictx->exclusive_lock->block_requests(0);
+
+ r = ictx->operations->prepare_image_update(
+ exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true);
+ if (r < 0) {
+ lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl;
+ ictx->owner_lock.unlock_shared();
+ ictx->state->close();
+ return -EBUSY;
+ }
+ }
+ ictx->owner_lock.unlock_shared();
+
+ ictx->image_lock.lock_shared();
+ if (!ictx->migration_info.empty()) {
+ lderr(cct) << "cannot move migrating image to trash" << dendl;
+ ictx->image_lock.unlock_shared();
+ ictx->state->close();
+ return -EBUSY;
+ }
+ ictx->image_lock.unlock_shared();
+
+ if (mirror_r >= 0 &&
+ mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ r = disable_mirroring<I>(ictx);
+ if (r < 0) {
+ ictx->state->close();
+ return r;
+ }
+ }
+
+ ictx->state->close();
+ }
+
+ utime_t delete_time{ceph_clock_now()};
+ utime_t deferment_end_time{delete_time};
+ deferment_end_time += delay;
+ cls::rbd::TrashImageSpec trash_image_spec{
+ static_cast<cls::rbd::TrashImageSource>(source), image_name,
+ delete_time, deferment_end_time};
+
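+  // two-phase move: the trash entry is created in MOVING state while the
+  // image is unlinked from the directory, then flipped to NORMAL below so
+  // that it becomes visible to restore/remove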
+ trash_image_spec.state = cls::rbd::TRASH_IMAGE_STATE_MOVING;
+ C_SaferCond ctx;
+ auto req = trash::MoveRequest<I>::create(io_ctx, image_id, trash_image_spec,
+ &ctx);
+ req->send();
+
+ r = ctx.wait();
+ trash_image_spec.state = cls::rbd::TRASH_IMAGE_STATE_NORMAL;
+ int ret = cls_client::trash_state_set(&io_ctx, image_id,
+ trash_image_spec.state,
+ cls::rbd::TRASH_IMAGE_STATE_MOVING);
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ lderr(cct) << "error setting trash image state: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond notify_ctx;
+ TrashWatcher<I>::notify_image_added(io_ctx, image_id, trash_image_spec,
+ &notify_ctx);
+ r = notify_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Trash<I>::move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+ const std::string &image_name, uint64_t delay) {
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << &io_ctx << " name=" << image_name << dendl;
+
+ // try to get image id from the directory
+ std::string image_id;
+ int r = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY, image_name,
+ &image_id);
+ if (r == -ENOENT) {
+ r = io_ctx.stat(util::old_header_name(image_name), nullptr, nullptr);
+ if (r == 0) {
+ // cannot move V1 image to trash
+ ldout(cct, 10) << "cannot move v1 image to trash" << dendl;
+ return -EOPNOTSUPP;
+ }
+
+ // search for an interrupted trash move request
+ std::map<std::string, cls::rbd::TrashImageSpec> trash_image_specs;
+ int r = list_trash_image_specs(io_ctx, &trash_image_specs, true);
+ if (r < 0) {
+ return r;
+ }
+
+ std::experimental::erase_if(
+ trash_image_specs, [image_name](const auto& pair) {
+ const auto& spec = pair.second;
+ return (spec.source != cls::rbd::TRASH_IMAGE_SOURCE_USER ||
+ spec.state != cls::rbd::TRASH_IMAGE_STATE_MOVING ||
+ spec.name != image_name);
+ });
+ if (trash_image_specs.empty()) {
+ return -ENOENT;
+ }
+
+ image_id = trash_image_specs.begin()->first;
+ ldout(cct, 15) << "derived image id " << image_id << " from existing "
+ << "trash entry" << dendl;
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve image id: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (image_name.empty() || image_id.empty()) {
+ lderr(cct) << "invalid image name/id" << dendl;
+ return -EINVAL;
+ }
+
+ return Trash<I>::move(io_ctx, source, image_name, image_id, delay);
+}
+
+template <typename I>
+int Trash<I>::get(IoCtx &io_ctx, const std::string &id,
+ trash_image_info_t *info) {
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << __func__ << " " << &io_ctx << dendl;
+
+ cls::rbd::TrashImageSpec spec;
+ int r = cls_client::trash_get(&io_ctx, id, &spec);
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "error retrieving trash entry: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ rbd_trash_image_source_t source = static_cast<rbd_trash_image_source_t>(
+ spec.source);
+ *info = trash_image_info_t{id, spec.name, source, spec.deletion_time.sec(),
+ spec.deferment_end_time.sec()};
+ return 0;
+}
+
+template <typename I>
+int Trash<I>::list(IoCtx &io_ctx, vector<trash_image_info_t> &entries,
+ bool exclude_user_remove_source) {
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << __func__ << " " << &io_ctx << dendl;
+
+ std::map<std::string, cls::rbd::TrashImageSpec> trash_image_specs;
+ int r = list_trash_image_specs(io_ctx, &trash_image_specs,
+ exclude_user_remove_source);
+ if (r < 0) {
+ return r;
+ }
+
+ entries.reserve(trash_image_specs.size());
+ for (const auto& [image_id, spec] : trash_image_specs) {
+ rbd_trash_image_source_t source =
+ static_cast<rbd_trash_image_source_t>(spec.source);
+ entries.push_back({image_id, spec.name, source,
+ spec.deletion_time.sec(),
+ spec.deferment_end_time.sec()});
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Trash<I>::purge(IoCtx& io_ctx, time_t expire_ts,
+ float threshold, ProgressContext& pctx) {
+ auto *cct((CephContext *) io_ctx.cct());
+ ldout(cct, 20) << &io_ctx << dendl;
+
+ std::vector<librbd::trash_image_info_t> trash_entries;
+ int r = librbd::api::Trash<I>::list(io_ctx, trash_entries, true);
+ if (r < 0) {
+ return r;
+ }
+
+ trash_entries.erase(
+ std::remove_if(trash_entries.begin(), trash_entries.end(),
+ [](librbd::trash_image_info_t info) {
+ return info.source != RBD_TRASH_IMAGE_SOURCE_USER &&
+ info.source != RBD_TRASH_IMAGE_SOURCE_USER_PARENT;
+ }),
+ trash_entries.end());
+
+ std::set<std::string> to_be_removed;
+ if (threshold != -1) {
+ if (threshold < 0 || threshold > 1) {
+ lderr(cct) << "argument 'threshold' is out of valid range"
+ << dendl;
+ return -EINVAL;
+ }
+
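+    // ask the cluster for per-pool usage ('ceph df') so that, for any data
+    // pool above the threshold, trash entries can be reclaimed until enough
+    // bytes would be freed to drop usage back under the threshold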
+ librados::bufferlist inbl;
+ librados::bufferlist outbl;
+ std::string pool_name = io_ctx.get_pool_name();
+
+ librados::Rados rados(io_ctx);
+ rados.mon_command(R"({"prefix": "df", "format": "json"})", inbl,
+ &outbl, nullptr);
+
+ json_spirit::mValue json;
+ if (!json_spirit::read(outbl.to_str(), json)) {
+ lderr(cct) << "ceph df json output could not be parsed"
+ << dendl;
+ return -EBADMSG;
+ }
+
+ json_spirit::mArray arr = json.get_obj()["pools"].get_array();
+
+ double pool_percent_used = 0;
+ uint64_t pool_total_bytes = 0;
+
+ std::map<std::string, std::vector<std::string>> datapools;
+
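+    // oldest deferment times first, so the entries closest to (or past)
+    // expiry are reclaimed before newer ones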
+ std::sort(trash_entries.begin(), trash_entries.end(),
+ [](librbd::trash_image_info_t a, librbd::trash_image_info_t b) {
+ return a.deferment_end_time < b.deferment_end_time;
+ }
+ );
+
+ for (const auto &entry : trash_entries) {
+ int64_t data_pool_id = -1;
+ r = cls_client::get_data_pool(&io_ctx, util::header_name(entry.id),
+ &data_pool_id);
+ if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) {
+ lderr(cct) << "failed to query data pool: " << cpp_strerror(r) << dendl;
+ return r;
+ } else if (data_pool_id == -1) {
+ data_pool_id = io_ctx.get_id();
+ }
+
+ if (data_pool_id != io_ctx.get_id()) {
+ librados::IoCtx data_io_ctx;
+ r = util::create_ioctx(io_ctx, "image", data_pool_id,
+ {}, &data_io_ctx);
+ if (r < 0) {
+ lderr(cct) << "error accessing data pool" << dendl;
+ continue;
+ }
+ auto data_pool = data_io_ctx.get_pool_name();
+ datapools[data_pool].push_back(entry.id);
+ } else {
+ datapools[pool_name].push_back(entry.id);
+ }
+ }
+
+ uint64_t bytes_to_free = 0;
+
+    for (size_t i = 0; i < arr.size(); ++i) {
+ json_spirit::mObject obj = arr[i].get_obj();
+ std::string name = obj.find("name")->second.get_str();
+ auto img = datapools.find(name);
+ if (img != datapools.end()) {
+ json_spirit::mObject stats = arr[i].get_obj()["stats"].get_obj();
+ pool_percent_used = stats["percent_used"].get_real();
+ if (pool_percent_used <= threshold) continue;
+
+ bytes_to_free = 0;
+
+ pool_total_bytes = stats["max_avail"].get_uint64() +
+ stats["bytes_used"].get_uint64();
+
+ auto bytes_threshold = (uint64_t) (pool_total_bytes *
+ (pool_percent_used - threshold));
+
+ for (const auto &it : img->second) {
+ auto ictx = new I("", it, nullptr, io_ctx, false);
+ r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+ if (r == -ENOENT) {
+ continue;
+ } else if (r < 0) {
+ lderr(cct) << "failed to open image " << it << ": "
+ << cpp_strerror(r) << dendl;
+ }
+
+ r = librbd::api::DiffIterate<I>::diff_iterate(
+ ictx, cls::rbd::UserSnapshotNamespace(), nullptr, 0, ictx->size,
+ false, true,
+ [](uint64_t offset, size_t len, int exists, void *arg) {
+ auto *to_free = reinterpret_cast<uint64_t *>(arg);
+ if (exists)
+ (*to_free) += len;
+ return 0;
+ }, &bytes_to_free);
+
+ ictx->state->close();
+ if (r < 0) {
+ lderr(cct) << "failed to calculate disk usage for image " << it
+ << ": " << cpp_strerror(r) << dendl;
+ continue;
+ }
+
+ to_be_removed.insert(it);
+ if (bytes_to_free >= bytes_threshold) {
+ break;
+ }
+ }
+ }
+ }
+
+ if (bytes_to_free == 0) {
+ ldout(cct, 10) << "pool usage is lower than or equal to "
+ << (threshold * 100)
+ << "%" << dendl;
+ return 0;
+ }
+ }
+
+ if (expire_ts == 0) {
+ struct timespec now;
+ clock_gettime(CLOCK_REALTIME, &now);
+ expire_ts = now.tv_sec;
+ }
+
+ for (const auto &entry : trash_entries) {
+ if (expire_ts >= entry.deferment_end_time) {
+ to_be_removed.insert(entry.id);
+ }
+ }
+
+ NoOpProgressContext remove_pctx;
+ uint64_t list_size = to_be_removed.size(), i = 0;
+ int remove_err = 1;
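+  // sweep repeatedly: removing one image can unblock another (e.g. a child
+  // blocking its parent), so keep iterating as long as the previous pass
+  // removed at least one image (remove_err == 1 acts as a progress flag)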
+ while (!to_be_removed.empty() && remove_err == 1) {
+ remove_err = 0;
+ for (auto it = to_be_removed.begin(); it != to_be_removed.end(); ) {
+ trash_image_info_t trash_info;
+ r = Trash<I>::get(io_ctx, *it, &trash_info);
+ if (r == -ENOENT) {
+ // likely RBD_TRASH_IMAGE_SOURCE_USER_PARENT image removed as a side
+        // effect of a preceding remove (last child detach)
+ pctx.update_progress(++i, list_size);
+ it = to_be_removed.erase(it);
+ continue;
+ } else if (r < 0) {
+ lderr(cct) << "error getting image id " << *it
+ << " info: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = Trash<I>::remove(io_ctx, *it, true, remove_pctx);
+ if (r == -ENOTEMPTY || r == -EBUSY || r == -EMLINK || r == -EUCLEAN) {
+ if (!remove_err) {
+ remove_err = r;
+ }
+ ++it;
+ continue;
+ } else if (r < 0) {
+ lderr(cct) << "error removing image id " << *it
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ pctx.update_progress(++i, list_size);
+ it = to_be_removed.erase(it);
+ remove_err = 1;
+ }
+ ldout(cct, 20) << "remove_err=" << remove_err << dendl;
+ }
+
+ if (!to_be_removed.empty()) {
+ ceph_assert(remove_err < 0);
+ ldout(cct, 10) << "couldn't remove " << to_be_removed.size()
+ << " expired images" << dendl;
+ return remove_err;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Trash<I>::remove(IoCtx &io_ctx, const std::string &image_id, bool force,
+ ProgressContext& prog_ctx) {
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << "trash_remove " << &io_ctx << " " << image_id
+ << " " << force << dendl;
+
+ cls::rbd::TrashImageSpec trash_spec;
+ int r = cls_client::trash_get(&io_ctx, image_id, &trash_spec);
+ if (r < 0) {
+ lderr(cct) << "error getting image id " << image_id
+ << " info from trash: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ utime_t now = ceph_clock_now();
+ if (now < trash_spec.deferment_end_time && !force) {
+ lderr(cct) << "error: deferment time has not expired." << dendl;
+ return -EPERM;
+ }
+ if (trash_spec.state == cls::rbd::TRASH_IMAGE_STATE_MOVING) {
+ lderr(cct) << "error: image is pending moving to the trash."
+ << dendl;
+ return -EUCLEAN;
+ } else if (trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL &&
+ trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+ lderr(cct) << "error: image is pending restoration." << dendl;
+ return -EBUSY;
+ }
+
+ AsioEngine asio_engine(io_ctx);
+
+ C_SaferCond cond;
+ auto req = librbd::trash::RemoveRequest<I>::create(
+ io_ctx, image_id, asio_engine.get_work_queue(), force, prog_ctx, &cond);
+ req->send();
+
+ r = cond.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond notify_ctx;
+ TrashWatcher<I>::notify_image_removed(io_ctx, image_id, &notify_ctx);
+ r = notify_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return 0;
+}
+
+template <typename I>
+int Trash<I>::restore(librados::IoCtx &io_ctx,
+ const TrashImageSources& trash_image_sources,
+ const std::string &image_id,
+ const std::string &image_new_name) {
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << "trash_restore " << &io_ctx << " " << image_id << " "
+ << image_new_name << dendl;
+
+ cls::rbd::TrashImageSpec trash_spec;
+ int r = cls_client::trash_get(&io_ctx, image_id, &trash_spec);
+ if (r < 0) {
+ lderr(cct) << "error getting image id " << image_id
+ << " info from trash: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (trash_image_sources.count(trash_spec.source) == 0) {
+ lderr(cct) << "Current trash source '" << trash_spec.source << "' "
+ << "does not match expected: "
+ << trash_image_sources << dendl;
+ return -EINVAL;
+ }
+
+ std::string image_name = image_new_name;
+ if (trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL &&
+ trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_RESTORING) {
+ lderr(cct) << "error restoring image id " << image_id
+ << ", which is pending deletion" << dendl;
+ return -EBUSY;
+ }
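+
+  // flag the entry as RESTORING via compare-and-swap from NORMAL; any
+  // failure below rolls the state back so the image remains restorable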
+ r = cls_client::trash_state_set(&io_ctx, image_id,
+ cls::rbd::TRASH_IMAGE_STATE_RESTORING,
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ lderr(cct) << "error setting trash image state: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (image_name.empty()) {
+ // if user didn't specify a new name, let's try using the old name
+ image_name = trash_spec.name;
+ ldout(cct, 20) << "restoring image id " << image_id << " with name "
+ << image_name << dendl;
+ }
+
+ // check if no image exists with the same name
+ bool create_id_obj = true;
+ std::string existing_id;
+ r = cls_client::get_id(&io_ctx, util::id_obj_name(image_name), &existing_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error checking if image " << image_name << " exists: "
+ << cpp_strerror(r) << dendl;
+ int ret = cls_client::trash_state_set(&io_ctx, image_id,
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL,
+ cls::rbd::TRASH_IMAGE_STATE_RESTORING);
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ lderr(cct) << "error setting trash image state: "
+ << cpp_strerror(ret) << dendl;
+ }
+ return r;
+  } else if (r != -ENOENT) {
+    // check whether we are recovering from an incomplete restore
+ if (existing_id != image_id) {
+ ldout(cct, 2) << "an image with the same name already exists" << dendl;
+ int r2 = cls_client::trash_state_set(&io_ctx, image_id,
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL,
+ cls::rbd::TRASH_IMAGE_STATE_RESTORING);
+ if (r2 < 0 && r2 != -EOPNOTSUPP) {
+ lderr(cct) << "error setting trash image state: "
+ << cpp_strerror(r2) << dendl;
+ }
+ return -EEXIST;
+ }
+ create_id_obj = false;
+ }
+
+ if (create_id_obj) {
+ ldout(cct, 2) << "adding id object" << dendl;
+ librados::ObjectWriteOperation op;
+ op.create(true);
+ cls_client::set_id(&op, image_id);
+ r = io_ctx.operate(util::id_obj_name(image_name), &op);
+ if (r < 0) {
+ lderr(cct) << "error adding id object for image " << image_name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ ldout(cct, 2) << "adding rbd image to v2 directory..." << dendl;
+ r = cls_client::dir_add_image(&io_ctx, RBD_DIRECTORY, image_name,
+ image_id);
+ if (r < 0 && r != -EEXIST) {
+ lderr(cct) << "error adding image to v2 directory: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = enable_mirroring<I>(io_ctx, image_id);
+ if (r < 0) {
+ // not fatal -- ignore
+ }
+
+ ldout(cct, 2) << "removing image from trash..." << dendl;
+ r = cls_client::trash_remove(&io_ctx, image_id);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error removing image id " << image_id << " from trash: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ C_SaferCond notify_ctx;
+ TrashWatcher<I>::notify_image_removed(io_ctx, image_id, &notify_ctx);
+ r = notify_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Trash<librbd::ImageCtx>;
diff --git a/src/librbd/api/Trash.h b/src/librbd/api/Trash.h
new file mode 100644
index 000000000..66f819dfa
--- /dev/null
+++ b/src/librbd/api/Trash.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_API_TRASH_H
+#define LIBRBD_API_TRASH_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <set>
+#include <string>
+#include <vector>
+
+namespace librbd {
+
+class ProgressContext;
+
+struct ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Trash {
+ typedef std::set<cls::rbd::TrashImageSource> TrashImageSources;
+ static const TrashImageSources ALLOWED_RESTORE_SOURCES;
+
+ static int move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+ const std::string &image_name, uint64_t delay);
+ static int move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
+ const std::string &image_name, const std::string &image_id,
+ uint64_t delay);
+ static int get(librados::IoCtx &io_ctx, const std::string &id,
+ trash_image_info_t *info);
+ static int list(librados::IoCtx &io_ctx,
+ std::vector<trash_image_info_t> &entries,
+ bool exclude_user_remove_source);
+ static int purge(IoCtx& io_ctx, time_t expire_ts,
+ float threshold, ProgressContext& pctx);
+ static int remove(librados::IoCtx &io_ctx, const std::string &image_id,
+ bool force, ProgressContext& prog_ctx);
+ static int restore(librados::IoCtx &io_ctx,
+ const TrashImageSources& trash_image_sources,
+ const std::string &image_id,
+ const std::string &image_new_name);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Trash<librbd::ImageCtx>;
+
+#endif // LIBRBD_API_TRASH_H
diff --git a/src/librbd/api/Utils.cc b/src/librbd/api/Utils.cc
new file mode 100644
index 000000000..1ffb2f174
--- /dev/null
+++ b/src/librbd/api/Utils.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Utils.h"
+#include "common/dout.h"
+
+#if defined(HAVE_LIBCRYPTSETUP)
+#include "librbd/crypto/luks/EncryptionFormat.h"
+#endif
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::util: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+namespace util {
+
+template <typename I>
+int create_encryption_format(
+ CephContext* cct, encryption_format_t format,
+ encryption_options_t opts, size_t opts_size, bool c_api,
+ crypto::EncryptionFormat<I>** result_format) {
+ size_t expected_opts_size;
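+  // each branch records the options-struct size its API flavor (C vs C++)
+  // expects; *result_format is only allocated when opts_size matches, and a
+  // mismatch is reported uniformly after the switch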
+ switch (format) {
+#if defined(HAVE_LIBCRYPTSETUP)
+ case RBD_ENCRYPTION_FORMAT_LUKS1: {
+ if (c_api) {
+ expected_opts_size = sizeof(rbd_encryption_luks1_format_options_t);
+ if (expected_opts_size == opts_size) {
+ auto c_opts = (rbd_encryption_luks1_format_options_t*)opts;
+ *result_format = new crypto::luks::LUKS1EncryptionFormat<I>(
+ c_opts->alg, {c_opts->passphrase, c_opts->passphrase_size});
+ }
+ } else {
+ expected_opts_size = sizeof(encryption_luks1_format_options_t);
+ if (expected_opts_size == opts_size) {
+ auto cpp_opts = (encryption_luks1_format_options_t*)opts;
+ *result_format = new crypto::luks::LUKS1EncryptionFormat<I>(
+ cpp_opts->alg, std::move(cpp_opts->passphrase));
+ }
+ }
+ break;
+ }
+ case RBD_ENCRYPTION_FORMAT_LUKS2: {
+ if (c_api) {
+ expected_opts_size = sizeof(rbd_encryption_luks2_format_options_t);
+ if (expected_opts_size == opts_size) {
+ auto c_opts = (rbd_encryption_luks2_format_options_t*)opts;
+ *result_format = new crypto::luks::LUKS2EncryptionFormat<I>(
+ c_opts->alg, {c_opts->passphrase, c_opts->passphrase_size});
+ }
+ } else {
+ expected_opts_size = sizeof(encryption_luks2_format_options_t);
+ if (expected_opts_size == opts_size) {
+ auto cpp_opts = (encryption_luks2_format_options_t*)opts;
+ *result_format = new crypto::luks::LUKS2EncryptionFormat<I>(
+ cpp_opts->alg, std::move(cpp_opts->passphrase));
+ }
+ }
+ break;
+ }
+#endif
+ default:
+ lderr(cct) << "unsupported encryption format: " << format << dendl;
+ return -ENOTSUP;
+ }
+
+ if (expected_opts_size != opts_size) {
+ lderr(cct) << "expected opts_size: " << expected_opts_size << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+} // namespace util
+} // namespace api
+} // namespace librbd
+
+template int librbd::api::util::create_encryption_format(
+ CephContext* cct, encryption_format_t format, encryption_options_t opts,
+ size_t opts_size, bool c_api,
+ crypto::EncryptionFormat<librbd::ImageCtx>** result_format);
diff --git a/src/librbd/api/Utils.h b/src/librbd/api/Utils.h
new file mode 100644
index 000000000..8f8c22290
--- /dev/null
+++ b/src/librbd/api/Utils.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_UTILS_H
+#define CEPH_LIBRBD_API_UTILS_H
+
+#include "include/rbd/librbd.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/crypto/EncryptionFormat.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+namespace util {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+int create_encryption_format(
+ CephContext* cct, encryption_format_t format,
+ encryption_options_t opts, size_t opts_size, bool c_api,
+ crypto::EncryptionFormat<ImageCtxT>** result_format);
+
+} // namespace util
+} // namespace api
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_API_UTILS_H
diff --git a/src/librbd/asio/ContextWQ.cc b/src/librbd/asio/ContextWQ.cc
new file mode 100644
index 000000000..4f6c72770
--- /dev/null
+++ b/src/librbd/asio/ContextWQ.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/asio/ContextWQ.h"
+#include "include/Context.h"
+#include "common/Cond.h"
+#include "common/dout.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::asio::ContextWQ: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace asio {
+
+ContextWQ::ContextWQ(CephContext* cct, boost::asio::io_context& io_context)
+ : m_cct(cct), m_io_context(io_context),
+ m_strand(std::make_unique<boost::asio::io_context::strand>(io_context)),
+ m_queued_ops(0) {
+ ldout(m_cct, 20) << dendl;
+}
+
+ContextWQ::~ContextWQ() {
+ ldout(m_cct, 20) << dendl;
+ drain();
+ m_strand.reset();
+}
+
+void ContextWQ::drain() {
+ ldout(m_cct, 20) << dendl;
+ C_SaferCond ctx;
+ drain_handler(&ctx);
+ ctx.wait();
+}
+
+void ContextWQ::drain_handler(Context* ctx) {
+ if (m_queued_ops == 0) {
+ ctx->complete(0);
+ return;
+ }
+
+ // new items might be queued while we are trying to drain, so we
+ // might need to post the handler multiple times
+ boost::asio::post(*m_strand, [this, ctx]() { drain_handler(ctx); });
+}
+
+} // namespace asio
+} // namespace librbd
diff --git a/src/librbd/asio/ContextWQ.h b/src/librbd/asio/ContextWQ.h
new file mode 100644
index 000000000..85c254161
--- /dev/null
+++ b/src/librbd/asio/ContextWQ.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_ASIO_CONTEXT_WQ_H
+#define CEPH_LIBRBD_ASIO_CONTEXT_WQ_H
+
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include <atomic>
+#include <memory>
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/io_context_strand.hpp>
+#include <boost/asio/post.hpp>
+
+namespace librbd {
+namespace asio {
+
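+// Completes queued Contexts on the shared boost::asio io_context, serialized
+// through a strand for legacy callers that assume single-threaded dispatch.
+// A minimal usage sketch (hypothetical caller, for illustration only):
+//
+//   ContextWQ wq(cct, io_context);
+//   wq.queue(new LambdaContext([](int r) { /* runs on the strand */ }), 0);
+//   wq.drain();  // blocks until every queued context has completed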
+class ContextWQ {
+public:
+ explicit ContextWQ(CephContext* cct, boost::asio::io_context& io_context);
+ ~ContextWQ();
+
+ void drain();
+
+ void queue(Context *ctx, int r = 0) {
+ ++m_queued_ops;
+
+ // ensure all legacy ContextWQ users are dispatched sequentially for
+ // backwards compatibility (i.e. might not be concurrent thread-safe)
+ boost::asio::post(*m_strand, [this, ctx, r]() {
+ ctx->complete(r);
+
+ ceph_assert(m_queued_ops > 0);
+ --m_queued_ops;
+ });
+ }
+
+private:
+ CephContext* m_cct;
+ boost::asio::io_context& m_io_context;
+ std::unique_ptr<boost::asio::io_context::strand> m_strand;
+
+ std::atomic<uint64_t> m_queued_ops;
+
+ void drain_handler(Context* ctx);
+
+};
+
+} // namespace asio
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_ASIO_CONTEXT_WQ_H
diff --git a/src/librbd/asio/Utils.h b/src/librbd/asio/Utils.h
new file mode 100644
index 000000000..2fbbb5846
--- /dev/null
+++ b/src/librbd/asio/Utils.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_ASIO_UTILS_H
+#define CEPH_LIBRBD_ASIO_UTILS_H
+
+#include "include/Context.h"
+#include "include/rados/librados_fwd.hpp"
+#include <boost/system/error_code.hpp>
+
+namespace librbd {
+namespace asio {
+namespace util {
+
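+// These adapters bridge boost::asio completion handlers to Ceph conventions
+// by negating the boost error code into a -errno value. Illustrative use,
+// assuming a boost::asio timer whose handler takes an error_code:
+//
+//   timer.async_wait(util::get_context_adapter(on_finish));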
+template <typename T>
+auto get_context_adapter(T&& t) {
+ return [t = std::move(t)](boost::system::error_code ec) {
+ t->complete(-ec.value());
+ };
+}
+
+template <typename T>
+auto get_callback_adapter(T&& t) {
+ return [t = std::move(t)](boost::system::error_code ec, auto&& ... args) {
+ t(-ec.value(), std::forward<decltype(args)>(args)...);
+ };
+}
+
+} // namespace util
+} // namespace asio
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_ASIO_UTILS_H
diff --git a/src/librbd/cache/ImageWriteback.cc b/src/librbd/cache/ImageWriteback.cc
new file mode 100644
index 000000000..dcbba42ba
--- /dev/null
+++ b/src/librbd/cache/ImageWriteback.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ImageWriteback.h"
+#include "include/buffer.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ReadResult.h"
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageWriteback: " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+
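+// Each aio_* method below re-injects the request into the image dispatch
+// chain at IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, i.e. just below the
+// writeback cache itself, so cache-initiated I/O cannot re-enter the cache.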
+template <typename I>
+ImageWriteback<I>::ImageWriteback(I &image_ctx) : m_image_ctx(image_ctx) {
+}
+
+template <typename I>
+void ImageWriteback<I>::aio_read(Extents &&image_extents, bufferlist *bl,
+ int fadvise_flags, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, image_ctx, io::AIO_TYPE_READ);
+ ZTracer::Trace trace;
+ auto req = io::ImageDispatchSpec::create_read(
+ *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp,
+ std::move(image_extents), io::ReadResult{bl},
+ image_ctx->get_data_io_context(),
+ fadvise_flags, 0, trace);
+ req->send();
+}
+
+template <typename I>
+void ImageWriteback<I>::aio_write(Extents &&image_extents,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, image_ctx, io::AIO_TYPE_WRITE);
+ ZTracer::Trace trace;
+ auto req = io::ImageDispatchSpec::create_write(
+ *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp,
+ std::move(image_extents), std::move(bl),
+ image_ctx->get_data_io_context(), fadvise_flags, trace);
+ req->send();
+}
+
+template <typename I>
+void ImageWriteback<I>::aio_discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "offset=" << offset << ", "
+ << "length=" << length << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, image_ctx, io::AIO_TYPE_DISCARD);
+ ZTracer::Trace trace;
+ auto req = io::ImageDispatchSpec::create_discard(
+ *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp, offset,
+ length, discard_granularity_bytes,
+ image_ctx->get_data_io_context(), trace);
+ req->send();
+}
+
+template <typename I>
+void ImageWriteback<I>::aio_flush(io::FlushSource flush_source,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "on_finish=" << on_finish << dendl;
+
+ ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, image_ctx, io::AIO_TYPE_FLUSH);
+
+ ZTracer::Trace trace;
+ auto req = io::ImageDispatchSpec::create_flush(
+ *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp,
+ flush_source, trace);
+ req->send();
+}
+
+template <typename I>
+void ImageWriteback<I>::aio_writesame(uint64_t offset, uint64_t length,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "offset=" << offset << ", "
+ << "length=" << length << ", "
+ << "data_len=" << bl.length() << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, image_ctx, io::AIO_TYPE_WRITESAME);
+ ZTracer::Trace trace;
+ auto req = io::ImageDispatchSpec::create_write_same(
+ *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp, offset,
+ length, std::move(bl), image_ctx->get_data_io_context(),
+ fadvise_flags, trace);
+ req->send();
+}
+
+template <typename I>
+void ImageWriteback<I>::aio_compare_and_write(Extents &&image_extents,
+ ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, image_ctx, io::AIO_TYPE_COMPARE_AND_WRITE);
+ ZTracer::Trace trace;
+ auto req = io::ImageDispatchSpec::create_compare_and_write(
+ *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp,
+ std::move(image_extents), std::move(cmp_bl), std::move(bl),
+ mismatch_offset, image_ctx->get_data_io_context(),
+ fadvise_flags, trace);
+ req->send();
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::ImageWriteback<librbd::ImageCtx>;
+
diff --git a/src/librbd/cache/ImageWriteback.h b/src/librbd/cache/ImageWriteback.h
new file mode 100644
index 000000000..3f62391e4
--- /dev/null
+++ b/src/librbd/cache/ImageWriteback.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK
+#define CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include "librbd/io/Types.h"
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+class ImageWritebackInterface {
+public:
+ typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+ virtual ~ImageWritebackInterface() {
+ }
+ virtual void aio_read(Extents &&image_extents, ceph::bufferlist *bl,
+ int fadvise_flags, Context *on_finish) = 0;
+ virtual void aio_write(Extents &&image_extents, ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish) = 0;
+ virtual void aio_discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes, Context *on_finish) = 0;
+  virtual void aio_flush(io::FlushSource flush_source, Context *on_finish) = 0;
+ virtual void aio_writesame(uint64_t offset, uint64_t length,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish) = 0;
+ virtual void aio_compare_and_write(Extents &&image_extents,
+ ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags, Context *on_finish) = 0;
+};
+
+/**
+ * client-side, image extent cache writeback handler
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageWriteback : public ImageWritebackInterface {
+public:
+ using ImageWritebackInterface::Extents;
+
+ explicit ImageWriteback(ImageCtxT &image_ctx);
+
+ void aio_read(Extents &&image_extents, ceph::bufferlist *bl,
+ int fadvise_flags, Context *on_finish);
+ void aio_write(Extents &&image_extents, ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish);
+ void aio_discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes, Context *on_finish);
+ void aio_flush(io::FlushSource flush_source, Context *on_finish);
+ void aio_writesame(uint64_t offset, uint64_t length,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish);
+ void aio_compare_and_write(Extents &&image_extents,
+ ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags, Context *on_finish);
+private:
+ ImageCtxT &m_image_ctx;
+
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::ImageWriteback<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK
diff --git a/src/librbd/cache/ObjectCacherObjectDispatch.cc b/src/librbd/cache/ObjectCacherObjectDispatch.cc
new file mode 100644
index 000000000..81d5a7188
--- /dev/null
+++ b/src/librbd/cache/ObjectCacherObjectDispatch.cc
@@ -0,0 +1,467 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/ObjectCacherObjectDispatch.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/ObjectCacherWriteback.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/Utils.h"
+#include "osd/osd_types.h"
+#include "osdc/WritebackHandler.h"
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::ObjectCacherObjectDispatch: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+
+using librbd::util::data_object_name;
+
+namespace {
+
+typedef std::vector<ObjectExtent> ObjectExtents;
+
+} // anonymous namespace
+
+template <typename I>
+struct ObjectCacherObjectDispatch<I>::C_InvalidateCache : public Context {
+ ObjectCacherObjectDispatch* dispatcher;
+ bool purge_on_error;
+ Context *on_finish;
+
+ C_InvalidateCache(ObjectCacherObjectDispatch* dispatcher,
+ bool purge_on_error, Context *on_finish)
+ : dispatcher(dispatcher), purge_on_error(purge_on_error),
+ on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ ceph_assert(ceph_mutex_is_locked(dispatcher->m_cache_lock));
+ auto cct = dispatcher->m_image_ctx->cct;
+
+ if (r == -EBLOCKLISTED) {
+ lderr(cct) << "blocklisted during flush (purging)" << dendl;
+ dispatcher->m_object_cacher->purge_set(dispatcher->m_object_set);
+ } else if (r < 0 && purge_on_error) {
+ lderr(cct) << "failed to invalidate cache (purging): "
+ << cpp_strerror(r) << dendl;
+ dispatcher->m_object_cacher->purge_set(dispatcher->m_object_set);
+ } else if (r != 0) {
+ lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r) << dendl;
+ }
+
+ auto unclean = dispatcher->m_object_cacher->release_set(
+ dispatcher->m_object_set);
+ if (unclean == 0) {
+ r = 0;
+ } else {
+ lderr(cct) << "could not release all objects from cache: "
+ << unclean << " bytes remain" << dendl;
+ if (r == 0) {
+ r = -EBUSY;
+ }
+ }
+
+ on_finish->complete(r);
+ }
+};
+
+template <typename I>
+ObjectCacherObjectDispatch<I>::ObjectCacherObjectDispatch(
+ I* image_ctx, size_t max_dirty, bool writethrough_until_flush)
+ : m_image_ctx(image_ctx), m_max_dirty(max_dirty),
+ m_writethrough_until_flush(writethrough_until_flush),
+ m_cache_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::ObjectCacherObjectDispatch::cache_lock", this))) {
+ ceph_assert(m_image_ctx->data_ctx.is_valid());
+}
+
+template <typename I>
+ObjectCacherObjectDispatch<I>::~ObjectCacherObjectDispatch() {
+ delete m_object_cacher;
+ delete m_object_set;
+
+ delete m_writeback_handler;
+}
+
+template <typename I>
+void ObjectCacherObjectDispatch<I>::init() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ m_cache_lock.lock();
+ ldout(cct, 5) << "enabling caching..." << dendl;
+ m_writeback_handler = new ObjectCacherWriteback(m_image_ctx, m_cache_lock);
+
+ auto init_max_dirty = m_max_dirty;
+ if (m_writethrough_until_flush) {
+ init_max_dirty = 0;
+ }
+
+ auto cache_size =
+ m_image_ctx->config.template get_val<Option::size_t>("rbd_cache_size");
+ auto target_dirty =
+ m_image_ctx->config.template get_val<Option::size_t>("rbd_cache_target_dirty");
+ auto max_dirty_age =
+ m_image_ctx->config.template get_val<double>("rbd_cache_max_dirty_age");
+ auto block_writes_upfront =
+ m_image_ctx->config.template get_val<bool>("rbd_cache_block_writes_upfront");
+ auto max_dirty_object =
+ m_image_ctx->config.template get_val<uint64_t>("rbd_cache_max_dirty_object");
+
+ ldout(cct, 5) << "Initial cache settings:"
+ << " size=" << cache_size
+ << " num_objects=" << 10
+ << " max_dirty=" << init_max_dirty
+ << " target_dirty=" << target_dirty
+ << " max_dirty_age=" << max_dirty_age << dendl;
+
+ m_object_cacher = new ObjectCacher(cct, m_image_ctx->perfcounter->get_name(),
+ *m_writeback_handler, m_cache_lock,
+ nullptr, nullptr, cache_size,
+                                     10, /* initial object count; reset below */
+ init_max_dirty, target_dirty,
+ max_dirty_age, block_writes_upfront);
+
+ // size object cache appropriately
+ if (max_dirty_object == 0) {
+ max_dirty_object = std::min<uint64_t>(
+ 2000, std::max<uint64_t>(10, cache_size / 100 /
+ sizeof(ObjectCacher::Object)));
+ }
+ ldout(cct, 5) << " cache bytes " << cache_size
+ << " -> about " << max_dirty_object << " objects" << dendl;
+ m_object_cacher->set_max_objects(max_dirty_object);
+
+ m_object_set = new ObjectCacher::ObjectSet(nullptr,
+ m_image_ctx->data_ctx.get_id(), 0);
+ m_object_cacher->start();
+ m_cache_lock.unlock();
+
+ // add ourself to the IO object dispatcher chain
+ if (m_max_dirty > 0) {
+ m_image_ctx->disable_zero_copy = true;
+ }
+ m_image_ctx->io_object_dispatcher->register_dispatch(this);
+}
+
+template <typename I>
+void ObjectCacherObjectDispatch<I>::shut_down(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ // chain shut down in reverse order
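+  //
+  // completion order once the flush below finishes:
+  //   C_InvalidateCache (purge on error) -> async hop off the cache lock ->
+  //   ObjectCacher::stop() -> caller's original on_finish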
+
+ // shut down the cache
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ m_object_cacher->stop();
+ on_finish->complete(r);
+ });
+
+ // ensure we aren't holding the cache lock post-flush
+ on_finish = util::create_async_context_callback(*m_image_ctx, on_finish);
+
+ // invalidate any remaining cache entries
+ on_finish = new C_InvalidateCache(this, true, on_finish);
+
+ // flush all pending writeback state
+ std::lock_guard locker{m_cache_lock};
+ m_object_cacher->release_set(m_object_set);
+ m_object_cacher->flush_set(m_object_set, on_finish);
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ // IO chained in reverse order
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << " " << *extents << dendl;
+
+ if (extents->size() == 0) {
+ ldout(cct, 20) << "no extents to read" << dendl;
+ return false;
+ }
+
+ if (version != nullptr) {
+    // we currently don't cache read versions, so pass these
+    // requests through to the lower dispatch layers
+ return false;
+ }
+
+ // ensure we aren't holding the cache lock post-read
+ on_dispatched = util::create_async_context_callback(*m_image_ctx,
+ on_dispatched);
+
+  // embed the RBD-internal read flags in the generic RADOS op_flags
+ op_flags = ((op_flags & ~ObjectCacherWriteback::READ_FLAGS_MASK) |
+ ((read_flags << ObjectCacherWriteback::READ_FLAGS_SHIFT) &
+ ObjectCacherWriteback::READ_FLAGS_MASK));
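+  // illustrative (assuming READ_FLAGS_SHIFT=24, READ_FLAGS_MASK=0xF000000):
+  // read_flags=0x2 is stored as 0x2000000, leaving the low-order librados
+  // fadvise bits of op_flags untouched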
+
+ ceph::bufferlist* bl;
+ if (extents->size() > 1) {
+ auto req = new io::ReadResult::C_ObjectReadMergedExtents(
+ cct, extents, on_dispatched);
+ on_dispatched = req;
+ bl = &req->bl;
+ } else {
+ bl = &extents->front().bl;
+ }
+
+ m_image_ctx->image_lock.lock_shared();
+ auto rd = m_object_cacher->prepare_read(
+ io_context->read_snap().value_or(CEPH_NOSNAP), bl, op_flags);
+ m_image_ctx->image_lock.unlock_shared();
+
+ uint64_t off = 0;
+ for (auto& read_extent: *extents) {
+ ObjectExtent extent(data_object_name(m_image_ctx, object_no), object_no,
+ read_extent.offset, read_extent.length, 0);
+ extent.oloc.pool = m_image_ctx->data_ctx.get_id();
+ extent.buffer_extents.push_back({off, read_extent.length});
+ rd->extents.push_back(extent);
+ off += read_extent.length;
+ }
+
+ ZTracer::Trace trace(parent_trace);
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+
+ m_cache_lock.lock();
+ int r = m_object_cacher->readx(rd, m_object_set, on_dispatched, &trace);
+ m_cache_lock.unlock();
+ if (r != 0) {
+ on_dispatched->complete(r);
+ }
+ return true;
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+ << object_len << dendl;
+
+ ObjectExtents object_extents;
+ object_extents.emplace_back(data_object_name(m_image_ctx, object_no),
+ object_no, object_off, object_len, 0);
+
+ // discard the cache state after changes are committed to disk (and to
+ // prevent races w/ readahead)
+ auto ctx = *on_finish;
+ *on_finish = new LambdaContext(
+ [this, object_extents, ctx](int r) {
+ m_cache_lock.lock();
+ m_object_cacher->discard_set(m_object_set, object_extents);
+ m_cache_lock.unlock();
+
+ ctx->complete(r);
+ });
+
+ // ensure we aren't holding the cache lock post-write
+ on_dispatched = util::create_async_context_callback(*m_image_ctx,
+ on_dispatched);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+
+ // ensure any in-flight writeback is complete before advancing
+ // the discard request
+ std::lock_guard locker{m_cache_lock};
+ m_object_cacher->discard_writeback(m_object_set, object_extents,
+ on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+ << data.length() << dendl;
+
+ // ensure we aren't holding the cache lock post-write
+ on_dispatched = util::create_async_context_callback(*m_image_ctx,
+ on_dispatched);
+
+ // cache layer does not handle version checking
+ if (assert_version.has_value() ||
+ (write_flags & io::OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) {
+ ObjectExtents object_extents;
+ object_extents.emplace_back(data_object_name(m_image_ctx, object_no),
+ object_no, object_off, data.length(), 0);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+
+ // ensure any in-flight writeback is complete before advancing
+ // the write request
+ std::lock_guard locker{m_cache_lock};
+ m_object_cacher->discard_writeback(m_object_set, object_extents,
+ on_dispatched);
+ return true;
+ }
+
+ SnapContext snapc;
+ if (io_context->write_snap_context()) {
+ auto write_snap_context = *io_context->write_snap_context();
+ snapc = SnapContext(write_snap_context.first,
+ {write_snap_context.second.begin(),
+ write_snap_context.second.end()});
+ }
+
+ m_image_ctx->image_lock.lock_shared();
+ ObjectCacher::OSDWrite *wr = m_object_cacher->prepare_write(
+ snapc, data, ceph::real_time::min(), op_flags, *journal_tid);
+ m_image_ctx->image_lock.unlock_shared();
+
+ ObjectExtent extent(data_object_name(m_image_ctx, object_no),
+ object_no, object_off, data.length(), 0);
+ extent.oloc.pool = m_image_ctx->data_ctx.get_id();
+ extent.buffer_extents.push_back({0, data.length()});
+ wr->extents.push_back(extent);
+
+ ZTracer::Trace trace(parent_trace);
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+
+ std::lock_guard locker{m_cache_lock};
+ m_object_cacher->writex(wr, m_object_set, on_dispatched, &trace);
+ return true;
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+ << object_len << dendl;
+
+ // ObjectCacher doesn't support write-same so convert to regular write
+ io::LightweightObjectExtent extent(object_no, object_off, object_len, 0);
+ extent.buffer_extents = std::move(buffer_extents);
+
+ bufferlist ws_data;
+ io::util::assemble_write_same_extent(extent, data, &ws_data, true);
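+  // conceptually (illustrative): a write-same of object_len=4096 with a
+  // 512-byte pattern yields a 4096-byte ws_data buffer holding the pattern
+  // repeated 8 times, which is then issued as a regular write below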
+
+ return write(object_no, object_off, std::move(ws_data), io_context, op_flags,
+ 0, std::nullopt, parent_trace, object_dispatch_flags,
+ journal_tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+ << cmp_data.length() << dendl;
+
+ // pass-through the compare-and-write request since it's not a supported
+ // operation of the ObjectCacher
+
+ // ensure we aren't holding the cache lock post-flush
+ on_dispatched = util::create_async_context_callback(*m_image_ctx,
+ on_dispatched);
+
+ // flush any pending writes from the cache
+ ZTracer::Trace trace(parent_trace);
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+
+ ObjectExtents object_extents;
+ object_extents.emplace_back(data_object_name(m_image_ctx, object_no),
+ object_no, object_off, cmp_data.length(), 0);
+
+ std::lock_guard cache_locker{m_cache_lock};
+ m_object_cacher->flush_set(m_object_set, object_extents, &trace,
+ on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ // ensure we aren't holding the cache lock post-flush
+ on_dispatched = util::create_async_context_callback(*m_image_ctx,
+ on_dispatched);
+
+ std::lock_guard locker{m_cache_lock};
+ if (flush_source == io::FLUSH_SOURCE_USER && !m_user_flushed) {
+ m_user_flushed = true;
+ if (m_writethrough_until_flush && m_max_dirty > 0) {
+ m_object_cacher->set_max_dirty(m_max_dirty);
+ ldout(cct, 5) << "saw first user flush, enabling writeback" << dendl;
+ }
+ }
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ m_object_cacher->flush_set(m_object_set, on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::invalidate_cache(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ // ensure we aren't holding the cache lock post-flush
+ on_finish = util::create_async_context_callback(*m_image_ctx, on_finish);
+
+ // invalidate any remaining cache entries
+ on_finish = new C_InvalidateCache(this, false, on_finish);
+
+ std::lock_guard locker{m_cache_lock};
+ m_object_cacher->release_set(m_object_set);
+ m_object_cacher->flush_set(m_object_set, on_finish);
+ return true;
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::reset_existence_cache(
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ std::lock_guard locker{m_cache_lock};
+ m_object_cacher->clear_nonexistence(m_object_set);
+ return false;
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::ObjectCacherObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/cache/ObjectCacherObjectDispatch.h b/src/librbd/cache/ObjectCacherObjectDispatch.h
new file mode 100644
index 000000000..0cc87bd87
--- /dev/null
+++ b/src/librbd/cache/ObjectCacherObjectDispatch.h
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H
+
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "common/ceph_mutex.h"
+#include "osdc/ObjectCacher.h"
+
+struct WritebackHandler;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace cache {
+
+/**
+ * Facade around the OSDC object cacher to make it align with
+ * the object dispatcher interface
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectCacherObjectDispatch : public io::ObjectDispatchInterface {
+public:
+ static ObjectCacherObjectDispatch* create(ImageCtxT* image_ctx,
+ size_t max_dirty,
+ bool writethrough_until_flush) {
+ return new ObjectCacherObjectDispatch(image_ctx, max_dirty,
+ writethrough_until_flush);
+ }
+
+ ObjectCacherObjectDispatch(ImageCtxT* image_ctx, size_t max_dirty,
+ bool writethrough_until_flush);
+ ~ObjectCacherObjectDispatch() override;
+
+ io::ObjectDispatchLayer get_dispatch_layer() const override {
+ return io::OBJECT_DISPATCH_LAYER_CACHE;
+ }
+
+ void init();
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool list_snaps(
+ uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override;
+ bool reset_existence_cache(Context* on_finish) override;
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) {
+ }
+
+ int prepare_copyup(
+ uint64_t object_no,
+ io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override {
+ return 0;
+ }
+
+private:
+ struct C_InvalidateCache;
+
+ ImageCtxT* m_image_ctx;
+ size_t m_max_dirty;
+ bool m_writethrough_until_flush;
+
+ ceph::mutex m_cache_lock;
+ ObjectCacher *m_object_cacher = nullptr;
+ ObjectCacher::ObjectSet *m_object_set = nullptr;
+
+ WritebackHandler *m_writeback_handler = nullptr;
+
+ bool m_user_flushed = false;
+
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::ObjectCacherObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H
diff --git a/src/librbd/cache/ObjectCacherWriteback.cc b/src/librbd/cache/ObjectCacherWriteback.cc
new file mode 100644
index 000000000..9f3d5c895
--- /dev/null
+++ b/src/librbd/cache/ObjectCacherWriteback.cc
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <errno.h>
+
+#include "librbd/cache/ObjectCacherWriteback.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/ceph_mutex.h"
+#include "osdc/Striper.h"
+#include "include/Context.h"
+#include "include/neorados/RADOS.hpp"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Utils.h"
+
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::ObjectCacherWriteback: "
+
+namespace librbd {
+namespace cache {
+
+/**
+ * context that completes a wrapped context while holding a mutex
+ *
+ * @param cct ceph context
+ * @param c context to complete
+ * @param cache_lock mutex to hold while completing @c c
+ */
+class C_ReadRequest : public Context {
+public:
+ C_ReadRequest(CephContext *cct, Context *c, ceph::mutex *cache_lock)
+ : m_cct(cct), m_ctx(c), m_cache_lock(cache_lock) {
+ }
+ void finish(int r) override {
+ ldout(m_cct, 20) << "aio_cb completing " << dendl;
+ {
+ std::lock_guard cache_locker{*m_cache_lock};
+ m_ctx->complete(r);
+ }
+ ldout(m_cct, 20) << "aio_cb finished" << dendl;
+ }
+private:
+ CephContext *m_cct;
+ Context *m_ctx;
+ ceph::mutex *m_cache_lock;
+};
+
+class C_OrderedWrite : public Context {
+public:
+ C_OrderedWrite(CephContext *cct,
+ ObjectCacherWriteback::write_result_d *result,
+ const ZTracer::Trace &trace, ObjectCacherWriteback *wb)
+ : m_cct(cct), m_result(result), m_trace(trace), m_wb_handler(wb) {}
+ ~C_OrderedWrite() override {}
+ void finish(int r) override {
+ ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl;
+ {
+ std::lock_guard l{m_wb_handler->m_lock};
+ ceph_assert(!m_result->done);
+ m_result->done = true;
+ m_result->ret = r;
+ m_wb_handler->complete_writes(m_result->oid);
+ }
+ ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl;
+ m_trace.event("finish");
+ }
+private:
+ CephContext *m_cct;
+ ObjectCacherWriteback::write_result_d *m_result;
+ ZTracer::Trace m_trace;
+ ObjectCacherWriteback *m_wb_handler;
+};
+
+struct C_CommitIOEventExtent : public Context {
+ ImageCtx *image_ctx;
+ uint64_t journal_tid;
+ uint64_t offset;
+ uint64_t length;
+
+ C_CommitIOEventExtent(ImageCtx *image_ctx, uint64_t journal_tid,
+ uint64_t offset, uint64_t length)
+ : image_ctx(image_ctx), journal_tid(journal_tid), offset(offset),
+ length(length) {
+ }
+
+ void finish(int r) override {
+ // all IO operations are flushed prior to closing the journal
+ ceph_assert(image_ctx->journal != nullptr);
+
+ image_ctx->journal->commit_io_event_extent(journal_tid, offset, length, r);
+ }
+};
+
+ObjectCacherWriteback::ObjectCacherWriteback(ImageCtx *ictx, ceph::mutex& lock)
+ : m_tid(0), m_lock(lock), m_ictx(ictx) {
+}
+
+void ObjectCacherWriteback::read(const object_t& oid, uint64_t object_no,
+ const object_locator_t& oloc,
+ uint64_t off, uint64_t len, snapid_t snapid,
+ bufferlist *pbl, uint64_t trunc_size,
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *onfinish)
+{
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &m_ictx->trace_endpoint, &parent_trace);
+ trace.copy_name("cache read " + oid.name);
+ trace.event("start");
+ }
+
+ // on completion, take the mutex and then call onfinish.
+ onfinish = new C_ReadRequest(m_ictx->cct, onfinish, &m_lock);
+
+ // re-use standard object read state machine
+ auto aio_comp = io::AioCompletion::create_and_start(onfinish, m_ictx,
+ io::AIO_TYPE_READ);
+ aio_comp->read_result = io::ReadResult{pbl};
+ aio_comp->set_request_count(1);
+
+ auto req_comp = new io::ReadResult::C_ObjectReadRequest(
+ aio_comp, {{off, len, {{0, len}}}});
+
+ auto io_context = m_ictx->duplicate_data_io_context();
+ if (snapid != CEPH_NOSNAP) {
+ io_context->read_snap(snapid);
+ }
+
+ // extract the embedded RBD read flags from the op_flags
+ int read_flags = (op_flags & READ_FLAGS_MASK) >> READ_FLAGS_SHIFT;
+ op_flags &= ~READ_FLAGS_MASK;
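+  // e.g. (illustrative): op_flags=0x2000040 yields read_flags=0x2 and leaves
+  // op_flags=0x40 (a librados fadvise hint) for the dispatched read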
+
+ auto req = io::ObjectDispatchSpec::create_read(
+ m_ictx, io::OBJECT_DISPATCH_LAYER_CACHE, object_no, &req_comp->extents,
+ io_context, op_flags, read_flags, trace, nullptr, req_comp);
+ req->send();
+}
+
+bool ObjectCacherWriteback::may_copy_on_write(const object_t& oid,
+ uint64_t read_off,
+ uint64_t read_len,
+ snapid_t snapid)
+{
+ m_ictx->image_lock.lock_shared();
+ librados::snap_t snap_id = m_ictx->snap_id;
+ uint64_t overlap = 0;
+ m_ictx->get_parent_overlap(snap_id, &overlap);
+ m_ictx->image_lock.unlock_shared();
+
+ uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
+
+ // reverse map this object extent onto the parent
+  std::vector<std::pair<uint64_t,uint64_t>> objectx;
+ io::util::extent_to_file(
+ m_ictx, object_no, 0, m_ictx->layout.object_size, objectx);
+ uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap);
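+  // a non-zero object_overlap means part of this object still maps onto the
+  // parent image, so writing to it may first trigger a copy-on-write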
+ bool may = object_overlap > 0;
+ ldout(m_ictx->cct, 10) << "may_copy_on_write " << oid << " " << read_off
+ << "~" << read_len << " = " << may << dendl;
+ return may;
+}
+
+ceph_tid_t ObjectCacherWriteback::write(const object_t& oid,
+ const object_locator_t& oloc,
+ uint64_t off, uint64_t len,
+ const SnapContext& snapc,
+ const bufferlist &bl,
+ ceph::real_time mtime,
+ uint64_t trunc_size,
+ __u32 trunc_seq, ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
+ Context *oncommit)
+{
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &m_ictx->trace_endpoint, &parent_trace);
+ trace.copy_name("writeback " + oid.name);
+ trace.event("start");
+ }
+
+ uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
+
+ write_result_d *result = new write_result_d(oid.name, oncommit);
+ m_writes[oid.name].push(result);
+ ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
+
+ bufferlist bl_copy(bl);
+
+ Context *ctx = new C_OrderedWrite(m_ictx->cct, result, trace, this);
+ ctx = util::create_async_context_callback(*m_ictx, ctx);
+
+ auto io_context = m_ictx->duplicate_data_io_context();
+ if (!snapc.empty()) {
+ io_context->write_snap_context(
+ {{snapc.seq, {snapc.snaps.begin(), snapc.snaps.end()}}});
+ }
+
+ auto req = io::ObjectDispatchSpec::create_write(
+ m_ictx, io::OBJECT_DISPATCH_LAYER_CACHE, object_no, off, std::move(bl_copy),
+ io_context, 0, 0, std::nullopt, journal_tid, trace, ctx);
+ req->object_dispatch_flags = (
+ io::OBJECT_DISPATCH_FLAG_FLUSH |
+ io::OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR);
+ req->send();
+
+ return ++m_tid;
+}
+
+
+void ObjectCacherWriteback::overwrite_extent(const object_t& oid, uint64_t off,
+ uint64_t len,
+ ceph_tid_t original_journal_tid,
+ ceph_tid_t new_journal_tid) {
+ typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+
+ ldout(m_ictx->cct, 20) << __func__ << ": " << oid << " "
+ << off << "~" << len << " "
+ << "journal_tid=" << original_journal_tid << ", "
+ << "new_journal_tid=" << new_journal_tid << dendl;
+
+ uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
+
+ // all IO operations are flushed prior to closing the journal
+ ceph_assert(original_journal_tid != 0 && m_ictx->journal != NULL);
+
+ Extents file_extents;
+ io::util::extent_to_file(m_ictx, object_no, off, len, file_extents);
+ for (Extents::iterator it = file_extents.begin();
+ it != file_extents.end(); ++it) {
+ if (new_journal_tid != 0) {
+ // ensure new journal event is safely committed to disk before
+ // committing old event
+ m_ictx->journal->flush_event(
+ new_journal_tid, new C_CommitIOEventExtent(m_ictx,
+ original_journal_tid,
+ it->first, it->second));
+ } else {
+ m_ictx->journal->commit_io_event_extent(original_journal_tid, it->first,
+ it->second, 0);
+ }
+ }
+}
+
+void ObjectCacherWriteback::complete_writes(const std::string& oid)
+{
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ std::queue<write_result_d*>& results = m_writes[oid];
+ ldout(m_ictx->cct, 20) << "complete_writes() oid " << oid << dendl;
+ std::list<write_result_d*> finished;
+
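+  // completions fire strictly in per-object submission order. Illustrative
+  // scenario: writes A, B, C are queued for this oid; if B finishes first,
+  // nothing completes until A is done, at which point A and B both complete
+  // while C continues to wait.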
+ while (!results.empty()) {
+ write_result_d *result = results.front();
+ if (!result->done)
+ break;
+ finished.push_back(result);
+ results.pop();
+ }
+
+ if (results.empty())
+ m_writes.erase(oid);
+
+ for (std::list<write_result_d*>::iterator it = finished.begin();
+ it != finished.end(); ++it) {
+ write_result_d *result = *it;
+ ldout(m_ictx->cct, 20) << "complete_writes() completing " << result
+ << dendl;
+ result->oncommit->complete(result->ret);
+ delete result;
+ }
+}
+
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/ObjectCacherWriteback.h b/src/librbd/cache/ObjectCacherWriteback.h
new file mode 100644
index 000000000..d8c2ebbd9
--- /dev/null
+++ b/src/librbd/cache/ObjectCacherWriteback.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_OBJECT_CACHER_WRITEBACK_H
+#define CEPH_LIBRBD_CACHE_OBJECT_CACHER_WRITEBACK_H
+
+#include "common/snap_types.h"
+#include "osd/osd_types.h"
+#include "osdc/WritebackHandler.h"
+#include <queue>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+class ObjectCacherWriteback : public WritebackHandler {
+public:
+  static const int READ_FLAGS_MASK = 0xF000000;
+ static const int READ_FLAGS_SHIFT = 24;
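+  // the mask must equal 0xF << READ_FLAGS_SHIFT so the RBD-internal read
+  // flags occupy bits 24-27, clear of the low-order librados op flags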
+
+ ObjectCacherWriteback(ImageCtx *ictx, ceph::mutex& lock);
+
+ // Note that oloc, trunc_size, and trunc_seq are ignored
+ void read(const object_t& oid, uint64_t object_no,
+ const object_locator_t& oloc, uint64_t off, uint64_t len,
+ snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *onfinish) override;
+
+ // Determine whether a read to this extent could be affected by a
+ // write-triggered copy-on-write
+ bool may_copy_on_write(const object_t& oid, uint64_t read_off,
+ uint64_t read_len, snapid_t snapid) override;
+
+ // Note that oloc, trunc_size, and trunc_seq are ignored
+ ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len,
+ const SnapContext& snapc, const bufferlist &bl,
+ ceph::real_time mtime, uint64_t trunc_size,
+ __u32 trunc_seq, ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
+ Context *oncommit) override;
+ using WritebackHandler::write;
+
+ void overwrite_extent(const object_t& oid, uint64_t off,
+ uint64_t len, ceph_tid_t original_journal_tid,
+ ceph_tid_t new_journal_tid) override;
+
+ struct write_result_d {
+ bool done;
+ int ret;
+ std::string oid;
+ Context *oncommit;
+ write_result_d(const std::string& oid, Context *oncommit) :
+ done(false), ret(0), oid(oid), oncommit(oncommit) {}
+ private:
+ write_result_d(const write_result_d& rhs);
+ const write_result_d& operator=(const write_result_d& rhs);
+ };
+
+private:
+ void complete_writes(const std::string& oid);
+
+ ceph_tid_t m_tid;
+ ceph::mutex& m_lock;
+ librbd::ImageCtx *m_ictx;
+ ceph::unordered_map<std::string, std::queue<write_result_d*> > m_writes;
+ friend class C_OrderedWrite;
+};
+
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_OBJECT_CACHER_WRITEBACK_H
diff --git a/src/librbd/cache/ParentCacheObjectDispatch.cc b/src/librbd/cache/ParentCacheObjectDispatch.cc
new file mode 100644
index 000000000..762b18101
--- /dev/null
+++ b/src/librbd/cache/ParentCacheObjectDispatch.cc
@@ -0,0 +1,255 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+#include "include/neorados/RADOS.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/ParentCacheObjectDispatch.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/plugin/Api.h"
+#include "osd/osd_types.h"
+#include "osdc/WritebackHandler.h"
+
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::ParentCacheObjectDispatch: " \
+ << this << " " << __func__ << ": "
+
+using namespace ceph::immutable_obj_cache;
+using librbd::util::data_object_name;
+
+namespace librbd {
+namespace cache {
+
+template <typename I>
+ParentCacheObjectDispatch<I>::ParentCacheObjectDispatch(
+ I* image_ctx, plugin::Api<I>& plugin_api)
+ : m_image_ctx(image_ctx), m_plugin_api(plugin_api),
+ m_lock(ceph::make_mutex(
+ "librbd::cache::ParentCacheObjectDispatch::lock", true, false)) {
+ ceph_assert(m_image_ctx->data_ctx.is_valid());
+ auto controller_path = image_ctx->cct->_conf.template get_val<std::string>(
+ "immutable_object_cache_sock");
+ m_cache_client = new CacheClient(controller_path.c_str(), m_image_ctx->cct);
+}
+
+template <typename I>
+ParentCacheObjectDispatch<I>::~ParentCacheObjectDispatch() {
+ delete m_cache_client;
+ m_cache_client = nullptr;
+}
+
+template <typename I>
+void ParentCacheObjectDispatch<I>::init(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ if (m_image_ctx->child == nullptr) {
+ ldout(cct, 5) << "non-parent image: skipping" << dendl;
+ if (on_finish != nullptr) {
+ on_finish->complete(-EINVAL);
+ }
+ return;
+ }
+
+ m_image_ctx->io_object_dispatcher->register_dispatch(this);
+
+ std::unique_lock locker{m_lock};
+ create_cache_session(on_finish, false);
+}
+
+template <typename I>
+bool ParentCacheObjectDispatch<I>::read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << " " << *extents << dendl;
+
+ if (version != nullptr) {
+ // we currently don't cache read versions
+ return false;
+ }
+
+  std::string oid = data_object_name(m_image_ctx, object_no);
+
+  /* If the RO daemon has not started yet, has crashed, or the session
+   * has hit an error, try to re-connect to the daemon. */
+ std::unique_lock locker{m_lock};
+ if (!m_cache_client->is_session_work()) {
+ create_cache_session(nullptr, true);
+ ldout(cct, 5) << "Parent cache try to re-connect to RO daemon. "
+ << "dispatch current request to lower object layer" << dendl;
+ return false;
+ }
+
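+  // dispatch flow from here (see handle_read_cache): a non-RBDSC_READ_REPLY
+  // ack falls back to RADOS; a reply with an empty cache_path reads from the
+  // parent image via the plugin API; a reply carrying a cache_path reads the
+  // locally cached object file directly (falling back to RADOS on error)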
+ CacheGenContextURef ctx = make_gen_lambda_context<ObjectCacheRequest*,
+ std::function<void(ObjectCacheRequest*)>>
+ ([this, extents, dispatch_result, on_dispatched, object_no, io_context,
+ &parent_trace]
+ (ObjectCacheRequest* ack) {
+ handle_read_cache(ack, object_no, extents, io_context, parent_trace,
+ dispatch_result, on_dispatched);
+ });
+
+ m_cache_client->lookup_object(m_image_ctx->data_ctx.get_namespace(),
+ m_image_ctx->data_ctx.get_id(),
+ io_context->read_snap().value_or(CEPH_NOSNAP),
+ m_image_ctx->layout.object_size,
+ oid, std::move(ctx));
+ return true;
+}
+
+template <typename I>
+void ParentCacheObjectDispatch<I>::handle_read_cache(
+ ObjectCacheRequest* ack, uint64_t object_no, io::ReadExtents* extents,
+ IOContext io_context, const ZTracer::Trace &parent_trace,
+ io::DispatchResult* dispatch_result, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+  if (ack->type != RBDSC_READ_REPLY) {
+ // go back to read rados
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ on_dispatched->complete(0);
+ return;
+ }
+
+ ceph_assert(ack->type == RBDSC_READ_REPLY);
+  std::string file_path =
+    static_cast<ObjectCacheReadReplyData*>(ack)->cache_path;
+ if (file_path.empty()) {
+ auto ctx = new LambdaContext(
+ [this, dispatch_result, on_dispatched](int r) {
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_image_ctx->cct) << "failed to read parent: "
+ << cpp_strerror(r) << dendl;
+ }
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ on_dispatched->complete(r);
+ });
+ m_plugin_api.read_parent(m_image_ctx, object_no, extents,
+ io_context->read_snap().value_or(CEPH_NOSNAP),
+ parent_trace, ctx);
+ return;
+ }
+
+ int read_len = 0;
+ for (auto& extent: *extents) {
+ // try to read from parent image cache
+ int r = read_object(file_path, &extent.bl, extent.offset, extent.length,
+ on_dispatched);
+ if (r < 0) {
+ // cache read error, fall back to read rados
+ for (auto& read_extent: *extents) {
+ // clear read bufferlists
+ if (&read_extent == &extent) {
+ break;
+ }
+ read_extent.bl.clear();
+ }
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ on_dispatched->complete(0);
+ return;
+ }
+
+ read_len += r;
+ }
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ on_dispatched->complete(read_len);
+}
+
+template <typename I>
+int ParentCacheObjectDispatch<I>::handle_register_client(bool reg) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ if (!reg) {
+ lderr(cct) << "Parent cache register fails." << dendl;
+ }
+ return 0;
+}
+
+template <typename I>
+void ParentCacheObjectDispatch<I>::create_cache_session(Context* on_finish,
+ bool is_reconnect) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ if (m_connecting) {
+ return;
+ }
+ m_connecting = true;
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ Context* register_ctx = new LambdaContext([this, cct, on_finish](int ret) {
+    if (ret < 0) {
+      lderr(cct) << "parent cache failed to register client" << dendl;
+    }
+    handle_register_client(ret >= 0);
+
+ ceph_assert(m_connecting);
+ m_connecting = false;
+
+ if (on_finish != nullptr) {
+ on_finish->complete(0);
+ }
+ });
+
+ Context* connect_ctx = new LambdaContext(
+ [this, cct, register_ctx](int ret) {
+ if (ret < 0) {
+        lderr(cct) << "parent cache failed to connect to RO daemon" << dendl;
+ register_ctx->complete(ret);
+ return;
+ }
+
+ ldout(cct, 20) << "Parent cache connected to RO daemon." << dendl;
+
+ m_cache_client->register_client(register_ctx);
+ });
+
+ if (m_cache_client != nullptr && is_reconnect) {
+    // destroying the old CacheClient cleans up all state of its session
+ delete m_cache_client;
+
+    // create a new CacheClient to connect to the RO daemon
+ auto controller_path = cct->_conf.template get_val<std::string>(
+ "immutable_object_cache_sock");
+ m_cache_client = new CacheClient(controller_path.c_str(), m_image_ctx->cct);
+ }
+
+ m_cache_client->run();
+ m_cache_client->connect(connect_ctx);
+}
+
+template <typename I>
+int ParentCacheObjectDispatch<I>::read_object(
+ std::string file_path, ceph::bufferlist* read_data, uint64_t offset,
+ uint64_t length, Context *on_finish) {
+
+ auto *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "file path: " << file_path << dendl;
+
+ std::string error;
+ int ret = read_data->pread_file(file_path.c_str(), offset, length, &error);
+ if (ret < 0) {
+ ldout(cct, 5) << "read from file return error: " << error
+ << "file path= " << file_path
+ << dendl;
+ return ret;
+ }
+ return read_data->length();
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::ParentCacheObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/cache/ParentCacheObjectDispatch.h b/src/librbd/cache/ParentCacheObjectDispatch.h
new file mode 100644
index 000000000..1cf9c73b0
--- /dev/null
+++ b/src/librbd/cache/ParentCacheObjectDispatch.h
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PARENT_CACHER_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_CACHE_PARENT_CACHER_OBJECT_DISPATCH_H
+
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "common/ceph_mutex.h"
+#include "librbd/cache/TypeTraits.h"
+#include "tools/immutable_object_cache/CacheClient.h"
+#include "tools/immutable_object_cache/Types.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace plugin { template <typename> struct Api; }
+
+namespace cache {
+
+template <typename ImageCtxT = ImageCtx>
+class ParentCacheObjectDispatch : public io::ObjectDispatchInterface {
+ // mock unit testing support
+ typedef cache::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::CacheClient CacheClient;
+
+public:
+ static ParentCacheObjectDispatch* create(ImageCtxT* image_ctx,
+ plugin::Api<ImageCtxT>& plugin_api) {
+ return new ParentCacheObjectDispatch(image_ctx, plugin_api);
+ }
+
+ ParentCacheObjectDispatch(ImageCtxT* image_ctx,
+ plugin::Api<ImageCtxT>& plugin_api);
+ ~ParentCacheObjectDispatch() override;
+
+ io::ObjectDispatchLayer get_dispatch_layer() const override {
+ return io::OBJECT_DISPATCH_LAYER_PARENT_CACHE;
+ }
+
+ void init(Context* on_finish = nullptr);
+ void shut_down(Context* on_finish) {
+ m_image_ctx->op_work_queue->queue(on_finish, 0);
+ }
+
+ bool read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ return false;
+ }
+
+ bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ return false;
+ }
+
+ bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ return false;
+ }
+
+ bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ return false;
+ }
+
+ bool flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_id, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ return false;
+ }
+
+ bool list_snaps(
+ uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) {
+ return false;
+ }
+
+ bool reset_existence_cache(Context* on_finish) {
+ return false;
+ }
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) {
+ }
+
+ int prepare_copyup(
+ uint64_t object_no,
+ io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override {
+ return 0;
+ }
+
+ ImageCtxT* get_image_ctx() {
+ return m_image_ctx;
+ }
+
+ CacheClient* get_cache_client() {
+ return m_cache_client;
+ }
+
+private:
+
+ int read_object(std::string file_path, ceph::bufferlist* read_data,
+ uint64_t offset, uint64_t length, Context *on_finish);
+ void handle_read_cache(ceph::immutable_obj_cache::ObjectCacheRequest* ack,
+ uint64_t object_no, io::ReadExtents* extents,
+ IOContext io_context,
+ const ZTracer::Trace &parent_trace,
+ io::DispatchResult* dispatch_result,
+ Context* on_dispatched);
+ int handle_register_client(bool reg);
+ void create_cache_session(Context* on_finish, bool is_reconnect);
+
+ ImageCtxT* m_image_ctx;
+ plugin::Api<ImageCtxT>& m_plugin_api;
+
+ ceph::mutex m_lock;
+ CacheClient *m_cache_client = nullptr;
+ bool m_connecting = false;
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::ParentCacheObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PARENT_CACHER_OBJECT_DISPATCH_H
diff --git a/src/librbd/cache/TypeTraits.h b/src/librbd/cache/TypeTraits.h
new file mode 100644
index 000000000..dd7075e8d
--- /dev/null
+++ b/src/librbd/cache/TypeTraits.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_TYPE_TRAITS_H
+#define CEPH_LIBRBD_CACHE_TYPE_TRAITS_H
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+class CacheClient;
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+
+namespace librbd {
+namespace cache {
+
+template <typename ImageCtxT>
+struct TypeTraits {
+ typedef ceph::immutable_obj_cache::CacheClient CacheClient;
+};
+
+} // namespace cache
+} // namespace librbd
+
+#endif
diff --git a/src/librbd/cache/Types.h b/src/librbd/cache/Types.h
new file mode 100644
index 000000000..43dcd758f
--- /dev/null
+++ b/src/librbd/cache/Types.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_TYPES_H
+#define CEPH_LIBRBD_CACHE_TYPES_H
+
+#include <list>
+#include <string>
+
+class Context;
+
+namespace librbd {
+namespace cache {
+
+enum ImageCacheType {
+ IMAGE_CACHE_TYPE_RWL = 1,
+ IMAGE_CACHE_TYPE_SSD,
+ IMAGE_CACHE_TYPE_UNKNOWN
+};
+
+typedef std::list<Context *> Contexts;
+
+const std::string PERSISTENT_CACHE_STATE = ".rbd_persistent_cache_state";
+
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_TYPES_H
diff --git a/src/librbd/cache/Utils.h b/src/librbd/cache/Utils.h
new file mode 100644
index 000000000..cd2eb7c3b
--- /dev/null
+++ b/src/librbd/cache/Utils.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_UTILS_H
+#define CEPH_LIBRBD_CACHE_UTILS_H
+
+#include "acconfig.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+namespace util {
+
+template <typename T>
+bool is_pwl_enabled(T& image_ctx) {
+#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE)
+ auto value = image_ctx.config.template get_val<std::string>("rbd_persistent_cache_mode");
+  return value != "disabled";
+#else
+ return false;
+#endif // defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE)
+}
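+
+// hypothetical call site (illustrative; not part of this header):
+//   if (librbd::cache::util::is_pwl_enabled(*image_ctx)) {
+//     // load the persistent write-back cache plugin
+//   }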
+
+} // namespace util
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_UTILS_H
diff --git a/src/librbd/cache/WriteAroundObjectDispatch.cc b/src/librbd/cache/WriteAroundObjectDispatch.cc
new file mode 100644
index 000000000..fafb73f40
--- /dev/null
+++ b/src/librbd/cache/WriteAroundObjectDispatch.cc
@@ -0,0 +1,525 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/WriteAroundObjectDispatch.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::WriteAroundObjectDispatch: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+
+using librbd::util::data_object_name;
+
+template <typename I>
+WriteAroundObjectDispatch<I>::WriteAroundObjectDispatch(
+ I* image_ctx, size_t max_dirty, bool writethrough_until_flush)
+ : m_image_ctx(image_ctx), m_init_max_dirty(max_dirty), m_max_dirty(max_dirty),
+ m_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::WriteAroundObjectDispatch::lock", this))) {
+ if (writethrough_until_flush) {
+ m_max_dirty = 0;
+ }
+}
+
+template <typename I>
+WriteAroundObjectDispatch<I>::~WriteAroundObjectDispatch() {
+}
+
+template <typename I>
+void WriteAroundObjectDispatch<I>::init() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ // add ourself to the IO object dispatcher chain
+ if (m_init_max_dirty > 0) {
+ m_image_ctx->disable_zero_copy = true;
+ }
+ m_image_ctx->io_object_dispatcher->register_dispatch(this);
+}
+
+template <typename I>
+void WriteAroundObjectDispatch<I>::shut_down(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ bool handled = false;
+ for (auto& extent: *extents) {
+ handled |= dispatch_unoptimized_io(object_no, extent.offset, extent.length,
+ dispatch_result, on_dispatched);
+ }
+ return handled;
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ return dispatch_io(object_no, object_off, object_len, 0, dispatch_result,
+ on_finish, on_dispatched);
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << data.length() << dendl;
+
+ return dispatch_io(object_no, object_off, data.length(), op_flags,
+ dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ return dispatch_io(object_no, object_off, object_len, op_flags,
+ dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ return dispatch_unoptimized_io(object_no, object_off, cmp_data.length(),
+ dispatch_result, on_dispatched);
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ std::lock_guard locker{m_lock};
+ if (flush_source == io::FLUSH_SOURCE_USER && !m_user_flushed) {
+ m_user_flushed = true;
+ if (m_max_dirty == 0 && m_init_max_dirty > 0) {
+ ldout(cct, 5) << "first user flush: enabling write-around" << dendl;
+ m_max_dirty = m_init_max_dirty;
+ }
+ }
+
+ if (m_in_flight_io_tids.empty()) {
+ // no in-flight IO (also implies no queued/blocked IO)
+ return false;
+ }
+
+ auto tid = ++m_last_tid;
+ auto ctx = util::create_async_context_callback(*m_image_ctx, *on_finish);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ *on_finish = new LambdaContext([this, tid](int r) {
+ handle_in_flight_flush_complete(r, tid);
+ });
+
+ if (m_queued_ios.empty() && m_blocked_ios.empty()) {
+ // immediately allow the flush to be dispatched
+ ldout(cct, 20) << "dispatching: tid=" << tid << dendl;
+ m_in_flight_flushes.emplace(tid, ctx);
+ return false;
+ }
+
+  // cannot dispatch the flush until after preceding IO is dispatched
+ ldout(cct, 20) << "queueing: tid=" << tid << dendl;
+ m_queued_flushes.emplace(tid, QueuedFlush{ctx, on_dispatched});
+ return true;
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::dispatch_unoptimized_io(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::DispatchResult* dispatch_result, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+
+ m_lock.lock();
+ auto in_flight_extents_it = m_in_flight_extents.find(object_no);
+ if (in_flight_extents_it == m_in_flight_extents.end() ||
+ !in_flight_extents_it->second.intersects(object_off, object_len)) {
+ // no IO in-flight to the specified extent
+ m_lock.unlock();
+ return false;
+ }
+
+ // write IO is in-flight -- it needs to complete before the unoptimized
+ // IO can be dispatched
+ auto tid = ++m_last_tid;
+ ldout(cct, 20) << "blocked by in-flight IO: tid=" << tid << dendl;
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ m_blocked_unoptimized_ios[object_no].emplace(
+ tid, BlockedIO{object_off, object_len, nullptr, on_dispatched});
+ m_lock.unlock();
+
+ return true;
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::dispatch_io(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ int op_flags, io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+
+ m_lock.lock();
+ if (m_max_dirty == 0) {
+ // write-through mode is active -- no-op the cache
+ m_lock.unlock();
+ return false;
+ }
+
+ if ((op_flags & LIBRADOS_OP_FLAG_FADVISE_FUA) != 0) {
+ // force unit access flag is set -- disable write-around
+ m_lock.unlock();
+ return dispatch_unoptimized_io(object_no, object_off, object_len,
+ dispatch_result, on_dispatched);
+ }
+
+ auto tid = ++m_last_tid;
+ auto ctx = util::create_async_context_callback(*m_image_ctx, *on_finish);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ *on_finish = new LambdaContext(
+ [this, tid, object_no, object_off, object_len](int r) {
+ handle_in_flight_io_complete(r, tid, object_no, object_off, object_len);
+ });
+
+ bool blocked = block_overlapping_io(&m_in_flight_extents[object_no],
+ object_off, object_len);
+ if (blocked) {
+ ldout(cct, 20) << "blocked on overlap: tid=" << tid << dendl;
+ m_queued_or_blocked_io_tids.insert(tid);
+ m_blocked_ios[object_no].emplace(tid, BlockedIO{object_off, object_len, ctx,
+ on_dispatched});
+ m_lock.unlock();
+ } else if (can_dispatch_io(tid, object_len)) {
+ m_lock.unlock();
+
+ ldout(cct, 20) << "dispatching: tid=" << tid << dendl;
+ on_dispatched->complete(0);
+ ctx->complete(0);
+ } else {
+ ldout(cct, 20) << "queueing: tid=" << tid << dendl;
+ m_queued_or_blocked_io_tids.insert(tid);
+ m_queued_ios.emplace(tid, QueuedIO{object_len, ctx, on_dispatched});
+ m_lock.unlock();
+ }
+ return true;
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::block_overlapping_io(
+ InFlightObjectExtents* in_flight_object_extents, uint64_t object_off,
+ uint64_t object_len) {
+ if (in_flight_object_extents->intersects(object_off, object_len)) {
+ return true;
+ }
+
+ in_flight_object_extents->insert(object_off, object_len);
+ return false;
+}
+
+template <typename I>
+void WriteAroundObjectDispatch<I>::unblock_overlapping_ios(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ Contexts* unoptimized_io_dispatches) {
+ auto cct = m_image_ctx->cct;
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto in_flight_extents_it = m_in_flight_extents.find(object_no);
+ ceph_assert(in_flight_extents_it != m_in_flight_extents.end());
+
+ auto& in_flight_object_extents = in_flight_extents_it->second;
+ in_flight_object_extents.erase(object_off, object_len);
+
+ // handle unoptimized IOs that were blocked by in-flight IO
+ InFlightObjectExtents blocked_unoptimized_ios;
+ auto blocked_unoptimized_ios_it = m_blocked_unoptimized_ios.find(object_no);
+ if (blocked_unoptimized_ios_it != m_blocked_unoptimized_ios.end()) {
+ auto& blocked_unoptimized_object_ios = blocked_unoptimized_ios_it->second;
+ for (auto it = blocked_unoptimized_object_ios.begin();
+ it != blocked_unoptimized_object_ios.end();) {
+ auto& blocked_io = it->second;
+ if (!in_flight_object_extents.intersects(blocked_io.offset,
+ blocked_io.length)) {
+ unoptimized_io_dispatches->emplace(it->first, blocked_io.on_dispatched);
+ it = blocked_unoptimized_object_ios.erase(it);
+ } else {
+ blocked_unoptimized_ios.union_insert(blocked_io.offset,
+ blocked_io.length);
+ ++it;
+ }
+ }
+
+ if (blocked_unoptimized_object_ios.empty()) {
+ m_blocked_unoptimized_ios.erase(blocked_unoptimized_ios_it);
+ }
+ }
+
+ // handle optimized IOs that were blocked
+ auto blocked_io_it = m_blocked_ios.find(object_no);
+ if (blocked_io_it != m_blocked_ios.end()) {
+ auto& blocked_object_ios = blocked_io_it->second;
+
+ auto blocked_object_ios_it = blocked_object_ios.begin();
+ while (blocked_object_ios_it != blocked_object_ios.end()) {
+ auto next_blocked_object_ios_it = blocked_object_ios_it;
+ ++next_blocked_object_ios_it;
+
+ auto& blocked_io = blocked_object_ios_it->second;
+ if (blocked_unoptimized_ios.intersects(blocked_io.offset,
+ blocked_io.length) ||
+ block_overlapping_io(&in_flight_object_extents, blocked_io.offset,
+ blocked_io.length)) {
+ break;
+ }
+
+ // move unblocked IO to the queued list, which will get processed when
+ // there is capacity
+ auto tid = blocked_object_ios_it->first;
+ ldout(cct, 20) << "queueing unblocked: tid=" << tid << dendl;
+ m_queued_ios.emplace(tid, blocked_io);
+
+ blocked_object_ios.erase(blocked_object_ios_it);
+ blocked_object_ios_it = next_blocked_object_ios_it;
+ }
+
+ if (blocked_object_ios.empty()) {
+ m_blocked_ios.erase(blocked_io_it);
+ }
+ }
+
+ if (in_flight_object_extents.empty()) {
+ m_in_flight_extents.erase(in_flight_extents_it);
+ }
+}
+
+template <typename I>
+bool WriteAroundObjectDispatch<I>::can_dispatch_io(
+ uint64_t tid, uint64_t length) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
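+  // illustrative: with m_max_dirty=16 MiB and m_in_flight_bytes=15 MiB, a
+  // 4 MiB IO must queue (19 MiB > 16 MiB); when m_in_flight_bytes == 0,
+  // a single IO of any size is allowed through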
+ if (m_in_flight_bytes == 0 || m_in_flight_bytes + length <= m_max_dirty) {
+ // no in-flight IO or still under max write-around in-flight limit.
+ // allow the dispatcher to proceed to send the IO but complete it back
+ // to the invoker.
+ m_in_flight_bytes += length;
+ m_in_flight_io_tids.insert(tid);
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+void WriteAroundObjectDispatch<I>::handle_in_flight_io_complete(
+ int r, uint64_t tid, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", tid=" << tid << dendl;
+
+ m_lock.lock();
+ m_in_flight_io_tids.erase(tid);
+ ceph_assert(m_in_flight_bytes >= object_len);
+ m_in_flight_bytes -= object_len;
+
+ if (r < 0) {
+ lderr(cct) << "IO error encountered: tid=" << tid << ": "
+ << cpp_strerror(r) << dendl;
+ if (m_pending_flush_error == 0) {
+ m_pending_flush_error = r;
+ }
+ }
+
+ // any overlapping blocked IOs can be queued now
+ Contexts unoptimized_io_dispatches;
+ unblock_overlapping_ios(object_no, object_off, object_len,
+ &unoptimized_io_dispatches);
+
+ // collect any flushes that are ready for completion
+ int pending_flush_error = 0;
+ auto finished_flushes = collect_finished_flushes();
+ if (!finished_flushes.empty()) {
+ std::swap(pending_flush_error, m_pending_flush_error);
+ }
+
+ // collect any queued IOs that are ready for dispatch
+ auto ready_ios = collect_ready_ios();
+
+ // collect any queued flushes that were tied to queued IOs
+ auto ready_flushes = collect_ready_flushes();
+ m_lock.unlock();
+
+ // dispatch any ready unoptimized IOs
+ for (auto& it : unoptimized_io_dispatches) {
+ ldout(cct, 20) << "dispatching unoptimized IO: tid=" << it.first << dendl;
+ it.second->complete(0);
+ }
+
+ // complete flushes that were waiting on in-flight IO
+  // (and propagate any IO error to the first flush)
+ for (auto& it : finished_flushes) {
+ ldout(cct, 20) << "completing flush: tid=" << it.first << ", "
+ << "r=" << pending_flush_error << dendl;
+ it.second->complete(pending_flush_error);
+ }
+
+ // dispatch any ready queued IOs
+ for (auto& it : ready_ios) {
+ ldout(cct, 20) << "dispatching IO: tid=" << it.first << dendl;
+ it.second.on_dispatched->complete(0);
+ it.second.on_finish->complete(0);
+ }
+
+ // dispatch any ready flushes
+ for (auto& it : ready_flushes) {
+ ldout(cct, 20) << "dispatching flush: tid=" << it.first << dendl;
+ it.second->complete(0);
+ }
+}
+
+template <typename I>
+void WriteAroundObjectDispatch<I>::handle_in_flight_flush_complete(
+ int r, uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", tid=" << tid << dendl;
+
+ m_lock.lock();
+
+ // move the in-flight flush to the pending completion list
+ auto it = m_in_flight_flushes.find(tid);
+ ceph_assert(it != m_in_flight_flushes.end());
+
+ m_pending_flushes.emplace(it->first, it->second);
+ m_in_flight_flushes.erase(it);
+
+ // collect any flushes that are ready for completion
+ int pending_flush_error = 0;
+ auto finished_flushes = collect_finished_flushes();
+ if (!finished_flushes.empty()) {
+ std::swap(pending_flush_error, m_pending_flush_error);
+ }
+ m_lock.unlock();
+
+ // complete flushes that were waiting on in-flight IO
+ // (and propagate any IO errors)
+ for (auto& it : finished_flushes) {
+ ldout(cct, 20) << "completing flush: tid=" << it.first << dendl;
+ it.second->complete(pending_flush_error);
+ pending_flush_error = 0;
+ }
+}
+
+template <typename I>
+typename WriteAroundObjectDispatch<I>::QueuedIOs
+WriteAroundObjectDispatch<I>::collect_ready_ios() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ QueuedIOs queued_ios;
+
+ while (true) {
+ auto it = m_queued_ios.begin();
+ if (it == m_queued_ios.end() ||
+ !can_dispatch_io(it->first, it->second.length)) {
+ break;
+ }
+
+ queued_ios.emplace(it->first, it->second);
+ m_queued_or_blocked_io_tids.erase(it->first);
+ m_queued_ios.erase(it);
+ }
+ return queued_ios;
+}
+
+template <typename I>
+typename WriteAroundObjectDispatch<I>::Contexts
+WriteAroundObjectDispatch<I>::collect_ready_flushes() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Contexts ready_flushes;
+ auto io_tid_it = m_queued_or_blocked_io_tids.begin();
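+ // a queued flush becomes ready only once no queued or blocked IO carries a
+ // smaller tid, i.e. every IO submitted before the flush has been dispatched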
+ while (true) {
+ auto it = m_queued_flushes.begin();
+ if (it == m_queued_flushes.end() ||
+ (io_tid_it != m_queued_or_blocked_io_tids.end() &&
+ *io_tid_it < it->first)) {
+ break;
+ }
+
+ m_in_flight_flushes.emplace(it->first, it->second.on_finish);
+ ready_flushes.emplace(it->first, it->second.on_dispatched);
+ m_queued_flushes.erase(it);
+ }
+
+ return ready_flushes;
+}
+
+template <typename I>
+typename WriteAroundObjectDispatch<I>::Contexts
+WriteAroundObjectDispatch<I>::collect_finished_flushes() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Contexts finished_flushes;
+ auto io_tid_it = m_in_flight_io_tids.begin();
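+ // a pending flush is finished only when every in-flight IO with a smaller
+ // tid (i.e. submitted before the flush) has completed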
+ while (true) {
+ auto it = m_pending_flushes.begin();
+ if (it == m_pending_flushes.end() ||
+ (io_tid_it != m_in_flight_io_tids.end() && *io_tid_it < it->first)) {
+ break;
+ }
+
+ finished_flushes.emplace(it->first, it->second);
+ m_pending_flushes.erase(it);
+ }
+ return finished_flushes;
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::WriteAroundObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/cache/WriteAroundObjectDispatch.h b/src/librbd/cache/WriteAroundObjectDispatch.h
new file mode 100644
index 000000000..bc289f91c
--- /dev/null
+++ b/src/librbd/cache/WriteAroundObjectDispatch.h
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_WRITE_AROUND_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_CACHE_WRITE_AROUND_OBJECT_DISPATCH_H
+
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "include/interval_set.h"
+#include "common/ceph_mutex.h"
+#include "librbd/io/Types.h"
+#include <map>
+#include <set>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+template <typename ImageCtxT = ImageCtx>
+class WriteAroundObjectDispatch : public io::ObjectDispatchInterface {
+public:
+ static WriteAroundObjectDispatch* create(ImageCtxT* image_ctx,
+ size_t max_dirty,
+ bool writethrough_until_flush) {
+ return new WriteAroundObjectDispatch(image_ctx, max_dirty,
+ writethrough_until_flush);
+ }
+
+ WriteAroundObjectDispatch(ImageCtxT* image_ctx, size_t max_dirty,
+ bool writethrough_until_flush);
+ ~WriteAroundObjectDispatch() override;
+
+ io::ObjectDispatchLayer get_dispatch_layer() const override {
+ return io::OBJECT_DISPATCH_LAYER_CACHE;
+ }
+
+ void init();
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool list_snaps(
+ uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+ bool reset_existence_cache(Context* on_finish) override {
+ return false;
+ }
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) override {
+ }
+
+ int prepare_copyup(
+ uint64_t object_no,
+ io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override {
+ return 0;
+ }
+
+private:
+ struct QueuedIO {
+ QueuedIO(uint64_t length, Context* on_finish, Context* on_dispatched)
+ : length(length), on_finish(on_finish), on_dispatched(on_dispatched) {
+ }
+
+ uint64_t length;
+ Context* on_finish;
+ Context* on_dispatched;
+ };
+
+ struct QueuedFlush {
+ QueuedFlush(Context* on_finish, Context* on_dispatched)
+ : on_finish(on_finish), on_dispatched(on_dispatched) {
+ }
+
+ Context* on_finish;
+ Context* on_dispatched;
+ };
+
+ struct BlockedIO : public QueuedIO {
+ BlockedIO(uint64_t offset, uint64_t length, Context* on_finish,
+ Context* on_dispatched)
+ : QueuedIO(length, on_finish, on_dispatched), offset(offset) {
+ }
+
+ uint64_t offset;
+ };
+
+ typedef std::map<uint64_t, QueuedIO> QueuedIOs;
+ typedef std::map<uint64_t, QueuedFlush> QueuedFlushes;
+
+ typedef std::map<uint64_t, BlockedIO> BlockedObjectIOs;
+ typedef std::map<uint64_t, BlockedObjectIOs> BlockedIOs;
+
+ typedef std::map<uint64_t, Context*> Contexts;
+ typedef std::set<uint64_t> Tids;
+ typedef interval_set<uint64_t> InFlightObjectExtents;
+ typedef std::map<uint64_t, InFlightObjectExtents> InFlightExtents;
+
+ ImageCtxT* m_image_ctx;
+ size_t m_init_max_dirty;
+ size_t m_max_dirty;
+
+ ceph::mutex m_lock;
+ bool m_user_flushed = false;
+
+ uint64_t m_last_tid = 0;
+ uint64_t m_in_flight_bytes = 0;
+
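+ // IOs currently dispatched to the lower layer, tracked by tid and by the
+ // per-object extents they cover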
+ Tids m_in_flight_io_tids;
+ InFlightExtents m_in_flight_extents;
+
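+ // optimized IOs held back: blocked behind overlapping in-flight extents
+ // (per object) or queued awaiting dirty-byte capacity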
+ BlockedIOs m_blocked_ios;
+ QueuedIOs m_queued_ios;
+ Tids m_queued_or_blocked_io_tids;
+
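+ // unoptimized IOs (dispatched without early completion) waiting on
+ // overlapping in-flight extents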
+ BlockedIOs m_blocked_unoptimized_ios;
+
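+ // flushes staged by tid: queued behind earlier queued/blocked IOs, in
+ // flight, or pending completion of earlier in-flight IOs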
+ QueuedFlushes m_queued_flushes;
+ Contexts m_in_flight_flushes;
+ Contexts m_pending_flushes;
+ int m_pending_flush_error = 0;
+
+ bool dispatch_unoptimized_io(uint64_t object_no, uint64_t object_off,
+ uint64_t object_len,
+ io::DispatchResult* dispatch_result,
+ Context* on_dispatched);
+ bool dispatch_io(uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, int op_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatch);
+
+ bool block_overlapping_io(InFlightObjectExtents* in_flight_object_extents,
+ uint64_t object_off, uint64_t object_len);
+ void unblock_overlapping_ios(uint64_t object_no, uint64_t object_off,
+ uint64_t object_len,
+ Contexts* unoptimized_io_dispatches);
+
+ bool can_dispatch_io(uint64_t tid, uint64_t length);
+
+ void handle_in_flight_io_complete(int r, uint64_t tid, uint64_t object_no,
+ uint64_t object_off, uint64_t object_len);
+ void handle_in_flight_flush_complete(int r, uint64_t tid);
+
+ QueuedIOs collect_ready_ios();
+ Contexts collect_ready_flushes();
+ Contexts collect_finished_flushes();
+
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::WriteAroundObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_WRITE_AROUND_OBJECT_DISPATCH_H
diff --git a/src/librbd/cache/WriteLogImageDispatch.cc b/src/librbd/cache/WriteLogImageDispatch.cc
new file mode 100644
index 000000000..6cb8738e7
--- /dev/null
+++ b/src/librbd/cache/WriteLogImageDispatch.cc
@@ -0,0 +1,217 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/dout.h"
+#include "include/neorados/RADOS.hpp"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/ShutdownRequest.h"
+#include "librbd/cache/WriteLogImageDispatch.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::WriteLogImageDispatch: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+
+template <typename I>
+void WriteLogImageDispatch<I>::shut_down(Context* on_finish) {
+ ceph_assert(m_image_cache != nullptr);
+
+ Context* ctx = new LambdaContext(
+ [this, on_finish](int r) {
+ m_image_cache = nullptr;
+ on_finish->complete(r);
+ });
+
+ cache::pwl::ShutdownRequest<I> *req = cache::pwl::ShutdownRequest<I>::create(
+ *m_image_ctx, m_image_cache, m_plugin_api, ctx);
+ req->send();
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::read(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ io::ReadResult &&read_result, IOContext io_context,
+ int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
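+ // snapshot reads bypass the write log and fall through to the next
+ // dispatch layer; only HEAD data is served from the cache here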
+ if (io_context->read_snap().value_or(CEPH_NOSNAP) != CEPH_NOSNAP) {
+ return false;
+ }
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ if (preprocess_length(aio_comp, image_extents)) {
+ return true;
+ }
+
+ m_plugin_api.update_aio_comp(aio_comp, 1, read_result, image_extents);
+
+ auto *req_comp = m_plugin_api.create_image_read_request(aio_comp, 0, image_extents);
+
+ m_image_cache->read(std::move(image_extents),
+ &req_comp->bl, op_flags,
+ req_comp);
+ return true;
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ if (preprocess_length(aio_comp, image_extents)) {
+ return true;
+ }
+
+ m_plugin_api.update_aio_comp(aio_comp, 1);
+ io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp);
+ m_image_cache->write(std::move(image_extents),
+ std::move(bl), op_flags, req_comp);
+ return true;
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::discard(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ if (preprocess_length(aio_comp, image_extents)) {
+ return true;
+ }
+
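+ // one sub-request per extent: the completion is armed for extents.size()
+ // sub-ops and fires once every per-extent discard below completes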
+ m_plugin_api.update_aio_comp(aio_comp, image_extents.size());
+ for (auto &extent : image_extents) {
+ io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp);
+ m_image_cache->discard(extent.first, extent.second,
+ discard_granularity_bytes,
+ req_comp);
+ }
+ return true;
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::write_same(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&bl, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ if (preprocess_length(aio_comp, image_extents)) {
+ return true;
+ }
+
+ m_plugin_api.update_aio_comp(aio_comp, image_extents.size());
+ for (auto &extent : image_extents) {
+ io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp);
+ m_image_cache->writesame(extent.first, extent.second,
+ std::move(bl), op_flags,
+ req_comp);
+ }
+ return true;
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::compare_and_write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ if (preprocess_length(aio_comp, image_extents)) {
+ return true;
+ }
+
+ m_plugin_api.update_aio_comp(aio_comp, 1);
+ io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp);
+ m_image_cache->compare_and_write(
+ std::move(image_extents), std::move(cmp_bl), std::move(bl),
+ mismatch_offset, op_flags, req_comp);
+ return true;
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::flush(
+ io::AioCompletion* aio_comp, io::FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+
+ m_plugin_api.update_aio_comp(aio_comp, 1);
+
+ io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp);
+ m_image_cache->flush(flush_source, req_comp);
+
+ return true;
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::list_snaps(
+ io::AioCompletion* aio_comp, io::Extents&& image_extents,
+ io::SnapIds&& snap_ids,
+ int list_snaps_flags, io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ return false;
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::preprocess_length(
+ io::AioCompletion* aio_comp, io::Extents &image_extents) const {
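+ // returns true when the request is zero-length and was already completed
+ // here, allowing the caller to return immediately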
+ auto total_bytes = io::util::get_extents_length(image_extents);
+ if (total_bytes == 0) {
+ m_plugin_api.update_aio_comp(aio_comp, 0);
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+bool WriteLogImageDispatch<I>::invalidate_cache(Context* on_finish) {
+ m_image_cache->invalidate(on_finish);
+ return true;
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::WriteLogImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/cache/WriteLogImageDispatch.h b/src/librbd/cache/WriteLogImageDispatch.h
new file mode 100644
index 000000000..934491623
--- /dev/null
+++ b/src/librbd/cache/WriteLogImageDispatch.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WRITELOG_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_WRITELOG_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include "librbd/plugin/Api.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+namespace pwl { template <typename> class AbstractWriteLog; }
+
+template <typename ImageCtxT>
+class WriteLogImageDispatch : public io::ImageDispatchInterface {
+public:
+ WriteLogImageDispatch(ImageCtxT* image_ctx,
+ pwl::AbstractWriteLog<ImageCtx> *image_cache,
+ plugin::Api<ImageCtxT>& plugin_api) :
+ m_image_ctx(image_ctx), m_image_cache(image_cache),
+ m_plugin_api(plugin_api) {
+ }
+
+ io::ImageDispatchLayer get_dispatch_layer() const override {
+ return io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ io::ReadResult &&read_result, IOContext io_context,
+ int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+ bool discard(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+ bool write_same(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+ bool compare_and_write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+ bool flush(
+ io::AioCompletion* aio_comp, io::FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+ bool list_snaps(
+ io::AioCompletion* aio_comp, io::Extents&& image_extents,
+ io::SnapIds&& snap_ids, int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool invalidate_cache(Context* on_finish) override;
+
+private:
+ ImageCtxT* m_image_ctx;
+ pwl::AbstractWriteLog<ImageCtx> *m_image_cache;
+ plugin::Api<ImageCtxT>& m_plugin_api;
+
+ bool preprocess_length(
+ io::AioCompletion* aio_comp, io::Extents &image_extents) const;
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::WriteLogImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_WRITELOG_IMAGE_DISPATCH_H
diff --git a/src/librbd/cache/pwl/AbstractWriteLog.cc b/src/librbd/cache/pwl/AbstractWriteLog.cc
new file mode 100644
index 000000000..6f017a9c3
--- /dev/null
+++ b/src/librbd/cache/pwl/AbstractWriteLog.cc
@@ -0,0 +1,2195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "AbstractWriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/hostname.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/plugin/Api.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::AbstractWriteLog: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+using namespace librbd::cache::pwl;
+
+typedef AbstractWriteLog<ImageCtx>::Extent Extent;
+typedef AbstractWriteLog<ImageCtx>::Extents Extents;
+
+template <typename I>
+AbstractWriteLog<I>::AbstractWriteLog(
+ I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+ Builder<This> *builder, cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<I>& plugin_api)
+ : m_builder(builder),
+ m_write_log_guard(image_ctx.cct),
+ m_flush_guard(image_ctx.cct),
+ m_flush_guard_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_flush_guard_lock", this))),
+ m_deferred_dispatch_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_deferred_dispatch_lock", this))),
+ m_blockguard_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_blockguard_lock", this))),
+ m_thread_pool(
+ image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool",
+ "tp_pwl", 4, ""),
+ m_cache_state(cache_state),
+ m_image_ctx(image_ctx),
+ m_log_pool_size(DEFAULT_POOL_SIZE),
+ m_image_writeback(image_writeback),
+ m_plugin_api(plugin_api),
+ m_log_retire_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_log_retire_lock", this))),
+ m_entry_reader_lock("librbd::cache::pwl::AbstractWriteLog::m_entry_reader_lock"),
+ m_log_append_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_log_append_lock", this))),
+ m_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_lock", this))),
+ m_blocks_to_log_entries(image_ctx.cct),
+ m_work_queue("librbd::cache::pwl::ReplicatedWriteLog::work_queue",
+ ceph::make_timespan(
+ image_ctx.config.template get_val<uint64_t>(
+ "rbd_op_thread_timeout")),
+ &m_thread_pool)
+{
+ CephContext *cct = m_image_ctx.cct;
+ m_plugin_api.get_image_timer_instance(cct, &m_timer, &m_timer_lock);
+}
+
+template <typename I>
+AbstractWriteLog<I>::~AbstractWriteLog() {
+ ldout(m_image_ctx.cct, 15) << "enter" << dendl;
+ {
+ std::lock_guard timer_locker(*m_timer_lock);
+ std::lock_guard locker(m_lock);
+ m_timer->cancel_event(m_timer_ctx);
+ m_thread_pool.stop();
+ ceph_assert(m_deferred_ios.size() == 0);
+ ceph_assert(m_ops_to_flush.size() == 0);
+ ceph_assert(m_ops_to_append.size() == 0);
+ ceph_assert(m_flush_ops_in_flight == 0);
+
+ delete m_cache_state;
+ m_cache_state = nullptr;
+ }
+ ldout(m_image_ctx.cct, 15) << "exit" << dendl;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::perf_start(std::string name) {
+ PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_pwl_first,
+ l_librbd_pwl_last);
+
+ // Latency axis configuration for op histograms, values are in nanoseconds
+ PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
+ "Latency (nsec)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
+ 0, ///< Start at 0
+ 5000, ///< Quantization unit is 5usec
+ 16, ///< Ranges into the milliseconds
+ };
+
+ // Syncpoint logentry number x-axis configuration for op histograms
+ PerfHistogramCommon::axis_config_d sp_logentry_number_config{
+ "logentry number",
+ PerfHistogramCommon::SCALE_LINEAR, // log entry number in linear scale
+ 0, // Start at 0
+ 1, // Quantization unit is 1
+ 260, // Up to 260 > (MAX_WRITES_PER_SYNC_POINT)
+ };
+
+ // Syncpoint bytes number y-axis configuration for op histogram
+ PerfHistogramCommon::axis_config_d sp_bytes_number_config{
+ "Number of SyncPoint",
+ PerfHistogramCommon::SCALE_LOG2, // Request size in logarithmic scale
+ 0, // Start at 0
+ 512, // Quantization unit is 512
+ 17, // Writes up to 8M >= MAX_BYTES_PER_SYNC_POINT
+ };
+
+ // Op size axis configuration for op histogram y axis, values are in bytes
+ PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
+ "Request size (bytes)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+ 0, ///< Start at 0
+ 512, ///< Quantization unit is 512 bytes
+ 16, ///< Writes up to >32k
+ };
+
+ // Num items configuration for op histogram y axis, values are in items
+ PerfHistogramCommon::axis_config_d op_hist_y_axis_count_config{
+ "Number of items",
+ PerfHistogramCommon::SCALE_LINEAR, ///< Item count in linear scale
+ 0, ///< Start at 0
+ 1, ///< Quantization unit is 1
+ 32, ///< Up to 32 items
+ };
+
+ plb.add_u64_counter(l_librbd_pwl_rd_req, "rd", "Reads");
+ plb.add_u64_counter(l_librbd_pwl_rd_bytes, "rd_bytes", "Data size in reads");
+ plb.add_time_avg(l_librbd_pwl_rd_latency, "rd_latency", "Latency of reads");
+
+ plb.add_u64_counter(l_librbd_pwl_rd_hit_req, "hit_rd", "Reads completely hitting RWL");
+ plb.add_u64_counter(l_librbd_pwl_rd_hit_bytes, "rd_hit_bytes", "Bytes read from RWL");
+ plb.add_time_avg(l_librbd_pwl_rd_hit_latency, "hit_rd_latency", "Latency of read hits");
+
+ plb.add_u64_counter(l_librbd_pwl_rd_part_hit_req, "part_hit_rd", "reads partially hitting RWL");
+
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_syncpoint_hist, "syncpoint_logentry_bytes_histogram",
+ sp_logentry_number_config, sp_bytes_number_config,
+ "Histogram of syncpoint's logentry numbers vs bytes number");
+
+ plb.add_u64_counter(l_librbd_pwl_wr_req, "wr", "Writes");
+ plb.add_u64_counter(l_librbd_pwl_wr_bytes, "wr_bytes", "Data size in writes");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_def, "wr_def", "Writes deferred for resources");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_def_lanes, "wr_def_lanes", "Writes deferred for lanes");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_def_log, "wr_def_log", "Writes deferred for log entries");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_def_buf, "wr_def_buf", "Writes deferred for buffers");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_overlap, "wr_overlap", "Writes overlapping with prior in-progress writes");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_queued, "wr_q_barrier", "Writes queued for prior barriers (aio_flush)");
+
+ plb.add_u64_counter(l_librbd_pwl_log_ops, "log_ops", "Log appends");
+ plb.add_u64_avg(l_librbd_pwl_log_op_bytes, "log_op_bytes", "Average log append bytes");
+
+ plb.add_time_avg(
+ l_librbd_pwl_req_arr_to_all_t, "req_arr_to_all_t",
+ "Average arrival to allocation time (time deferred for overlap)");
+ plb.add_time_avg(
+ l_librbd_pwl_req_arr_to_dis_t, "req_arr_to_dis_t",
+ "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
+ plb.add_time_avg(
+ l_librbd_pwl_req_all_to_dis_t, "req_all_to_dis_t",
+ "Average allocation to dispatch time (time deferred for log resources)");
+ plb.add_time_avg(
+ l_librbd_pwl_wr_latency, "wr_latency",
+ "Latency of writes (persistent completion)");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_wr_latency_hist, "wr_latency_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of write request latency (nanoseconds) vs. bytes written");
+ plb.add_time_avg(
+ l_librbd_pwl_wr_caller_latency, "caller_wr_latency",
+ "Latency of write completion to caller");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_req_arr_to_all_t, "req_arr_to_all_nw_t",
+ "Average arrival to allocation time (time deferred for overlap)");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_req_arr_to_dis_t, "req_arr_to_dis_nw_t",
+ "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_req_all_to_dis_t, "req_all_to_dis_nw_t",
+ "Average allocation to dispatch time (time deferred for log resources)");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_wr_latency, "wr_latency_nw",
+ "Latency of writes (persistent completion) not deferred for free space");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_nowait_wr_latency_hist, "wr_latency_nw_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of write request latency (nanoseconds) vs. bytes written for writes not deferred for free space");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_wr_caller_latency, "caller_wr_latency_nw",
+ "Latency of write completion to callerfor writes not deferred for free space");
+ plb.add_time_avg(l_librbd_pwl_log_op_alloc_t, "op_alloc_t", "Average buffer pmemobj_reserve() time");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_log_op_alloc_t_hist, "op_alloc_t_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of buffer pmemobj_reserve() time (nanoseconds) vs. bytes written");
+ plb.add_time_avg(l_librbd_pwl_log_op_dis_to_buf_t, "op_dis_to_buf_t", "Average dispatch to buffer persist time");
+ plb.add_time_avg(l_librbd_pwl_log_op_dis_to_app_t, "op_dis_to_app_t", "Average dispatch to log append time");
+ plb.add_time_avg(l_librbd_pwl_log_op_dis_to_cmp_t, "op_dis_to_cmp_t", "Average dispatch to persist completion time");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_log_op_dis_to_cmp_t_hist, "op_dis_to_cmp_t_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of op dispatch to persist complete time (nanoseconds) vs. bytes written");
+
+ plb.add_time_avg(
+ l_librbd_pwl_log_op_buf_to_app_t, "op_buf_to_app_t",
+ "Average buffer persist to log append time (write data persist/replicate + wait for append time)");
+ plb.add_time_avg(
+ l_librbd_pwl_log_op_buf_to_bufc_t, "op_buf_to_bufc_t",
+ "Average buffer persist time (write data persist/replicate time)");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_log_op_buf_to_bufc_t_hist, "op_buf_to_bufc_t_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of write buffer persist time (nanoseconds) vs. bytes written");
+ plb.add_time_avg(
+ l_librbd_pwl_log_op_app_to_cmp_t, "op_app_to_cmp_t",
+ "Average log append to persist complete time (log entry append/replicate + wait for complete time)");
+ plb.add_time_avg(
+ l_librbd_pwl_log_op_app_to_appc_t, "op_app_to_appc_t",
+ "Average log append to persist complete time (log entry append/replicate time)");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_log_op_app_to_appc_t_hist, "op_app_to_appc_t_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of log append persist time (nanoseconds) (vs. op bytes)");
+
+ plb.add_u64_counter(l_librbd_pwl_discard, "discard", "Discards");
+ plb.add_u64_counter(l_librbd_pwl_discard_bytes, "discard_bytes", "Bytes discarded");
+ plb.add_time_avg(l_librbd_pwl_discard_latency, "discard_lat", "Discard latency");
+
+ plb.add_u64_counter(l_librbd_pwl_aio_flush, "aio_flush", "AIO flush (flush to RWL)");
+ plb.add_u64_counter(l_librbd_pwl_aio_flush_def, "aio_flush_def", "AIO flushes deferred for resources");
+ plb.add_time_avg(l_librbd_pwl_aio_flush_latency, "aio_flush_lat", "AIO flush latency");
+
+ plb.add_u64_counter(l_librbd_pwl_ws,"ws", "Write Sames");
+ plb.add_u64_counter(l_librbd_pwl_ws_bytes, "ws_bytes", "Write Same bytes to image");
+ plb.add_time_avg(l_librbd_pwl_ws_latency, "ws_lat", "Write Same latency");
+
+ plb.add_u64_counter(l_librbd_pwl_cmp, "cmp", "Compare and Write requests");
+ plb.add_u64_counter(l_librbd_pwl_cmp_bytes, "cmp_bytes", "Compare and Write bytes compared/written");
+ plb.add_time_avg(l_librbd_pwl_cmp_latency, "cmp_lat", "Compare and Write latency");
+ plb.add_u64_counter(l_librbd_pwl_cmp_fails, "cmp_fails", "Compare and Write compare fails");
+
+ plb.add_u64_counter(l_librbd_pwl_internal_flush, "internal_flush", "Flush RWL (write back to OSD)");
+ plb.add_time_avg(l_librbd_pwl_writeback_latency, "writeback_lat", "write back to OSD latency");
+ plb.add_u64_counter(l_librbd_pwl_invalidate_cache, "invalidate", "Invalidate RWL");
+ plb.add_u64_counter(l_librbd_pwl_invalidate_discard_cache, "discard", "Discard and invalidate RWL");
+
+ plb.add_time_avg(l_librbd_pwl_append_tx_t, "append_tx_lat", "Log append transaction latency");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_append_tx_t_hist, "append_tx_lat_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_count_config,
+ "Histogram of log append transaction time (nanoseconds) vs. entries appended");
+ plb.add_time_avg(l_librbd_pwl_retire_tx_t, "retire_tx_lat", "Log retire transaction latency");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_retire_tx_t_hist, "retire_tx_lat_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_count_config,
+ "Histogram of log retire transaction time (nanoseconds) vs. entries retired");
+
+ m_perfcounter = plb.create_perf_counters();
+ m_image_ctx.cct->get_perfcounters_collection()->add(m_perfcounter);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::perf_stop() {
+ ceph_assert(m_perfcounter);
+ m_image_ctx.cct->get_perfcounters_collection()->remove(m_perfcounter);
+ delete m_perfcounter;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::log_perf() {
+ bufferlist bl;
+ Formatter *f = Formatter::create("json-pretty");
+ bl.append("Perf dump follows\n--- Begin perf dump ---\n");
+ bl.append("{\n");
+ stringstream ss;
+ utime_t now = ceph_clock_now();
+ ss << "\"test_time\": \"" << now << "\",";
+ ss << "\"image\": \"" << m_image_ctx.name << "\",";
+ bl.append(ss);
+ bl.append("\"stats\": ");
+ m_image_ctx.cct->get_perfcounters_collection()->dump_formatted(f, 0);
+ f->flush(bl);
+ bl.append(",\n\"histograms\": ");
+ m_image_ctx.cct->get_perfcounters_collection()->dump_formatted_histograms(f, 0);
+ f->flush(bl);
+ delete f;
+ bl.append("}\n--- End perf dump ---\n");
+ bl.append('\0');
+ ldout(m_image_ctx.cct, 1) << bl.c_str() << dendl;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::periodic_stats() {
+ std::unique_lock locker(m_lock);
+ ldout(m_image_ctx.cct, 5) << "STATS: m_log_entries=" << m_log_entries.size()
+ << ", m_dirty_log_entries=" << m_dirty_log_entries.size()
+ << ", m_free_log_entries=" << m_free_log_entries
+ << ", m_bytes_allocated=" << m_bytes_allocated
+ << ", m_bytes_cached=" << m_bytes_cached
+ << ", m_bytes_dirty=" << m_bytes_dirty
+ << ", bytes available=" << m_bytes_allocated_cap - m_bytes_allocated
+ << ", m_first_valid_entry=" << m_first_valid_entry
+ << ", m_first_free_entry=" << m_first_free_entry
+ << ", m_current_sync_gen=" << m_current_sync_gen
+ << ", m_flushed_sync_gen=" << m_flushed_sync_gen
+ << dendl;
+
+ update_image_cache_state();
+ write_image_cache_state(locker);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::arm_periodic_stats() {
+ ceph_assert(ceph_mutex_is_locked(*m_timer_lock));
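+ // self-rearming timer: each expiry logs stats and schedules the next event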
+ m_timer_ctx = new LambdaContext([this](int r) {
+ /* m_timer_lock is held */
+ periodic_stats();
+ arm_periodic_stats();
+ });
+ m_timer->add_event_after(LOG_STATS_INTERVAL_SECONDS, m_timer_ctx);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::update_entries(std::shared_ptr<GenericLogEntry> *log_entry,
+ WriteLogCacheEntry *cache_entry, std::map<uint64_t, bool> &missing_sync_points,
+ std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> &sync_point_entries,
+ uint64_t entry_index) {
+ bool writer = cache_entry->is_writer();
+ if (cache_entry->is_sync_point()) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " is a sync point. cache_entry=[" << *cache_entry << "]" << dendl;
+ auto sync_point_entry = std::make_shared<SyncPointLogEntry>(cache_entry->sync_gen_number);
+ *log_entry = sync_point_entry;
+ sync_point_entries[cache_entry->sync_gen_number] = sync_point_entry;
+ missing_sync_points.erase(cache_entry->sync_gen_number);
+ m_current_sync_gen = cache_entry->sync_gen_number;
+ } else if (cache_entry->is_write()) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " is a write. cache_entry=[" << *cache_entry << "]" << dendl;
+ auto write_entry =
+ m_builder->create_write_log_entry(nullptr, cache_entry->image_offset_bytes, cache_entry->write_bytes);
+ write_data_to_buffer(write_entry, cache_entry);
+ *log_entry = write_entry;
+ } else if (cache_entry->is_writesame()) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " is a write same. cache_entry=[" << *cache_entry << "]" << dendl;
+ auto ws_entry =
+ m_builder->create_writesame_log_entry(nullptr, cache_entry->image_offset_bytes,
+ cache_entry->write_bytes, cache_entry->ws_datalen);
+ write_data_to_buffer(ws_entry, cache_entry);
+ *log_entry = ws_entry;
+ } else if (cache_entry->is_discard()) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " is a discard. cache_entry=[" << *cache_entry << "]" << dendl;
+ auto discard_entry =
+ std::make_shared<DiscardLogEntry>(nullptr, cache_entry->image_offset_bytes, cache_entry->write_bytes,
+ m_discard_granularity_bytes);
+ *log_entry = discard_entry;
+ } else {
+ lderr(m_image_ctx.cct) << "Unexpected entry type in entry " << entry_index
+ << ", cache_entry=[" << *cache_entry << "]" << dendl;
+ }
+
+ if (writer) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " writes. cache_entry=[" << *cache_entry << "]" << dendl;
+ if (!sync_point_entries[cache_entry->sync_gen_number]) {
+ missing_sync_points[cache_entry->sync_gen_number] = true;
+ }
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::update_sync_points(std::map<uint64_t, bool> &missing_sync_points,
+ std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> &sync_point_entries,
+ DeferredContexts &later) {
+ /* Create missing sync points. These must not be appended until the
+ * entry reload is complete and the write map is up to
+ * date. Currently this is handled by the deferred contexts object
+ * passed to new_sync_point(). These contexts won't be completed
+ * until this function returns. */
+ for (auto &kv : missing_sync_points) {
+ ldout(m_image_ctx.cct, 5) << "Adding sync point " << kv.first << dendl;
+ if (0 == m_current_sync_gen) {
+ /* The unlikely case where the log contains writing entries, but no sync
+ * points (e.g. because they were all retired) */
+ m_current_sync_gen = kv.first-1;
+ }
+ ceph_assert(kv.first == m_current_sync_gen+1);
+ init_flush_new_sync_point(later);
+ ceph_assert(kv.first == m_current_sync_gen);
+ sync_point_entries[kv.first] = m_current_sync_point->log_entry;
+ }
+
+ /*
+ * Iterate over the log entries again (this time via the global
+ * entries list), connecting write entries to their sync points and
+ * updating the sync point stats.
+ *
+ * Add writes to the write log map.
+ */
+ std::shared_ptr<SyncPointLogEntry> previous_sync_point_entry = nullptr;
+ for (auto &log_entry : m_log_entries) {
+ if ((log_entry->write_bytes() > 0) || (log_entry->bytes_dirty() > 0)) {
+ /* This entry is one of the types that write */
+ auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(log_entry);
+ if (gen_write_entry) {
+ auto sync_point_entry = sync_point_entries[gen_write_entry->ram_entry.sync_gen_number];
+ if (!sync_point_entry) {
+ lderr(m_image_ctx.cct) << "Sync point missing for entry=[" << *gen_write_entry << "]" << dendl;
+ ceph_assert(false);
+ } else {
+ gen_write_entry->sync_point_entry = sync_point_entry;
+ sync_point_entry->writes++;
+ sync_point_entry->bytes += gen_write_entry->ram_entry.write_bytes;
+ sync_point_entry->writes_completed++;
+ m_blocks_to_log_entries.add_log_entry(gen_write_entry);
+ /* This entry is only dirty if its sync gen number is > the flushed
+ * sync gen number from the root object. */
+ if (gen_write_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) {
+ m_dirty_log_entries.push_back(log_entry);
+ m_bytes_dirty += gen_write_entry->bytes_dirty();
+ } else {
+ gen_write_entry->set_flushed(true);
+ sync_point_entry->writes_flushed++;
+ }
+
+ /* calc m_bytes_allocated & m_bytes_cached */
+ inc_allocated_cached_bytes(log_entry);
+ }
+ }
+ } else {
+ /* This entry is sync point entry */
+ auto sync_point_entry = static_pointer_cast<SyncPointLogEntry>(log_entry);
+ if (sync_point_entry) {
+ if (previous_sync_point_entry) {
+ previous_sync_point_entry->next_sync_point_entry = sync_point_entry;
+ if (previous_sync_point_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) {
+ sync_point_entry->prior_sync_point_flushed = false;
+ ceph_assert(!previous_sync_point_entry->prior_sync_point_flushed ||
+ (0 == previous_sync_point_entry->writes) ||
+ (previous_sync_point_entry->writes >= previous_sync_point_entry->writes_flushed));
+ } else {
+ sync_point_entry->prior_sync_point_flushed = true;
+ ceph_assert(previous_sync_point_entry->prior_sync_point_flushed);
+ ceph_assert(previous_sync_point_entry->writes == previous_sync_point_entry->writes_flushed);
+ }
+ } else {
+ /* There are no previous sync points, so we'll consider them flushed */
+ sync_point_entry->prior_sync_point_flushed = true;
+ }
+ previous_sync_point_entry = sync_point_entry;
+ ldout(m_image_ctx.cct, 10) << "Loaded to sync point=[" << *sync_point_entry << dendl;
+ }
+ }
+ }
+ if (0 == m_current_sync_gen) {
+ /* If a re-opened log was completely flushed, we'll have found no sync point entries here,
+ * and not advanced m_current_sync_gen. Here we ensure it starts past the last flushed sync
+ * point recorded in the log. */
+ m_current_sync_gen = m_flushed_sync_gen;
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::pwl_init(Context *on_finish, DeferredContexts &later) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+ ceph_assert(m_cache_state);
+ std::lock_guard locker(m_lock);
+ ceph_assert(!m_initialized);
+ ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl;
+
+ if (!m_cache_state->present) {
+ m_cache_state->host = ceph_get_short_hostname();
+ m_cache_state->size = m_image_ctx.config.template get_val<uint64_t>(
+ "rbd_persistent_cache_size");
+
+ string path = m_image_ctx.config.template get_val<string>(
+ "rbd_persistent_cache_path");
+ std::string pool_name = m_image_ctx.md_ctx.get_pool_name();
+ m_cache_state->path = path + "/rbd-pwl." + pool_name + "." + m_image_ctx.id + ".pool";
+ }
+
+ ldout(cct,5) << "pwl_size: " << m_cache_state->size << dendl;
+ ldout(cct,5) << "pwl_path: " << m_cache_state->path << dendl;
+
+ m_log_pool_name = m_cache_state->path;
+ m_log_pool_size = max(m_cache_state->size, MIN_POOL_SIZE);
+ m_log_pool_size = p2align(m_log_pool_size, POOL_SIZE_ALIGN);
+ ldout(cct, 5) << "pool " << m_log_pool_name << " size " << m_log_pool_size
+ << " (adjusted from " << m_cache_state->size << ")" << dendl;
+
+ if ((!m_cache_state->present) &&
+ (access(m_log_pool_name.c_str(), F_OK) == 0)) {
+ ldout(cct, 5) << "There's an existing pool file " << m_log_pool_name
+ << ", While there's no cache in the image metatata." << dendl;
+ if (remove(m_log_pool_name.c_str()) != 0) {
+ lderr(cct) << "Failed to remove the pool file " << m_log_pool_name
+ << dendl;
+ on_finish->complete(-errno);
+ return;
+ } else {
+ ldout(cct, 5) << "Removed the existing pool file." << dendl;
+ }
+ } else if ((m_cache_state->present) &&
+ (access(m_log_pool_name.c_str(), F_OK) != 0)) {
+ ldout(cct, 5) << "Can't find the existed pool file " << m_log_pool_name << dendl;
+ on_finish->complete(-errno);
+ return;
+ }
+
+ bool succeeded = initialize_pool(on_finish, later);
+ if (!succeeded) {
+ return;
+ }
+
+ ldout(cct,1) << "pool " << m_log_pool_name << " has " << m_total_log_entries
+ << " log entries, " << m_free_log_entries << " of which are free."
+ << " first_valid=" << m_first_valid_entry
+ << ", first_free=" << m_first_free_entry
+ << ", flushed_sync_gen=" << m_flushed_sync_gen
+ << ", m_current_sync_gen=" << m_current_sync_gen << dendl;
+ if (m_first_free_entry == m_first_valid_entry) {
+ ldout(cct,1) << "write log is empty" << dendl;
+ m_cache_state->empty = true;
+ }
+
+ /* Start the sync point following the last one seen in the
+ * log. Flush the last sync point created during the loading of the
+ * existing log entries. */
+ init_flush_new_sync_point(later);
+ ldout(cct,20) << "new sync point = [" << m_current_sync_point << "]" << dendl;
+
+ m_initialized = true;
+ // Start the thread
+ m_thread_pool.start();
+
+ /* Do these after we drop lock */
+ later.add(new LambdaContext([this](int r) {
+ /* Log stats for the first time */
+ periodic_stats();
+ /* Arm periodic stats logging for the first time */
+ std::lock_guard timer_locker(*m_timer_lock);
+ arm_periodic_stats();
+ }));
+ m_image_ctx.op_work_queue->queue(on_finish, 0);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::write_image_cache_state(std::unique_lock<ceph::mutex>& locker) {
+ using klass = AbstractWriteLog<I>;
+ Context *ctx = util::create_context_callback<
+ klass, &klass::handle_write_image_cache_state>(this);
+ m_cache_state->write_image_cache_state(locker, ctx);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::update_image_cache_state() {
+ ldout(m_image_ctx.cct, 10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ m_cache_state->allocated_bytes = m_bytes_allocated;
+ m_cache_state->cached_bytes = m_bytes_cached;
+ m_cache_state->dirty_bytes = m_bytes_dirty;
+ m_cache_state->free_bytes = m_bytes_allocated_cap - m_bytes_allocated;
+ m_cache_state->hits_full = m_perfcounter->get(l_librbd_pwl_rd_hit_req);
+ m_cache_state->hits_partial = m_perfcounter->get(l_librbd_pwl_rd_part_hit_req);
+ m_cache_state->misses = m_perfcounter->get(l_librbd_pwl_rd_req) -
+ m_cache_state->hits_full - m_cache_state->hits_partial;
+ m_cache_state->hit_bytes = m_perfcounter->get(l_librbd_pwl_rd_hit_bytes);
+ m_cache_state->miss_bytes = m_perfcounter->get(l_librbd_pwl_rd_bytes) -
+ m_cache_state->hit_bytes;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::handle_write_image_cache_state(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to update image cache state: " << cpp_strerror(r)
+ << dendl;
+ return;
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::init(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+ auto pname = std::string("librbd-pwl-") + m_image_ctx.id +
+ std::string("-") + m_image_ctx.md_ctx.get_pool_name() +
+ std::string("-") + m_image_ctx.name;
+ perf_start(pname);
+
+ ceph_assert(!m_initialized);
+
+ Context *ctx = new LambdaContext(
+ [this, on_finish](int r) {
+ if (r >= 0) {
+ std::unique_lock locker(m_lock);
+ update_image_cache_state();
+ m_cache_state->write_image_cache_state(locker, on_finish);
+ } else {
+ on_finish->complete(r);
+ }
+ });
+
+ DeferredContexts later;
+ pwl_init(ctx, later);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::shut_down(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl;
+
+ Context *ctx = new LambdaContext(
+ [this, on_finish](int r) {
+ if (m_perfcounter) {
+ perf_stop();
+ }
+ ldout(m_image_ctx.cct, 6) << "shutdown complete" << dendl;
+ m_image_ctx.op_work_queue->queue(on_finish, r);
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ ldout(m_image_ctx.cct, 6) << "image cache cleaned" << dendl;
+ Context *next_ctx = override_ctx(r, ctx);
+ periodic_stats();
+
+ std::unique_lock locker(m_lock);
+ check_image_cache_state_clean();
+ m_wake_up_enabled = false;
+ m_log_entries.clear();
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ remove_pool_file();
+ update_image_cache_state();
+ m_cache_state->write_image_cache_state(locker, next_ctx);
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ Context *next_ctx = override_ctx(r, ctx);
+ ldout(m_image_ctx.cct, 6) << "waiting for in flight operations" << dendl;
+ // Wait for in progress IOs to complete
+ next_ctx = util::create_async_context_callback(&m_work_queue, next_ctx);
+ m_async_op_tracker.wait_for_ops(next_ctx);
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ Context *next_ctx = override_ctx(r, ctx);
+ {
+ /* Sync with process_writeback_dirty_entries() */
+ RWLock::WLocker entry_reader_wlocker(m_entry_reader_lock);
+ m_shutting_down = true;
+ /* Flush all writes to OSDs (unless disabled) and wait for all
+ in-progress flush writes to complete */
+ ldout(m_image_ctx.cct, 6) << "flushing" << dendl;
+ periodic_stats();
+ }
+ flush_dirty_entries(next_ctx);
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ ldout(m_image_ctx.cct, 6) << "Done internal_flush in shutdown" << dendl;
+ m_work_queue.queue(ctx, r);
+ });
+ /* Complete all in-flight writes before shutting down */
+ ldout(m_image_ctx.cct, 6) << "internal_flush in shutdown" << dendl;
+ internal_flush(false, ctx);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::read(Extents&& image_extents,
+ ceph::bufferlist* bl,
+ int fadvise_flags, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ utime_t now = ceph_clock_now();
+
+ on_finish = new LambdaContext(
+ [this, on_finish](int r) {
+ m_async_op_tracker.finish_op();
+ on_finish->complete(r);
+ });
+ C_ReadRequest *read_ctx = m_builder->create_read_request(
+ cct, now, m_perfcounter, bl, on_finish);
+ ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
+ << "image_extents=" << image_extents << ", "
+ << "bl=" << bl << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ ceph_assert(m_initialized);
+ bl->clear();
+ m_perfcounter->inc(l_librbd_pwl_rd_req, 1);
+
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> log_entries_to_read;
+ std::vector<bufferlist*> bls_to_read;
+
+ m_async_op_tracker.start_op();
+ Context *ctx = new LambdaContext(
+ [this, read_ctx, fadvise_flags](int r) {
+ if (read_ctx->miss_extents.empty()) {
+ /* All of this read comes from RWL */
+ read_ctx->complete(0);
+ } else {
+ /* Pass the read misses on to the layer below RWL */
+ m_image_writeback.aio_read(
+ std::move(read_ctx->miss_extents), &read_ctx->miss_bl,
+ fadvise_flags, read_ctx);
+ }
+ });
+
+ /*
+ * The strategy here is to look up all the WriteLogMapEntries that overlap
+ * this read, and iterate through those to separate this read into hits and
+ * misses. A new Extents object is produced here with Extents for each miss
+ * region. The miss Extents is then passed on to the read cache below RWL. We
+ * also produce an ImageExtentBufs for all the extents (hit or miss) in this
+ * read. When the read from the lower cache layer completes, we iterate
+ * through the ImageExtentBufs and insert buffers for each cache hit at the
+ * appropriate spot in the bufferlist returned from below for the miss
+ * read. The buffers we insert here refer directly to regions of various
+ * write log entry data buffers.
+ *
+ * Locking: These buffer objects hold a reference on the write log entries
+ * they refer to. Log entries can't be retired until there are no references.
+ * The GenericWriteLogEntry references are released by the buffer destructor.
+ */
+ for (auto &extent : image_extents) {
+ uint64_t extent_offset = 0;
+ RWLock::RLocker entry_reader_locker(m_entry_reader_lock);
+ WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(
+ block_extent(extent));
+ for (auto &map_entry : map_entries) {
+ Extent entry_image_extent(pwl::image_extent(map_entry.block_extent));
+ /* If this map entry starts after the current image extent offset ... */
+ if (entry_image_extent.first > extent.first + extent_offset) {
+ /* ... add range before map_entry to miss extents */
+ uint64_t miss_extent_start = extent.first + extent_offset;
+ uint64_t miss_extent_length = entry_image_extent.first -
+ miss_extent_start;
+ Extent miss_extent(miss_extent_start, miss_extent_length);
+ read_ctx->miss_extents.push_back(miss_extent);
+ /* Add miss range to read extents */
+ auto miss_extent_buf = std::make_shared<ImageExtentBuf>(miss_extent);
+ read_ctx->read_extents.push_back(miss_extent_buf);
+ extent_offset += miss_extent_length;
+ }
+ ceph_assert(entry_image_extent.first <= extent.first + extent_offset);
+ uint64_t entry_offset = 0;
+ /* If this map entry starts before the current image extent offset ... */
+ if (entry_image_extent.first < extent.first + extent_offset) {
+ /* ... compute offset into log entry for this read extent */
+ entry_offset = (extent.first + extent_offset) - entry_image_extent.first;
+ }
+ /* This read hit ends at the end of the extent or the end of the log
+ entry, whichever is less. */
+ uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset,
+ extent.second - extent_offset);
+ Extent hit_extent(entry_image_extent.first, entry_hit_length);
+ if (0 == map_entry.log_entry->write_bytes() &&
+ 0 < map_entry.log_entry->bytes_dirty()) {
+ /* discard log entry */
+ ldout(cct, 20) << "discard log entry" << dendl;
+ auto discard_entry = map_entry.log_entry;
+ ldout(cct, 20) << "read hit on discard entry: log_entry="
+ << *discard_entry
+ << dendl;
+ /* Discards read as zero, so we'll construct a bufferlist of zeros */
+ bufferlist zero_bl;
+ zero_bl.append_zero(entry_hit_length);
+ /* Add hit extent to read extents */
+ auto hit_extent_buf = std::make_shared<ImageExtentBuf>(
+ hit_extent, zero_bl);
+ read_ctx->read_extents.push_back(hit_extent_buf);
+ } else {
+ ldout(cct, 20) << "write or writesame log entry" << dendl;
+ /* write and writesame log entry */
+ /* Offset of the map entry into the log entry's buffer */
+ uint64_t map_entry_buffer_offset = entry_image_extent.first -
+ map_entry.log_entry->ram_entry.image_offset_bytes;
+ /* Offset into the log entry buffer of this read hit */
+ uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset;
+ /* Create buffer object referring to pmem pool for this read hit */
+ collect_read_extents(
+ read_buffer_offset, map_entry, log_entries_to_read, bls_to_read,
+ entry_hit_length, hit_extent, read_ctx);
+ }
+ /* Exclude RWL hit range from buffer and extent */
+ extent_offset += entry_hit_length;
+ ldout(cct, 20) << map_entry << dendl;
+ }
+ /* If the last map entry didn't consume the entire image extent ... */
+ if (extent.second > extent_offset) {
+ /* ... add the rest of this extent to miss extents */
+ uint64_t miss_extent_start = extent.first + extent_offset;
+ uint64_t miss_extent_length = extent.second - extent_offset;
+ Extent miss_extent(miss_extent_start, miss_extent_length);
+ read_ctx->miss_extents.push_back(miss_extent);
+ /* Add miss range to read extents */
+ auto miss_extent_buf = std::make_shared<ImageExtentBuf>(miss_extent);
+ read_ctx->read_extents.push_back(miss_extent_buf);
+ extent_offset += miss_extent_length;
+ }
+ }
+
+ ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents << ", "
+ << "miss_bl=" << read_ctx->miss_bl << dendl;
+
+ complete_read(log_entries_to_read, bls_to_read, ctx);
+}
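+
+/* [Editor's note] A minimal standalone sketch of the hit/miss mapping done
+ * by the read path above, under simplifying assumptions: the overlapping
+ * map entries are sorted, non-overlapping, and each intersects the request;
+ * the per-entry buffer-offset bookkeeping is omitted. The names below (Ext,
+ * map_request) are illustrative, not part of librbd.
+ *
+ *   #include <algorithm>
+ *   #include <cstdint>
+ *   #include <utility>
+ *   #include <vector>
+ *
+ *   using Ext = std::pair<uint64_t, uint64_t>;  // {image offset, length}
+ *
+ *   // Split a request against cached entries into alternating miss (gap)
+ *   // and hit (overlap) pieces, advancing a cursor through the request.
+ *   std::vector<std::pair<Ext, bool>>           // {extent, is_hit}
+ *   map_request(Ext req, const std::vector<Ext> &entries) {
+ *     std::vector<std::pair<Ext, bool>> out;
+ *     uint64_t cur = req.first, end = req.first + req.second;
+ *     for (const auto &e : entries) {
+ *       if (e.first > cur) {                    // gap before entry: miss
+ *         out.push_back({{cur, e.first - cur}, false});
+ *         cur = e.first;
+ *       }
+ *       uint64_t hit_len = std::min(e.first + e.second, end) - cur;
+ *       out.push_back({{cur, hit_len}, true});  // overlap: hit
+ *       cur += hit_len;
+ *       if (cur == end) {
+ *         break;
+ *       }
+ *     }
+ *     if (cur < end) {                          // trailing gap: miss
+ *       out.push_back({{cur, end - cur}, false});
+ *     }
+ *     return out;
+ *   }
+ */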
+
+template <typename I>
+void AbstractWriteLog<I>::write(Extents &&image_extents,
+ bufferlist&& bl,
+ int fadvise_flags,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << "aio_write" << dendl;
+
+ utime_t now = ceph_clock_now();
+ m_perfcounter->inc(l_librbd_pwl_wr_req, 1);
+
+ ceph_assert(m_initialized);
+
+ /* Split large image extents into smaller pieces because PMDK's space
+ * management is imperfect and prone to fragmentation: the wider the spread
+ * of allocation sizes, the more readily the pool fragments, leaving the
+ * remaining free space unallocatable in large chunks. We plan to manage
+ * pmem space and allocation ourselves in the future.
+ */
+ Extents split_image_extents;
+ uint64_t max_extent_size = get_max_extent();
+ if (max_extent_size != 0) {
+ for (auto extent : image_extents) {
+ if (extent.second > max_extent_size) {
+ uint64_t off = extent.first;
+ uint64_t extent_bytes = extent.second;
+ for (int i = 0; extent_bytes != 0; ++i) {
+ Extent _ext;
+ _ext.first = off + i * max_extent_size;
+ _ext.second = std::min(max_extent_size, extent_bytes);
+ extent_bytes -= _ext.second;
+ split_image_extents.emplace_back(_ext);
+ }
+ } else {
+ split_image_extents.emplace_back(extent);
+ }
+ }
+ } else {
+ split_image_extents = image_extents;
+ }
+
+ C_WriteRequestT *write_req =
+ m_builder->create_write_request(*this, now, std::move(split_image_extents),
+ std::move(bl), fadvise_flags, m_lock,
+ m_perfcounter, on_finish);
+ m_perfcounter->inc(l_librbd_pwl_wr_bytes,
+ write_req->image_extents_summary.total_bytes);
+
+ /* The lambda below will be called when the block guard for all
+ * blocks affected by this write is obtained */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this,
+ write_req](GuardedRequestFunctionContext &guard_ctx) {
+ write_req->blockguard_acquired(guard_ctx);
+ alloc_and_dispatch_io_req(write_req);
+ });
+
+ detain_guarded_request(write_req, guarded_ctx, false);
+}
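+
+/* [Editor's note] The extent-splitting loop above, as a self-contained
+ * sketch. It assumes only that extents are {offset, length} pairs; the
+ * name split_extents is illustrative.
+ *
+ *   #include <algorithm>
+ *   #include <cstdint>
+ *   #include <utility>
+ *   #include <vector>
+ *
+ *   using Ext = std::pair<uint64_t, uint64_t>;
+ *
+ *   // Split any extent longer than max_len into consecutive pieces of at
+ *   // most max_len bytes; max_len == 0 disables splitting.
+ *   std::vector<Ext> split_extents(const std::vector<Ext> &in,
+ *                                  uint64_t max_len) {
+ *     std::vector<Ext> out;
+ *     for (auto [off, len] : in) {
+ *       if (max_len == 0 || len <= max_len) {
+ *         out.emplace_back(off, len);
+ *         continue;
+ *       }
+ *       while (len > 0) {
+ *         uint64_t piece = std::min(max_len, len);
+ *         out.emplace_back(off, piece);
+ *         off += piece;
+ *         len -= piece;
+ *       }
+ *     }
+ *     return out;
+ *   }
+ *
+ * e.g. split_extents({{0, 2560}}, 1024) yields {0,1024} {1024,1024}
+ * {2048,512}.
+ */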
+
+template <typename I>
+void AbstractWriteLog<I>::discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << dendl;
+
+ utime_t now = ceph_clock_now();
+ m_perfcounter->inc(l_librbd_pwl_discard, 1);
+ Extents discard_extents = {{offset, length}};
+ m_discard_granularity_bytes = discard_granularity_bytes;
+
+ ceph_assert(m_initialized);
+
+ auto *discard_req =
+ new C_DiscardRequestT(*this, now, std::move(discard_extents), discard_granularity_bytes,
+ m_lock, m_perfcounter, on_finish);
+
+ /* The lambda below will be called when the block guard for all
+ * blocks affected by this write is obtained */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, discard_req](GuardedRequestFunctionContext &guard_ctx) {
+ discard_req->blockguard_acquired(guard_ctx);
+ alloc_and_dispatch_io_req(discard_req);
+ });
+
+ detain_guarded_request(discard_req, guarded_ctx, false);
+}
+
+/**
+ * Aio_flush completes when all previously completed writes are
+ * flushed to persistent cache. We make a best-effort attempt to also
+ * defer until all in-progress writes complete, but we may not know
+ * about all of the writes the application considers in-progress yet,
+ * due to uncertainty in the IO submission workq (multiple WQ threads
+ * may allow out-of-order submission).
+ *
+ * This flush operation will not wait for writes deferred for overlap
+ * in the block guard.
+ */
+template <typename I>
+void AbstractWriteLog<I>::flush(io::FlushSource flush_source, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "on_finish=" << on_finish << " flush_source=" << flush_source << dendl;
+
+ if (io::FLUSH_SOURCE_SHUTDOWN == flush_source || io::FLUSH_SOURCE_INTERNAL == flush_source ||
+ io::FLUSH_SOURCE_WRITE_BLOCK == flush_source) {
+ internal_flush(false, on_finish);
+ return;
+ }
+ m_perfcounter->inc(l_librbd_pwl_aio_flush, 1);
+
+ /* May be called even if initialization fails */
+ if (!m_initialized) {
+ ldout(cct, 5) << "never initialized" << dendl;
+ /* Deadlock if completed here */
+ m_image_ctx.op_work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ {
+ std::shared_lock image_locker(m_image_ctx.image_lock);
+ if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+ }
+
+ auto flush_req = make_flush_req(on_finish);
+
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, flush_req](GuardedRequestFunctionContext &guard_ctx) {
+ ldout(m_image_ctx.cct, 20) << "flush_req=" << flush_req << " cell=" << guard_ctx.cell << dendl;
+ ceph_assert(guard_ctx.cell);
+ flush_req->detained = guard_ctx.state.detained;
+ /* We don't call flush_req->set_cell(), because the block guard will be released here */
+ {
+ DeferredContexts post_unlock; /* Do these when the lock below is released */
+ std::lock_guard locker(m_lock);
+
+ if (!m_persist_on_flush && m_persist_on_write_until_flush) {
+ m_persist_on_flush = true;
+ ldout(m_image_ctx.cct, 5) << "now persisting on flush" << dendl;
+ }
+
+ /*
+ * Create a new sync point if there have been writes since the last
+ * one.
+ *
+ * We do not flush the caches below the RWL here.
+ */
+ flush_new_sync_point_if_needed(flush_req, post_unlock);
+ }
+
+ release_guarded_request(guard_ctx.cell);
+ });
+
+ detain_guarded_request(flush_req, guarded_ctx, true);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::writesame(uint64_t offset, uint64_t length,
+ bufferlist&& bl, int fadvise_flags,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << "aio_writesame" << dendl;
+
+ utime_t now = ceph_clock_now();
+ Extents ws_extents = {{offset, length}};
+ m_perfcounter->inc(l_librbd_pwl_ws, 1);
+ ceph_assert(m_initialized);
+
+ /* A write same request is also a write request. The key difference is the
+ * write same data buffer is shorter than the extent of the request. The full
+ * extent will be used in the block guard, and appear in
+ * m_blocks_to_log_entries_map. The data buffer allocated for the WS is only
+ * as long as the length of the bl here, which is the pattern that's repeated
+ * in the image for the entire length of this WS. Read hits and flushing of
+ * write sames are different than normal writes. */
+ C_WriteSameRequestT *ws_req =
+ m_builder->create_writesame_request(*this, now, std::move(ws_extents), std::move(bl),
+ fadvise_flags, m_lock, m_perfcounter, on_finish);
+ m_perfcounter->inc(l_librbd_pwl_ws_bytes, ws_req->image_extents_summary.total_bytes);
+
+ /* The lambda below will be called when the block guard for all
+ * blocks affected by this write is obtained */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, ws_req](GuardedRequestFunctionContext &guard_ctx) {
+ ws_req->blockguard_acquired(guard_ctx);
+ alloc_and_dispatch_io_req(ws_req);
+ });
+
+ detain_guarded_request(ws_req, guarded_ctx, false);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::compare_and_write(Extents &&image_extents,
+ bufferlist&& cmp_bl,
+ bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags,
+ Context *on_finish) {
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ utime_t now = ceph_clock_now();
+ m_perfcounter->inc(l_librbd_pwl_cmp, 1);
+ ceph_assert(m_initialized);
+
+ /* A compare and write request is also a write request. We only allocate
+ * resources and dispatch this write request if the compare phase
+ * succeeds. */
+ C_WriteRequestT *cw_req =
+ m_builder->create_comp_and_write_request(
+ *this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl),
+ mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish);
+ m_perfcounter->inc(l_librbd_pwl_cmp_bytes, cw_req->image_extents_summary.total_bytes);
+
+ /* The lambda below will be called when the block guard for all
+ * blocks affected by this write is obtained */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, cw_req](GuardedRequestFunctionContext &guard_ctx) {
+ cw_req->blockguard_acquired(guard_ctx);
+
+ auto read_complete_ctx = new LambdaContext(
+ [this, cw_req](int r) {
+ ldout(m_image_ctx.cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
+ << "cw_req=" << cw_req << dendl;
+
+ /* Compare read_bl to cmp_bl to determine if this will produce a write */
+ buffer::list aligned_read_bl;
+ if (cw_req->cmp_bl.length() < cw_req->read_bl.length()) {
+ aligned_read_bl.substr_of(cw_req->read_bl, 0, cw_req->cmp_bl.length());
+ }
+ if (cw_req->cmp_bl.contents_equal(cw_req->read_bl) ||
+ cw_req->cmp_bl.contents_equal(aligned_read_bl)) {
+ /* Compare phase succeeds. Begin write */
+ ldout(m_image_ctx.cct, 5) << " cw_req=" << cw_req << " compare matched" << dendl;
+ cw_req->compare_succeeded = true;
+ *cw_req->mismatch_offset = 0;
+ /* Continue with this request as a write. Blockguard release and
+ * user request completion handled as if this were a plain
+ * write. */
+ alloc_and_dispatch_io_req(cw_req);
+ } else {
+ /* Compare phase fails. Compare-and-write ends now. */
+ ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " compare failed" << dendl;
+ /* Bufferlist doesn't tell us where they differed, so we'll have to determine that here */
+ uint64_t bl_index = 0;
+ for (bl_index = 0; bl_index < cw_req->cmp_bl.length(); bl_index++) {
+ if (cw_req->cmp_bl[bl_index] != cw_req->read_bl[bl_index]) {
+ ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " mismatch at " << bl_index << dendl;
+ break;
+ }
+ }
+ cw_req->compare_succeeded = false;
+ *cw_req->mismatch_offset = bl_index;
+ cw_req->complete_user_request(-EILSEQ);
+ cw_req->release_cell();
+ cw_req->complete(0);
+ }
+ });
+
+ /* Read phase of comp-and-write must read through RWL */
+ Extents image_extents_copy = cw_req->image_extents;
+ read(std::move(image_extents_copy), &cw_req->read_bl, cw_req->fadvise_flags, read_complete_ctx);
+ });
+
+ detain_guarded_request(cw_req, guarded_ctx, false);
+}
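+
+/* [Editor's note] The mismatch-offset scan above, reduced to a sketch.
+ * first_mismatch is an illustrative name; ceph::bufferlist indexing is
+ * replaced by raw byte pointers here. Returns n when the buffers are
+ * equal over their first n bytes.
+ *
+ *   #include <cstdint>
+ *
+ *   uint64_t first_mismatch(const uint8_t *cmp, const uint8_t *read,
+ *                           uint64_t n) {
+ *     for (uint64_t i = 0; i < n; ++i) {
+ *       if (cmp[i] != read[i]) {
+ *         return i;               // first differing byte
+ *       }
+ *     }
+ *     return n;                   // equal: compare phase succeeds
+ *   }
+ */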
+
+template <typename I>
+void AbstractWriteLog<I>::flush(Context *on_finish) {
+ internal_flush(false, on_finish);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::invalidate(Context *on_finish) {
+ internal_flush(true, on_finish);
+}
+
+template <typename I>
+CephContext *AbstractWriteLog<I>::get_context() {
+ return m_image_ctx.cct;
+}
+
+template <typename I>
+BlockGuardCell* AbstractWriteLog<I>::detain_guarded_request_helper(GuardedRequest &req)
+{
+ CephContext *cct = m_image_ctx.cct;
+ BlockGuardCell *cell;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock));
+ ldout(cct, 20) << dendl;
+
+ int r = m_write_log_guard.detain(req.block_extent, &req, &cell);
+ ceph_assert(r >= 0);
+ if (r > 0) {
+ ldout(cct, 20) << "detaining guarded request due to in-flight requests: "
+ << "req=" << req << dendl;
+ return nullptr;
+ }
+
+ ldout(cct, 20) << "in-flight request cell: " << cell << dendl;
+ return cell;
+}
+
+template <typename I>
+BlockGuardCell* AbstractWriteLog<I>::detain_guarded_request_barrier_helper(
+ GuardedRequest &req)
+{
+ BlockGuardCell *cell = nullptr;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock));
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ if (m_barrier_in_progress) {
+ req.guard_ctx->state.queued = true;
+ m_awaiting_barrier.push_back(req);
+ } else {
+ bool barrier = req.guard_ctx->state.barrier;
+ if (barrier) {
+ m_barrier_in_progress = true;
+ req.guard_ctx->state.current_barrier = true;
+ }
+ cell = detain_guarded_request_helper(req);
+ if (barrier) {
+ /* Only non-null if the barrier acquires the guard now */
+ m_barrier_cell = cell;
+ }
+ }
+
+ return cell;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::detain_guarded_request(
+ C_BlockIORequestT *request,
+ GuardedRequestFunctionContext *guarded_ctx,
+ bool is_barrier)
+{
+ BlockExtent extent;
+ if (request) {
+ extent = request->image_extents_summary.block_extent();
+ } else {
+ extent = block_extent(whole_volume_extent());
+ }
+ auto req = GuardedRequest(extent, guarded_ctx, is_barrier);
+ BlockGuardCell *cell = nullptr;
+
+ ldout(m_image_ctx.cct, 20) << dendl;
+ {
+ std::lock_guard locker(m_blockguard_lock);
+ cell = detain_guarded_request_barrier_helper(req);
+ }
+ if (cell) {
+ req.guard_ctx->cell = cell;
+ req.guard_ctx->complete(0);
+ }
+}
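+
+/* [Editor's note] A toy model of the barrier behaviour implemented by the
+ * two helpers above, assuming a single lock and ignoring the per-extent
+ * overlap detention the real block guard also performs. BarrierGuard and
+ * its members are illustrative names.
+ *
+ *   #include <deque>
+ *   #include <functional>
+ *   #include <utility>
+ *
+ *   struct BarrierGuard {
+ *     bool barrier_in_progress = false;
+ *     std::deque<std::function<void()>> awaiting;
+ *
+ *     // While a barrier is in flight, every later request queues behind
+ *     // it, whether or not it is itself a barrier.
+ *     void submit(bool is_barrier, std::function<void()> fn) {
+ *       if (barrier_in_progress) {
+ *         awaiting.push_back(std::move(fn));
+ *         return;
+ *       }
+ *       if (is_barrier) {
+ *         barrier_in_progress = true;
+ *       }
+ *       fn();                     // dispatch immediately
+ *     }
+ *
+ *     // When the barrier completes, drain the queue (the real code stops
+ *     // draining again if a queued request is itself a barrier).
+ *     void barrier_done() {
+ *       barrier_in_progress = false;
+ *       while (!awaiting.empty()) {
+ *         auto fn = std::move(awaiting.front());
+ *         awaiting.pop_front();
+ *         fn();
+ *       }
+ *     }
+ *   };
+ */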
+
+template <typename I>
+void AbstractWriteLog<I>::release_guarded_request(BlockGuardCell *released_cell)
+{
+ CephContext *cct = m_image_ctx.cct;
+ WriteLogGuard::BlockOperations block_reqs;
+ ldout(cct, 20) << "released_cell=" << released_cell << dendl;
+
+ {
+ std::lock_guard locker(m_blockguard_lock);
+ m_write_log_guard.release(released_cell, &block_reqs);
+
+ for (auto &req : block_reqs) {
+ req.guard_ctx->state.detained = true;
+ BlockGuardCell *detained_cell = detain_guarded_request_helper(req);
+ if (detained_cell) {
+ if (req.guard_ctx->state.current_barrier) {
+ /* The current barrier is acquiring the block guard, so now we know its cell */
+ m_barrier_cell = detained_cell;
+ /* detained_cell could be == released_cell here */
+ ldout(cct, 20) << "current barrier cell=" << detained_cell << " req=" << req << dendl;
+ }
+ req.guard_ctx->cell = detained_cell;
+ m_work_queue.queue(req.guard_ctx);
+ }
+ }
+
+ if (m_barrier_in_progress && (released_cell == m_barrier_cell)) {
+ ldout(cct, 20) << "current barrier released cell=" << released_cell << dendl;
+ /* The released cell is the current barrier request */
+ m_barrier_in_progress = false;
+ m_barrier_cell = nullptr;
+ /* Move waiting requests into the blockguard. Stop if there's another barrier */
+ while (!m_barrier_in_progress && !m_awaiting_barrier.empty()) {
+ auto &req = m_awaiting_barrier.front();
+ ldout(cct, 20) << "submitting queued request to blockguard: " << req << dendl;
+ BlockGuardCell *detained_cell = detain_guarded_request_barrier_helper(req);
+ if (detained_cell) {
+ req.guard_ctx->cell = detained_cell;
+ m_work_queue.queue(req.guard_ctx);
+ }
+ m_awaiting_barrier.pop_front();
+ }
+ }
+ }
+
+ ldout(cct, 20) << "exit" << dendl;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::append_scheduled(GenericLogOperations &ops, bool &ops_remain,
+ bool &appending, bool isRWL)
+{
+ /* Upper bound on the number of ops drained into one batch */
+ const unsigned long int MAX_OPS_TO_APPEND = isRWL ? MAX_ALLOC_PER_TRANSACTION
+ : MAX_WRITES_PER_SYNC_POINT;
+ {
+ std::lock_guard locker(m_lock);
+ if (!appending && m_appending) {
+ /* Another thread is appending */
+ ldout(m_image_ctx.cct, 15) << "Another thread is appending" << dendl;
+ return;
+ }
+ if (m_ops_to_append.size()) {
+ appending = true;
+ m_appending = true;
+ auto last_in_batch = m_ops_to_append.begin();
+ unsigned int ops_to_append = m_ops_to_append.size();
+ if (ops_to_append > MAX_OPS_TO_APPEND) {
+ ops_to_append = MAX_OPS_TO_APPEND;
+ }
+ std::advance(last_in_batch, ops_to_append);
+ ops.splice(ops.end(), m_ops_to_append, m_ops_to_append.begin(), last_in_batch);
+ ops_remain = true; /* Always check again before leaving */
+ ldout(m_image_ctx.cct, 20) << "appending " << ops.size() << ", "
+ << m_ops_to_append.size() << " remain" << dendl;
+ } else if (isRWL) {
+ ops_remain = false;
+ if (appending) {
+ appending = false;
+ m_appending = false;
+ }
+ }
+ }
+}
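+
+/* [Editor's note] The batch-draining idiom above, isolated: move at most
+ * batch_max items from src to the tail of dst without copying elements,
+ * via std::list::splice. take_batch is an illustrative name.
+ *
+ *   #include <algorithm>
+ *   #include <cstddef>
+ *   #include <iterator>
+ *   #include <list>
+ *
+ *   template <typename T>
+ *   std::size_t take_batch(std::list<T> &dst, std::list<T> &src,
+ *                          std::size_t batch_max) {
+ *     std::size_t n = std::min(src.size(), batch_max);
+ *     auto last = src.begin();
+ *     std::advance(last, n);      // one past the last item taken
+ *     dst.splice(dst.end(), src, src.begin(), last);
+ *     return n;
+ *   }
+ */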
+
+template <typename I>
+void AbstractWriteLog<I>::schedule_append(GenericLogOperationsVector &ops, C_BlockIORequestT *req)
+{
+ GenericLogOperations to_append(ops.begin(), ops.end());
+
+ schedule_append_ops(to_append, req);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::schedule_append(GenericLogOperationSharedPtr op, C_BlockIORequestT *req)
+{
+ GenericLogOperations to_append { op };
+
+ schedule_append_ops(to_append, req);
+}
+
+/*
+ * Complete a set of write ops with the result of append_op_entries.
+ */
+template <typename I>
+void AbstractWriteLog<I>::complete_op_log_entries(GenericLogOperations &&ops,
+ const int result)
+{
+ GenericLogEntries dirty_entries;
+ int published_reserves = 0;
+ ldout(m_image_ctx.cct, 20) << __func__ << ": completing" << dendl;
+ for (auto &op : ops) {
+ utime_t now = ceph_clock_now();
+ auto log_entry = op->get_log_entry();
+ log_entry->completed = true;
+ if (op->is_writing_op()) {
+ op->mark_log_entry_completed();
+ dirty_entries.push_back(log_entry);
+ }
+ if (log_entry->is_write_entry()) {
+ release_ram(log_entry);
+ }
+ if (op->reserved_allocated()) {
+ published_reserves++;
+ }
+ {
+ std::lock_guard locker(m_lock);
+ m_unpublished_reserves -= published_reserves;
+ /* Reset the running count so it isn't subtracted again next iteration */
+ published_reserves = 0;
+ m_dirty_log_entries.splice(m_dirty_log_entries.end(), dirty_entries);
+ }
+ op->complete(result);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_app_t,
+ op->log_append_start_time - op->dispatch_time);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_cmp_t, now - op->dispatch_time);
+ m_perfcounter->hinc(l_librbd_pwl_log_op_dis_to_cmp_t_hist,
+ utime_t(now - op->dispatch_time).to_nsec(),
+ log_entry->ram_entry.write_bytes);
+ utime_t app_lat = op->log_append_comp_time - op->log_append_start_time;
+ m_perfcounter->tinc(l_librbd_pwl_log_op_app_to_appc_t, app_lat);
+ m_perfcounter->hinc(l_librbd_pwl_log_op_app_to_appc_t_hist, app_lat.to_nsec(),
+ log_entry->ram_entry.write_bytes);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_app_to_cmp_t, now - op->log_append_start_time);
+ }
+ // New entries may be flushable
+ {
+ std::lock_guard locker(m_lock);
+ wake_up();
+ }
+}
+
+/**
+ * Dispatch as many deferred writes as possible
+ */
+template <typename I>
+void AbstractWriteLog<I>::dispatch_deferred_writes(void)
+{
+ C_BlockIORequestT *front_req = nullptr; /* req still on front of deferred list */
+ C_BlockIORequestT *allocated_req = nullptr; /* req that was allocated, and is now off the list */
+ bool allocated = false; /* front_req allocate succeeded */
+ bool cleared_dispatching_flag = false;
+
+ /* If we can't become the dispatcher, we'll exit */
+ {
+ std::lock_guard locker(m_lock);
+ if (m_dispatching_deferred_ops ||
+ !m_deferred_ios.size()) {
+ return;
+ }
+ m_dispatching_deferred_ops = true;
+ }
+
+ /* There are ops to dispatch, and this should be the only thread dispatching them */
+ {
+ std::lock_guard deferred_dispatch(m_deferred_dispatch_lock);
+ do {
+ {
+ std::lock_guard locker(m_lock);
+ ceph_assert(m_dispatching_deferred_ops);
+ if (allocated) {
+ /* On the 2..n-1 th time we get lock, front_req->alloc_resources() will
+ * have succeeded, and we'll need to pop it off the deferred ops list
+ * here. */
+ ceph_assert(front_req);
+ ceph_assert(!allocated_req);
+ m_deferred_ios.pop_front();
+ allocated_req = front_req;
+ front_req = nullptr;
+ allocated = false;
+ }
+ ceph_assert(!allocated);
+ if (!allocated && front_req) {
+ /* front_req->alloc_resources() failed on the last iteration.
+ * We'll stop dispatching. */
+ wake_up();
+ front_req = nullptr;
+ ceph_assert(!cleared_dispatching_flag);
+ m_dispatching_deferred_ops = false;
+ cleared_dispatching_flag = true;
+ } else {
+ ceph_assert(!front_req);
+ if (m_deferred_ios.size()) {
+ /* New allocation candidate */
+ front_req = m_deferred_ios.front();
+ } else {
+ ceph_assert(!cleared_dispatching_flag);
+ m_dispatching_deferred_ops = false;
+ cleared_dispatching_flag = true;
+ }
+ }
+ }
+ /* Try allocating for front_req before we decide what to do with allocated_req
+ * (if any) */
+ if (front_req) {
+ ceph_assert(!cleared_dispatching_flag);
+ allocated = front_req->alloc_resources();
+ }
+ if (allocated_req && front_req && allocated) {
+ /* Push dispatch of the first allocated req to a wq */
+ m_work_queue.queue(new LambdaContext(
+ [allocated_req](int r) {
+ allocated_req->dispatch();
+ }), 0);
+ allocated_req = nullptr;
+ }
+ ceph_assert(!(allocated_req && front_req && allocated));
+
+ /* Continue while we're still considering the front of the deferred ops list */
+ } while (front_req);
+ ceph_assert(!allocated);
+ }
+ ceph_assert(cleared_dispatching_flag);
+
+ /* If any deferred requests were allocated, the last one will still be in allocated_req */
+ if (allocated_req) {
+ allocated_req->dispatch();
+ }
+}
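+
+/* [Editor's note] The "become the single dispatcher" pattern used above,
+ * as a sketch: many threads may poke the queue, but the dispatching flag
+ * guarantees only one drains it at a time, and the work runs outside the
+ * lock. The real function additionally stops when alloc_resources() fails
+ * and pushes dispatch of an allocated request to a work queue.
+ * SingleDispatcher is an illustrative name; Item must be
+ * default-constructible in this sketch.
+ *
+ *   #include <mutex>
+ *   #include <queue>
+ *   #include <utility>
+ *
+ *   template <typename Item, typename Fn>
+ *   class SingleDispatcher {
+ *     std::mutex lock;
+ *     std::queue<Item> items;
+ *     bool dispatching = false;
+ *     Fn handle;
+ *   public:
+ *     explicit SingleDispatcher(Fn fn) : handle(std::move(fn)) {}
+ *
+ *     void push(Item item) {
+ *       {
+ *         std::lock_guard<std::mutex> g(lock);
+ *         items.push(std::move(item));
+ *       }
+ *       poke();
+ *     }
+ *
+ *     void poke() {
+ *       {
+ *         std::lock_guard<std::mutex> g(lock);
+ *         if (dispatching || items.empty()) {
+ *           return;               // someone else is (or will be) draining
+ *         }
+ *         dispatching = true;
+ *       }
+ *       for (;;) {
+ *         Item item;
+ *         {
+ *           std::lock_guard<std::mutex> g(lock);
+ *           if (items.empty()) {
+ *             dispatching = false;
+ *             return;
+ *           }
+ *           item = std::move(items.front());
+ *           items.pop();
+ *         }
+ *         handle(item);           // run outside the lock
+ *       }
+ *     }
+ *   };
+ */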
+
+/**
+ * Releases the lanes used by this write, and attempts to dispatch the next
+ * deferred write.
+ */
+template <typename I>
+void AbstractWriteLog<I>::release_write_lanes(C_BlockIORequestT *req)
+{
+ {
+ std::lock_guard locker(m_lock);
+ m_free_lanes += req->image_extents.size();
+ }
+ dispatch_deferred_writes();
+}
+
+/**
+ * Attempts to allocate log resources for a write. Write is dispatched if
+ * resources are available, or queued if they aren't.
+ */
+template <typename I>
+void AbstractWriteLog<I>::alloc_and_dispatch_io_req(C_BlockIORequestT *req)
+{
+ bool dispatch_here = false;
+
+ {
+ /* If there are already deferred writes, queue behind them for resources */
+ {
+ std::lock_guard locker(m_lock);
+ dispatch_here = m_deferred_ios.empty();
+ // Only a flush request can have total_bytes == UINT64_MAX;
+ // always dispatch internal flush requests immediately
+ if (req->image_extents_summary.total_bytes ==
+ std::numeric_limits<uint64_t>::max() &&
+ static_cast<C_FlushRequestT *>(req)->internal) {
+ dispatch_here = true;
+ }
+ }
+ if (dispatch_here) {
+ dispatch_here = req->alloc_resources();
+ }
+ if (dispatch_here) {
+ ldout(m_image_ctx.cct, 20) << "dispatching" << dendl;
+ req->dispatch();
+ } else {
+ req->deferred();
+ {
+ std::lock_guard locker(m_lock);
+ m_deferred_ios.push_back(req);
+ }
+ ldout(m_image_ctx.cct, 20) << "deferred IOs: " << m_deferred_ios.size() << dendl;
+ dispatch_deferred_writes();
+ }
+ }
+}
+
+template <typename I>
+bool AbstractWriteLog<I>::check_allocation(
+ C_BlockIORequestT *req, uint64_t bytes_cached, uint64_t bytes_dirtied,
+ uint64_t bytes_allocated, uint32_t num_lanes, uint32_t num_log_entries,
+ uint32_t num_unpublished_reserves) {
+ bool alloc_succeeds = true;
+ bool no_space = false;
+ {
+ std::lock_guard locker(m_lock);
+ if (m_free_lanes < num_lanes) {
+ req->set_io_waited_for_lanes(true);
+ ldout(m_image_ctx.cct, 20) << "not enough free lanes (need "
+ << num_lanes
+ << ", have " << m_free_lanes << ") "
+ << *req << dendl;
+ alloc_succeeds = false;
+ /* This isn't considered a "no space" alloc fail. Lanes are a throttling mechanism. */
+ }
+ if (m_free_log_entries < num_log_entries) {
+ req->set_io_waited_for_entries(true);
+ ldout(m_image_ctx.cct, 20) << "not enough free entries (need "
+ << num_log_entries
+ << ", have " << m_free_log_entries << ") "
+ << *req << dendl;
+ alloc_succeeds = false;
+ no_space = true; /* Entries must be retired */
+ }
+ /* Don't attempt buffer allocate if we've exceeded the "full" threshold */
+ if (m_bytes_allocated + bytes_allocated > m_bytes_allocated_cap) {
+ if (!req->has_io_waited_for_buffers()) {
+ req->set_io_waited_for_buffers(true);
+ ldout(m_image_ctx.cct, 5) << "Waiting for allocation cap (cap="
+ << m_bytes_allocated_cap
+ << ", allocated=" << m_bytes_allocated
+ << ") in write [" << *req << "]" << dendl;
+ }
+ alloc_succeeds = false;
+ no_space = true; /* Entries must be retired */
+ }
+ }
+
+ if (alloc_succeeds) {
+ reserve_cache(req, alloc_succeeds, no_space);
+ }
+
+ if (alloc_succeeds) {
+ std::unique_lock locker(m_lock);
+ /* We need one free log entry per extent (each is a separate entry), and
+ * one free "lane" for remote replication. */
+ if ((m_free_lanes >= num_lanes) &&
+ (m_free_log_entries >= num_log_entries) &&
+ (m_bytes_allocated_cap >= m_bytes_allocated + bytes_allocated)) {
+ m_free_lanes -= num_lanes;
+ m_free_log_entries -= num_log_entries;
+ m_unpublished_reserves += num_unpublished_reserves;
+ m_bytes_allocated += bytes_allocated;
+ m_bytes_cached += bytes_cached;
+ m_bytes_dirty += bytes_dirtied;
+ if (req->has_io_waited_for_buffers()) {
+ req->set_io_waited_for_buffers(false);
+ }
+ if (m_cache_state->clean && bytes_dirtied > 0) {
+ m_cache_state->clean = false;
+ update_image_cache_state();
+ write_image_cache_state(locker);
+ }
+ } else {
+ alloc_succeeds = false;
+ }
+ }
+
+ if (!alloc_succeeds && no_space) {
+ /* Expedite flushing and/or retiring */
+ std::lock_guard locker(m_lock);
+ m_alloc_failed_since_retire = true;
+ m_last_alloc_fail = ceph_clock_now();
+ }
+
+ return alloc_succeeds;
+}
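+
+/* [Editor's note] The reservation scheme above, reduced to its core:
+ * check each resource optimistically, then re-check and commit in one
+ * critical section, because reserve_cache() runs between the two checks
+ * without m_lock held and the counters may have changed. Resources and
+ * try_reserve are illustrative names.
+ *
+ *   #include <cstdint>
+ *
+ *   struct Resources {
+ *     uint64_t free_lanes;
+ *     uint64_t free_entries;
+ *     uint64_t bytes_left;        // allocation cap minus bytes allocated
+ *   };
+ *
+ *   // Call with the lock held; on failure the caller defers the request
+ *   // and retries after entries are flushed and retired.
+ *   bool try_reserve(Resources &r, uint64_t lanes, uint64_t entries,
+ *                    uint64_t bytes) {
+ *     if (r.free_lanes < lanes || r.free_entries < entries ||
+ *         r.bytes_left < bytes) {
+ *       return false;
+ *     }
+ *     r.free_lanes -= lanes;
+ *     r.free_entries -= entries;
+ *     r.bytes_left -= bytes;
+ *     return true;
+ *   }
+ */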
+
+template <typename I>
+C_FlushRequest<AbstractWriteLog<I>>* AbstractWriteLog<I>::make_flush_req(Context *on_finish) {
+ utime_t flush_begins = ceph_clock_now();
+ bufferlist bl;
+ auto *flush_req =
+ new C_FlushRequestT(*this, flush_begins, Extents({whole_volume_extent()}),
+ std::move(bl), 0, m_lock, m_perfcounter, on_finish);
+
+ return flush_req;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::wake_up() {
+ CephContext *cct = m_image_ctx.cct;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ if (!m_wake_up_enabled) {
+ // wake_up is disabled during shutdown after flushing completes
+ ldout(m_image_ctx.cct, 6) << "deferred processing disabled" << dendl;
+ return;
+ }
+
+ if (m_wake_up_requested && m_wake_up_scheduled) {
+ return;
+ }
+
+ ldout(cct, 20) << dendl;
+
+ /* Wake-up can be requested while it's already scheduled */
+ m_wake_up_requested = true;
+
+ /* Wake-up cannot be scheduled if it's already scheduled */
+ if (m_wake_up_scheduled) {
+ return;
+ }
+ m_wake_up_scheduled = true;
+ m_async_process_work++;
+ m_async_op_tracker.start_op();
+ m_work_queue.queue(new LambdaContext(
+ [this](int r) {
+ process_work();
+ m_async_op_tracker.finish_op();
+ m_async_process_work--;
+ }), 0);
+}
+
+template <typename I>
+bool AbstractWriteLog<I>::can_flush_entry(std::shared_ptr<GenericLogEntry> log_entry) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << "" << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ if (m_invalidating) {
+ return true;
+ }
+
+ /* For OWB we can flush entries with the same sync gen number (writes
+ * between aio_flush() calls) concurrently. Here we'll consider an entry
+ * flushable if its sync gen number is <= the lowest sync gen number among
+ * the entries currently flushing.
+ *
+ * If the entry considered here bears a sync gen number lower than a
+ * previously flushed entry, the application had to have submitted the write
+ * bearing the higher gen number before the write with the lower gen number
+ * completed. So, flushing these concurrently is OK.
+ *
+ * If the entry considered here bears a sync gen number higher than a
+ * currently flushing entry, the write with the lower gen number may have
+ * completed to the application before the write with the higher sync gen
+ * number was submitted, and the application may rely on that completion
+ * order for volume consistency. In this case the entry will not be
+ * considered flushable until all the entries bearing lower sync gen numbers
+ * finish flushing.
+ */
+
+ if (m_flush_ops_in_flight &&
+ (log_entry->ram_entry.sync_gen_number > m_lowest_flushing_sync_gen)) {
+ return false;
+ }
+
+ return (log_entry->can_writeback() &&
+ (m_flush_ops_in_flight <= IN_FLIGHT_FLUSH_WRITE_LIMIT) &&
+ (m_flush_bytes_in_flight <= IN_FLIGHT_FLUSH_BYTES_LIMIT));
+}
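+
+/* [Editor's note] The ordering rule above as a sketch plus a worked
+ * example; flushable() is an illustrative name, and the throttling limits
+ * (ops/bytes in flight) are omitted.
+ *
+ *   #include <cstdint>
+ *
+ *   // An entry may join the in-flight flushes only if nothing with a
+ *   // lower sync gen number is still flushing.
+ *   bool flushable(uint64_t entry_sync_gen, int flush_ops_in_flight,
+ *                  uint64_t lowest_flushing_sync_gen) {
+ *     return flush_ops_in_flight == 0 ||
+ *            entry_sync_gen <= lowest_flushing_sync_gen;
+ *   }
+ *
+ * e.g. a gen-4 entry starts flushing (lowest = 4), then a gen-3 entry
+ * joins (3 <= 4, lowest drops to 3): another gen-3 entry may join now,
+ * but a further gen-4 entry must wait until the gen-3 flushes finish.
+ */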
+
+template <typename I>
+void AbstractWriteLog<I>::detain_flush_guard_request(std::shared_ptr<GenericLogEntry> log_entry,
+ GuardedRequestFunctionContext *guarded_ctx) {
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ BlockExtent extent;
+ if (log_entry->is_sync_point()) {
+ extent = block_extent(whole_volume_extent());
+ } else {
+ extent = log_entry->ram_entry.block_extent();
+ }
+
+ auto req = GuardedRequest(extent, guarded_ctx, false);
+ BlockGuardCell *cell = nullptr;
+
+ {
+ std::lock_guard locker(m_flush_guard_lock);
+ m_flush_guard.detain(req.block_extent, &req, &cell);
+ }
+ if (cell) {
+ req.guard_ctx->cell = cell;
+ m_image_ctx.op_work_queue->queue(req.guard_ctx, 0);
+ }
+}
+
+template <typename I>
+Context* AbstractWriteLog<I>::construct_flush_entry(std::shared_ptr<GenericLogEntry> log_entry,
+ bool invalidating) {
+ ldout(m_image_ctx.cct, 20) << "" << dendl;
+
+ /* Flush write completion action */
+ utime_t writeback_start_time = ceph_clock_now();
+ Context *ctx = new LambdaContext(
+ [this, log_entry, writeback_start_time, invalidating](int r) {
+ utime_t writeback_comp_time = ceph_clock_now();
+ m_perfcounter->tinc(l_librbd_pwl_writeback_latency,
+ writeback_comp_time - writeback_start_time);
+ {
+ std::lock_guard locker(m_lock);
+ if (r < 0) {
+ lderr(m_image_ctx.cct) << "failed to flush log entry"
+ << cpp_strerror(r) << dendl;
+ m_dirty_log_entries.push_front(log_entry);
+ } else {
+ ceph_assert(m_bytes_dirty >= log_entry->bytes_dirty());
+ log_entry->set_flushed(true);
+ m_bytes_dirty -= log_entry->bytes_dirty();
+ sync_point_writer_flushed(log_entry->get_sync_point_entry());
+ ldout(m_image_ctx.cct, 20) << "flushed: " << log_entry
+ << " invalidating=" << invalidating
+ << dendl;
+ }
+ m_flush_ops_in_flight -= 1;
+ m_flush_bytes_in_flight -= log_entry->ram_entry.write_bytes;
+ wake_up();
+ }
+ });
+ /* Flush through lower cache before completing */
+ ctx = new LambdaContext(
+ [this, ctx, log_entry](int r) {
+ {
+
+ WriteLogGuard::BlockOperations block_reqs;
+ BlockGuardCell *detained_cell = nullptr;
+
+ std::lock_guard locker{m_flush_guard_lock};
+ m_flush_guard.release(log_entry->m_cell, &block_reqs);
+
+ for (auto &req : block_reqs) {
+ m_flush_guard.detain(req.block_extent, &req, &detained_cell);
+ if (detained_cell) {
+ req.guard_ctx->cell = detained_cell;
+ m_image_ctx.op_work_queue->queue(req.guard_ctx, 0);
+ }
+ }
+ }
+
+ if (r < 0) {
+ lderr(m_image_ctx.cct) << "failed to flush log entry"
+ << cpp_strerror(r) << dendl;
+ ctx->complete(r);
+ } else {
+ m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, ctx);
+ }
+ });
+ return ctx;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::process_writeback_dirty_entries() {
+ CephContext *cct = m_image_ctx.cct;
+ bool all_clean = false;
+ int flushed = 0;
+ bool has_write_entry = false;
+ bool need_update_state = false;
+
+ ldout(cct, 20) << "Look for dirty entries" << dendl;
+ {
+ DeferredContexts post_unlock;
+ GenericLogEntries entries_to_flush;
+
+ std::shared_lock entry_reader_locker(m_entry_reader_lock);
+ std::lock_guard locker(m_lock);
+ while (flushed < IN_FLIGHT_FLUSH_WRITE_LIMIT) {
+ if (m_shutting_down) {
+ ldout(cct, 5) << "Flush during shutdown supressed" << dendl;
+ /* Do flush complete only when all flush ops are finished */
+ all_clean = !m_flush_ops_in_flight;
+ break;
+ }
+ if (m_dirty_log_entries.empty()) {
+ ldout(cct, 20) << "Nothing new to flush" << dendl;
+ /* Do flush complete only when all flush ops are finished */
+ all_clean = !m_flush_ops_in_flight;
+ if (!m_cache_state->clean && all_clean) {
+ m_cache_state->clean = true;
+ update_image_cache_state();
+ need_update_state = true;
+ }
+ break;
+ }
+
+ auto candidate = m_dirty_log_entries.front();
+ bool flushable = can_flush_entry(candidate);
+ if (flushable) {
+ entries_to_flush.push_back(candidate);
+ flushed++;
+ if (!has_write_entry)
+ has_write_entry = candidate->is_write_entry();
+ m_dirty_log_entries.pop_front();
+
+ // Account for this entry now so can_flush_entry() sees it in flight
+ {
+ if (!m_flush_ops_in_flight ||
+ (candidate->ram_entry.sync_gen_number < m_lowest_flushing_sync_gen)) {
+ m_lowest_flushing_sync_gen = candidate->ram_entry.sync_gen_number;
+ }
+ m_flush_ops_in_flight += 1;
+ /* For write same this is the bytes affected by the flush op, not the bytes transferred */
+ m_flush_bytes_in_flight += candidate->ram_entry.write_bytes;
+ }
+ } else {
+ ldout(cct, 20) << "Next dirty entry isn't flushable yet" << dendl;
+ break;
+ }
+ }
+
+ construct_flush_entries(entries_to_flush, post_unlock, has_write_entry);
+ }
+ if (need_update_state) {
+ std::unique_lock locker(m_lock);
+ write_image_cache_state(locker);
+ }
+
+ if (all_clean) {
+ /* All flushing complete, drain outside lock */
+ Contexts flush_contexts;
+ {
+ std::lock_guard locker(m_lock);
+ flush_contexts.swap(m_flush_complete_contexts);
+ }
+ finish_contexts(m_image_ctx.cct, flush_contexts, 0);
+ }
+}
+
+/* Returns true if the specified SyncPointLogEntry is considered flushed, in
+ * which case the log is updated to reflect this. */
+template <typename I>
+bool AbstractWriteLog<I>::handle_flushed_sync_point(std::shared_ptr<SyncPointLogEntry> log_entry)
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ ceph_assert(log_entry);
+
+ if ((log_entry->writes_flushed == log_entry->writes) &&
+ log_entry->completed && log_entry->prior_sync_point_flushed &&
+ log_entry->next_sync_point_entry) {
+ ldout(m_image_ctx.cct, 20) << "All writes flushed up to sync point="
+ << *log_entry << dendl;
+ log_entry->next_sync_point_entry->prior_sync_point_flushed = true;
+ /* Don't move the flushed sync gen num backwards. */
+ if (m_flushed_sync_gen < log_entry->ram_entry.sync_gen_number) {
+ m_flushed_sync_gen = log_entry->ram_entry.sync_gen_number;
+ }
+ m_async_op_tracker.start_op();
+ m_work_queue.queue(new LambdaContext(
+ [this, next = std::move(log_entry->next_sync_point_entry)](int r) {
+ bool handled_by_next;
+ {
+ std::lock_guard locker(m_lock);
+ handled_by_next = handle_flushed_sync_point(std::move(next));
+ }
+ if (!handled_by_next) {
+ persist_last_flushed_sync_gen();
+ }
+ m_async_op_tracker.finish_op();
+ }));
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::sync_point_writer_flushed(std::shared_ptr<SyncPointLogEntry> log_entry)
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ ceph_assert(log_entry);
+ log_entry->writes_flushed++;
+
+ /* If this entry might be completely flushed, look closer */
+ if ((log_entry->writes_flushed == log_entry->writes) && log_entry->completed) {
+ ldout(m_image_ctx.cct, 15) << "All writes flushed for sync point="
+ << *log_entry << dendl;
+ handle_flushed_sync_point(log_entry);
+ }
+}
+
+/* During initialization, make a new sync point and flush the previous one,
+ * if any (a re-opened log may or may not have one) */
+template <typename I>
+void AbstractWriteLog<I>::init_flush_new_sync_point(DeferredContexts &later) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ ceph_assert(!m_initialized); /* Don't use this after init */
+
+ if (!m_current_sync_point) {
+ /* First sync point since start */
+ new_sync_point(later);
+ } else {
+ flush_new_sync_point(nullptr, later);
+ }
+}
+
+/**
+ * Begin a new sync point
+ */
+template <typename I>
+void AbstractWriteLog<I>::new_sync_point(DeferredContexts &later) {
+ CephContext *cct = m_image_ctx.cct;
+ std::shared_ptr<SyncPoint> old_sync_point = m_current_sync_point;
+ std::shared_ptr<SyncPoint> new_sync_point;
+ ldout(cct, 20) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ /* The first time this is called, if this is a newly created log,
+ * this makes the first sync gen number we'll use 1. On the first
+ * call for a re-opened log m_current_sync_gen will be the highest
+ * gen number from all the sync point entries found in the re-opened
+ * log, and this advances to the next sync gen number. */
+ ++m_current_sync_gen;
+
+ new_sync_point = std::make_shared<SyncPoint>(m_current_sync_gen, cct);
+ m_current_sync_point = new_sync_point;
+
+ /* If this log has been re-opened, old_sync_point will initially be
+ * nullptr, but m_current_sync_gen may not be zero. */
+ if (old_sync_point) {
+ new_sync_point->setup_earlier_sync_point(old_sync_point, m_last_op_sequence_num);
+ m_perfcounter->hinc(l_librbd_pwl_syncpoint_hist,
+ old_sync_point->log_entry->writes,
+ old_sync_point->log_entry->bytes);
+ /* This sync point will acquire no more sub-ops. Activation needs
+ * to acquire m_lock, so defer it to 'later'. */
+ later.add(new LambdaContext(
+ [old_sync_point](int r) {
+ old_sync_point->prior_persisted_gather_activate();
+ }));
+ }
+
+ new_sync_point->prior_persisted_gather_set_finisher();
+
+ if (old_sync_point) {
+ ldout(cct,6) << "new sync point = [" << *m_current_sync_point
+ << "], prior = [" << *old_sync_point << "]" << dendl;
+ } else {
+ ldout(cct,6) << "first sync point = [" << *m_current_sync_point
+ << "]" << dendl;
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::flush_new_sync_point(C_FlushRequestT *flush_req,
+ DeferredContexts &later) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ if (!flush_req) {
+ m_async_null_flush_finish++;
+ m_async_op_tracker.start_op();
+ Context *flush_ctx = new LambdaContext([this](int r) {
+ m_async_null_flush_finish--;
+ m_async_op_tracker.finish_op();
+ });
+ flush_req = make_flush_req(flush_ctx);
+ flush_req->internal = true;
+ }
+
+ /* Add a new sync point. */
+ new_sync_point(later);
+ std::shared_ptr<SyncPoint> to_append = m_current_sync_point->earlier_sync_point;
+ ceph_assert(to_append);
+
+ /* This flush request will append/persist the (now) previous sync point */
+ flush_req->to_append = to_append;
+
+ /* When the m_sync_point_persist Gather completes this sync point can be
+ * appended. The only sub for this Gather is the finisher Context for
+ * m_prior_log_entries_persisted, which records the result of the Gather in
+ * the sync point, and completes. TODO: Do we still need both of these
+ * Gathers?*/
+ Context * ctx = new LambdaContext([this, flush_req](int r) {
+ ldout(m_image_ctx.cct, 20) << "Flush req=" << flush_req
+ << " sync point =" << flush_req->to_append
+ << ". Ready to persist." << dendl;
+ alloc_and_dispatch_io_req(flush_req);
+ });
+ to_append->persist_gather_set_finisher(ctx);
+
+ /* The m_sync_point_persist Gather has all the subs it will ever have, and
+ * now has its finisher. If the sub is already complete, activation will
+ * complete the Gather. The finisher will acquire m_lock, so we'll activate
+ * this when we release m_lock.*/
+ later.add(new LambdaContext([to_append](int r) {
+ to_append->persist_gather_activate();
+ }));
+
+ /* The flush request completes when the sync point persists */
+ to_append->add_in_on_persisted_ctxs(flush_req);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::flush_new_sync_point_if_needed(C_FlushRequestT *flush_req,
+ DeferredContexts &later) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ /* If there have been writes since the last sync point ... */
+ if (m_current_sync_point->log_entry->writes) {
+ flush_new_sync_point(flush_req, later);
+ } else {
+ /* There have been no writes to the current sync point. */
+ if (m_current_sync_point->earlier_sync_point) {
+ /* If previous sync point hasn't completed, complete this flush
+ * with the earlier sync point. No alloc or dispatch needed. */
+ m_current_sync_point->earlier_sync_point->on_sync_point_persisted.push_back(flush_req);
+ } else {
+ /* The previous sync point has already completed and been
+ * appended. The current sync point has no writes, so this flush
+ * has nothing to wait for. This flush completes now. */
+ later.add(flush_req);
+ }
+ }
+}
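+
+/* [Editor's note] The three-way decision above, as a sketch; the enum and
+ * function names are illustrative.
+ *
+ *   enum class FlushPlan {
+ *     NewSyncPoint,   // writes since the last sync point: append a new one
+ *     WaitEarlier,    // no writes, earlier sync point still persisting
+ *     CompleteNow     // no writes, nothing pending: complete immediately
+ *   };
+ *
+ *   FlushPlan plan_flush(bool writes_since_last_sync_point,
+ *                        bool earlier_sync_point_pending) {
+ *     if (writes_since_last_sync_point) {
+ *       return FlushPlan::NewSyncPoint;
+ *     }
+ *     return earlier_sync_point_pending ? FlushPlan::WaitEarlier
+ *                                       : FlushPlan::CompleteNow;
+ *   }
+ */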
+
+/*
+ * RWL internal flush - will actually flush the RWL.
+ *
+ * User flushes should arrive at aio_flush(), and only flush prior
+ * writes to all log replicas.
+ *
+ * Librbd internal flushes will arrive at internal_flush(invalidate=false),
+ * and traverse the block guard to ensure in-flight writes are
+ * flushed.
+ */
+template <typename I>
+void AbstractWriteLog<I>::flush_dirty_entries(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ bool all_clean;
+ bool flushing;
+ bool stop_flushing;
+
+ {
+ std::unique_lock locker(m_lock);
+ flushing = (0 != m_flush_ops_in_flight);
+ all_clean = m_dirty_log_entries.empty();
+ stop_flushing = (m_shutting_down);
+ if (!m_cache_state->clean && all_clean && !flushing) {
+ m_cache_state->clean = true;
+ update_image_cache_state();
+ write_image_cache_state(locker);
+ }
+ }
+
+ if (!flushing && (all_clean || stop_flushing)) {
+ /* Complete without holding m_lock */
+ if (all_clean) {
+ ldout(cct, 20) << "no dirty entries" << dendl;
+ } else {
+ ldout(cct, 5) << "flush during shutdown suppressed" << dendl;
+ }
+ on_finish->complete(0);
+ } else {
+ if (all_clean) {
+ ldout(cct, 5) << "flush ops still in progress" << dendl;
+ } else {
+ ldout(cct, 20) << "dirty entries remain" << dendl;
+ }
+ std::lock_guard locker(m_lock);
+ /* on_finish can't be completed yet */
+ m_flush_complete_contexts.push_back(new LambdaContext(
+ [this, on_finish](int r) {
+ flush_dirty_entries(on_finish);
+ }));
+ wake_up();
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::internal_flush(bool invalidate, Context *on_finish) {
+ ldout(m_image_ctx.cct, 20) << "invalidate=" << invalidate << dendl;
+
+ if (m_perfcounter) {
+ if (invalidate) {
+ m_perfcounter->inc(l_librbd_pwl_invalidate_cache, 1);
+ } else {
+ m_perfcounter->inc(l_librbd_pwl_internal_flush, 1);
+ }
+ }
+
+ /* May be called even if initialization fails */
+ if (!m_initialized) {
+ ldout(m_image_ctx.cct, 5) << "never initialized" << dendl;
+ /* Deadlock if completed here */
+ m_image_ctx.op_work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ /* Flush/invalidate must pass through the block guard to ensure all layers
+ * of the cache are consistently flushed/invalidated. This ensures no
+ * in-flight write leaves data valid in only some layers, which could later
+ * produce inconsistent read results. */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext(
+ [this, on_finish, invalidate](GuardedRequestFunctionContext &guard_ctx) {
+ DeferredContexts on_exit;
+ ldout(m_image_ctx.cct, 20) << "cell=" << guard_ctx.cell << dendl;
+ ceph_assert(guard_ctx.cell);
+
+ Context *ctx = new LambdaContext(
+ [this, cell=guard_ctx.cell, invalidate, on_finish](int r) {
+ std::lock_guard locker(m_lock);
+ m_invalidating = false;
+ ldout(m_image_ctx.cct, 6) << "Done flush/invalidating (invalidate="
+ << invalidate << ")" << dendl;
+ if (m_log_entries.size()) {
+ ldout(m_image_ctx.cct, 1) << "m_log_entries.size()="
+ << m_log_entries.size() << ", "
+ << "front()=" << *m_log_entries.front()
+ << dendl;
+ }
+ if (invalidate) {
+ ceph_assert(m_log_entries.size() == 0);
+ }
+ ceph_assert(m_dirty_log_entries.size() == 0);
+ m_image_ctx.op_work_queue->queue(on_finish, r);
+ release_guarded_request(cell);
+ });
+ ctx = new LambdaContext(
+ [this, ctx, invalidate](int r) {
+ Context *next_ctx = ctx;
+ ldout(m_image_ctx.cct, 6) << "flush_dirty_entries finished" << dendl;
+ if (r < 0) {
+ /* Override on_finish status with this error */
+ next_ctx = new LambdaContext([r, ctx](int _r) {
+ ctx->complete(r);
+ });
+ }
+ if (invalidate) {
+ {
+ std::lock_guard locker(m_lock);
+ ceph_assert(m_dirty_log_entries.size() == 0);
+ ceph_assert(!m_invalidating);
+ ldout(m_image_ctx.cct, 6) << "Invalidating" << dendl;
+ m_invalidating = true;
+ }
+ /* Discards all RWL entries */
+ while (retire_entries(MAX_ALLOC_PER_TRANSACTION)) { }
+ next_ctx->complete(0);
+ } else {
+ {
+ std::lock_guard locker(m_lock);
+ ceph_assert(m_dirty_log_entries.size() == 0);
+ ceph_assert(!m_invalidating);
+ }
+ m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, next_ctx);
+ }
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ flush_dirty_entries(ctx);
+ });
+ std::lock_guard locker(m_lock);
+ /* Even if we're throwing everything away, we want the last entry to
+ * be a sync point so we can cleanly resume.
+ *
+ * Also, the blockguard only guarantees the replication of this op
+ * can't overlap with prior ops. It doesn't guarantee those are all
+ * completed and eligible for flush & retire, which we require here.
+ */
+ auto flush_req = make_flush_req(ctx);
+ flush_new_sync_point_if_needed(flush_req, on_exit);
+ });
+ detain_guarded_request(nullptr, guarded_ctx, true);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::add_into_log_map(GenericWriteLogEntries &log_entries,
+ C_BlockIORequestT *req) {
+ req->copy_cache();
+ m_blocks_to_log_entries.add_log_entries(log_entries);
+}
+
+template <typename I>
+bool AbstractWriteLog<I>::can_retire_entry(std::shared_ptr<GenericLogEntry> log_entry) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ return log_entry->can_retire();
+}
+
+template <typename I>
+void AbstractWriteLog<I>::check_image_cache_state_clean() {
+ ceph_assert(m_deferred_ios.empty());
+ ceph_assert(m_ops_to_append.empty());
+ ceph_assert(m_async_flush_ops == 0);
+ ceph_assert(m_async_append_ops == 0);
+ ceph_assert(m_dirty_log_entries.empty());
+ ceph_assert(m_ops_to_flush.empty());
+ ceph_assert(m_flush_ops_in_flight == 0);
+ ceph_assert(m_flush_bytes_in_flight == 0);
+ ceph_assert(m_bytes_dirty == 0);
+ ceph_assert(m_work_queue.empty());
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/AbstractWriteLog.h b/src/librbd/cache/pwl/AbstractWriteLog.h
new file mode 100644
index 000000000..ffe299c37
--- /dev/null
+++ b/src/librbd/cache/pwl/AbstractWriteLog.h
@@ -0,0 +1,410 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
+
+#include "common/Timer.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/Utils.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/ReadRequest.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/Builder.h"
+#include <functional>
+#include <list>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace plugin { template <typename> struct Api; }
+
+namespace cache {
+namespace pwl {
+
+class GenericLogEntry;
+class GenericWriteLogEntry;
+class SyncPointLogEntry;
+class WriteLogEntry;
+struct WriteLogCacheEntry;
+
+typedef std::list<std::shared_ptr<WriteLogEntry>> WriteLogEntries;
+typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
+typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
+typedef std::vector<std::shared_ptr<GenericLogEntry>> GenericLogEntriesVector;
+
+typedef LogMapEntries<GenericWriteLogEntry> WriteLogMapEntries;
+typedef LogMap<GenericWriteLogEntry> WriteLogMap;
+
+/**** Write log entries end ****/
+
+typedef librbd::BlockGuard<GuardedRequest> WriteLogGuard;
+
+class DeferredContexts;
+template <typename>
+class ImageCacheState;
+
+template<typename T>
+class Builder;
+
+template <typename T>
+struct C_BlockIORequest;
+
+template <typename T>
+struct C_WriteRequest;
+
+using GenericLogOperations = std::list<GenericLogOperationSharedPtr>;
+
+
+template <typename ImageCtxT>
+class AbstractWriteLog {
+public:
+ typedef io::Extent Extent;
+ typedef io::Extents Extents;
+ using This = AbstractWriteLog<ImageCtxT>;
+ Builder<This> *m_builder;
+
+ AbstractWriteLog(ImageCtxT &image_ctx,
+ librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+ Builder<This> *builder,
+ cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<ImageCtxT>& plugin_api);
+ virtual ~AbstractWriteLog();
+ AbstractWriteLog(const AbstractWriteLog&) = delete;
+ AbstractWriteLog &operator=(const AbstractWriteLog&) = delete;
+
+ /// IO methods
+ void read(
+ Extents&& image_extents, ceph::bufferlist *bl,
+ int fadvise_flags, Context *on_finish);
+ void write(
+ Extents&& image_extents, ceph::bufferlist&& bl,
+ int fadvise_flags,
+ Context *on_finish);
+ void discard(
+ uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes,
+ Context *on_finish);
+ void flush(
+ io::FlushSource flush_source, Context *on_finish);
+ void writesame(
+ uint64_t offset, uint64_t length,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish);
+ void compare_and_write(
+ Extents&& image_extents,
+ ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,int fadvise_flags,
+ Context *on_finish);
+
+ /// internal state methods
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+ void invalidate(Context *on_finish);
+ void flush(Context *on_finish);
+
+ using C_WriteRequestT = pwl::C_WriteRequest<This>;
+ using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+ using C_FlushRequestT = pwl::C_FlushRequest<This>;
+ using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
+ using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+
+ CephContext * get_context();
+ void release_guarded_request(BlockGuardCell *cell);
+ void release_write_lanes(C_BlockIORequestT *req);
+ virtual bool alloc_resources(C_BlockIORequestT *req) = 0;
+ virtual void setup_schedule_append(
+ pwl::GenericLogOperationsVector &ops, bool do_early_flush,
+ C_BlockIORequestT *req) = 0;
+ void schedule_append(pwl::GenericLogOperationsVector &ops, C_BlockIORequestT *req = nullptr);
+ void schedule_append(pwl::GenericLogOperationSharedPtr op, C_BlockIORequestT *req = nullptr);
+ void flush_new_sync_point(C_FlushRequestT *flush_req,
+ pwl::DeferredContexts &later);
+
+ std::shared_ptr<pwl::SyncPoint> get_current_sync_point() {
+ return m_current_sync_point;
+ }
+ bool get_persist_on_flush() {
+ return m_persist_on_flush;
+ }
+ void inc_last_op_sequence_num() {
+ m_perfcounter->inc(l_librbd_pwl_log_ops, 1);
+ ++m_last_op_sequence_num;
+ }
+ uint64_t get_last_op_sequence_num() {
+ return m_last_op_sequence_num;
+ }
+ uint64_t get_current_sync_gen() {
+ return m_current_sync_gen;
+ }
+ unsigned int get_free_lanes() {
+ return m_free_lanes;
+ }
+ uint32_t get_free_log_entries() {
+ return m_free_log_entries;
+ }
+ void add_into_log_map(pwl::GenericWriteLogEntries &log_entries,
+ C_BlockIORequestT *req);
+ virtual void complete_user_request(Context *&user_req, int r) = 0;
+ virtual void copy_bl_to_buffer(
+ WriteRequestResources *resources,
+ std::unique_ptr<WriteLogOperationSet> &op_set) {}
+
+private:
+ typedef std::list<pwl::C_WriteRequest<This> *> C_WriteRequests;
+ typedef std::list<pwl::C_BlockIORequest<This> *> C_BlockIORequests;
+
+ std::atomic<bool> m_initialized = {false};
+
+ uint64_t m_bytes_dirty = 0; /* Total bytes yet to flush to RBD */
+ utime_t m_last_alloc_fail; /* Entry or buffer allocation fail seen */
+
+ pwl::WriteLogGuard m_write_log_guard;
+
+ /* Starts at 0 for a new write log. Incremented on every flush. */
+ uint64_t m_current_sync_gen = 0;
+ /* Starts at 0 on each sync gen increase. Incremented before applied
+ to an operation */
+ uint64_t m_last_op_sequence_num = 0;
+
+ bool m_persist_on_write_until_flush = true;
+
+ pwl::WriteLogGuard m_flush_guard;
+ mutable ceph::mutex m_flush_guard_lock;
+
+ /* Debug counters for the places m_async_op_tracker is used */
+ std::atomic<int> m_async_complete_ops = {0};
+ std::atomic<int> m_async_null_flush_finish = {0};
+ std::atomic<int> m_async_process_work = {0};
+
+ /* Hold m_deferred_dispatch_lock while consuming from m_deferred_ios. */
+ mutable ceph::mutex m_deferred_dispatch_lock;
+
+ /* Used in release/detain to make BlockGuard preserve submission order */
+ mutable ceph::mutex m_blockguard_lock;
+
+ /* Protected by m_blockguard_lock (together with m_awaiting_barrier below) */
+ bool m_barrier_in_progress = false;
+ BlockGuardCell *m_barrier_cell = nullptr;
+
+ bool m_wake_up_enabled = true;
+
+ Contexts m_flush_complete_contexts;
+
+ std::shared_ptr<pwl::SyncPoint> m_current_sync_point = nullptr;
+ bool m_persist_on_flush = false; // If false, persist each write before completion
+
+ int m_flush_ops_in_flight = 0;
+ int m_flush_bytes_in_flight = 0;
+ uint64_t m_lowest_flushing_sync_gen = 0;
+
+ /* Writes that have left the block guard, but are waiting for resources */
+ C_BlockIORequests m_deferred_ios;
+ /* Throttle writes concurrently allocating & replicating */
+ unsigned int m_free_lanes = pwl::MAX_CONCURRENT_WRITES;
+
+ SafeTimer *m_timer = nullptr; /* Used with m_timer_lock */
+ mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */
+ Context *m_timer_ctx = nullptr;
+
+ ThreadPool m_thread_pool;
+
+ uint32_t m_discard_granularity_bytes;
+
+ BlockGuardCell* detain_guarded_request_helper(pwl::GuardedRequest &req);
+ BlockGuardCell* detain_guarded_request_barrier_helper(
+ pwl::GuardedRequest &req);
+ void detain_guarded_request(C_BlockIORequestT *request,
+ pwl::GuardedRequestFunctionContext *guarded_ctx,
+ bool is_barrier);
+ void perf_start(const std::string name);
+ void perf_stop();
+ void log_perf();
+ void periodic_stats();
+ void arm_periodic_stats();
+
+ void pwl_init(Context *on_finish, pwl::DeferredContexts &later);
+ void check_image_cache_state_clean();
+
+ void flush_dirty_entries(Context *on_finish);
+ bool can_flush_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
+ bool handle_flushed_sync_point(
+ std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
+ void sync_point_writer_flushed(
+ std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
+
+ void init_flush_new_sync_point(pwl::DeferredContexts &later);
+ void new_sync_point(pwl::DeferredContexts &later);
+ pwl::C_FlushRequest<AbstractWriteLog<ImageCtxT>>* make_flush_req(
+ Context *on_finish);
+ void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req,
+ pwl::DeferredContexts &later);
+
+ void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req);
+ void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops,
+ const int r);
+ void internal_flush(bool invalidate, Context *on_finish);
+
+protected:
+ librbd::cache::pwl::ImageCacheState<ImageCtxT>* m_cache_state = nullptr;
+
+ std::atomic<bool> m_shutting_down = {false};
+ std::atomic<bool> m_invalidating = {false};
+
+ ImageCtxT &m_image_ctx;
+
+ std::string m_log_pool_name;
+ uint64_t m_log_pool_size;
+
+ uint32_t m_total_log_entries = 0;
+ uint32_t m_free_log_entries = 0;
+
+ std::atomic<uint64_t> m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */
+ uint64_t m_bytes_cached = 0; /* Total bytes used in write buffers */
+ uint64_t m_bytes_allocated_cap = 0;
+
+ std::atomic<bool> m_alloc_failed_since_retire = {false};
+
+ cache::ImageWritebackInterface& m_image_writeback;
+ plugin::Api<ImageCtxT>& m_plugin_api;
+
+ /*
+ * When m_first_free_entry == m_first_valid_entry, the log is
+ * empty. There is always at least one free entry, which can't be
+ * used.
+ */
+ uint64_t m_first_free_entry = 0; /* Entries from here to m_first_valid_entry-1 are free */
+ uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */
+
+ /* All writes bearing this and all prior sync gen numbers are flushed */
+ uint64_t m_flushed_sync_gen = 0;
+
+ AsyncOpTracker m_async_op_tracker;
+ /* Debug counters for the places m_async_op_tracker is used */
+ std::atomic<int> m_async_flush_ops = {0};
+ std::atomic<int> m_async_append_ops = {0};
+
+ /* Acquire locks in order declared here */
+
+ mutable ceph::mutex m_log_retire_lock;
+ /* Hold a read lock on m_entry_reader_lock to add readers to log entry
+ * bufs. Hold a write lock to prevent readers from being added (e.g. when
+ * removing log entries from the map). No lock required to remove readers. */
+ mutable RWLock m_entry_reader_lock;
+ /* Hold m_log_append_lock while appending or retiring log entries. */
+ mutable ceph::mutex m_log_append_lock;
+ /* Used for most synchronization */
+ mutable ceph::mutex m_lock;
+
+ /* Protected by m_blockguard_lock (see m_barrier_in_progress above) */
+ pwl::WriteLogGuard::BlockOperations m_awaiting_barrier;
+
+ bool m_wake_up_requested = false;
+ bool m_wake_up_scheduled = false;
+ bool m_appending = false;
+ bool m_dispatching_deferred_ops = false;
+
+ pwl::GenericLogOperations m_ops_to_flush; /* Write ops needing flush in local log */
+ pwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */
+
+ pwl::WriteLogMap m_blocks_to_log_entries;
+
+ /* New entries are at the back. Oldest at the front */
+ pwl::GenericLogEntries m_log_entries;
+ pwl::GenericLogEntries m_dirty_log_entries;
+
+ PerfCounters *m_perfcounter = nullptr;
+
+ unsigned int m_unpublished_reserves = 0;
+
+ ContextWQ m_work_queue;
+
+ void wake_up();
+
+ void update_entries(
+ std::shared_ptr<pwl::GenericLogEntry> *log_entry,
+ pwl::WriteLogCacheEntry *cache_entry,
+ std::map<uint64_t, bool> &missing_sync_points,
+ std::map<uint64_t,
+ std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
+ uint64_t entry_index);
+ void update_sync_points(
+ std::map<uint64_t, bool> &missing_sync_points,
+ std::map<uint64_t,
+ std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
+ pwl::DeferredContexts &later);
+ virtual void inc_allocated_cached_bytes(
+ std::shared_ptr<pwl::GenericLogEntry> log_entry) = 0;
+ Context *construct_flush_entry(
+ const std::shared_ptr<pwl::GenericLogEntry> log_entry, bool invalidating);
+ void detain_flush_guard_request(std::shared_ptr<GenericLogEntry> log_entry,
+ GuardedRequestFunctionContext *guarded_ctx);
+ void process_writeback_dirty_entries();
+ bool can_retire_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
+
+ void dispatch_deferred_writes(void);
+ void complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);
+
+ bool check_allocation(
+ C_BlockIORequestT *req, uint64_t bytes_cached, uint64_t bytes_dirtied,
+ uint64_t bytes_allocated, uint32_t num_lanes, uint32_t num_log_entries,
+ uint32_t num_unpublished_reserves);
+ void append_scheduled(
+ pwl::GenericLogOperations &ops, bool &ops_remain, bool &appending,
+ bool isRWL=false);
+
+ virtual void process_work() = 0;
+ virtual void append_scheduled_ops(void) = 0;
+ virtual void schedule_append_ops(pwl::GenericLogOperations &ops, C_BlockIORequestT *req) = 0;
+ virtual void remove_pool_file() = 0;
+ virtual bool initialize_pool(Context *on_finish,
+ pwl::DeferredContexts &later) = 0;
+ virtual void collect_read_extents(
+ uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+ Extent hit_extent, pwl::C_ReadRequest *read_ctx) = 0;
+ virtual void complete_read(
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read, Context *ctx) = 0;
+ virtual void write_data_to_buffer(
+ std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+ pwl::WriteLogCacheEntry *cache_entry) {}
+ virtual void release_ram(
+ const std::shared_ptr<pwl::GenericLogEntry> log_entry) {}
+ virtual void alloc_op_log_entries(pwl::GenericLogOperations &ops) {}
+ virtual bool retire_entries(const unsigned long int frees_per_tx) {
+ return false;
+ }
+ virtual void schedule_flush_and_append(
+ pwl::GenericLogOperationsVector &ops) {}
+ virtual void persist_last_flushed_sync_gen() {}
+ virtual void reserve_cache(C_BlockIORequestT *req, bool &alloc_succeeds,
+ bool &no_space) {}
+ virtual void construct_flush_entries(pwl::GenericLogEntries entries_to_flush,
+ DeferredContexts &post_unlock,
+ bool has_write_entry) = 0;
+ virtual uint64_t get_max_extent() {
+ return 0;
+ }
+ void update_image_cache_state(void);
+ void write_image_cache_state(std::unique_lock<ceph::mutex>& locker);
+ void handle_write_image_cache_state(int r);
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
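
The m_first_free_entry/m_first_valid_entry pair above implements the classic one-slot-reserved ring buffer: the log is empty exactly when the two indices coincide, so one entry must always stay free or a completely full log would be indistinguishable from an empty one. A minimal standalone sketch of that accounting (the struct and names are illustrative, not librbd API):

#include <cassert>
#include <cstdint>
#include <iostream>

// Sketch of the log ring-buffer accounting; not the librbd types.
struct LogRing {
  uint64_t total;        // total slots in the ring (m_total_log_entries)
  uint64_t first_free;   // next slot to append to (m_first_free_entry)
  uint64_t first_valid;  // oldest live entry (m_first_valid_entry)

  // The log is empty exactly when the two indices coincide.
  bool empty() const { return first_free == first_valid; }

  // Free slots; one slot is always kept unusable so a full ring never
  // wraps first_free back onto first_valid (which would read as empty).
  uint64_t free_entries() const {
    return total - ((first_free + total - first_valid) % total) - 1;
  }

  void append() {        // consume one slot at the head
    assert(free_entries() > 0);
    first_free = (first_free + 1) % total;
  }
  void retire() {        // release the oldest slot at the tail
    assert(!empty());
    first_valid = (first_valid + 1) % total;
  }
};

int main() {
  LogRing ring{8, 0, 0};
  assert(ring.empty() && ring.free_entries() == 7);  // one slot reserved
  for (int i = 0; i < 7; ++i) ring.append();
  assert(ring.free_entries() == 0);                  // "full" at 7 of 8 used
  ring.retire();
  assert(ring.free_entries() == 1);
  std::cout << "ring invariant holds\n";
  return 0;
}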
diff --git a/src/librbd/cache/pwl/Builder.h b/src/librbd/cache/pwl/Builder.h
new file mode 100644
index 000000000..9db28ea68
--- /dev/null
+++ b/src/librbd/cache/pwl/Builder.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_BUILDER_H
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+template <typename T>
+class Builder {
+public:
+ virtual ~Builder() {}
+ virtual std::shared_ptr<WriteLogEntry> create_write_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes) = 0;
+ virtual std::shared_ptr<WriteLogEntry> create_write_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes) = 0;
+ virtual std::shared_ptr<WriteLogEntry> create_writesame_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) = 0;
+ virtual std::shared_ptr<WriteLogEntry> create_writesame_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) = 0;
+ virtual C_WriteRequest<T> *create_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) = 0;
+ virtual C_WriteSameRequest<T> *create_writesame_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) = 0;
+ virtual C_WriteRequest<T> *create_comp_and_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) = 0;
+ virtual std::shared_ptr<WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<WriteLogEntry> write_log_entry) = 0;
+ virtual std::shared_ptr<WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+ std::shared_ptr<WriteLogEntry> writesame_log_entry) = 0;
+ virtual std::shared_ptr<pwl::DiscardLogOperation> create_discard_log_operation(
+ std::shared_ptr<SyncPoint> sync_point, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t discard_granularity_bytes,
+ utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct) = 0;
+ virtual C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived,
+ PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) = 0;
+
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_BUILDER_H
diff --git a/src/librbd/cache/pwl/DiscardRequest.cc b/src/librbd/cache/pwl/DiscardRequest.cc
new file mode 100644
index 000000000..eaf24137d
--- /dev/null
+++ b/src/librbd/cache/pwl/DiscardRequest.cc
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/hostname.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/DiscardRequest.h"
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#endif
+
+#include "librbd/cache/pwl/ImageCacheState.h"
+
+#include "librbd/cache/Types.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl:DiscardRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+DiscardRequest<I>* DiscardRequest<I>::create(
+ I &image_ctx,
+ plugin::Api<I>& plugin_api,
+ Context *on_finish) {
+ return new DiscardRequest(image_ctx, plugin_api, on_finish);
+}
+
+template <typename I>
+DiscardRequest<I>::DiscardRequest(
+ I &image_ctx,
+ plugin::Api<I>& plugin_api,
+ Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_plugin_api(plugin_api),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_error_result(0) {
+}
+
+template <typename I>
+void DiscardRequest<I>::send() {
+ delete_image_cache_file();
+}
+
+template <typename I>
+void DiscardRequest<I>::delete_image_cache_file() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ m_cache_state = ImageCacheState<I>::get_image_cache_state(&m_image_ctx, m_plugin_api);
+ if (!m_cache_state) {
+ remove_feature_bit();
+ return;
+ }
+  if (m_cache_state->present &&
+      m_cache_state->host == ceph_get_short_hostname() &&
+      fs::exists(m_cache_state->path)) {
+ std::error_code ec;
+ fs::remove(m_cache_state->path, ec);
+ if (ec) {
+ lderr(cct) << "failed to remove persistent cache file: " << ec.message()
+ << dendl;
+ // not fatal
+ }
+ }
+
+ remove_image_cache_state();
+}
+
+template <typename I>
+void DiscardRequest<I>::remove_image_cache_state() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = DiscardRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_image_cache_state>(
+ this);
+
+ m_cache_state->clear_image_cache_state(ctx);
+}
+
+template <typename I>
+void DiscardRequest<I>::handle_remove_image_cache_state(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove the image cache state: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ finish();
+ return;
+ }
+
+ remove_feature_bit();
+}
+
+template <typename I>
+void DiscardRequest<I>::remove_feature_bit() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ uint64_t new_features = m_image_ctx.features & ~RBD_FEATURE_DIRTY_CACHE;
+ uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE;
+ ldout(cct, 10) << "old_features=" << m_image_ctx.features
+ << ", new_features=" << new_features
+ << ", features_mask=" << features_mask
+ << dendl;
+
+ int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
+ new_features, features_mask);
+ m_image_ctx.features &= ~RBD_FEATURE_DIRTY_CACHE;
+ using klass = DiscardRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_feature_bit>(
+ this);
+ ctx->complete(r);
+}
+
+template <typename I>
+void DiscardRequest<I>::handle_remove_feature_bit(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove the feature bit: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ }
+ finish();
+}
+
+template <typename I>
+void DiscardRequest<I>::finish() {
+ if (m_cache_state) {
+ delete m_cache_state;
+ m_cache_state = nullptr;
+ }
+
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::DiscardRequest<librbd::ImageCtx>;
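
remove_feature_bit() above relies on set_features() applying only the bits selected by features_mask. A standalone sketch of that masked update; the feature bit values here are illustrative placeholders, not the real RBD feature constants:

#include <cassert>
#include <cstdint>
#include <iostream>

// Illustrative sketch (not the cls_client API): set_features(new, mask)
// touches only the bits selected by `mask`, leaving the rest untouched.
constexpr uint64_t FEATURE_DIRTY_CACHE = 1ull << 14;    // placeholder value
constexpr uint64_t FEATURE_EXCLUSIVE_LOCK = 1ull << 2;  // placeholder value

uint64_t apply_feature_update(uint64_t current, uint64_t new_features,
                              uint64_t mask) {
  // Clear the masked bits, then copy in the masked portion of new_features.
  return (current & ~mask) | (new_features & mask);
}

int main() {
  uint64_t features = FEATURE_EXCLUSIVE_LOCK | FEATURE_DIRTY_CACHE;

  // remove_feature_bit(): new = features & ~DIRTY_CACHE, mask = DIRTY_CACHE
  uint64_t updated = apply_feature_update(
      features, features & ~FEATURE_DIRTY_CACHE, FEATURE_DIRTY_CACHE);

  assert(!(updated & FEATURE_DIRTY_CACHE));   // dirty-cache bit cleared
  assert(updated & FEATURE_EXCLUSIVE_LOCK);   // unrelated bits preserved
  std::cout << "dirty-cache feature cleared\n";
}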
diff --git a/src/librbd/cache/pwl/DiscardRequest.h b/src/librbd/cache/pwl/DiscardRequest.h
new file mode 100644
index 000000000..c896369fe
--- /dev/null
+++ b/src/librbd/cache/pwl/DiscardRequest.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_DISCARD_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_DISCARD_REQUEST_H
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+namespace plugin { template <typename> struct Api; }
+
+namespace cache {
+
+namespace pwl {
+
+template<typename>
+class ImageCacheState;
+
+template <typename ImageCtxT = ImageCtx>
+class DiscardRequest {
+public:
+ static DiscardRequest* create(
+ ImageCtxT &image_ctx,
+ plugin::Api<ImageCtxT>& plugin_api,
+ Context *on_finish);
+
+ void send();
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * Discard request goes through the following state machine:
+ *
+ * <start>
+ * |
+ * v
+ * REMOVE_IMAGE_CACHE_FILE
+ * |
+ * v
+ * REMOVE_IMAGE_CACHE_STATE
+ * |
+ * v
+ * REMOVE_IMAGE_FEATURE_BIT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ DiscardRequest(ImageCtxT &image_ctx,
+ plugin::Api<ImageCtxT>& plugin_api,
+ Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ ImageCacheState<ImageCtxT>* m_cache_state;
+ plugin::Api<ImageCtxT>& m_plugin_api;
+ Context *m_on_finish;
+
+ int m_error_result;
+
+ void delete_image_cache_file();
+
+ void remove_image_cache_state();
+ void handle_remove_image_cache_state(int r);
+
+ void remove_feature_bit();
+ void handle_remove_feature_bit(int r);
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::DiscardRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PWL_DISCARD_REQUEST_H
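
save_result() above implements a first-error-wins policy: once an error is recorded, later results (including successes) never overwrite it, so the Context completed in finish() reports the earliest failure. A tiny standalone sketch of that convention:

#include <cassert>

// Sketch of the save_result() convention: the first error observed wins;
// later errors and successes never overwrite it.
struct Result {
  int error = 0;
  void save(int r) {
    if (error == 0 && r < 0) error = r;
  }
};

int main() {
  Result res;
  res.save(0);               // success: nothing recorded
  res.save(-5 /*EIO*/);      // first error: recorded
  res.save(-22 /*EINVAL*/);  // ignored: -EIO already recorded
  assert(res.error == -5);
}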
diff --git a/src/librbd/cache/pwl/ImageCacheState.cc b/src/librbd/cache/pwl/ImageCacheState.cc
new file mode 100644
index 000000000..2bd6e1132
--- /dev/null
+++ b/src/librbd/cache/pwl/ImageCacheState.cc
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/Types.h"
+#include "librbd/cache/Utils.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Operations.h"
+#include "common/config_proxy.h"
+#include "common/environment.h"
+#include "common/hostname.h"
+#include "librbd/plugin/Api.h"
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ImageCacheState: " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+template <typename I>
+void ImageCacheState<I>::init_from_config() {
+ ldout(m_image_ctx->cct, 20) << dendl;
+
+ present = false;
+ empty = true;
+ clean = true;
+ host = "";
+ path = "";
+ ConfigProxy &config = m_image_ctx->config;
+ mode = config.get_val<std::string>("rbd_persistent_cache_mode");
+ size = 0;
+}
+
+template <typename I>
+bool ImageCacheState<I>::init_from_metadata(json_spirit::mValue& json_root) {
+ ldout(m_image_ctx->cct, 20) << dendl;
+
+ try {
+ auto& o = json_root.get_obj();
+ present = o["present"].get_bool();
+ empty = o["empty"].get_bool();
+ clean = o["clean"].get_bool();
+ host = o["host"].get_str();
+ path = o["path"].get_str();
+ mode = o["mode"].get_str();
+ size = o["size"].get_uint64();
+ } catch (std::runtime_error& e) {
+ lderr(m_image_ctx->cct) << "failed to parse cache state: " << e.what()
+ << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+template <typename I>
+void ImageCacheState<I>::write_image_cache_state(std::unique_lock<ceph::mutex>& locker,
+ Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked_by_me(*locker.mutex()));
+ stats_timestamp = ceph_clock_now();
+ json_spirit::mObject o;
+ o["present"] = present;
+ o["empty"] = empty;
+ o["clean"] = clean;
+ o["host"] = host;
+ o["path"] = path;
+ o["mode"] = mode;
+ o["size"] = size;
+ o["stats_timestamp"] = stats_timestamp.sec();
+ o["allocated_bytes"] = allocated_bytes;
+ o["cached_bytes"] = cached_bytes;
+ o["dirty_bytes"] = dirty_bytes;
+ o["free_bytes"] = free_bytes;
+ o["hits_full"] = hits_full;
+ o["hits_partial"] = hits_partial;
+ o["misses"] = misses;
+ o["hit_bytes"] = hit_bytes;
+ o["miss_bytes"] = miss_bytes;
+ std::string image_state_json = json_spirit::write(o);
+ locker.unlock();
+
+ std::shared_lock owner_lock{m_image_ctx->owner_lock};
+ ldout(m_image_ctx->cct, 20) << __func__ << " Store state: "
+ << image_state_json << dendl;
+ m_plugin_api.execute_image_metadata_set(m_image_ctx, PERSISTENT_CACHE_STATE,
+ image_state_json, on_finish);
+}
+
+template <typename I>
+void ImageCacheState<I>::clear_image_cache_state(Context *on_finish) {
+ std::shared_lock owner_lock{m_image_ctx->owner_lock};
+ ldout(m_image_ctx->cct, 20) << __func__ << " Remove state: " << dendl;
+ m_plugin_api.execute_image_metadata_remove(
+ m_image_ctx, PERSISTENT_CACHE_STATE, on_finish);
+}
+
+template <typename I>
+ImageCacheState<I>* ImageCacheState<I>::create_image_cache_state(
+ I* image_ctx, plugin::Api<I>& plugin_api, int &r) {
+ std::string cache_state_str;
+ ImageCacheState<I>* cache_state = nullptr;
+
+ r = 0;
+ bool dirty_cache = plugin_api.test_image_features(image_ctx, RBD_FEATURE_DIRTY_CACHE);
+ if (dirty_cache) {
+ cls_client::metadata_get(&image_ctx->md_ctx, image_ctx->header_oid,
+ PERSISTENT_CACHE_STATE, &cache_state_str);
+ }
+
+ ldout(image_ctx->cct, 20) << "image_cache_state: " << cache_state_str << dendl;
+
+ bool pwl_enabled = cache::util::is_pwl_enabled(*image_ctx);
+ bool cache_desired = pwl_enabled;
+ cache_desired &= !image_ctx->read_only;
+ cache_desired &= !plugin_api.test_image_features(image_ctx, RBD_FEATURE_MIGRATING);
+ cache_desired &= !plugin_api.test_image_features(image_ctx, RBD_FEATURE_JOURNALING);
+ cache_desired &= !image_ctx->old_format;
+
+ if (!dirty_cache && !cache_desired) {
+ ldout(image_ctx->cct, 20) << "Do not desire to use image cache." << dendl;
+ } else if (dirty_cache && !cache_desired) {
+ lderr(image_ctx->cct) << "There's a dirty cache, but RWL cache is disabled."
+ << dendl;
+ r = -EINVAL;
+  } else if ((!dirty_cache || cache_state_str.empty()) && cache_desired) {
+ cache_state = new ImageCacheState<I>(image_ctx, plugin_api);
+ cache_state->init_from_config();
+ } else {
+ ceph_assert(!cache_state_str.empty());
+ json_spirit::mValue json_root;
+ if (!json_spirit::read(cache_state_str.c_str(), json_root)) {
+ lderr(image_ctx->cct) << "failed to parse cache state" << dendl;
+ r = -EINVAL;
+ return nullptr;
+ }
+ cache_state = new ImageCacheState<I>(image_ctx, plugin_api);
+ if (!cache_state->init_from_metadata(json_root)) {
+ delete cache_state;
+ r = -EINVAL;
+ return nullptr;
+ }
+ if (!cache_state->present) {
+ cache_state->init_from_config();
+ }
+ }
+ return cache_state;
+}
+
+template <typename I>
+ImageCacheState<I>* ImageCacheState<I>::get_image_cache_state(
+ I* image_ctx, plugin::Api<I>& plugin_api) {
+ ImageCacheState<I>* cache_state = nullptr;
+  std::string cache_state_str;
+ cls_client::metadata_get(&image_ctx->md_ctx, image_ctx->header_oid,
+ PERSISTENT_CACHE_STATE, &cache_state_str);
+ if (!cache_state_str.empty()) {
+ // ignore errors, best effort
+ cache_state = new ImageCacheState<I>(image_ctx, plugin_api);
+ json_spirit::mValue json_root;
+ if (!json_spirit::read(cache_state_str.c_str(), json_root)) {
+ lderr(image_ctx->cct) << "failed to parse cache state" << dendl;
+ } else {
+ cache_state->init_from_metadata(json_root);
+ }
+ }
+ return cache_state;
+}
+
+template <typename I>
+bool ImageCacheState<I>::is_valid() {
+  if (this->present &&
+      host != ceph_get_short_hostname()) {
+    const char *cleanstring = this->clean ? "clean" : "dirty";
+ lderr(m_image_ctx->cct) << "An image cache (RWL) remains on another host "
+ << host << " which is " << cleanstring
+ << ". Flush/close the image there to remove the "
+ << "image cache" << dendl;
+ return false;
+ }
+ return true;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ImageCacheState<librbd::ImageCtx>;
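
create_image_cache_state() above reduces to a small decision table over two predicates: whether the image carries the dirty-cache feature, and whether a cache is currently desired (pwl enabled, image writable, not migrating/journaling/old-format). A standalone sketch of the four outcomes; the labels are illustrative, not librbd types:

#include <iostream>
#include <string>

// Sketch of the decision table in create_image_cache_state().
// `dirty` = RBD_FEATURE_DIRTY_CACHE set; `desired` = cache wanted now;
// `have_metadata` = a persisted cache-state string was found.
std::string cache_state_outcome(bool dirty, bool desired, bool have_metadata) {
  if (!dirty && !desired) return "no cache";          // nothing to do
  if (dirty && !desired)  return "error (-EINVAL)";   // dirty but disabled
  if (!dirty || !have_metadata) return "fresh state from config";
  return "state restored from image metadata";        // dirty + metadata
}

int main() {
  std::cout << cache_state_outcome(false, false, false) << "\n"; // no cache
  std::cout << cache_state_outcome(true,  false, true)  << "\n"; // error
  std::cout << cache_state_outcome(false, true,  false) << "\n"; // from config
  std::cout << cache_state_outcome(true,  true,  true)  << "\n"; // from metadata
}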
diff --git a/src/librbd/cache/pwl/ImageCacheState.h b/src/librbd/cache/pwl/ImageCacheState.h
new file mode 100644
index 000000000..5be5f73ac
--- /dev/null
+++ b/src/librbd/cache/pwl/ImageCacheState.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+
+#include "json_spirit/json_spirit.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/Types.h"
+#include <string>
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace librbd {
+
+namespace plugin { template <typename> struct Api; }
+
+namespace cache {
+namespace pwl {
+
+template <typename ImageCtxT = ImageCtx>
+class ImageCacheState {
+private:
+ ImageCtxT* m_image_ctx;
+ plugin::Api<ImageCtxT>& m_plugin_api;
+public:
+ bool present = false;
+ bool empty = true;
+ bool clean = true;
+ std::string host;
+ std::string path;
+ std::string mode;
+ uint64_t size = 0;
+  /* After a reload these stats are not read back from metadata;
+   * they are recalculated from scratch. */
+ utime_t stats_timestamp;
+ uint64_t allocated_bytes = 0;
+ uint64_t cached_bytes = 0;
+ uint64_t dirty_bytes = 0;
+ uint64_t free_bytes = 0;
+ uint64_t hits_full = 0;
+ uint64_t hits_partial = 0;
+ uint64_t misses = 0;
+ uint64_t hit_bytes = 0;
+ uint64_t miss_bytes = 0;
+
+ ImageCacheState(ImageCtxT* image_ctx, plugin::Api<ImageCtxT>& plugin_api)
+ : m_image_ctx(image_ctx), m_plugin_api(plugin_api) {}
+
+ ~ImageCacheState() {}
+
+ ImageCacheType get_image_cache_mode() const {
+ if (mode == "rwl") {
+ return IMAGE_CACHE_TYPE_RWL;
+ } else if (mode == "ssd") {
+ return IMAGE_CACHE_TYPE_SSD;
+ }
+ return IMAGE_CACHE_TYPE_UNKNOWN;
+ }
+
+ void init_from_config();
+ bool init_from_metadata(json_spirit::mValue& json_root);
+
+ void write_image_cache_state(std::unique_lock<ceph::mutex>& locker,
+ Context *on_finish);
+
+ void clear_image_cache_state(Context *on_finish);
+
+ static ImageCacheState<ImageCtxT>* create_image_cache_state(
+ ImageCtxT* image_ctx, plugin::Api<ImageCtxT>& plugin_api, int &r);
+
+ static ImageCacheState<ImageCtxT>* get_image_cache_state(
+ ImageCtxT* image_ctx, plugin::Api<ImageCtxT>& plugin_api);
+
+ bool is_valid();
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ImageCacheState<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
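
This state is persisted as a JSON string in the image metadata: write_image_cache_state() encodes it with json_spirit and init_from_metadata() parses it back. A round-trip sketch, assuming the json_spirit bundled with the Ceph source tree:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>
#include "json_spirit/json_spirit.h"

int main() {
  // Encode a few representative fields, as write_image_cache_state() does.
  json_spirit::mObject o;
  o["present"] = true;
  o["mode"] = std::string("rwl");
  o["size"] = static_cast<uint64_t>(1ull << 30);
  std::string encoded = json_spirit::write(o);   // stored as image metadata

  // Parse it back, as init_from_metadata() does.
  json_spirit::mValue root;
  assert(json_spirit::read(encoded, root));
  auto& parsed = root.get_obj();
  assert(parsed["mode"].get_str() == "rwl");
  assert(parsed["size"].get_uint64() == (1ull << 30));
  std::cout << encoded << "\n";
}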
diff --git a/src/librbd/cache/pwl/InitRequest.cc b/src/librbd/cache/pwl/InitRequest.cc
new file mode 100644
index 000000000..65dac8b46
--- /dev/null
+++ b/src/librbd/cache/pwl/InitRequest.cc
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/pwl/InitRequest.h"
+#include "librbd/io/ImageDispatcher.h"
+#include "librbd/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/asio/ContextWQ.h"
+
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/WriteLogImageDispatch.h"
+#include "librbd/cache/ImageWriteback.h"
+#ifdef WITH_RBD_RWL
+#include "librbd/cache/pwl/rwl/WriteLog.h"
+#endif
+
+#ifdef WITH_RBD_SSD_CACHE
+#include "librbd/cache/pwl/ssd/WriteLog.h"
+#endif
+
+#include "librbd/cache/Utils.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/plugin/Api.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl:InitRequest " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+InitRequest<I>* InitRequest<I>::create(
+ I &image_ctx,
+ cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<I>& plugin_api,
+ Context *on_finish) {
+ return new InitRequest(image_ctx, image_writeback, plugin_api, on_finish);
+}
+
+template <typename I>
+InitRequest<I>::InitRequest(
+ I &image_ctx,
+ cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<I>& plugin_api,
+ Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_image_writeback(image_writeback),
+ m_plugin_api(plugin_api),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_error_result(0) {
+}
+
+template <typename I>
+void InitRequest<I>::send() {
+ get_image_cache_state();
+}
+
+template <typename I>
+void InitRequest<I>::get_image_cache_state() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ int r;
+ auto cache_state = ImageCacheState<I>::create_image_cache_state(
+ &m_image_ctx, m_plugin_api, r);
+
+ if (r < 0 || !cache_state) {
+ save_result(r);
+ finish();
+ return;
+ } else if (!cache_state->is_valid()) {
+ delete cache_state;
+ cache_state = nullptr;
+ lderr(cct) << "failed to get image cache state: " << cpp_strerror(r)
+ << dendl;
+ save_result(-ENOENT);
+ finish();
+ return;
+ }
+
+ auto mode = cache_state->get_image_cache_mode();
+ switch (mode) {
+ #ifdef WITH_RBD_RWL
+ case cache::IMAGE_CACHE_TYPE_RWL:
+ m_image_cache =
+ new librbd::cache::pwl::rwl::WriteLog<I>(m_image_ctx,
+ cache_state,
+ m_image_writeback,
+ m_plugin_api);
+ break;
+ #endif
+ #ifdef WITH_RBD_SSD_CACHE
+ case cache::IMAGE_CACHE_TYPE_SSD:
+ m_image_cache =
+ new librbd::cache::pwl::ssd::WriteLog<I>(m_image_ctx,
+ cache_state,
+ m_image_writeback,
+ m_plugin_api);
+ break;
+ #endif
+ default:
+ delete cache_state;
+ cache_state = nullptr;
+ save_result(-ENOENT);
+ finish();
+ return;
+ }
+
+ init_image_cache();
+}
+
+template <typename I>
+void InitRequest<I>::init_image_cache() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = InitRequest<I>;
+ Context *ctx = create_async_context_callback(m_image_ctx,
+ create_context_callback<klass, &klass::handle_init_image_cache>(this));
+ m_image_cache->init(ctx);
+}
+
+template <typename I>
+void InitRequest<I>::handle_init_image_cache(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to init image cache: " << cpp_strerror(r)
+ << dendl;
+ delete m_image_cache;
+ m_image_cache = nullptr;
+ save_result(r);
+ finish();
+ return;
+ }
+ set_feature_bit();
+}
+
+template <typename I>
+void InitRequest<I>::set_feature_bit() {
+ CephContext *cct = m_image_ctx.cct;
+
+ uint64_t new_features = m_image_ctx.features | RBD_FEATURE_DIRTY_CACHE;
+ uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE;
+ ldout(cct, 10) << "old_features=" << m_image_ctx.features
+ << ", new_features=" << new_features
+ << ", features_mask=" << features_mask
+ << dendl;
+
+ int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx,
+ m_image_ctx.header_oid,
+ new_features, features_mask);
+ m_image_ctx.features |= RBD_FEATURE_DIRTY_CACHE;
+ using klass = InitRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_set_feature_bit>(
+ this);
+ ctx->complete(r);
+}
+
+template <typename I>
+void InitRequest<I>::handle_set_feature_bit(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to set feature bit: " << cpp_strerror(r)
+               << dendl;
+    save_result(r);
+
+    // Shutdown completes asynchronously and calls finish(); don't fall
+    // through to register the dispatch layer or finish() a second time.
+    shutdown_image_cache();
+    return;
+  }
+
+ // Register RWL dispatch
+ auto image_dispatch = new cache::WriteLogImageDispatch<I>(
+ &m_image_ctx, m_image_cache, m_plugin_api);
+
+ m_image_ctx.io_image_dispatcher->register_dispatch(image_dispatch);
+
+ finish();
+}
+
+template <typename I>
+void InitRequest<I>::shutdown_image_cache() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = InitRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_shutdown_image_cache>(this);
+ m_image_cache->shut_down(ctx);
+}
+
+template <typename I>
+void InitRequest<I>::handle_shutdown_image_cache(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to close image cache: " << cpp_strerror(r)
+ << dendl;
+ }
+ delete m_image_cache;
+ m_image_cache = nullptr;
+
+ finish();
+}
+
+template <typename I>
+void InitRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::InitRequest<librbd::ImageCtx>;
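
A usage sketch of driving this state machine from a caller, assuming a librbd build tree. The request deletes itself in finish(), so the caller owns only the completion Context; C_SaferCond turns the asynchronous callback into a blocking wait:

// Usage sketch, assuming a librbd build tree.
#include "common/Cond.h"                       // C_SaferCond
#include "librbd/ImageCtx.h"
#include "librbd/cache/ImageWriteback.h"
#include "librbd/cache/pwl/InitRequest.h"
#include "librbd/plugin/Api.h"

int open_pwl_cache(librbd::ImageCtx* ictx,
                   librbd::cache::ImageWritebackInterface& writeback,
                   librbd::plugin::Api<librbd::ImageCtx>& plugin_api) {
  C_SaferCond init_ctx;   // turns the async callback into a blocking wait
  auto *req = librbd::cache::pwl::InitRequest<librbd::ImageCtx>::create(
      *ictx, writeback, plugin_api, &init_ctx);
  req->send();            // GET_IMAGE_CACHE_STATE -> INIT_IMAGE_CACHE -> SET_FEATURE_BIT
  return init_ctx.wait(); // 0 on success, negative errno on failure
}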
diff --git a/src/librbd/cache/pwl/InitRequest.h b/src/librbd/cache/pwl/InitRequest.h
new file mode 100644
index 000000000..56e63425e
--- /dev/null
+++ b/src/librbd/cache/pwl/InitRequest.h
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
+#define CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io { class ImageDispatchInterface; }
+
+namespace plugin { template <typename> struct Api; }
+
+namespace cache {
+
+class ImageWritebackInterface;
+
+namespace pwl {
+
+template<typename>
+class AbstractWriteLog;
+
+template<typename>
+class ImageCacheState;
+
+template <typename ImageCtxT = ImageCtx>
+class InitRequest {
+public:
+ static InitRequest* create(
+ ImageCtxT &image_ctx,
+ librbd::cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<ImageCtxT>& plugin_api,
+ Context *on_finish);
+
+ void send();
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * Init request goes through the following state machine:
+ *
+ * <start>
+ * |
+ * v
+ * GET_IMAGE_CACHE_STATE
+ * |
+ * v
+ * INIT_IMAGE_CACHE
+ * |
+ * v
+ * SET_FEATURE_BIT * * * > CLOSE_IMAGE_CACHE
+ * | |
+ * v |
+ * <finish> <-------------------/
+ *
+ * @endverbatim
+ */
+
+ InitRequest(ImageCtxT &image_ctx,
+ librbd::cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<ImageCtxT>& plugin_api,
+ Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ librbd::cache::ImageWritebackInterface& m_image_writeback;
+ plugin::Api<ImageCtxT>& m_plugin_api;
+ AbstractWriteLog<ImageCtxT> *m_image_cache;
+ Context *m_on_finish;
+
+ int m_error_result;
+
+ bool is_pwl_enabled();
+
+ void get_image_cache_state();
+
+ void init_image_cache();
+ void handle_init_image_cache(int r);
+
+ void set_feature_bit();
+ void handle_set_feature_bit(int r);
+
+ void shutdown_image_cache();
+ void handle_shutdown_image_cache(int r);
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::InitRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
diff --git a/src/librbd/cache/pwl/LogEntry.cc b/src/librbd/cache/pwl/LogEntry.cc
new file mode 100644
index 000000000..504d21051
--- /dev/null
+++ b/src/librbd/cache/pwl/LogEntry.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "LogEntry.h"
+#include "librbd/cache/ImageWriteback.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::LogEntry: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+std::ostream& GenericLogEntry::format(std::ostream &os) const {
+ os << "ram_entry=[" << ram_entry << "], "
+ << "cache_entry=" << (void*)cache_entry << ", "
+ << "log_entry_index=" << log_entry_index << ", "
+ << "completed=" << completed;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const GenericLogEntry &entry) {
+ return entry.format(os);
+}
+
+std::ostream& SyncPointLogEntry::format(std::ostream &os) const {
+ os << "(Sync Point) ";
+ GenericLogEntry::format(os);
+ os << ", "
+ << "writes=" << writes << ", "
+ << "bytes=" << bytes << ", "
+ << "writes_completed=" << writes_completed << ", "
+ << "writes_flushed=" << writes_flushed << ", "
+ << "prior_sync_point_flushed=" << prior_sync_point_flushed << ", "
+ << "next_sync_point_entry=" << next_sync_point_entry;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const SyncPointLogEntry &entry) {
+ return entry.format(os);
+}
+
+bool GenericWriteLogEntry::can_writeback() const {
+ return (this->completed &&
+ (ram_entry.is_sequenced() ||
+ (sync_point_entry &&
+ sync_point_entry->completed)));
+}
+
+std::ostream& GenericWriteLogEntry::format(std::ostream &os) const {
+ GenericLogEntry::format(os);
+ os << ", "
+ << "sync_point_entry=[";
+ if (sync_point_entry) {
+ os << *sync_point_entry;
+ } else {
+ os << "nullptr";
+ }
+ os << "], "
+ << "referring_map_entries=" << referring_map_entries;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const GenericWriteLogEntry &entry) {
+ return entry.format(os);
+}
+
+void WriteLogEntry::init(bool has_data,
+ uint64_t current_sync_gen,
+ uint64_t last_op_sequence_num, bool persist_on_flush) {
+ ram_entry.set_has_data(has_data);
+ ram_entry.sync_gen_number = current_sync_gen;
+ if (persist_on_flush) {
+ /* Persist on flush. Sequence #0 is never used. */
+ ram_entry.write_sequence_number = 0;
+ } else {
+ /* Persist on write */
+ ram_entry.write_sequence_number = last_op_sequence_num;
+ ram_entry.set_sequenced(true);
+ }
+ ram_entry.set_sync_point(false);
+ ram_entry.set_discard(false);
+}
+
+std::ostream& WriteLogEntry::format(std::ostream &os) const {
+ os << "(Write) ";
+ GenericWriteLogEntry::format(os);
+ os << ", "
+ << "cache_buffer=" << (void*)cache_buffer << ", ";
+ os << "cache_bp=" << cache_bp << ", ";
+ os << "cache_bl=" << cache_bl << ", ";
+ os << "bl_refs=" << bl_refs;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const WriteLogEntry &entry) {
+ return entry.format(os);
+}
+
+void DiscardLogEntry::writeback(
+ librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+ image_writeback.aio_discard(ram_entry.image_offset_bytes,
+ ram_entry.write_bytes,
+ m_discard_granularity_bytes, ctx);
+}
+
+void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num) {
+ ram_entry.sync_gen_number = current_sync_gen;
+ if (persist_on_flush) {
+ /* Persist on flush. Sequence #0 is never used. */
+ ram_entry.write_sequence_number = 0;
+ } else {
+ /* Persist on write */
+ ram_entry.write_sequence_number = last_op_sequence_num;
+ ram_entry.set_sequenced(true);
+ }
+}
+
+std::ostream &DiscardLogEntry::format(std::ostream &os) const {
+ os << "(Discard) ";
+ GenericWriteLogEntry::format(os);
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const DiscardLogEntry &entry) {
+ return entry.format(os);
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/LogEntry.h b/src/librbd/cache/pwl/LogEntry.h
new file mode 100644
index 000000000..ecaca0b7b
--- /dev/null
+++ b/src/librbd/cache/pwl/LogEntry.h
@@ -0,0 +1,280 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
+
+#include "common/ceph_mutex.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/pwl/Types.h"
+#include <atomic>
+#include <memory>
+
+namespace librbd {
+namespace cache {
+class ImageWritebackInterface;
+namespace pwl {
+
+class SyncPointLogEntry;
+class GenericWriteLogEntry;
+class WriteLogEntry;
+
+typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
+
+class GenericLogEntry {
+public:
+ WriteLogCacheEntry ram_entry;
+ WriteLogCacheEntry *cache_entry = nullptr;
+ uint64_t log_entry_index = 0;
+ bool completed = false;
+ BlockGuardCell* m_cell = nullptr;
+ GenericLogEntry(uint64_t image_offset_bytes = 0, uint64_t write_bytes = 0)
+ : ram_entry(image_offset_bytes, write_bytes) {
+ };
+ virtual ~GenericLogEntry() { };
+ GenericLogEntry(const GenericLogEntry&) = delete;
+ GenericLogEntry &operator=(const GenericLogEntry&) = delete;
+ virtual bool can_writeback() const {
+ return false;
+ }
+ virtual bool can_retire() const {
+ return false;
+ }
+ virtual void set_flushed(bool flushed) {
+ ceph_assert(false);
+ }
+ virtual unsigned int write_bytes() const {
+ return 0;
+ };
+ virtual unsigned int bytes_dirty() const {
+ return 0;
+ };
+ virtual std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() {
+ return nullptr;
+ }
+ virtual void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) {
+ ceph_assert(false);
+ };
+ virtual void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist &&bl) {
+ ceph_assert(false);
+ }
+ virtual bool is_write_entry() const {
+ return false;
+ }
+ virtual bool is_writesame_entry() const {
+ return false;
+ }
+ virtual bool is_sync_point() const {
+ return false;
+ }
+ virtual unsigned int get_aligned_data_size() const {
+ return 0;
+ }
+ virtual void remove_cache_bl() {}
+ virtual std::ostream& format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const GenericLogEntry &entry);
+};
+
+class SyncPointLogEntry : public GenericLogEntry {
+public:
+ /* Writing entries using this sync gen number */
+ std::atomic<unsigned int> writes = {0};
+ /* Total bytes for all writing entries using this sync gen number */
+ std::atomic<uint64_t> bytes = {0};
+ /* Writing entries using this sync gen number that have completed */
+ std::atomic<unsigned int> writes_completed = {0};
+ /* Writing entries using this sync gen number that have completed flushing to the writeback interface */
+ std::atomic<unsigned int> writes_flushed = {0};
+ /* All writing entries using all prior sync gen numbers have been flushed */
+ std::atomic<bool> prior_sync_point_flushed = {true};
+ std::shared_ptr<SyncPointLogEntry> next_sync_point_entry = nullptr;
+ SyncPointLogEntry(uint64_t sync_gen_number) {
+ ram_entry.sync_gen_number = sync_gen_number;
+ ram_entry.set_sync_point(true);
+ };
+ ~SyncPointLogEntry() override {};
+ SyncPointLogEntry(const SyncPointLogEntry&) = delete;
+ SyncPointLogEntry &operator=(const SyncPointLogEntry&) = delete;
+ bool can_retire() const override {
+ return this->completed;
+ }
+ bool is_sync_point() const override {
+ return true;
+ }
+ std::ostream& format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const SyncPointLogEntry &entry);
+};
+
+class GenericWriteLogEntry : public GenericLogEntry {
+public:
+ uint32_t referring_map_entries = 0;
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry;
+ GenericWriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes)
+ : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(sync_point_entry) { }
+ GenericWriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
+ : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(nullptr) { }
+ ~GenericWriteLogEntry() override {};
+ GenericWriteLogEntry(const GenericWriteLogEntry&) = delete;
+ GenericWriteLogEntry &operator=(const GenericWriteLogEntry&) = delete;
+ unsigned int write_bytes() const override {
+    /* The valid bytes in this op's data buffer. Discard and WS override. */
+ return ram_entry.write_bytes;
+ };
+ unsigned int bytes_dirty() const override {
+ /* The bytes in the image this op makes dirty. Discard and WS override. */
+ return write_bytes();
+ };
+ BlockExtent block_extent() {
+ return ram_entry.block_extent();
+ }
+ uint32_t get_map_ref() {
+    return referring_map_entries;
+ }
+ void inc_map_ref() { referring_map_entries++; }
+ void dec_map_ref() { referring_map_entries--; }
+ bool can_writeback() const override;
+ std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() override {
+ return sync_point_entry;
+ }
+ virtual void copy_cache_bl(bufferlist *out_bl) = 0;
+ void set_flushed(bool flushed) override {
+ m_flushed = flushed;
+ }
+ bool get_flushed() const {
+ return m_flushed;
+ }
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const GenericWriteLogEntry &entry);
+
+private:
+ bool m_flushed = false; /* or invalidated */
+};
+
+class WriteLogEntry : public GenericWriteLogEntry {
+protected:
+ bool is_writesame = false;
+ buffer::ptr cache_bp;
+ buffer::list cache_bl;
+ std::atomic<int> bl_refs = {0}; /* The refs held on cache_bp by cache_bl */
+  /* Used in WriteLogEntry::get_cache_bl() to synchronize between threads making entries readable */
+ mutable ceph::mutex m_entry_bl_lock;
+
+ virtual void init_cache_bp() {}
+
+ virtual void init_bl(buffer::ptr &bp, buffer::list &bl) {}
+public:
+ uint8_t *cache_buffer = nullptr;
+ WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes)
+ : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
+ m_entry_bl_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this)))
+ { }
+ WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
+ : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes),
+ m_entry_bl_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this)))
+ { }
+ WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {
+ ram_entry.set_writesame(true);
+ ram_entry.ws_datalen = data_length;
+ is_writesame = true;
+ };
+ WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
+ ram_entry.set_writesame(true);
+ ram_entry.ws_datalen = data_length;
+ is_writesame = true;
+ };
+ ~WriteLogEntry() override {};
+ WriteLogEntry(const WriteLogEntry&) = delete;
+ WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+  unsigned int write_bytes() const override {
+    // The valid bytes in this op's data buffer.
+    if (is_writesame) {
+      return ram_entry.ws_datalen;
+    }
+    return ram_entry.write_bytes;
+  };
+ unsigned int bytes_dirty() const override {
+ // The bytes in the image this op makes dirty.
+ return ram_entry.write_bytes;
+ };
+ void init(bool has_data,
+ uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush);
+ virtual void init_cache_buffer(std::vector<WriteBufferAllocation>::iterator allocation) {}
+ virtual void init_cache_bl(bufferlist &src_bl, uint64_t off, uint64_t len) {}
+ /* Returns a ref to a bl containing bufferptrs to the entry cache buffer */
+ virtual buffer::list &get_cache_bl() = 0;
+
+ BlockExtent block_extent();
+ virtual unsigned int reader_count() const = 0;
+  /* Entry can be retired once it has completed, been flushed, and has no readers */
+ bool can_retire() const override {
+ return (this->completed && this->get_flushed() && (0 == reader_count()));
+ }
+ bool is_write_entry() const override {
+ return true;
+ }
+ bool is_writesame_entry() const override {
+ return is_writesame;
+ }
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const WriteLogEntry &entry);
+};
+
+class DiscardLogEntry : public GenericWriteLogEntry {
+public:
+ DiscardLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t discard_granularity_bytes)
+ : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
+ m_discard_granularity_bytes(discard_granularity_bytes) {
+ ram_entry.set_discard(true);
+ };
+ DiscardLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
+ : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
+ ram_entry.set_discard(true);
+ };
+ DiscardLogEntry(const DiscardLogEntry&) = delete;
+ DiscardLogEntry &operator=(const DiscardLogEntry&) = delete;
+ unsigned int write_bytes() const override {
+    /* The valid bytes in this op's data buffer. */
+ return 0;
+ };
+ unsigned int bytes_dirty() const override {
+ /* The bytes in the image this op makes dirty. */
+ return ram_entry.write_bytes;
+ };
+ bool can_retire() const override {
+ return this->completed;
+ }
+ void copy_cache_bl(bufferlist *out_bl) override {
+ ceph_assert(false);
+ }
+ void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) override;
+ void init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num);
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const DiscardLogEntry &entry);
+private:
+ uint32_t m_discard_granularity_bytes;
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
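
write_bytes() and bytes_dirty() deliberately diverge across the hierarchy above: the former counts payload bytes held in the cache buffer, the latter counts image bytes the op dirties. WriteSame stores a small pattern that dirties a large extent, and Discard stores no payload at all. A standalone sketch of that accounting, using plain structs rather than the librbd classes:

#include <cassert>
#include <cstdint>
#include <iostream>

// Stand-in for the entry classes: payload vs dirtied-extent accounting.
struct OpSketch {
  uint64_t extent_bytes;   // ram_entry.write_bytes (image extent)
  uint64_t payload_bytes;  // bytes actually stored in the cache buffer
  uint64_t write_bytes() const { return payload_bytes; }
  uint64_t bytes_dirty() const { return extent_bytes; }
};

int main() {
  OpSketch plain_write{4096, 4096};   // payload == dirtied extent
  OpSketch write_same{1 << 20, 512};  // 512-byte pattern dirties 1 MiB
  OpSketch discard{1 << 20, 0};       // no payload, dirties 1 MiB

  assert(plain_write.write_bytes() == plain_write.bytes_dirty());
  assert(write_same.write_bytes() < write_same.bytes_dirty());
  assert(discard.write_bytes() == 0 && discard.bytes_dirty() == (1 << 20));
  std::cout << "payload vs dirty accounting differs by op type\n";
}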
diff --git a/src/librbd/cache/pwl/LogMap.cc b/src/librbd/cache/pwl/LogMap.cc
new file mode 100644
index 000000000..a2e6d65eb
--- /dev/null
+++ b/src/librbd/cache/pwl/LogMap.cc
@@ -0,0 +1,278 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LogMap.h"
+#include "include/ceph_assert.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/pwl/LogEntry.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::LogMap: " << this << " " \
+ << __func__ << ": "
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ LogMapEntry<T> &e) {
+ os << "block_extent=" << e.block_extent << ", "
+ << "log_entry=[" << e.log_entry << "]";
+ return os;
+}
+
+template <typename T>
+LogMapEntry<T>::LogMapEntry(const BlockExtent block_extent,
+ std::shared_ptr<T> log_entry)
+ : block_extent(block_extent) , log_entry(log_entry) {
+}
+
+template <typename T>
+LogMapEntry<T>::LogMapEntry(std::shared_ptr<T> log_entry)
+ : block_extent(log_entry->block_extent()) , log_entry(log_entry) {
+}
+
+template <typename T>
+LogMap<T>::LogMap(CephContext *cct)
+ : m_cct(cct),
+ m_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::LogMap::m_lock", this))) {
+}
+
+/**
+ * Add a write log entry to the map. Subsequent queries for blocks
+ * within this log entry's extent will find this log entry. Portions
+ * of prior write log entries overlapping with this log entry will
+ * be replaced in the map by this log entry.
+ *
+ * The map_entries field of the log entry object will be updated to
+ * contain this map entry.
+ *
+ * The map_entries fields of all log entries overlapping with this
+ * entry will be updated to remove the regions that overlap with
+ * this.
+ */
+template <typename T>
+void LogMap<T>::add_log_entry(std::shared_ptr<T> log_entry) {
+ std::lock_guard locker(m_lock);
+ add_log_entry_locked(log_entry);
+}
+
+template <typename T>
+void LogMap<T>::add_log_entries(std::list<std::shared_ptr<T>> &log_entries) {
+ std::lock_guard locker(m_lock);
+ ldout(m_cct, 20) << dendl;
+ for (auto &log_entry : log_entries) {
+ add_log_entry_locked(log_entry);
+ }
+}
+
+/**
+ * Remove any map entries that refer to the supplied write log
+ * entry.
+ */
+template <typename T>
+void LogMap<T>::remove_log_entry(std::shared_ptr<T> log_entry) {
+ std::lock_guard locker(m_lock);
+ remove_log_entry_locked(log_entry);
+}
+
+template <typename T>
+void LogMap<T>::remove_log_entries(std::list<std::shared_ptr<T>> &log_entries) {
+ std::lock_guard locker(m_lock);
+ ldout(m_cct, 20) << dendl;
+ for (auto &log_entry : log_entries) {
+ remove_log_entry_locked(log_entry);
+ }
+}
+
+/**
+ * Returns the list of all write log entries that overlap the specified block
+ * extent. This doesn't tell you which portions of these entries overlap the
+ * extent, or each other. For that, use find_map_entries(). A log entry may
+ * appear in the list more than once, if multiple map entries refer to it
+ * (e.g. the middle of that write log entry has been overwritten).
+ */
+template <typename T>
+std::list<std::shared_ptr<T>> LogMap<T>::find_log_entries(BlockExtent block_extent) {
+ std::lock_guard locker(m_lock);
+ ldout(m_cct, 20) << dendl;
+ return find_log_entries_locked(block_extent);
+}
+
+/**
+ * Returns the list of all write log map entries that overlap the
+ * specified block extent.
+ */
+template <typename T>
+LogMapEntries<T> LogMap<T>::find_map_entries(BlockExtent block_extent) {
+ std::lock_guard locker(m_lock);
+ ldout(m_cct, 20) << dendl;
+ return find_map_entries_locked(block_extent);
+}
+
+template <typename T>
+void LogMap<T>::add_log_entry_locked(std::shared_ptr<T> log_entry) {
+ LogMapEntry<T> map_entry(log_entry);
+ ldout(m_cct, 20) << "block_extent=" << map_entry.block_extent
+ << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ LogMapEntries<T> overlap_entries = find_map_entries_locked(map_entry.block_extent);
+ for (auto &entry : overlap_entries) {
+ ldout(m_cct, 20) << entry << dendl;
+ if (map_entry.block_extent.block_start <= entry.block_extent.block_start) {
+ if (map_entry.block_extent.block_end >= entry.block_extent.block_end) {
+ ldout(m_cct, 20) << "map entry completely occluded by new log entry" << dendl;
+ remove_map_entry_locked(entry);
+ } else {
+ ceph_assert(map_entry.block_extent.block_end < entry.block_extent.block_end);
+ /* The new entry occludes the beginning of the old entry */
+ BlockExtent adjusted_extent(map_entry.block_extent.block_end,
+ entry.block_extent.block_end);
+ adjust_map_entry_locked(entry, adjusted_extent);
+ }
+ } else {
+ if (map_entry.block_extent.block_end >= entry.block_extent.block_end) {
+ /* The new entry occludes the end of the old entry */
+ BlockExtent adjusted_extent(entry.block_extent.block_start,
+ map_entry.block_extent.block_start);
+ adjust_map_entry_locked(entry, adjusted_extent);
+ } else {
+ /* The new entry splits the old entry */
+ split_map_entry_locked(entry, map_entry.block_extent);
+ }
+ }
+ }
+ add_map_entry_locked(map_entry);
+}
+
+template <typename T>
+void LogMap<T>::remove_log_entry_locked(std::shared_ptr<T> log_entry) {
+ ldout(m_cct, 20) << "*log_entry=" << *log_entry << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ LogMapEntries<T> possible_hits = find_map_entries_locked(log_entry->block_extent());
+ for (auto &possible_hit : possible_hits) {
+ if (possible_hit.log_entry == log_entry) {
+ /* This map entry refers to the specified log entry */
+ remove_map_entry_locked(possible_hit);
+ }
+ }
+}
+
+template <typename T>
+void LogMap<T>::add_map_entry_locked(LogMapEntry<T> &map_entry) {
+ ceph_assert(map_entry.log_entry);
+ m_block_to_log_entry_map.insert(map_entry);
+ map_entry.log_entry->inc_map_ref();
+}
+
+template <typename T>
+void LogMap<T>::remove_map_entry_locked(LogMapEntry<T> &map_entry) {
+ auto it = m_block_to_log_entry_map.find(map_entry);
+ ceph_assert(it != m_block_to_log_entry_map.end());
+
+ LogMapEntry<T> erased = *it;
+ m_block_to_log_entry_map.erase(it);
+ erased.log_entry->dec_map_ref();
+ if (0 == erased.log_entry->get_map_ref()) {
+ ldout(m_cct, 20) << "log entry has zero map entries: " << erased.log_entry << dendl;
+ }
+}
+
+template <typename T>
+void LogMap<T>::adjust_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &new_extent) {
+ auto it = m_block_to_log_entry_map.find(map_entry);
+ ceph_assert(it != m_block_to_log_entry_map.end());
+
+ LogMapEntry<T> adjusted = *it;
+ m_block_to_log_entry_map.erase(it);
+
+ m_block_to_log_entry_map.insert(LogMapEntry<T>(new_extent, adjusted.log_entry));
+}
+
+template <typename T>
+void LogMap<T>::split_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &removed_extent) {
+ auto it = m_block_to_log_entry_map.find(map_entry);
+ ceph_assert(it != m_block_to_log_entry_map.end());
+
+ LogMapEntry<T> split = *it;
+ m_block_to_log_entry_map.erase(it);
+
+ BlockExtent left_extent(split.block_extent.block_start,
+ removed_extent.block_start);
+ m_block_to_log_entry_map.insert(LogMapEntry<T>(left_extent, split.log_entry));
+
+ BlockExtent right_extent(removed_extent.block_end,
+ split.block_extent.block_end);
+ m_block_to_log_entry_map.insert(LogMapEntry<T>(right_extent, split.log_entry));
+
+ split.log_entry->inc_map_ref();
+}
+
+template <typename T>
+std::list<std::shared_ptr<T>> LogMap<T>::find_log_entries_locked(const BlockExtent &block_extent) {
+ std::list<std::shared_ptr<T>> overlaps;
+ ldout(m_cct, 20) << "block_extent=" << block_extent << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ LogMapEntries<T> map_entries = find_map_entries_locked(block_extent);
+ for (auto &map_entry : map_entries) {
+ overlaps.emplace_back(map_entry.log_entry);
+ }
+ return overlaps;
+}
+
+/**
+ * TODO: Generalize this to do some arbitrary thing to each map
+ * extent, instead of returning a list.
+ */
+template <typename T>
+LogMapEntries<T> LogMap<T>::find_map_entries_locked(const BlockExtent &block_extent) {
+ LogMapEntries<T> overlaps;
+
+ ldout(m_cct, 20) << "block_extent=" << block_extent << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ auto p = m_block_to_log_entry_map.equal_range(LogMapEntry<T>(block_extent));
+ ldout(m_cct, 20) << "count=" << std::distance(p.first, p.second) << dendl;
+  for (auto i = p.first; i != p.second; ++i) {
+ LogMapEntry<T> entry = *i;
+ overlaps.emplace_back(entry);
+ ldout(m_cct, 20) << entry << dendl;
+ }
+ return overlaps;
+}
+
+/* We map block extents to write log entries, or portions of write log
+ * entries. Both are represented by a WriteLogMapEntry. When a
+ * GenericWriteLogEntry is added to this map, a WriteLogMapEntry is created to
+ * represent the entire block extent of the GenericWriteLogEntry, and the
+ * WriteLogMapEntry is added to the set.
+ *
+ * The set must not contain overlapping WriteLogMapEntries. WriteLogMapEntries
+ * in the set that overlap with one being added are adjusted (shrunk, split,
+ * or removed) before the new entry is added.
+ *
+ * Under this ordering, two entries whose extents overlap compare as
+ * equivalent (neither is less than the other). That would be ambiguous in
+ * general, but is safe here because the set never contains overlapping
+ * entries. It also means equal_range() on a query extent returns exactly the
+ * entries that overlap it: from the first entry whose extent doesn't end
+ * before the query starts, through the last entry whose extent starts before
+ * the query ends.
+ */
+template <typename T>
+bool LogMap<T>::LogMapEntryCompare::operator()(const LogMapEntry<T> &lhs,
+                                               const LogMapEntry<T> &rhs) const {
+  return lhs.block_extent.block_end <= rhs.block_extent.block_start;
+}
+
+} //namespace pwl
+} //namespace cache
+} //namespace librbd
+
+template class librbd::cache::pwl::LogMap<librbd::cache::pwl::GenericWriteLogEntry>;
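
The comparator trick documented above can be demonstrated in isolation: ordering extents by "ends at or before the other starts" makes overlapping extents compare equivalent, so std::set::equal_range() returns exactly the overlapping entries. As with the real map, this is only well-defined because the set never holds mutually overlapping extents. A standalone sketch:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <set>

// Half-open extent [start, end), ordered so that overlap == equivalence.
struct Extent {
  uint64_t start, end;
};
struct EndsBeforeStarts {
  bool operator()(const Extent& l, const Extent& r) const {
    return l.end <= r.start;
  }
};

int main() {
  std::set<Extent, EndsBeforeStarts> map;
  map.insert({0, 4});    // three non-overlapping entries,
  map.insert({4, 8});    // as the real map guarantees
  map.insert({10, 12});

  // The query [3, 11) overlaps all three entries.
  auto [first, last] = map.equal_range(Extent{3, 11});
  assert(std::distance(first, last) == 3);
  for (auto it = first; it != last; ++it)
    std::cout << "overlap: [" << it->start << "," << it->end << ")\n";
}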
diff --git a/src/librbd/cache/pwl/LogMap.h b/src/librbd/cache/pwl/LogMap.h
new file mode 100644
index 000000000..a05307896
--- /dev/null
+++ b/src/librbd/cache/pwl/LogMap.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
+#define CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
+
+#include "librbd/BlockGuard.h"
+#include <list>
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+/**
+ * WriteLogMap: maps block extents to GenericWriteLogEntries
+ *
+ * A WriteLogMapEntry (based on LogMapEntry) refers to a portion of a GenericWriteLogEntry
+ */
+template <typename T>
+class LogMapEntry {
+public:
+ BlockExtent block_extent;
+ std::shared_ptr<T> log_entry;
+
+ LogMapEntry(BlockExtent block_extent,
+ std::shared_ptr<T> log_entry = nullptr);
+ LogMapEntry(std::shared_ptr<T> log_entry);
+
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ LogMapEntry<U> &e);
+};
+
+template <typename T>
+using LogMapEntries = std::list<LogMapEntry<T>>;
+
+template <typename T>
+class LogMap {
+public:
+ LogMap(CephContext *cct);
+ LogMap(const LogMap&) = delete;
+ LogMap &operator=(const LogMap&) = delete;
+
+ void add_log_entry(std::shared_ptr<T> log_entry);
+ void add_log_entries(std::list<std::shared_ptr<T>> &log_entries);
+ void remove_log_entry(std::shared_ptr<T> log_entry);
+ void remove_log_entries(std::list<std::shared_ptr<T>> &log_entries);
+ std::list<std::shared_ptr<T>> find_log_entries(BlockExtent block_extent);
+ LogMapEntries<T> find_map_entries(BlockExtent block_extent);
+
+private:
+ void add_log_entry_locked(std::shared_ptr<T> log_entry);
+ void remove_log_entry_locked(std::shared_ptr<T> log_entry);
+ void add_map_entry_locked(LogMapEntry<T> &map_entry);
+ void remove_map_entry_locked(LogMapEntry<T> &map_entry);
+ void adjust_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &new_extent);
+ void split_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &removed_extent);
+ std::list<std::shared_ptr<T>> find_log_entries_locked(const BlockExtent &block_extent);
+ LogMapEntries<T> find_map_entries_locked(const BlockExtent &block_extent);
+
+ using LogMapEntryT = LogMapEntry<T>;
+
+ class LogMapEntryCompare {
+ public:
+ bool operator()(const LogMapEntryT &lhs,
+ const LogMapEntryT &rhs) const;
+ };
+
+ using BlockExtentToLogMapEntries = std::set<LogMapEntryT,
+ LogMapEntryCompare>;
+
+ CephContext *m_cct;
+ ceph::mutex m_lock;
+ BlockExtentToLogMapEntries m_block_to_log_entry_map;
+};
+
+} //namespace pwl
+} //namespace cache
+} //namespace librbd
+
+#endif //CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
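
The occlusion handling in add_log_entry_locked() (LogMap.cc above) has four overlap cases per existing entry: fully occluded, front trimmed, back trimmed, or split in two. A standalone sketch of those cases over plain extents, with a vector standing in for the real set-based map:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Half-open extent [start, end); names are illustrative, not librbd API.
struct Ext { uint64_t start, end; };

std::vector<Ext> add_extent(std::vector<Ext> map, Ext nw) {
  std::vector<Ext> out;
  for (const Ext& e : map) {
    if (e.end <= nw.start || nw.end <= e.start) {  // no overlap: keep as-is
      out.push_back(e);
    } else if (nw.start <= e.start && nw.end >= e.end) {
      // fully occluded: drop e
    } else if (nw.start <= e.start) {
      out.push_back({nw.end, e.end});              // new occludes e's front
    } else if (nw.end >= e.end) {
      out.push_back({e.start, nw.start});          // new occludes e's back
    } else {
      out.push_back({e.start, nw.start});          // new splits e in two
      out.push_back({nw.end, e.end});
    }
  }
  out.push_back(nw);                               // newest entry always wins
  return out;
}

int main() {
  auto map = add_extent({}, {0, 16});
  map = add_extent(map, {4, 8});   // splits [0,16) into [0,4) and [8,16)
  assert(map.size() == 3);
  for (auto& e : map)
    std::cout << "[" << e.start << "," << e.end << ") ";
  std::cout << "\n";
}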
diff --git a/src/librbd/cache/pwl/LogOperation.cc b/src/librbd/cache/pwl/LogOperation.cc
new file mode 100644
index 000000000..4fc13a91a
--- /dev/null
+++ b/src/librbd/cache/pwl/LogOperation.cc
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "LogOperation.h"
+#include "librbd/cache/pwl/Types.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::LogOperation: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+GenericLogOperation::GenericLogOperation(utime_t dispatch_time,
+ PerfCounters *perfcounter)
+ : m_perfcounter(perfcounter), dispatch_time(dispatch_time) {
+}
+
+std::ostream& GenericLogOperation::format(std::ostream &os) const {
+ os << "dispatch_time=[" << dispatch_time << "], "
+ << "buf_persist_start_time=[" << buf_persist_start_time << "], "
+ << "buf_persist_comp_time=[" << buf_persist_comp_time << "], "
+ << "log_append_start_time=[" << log_append_start_time << "], "
+ << "log_append_comp_time=[" << log_append_comp_time << "], ";
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const GenericLogOperation &op) {
+ return op.format(os);
+}
+
+SyncPointLogOperation::SyncPointLogOperation(ceph::mutex &lock,
+ std::shared_ptr<SyncPoint> sync_point,
+ utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct)
+ : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock),
+ sync_point(sync_point) {
+}
+
+SyncPointLogOperation::~SyncPointLogOperation() { }
+
+std::ostream &SyncPointLogOperation::format(std::ostream &os) const {
+ os << "(Sync Point) ";
+ GenericLogOperation::format(os);
+ os << ", "
+ << "sync_point=[" << *sync_point << "]";
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const SyncPointLogOperation &op) {
+ return op.format(os);
+}
+
+std::vector<Context*> SyncPointLogOperation::append_sync_point() {
+ std::vector<Context*> appending_contexts;
+ std::lock_guard locker(m_lock);
+ if (!sync_point->appending) {
+ sync_point->appending = true;
+ }
+ appending_contexts.swap(sync_point->on_sync_point_appending);
+ return appending_contexts;
+}
+
+void SyncPointLogOperation::clear_earlier_sync_point() {
+ std::lock_guard locker(m_lock);
+ ceph_assert(sync_point->later_sync_point);
+ ceph_assert(sync_point->later_sync_point->earlier_sync_point == sync_point);
+ sync_point->later_sync_point->earlier_sync_point = nullptr;
+ sync_point->later_sync_point = nullptr;
+}
+
+std::vector<Context*> SyncPointLogOperation::swap_on_sync_point_persisted() {
+ std::lock_guard locker(m_lock);
+ std::vector<Context*> persisted_contexts;
+ persisted_contexts.swap(sync_point->on_sync_point_persisted);
+ return persisted_contexts;
+}
+
+void SyncPointLogOperation::appending() {
+ ceph_assert(sync_point);
+ ldout(m_cct, 20) << "Sync point op=[" << *this
+ << "] appending" << dendl;
+ auto appending_contexts = append_sync_point();
+ for (auto &ctx : appending_contexts) {
+ ctx->complete(0);
+ }
+}
+
+void SyncPointLogOperation::complete(int result) {
+ ceph_assert(sync_point);
+ ldout(m_cct, 20) << "Sync point op =[" << *this
+ << "] completed" << dendl;
+ clear_earlier_sync_point();
+
+ /* Do append now in case completion occurred before the
+ * normal append callback executed, and to handle
+ * on_append work that was queued after the sync point
+ * entered the appending state. */
+ appending();
+ auto persisted_contexts = swap_on_sync_point_persisted();
+ for (auto &ctx : persisted_contexts) {
+ ctx->complete(result);
+ }
+}
+
+GenericWriteLogOperation::GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
+ utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct)
+ : GenericLogOperation(dispatch_time, perfcounter),
+ m_lock(ceph::make_mutex(pwl::unique_lock_name(
+ "librbd::cache::pwl::GenericWriteLogOperation::m_lock", this))),
+ m_cct(cct),
+ sync_point(sync_point) {
+}
+
+GenericWriteLogOperation::~GenericWriteLogOperation() { }
+
+std::ostream &GenericWriteLogOperation::format(std::ostream &os) const {
+ GenericLogOperation::format(os);
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const GenericWriteLogOperation &op) {
+ return op.format(os);
+}
+
+/* Called when the write log operation is appending and its log position is guaranteed */
+void GenericWriteLogOperation::appending() {
+ Context *on_append = nullptr;
+ ldout(m_cct, 20) << __func__ << " " << this << dendl;
+ {
+ std::lock_guard locker(m_lock);
+ on_append = on_write_append;
+ on_write_append = nullptr;
+ }
+ if (on_append) {
+ ldout(m_cct, 20) << __func__ << " " << this << " on_append=" << on_append << dendl;
+ on_append->complete(0);
+ }
+}
+
+/* Called when the write log operation is completed in all log replicas */
+void GenericWriteLogOperation::complete(int result) {
+ appending();
+ Context *on_persist = nullptr;
+ ldout(m_cct, 20) << __func__ << " " << this << dendl;
+ {
+ std::lock_guard locker(m_lock);
+ on_persist = on_write_persist;
+ on_write_persist = nullptr;
+ }
+ if (on_persist) {
+ ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist
+ << dendl;
+ on_persist->complete(result);
+ }
+}
+
+WriteLogOperation::WriteLogOperation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<WriteLogEntry> write_log_entry)
+ : GenericWriteLogOperation(set.sync_point, set.dispatch_time,
+ set.perfcounter, cct),
+ log_entry(write_log_entry) {
+ on_write_append = set.extent_ops_appending->new_sub();
+ on_write_persist = set.extent_ops_persist->new_sub();
+ log_entry->sync_point_entry->writes++;
+ log_entry->sync_point_entry->bytes += write_bytes;
+}
+
+WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes,
+ uint32_t data_len,
+ CephContext *cct,
+ std::shared_ptr<WriteLogEntry> writesame_log_entry)
+ : WriteLogOperation(set, image_offset_bytes, write_bytes, cct,
+ writesame_log_entry) {
+ is_writesame = true;
+}
+
+WriteLogOperation::~WriteLogOperation() { }
+
+void WriteLogOperation::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation,
+ uint64_t current_sync_gen,
+ uint64_t last_op_sequence_num,
+ bufferlist &write_req_bl, uint64_t buffer_offset,
+ bool persist_on_flush) {
+ log_entry->init(has_data, current_sync_gen, last_op_sequence_num,
+ persist_on_flush);
+ buffer_alloc = &(*allocation);
+ bl.substr_of(write_req_bl, buffer_offset, log_entry->write_bytes());
+ log_entry->init_cache_bl(write_req_bl, buffer_offset,
+ log_entry->write_bytes());
+}
+
+std::ostream &WriteLogOperation::format(std::ostream &os) const {
+ std::string op_name = is_writesame ? "(Write Same) " : "(Write) ";
+ os << op_name;
+ GenericWriteLogOperation::format(os);
+ os << ", ";
+ if (log_entry) {
+ os << "log_entry=[" << *log_entry << "], ";
+ } else {
+ os << "log_entry=nullptr, ";
+ }
+ os << "bl=[" << bl << "],"
+ << "buffer_alloc=" << buffer_alloc;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const WriteLogOperation &op) {
+ return op.format(os);
+}
+
+
+void WriteLogOperation::complete(int result) {
+ GenericWriteLogOperation::complete(result);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_buf_t,
+ buf_persist_start_time - dispatch_time);
+ utime_t buf_persist_lat = buf_persist_comp_time - buf_persist_start_time;
+ m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_bufc_t, buf_persist_lat);
+ m_perfcounter->hinc(l_librbd_pwl_log_op_buf_to_bufc_t_hist,
+ buf_persist_lat.to_nsec(),
+ log_entry->ram_entry.write_bytes);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_app_t,
+ log_append_start_time - buf_persist_start_time);
+}
+
+WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
+ bool persist_on_flush, CephContext *cct, Context *on_finish)
+ : m_cct(cct), m_on_finish(on_finish),
+ persist_on_flush(persist_on_flush),
+ dispatch_time(dispatched),
+ perfcounter(perfcounter),
+ sync_point(sync_point) {
+ on_ops_appending = sync_point->prior_persisted_gather_new_sub();
+ on_ops_persist = nullptr;
+ extent_ops_persist =
+ new C_Gather(m_cct,
+ new LambdaContext( [this](int r) {
+ ldout(this->m_cct, 20) << __func__ << " " << this << " m_extent_ops_persist completed" << dendl;
+ if (on_ops_persist) {
+ on_ops_persist->complete(r);
+ }
+ m_on_finish->complete(r);
+ }));
+ auto appending_persist_sub = extent_ops_persist->new_sub();
+ extent_ops_appending =
+ new C_Gather(m_cct,
+ new LambdaContext( [this, appending_persist_sub](int r) {
+ ldout(this->m_cct, 20) << __func__ << " " << this << " m_extent_ops_appending completed" << dendl;
+ on_ops_appending->complete(r);
+ appending_persist_sub->complete(r);
+ }));
+}
+
+WriteLogOperationSet::~WriteLogOperationSet() { }
+
+std::ostream &operator<<(std::ostream &os,
+ const WriteLogOperationSet &s) {
+ os << "cell=" << (void*)s.cell << ", "
+ << "extent_ops_appending=[" << s.extent_ops_appending << ", "
+ << "extent_ops_persist=[" << s.extent_ops_persist << "]";
+ return os;
+}
+
+DiscardLogOperation::DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes,
+ uint32_t discard_granularity_bytes,
+ utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct)
+ : GenericWriteLogOperation(sync_point, dispatch_time, perfcounter, cct),
+ log_entry(std::make_shared<DiscardLogEntry>(sync_point->log_entry,
+ image_offset_bytes,
+ write_bytes,
+ discard_granularity_bytes)) {
+ on_write_persist = nullptr;
+ log_entry->sync_point_entry->writes++;
+ log_entry->sync_point_entry->bytes += write_bytes;
+}
+
+DiscardLogOperation::~DiscardLogOperation() { }
+
+std::ostream &DiscardLogOperation::format(std::ostream &os) const {
+ os << "(Discard) ";
+ GenericWriteLogOperation::format(os);
+ os << ", ";
+ if (log_entry) {
+ os << "log_entry=[" << *log_entry << "], ";
+ } else {
+ os << "log_entry=nullptr, ";
+ }
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const DiscardLogOperation &op) {
+ return op.format(os);
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
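
append_sync_point() and swap_on_sync_point_persisted() above both drain a waiter list by swapping it out under the lock and completing the contexts afterwards, so a callback can never re-enter the same mutex. A stripped-down sketch of that pattern, with std::function callbacks in place of Ceph Context objects and all names illustrative:

#include <functional>
#include <iostream>
#include <mutex>
#include <vector>

class Waiters {
  std::mutex m_lock;
  std::vector<std::function<void(int)>> m_waiters;

public:
  void add(std::function<void(int)> cb) {
    std::lock_guard<std::mutex> locker(m_lock);
    m_waiters.push_back(std::move(cb));
  }

  // Take ownership of the pending callbacks while holding the lock,
  // then invoke them with the lock released, so a callback that adds
  // new waiters (or takes the same lock) cannot deadlock.
  void complete_all(int r) {
    std::vector<std::function<void(int)>> drained;
    {
      std::lock_guard<std::mutex> locker(m_lock);
      drained.swap(m_waiters);
    }
    for (auto &cb : drained) {
      cb(r);
    }
  }
};

int main() {
  Waiters on_persisted;
  on_persisted.add([](int r) { std::cout << "waiter completed, r=" << r << "\n"; });
  on_persisted.complete_all(0);
  return 0;
}
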
diff --git a/src/librbd/cache/pwl/LogOperation.h b/src/librbd/cache/pwl/LogOperation.h
new file mode 100644
index 000000000..15befe05f
--- /dev/null
+++ b/src/librbd/cache/pwl/LogOperation.h
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H
+#define CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H
+
+#include "include/utime.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/SyncPoint.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+struct WriteBufferAllocation;
+
+class WriteLogOperationSet;
+
+class WriteLogOperation;
+
+class GenericWriteLogOperation;
+
+class SyncPointLogOperation;
+
+class GenericLogOperation;
+
+template <typename T>
+class AbstractWriteLog;
+
+using GenericLogOperationSharedPtr = std::shared_ptr<GenericLogOperation>;
+
+using GenericLogOperationsVector = std::vector<GenericLogOperationSharedPtr>;
+
+class GenericLogOperation {
+protected:
+ PerfCounters *m_perfcounter = nullptr;
+public:
+ utime_t dispatch_time; // When op created
+ utime_t buf_persist_start_time; // When buffer persist begins
+ utime_t buf_persist_comp_time; // When buffer persist completes
+ utime_t log_append_start_time; // When log append begins
+ utime_t log_append_comp_time; // When log append completes
+ GenericLogOperation(utime_t dispatch_time, PerfCounters *perfcounter);
+ virtual ~GenericLogOperation() { };
+ GenericLogOperation(const GenericLogOperation&) = delete;
+ GenericLogOperation &operator=(const GenericLogOperation&) = delete;
+ virtual std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const GenericLogOperation &op);
+ virtual const std::shared_ptr<GenericLogEntry> get_log_entry() = 0;
+ virtual void appending() = 0;
+ virtual void complete(int r) = 0;
+ virtual void mark_log_entry_completed() {};
+ virtual bool reserved_allocated() const {
+ return false;
+ }
+ virtual bool is_writing_op() const {
+ return false;
+ }
+ virtual void init_op(uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num, Context *write_persist,
+ Context *write_append) {};
+ virtual void copy_bl_to_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) {};
+};
+
+class SyncPointLogOperation : public GenericLogOperation {
+private:
+ CephContext *m_cct;
+ ceph::mutex &m_lock;
+ std::vector<Context*> append_sync_point();
+ void clear_earlier_sync_point();
+ std::vector<Context*> swap_on_sync_point_persisted();
+public:
+ std::shared_ptr<SyncPoint> sync_point;
+ SyncPointLogOperation(ceph::mutex &lock,
+ std::shared_ptr<SyncPoint> sync_point,
+ utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct);
+ ~SyncPointLogOperation() override;
+ SyncPointLogOperation(const SyncPointLogOperation&) = delete;
+ SyncPointLogOperation &operator=(const SyncPointLogOperation&) = delete;
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const SyncPointLogOperation &op);
+ const std::shared_ptr<GenericLogEntry> get_log_entry() override {
+ return sync_point->log_entry;
+ }
+ void appending() override;
+ void complete(int r) override;
+};
+
+class GenericWriteLogOperation : public GenericLogOperation {
+protected:
+ ceph::mutex m_lock;
+ CephContext *m_cct;
+public:
+ std::shared_ptr<SyncPoint> sync_point;
+ Context *on_write_append = nullptr; /* Completion for things waiting on this
+ * write's position in the log to be
+ * guaranteed */
+ Context *on_write_persist = nullptr; /* Completion for things waiting on this
+ * write to persist */
+ GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
+ utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct);
+ ~GenericWriteLogOperation() override;
+ GenericWriteLogOperation(const GenericWriteLogOperation&) = delete;
+ GenericWriteLogOperation &operator=(const GenericWriteLogOperation&) = delete;
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const GenericWriteLogOperation &op);
+ void mark_log_entry_completed() override {
+ sync_point->log_entry->writes_completed++;
+ }
+ bool reserved_allocated() const override {
+ return true;
+ }
+ bool is_writing_op() const override {
+ return true;
+ }
+ void appending() override;
+ void complete(int r) override;
+};
+
+class WriteLogOperation : public GenericWriteLogOperation {
+public:
+ using GenericWriteLogOperation::m_lock;
+ using GenericWriteLogOperation::sync_point;
+ using GenericWriteLogOperation::on_write_append;
+ using GenericWriteLogOperation::on_write_persist;
+ std::shared_ptr<WriteLogEntry> log_entry;
+ bufferlist bl;
+ bool is_writesame = false;
+ WriteBufferAllocation *buffer_alloc = nullptr;
+ WriteLogOperation(WriteLogOperationSet &set,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<WriteLogEntry> write_log_entry);
+ WriteLogOperation(WriteLogOperationSet &set,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len,
+ CephContext *cct,
+ std::shared_ptr<WriteLogEntry> writesame_log_entry);
+ ~WriteLogOperation() override;
+ WriteLogOperation(const WriteLogOperation&) = delete;
+ WriteLogOperation &operator=(const WriteLogOperation&) = delete;
+ void init(bool has_data,
+ std::vector<WriteBufferAllocation>::iterator allocation,
+ uint64_t current_sync_gen, uint64_t last_op_sequence_num,
+ bufferlist &write_req_bl, uint64_t buffer_offset,
+ bool persist_on_flush);
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const WriteLogOperation &op);
+ const std::shared_ptr<GenericLogEntry> get_log_entry() override {
+ return log_entry;
+ }
+
+ void complete(int r) override;
+};
+
+
+class WriteLogOperationSet {
+private:
+ CephContext *m_cct;
+ Context *m_on_finish;
+public:
+ bool persist_on_flush;
+ BlockGuardCell *cell;
+ C_Gather *extent_ops_appending;
+ Context *on_ops_appending;
+ C_Gather *extent_ops_persist;
+ Context *on_ops_persist;
+ GenericLogOperationsVector operations;
+ utime_t dispatch_time; /* When set created */
+ PerfCounters *perfcounter = nullptr;
+ std::shared_ptr<SyncPoint> sync_point;
+ WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter,
+ std::shared_ptr<SyncPoint> sync_point,
+ const bool persist_on_flush, CephContext *cct,
+ Context *on_finish);
+ ~WriteLogOperationSet();
+ WriteLogOperationSet(const WriteLogOperationSet&) = delete;
+ WriteLogOperationSet &operator=(const WriteLogOperationSet&) = delete;
+ friend std::ostream &operator<<(std::ostream &os,
+ const WriteLogOperationSet &s);
+};
+
+class DiscardLogOperation : public GenericWriteLogOperation {
+public:
+ using GenericWriteLogOperation::m_lock;
+ using GenericWriteLogOperation::sync_point;
+ using GenericWriteLogOperation::on_write_append;
+ using GenericWriteLogOperation::on_write_persist;
+ std::shared_ptr<DiscardLogEntry> log_entry;
+ DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes,
+ uint32_t discard_granularity_bytes,
+ utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct);
+ ~DiscardLogOperation() override;
+ DiscardLogOperation(const DiscardLogOperation&) = delete;
+ DiscardLogOperation &operator=(const DiscardLogOperation&) = delete;
+ const std::shared_ptr<GenericLogEntry> get_log_entry() override {
+ return log_entry;
+ }
+ bool reserved_allocated() const override {
+ return false;
+ }
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const DiscardLogOperation &op);
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H
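
WriteLogOperationSet chains two C_Gather barriers: the finisher of extent_ops_appending completes one sub of extent_ops_persist, so the persist finisher can only run after every extent operation has both appended and persisted. A toy stand-in for that counting idea (this is not the Ceph C_Gather API), showing how the activate() token and the new_sub()/complete() pairs produce the ordering:

#include <atomic>
#include <functional>
#include <iostream>

class Gather {
  std::atomic<int> m_subs{1};  // the initial count is the activate() token
  std::function<void(int)> m_finisher;

  void put(int r) {
    if (--m_subs == 0 && m_finisher) {
      m_finisher(r);
    }
  }

public:
  explicit Gather(std::function<void(int)> finisher)
    : m_finisher(std::move(finisher)) {}

  std::function<void(int)> new_sub() {
    ++m_subs;
    return [this](int r) { put(r); };
  }

  void activate() { put(0); }  // drop the activate token; may fire finisher
};

int main() {
  Gather persist([](int) { std::cout << "all extent ops persisted\n"; });
  auto appending_persist_sub = persist.new_sub();

  // The appending finisher completes one sub of the persist gather,
  // mirroring what extent_ops_appending does in WriteLogOperationSet.
  Gather appending([appending_persist_sub](int r) {
    std::cout << "all extent ops appending\n";
    appending_persist_sub(r);
  });

  auto append_done = appending.new_sub();   // one extent op
  auto persist_done = persist.new_sub();    // same extent op
  appending.activate();
  persist.activate();

  append_done(0);   // -> "all extent ops appending"
  persist_done(0);  // -> "all extent ops persisted"
  return 0;
}
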
diff --git a/src/librbd/cache/pwl/ReadRequest.h b/src/librbd/cache/pwl/ReadRequest.h
new file mode 100644
index 000000000..d4b2aee5b
--- /dev/null
+++ b/src/librbd/cache/pwl/ReadRequest.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H
+#define CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H
+
+#include "include/Context.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+typedef std::vector<std::shared_ptr<pwl::ImageExtentBuf>> ImageExtentBufs;
+
+class C_ReadRequest : public Context {
+public:
+ io::Extents miss_extents; // move back to caller
+ ImageExtentBufs read_extents;
+ bufferlist miss_bl;
+
+ C_ReadRequest(
+ CephContext *cct, utime_t arrived, PerfCounters *perfcounter,
+ bufferlist *out_bl, Context *on_finish)
+ : m_cct(cct), m_on_finish(on_finish), m_out_bl(out_bl),
+ m_arrived_time(arrived), m_perfcounter(perfcounter) {}
+ ~C_ReadRequest() {}
+
+ const char *get_name() const {
+ return "C_ReadRequest";
+ }
+
+protected:
+ CephContext *m_cct;
+ Context *m_on_finish;
+ bufferlist *m_out_bl;
+ utime_t m_arrived_time;
+ PerfCounters *m_perfcounter;
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H
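
C_ReadRequest carries both the extents satisfied from the cache (read_extents) and those that were not (miss_extents, read into miss_bl); its completion stitches the pieces back together in image-offset order to fill m_out_bl. A minimal sketch of that assembly step, with std::string standing in for bufferlist and all types hypothetical:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ExtentBuf {
  uint64_t offset;
  std::string data;  // stand-in for ceph::bufferlist
};

std::string assemble_read(std::vector<ExtentBuf> hits,
                          std::vector<ExtentBuf> misses) {
  // Order the fragments by image offset regardless of where they came from.
  std::map<uint64_t, std::string> by_offset;
  for (auto &h : hits)   by_offset[h.offset] = std::move(h.data);
  for (auto &m : misses) by_offset[m.offset] = std::move(m.data);

  std::string out;
  for (auto &kv : by_offset) {
    out += kv.second;
  }
  return out;
}

int main() {
  // Cache satisfied [0,4) and [8,12); the middle extent missed.
  std::string out = assemble_read({{0, "AAAA"}, {8, "CCCC"}},
                                  {{4, "BBBB"}});
  std::cout << out << "\n";  // AAAABBBBCCCC
  return 0;
}
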
diff --git a/src/librbd/cache/pwl/Request.cc b/src/librbd/cache/pwl/Request.cc
new file mode 100644
index 000000000..8159b121a
--- /dev/null
+++ b/src/librbd/cache/pwl/Request.cc
@@ -0,0 +1,561 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::Request: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+template <typename T>
+C_BlockIORequest<T>::C_BlockIORequest(T &pwl, const utime_t arrived, io::Extents &&extents,
+ bufferlist&& bl, const int fadvise_flags, Context *user_req)
+ : pwl(pwl), image_extents(std::move(extents)),
+ bl(std::move(bl)), fadvise_flags(fadvise_flags),
+ user_req(user_req), image_extents_summary(image_extents), m_arrived_time(arrived) {
+ ldout(pwl.get_context(), 99) << this << dendl;
+}
+
+template <typename T>
+C_BlockIORequest<T>::~C_BlockIORequest() {
+ ldout(pwl.get_context(), 99) << this << dendl;
+ ceph_assert(m_cell_released || !m_cell);
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_BlockIORequest<T> &req) {
+ os << "image_extents=[" << req.image_extents << "], "
+ << "image_extents_summary=[" << req.image_extents_summary << "], "
+ << "bl=" << req.bl << ", "
+ << "user_req=" << req.user_req << ", "
+ << "m_user_req_completed=" << req.m_user_req_completed << ", "
+ << "m_deferred=" << req.m_deferred << ", "
+ << "detained=" << req.detained << ", "
+ << "waited_lanes=" << req.waited_lanes << ", "
+ << "waited_entries=" << req.waited_entries << ", "
+ << "waited_buffers=" << req.waited_buffers << "";
+ return os;
+}
+
+template <typename T>
+void C_BlockIORequest<T>::set_cell(BlockGuardCell *cell) {
+ ldout(pwl.get_context(), 20) << this << " cell=" << cell << dendl;
+ ceph_assert(cell);
+ ceph_assert(!m_cell);
+ m_cell = cell;
+}
+
+template <typename T>
+BlockGuardCell *C_BlockIORequest<T>::get_cell(void) {
+ ldout(pwl.get_context(), 20) << this << " cell=" << m_cell << dendl;
+ return m_cell;
+}
+
+template <typename T>
+void C_BlockIORequest<T>::release_cell() {
+ ldout(pwl.get_context(), 20) << this << " cell=" << m_cell << dendl;
+ ceph_assert(m_cell);
+ bool initial = false;
+ if (m_cell_released.compare_exchange_strong(initial, true)) {
+ pwl.release_guarded_request(m_cell);
+ } else {
+ ldout(pwl.get_context(), 5) << "cell " << m_cell << " already released for " << this << dendl;
+ }
+}
+
+template <typename T>
+void C_BlockIORequest<T>::complete_user_request(int r) {
+ bool initial = false;
+ if (m_user_req_completed.compare_exchange_strong(initial, true)) {
+ ldout(pwl.get_context(), 15) << this << " completing user req" << dendl;
+ m_user_req_completed_time = ceph_clock_now();
+ pwl.complete_user_request(user_req, r);
+ } else {
+ ldout(pwl.get_context(), 20) << this << " user req already completed" << dendl;
+ }
+}
+
+template <typename T>
+void C_BlockIORequest<T>::finish(int r) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+
+ complete_user_request(r);
+ bool initial = false;
+ if (m_finish_called.compare_exchange_strong(initial, true)) {
+ ldout(pwl.get_context(), 15) << this << " finishing" << dendl;
+ finish_req(0);
+ } else {
+ ldout(pwl.get_context(), 20) << this << " already finished" << dendl;
+ ceph_assert(0);
+ }
+}
+
+template <typename T>
+void C_BlockIORequest<T>::deferred() {
+ bool initial = false;
+ if (m_deferred.compare_exchange_strong(initial, true)) {
+ deferred_handler();
+ }
+}
+
+template <typename T>
+C_WriteRequest<T>::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req),
+ m_perfcounter(perfcounter), m_lock(lock) {
+ ldout(pwl.get_context(), 99) << this << dendl;
+}
+
+template <typename T>
+C_WriteRequest<T>::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req)
+ : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req),
+ mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)),
+ m_perfcounter(perfcounter), m_lock(lock) {
+ is_comp_and_write = true;
+ ldout(pwl.get_context(), 20) << dendl;
+}
+
+template <typename T>
+C_WriteRequest<T>::~C_WriteRequest() {
+ ldout(pwl.get_context(), 99) << this << dendl;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_WriteRequest<T> &req) {
+ os << (C_BlockIORequest<T>&)req
+ << " m_resources.allocated=" << req.m_resources.allocated;
+ if (req.op_set) {
+ os << "op_set=" << *req.op_set;
+ }
+ return os;
+}
+
+template <typename T>
+void C_WriteRequest<T>::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) {
+ ldout(pwl.get_context(), 20) << __func__ << " write_req=" << this << " cell=" << guard_ctx.cell << dendl;
+
+ ceph_assert(guard_ctx.cell);
+ this->detained = guard_ctx.state.detained; /* overlapped */
+ this->m_queued = guard_ctx.state.queued; /* queued behind at least one barrier */
+ this->set_cell(guard_ctx.cell);
+}
+
+template <typename T>
+void C_WriteRequest<T>::finish_req(int r) {
+ ldout(pwl.get_context(), 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl;
+
+ /* Completed to caller by here (in finish(), which calls this) */
+ utime_t now = ceph_clock_now();
+ if (is_comp_and_write && !compare_succeeded) {
+ update_req_stats(now);
+ return;
+ }
+ pwl.release_write_lanes(this);
+ ceph_assert(m_resources.allocated);
+ m_resources.allocated = false;
+ this->release_cell(); /* TODO: Consider doing this in appending state */
+ update_req_stats(now);
+}
+
+template <typename T>
+std::shared_ptr<WriteLogOperation> C_WriteRequest<T>::create_operation(
+ uint64_t offset, uint64_t len) {
+ return pwl.m_builder->create_write_log_operation(
+ *op_set, offset, len, pwl.get_context(),
+ pwl.m_builder->create_write_log_entry(op_set->sync_point->log_entry, offset, len));
+}
+
+template <typename T>
+void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
+ GenericWriteLogEntries log_entries;
+ {
+ std::lock_guard locker(m_lock);
+ std::shared_ptr<SyncPoint> current_sync_point = pwl.get_current_sync_point();
+ if ((!pwl.get_persist_on_flush() && current_sync_point->log_entry->writes_completed) ||
+ (current_sync_point->log_entry->writes > MAX_WRITES_PER_SYNC_POINT) ||
+ (current_sync_point->log_entry->bytes > MAX_BYTES_PER_SYNC_POINT)) {
+ /* Create new sync point and persist the previous one. This sequenced
+ * write will bear a sync gen number shared with no already completed
+ * writes. A group of sequenced writes may be safely flushed concurrently
+ * if they all arrived before any of them completed. We'll insert one on
+ * an aio_flush() from the application. Here we're inserting one to cap
+ * the number of bytes and writes per sync point. When the application is
+ * not issuing flushes, we insert sync points to record some observed
+ * write concurrency information that enables us to safely issue >1 flush
+ * write (for writes observed here to have been in flight simultaneously)
+ * at a time in persist-on-write mode.
+ */
+ pwl.flush_new_sync_point(nullptr, on_exit);
+ current_sync_point = pwl.get_current_sync_point();
+ }
+ uint64_t current_sync_gen = pwl.get_current_sync_gen();
+ op_set =
+ std::make_unique<WriteLogOperationSet>(this->m_dispatched_time,
+ m_perfcounter,
+ current_sync_point,
+ pwl.get_persist_on_flush(),
+ pwl.get_context(), this);
+ ldout(pwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get() << dendl;
+ ceph_assert(m_resources.allocated);
+ /* op_set->operations initialized differently for plain write or write same */
+ auto allocation = m_resources.buffers.begin();
+ uint64_t buffer_offset = 0;
+ for (auto &extent : this->image_extents) {
+ /* operation->on_write_persist connected to m_prior_log_entries_persisted Gather */
+ auto operation = this->create_operation(extent.first, extent.second);
+ this->op_set->operations.emplace_back(operation);
+
+ /* A WS is also a write */
+ ldout(pwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get()
+ << " operation=" << operation << dendl;
+ log_entries.emplace_back(operation->log_entry);
+ if (!op_set->persist_on_flush) {
+ pwl.inc_last_op_sequence_num();
+ }
+ operation->init(true, allocation, current_sync_gen,
+ pwl.get_last_op_sequence_num(), this->bl, buffer_offset, op_set->persist_on_flush);
+ buffer_offset += operation->log_entry->write_bytes();
+ ldout(pwl.get_context(), 20) << "operation=[" << *operation << "]" << dendl;
+ allocation++;
+ }
+ }
+ /* All extent ops subs created */
+ op_set->extent_ops_appending->activate();
+ op_set->extent_ops_persist->activate();
+
+ pwl.add_into_log_map(log_entries, this);
+}
+
+template <typename T>
+void C_WriteRequest<T>::copy_cache() {
+ pwl.copy_bl_to_buffer(&m_resources, op_set);
+}
+
+template <typename T>
+bool C_WriteRequest<T>::append_write_request(std::shared_ptr<SyncPoint> sync_point) {
+ std::lock_guard locker(m_lock);
+ auto write_req_sp = this;
+ if (sync_point->earlier_sync_point) {
+ Context *schedule_append_ctx = new LambdaContext([write_req_sp](int r) {
+ write_req_sp->schedule_append();
+ });
+ sync_point->earlier_sync_point->on_sync_point_appending.push_back(schedule_append_ctx);
+ return true;
+ }
+ return false;
+}
+
+template <typename T>
+void C_WriteRequest<T>::schedule_append() {
+ ceph_assert(++m_appended == 1);
+ pwl.setup_schedule_append(this->op_set->operations, m_do_early_flush, this);
+}
+
+/**
+ * Attempts to allocate log resources for a write. Returns true if successful.
+ *
+ * Resources include 1 lane per extent, 1 log entry per extent, and the payload
+ * data space for each extent.
+ *
+ * Lanes are released after the write persists via release_write_lanes()
+ */
+template <typename T>
+bool C_WriteRequest<T>::alloc_resources() {
+ this->allocated_time = ceph_clock_now();
+ return pwl.alloc_resources(this);
+}
+
+/**
+ * Takes custody of write_req. Resources must already be allocated.
+ *
+ * Locking:
+ * Acquires lock
+ */
+template <typename T>
+void C_WriteRequest<T>::dispatch()
+{
+ CephContext *cct = pwl.get_context();
+ DeferredContexts on_exit;
+ utime_t now = ceph_clock_now();
+ this->m_dispatched_time = now;
+
+ ldout(cct, 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl;
+ this->setup_log_operations(on_exit);
+
+ bool append_deferred = false;
+ if (!op_set->persist_on_flush &&
+ append_write_request(op_set->sync_point)) {
+ /* In persist-on-write mode, we defer the append of this write until the
+ * previous sync point is appending (meaning all the writes before it are
+ * persisted and that previous sync point can now appear in the
+ * log). Since we insert sync points in persist-on-write mode when writes
+ * have already completed to the current sync point, this limits us to
+ * one inserted sync point in flight at a time, and gives the next
+ * inserted sync point some time to accumulate a few writes if they
+ * arrive soon. Without this we can insert an absurd number of sync
+ * points, each with one or two writes. That uses a lot of log entries,
+ * and limits flushing to very few writes at a time. */
+ m_do_early_flush = false;
+ append_deferred = true;
+ } else {
+ /* The prior sync point is done, so we'll schedule append here. If this is
+ * persist-on-write, and probably still the caller's thread, we'll use this
+ * caller's thread to perform the persist & replication of the payload
+ * buffer. */
+ m_do_early_flush =
+ !(this->detained || this->m_queued || this->m_deferred || op_set->persist_on_flush);
+ }
+ if (!append_deferred) {
+ this->schedule_append();
+ }
+}
+
+template <typename T>
+C_FlushRequest<T>::C_FlushRequest(T &pwl, const utime_t arrived,
+ io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags,
+ ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req)
+ : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, user_req),
+ m_lock(lock), m_perfcounter(perfcounter) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+void C_FlushRequest<T>::finish_req(int r) {
+ ldout(pwl.get_context(), 20) << "flush_req=" << this
+ << " cell=" << this->get_cell() << dendl;
+ /* Block guard already released */
+ ceph_assert(!this->get_cell());
+
+ /* Completed to caller by here */
+ utime_t now = ceph_clock_now();
+ m_perfcounter->tinc(l_librbd_pwl_aio_flush_latency, now - this->m_arrived_time);
+}
+
+template <typename T>
+bool C_FlushRequest<T>::alloc_resources() {
+ ldout(pwl.get_context(), 20) << "req type=" << get_name() << " "
+ << "req=[" << *this << "]" << dendl;
+ return pwl.alloc_resources(this);
+}
+
+template <typename T>
+void C_FlushRequest<T>::dispatch() {
+ utime_t now = ceph_clock_now();
+ ldout(pwl.get_context(), 20) << "req type=" << get_name() << " "
+ << "req=[" << *this << "]" << dendl;
+ ceph_assert(this->m_resources.allocated);
+ this->m_dispatched_time = now;
+
+ op = std::make_shared<SyncPointLogOperation>(m_lock,
+ to_append,
+ now,
+ m_perfcounter,
+ pwl.get_context());
+
+ m_perfcounter->inc(l_librbd_pwl_log_ops, 1);
+ pwl.schedule_append(op);
+}
+
+template <typename T>
+void C_FlushRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+ *number_log_entries = 1;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_FlushRequest<T> &req) {
+ os << (C_BlockIORequest<T>&)req
+ << " m_resources.allocated=" << req.m_resources.allocated;
+ return os;
+}
+
+template <typename T>
+C_DiscardRequest<T>::C_DiscardRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), bufferlist(), 0, user_req),
+ m_discard_granularity_bytes(discard_granularity_bytes),
+ m_lock(lock),
+ m_perfcounter(perfcounter) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+C_DiscardRequest<T>::~C_DiscardRequest() {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+bool C_DiscardRequest<T>::alloc_resources() {
+ ldout(pwl.get_context(), 20) << "req type=" << get_name() << " "
+ << "req=[" << *this << "]" << dendl;
+ return pwl.alloc_resources(this);
+}
+
+template <typename T>
+void C_DiscardRequest<T>::setup_log_operations() {
+ std::lock_guard locker(m_lock);
+ GenericWriteLogEntries log_entries;
+ for (auto &extent : this->image_extents) {
+ op = pwl.m_builder->create_discard_log_operation(
+ pwl.get_current_sync_point(), extent.first, extent.second,
+ m_discard_granularity_bytes, this->m_dispatched_time, m_perfcounter,
+ pwl.get_context());
+ log_entries.emplace_back(op->log_entry);
+ break;
+ }
+ uint64_t current_sync_gen = pwl.get_current_sync_gen();
+ bool persist_on_flush = pwl.get_persist_on_flush();
+ if (!persist_on_flush) {
+ pwl.inc_last_op_sequence_num();
+ }
+ auto discard_req = this;
+ Context *on_write_append = pwl.get_current_sync_point()->prior_persisted_gather_new_sub();
+
+ Context *on_write_persist = new LambdaContext(
+ [this, discard_req](int r) {
+ ldout(pwl.get_context(), 20) << "discard_req=" << discard_req
+ << " cell=" << discard_req->get_cell() << dendl;
+ ceph_assert(discard_req->get_cell());
+ discard_req->complete_user_request(r);
+ discard_req->release_cell();
+ });
+ op->init_op(current_sync_gen, persist_on_flush, pwl.get_last_op_sequence_num(),
+ on_write_persist, on_write_append);
+ pwl.add_into_log_map(log_entries, this);
+}
+
+template <typename T>
+void C_DiscardRequest<T>::dispatch() {
+ utime_t now = ceph_clock_now();
+ ldout(pwl.get_context(), 20) << "req type=" << get_name() << " "
+ << "req=[" << *this << "]" << dendl;
+ ceph_assert(this->m_resources.allocated);
+ this->m_dispatched_time = now;
+ setup_log_operations();
+ m_perfcounter->inc(l_librbd_pwl_log_ops, 1);
+ pwl.schedule_append(op);
+}
+
+template <typename T>
+void C_DiscardRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+ *number_log_entries = 1;
+ /* No bytes are allocated for a discard, but we count the discarded bytes
+ * as dirty. This means it's possible to have more bytes dirty than
+ * there are bytes cached or allocated. */
+ for (auto &extent : this->image_extents) {
+ *bytes_dirtied = extent.second;
+ break;
+ }
+}
+
+template <typename T>
+void C_DiscardRequest<T>::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) {
+ ldout(pwl.get_context(), 20) << " cell=" << guard_ctx.cell << dendl;
+
+ ceph_assert(guard_ctx.cell);
+ this->detained = guard_ctx.state.detained; /* overlapped */
+ this->set_cell(guard_ctx.cell);
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_DiscardRequest<T> &req) {
+ os << (C_BlockIORequest<T>&)req;
+ if (req.op) {
+ os << " op=[" << *req.op << "]";
+ } else {
+ os << " op=nullptr";
+ }
+ return os;
+}
+
+template <typename T>
+C_WriteSameRequest<T>::C_WriteSameRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+C_WriteSameRequest<T>::~C_WriteSameRequest() {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+void C_WriteSameRequest<T>::update_req_stats(utime_t &now) {
+ /* Write same stats excluded from most write stats
+ * because the read phase will make them look like slow writes in
+ * those histograms. */
+ ldout(pwl.get_context(), 20) << this << dendl;
+ utime_t comp_latency = now - this->m_arrived_time;
+ this->m_perfcounter->tinc(l_librbd_pwl_ws_latency, comp_latency);
+}
+
+template <typename T>
+std::shared_ptr<WriteLogOperation> C_WriteSameRequest<T>::create_operation(
+ uint64_t offset, uint64_t len) {
+ ceph_assert(this->image_extents.size() == 1);
+ WriteLogOperationSet &set = *this->op_set.get();
+ return pwl.m_builder->create_write_log_operation(
+ *this->op_set.get(), offset, len, this->bl.length(), pwl.get_context(),
+ pwl.m_builder->create_writesame_log_entry(set.sync_point->log_entry, offset,
+ len, this->bl.length()));
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_WriteSameRequest<T> &req) {
+ os << (C_WriteRequest<T>&)req;
+ return os;
+}
+
+template <typename T>
+void C_WriteRequest<T>::update_req_stats(utime_t &now) {
+ /* Compare-and-write stats. Compare-and-write excluded from most write
+ * stats because the read phase will make them look like slow writes in
+ * those histograms. */
+ if (is_comp_and_write) {
+ if (!compare_succeeded) {
+ this->m_perfcounter->inc(l_librbd_pwl_cmp_fails, 1);
+ }
+ utime_t comp_latency = now - this->m_arrived_time;
+ this->m_perfcounter->tinc(l_librbd_pwl_cmp_latency, comp_latency);
+ }
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::C_BlockIORequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_WriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_FlushRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_DiscardRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
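
C_BlockIORequest uses std::atomic<bool>::compare_exchange_strong() so that complete_user_request(), release_cell() and finish() each take their action exactly once, whichever of several completion paths gets there first. The guard in isolation, as a sketch rather than librbd code:

#include <atomic>
#include <iostream>

class OnceFlag {
  std::atomic<bool> m_done{false};

public:
  // Returns true only for the single caller that flips the flag.
  bool try_acquire() {
    bool initial = false;
    return m_done.compare_exchange_strong(initial, true);
  }
};

int main() {
  OnceFlag user_req_completed;
  for (int i = 0; i < 2; i++) {
    if (user_req_completed.try_acquire()) {
      std::cout << "completing user request\n";        // first caller only
    } else {
      std::cout << "user request already completed\n";
    }
  }
  return 0;
}
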
diff --git a/src/librbd/cache/pwl/Request.h b/src/librbd/cache/pwl/Request.h
new file mode 100644
index 000000000..7953c2887
--- /dev/null
+++ b/src/librbd/cache/pwl/Request.h
@@ -0,0 +1,374 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_REQUEST_H
+
+#include "include/Context.h"
+#include "librbd/cache/pwl/Types.h"
+#include "librbd/cache/pwl/LogOperation.h"
+
+namespace librbd {
+class BlockGuardCell;
+
+namespace cache {
+namespace pwl {
+
+class GuardedRequestFunctionContext;
+
+struct WriteRequestResources {
+ bool allocated = false;
+ std::vector<WriteBufferAllocation> buffers;
+};
+
+/**
+ * A request that can be deferred in a BlockGuard to sequence
+ * overlapping operations.
+ * This is the custodian of the BlockGuard cell for this IO, and of the
+ * state information about the progress of this IO. This object lives
+ * until the IO is persisted in all (live) log replicas. The user request
+ * may be completed from here before the IO persists.
+ */
+template <typename T>
+class C_BlockIORequest : public Context {
+public:
+ T &pwl;
+ io::Extents image_extents;
+ bufferlist bl;
+ int fadvise_flags;
+ Context *user_req; /* User write request */
+ ExtentsSummary<io::Extents> image_extents_summary;
+ bool detained = false; /* Detained in blockguard (overlapped with a prior IO) */
+ utime_t allocated_time; /* When allocation began */
+ bool waited_lanes = false; /* This IO waited for free persist/replicate lanes */
+ bool waited_entries = false; /* This IO waited for free log entries */
+ bool waited_buffers = false; /* This IO waited for data buffers (pmemobj_reserve() failed) */
+
+ C_BlockIORequest(T &pwl, const utime_t arrived, io::Extents &&extents,
+ bufferlist&& bl, const int fadvise_flags, Context *user_req);
+ ~C_BlockIORequest() override;
+ C_BlockIORequest(const C_BlockIORequest&) = delete;
+ C_BlockIORequest &operator=(const C_BlockIORequest&) = delete;
+
+ void set_cell(BlockGuardCell *cell);
+ BlockGuardCell *get_cell(void);
+ void release_cell();
+
+ void complete_user_request(int r);
+ void finish(int r);
+ virtual void finish_req(int r) = 0;
+
+ virtual bool alloc_resources() = 0;
+
+ void deferred();
+
+ virtual void deferred_handler() = 0;
+
+ virtual void dispatch() = 0;
+
+ virtual void copy_cache() {};
+
+ virtual const char *get_name() const {
+ return "C_BlockIORequest";
+ }
+ uint64_t get_image_extents_size() {
+ return image_extents.size();
+ }
+ void set_io_waited_for_lanes(bool waited) {
+ waited_lanes = waited;
+ }
+ void set_io_waited_for_entries(bool waited) {
+ waited_entries = waited;
+ }
+ void set_io_waited_for_buffers(bool waited) {
+ waited_buffers = waited;
+ }
+ bool has_io_waited_for_buffers() {
+ return waited_buffers;
+ }
+ std::vector<WriteBufferAllocation>& get_resources_buffers() {
+ return m_resources.buffers;
+ }
+
+ void set_allocated(bool allocated) {
+ if (allocated) {
+ m_resources.allocated = true;
+ } else {
+ m_resources.buffers.clear();
+ }
+ }
+
+ virtual void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) = 0;
+
+protected:
+ utime_t m_arrived_time;
+ utime_t m_dispatched_time; /* When dispatch began */
+ utime_t m_user_req_completed_time;
+ std::atomic<bool> m_deferred = {false}; /* Deferred because this or a prior IO had to wait for write resources */
+ WriteRequestResources m_resources;
+
+private:
+ std::atomic<bool> m_user_req_completed = {false};
+ std::atomic<bool> m_finish_called = {false};
+ std::atomic<bool> m_cell_released = {false};
+ BlockGuardCell* m_cell = nullptr;
+
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_BlockIORequest<U> &req);
+};
+
+/**
+ * This is the custodian of the BlockGuard cell for this write. Block
+ * guard is not released until the write persists everywhere (this is
+ * how we guarantee to each log replica that they will never see
+ * overlapping writes).
+ */
+template <typename T>
+class C_WriteRequest : public C_BlockIORequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+ bool compare_succeeded = false;
+ uint64_t *mismatch_offset;
+ bufferlist cmp_bl;
+ bufferlist read_bl;
+ bool is_comp_and_write = false;
+ std::unique_ptr<WriteLogOperationSet> op_set = nullptr;
+
+ C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req);
+
+ C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req);
+
+ ~C_WriteRequest() override;
+
+ void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx);
+
+ /* Common finish to plain write and compare-and-write (if it writes) */
+ void finish_req(int r) override;
+
+ /* Compare and write will override this */
+ virtual void update_req_stats(utime_t &now);
+
+ bool alloc_resources() override;
+
+ void deferred_handler() override { }
+
+ void dispatch() override;
+
+ void copy_cache() override;
+
+ virtual std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset,
+ uint64_t len);
+
+ virtual void setup_log_operations(DeferredContexts &on_exit);
+
+ bool append_write_request(std::shared_ptr<SyncPoint> sync_point);
+
+ virtual void schedule_append();
+
+ const char *get_name() const override {
+ return "C_WriteRequest";
+ }
+
+protected:
+ using C_BlockIORequest<T>::m_resources;
+ PerfCounters *m_perfcounter = nullptr;
+
+private:
+ bool m_do_early_flush = false;
+ std::atomic<int> m_appended = {0};
+ bool m_queued = false;
+ ceph::mutex &m_lock;
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_WriteRequest<U> &req);
+};
+
+/**
+ * This is the custodian of the BlockGuard cell for this
+ * aio_flush. Block guard is released as soon as the new
+ * sync point (if required) is created. Subsequent IOs can
+ * proceed while this flush waits for prior IOs to complete
+ * and any required sync points to be persisted.
+ */
+template <typename T>
+class C_FlushRequest : public C_BlockIORequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+ bool internal = false;
+ std::shared_ptr<SyncPoint> to_append;
+
+ C_FlushRequest(T &pwl, const utime_t arrived,
+ io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags,
+ ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req);
+
+ ~C_FlushRequest() override {}
+
+ bool alloc_resources() override;
+
+ void dispatch() override;
+
+ const char *get_name() const override {
+ return "C_FlushRequest";
+ }
+
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+private:
+ std::shared_ptr<SyncPointLogOperation> op;
+ ceph::mutex &m_lock;
+ PerfCounters *m_perfcounter = nullptr;
+
+ void finish_req(int r) override;
+ void deferred_handler() override {
+ m_perfcounter->inc(l_librbd_pwl_aio_flush_def, 1);
+ }
+
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_FlushRequest<U> &req);
+};
+
+/**
+ * This is the custodian of the BlockGuard cell for this discard. As in the
+ * case of write, the block guard is not released until the discard persists
+ * everywhere.
+ */
+template <typename T>
+class C_DiscardRequest : public C_BlockIORequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+ std::shared_ptr<DiscardLogOperation> op;
+
+ C_DiscardRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req);
+
+ ~C_DiscardRequest() override;
+ void finish_req(int r) override {}
+
+ bool alloc_resources() override;
+
+ void deferred_handler() override { }
+
+ void setup_log_operations();
+
+ void dispatch() override;
+
+ void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx);
+
+ const char *get_name() const override {
+ return "C_DiscardRequest";
+ }
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+private:
+ uint32_t m_discard_granularity_bytes;
+ ceph::mutex &m_lock;
+ PerfCounters *m_perfcounter = nullptr;
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_DiscardRequest<U> &req);
+};
+
+/**
+ * This is the custodian of the BlockGuard cell for this write same.
+ *
+ * A writesame allocates and persists a data buffer like a write, but the
+ * data buffer is usually much shorter than the extent the write same covers.
+ */
+template <typename T>
+class C_WriteSameRequest : public C_WriteRequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+ C_WriteSameRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req);
+
+ ~C_WriteSameRequest() override;
+
+ void update_req_stats(utime_t &now) override;
+
+ std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len) override;
+
+ const char *get_name() const override {
+ return "C_WriteSameRequest";
+ }
+
+ template<typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_WriteSameRequest<U> &req);
+};
+
+struct BlockGuardReqState {
+ bool barrier = false; /* This is a barrier request */
+ bool current_barrier = false; /* This is the currently active barrier */
+ bool detained = false;
+ bool queued = false; /* Queued for barrier */
+ friend std::ostream &operator<<(std::ostream &os,
+ const BlockGuardReqState &r) {
+ os << "barrier=" << r.barrier << ", "
+ << "current_barrier=" << r.current_barrier << ", "
+ << "detained=" << r.detained << ", "
+ << "queued=" << r.queued;
+ return os;
+ }
+};
+
+class GuardedRequestFunctionContext : public Context {
+public:
+ BlockGuardCell *cell = nullptr;
+ BlockGuardReqState state;
+ GuardedRequestFunctionContext(boost::function<void(GuardedRequestFunctionContext&)> &&callback)
+ : m_callback(std::move(callback)) { }
+ ~GuardedRequestFunctionContext(void) override { };
+ GuardedRequestFunctionContext(const GuardedRequestFunctionContext&) = delete;
+ GuardedRequestFunctionContext &operator=(const GuardedRequestFunctionContext&) = delete;
+
+private:
+ boost::function<void(GuardedRequestFunctionContext&)> m_callback;
+ void finish(int r) override {
+ ceph_assert(cell);
+ m_callback(*this);
+ }
+};
+
+class GuardedRequest {
+public:
+ const BlockExtent block_extent;
+ GuardedRequestFunctionContext *guard_ctx; /* Work to do when guard on range obtained */
+
+ GuardedRequest(const BlockExtent block_extent,
+ GuardedRequestFunctionContext *on_guard_acquire, bool barrier = false)
+ : block_extent(block_extent), guard_ctx(on_guard_acquire) {
+ guard_ctx->state.barrier = barrier;
+ }
+ friend std::ostream &operator<<(std::ostream &os,
+ const GuardedRequest &r) {
+ os << "guard_ctx->state=[" << r.guard_ctx->state << "], "
+ << "block_extent.block_start=" << r.block_extent.block_start << ", "
+ << "block_extent.block_start=" << r.block_extent.block_end;
+ return os;
+ }
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_REQUEST_H
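
GuardedRequest and BlockGuardReqState describe how an IO interacts with the block guard: a request whose extent overlaps an in-flight cell is detained and dispatched only when the earlier request releases its cell. A deliberately simplified, single-threaded sketch of that overlap-and-detain idea (the real BlockGuard in librbd/BlockGuard.h also handles barriers and queueing):

#include <cstdint>
#include <deque>
#include <functional>
#include <iostream>

struct Extent {
  uint64_t start;
  uint64_t end;  // inclusive
};

inline bool overlaps(const Extent &a, const Extent &b) {
  return a.start <= b.end && b.start <= a.end;
}

class ToyBlockGuard {
  struct Cell {
    Extent extent;
    std::deque<std::function<void()>> waiters;  // detained requests
  };
  std::deque<Cell> m_cells;

public:
  void acquire(Extent extent, std::function<void()> dispatch) {
    for (auto &cell : m_cells) {
      if (overlaps(cell.extent, extent)) {
        cell.waiters.push_back(std::move(dispatch));  // detained
        return;
      }
    }
    m_cells.push_back({extent, {}});
    dispatch();
  }

  void release(const Extent &extent) {
    for (auto it = m_cells.begin(); it != m_cells.end(); ++it) {
      if (it->extent.start == extent.start && it->extent.end == extent.end) {
        auto waiters = std::move(it->waiters);
        m_cells.erase(it);
        for (auto &w : waiters) {
          w();  // re-dispatch; re-acquiring a cell is omitted in this toy
        }
        return;
      }
    }
  }
};

int main() {
  ToyBlockGuard guard;
  guard.acquire({0, 4095}, [] { std::cout << "first write dispatched\n"; });
  guard.acquire({1024, 2047}, [] { std::cout << "detained write dispatched\n"; });
  guard.release({0, 4095});  // the detained request now runs
  return 0;
}
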
diff --git a/src/librbd/cache/pwl/ShutdownRequest.cc b/src/librbd/cache/pwl/ShutdownRequest.cc
new file mode 100644
index 000000000..e022328ba
--- /dev/null
+++ b/src/librbd/cache/pwl/ShutdownRequest.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/pwl/ShutdownRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/Operations.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/Types.h"
+
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/plugin/Api.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl:ShutdownRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+ShutdownRequest<I>* ShutdownRequest<I>::create(
+ I &image_ctx,
+ AbstractWriteLog<I> *image_cache,
+ plugin::Api<I>& plugin_api,
+ Context *on_finish) {
+ return new ShutdownRequest(image_ctx, image_cache, plugin_api, on_finish);
+}
+
+template <typename I>
+ShutdownRequest<I>::ShutdownRequest(
+ I &image_ctx,
+ AbstractWriteLog<I> *image_cache,
+ plugin::Api<I>& plugin_api,
+ Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_image_cache(image_cache),
+ m_plugin_api(plugin_api),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_error_result(0) {
+}
+
+template <typename I>
+void ShutdownRequest<I>::send() {
+ send_shutdown_image_cache();
+}
+
+template <typename I>
+void ShutdownRequest<I>::send_shutdown_image_cache() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (m_image_cache == nullptr) {
+ finish();
+ return;
+ }
+
+ using klass = ShutdownRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_shutdown_image_cache>(
+ this);
+
+ m_image_cache->shut_down(ctx);
+}
+
+template <typename I>
+void ShutdownRequest<I>::handle_shutdown_image_cache(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to shut down the image cache: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ finish();
+ return;
+ } else {
+ delete m_image_cache;
+ m_image_cache = nullptr;
+ }
+ send_remove_feature_bit();
+}
+
+template <typename I>
+void ShutdownRequest<I>::send_remove_feature_bit() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ uint64_t new_features = m_image_ctx.features & ~RBD_FEATURE_DIRTY_CACHE;
+ uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE;
+ ldout(cct, 10) << "old_features=" << m_image_ctx.features
+ << ", new_features=" << new_features
+ << ", features_mask=" << features_mask
+ << dendl;
+
+ int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
+ new_features, features_mask);
+ m_image_ctx.features &= ~RBD_FEATURE_DIRTY_CACHE;
+ using klass = ShutdownRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_feature_bit>(
+ this);
+ ctx->complete(r);
+}
+
+template <typename I>
+void ShutdownRequest<I>::handle_remove_feature_bit(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove the feature bit: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ finish();
+ return;
+ }
+ send_remove_image_cache_state();
+}
+
+template <typename I>
+void ShutdownRequest<I>::send_remove_image_cache_state() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = ShutdownRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_image_cache_state>(
+ this);
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ m_plugin_api.execute_image_metadata_remove(&m_image_ctx, PERSISTENT_CACHE_STATE, ctx);
+}
+
+template <typename I>
+void ShutdownRequest<I>::handle_remove_image_cache_state(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove the image cache state: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ }
+ finish();
+}
+
+template <typename I>
+void ShutdownRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ShutdownRequest<librbd::ImageCtx>;
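
ShutdownRequest follows the usual librbd state-machine idiom: each send_*() stage starts asynchronous work, the matching handle_*() records any error through save_result() and either short-circuits to finish() or starts the next stage, and finish() completes the caller and deletes the request. The skeleton of that idiom reduced to plain std::function callbacks, with made-up stage names:

#include <functional>
#include <iostream>

class ToyShutdown {
  std::function<void(int)> m_on_finish;
  int m_error_result = 0;

  void save_result(int r) {
    if (m_error_result == 0 && r < 0) {
      m_error_result = r;  // keep the first error only
    }
  }

  void send_stage_b() {
    std::cout << "stage B\n";
    handle_stage_b(0);  // pretend the async work completed
  }

  void handle_stage_b(int r) {
    save_result(r);
    finish();
  }

  void finish() {
    m_on_finish(m_error_result);
    delete this;  // the request owns itself, as ShutdownRequest does
  }

public:
  explicit ToyShutdown(std::function<void(int)> on_finish)
    : m_on_finish(std::move(on_finish)) {}

  void send() {
    std::cout << "stage A\n";
    int r = 0;  // pretend async stage A completed
    if (r < 0) {
      save_result(r);
      finish();  // abort the chain on error
      return;
    }
    send_stage_b();
  }
};

int main() {
  (new ToyShutdown([](int r) { std::cout << "done, r=" << r << "\n"; }))->send();
  return 0;
}
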
diff --git a/src/librbd/cache/pwl/ShutdownRequest.h b/src/librbd/cache/pwl/ShutdownRequest.h
new file mode 100644
index 000000000..dafac9e9c
--- /dev/null
+++ b/src/librbd/cache/pwl/ShutdownRequest.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H
+#define CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace plugin { template <typename> struct Api; }
+
+namespace cache {
+namespace pwl {
+
+template<typename>
+class AbstractWriteLog;
+
+template<typename>
+class ImageCacheState;
+
+template <typename ImageCtxT = ImageCtx>
+class ShutdownRequest {
+public:
+ static ShutdownRequest* create(
+ ImageCtxT &image_ctx,
+ AbstractWriteLog<ImageCtxT> *image_cache,
+ plugin::Api<ImageCtxT>& plugin_api,
+ Context *on_finish);
+
+ void send();
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * Shutdown request goes through the following state machine:
+ *
+ * <start>
+ * |
+ * v
+ * SHUTDOWN_IMAGE_CACHE
+ * |
+ * v
+ * REMOVE_IMAGE_FEATURE_BIT
+ * |
+ * v
+ * REMOVE_IMAGE_CACHE_STATE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ShutdownRequest(ImageCtxT &image_ctx,
+ AbstractWriteLog<ImageCtxT> *image_cache,
+ plugin::Api<ImageCtxT>& plugin_api,
+ Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ AbstractWriteLog<ImageCtxT> *m_image_cache;
+ plugin::Api<ImageCtxT>& m_plugin_api;
+ Context *m_on_finish;
+
+ int m_error_result;
+
+ void send_shutdown_image_cache();
+ void handle_shutdown_image_cache(int r);
+
+ void send_remove_feature_bit();
+ void handle_remove_feature_bit(int r);
+
+ void send_remove_image_cache_state();
+ void handle_remove_image_cache_state(int r);
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ShutdownRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H
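
The REMOVE_IMAGE_FEATURE_BIT step clears RBD_FEATURE_DIRTY_CACHE by handing set_features() a new feature word plus a mask that limits which bits may change. The bit arithmetic on its own, using a placeholder bit value rather than the real constant:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t DIRTY_CACHE = 1ULL << 22;  // placeholder, not the real value
  uint64_t features = (1ULL << 0) | DIRTY_CACHE;

  uint64_t features_mask = DIRTY_CACHE;            // only this bit may change
  uint64_t new_features = features & ~DIRTY_CACHE; // bit cleared, rest intact

  std::printf("old=%#llx new=%#llx mask=%#llx\n",
              (unsigned long long)features,
              (unsigned long long)new_features,
              (unsigned long long)features_mask);
  return 0;
}
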
diff --git a/src/librbd/cache/pwl/SyncPoint.cc b/src/librbd/cache/pwl/SyncPoint.cc
new file mode 100644
index 000000000..8fb2f8205
--- /dev/null
+++ b/src/librbd/cache/pwl/SyncPoint.cc
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SyncPoint.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::SyncPoint: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+SyncPoint::SyncPoint(uint64_t sync_gen_num, CephContext *cct)
+ : log_entry(std::make_shared<SyncPointLogEntry>(sync_gen_num)), m_cct(cct) {
+ m_prior_log_entries_persisted = new C_Gather(cct, nullptr);
+ m_sync_point_persist = new C_Gather(cct, nullptr);
+ on_sync_point_appending.reserve(MAX_WRITES_PER_SYNC_POINT + 2);
+ on_sync_point_persisted.reserve(MAX_WRITES_PER_SYNC_POINT + 2);
+ ldout(m_cct, 20) << "sync point " << sync_gen_num << dendl;
+}
+
+SyncPoint::~SyncPoint() {
+ ceph_assert(on_sync_point_appending.empty());
+ ceph_assert(on_sync_point_persisted.empty());
+ ceph_assert(!earlier_sync_point);
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const SyncPoint &p) {
+ os << "log_entry=[" << *p.log_entry << "], "
+ << "earlier_sync_point=" << p.earlier_sync_point << ", "
+ << "later_sync_point=" << p.later_sync_point << ", "
+ << "m_final_op_sequence_num=" << p.m_final_op_sequence_num << ", "
+ << "m_prior_log_entries_persisted=" << p.m_prior_log_entries_persisted << ", "
+ << "m_prior_log_entries_persisted_complete=" << p.m_prior_log_entries_persisted_complete << ", "
+ << "m_append_scheduled=" << p.m_append_scheduled << ", "
+ << "appending=" << p.appending << ", "
+ << "on_sync_point_appending=" << p.on_sync_point_appending.size() << ", "
+ << "on_sync_point_persisted=" << p.on_sync_point_persisted.size() << "";
+ return os;
+}
+
+void SyncPoint::persist_gather_set_finisher(Context *ctx) {
+ m_append_scheduled = true;
+ /* All prior sync points that are still in this list must already be scheduled for append */
+ std::shared_ptr<SyncPoint> previous = earlier_sync_point;
+ while (previous) {
+ ceph_assert(previous->m_append_scheduled);
+ previous = previous->earlier_sync_point;
+ }
+
+ m_sync_point_persist->set_finisher(ctx);
+}
+
+void SyncPoint::persist_gather_activate() {
+ m_sync_point_persist->activate();
+}
+
+Context* SyncPoint::persist_gather_new_sub() {
+ return m_sync_point_persist->new_sub();
+}
+
+void SyncPoint::prior_persisted_gather_activate() {
+ m_prior_log_entries_persisted->activate();
+}
+
+Context* SyncPoint::prior_persisted_gather_new_sub() {
+ return m_prior_log_entries_persisted->new_sub();
+}
+
+void SyncPoint::prior_persisted_gather_set_finisher() {
+ Context *sync_point_persist_ready = persist_gather_new_sub();
+ std::shared_ptr<SyncPoint> sp = shared_from_this();
+ m_prior_log_entries_persisted->
+ set_finisher(new LambdaContext([this, sp, sync_point_persist_ready](int r) {
+ ldout(m_cct, 20) << "Prior log entries persisted for sync point=["
+ << sp << "]" << dendl;
+ sp->m_prior_log_entries_persisted_result = r;
+ sp->m_prior_log_entries_persisted_complete = true;
+ sync_point_persist_ready->complete(r);
+ }));
+}
+
+void SyncPoint::add_in_on_persisted_ctxs(Context* ctx) {
+ on_sync_point_persisted.push_back(ctx);
+}
+
+void SyncPoint::add_in_on_appending_ctxs(Context* ctx) {
+ on_sync_point_appending.push_back(ctx);
+}
+
+void SyncPoint::setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
+ uint64_t last_op_sequence_num) {
+ earlier_sync_point = sync_point;
+ log_entry->prior_sync_point_flushed = false;
+ earlier_sync_point->log_entry->next_sync_point_entry = log_entry;
+ earlier_sync_point->later_sync_point = shared_from_this();
+ earlier_sync_point->m_final_op_sequence_num = last_op_sequence_num;
+ if (!earlier_sync_point->appending) {
+ /* Append of new sync point deferred until old sync point is appending */
+ earlier_sync_point->add_in_on_appending_ctxs(prior_persisted_gather_new_sub());
+ }
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/SyncPoint.h b/src/librbd/cache/pwl/SyncPoint.h
new file mode 100644
index 000000000..424e3730e
--- /dev/null
+++ b/src/librbd/cache/pwl/SyncPoint.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
+#define CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+class SyncPoint: public std::enable_shared_from_this<SyncPoint> {
+public:
+ std::shared_ptr<SyncPointLogEntry> log_entry;
+ /* Use lock for earlier/later links */
+ std::shared_ptr<SyncPoint> earlier_sync_point; /* NULL if earlier has completed */
+ std::shared_ptr<SyncPoint> later_sync_point;
+ bool appending = false;
+ /* Signal these when this sync point is appending to the log, and its order
+ * of appearance is guaranteed. One of these is a sub-operation of the
+ * next sync point's m_prior_log_entries_persisted Gather. */
+ std::vector<Context*> on_sync_point_appending;
+ /* Signal these when this sync point is appended and persisted. User
+ * aio_flush() calls are added to this. */
+ std::vector<Context*> on_sync_point_persisted;
+
+ SyncPoint(uint64_t sync_gen_num, CephContext *cct);
+ ~SyncPoint();
+ SyncPoint(const SyncPoint&) = delete;
+ SyncPoint &operator=(const SyncPoint&) = delete;
+ void persist_gather_activate();
+ Context* persist_gather_new_sub();
+ void persist_gather_set_finisher(Context *ctx);
+ void prior_persisted_gather_activate();
+ Context* prior_persisted_gather_new_sub();
+ void prior_persisted_gather_set_finisher();
+ void add_in_on_persisted_ctxs(Context* ctx);
+ void add_in_on_appending_ctxs(Context* ctx);
+ void setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
+ uint64_t last_op_sequence_num);
+private:
+ CephContext *m_cct;
+ bool m_append_scheduled = false;
+ uint64_t m_final_op_sequence_num = 0;
+ /* A sync point can't appear in the log until all the writes bearing
+ * it and all the prior sync points have been appended and
+ * persisted.
+ *
+ * Writes bearing this sync gen number and the prior sync point will be
+ * sub-ops of this Gather. This sync point will not be appended until all
+ * these complete to the point where their persist order is guaranteed. */
+ C_Gather *m_prior_log_entries_persisted;
+ /* The finisher for this will append the sync point to the log. The finisher
+ * for m_prior_log_entries_persisted will be a sub-op of this. */
+ C_Gather *m_sync_point_persist;
+ int m_prior_log_entries_persisted_result = 0;
+ bool m_prior_log_entries_persisted_complete = false;
+ friend std::ostream &operator<<(std::ostream &os,
+ const SyncPoint &p);
+};
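+
+/* Illustrative sketch of the C_Gather pattern the two gather members rely
+ * on (assumed common/Context.h semantics): subs are registered first, and
+ * the finisher runs only once activate() has been called and every sub
+ * has completed, receiving the first error seen (or 0):
+ *
+ *   C_Gather *gather = new C_Gather(cct, nullptr);
+ *   Context *sub1 = gather->new_sub();
+ *   Context *sub2 = gather->new_sub();
+ *   gather->set_finisher(new LambdaContext([](int r) { ... }));
+ *   gather->activate();
+ *   sub1->complete(0);
+ *   sub2->complete(0);   // finisher fires here
+ */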
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
diff --git a/src/librbd/cache/pwl/Types.cc b/src/librbd/cache/pwl/Types.cc
new file mode 100644
index 000000000..505f5d57b
--- /dev/null
+++ b/src/librbd/cache/pwl/Types.cc
@@ -0,0 +1,185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "Types.h"
+#include "common/ceph_context.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::Types: " << this << " " \
+ << __func__ << ": "
+using ceph::Formatter;
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+DeferredContexts::~DeferredContexts() {
+ finish_contexts(nullptr, contexts, 0);
+}
+
+void DeferredContexts::add(Context* ctx) {
+ contexts.push_back(ctx);
+}
+
+/*
+ * A BlockExtent identifies a range by first and last.
+ *
+ * An Extent ("image extent") identifies a range by start and length.
+ *
+ * The ImageDispatch interface is defined in terms of image extents, and
+ * requires no alignment of the beginning or end of the extent. We
+ * convert between image and block extents here using a "block size"
+ * of 1.
+ */
+BlockExtent convert_to_block_extent(uint64_t offset_bytes, uint64_t length_bytes)
+{
+ return BlockExtent(offset_bytes,
+ offset_bytes + length_bytes);
+}
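+
+/* Worked example: with the "block size" of 1 used here, an image extent
+ * of offset_bytes=4096 and length_bytes=512 converts to
+ * BlockExtent(4096, 4608); block_start equals the offset and block_end
+ * equals offset + length. */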
+
+BlockExtent WriteLogCacheEntry::block_extent() {
+ return convert_to_block_extent(image_offset_bytes, write_bytes);
+}
+
+uint64_t WriteLogCacheEntry::get_offset_bytes() {
+ return image_offset_bytes;
+}
+
+uint64_t WriteLogCacheEntry::get_write_bytes() {
+ return write_bytes;
+}
+
+#ifdef WITH_RBD_SSD_CACHE
+void WriteLogCacheEntry::dump(Formatter *f) const {
+ f->dump_unsigned("sync_gen_number", sync_gen_number);
+ f->dump_unsigned("write_sequence_number", write_sequence_number);
+ f->dump_unsigned("image_offset_bytes", image_offset_bytes);
+ f->dump_unsigned("write_bytes", write_bytes);
+ f->dump_unsigned("write_data_pos", write_data_pos);
+ f->dump_bool("entry_valid", is_entry_valid());
+ f->dump_bool("sync_point", is_sync_point());
+ f->dump_bool("sequenced", is_sequenced());
+ f->dump_bool("has_data", has_data());
+ f->dump_bool("discard", is_discard());
+ f->dump_bool("writesame", is_writesame());
+ f->dump_unsigned("ws_datalen", ws_datalen);
+ f->dump_unsigned("entry_index", entry_index);
+}
+
+void WriteLogCacheEntry::generate_test_instances(list<WriteLogCacheEntry*>& ls) {
+ ls.push_back(new WriteLogCacheEntry());
+ ls.push_back(new WriteLogCacheEntry());
+ ls.back()->sync_gen_number = 1;
+ ls.back()->write_sequence_number = 1;
+ ls.back()->image_offset_bytes = 1;
+ ls.back()->write_bytes = 1;
+ ls.back()->write_data_pos = 1;
+ ls.back()->set_entry_valid(true);
+ ls.back()->set_sync_point(true);
+ ls.back()->set_sequenced(true);
+ ls.back()->set_has_data(true);
+ ls.back()->set_discard(true);
+ ls.back()->set_writesame(true);
+ ls.back()->ws_datalen = 1;
+ ls.back()->entry_index = 1;
+}
+
+void WriteLogPoolRoot::dump(Formatter *f) const {
+ f->dump_unsigned("layout_version", layout_version);
+ f->dump_unsigned("cur_sync_gen", cur_sync_gen);
+ f->dump_unsigned("pool_size", pool_size);
+ f->dump_unsigned("flushed_sync_gen", flushed_sync_gen);
+ f->dump_unsigned("block_size", block_size);
+ f->dump_unsigned("num_log_entries", num_log_entries);
+ f->dump_unsigned("first_free_entry", first_free_entry);
+ f->dump_unsigned("first_valid_entry", first_valid_entry);
+}
+
+void WriteLogPoolRoot::generate_test_instances(list<WriteLogPoolRoot*>& ls) {
+ ls.push_back(new WriteLogPoolRoot());
+ ls.push_back(new WriteLogPoolRoot());
+ ls.back()->layout_version = 2;
+ ls.back()->cur_sync_gen = 1;
+ ls.back()->pool_size = 1024;
+ ls.back()->flushed_sync_gen = 1;
+ ls.back()->block_size = 4096;
+ ls.back()->num_log_entries = 10000000;
+ ls.back()->first_free_entry = 1;
+ ls.back()->first_valid_entry = 0;
+}
+#endif
+
+std::ostream& operator<<(std::ostream& os,
+ const WriteLogCacheEntry &entry) {
+ os << "entry_valid=" << entry.is_entry_valid() << ", "
+ << "sync_point=" << entry.is_sync_point() << ", "
+ << "sequenced=" << entry.is_sequenced() << ", "
+ << "has_data=" << entry.has_data() << ", "
+ << "discard=" << entry.is_discard() << ", "
+ << "writesame=" << entry.is_writesame() << ", "
+ << "sync_gen_number=" << entry.sync_gen_number << ", "
+ << "write_sequence_number=" << entry.write_sequence_number << ", "
+ << "image_offset_bytes=" << entry.image_offset_bytes << ", "
+ << "write_bytes=" << entry.write_bytes << ", "
+ << "ws_datalen=" << entry.ws_datalen << ", "
+ << "entry_index=" << entry.entry_index;
+ return os;
+}
+
+template <typename ExtentsType>
+ExtentsSummary<ExtentsType>::ExtentsSummary(const ExtentsType &extents)
+ : total_bytes(0), first_image_byte(0), last_image_byte(0)
+{
+ if (extents.empty()) return;
+ /* These extents refer to image offsets between first_image_byte
+ * and last_image_byte, inclusive, but we don't guarantee here
+ * that they address all of those bytes. There may be gaps. */
+ first_image_byte = extents.front().first;
+ last_image_byte = first_image_byte + extents.front().second;
+ for (auto &extent : extents) {
+ /* Ignore zero length extents */
+ if (extent.second) {
+ total_bytes += extent.second;
+ if (extent.first < first_image_byte) {
+ first_image_byte = extent.first;
+ }
+ if ((extent.first + extent.second) > last_image_byte) {
+ last_image_byte = extent.first + extent.second;
+ }
+ }
+ }
+}
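+
+/* Worked example (io::Extents being {offset, length} pairs): the extents
+ * {{0, 512}, {4096, 512}} yield total_bytes=1024, first_image_byte=0 and
+ * last_image_byte=4608; the bytes between 512 and 4096 fall in a gap
+ * that no extent addresses. */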
+
+io::Extent whole_volume_extent() {
+ return io::Extent({0, std::numeric_limits<uint64_t>::max()});
+}
+
+BlockExtent block_extent(const io::Extent& image_extent) {
+ return convert_to_block_extent(image_extent.first, image_extent.second);
+}
+
+Context * override_ctx(int r, Context *ctx) {
+ if (r < 0) {
+ /* Override next_ctx status with this error */
+ return new LambdaContext(
+ [r, ctx](int _r) {
+ ctx->complete(r);
+ });
+ } else {
+ return ctx;
+ }
+}
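+
+/* Usage sketch: chain override_ctx() in front of a completion so that an
+ * earlier error takes precedence over whatever a later step reports:
+ *
+ *   Context *next = override_ctx(r, on_finish);
+ *   next->complete(0);  // on_finish sees r when r < 0, otherwise 0
+ */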
+
+std::string unique_lock_name(const std::string &name, void *address) {
+ return name + " (" + stringify(address) + ")";
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ExtentsSummary<librbd::io::Extents>;
diff --git a/src/librbd/cache/pwl/Types.h b/src/librbd/cache/pwl/Types.h
new file mode 100644
index 000000000..dc067612b
--- /dev/null
+++ b/src/librbd/cache/pwl/Types.h
@@ -0,0 +1,444 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_TYPES_H
+#define CEPH_LIBRBD_CACHE_PWL_TYPES_H
+
+#include "acconfig.h"
+
+#ifdef WITH_RBD_RWL
+#include "libpmemobj.h"
+#endif
+
+#include <vector>
+#include "librbd/BlockGuard.h"
+#include "librbd/io/Types.h"
+
+namespace ceph {
+class Formatter;
+}
+
+class Context;
+
+enum {
+ l_librbd_pwl_first = 26500,
+
+ // All read requests
+ l_librbd_pwl_rd_req, // read requests
+ l_librbd_pwl_rd_bytes, // bytes read
+ l_librbd_pwl_rd_latency, // average req completion latency
+
+ // Read requests completed from RWL (no misses)
+ l_librbd_pwl_rd_hit_req, // read requests
+ l_librbd_pwl_rd_hit_bytes, // bytes read
+ l_librbd_pwl_rd_hit_latency, // average req completion latency
+
+ // Read requests with hit and miss extents
+ l_librbd_pwl_rd_part_hit_req, // read ops
+
+ // Per SyncPoint's LogEntry number and write bytes distribution
+ l_librbd_pwl_syncpoint_hist,
+
+ // All write requests
+ l_librbd_pwl_wr_req, // write requests
+ l_librbd_pwl_wr_bytes, // bytes written
+ l_librbd_pwl_wr_req_def, // write requests deferred for resources
+ l_librbd_pwl_wr_req_def_lanes, // write requests deferred for lanes
+ l_librbd_pwl_wr_req_def_log, // write requests deferred for log entries
+ l_librbd_pwl_wr_req_def_buf, // write requests deferred for buffer space
+ l_librbd_pwl_wr_req_overlap, // write requests detained for overlap
+ l_librbd_pwl_wr_req_queued, // write requests queued for prior barrier
+
+ // Write log operations (1 .. n per request that appends to the log)
+ l_librbd_pwl_log_ops, // log append ops
+ l_librbd_pwl_log_op_bytes, // average bytes written per log op
+
+ /*
+
+ Req and op average latencies to the beginning of and over various phases:
+
+ +------------------------------+------+-------------------------------+
+ | Phase                        | Name | Description                   |
+ +------------------------------+------+-------------------------------+
+ | Arrive at RWL                | arr  | Arrives as a request          |
+ +------------------------------+------+-------------------------------+
+ | Allocate resources           | all  | Time spent in the block guard |
+ |                              |      | for overlap sequencing occurs |
+ |                              |      | before this point             |
+ +------------------------------+------+-------------------------------+
+ | Dispatch                     | dis  | Op lifetime begins here. Time |
+ |                              |      | spent in allocation waiting   |
+ |                              |      | for resources occurs before   |
+ |                              |      | this point                    |
+ +------------------------------+------+-------------------------------+
+ | Payload buffer persist and   | buf  | Time spent queued for         |
+ | replicate                    |      | replication occurs before here|
+ +------------------------------+------+-------------------------------+
+ | Payload buffer persist       | bufc | bufc - buf is just the persist|
+ | complete                     |      | time                          |
+ +------------------------------+------+-------------------------------+
+ | Log append                   | app  | Time spent queued for append  |
+ |                              |      | occurs before here            |
+ +------------------------------+------+-------------------------------+
+ | Append complete              | appc | appc - app is just the time   |
+ |                              |      | spent in the append operation |
+ +------------------------------+------+-------------------------------+
+ | Complete                     | cmp  | Write persisted, replicated,  |
+ |                              |      | and globally visible          |
+ +------------------------------+------+-------------------------------+
+
+ */
+
+ /* Request times */
+ l_librbd_pwl_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard
+ l_librbd_pwl_req_arr_to_dis_t, // arrival to dispatch elapsed time
+ l_librbd_pwl_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources
+ l_librbd_pwl_wr_latency, // average req (persist) completion latency
+ l_librbd_pwl_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written
+ l_librbd_pwl_wr_caller_latency, // average req completion (to caller) latency
+
+ /* Request times for requests that never waited for space */
+ l_librbd_pwl_nowait_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard
+ l_librbd_pwl_nowait_req_arr_to_dis_t, // arrival to dispatch elapsed time
+ l_librbd_pwl_nowait_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources
+ l_librbd_pwl_nowait_wr_latency, // average req (persist) completion latency
+ l_librbd_pwl_nowait_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written
+ l_librbd_pwl_nowait_wr_caller_latency, // average req completion (to caller) latency
+
+ /* Log operation times */
+ l_librbd_pwl_log_op_alloc_t, // elapsed time of pmemobj_reserve()
+ l_librbd_pwl_log_op_alloc_t_hist, // Histogram of elapsed time of pmemobj_reserve()
+
+ l_librbd_pwl_log_op_dis_to_buf_t, // dispatch to buffer persist elapsed time
+ l_librbd_pwl_log_op_dis_to_app_t, // dispatch to log append elapsed time
+ l_librbd_pwl_log_op_dis_to_cmp_t, // dispatch to persist completion elapsed time
+ l_librbd_pwl_log_op_dis_to_cmp_t_hist, // Histogram of dispatch to persist completion elapsed time
+
+ l_librbd_pwl_log_op_buf_to_app_t, // data buf persist + append wait time
+ l_librbd_pwl_log_op_buf_to_bufc_t,// data buf persist / replicate elapsed time
+ l_librbd_pwl_log_op_buf_to_bufc_t_hist,// data buf persist time vs bytes histogram
+ l_librbd_pwl_log_op_app_to_cmp_t, // log entry append + completion wait time
+ l_librbd_pwl_log_op_app_to_appc_t, // log entry append / replicate elapsed time
+ l_librbd_pwl_log_op_app_to_appc_t_hist, // log entry append time (vs. op bytes) histogram
+
+ l_librbd_pwl_discard,
+ l_librbd_pwl_discard_bytes,
+ l_librbd_pwl_discard_latency,
+
+ l_librbd_pwl_aio_flush,
+ l_librbd_pwl_aio_flush_def,
+ l_librbd_pwl_aio_flush_latency,
+ l_librbd_pwl_ws,
+ l_librbd_pwl_ws_bytes, // Bytes modified by write same, probably much larger than WS payload bytes
+ l_librbd_pwl_ws_latency,
+
+ l_librbd_pwl_cmp,
+ l_librbd_pwl_cmp_bytes,
+ l_librbd_pwl_cmp_latency,
+ l_librbd_pwl_cmp_fails,
+
+ l_librbd_pwl_internal_flush,
+ l_librbd_pwl_writeback_latency,
+ l_librbd_pwl_invalidate_cache,
+ l_librbd_pwl_invalidate_discard_cache,
+
+ l_librbd_pwl_append_tx_t,
+ l_librbd_pwl_retire_tx_t,
+ l_librbd_pwl_append_tx_t_hist,
+ l_librbd_pwl_retire_tx_t_hist,
+
+ l_librbd_pwl_last,
+};
+
+enum {
+ WRITE_LOG_CACHE_ENTRY_VALID = 1U << 0, /* if 0, this entry is free */
+ WRITE_LOG_CACHE_ENTRY_SYNC_POINT = 1U << 1, /* No data. No write sequence number.
+ Marks sync point for this sync gen number */
+ WRITE_LOG_CACHE_ENTRY_SEQUENCED = 1U << 2, /* write sequence number is valid */
+ WRITE_LOG_CACHE_ENTRY_HAS_DATA = 1U << 3, /* write_data field is valid (else ignore) */
+ WRITE_LOG_CACHE_ENTRY_DISCARD = 1U << 4, /* has_data will be 0 if this is a discard */
+ WRITE_LOG_CACHE_ENTRY_WRITESAME = 1U << 5, /* ws_datalen indicates length of data at write_bytes */
+};
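+
+/* Worked example: flags = WRITE_LOG_CACHE_ENTRY_VALID |
+ * WRITE_LOG_CACHE_ENTRY_SEQUENCED | WRITE_LOG_CACHE_ENTRY_HAS_DATA
+ * (0x1 | 0x4 | 0x8 = 0xD) describes a basic write: the SYNC_POINT,
+ * DISCARD and WRITESAME bits are clear, so is_write() below is true. */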
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+class ImageExtentBuf;
+
+const int IN_FLIGHT_FLUSH_WRITE_LIMIT = 64;
+const int IN_FLIGHT_FLUSH_BYTES_LIMIT = (1 * 1024 * 1024);
+
+/* Limit work between sync points */
+const uint64_t MAX_WRITES_PER_SYNC_POINT = 256;
+const uint64_t MAX_BYTES_PER_SYNC_POINT = (1024 * 1024 * 8);
+
+const uint32_t MIN_WRITE_ALLOC_SIZE = 512;
+const uint32_t MIN_WRITE_ALLOC_SSD_SIZE = 4096;
+const uint32_t LOG_STATS_INTERVAL_SECONDS = 5;
+
+/**** Write log entries ****/
+const unsigned long int MAX_ALLOC_PER_TRANSACTION = 8;
+const unsigned long int MAX_FREE_PER_TRANSACTION = 1;
+const unsigned int MAX_CONCURRENT_WRITES = (1024 * 1024);
+
+const uint64_t DEFAULT_POOL_SIZE = 1u<<30;
+const uint64_t MIN_POOL_SIZE = DEFAULT_POOL_SIZE;
+const uint64_t POOL_SIZE_ALIGN = 1 << 20;
+constexpr double USABLE_SIZE = (7.0 / 10);
+const uint64_t BLOCK_ALLOC_OVERHEAD_BYTES = 16;
+const uint8_t RWL_LAYOUT_VERSION = 1;
+const uint8_t SSD_LAYOUT_VERSION = 1;
+const uint64_t MAX_LOG_ENTRIES = (1024 * 1024);
+const double AGGRESSIVE_RETIRE_HIGH_WATER = 0.75;
+const double RETIRE_HIGH_WATER = 0.50;
+const double RETIRE_LOW_WATER = 0.40;
+const int RETIRE_BATCH_TIME_LIMIT_MS = 250;
+const uint64_t CONTROL_BLOCK_MAX_LOG_ENTRIES = 32;
+const uint64_t SPAN_MAX_DATA_LEN = (16 * 1024 * 1024);
+
+/* offset of ring on SSD */
+const uint64_t DATA_RING_BUFFER_OFFSET = 8192;
+
+/* Defer a set of Contexts until destruct/exit. Used for deferring
+ * work on a given thread until a required lock is dropped. */
+class DeferredContexts {
+private:
+ std::vector<Context*> contexts;
+public:
+ ~DeferredContexts();
+ void add(Context* ctx);
+};
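+
+/* Usage sketch: declare the DeferredContexts ahead of the lock guard so
+ * the guard destructs (dropping the lock) first and the queued contexts
+ * run last:
+ *
+ *   {
+ *     DeferredContexts post_unlock;
+ *     std::lock_guard locker(m_lock);
+ *     post_unlock.add(ctx);  // ctx completes after m_lock is released
+ *   }                        // ~DeferredContexts() finishes ctx here
+ */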
+
+/* Pmem structures */
+#ifdef WITH_RBD_RWL
+POBJ_LAYOUT_BEGIN(rbd_pwl);
+POBJ_LAYOUT_ROOT(rbd_pwl, struct WriteLogPoolRoot);
+POBJ_LAYOUT_TOID(rbd_pwl, uint8_t);
+POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogCacheEntry);
+POBJ_LAYOUT_END(rbd_pwl);
+#endif
+
+struct WriteLogCacheEntry {
+ uint64_t sync_gen_number = 0;
+ uint64_t write_sequence_number = 0;
+ uint64_t image_offset_bytes;
+ uint64_t write_bytes;
+ #ifdef WITH_RBD_RWL
+ TOID(uint8_t) write_data;
+ #endif
+ #ifdef WITH_RBD_SSD_CACHE
+ uint64_t write_data_pos = 0; /* SSD data offset */
+ #endif
+ uint8_t flags = 0;
+ uint32_t ws_datalen = 0; /* Length of data buffer (writesame only) */
+ uint32_t entry_index = 0; /* For debug consistency check. Can be removed if
+ * we need the space */
+ WriteLogCacheEntry(uint64_t image_offset_bytes=0, uint64_t write_bytes=0)
+ : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes) {}
+ BlockExtent block_extent();
+ uint64_t get_offset_bytes();
+ uint64_t get_write_bytes();
+ bool is_entry_valid() const {
+ return flags & WRITE_LOG_CACHE_ENTRY_VALID;
+ }
+ bool is_sync_point() const {
+ return flags & WRITE_LOG_CACHE_ENTRY_SYNC_POINT;
+ }
+ bool is_sequenced() const {
+ return flags & WRITE_LOG_CACHE_ENTRY_SEQUENCED;
+ }
+ bool has_data() const {
+ return flags & WRITE_LOG_CACHE_ENTRY_HAS_DATA;
+ }
+ bool is_discard() const {
+ return flags & WRITE_LOG_CACHE_ENTRY_DISCARD;
+ }
+ bool is_writesame() const {
+ return flags & WRITE_LOG_CACHE_ENTRY_WRITESAME;
+ }
+ bool is_write() const {
+ /* Log entry is a basic write */
+ return !is_sync_point() && !is_discard() && !is_writesame();
+ }
+ bool is_writer() const {
+ /* Log entry is any type that writes data */
+ return is_write() || is_discard() || is_writesame();
+ }
+ void set_entry_valid(bool flag) {
+ if (flag) {
+ flags |= WRITE_LOG_CACHE_ENTRY_VALID;
+ } else {
+ flags &= ~WRITE_LOG_CACHE_ENTRY_VALID;
+ }
+ }
+ void set_sync_point(bool flag) {
+ if (flag) {
+ flags |= WRITE_LOG_CACHE_ENTRY_SYNC_POINT;
+ } else {
+ flags &= ~WRITE_LOG_CACHE_ENTRY_SYNC_POINT;
+ }
+ }
+ void set_sequenced(bool flag) {
+ if (flag) {
+ flags |= WRITE_LOG_CACHE_ENTRY_SEQUENCED;
+ } else {
+ flags &= ~WRITE_LOG_CACHE_ENTRY_SEQUENCED;
+ }
+ }
+ void set_has_data(bool flag) {
+ if (flag) {
+ flags |= WRITE_LOG_CACHE_ENTRY_HAS_DATA;
+ } else {
+ flags &= ~WRITE_LOG_CACHE_ENTRY_HAS_DATA;
+ }
+ }
+ void set_discard(bool flag) {
+ if (flag) {
+ flags |= WRITE_LOG_CACHE_ENTRY_DISCARD;
+ } else {
+ flags &= ~WRITE_LOG_CACHE_ENTRY_DISCARD;
+ }
+ }
+ void set_writesame(bool flag) {
+ if (flag) {
+ flags |= WRITE_LOG_CACHE_ENTRY_WRITESAME;
+ } else {
+ flags &= ~WRITE_LOG_CACHE_ENTRY_WRITESAME;
+ }
+ }
+ friend std::ostream& operator<<(std::ostream& os,
+ const WriteLogCacheEntry &entry);
+ #ifdef WITH_RBD_SSD_CACHE
+ DENC(WriteLogCacheEntry, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.sync_gen_number, p);
+ denc(v.write_sequence_number, p);
+ denc(v.image_offset_bytes, p);
+ denc(v.write_bytes, p);
+ denc(v.write_data_pos, p);
+ denc(v.flags, p);
+ denc(v.ws_datalen, p);
+ denc(v.entry_index, p);
+ DENC_FINISH(p);
+ }
+ #endif
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(list<WriteLogCacheEntry*>& ls);
+};
+
+struct WriteLogPoolRoot {
+ #ifdef WITH_RBD_RWL
+ union {
+ struct {
+ uint8_t layout_version; /* Version of this structure (RWL_LAYOUT_VERSION) */
+ };
+ uint64_t _u64;
+ } header;
+ TOID(struct WriteLogCacheEntry) log_entries; /* contiguous array of log entries */
+ #endif
+ #ifdef WITH_RBD_SSD_CACHE
+ uint64_t layout_version = 0;
+ uint64_t cur_sync_gen = 0;
+ #endif
+ uint64_t pool_size;
+ uint64_t flushed_sync_gen; /* All writing entries with this or a lower
+ * sync gen number are flushed. */
+ uint32_t block_size; /* block size */
+ uint32_t num_log_entries;
+ uint64_t first_free_entry; /* Entry following the newest valid entry */
+ uint64_t first_valid_entry; /* Index of the oldest valid entry in the log */
+
+ #ifdef WITH_RBD_SSD_CACHE
+ DENC(WriteLogPoolRoot, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.layout_version, p);
+ denc(v.cur_sync_gen, p);
+ denc(v.pool_size, p);
+ denc(v.flushed_sync_gen, p);
+ denc(v.block_size, p);
+ denc(v.num_log_entries, p);
+ denc(v.first_free_entry, p);
+ denc(v.first_valid_entry, p);
+ DENC_FINISH(p);
+ }
+ #endif
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(list<WriteLogPoolRoot*>& ls);
+};
+
+struct WriteBufferAllocation {
+ unsigned int allocation_size = 0;
+ #ifdef WITH_RBD_RWL
+ pobj_action buffer_alloc_action;
+ TOID(uint8_t) buffer_oid = OID_NULL;
+ #endif
+ bool allocated = false;
+ utime_t allocation_lat;
+};
+
+static inline io::Extent image_extent(const BlockExtent& block_extent) {
+ return io::Extent(block_extent.block_start,
+ block_extent.block_end - block_extent.block_start);
+}
+
+template <typename ExtentsType>
+class ExtentsSummary {
+public:
+ uint64_t total_bytes;
+ uint64_t first_image_byte;
+ uint64_t last_image_byte;
+ explicit ExtentsSummary(const ExtentsType &extents);
+ friend std::ostream &operator<<(std::ostream &os,
+ const ExtentsSummary &s) {
+ os << "total_bytes=" << s.total_bytes << ", "
+ << "first_image_byte=" << s.first_image_byte << ", "
+ << "last_image_byte=" << s.last_image_byte << "";
+ return os;
+ }
+ BlockExtent block_extent() {
+ return BlockExtent(first_image_byte, last_image_byte);
+ }
+ io::Extent image_extent() {
+ return librbd::cache::pwl::image_extent(block_extent());
+ }
+};
+
+io::Extent whole_volume_extent();
+
+BlockExtent block_extent(const io::Extent& image_extent);
+
+Context * override_ctx(int r, Context *ctx);
+
+class ImageExtentBuf : public io::Extent {
+public:
+ bufferlist m_bl;
+ bool need_to_truncate = false;
+ uint64_t truncate_offset = 0;
+ bool writesame = false;
+ ImageExtentBuf() {}
+ ImageExtentBuf(io::Extent extent,
+ bool need_to_truncate = false, uint64_t truncate_offset = 0,
+ bool writesame = false)
+ : io::Extent(extent), need_to_truncate(need_to_truncate),
+ truncate_offset(truncate_offset), writesame(writesame) {}
+ ImageExtentBuf(io::Extent extent, bufferlist bl,
+ bool need_to_truncate = false, uint64_t truncate_offset = 0,
+ bool writesame = false)
+ : io::Extent(extent), m_bl(bl), need_to_truncate(need_to_truncate),
+ truncate_offset(truncate_offset), writesame(writesame) {}
+};
+
+std::string unique_lock_name(const std::string &name, void *address);
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#ifdef WITH_RBD_SSD_CACHE
+WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogCacheEntry)
+WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogPoolRoot)
+#endif
+
+#endif // CEPH_LIBRBD_CACHE_PWL_TYPES_H
diff --git a/src/librbd/cache/pwl/rwl/Builder.h b/src/librbd/cache/pwl/rwl/Builder.h
new file mode 100644
index 000000000..c13c7b5ae
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/Builder.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
+
+#include <iostream>
+#include "LogEntry.h"
+#include "ReadRequest.h"
+#include "Request.h"
+#include "LogOperation.h"
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/Builder.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename T>
+class Builder : public pwl::Builder<T> {
+public:
+ std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes) override {
+ return std::make_shared<WriteLogEntry>(image_offset_bytes, write_bytes);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes) override {
+ return std::make_shared<WriteLogEntry>(
+ sync_point_entry, image_offset_bytes, write_bytes);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) override {
+ return std::make_shared<WriteSameLogEntry>(
+ image_offset_bytes, write_bytes, data_length);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) override {
+ return std::make_shared<WriteSameLogEntry>(
+ sync_point_entry, image_offset_bytes, write_bytes, data_length);
+ }
+ pwl::C_WriteRequest<T> *create_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req);
+ }
+ pwl::C_WriteSameRequest<T> *create_writesame_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_WriteSameRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req);
+ }
+ pwl::C_WriteRequest<T> *create_comp_and_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new rwl::C_CompAndWriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req);
+ }
+ std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> write_log_entry) {
+ return std::make_shared<WriteLogOperation>(
+ set, image_offset_bytes, write_bytes, cct, write_log_entry);
+ }
+ std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry) {
+ return std::make_shared<WriteLogOperation>(
+ set, image_offset_bytes, write_bytes, data_len, cct,
+ writesame_log_entry);
+ }
+ std::shared_ptr<pwl::DiscardLogOperation> create_discard_log_operation(
+ std::shared_ptr<SyncPoint> sync_point, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t discard_granularity_bytes,
+ utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct) {
+ return std::make_shared<DiscardLogOperation>(
+ sync_point, image_offset_bytes, write_bytes, discard_granularity_bytes,
+ dispatch_time, perfcounter, cct);
+ }
+ C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived,
+ PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) {
+ return new C_ReadRequest(cct, arrived, perfcounter, bl, on_finish);
+ }
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
diff --git a/src/librbd/cache/pwl/rwl/LogEntry.cc b/src/librbd/cache/pwl/rwl/LogEntry.cc
new file mode 100644
index 000000000..056701fb5
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/LogEntry.cc
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/ImageWriteback.h"
+#include "LogEntry.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::WriteLogEntry: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+void WriteLogEntry::writeback(
+ librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+ /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the
+ * bl even after flush()). */
+ bufferlist entry_bl;
+ buffer::list entry_bl_copy;
+ copy_cache_bl(&entry_bl_copy);
+ entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
+ image_writeback.aio_write({{ram_entry.image_offset_bytes,
+ ram_entry.write_bytes}},
+ std::move(entry_bl), 0, ctx);
+}
+
+void WriteLogEntry::init_cache_bp() {
+ ceph_assert(!this->cache_bp.have_raw());
+ cache_bp = buffer::ptr(buffer::create_static(this->write_bytes(),
+ (char*)this->cache_buffer));
+}
+
+void WriteLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) {
+ if (!is_writesame) {
+ bl.append(bp);
+ return;
+ }
+ for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) {
+ bl.append(bp);
+ }
+ int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen;
+ if (trailing_partial) {
+ bl.append(bp, 0, trailing_partial);
+ }
+}
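+
+/* Worked example: a writesame entry with ram_entry.write_bytes=10 and
+ * ram_entry.ws_datalen=4 appends the 4-byte pattern twice (8 bytes) plus
+ * a 2-byte trailing partial, producing a 10-byte bufferlist. */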
+
+void WriteLogEntry::init_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) {
+ this->ram_entry.write_data = allocation->buffer_oid;
+ ceph_assert(!TOID_IS_NULL(this->ram_entry.write_data));
+ cache_buffer = D_RW(this->ram_entry.write_data);
+}
+
+buffer::list& WriteLogEntry::get_cache_bl() {
+ if (0 == bl_refs) {
+ std::lock_guard locker(m_entry_bl_lock);
+ if (0 == bl_refs) {
+ // init pmem bufferlist
+ cache_bl.clear();
+ init_cache_bp();
+ ceph_assert(cache_bp.have_raw());
+ int before_bl = cache_bp.raw_nref();
+ this->init_bl(cache_bp, cache_bl);
+ int after_bl = cache_bp.raw_nref();
+ bl_refs = after_bl - before_bl;
+ }
+ ceph_assert(0 != bl_refs);
+ }
+ return cache_bl;
+}
+
+void WriteLogEntry::copy_cache_bl(bufferlist *out_bl) {
+ this->get_cache_bl();
+ // cache_bp is now initialized
+ buffer::ptr cloned_bp(cache_bp.clone());
+ out_bl->clear();
+ this->init_bl(cloned_bp, *out_bl);
+}
+
+unsigned int WriteLogEntry::reader_count() const {
+ if (cache_bp.have_raw()) {
+ return (cache_bp.raw_nref() - bl_refs - 1);
+ } else {
+ return 0;
+ }
+}
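+
+/* Reference-count sketch (assumed buffer::ptr semantics): the raw pmem
+ * buffer holds one ref for cache_bp itself plus the bl_refs refs taken
+ * when cache_bl was built, so any refs beyond (bl_refs + 1) belong to
+ * readers still holding bufferlists obtained from this entry. */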
+
+void WriteSameLogEntry::writeback(
+ librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+ bufferlist entry_bl;
+ buffer::list entry_bl_copy;
+ copy_cache_bl(&entry_bl_copy);
+ entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
+ image_writeback.aio_writesame(ram_entry.image_offset_bytes,
+ ram_entry.write_bytes,
+ std::move(entry_bl), 0, ctx);
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/rwl/LogEntry.h b/src/librbd/cache/pwl/rwl/LogEntry.h
new file mode 100644
index 000000000..a4675c5fb
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/LogEntry.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
+
+#include "librbd/cache/pwl/LogEntry.h"
+
+namespace librbd {
+namespace cache {
+class ImageWritebackInterface;
+namespace pwl {
+namespace rwl {
+
+class WriteLogEntry : public pwl::WriteLogEntry {
+public:
+ WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes)
+ : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {}
+ WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
+ : pwl::WriteLogEntry(image_offset_bytes, write_bytes) {}
+ WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes,
+ data_length) {}
+ WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : pwl::WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteLogEntry() {}
+ WriteLogEntry(const WriteLogEntry&) = delete;
+ WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+
+ void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) override;
+ void init_cache_bp() override;
+ void init_bl(buffer::ptr &bp, buffer::list &bl) override;
+ void init_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) override;
+ buffer::list &get_cache_bl() override;
+ void copy_cache_bl(bufferlist *out_bl) override;
+ unsigned int reader_count() const override;
+};
+
+class WriteSameLogEntry : public WriteLogEntry {
+public:
+ WriteSameLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes,
+ data_length) {}
+ WriteSameLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteSameLogEntry() {}
+ WriteSameLogEntry(const WriteSameLogEntry&) = delete;
+ WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
+
+ void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
diff --git a/src/librbd/cache/pwl/rwl/LogOperation.cc b/src/librbd/cache/pwl/rwl/LogOperation.cc
new file mode 100644
index 000000000..53fb917b2
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/LogOperation.cc
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LogOperation.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::LogOperation: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+void WriteLogOperation::copy_bl_to_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) {
+ /* The operation is managed by a shared_ptr, so pointers into its log
+ * entry are only valid while the operation remains in scope */
+ bufferlist::iterator i(&bl);
+ m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes());
+ ldout(m_cct, 20) << bl << dendl;
+ log_entry->init_cache_buffer(allocation);
+ i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->cache_buffer);
+}
+
+void DiscardLogOperation::init_op(
+ uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num, Context *write_persist,
+ Context *write_append) {
+ log_entry->init(current_sync_gen, persist_on_flush, last_op_sequence_num);
+ this->on_write_append = write_append;
+ this->on_write_persist = write_persist;
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/rwl/LogOperation.h b/src/librbd/cache/pwl/rwl/LogOperation.h
new file mode 100644
index 000000000..874ac77fb
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/LogOperation.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
+
+#include "librbd/cache/pwl/LogOperation.h"
+
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+class WriteLogOperation : public pwl::WriteLogOperation {
+public:
+ WriteLogOperation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> write_log_entry)
+ : pwl::WriteLogOperation(set, image_offset_bytes, write_bytes, cct,
+ write_log_entry) {}
+
+ WriteLogOperation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry)
+ : pwl::WriteLogOperation(set, image_offset_bytes, write_bytes, cct,
+ writesame_log_entry) {}
+
+ void copy_bl_to_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) override;
+};
+
+class DiscardLogOperation : public pwl::DiscardLogOperation {
+public:
+ DiscardLogOperation(
+ std::shared_ptr<SyncPoint> sync_point, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t discard_granularity_bytes,
+ utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct)
+ : pwl::DiscardLogOperation(sync_point, image_offset_bytes, write_bytes,
+ discard_granularity_bytes, dispatch_time,
+ perfcounter, cct) {}
+ void init_op(
+ uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num, Context *write_persist,
+ Context *write_append) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
diff --git a/src/librbd/cache/pwl/rwl/ReadRequest.cc b/src/librbd/cache/pwl/rwl/ReadRequest.cc
new file mode 100644
index 000000000..f91f8e5a7
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/ReadRequest.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ReadRequest.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::ReadRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+void C_ReadRequest::finish(int r) {
+ ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl;
+ int hits = 0;
+ int misses = 0;
+ int hit_bytes = 0;
+ int miss_bytes = 0;
+ if (r >= 0) {
+ /*
+ * At this point the miss read has completed. We'll iterate through
+ * read_extents and produce *m_out_bl by assembling pieces of miss_bl
+ * and the individual hit extent bufs in the read extents that represent
+ * hits.
+ */
+ uint64_t miss_bl_offset = 0;
+ for (auto extent : read_extents) {
+ if (extent->m_bl.length()) {
+ /* This was a hit */
+ ceph_assert(extent->second == extent->m_bl.length());
+ ++hits;
+ hit_bytes += extent->second;
+ m_out_bl->claim_append(extent->m_bl);
+ } else {
+ /* This was a miss. */
+ ++misses;
+ miss_bytes += extent->second;
+ bufferlist miss_extent_bl;
+ miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent->second);
+ /* Add this read miss bufferlist to the output bufferlist */
+ m_out_bl->claim_append(miss_extent_bl);
+ /* Consume these bytes in the read miss bufferlist */
+ miss_bl_offset += extent->second;
+ }
+ }
+ }
+ ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl;
+ utime_t now = ceph_clock_now();
+ ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes);
+ m_on_finish->complete(r);
+ m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes);
+ m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes);
+ m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time);
+ if (!misses) {
+ m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1);
+ m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time);
+ } else {
+ if (hits) {
+ m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1);
+ }
+ }
+}
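+
+/* Worked example (illustrative): a 12 KiB read whose middle 4 KiB hits
+ * the cache arrives here as three read_extents. The two misses are
+ * filled from consecutive slices of miss_bl (miss_bl_offset advancing by
+ * each miss length), the hit from its own m_bl, and claim_append()
+ * stitches the pieces back together in image order. */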
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/rwl/ReadRequest.h b/src/librbd/cache/pwl/rwl/ReadRequest.h
new file mode 100644
index 000000000..25168e83b
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/ReadRequest.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H
+
+#include "librbd/cache/pwl/ReadRequest.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+typedef std::vector<pwl::ImageExtentBuf> ImageExtentBufs;
+
+class C_ReadRequest : public pwl::C_ReadRequest {
+protected:
+ using pwl::C_ReadRequest::m_cct;
+ using pwl::C_ReadRequest::m_on_finish;
+ using pwl::C_ReadRequest::m_out_bl;
+ using pwl::C_ReadRequest::m_arrived_time;
+ using pwl::C_ReadRequest::m_perfcounter;
+public:
+ C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish)
+ : pwl::C_ReadRequest(cct, arrived, perfcounter, out_bl, on_finish) {}
+ void finish(int r) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H
diff --git a/src/librbd/cache/pwl/rwl/Request.cc b/src/librbd/cache/pwl/rwl/Request.cc
new file mode 100644
index 000000000..091581272
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/Request.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::Request: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename T>
+void C_WriteRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+
+ ceph_assert(!this->m_resources.allocated);
+
+ auto image_extents_size = this->image_extents.size();
+ this->m_resources.buffers.reserve(image_extents_size);
+
+ *bytes_cached = 0;
+ *bytes_allocated = 0;
+ *number_lanes = image_extents_size;
+ *number_log_entries = image_extents_size;
+ *number_unpublished_reserves = image_extents_size;
+
+ for (auto &extent : this->image_extents) {
+ this->m_resources.buffers.emplace_back();
+ struct WriteBufferAllocation &buffer = this->m_resources.buffers.back();
+ buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
+ buffer.allocated = false;
+ *bytes_cached += extent.second;
+ if (extent.second > buffer.allocation_size) {
+ buffer.allocation_size = extent.second;
+ }
+ *bytes_allocated += buffer.allocation_size;
+ }
+ *bytes_dirtied = *bytes_cached;
+}
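+
+/* Worked example (MIN_WRITE_ALLOC_SIZE is 512 in pwl/Types.h): extents
+ * of 100 and 4096 bytes give bytes_cached = bytes_dirtied = 4196,
+ * bytes_allocated = 512 + 4096 = 4608, and a lane, log entry and
+ * unpublished reserve count of 2 each. */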
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<T> &req) {
+ os << (C_WriteRequest<T>&)req
+ << "cmp_bl=" << req.cmp_bl << ", "
+ << "read_bl=" << req.read_bl << ", "
+ << "compare_succeeded=" << req.compare_succeeded << ", "
+ << "mismatch_offset=" << req.mismatch_offset;
+ return os;
+}
+
+template <typename T>
+void C_WriteSameRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+ ceph_assert(this->image_extents.size() == 1);
+ *number_log_entries = 1;
+ *bytes_dirtied += this->image_extents[0].second;
+ auto pattern_length = this->bl.length();
+ this->m_resources.buffers.emplace_back();
+ struct WriteBufferAllocation &buffer = this->m_resources.buffers.back();
+ buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
+ buffer.allocated = false;
+ *bytes_cached += pattern_length;
+ if (pattern_length > buffer.allocation_size) {
+ buffer.allocation_size = pattern_length;
+ }
+ *bytes_allocated += buffer.allocation_size;
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::rwl::C_WriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::rwl::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::rwl::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
diff --git a/src/librbd/cache/pwl/rwl/Request.h b/src/librbd/cache/pwl/rwl/Request.h
new file mode 100644
index 000000000..0a5c610d6
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/Request.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+
+#include "librbd/cache/pwl/Request.h"
+
+namespace librbd {
+class BlockGuardCell;
+
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename T>
+class C_WriteRequest : public pwl::C_WriteRequest<T> {
+public:
+ C_WriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ C_WriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req) {}
+protected:
+ // Plain writes will allocate one buffer per request extent
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+};
+
+template <typename T>
+class C_CompAndWriteRequest : public C_WriteRequest<T> {
+public:
+ C_CompAndWriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ const char *get_name() const override {
+ return "C_CompAndWriteRequest";
+ }
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<U> &req);
+};
+
+template <typename T>
+class C_WriteSameRequest : public pwl::C_WriteSameRequest<T> {
+public:
+ C_WriteSameRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteSameRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H
diff --git a/src/librbd/cache/pwl/rwl/WriteLog.cc b/src/librbd/cache/pwl/rwl/WriteLog.cc
new file mode 100644
index 000000000..dd623c9ad
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/WriteLog.cc
@@ -0,0 +1,1014 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "WriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/plugin/Api.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::WriteLog: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+using namespace librbd::cache::pwl;
+namespace rwl {
+
+const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION;
+
+template <typename I>
+Builder<AbstractWriteLog<I>>* WriteLog<I>::create_builder() {
+ m_builderobj = new Builder<This>();
+ return m_builderobj;
+}
+
+template <typename I>
+WriteLog<I>::WriteLog(
+ I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+ ImageWritebackInterface& image_writeback,
+ plugin::Api<I>& plugin_api)
+: AbstractWriteLog<I>(image_ctx, cache_state, create_builder(), image_writeback,
+ plugin_api),
+ m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl))
+{
+}
+
+template <typename I>
+WriteLog<I>::~WriteLog() {
+ m_log_pool = nullptr;
+ delete m_builderobj;
+}
+
+template <typename I>
+void WriteLog<I>::collect_read_extents(
+ uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+ Extent hit_extent, pwl::C_ReadRequest *read_ctx) {
+ /* Make a bl for this hit extent. This will add references to the
+ * write_entry->pmem_bp */
+ buffer::list hit_bl;
+
+ /* Create buffer object referring to pmem pool for this read hit */
+ auto write_entry = map_entry.log_entry;
+
+ buffer::list entry_bl_copy;
+ write_entry->copy_cache_bl(&entry_bl_copy);
+ entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
+ ceph_assert(hit_bl.length() == entry_hit_length);
+
+ /* Add hit extent to read extents */
+ auto hit_extent_buf = std::make_shared<ImageExtentBuf>(hit_extent, hit_bl);
+ read_ctx->read_extents.push_back(hit_extent_buf);
+}
+
+template <typename I>
+void WriteLog<I>::complete_read(
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read, Context *ctx) {
+ ctx->complete(0);
+}
+
+/*
+ * Allocate the (already reserved) write log entries for a set of operations.
+ *
+ * Locking:
+ * Acquires lock
+ */
+template <typename I>
+void WriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops)
+{
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ struct WriteLogCacheEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
+
+ ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
+
+ /* Allocate the (already reserved) log entries */
+ std::unique_lock locker(m_lock);
+
+ for (auto &operation : ops) {
+ uint32_t entry_index = this->m_first_free_entry;
+ this->m_first_free_entry = (this->m_first_free_entry + 1) % this->m_total_log_entries;
+ auto &log_entry = operation->get_log_entry();
+ log_entry->log_entry_index = entry_index;
+ log_entry->ram_entry.entry_index = entry_index;
+ log_entry->cache_entry = &pmem_log_entries[entry_index];
+ log_entry->ram_entry.set_entry_valid(true);
+ m_log_entries.push_back(log_entry);
+ ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
+ }
+ if (m_cache_state->empty && !m_log_entries.empty()) {
+ m_cache_state->empty = false;
+ this->update_image_cache_state();
+ this->write_image_cache_state(locker);
+ }
+}
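+
+// The index arithmetic above is plain modular ring advance with one slot
+// always held back. A minimal sketch (hypothetical helper, not used here):
+//
+//   uint32_t ring_next(uint32_t index, uint32_t total) {
+//     return (index + 1) % total;   // wrap past the last slot
+//   }
+//
+// With m_total_log_entries = 10 and m_first_free_entry = 9, the entry is
+// allocated at index 9 and first_free wraps to 0.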
+
+/*
+ * Write and persist the (already allocated) write log entries and
+ * data buffer allocations for a set of ops. The data buffer for each
+ * of these must already have been persisted to its reserved area.
+ */
+template <typename I>
+int WriteLog<I>::append_op_log_entries(GenericLogOperations &ops)
+{
+ CephContext *cct = m_image_ctx.cct;
+ GenericLogOperationsVector entries_to_flush;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ int ret = 0;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
+
+ if (ops.empty()) {
+ return 0;
+ }
+ entries_to_flush.reserve(OPS_APPENDED_TOGETHER);
+
+ /* Write log entries to ring and persist */
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ if (!entries_to_flush.empty()) {
+ /* Flush these and reset the list if the current entry wraps to the
+ * tail of the ring */
+ if (entries_to_flush.back()->get_log_entry()->log_entry_index >
+ operation->get_log_entry()->log_entry_index) {
+ ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at "
+ << "operation=[" << *operation << "]" << dendl;
+ flush_op_log_entries(entries_to_flush);
+ entries_to_flush.clear();
+ now = ceph_clock_now();
+ }
+ }
+ ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "from " << &operation->get_log_entry()->ram_entry << " "
+ << "to " << operation->get_log_entry()->cache_entry << " "
+ << "operation=[" << *operation << "]" << dendl;
+    ldout(m_image_ctx.cct, 5) << "APPENDING: index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "operation=[" << *operation << "]" << dendl;
+ operation->log_append_start_time = now;
+ *operation->get_log_entry()->cache_entry = operation->get_log_entry()->ram_entry;
+ ldout(m_image_ctx.cct, 20) << "APPENDING: index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "pmem_entry=[" << *operation->get_log_entry()->cache_entry
+ << "]" << dendl;
+ entries_to_flush.push_back(operation);
+ }
+ flush_op_log_entries(entries_to_flush);
+
+ /* Drain once for all */
+ pmemobj_drain(m_log_pool);
+
+ /*
+ * Atomically advance the log head pointer and publish the
+ * allocations for all the data buffers they refer to.
+ */
+ utime_t tx_start = ceph_clock_now();
+ TX_BEGIN(m_log_pool) {
+ D_RW(pool_root)->first_free_entry = this->m_first_free_entry;
+ for (auto &operation : ops) {
+ if (operation->reserved_allocated()) {
+        auto write_op = static_pointer_cast<WriteLogOperation>(operation);
+ pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1);
+ } else {
+ ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
+ }
+ }
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(cct) << "failed to commit " << ops.size()
+ << " log entries (" << this->m_log_pool_name << ")" << dendl;
+ ceph_assert(false);
+ ret = -EIO;
+ } TX_FINALLY {
+ } TX_END;
+
+ utime_t tx_end = ceph_clock_now();
+ m_perfcounter->tinc(l_librbd_pwl_append_tx_t, tx_end - tx_start);
+ m_perfcounter->hinc(
+ l_librbd_pwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size());
+ for (auto &operation : ops) {
+ operation->log_append_comp_time = tx_end;
+ }
+
+ return ret;
+}
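+
+// The append path relies on the libpmemobj transaction macros: stores made
+// between TX_BEGIN and TX_END to snapshotted memory become durable
+// atomically or are rolled back together. A minimal sketch of the pattern,
+// assuming an open pool `pop` and a root object with a counter field:
+//
+//   TX_BEGIN(pop) {
+//     TX_ADD(root);                  // undo-log the root before writing
+//     D_RW(root)->counter += 1;      // durable iff the tx commits
+//   } TX_ONABORT {
+//     int err = pmemobj_tx_errno(); // inspect the failure
+//   } TX_END
+//
+// Buffers obtained with pmemobj_reserve() stay invisible until
+// pmemobj_tx_publish() runs inside such a transaction, which is how the
+// log head and its data buffers advance together above.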
+
+/*
+ * Flush the persistent write log entries for a set of ops. The entries
+ * must be contiguous in persistent memory.
+ */
+template <typename I>
+void WriteLog<I>::flush_op_log_entries(GenericLogOperationsVector &ops)
+{
+ if (ops.empty()) {
+ return;
+ }
+
+ if (ops.size() > 1) {
+ ceph_assert(ops.front()->get_log_entry()->cache_entry < ops.back()->get_log_entry()->cache_entry);
+ }
+
+ ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " "
+ << "start address="
+ << ops.front()->get_log_entry()->cache_entry << " "
+ << "bytes="
+ << ops.size() * sizeof(*(ops.front()->get_log_entry()->cache_entry))
+ << dendl;
+ pmemobj_flush(m_log_pool,
+ ops.front()->get_log_entry()->cache_entry,
+ ops.size() * sizeof(*(ops.front()->get_log_entry()->cache_entry)));
+}
+
+template <typename I>
+void WriteLog<I>::remove_pool_file() {
+ if (m_log_pool) {
+ ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl;
+ pmemobj_close(m_log_pool);
+ }
+  if (m_cache_state->clean) {
+    ldout(m_image_ctx.cct, 5) << "Removing clean pool file: " << this->m_log_pool_name << dendl;
+    if (remove(this->m_log_pool_name.c_str()) != 0) {
+      lderr(m_image_ctx.cct) << "failed to remove clean pool \"" << this->m_log_pool_name << "\": "
+                             << pmemobj_errormsg() << dendl;
+ } else {
+ m_cache_state->present = false;
+ }
+ } else {
+ ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << this->m_log_pool_name << dendl;
+ }
+}
+
+template <typename I>
+bool WriteLog<I>::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) {
+ CephContext *cct = m_image_ctx.cct;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
+ if ((m_log_pool =
+ pmemobj_create(this->m_log_pool_name.c_str(),
+ this->m_pwl_pool_layout_name,
+ this->m_log_pool_size,
+ (S_IWUSR | S_IRUSR))) == NULL) {
+ lderr(cct) << "failed to create pool (" << this->m_log_pool_name << ")"
+ << pmemobj_errormsg() << dendl;
+ m_cache_state->present = false;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ /* TODO: filter/replace errnos that are meaningless to the caller */
+ on_finish->complete(-errno);
+ return false;
+ }
+ m_cache_state->present = true;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+ /* new pool, calculate and store metadata */
+ size_t effective_pool_size = (size_t)(this->m_log_pool_size * USABLE_SIZE);
+ size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogCacheEntry);
+ uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
+ if (num_small_writes > MAX_LOG_ENTRIES) {
+ num_small_writes = MAX_LOG_ENTRIES;
+ }
+ if (num_small_writes <= 2) {
+ lderr(cct) << "num_small_writes needs to > 2" << dendl;
+ on_finish->complete(-EINVAL);
+ return false;
+ }
+ this->m_bytes_allocated_cap = effective_pool_size;
+ /* Log ring empty */
+ m_first_free_entry = 0;
+ m_first_valid_entry = 0;
+ TX_BEGIN(m_log_pool) {
+ TX_ADD(pool_root);
+ D_RW(pool_root)->header.layout_version = RWL_LAYOUT_VERSION;
+ D_RW(pool_root)->log_entries =
+ TX_ZALLOC(struct WriteLogCacheEntry,
+ sizeof(struct WriteLogCacheEntry) * num_small_writes);
+ D_RW(pool_root)->pool_size = this->m_log_pool_size;
+ D_RW(pool_root)->flushed_sync_gen = this->m_flushed_sync_gen;
+ D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
+ D_RW(pool_root)->num_log_entries = num_small_writes;
+ D_RW(pool_root)->first_free_entry = m_first_free_entry;
+ D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
+ } TX_ONCOMMIT {
+ this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
+ this->m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
+ } TX_ONABORT {
+ this->m_total_log_entries = 0;
+ this->m_free_log_entries = 0;
+ lderr(cct) << "failed to initialize pool (" << this->m_log_pool_name << ")" << dendl;
+ on_finish->complete(-pmemobj_tx_errno());
+ return false;
+ } TX_FINALLY {
+ } TX_END;
+ } else {
+ ceph_assert(m_cache_state->present);
+ /* Open existing pool */
+ if ((m_log_pool =
+ pmemobj_open(this->m_log_pool_name.c_str(),
+ this->m_pwl_pool_layout_name)) == NULL) {
+ lderr(cct) << "failed to open pool (" << this->m_log_pool_name << "): "
+ << pmemobj_errormsg() << dendl;
+ on_finish->complete(-errno);
+ return false;
+ }
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ if (D_RO(pool_root)->header.layout_version != RWL_LAYOUT_VERSION) {
+ // TODO: will handle upgrading version in the future
+ lderr(cct) << "Pool layout version is "
+ << D_RO(pool_root)->header.layout_version
+ << " expected " << RWL_LAYOUT_VERSION << dendl;
+ on_finish->complete(-EINVAL);
+ return false;
+ }
+ if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) {
+ lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size
+ << " expected " << MIN_WRITE_ALLOC_SIZE << dendl;
+ on_finish->complete(-EINVAL);
+ return false;
+ }
+ this->m_log_pool_size = D_RO(pool_root)->pool_size;
+ this->m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen;
+ this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
+ m_first_free_entry = D_RO(pool_root)->first_free_entry;
+ m_first_valid_entry = D_RO(pool_root)->first_valid_entry;
+    if (m_first_free_entry < m_first_valid_entry) {
+      /* Valid entries wrap around the end of the ring, so first_free is lower
+       * than first_valid and the free slots are exactly the indices in
+       * [first_free, first_valid). If first_valid was == first_free+1, the
+       * entry at first_free would be empty. The last entry is never used, so
+       * in that case there would be zero free log entries. */
+      this->m_free_log_entries = (m_first_valid_entry - m_first_free_entry) - 1;
+    } else {
+      /* first_valid is <= first_free. If they are == we have zero valid log
+       * entries, and n-1 free log entries */
+      this->m_free_log_entries = this->m_total_log_entries - (m_first_free_entry - m_first_valid_entry) - 1;
+    }
+ size_t effective_pool_size = (size_t)(this->m_log_pool_size * USABLE_SIZE);
+ this->m_bytes_allocated_cap = effective_pool_size;
+ load_existing_entries(later);
+ m_cache_state->clean = this->m_dirty_log_entries.empty();
+ m_cache_state->empty = m_log_entries.empty();
+ }
+ return true;
+}
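+
+// Worked example of the free-entry arithmetic above (hypothetical numbers):
+// with m_total_log_entries = 10, first_valid = 7 and first_free = 3, the
+// valid entries occupy slots 7,8,9,0,1,2; slots 3..6 are free and one slot
+// is always held back, so m_free_log_entries = (7 - 3) - 1 = 3. In the
+// non-wrapping case, first_valid = 2 and first_free = 7 gives
+// 10 - (7 - 2) - 1 = 4 usable free slots.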
+
+/*
+ * Loads the log entries from an existing log.
+ *
+ * Creates the in-memory structures to represent the state of the
+ * re-opened log.
+ *
+ * Finds the last appended sync point, and any sync points referred to
+ * in log entries, but missing from the log. These missing sync points
+ * are created and scheduled for append. Some rudimentary consistency
+ * checking is done.
+ *
+ * Rebuilds the m_blocks_to_log_entries map, to make log entries
+ * readable.
+ *
+ * Places all writes on the dirty entries list, which causes them all
+ * to be flushed.
+ *
+ */
+
+template <typename I>
+void WriteLog<I>::load_existing_entries(DeferredContexts &later) {
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ struct WriteLogCacheEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
+ uint64_t entry_index = m_first_valid_entry;
+ /* The map below allows us to find sync point log entries by sync
+ * gen number, which is necessary so write entries can be linked to
+ * their sync points. */
+ std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
+ /* The map below tracks sync points referred to in writes but not
+ * appearing in the sync_point_entries map. We'll use this to
+ * determine which sync points are missing and need to be
+ * created. */
+ std::map<uint64_t, bool> missing_sync_points;
+
+ /*
+ * Read the existing log entries. Construct an in-memory log entry
+ * object of the appropriate type for each. Add these to the global
+ * log entries list.
+ *
+ * Write entries will not link to their sync points yet. We'll do
+ * that in the next pass. Here we'll accumulate a map of sync point
+ * gen numbers that are referred to in writes but do not appear in
+ * the log.
+ */
+ while (entry_index != m_first_free_entry) {
+ WriteLogCacheEntry *pmem_entry = &pmem_log_entries[entry_index];
+ std::shared_ptr<GenericLogEntry> log_entry = nullptr;
+ ceph_assert(pmem_entry->entry_index == entry_index);
+
+ this->update_entries(&log_entry, pmem_entry, missing_sync_points,
+ sync_point_entries, entry_index);
+
+ log_entry->ram_entry = *pmem_entry;
+ log_entry->cache_entry = pmem_entry;
+ log_entry->log_entry_index = entry_index;
+ log_entry->completed = true;
+
+ m_log_entries.push_back(log_entry);
+
+ entry_index = (entry_index + 1) % this->m_total_log_entries;
+ }
+
+ this->update_sync_points(missing_sync_points, sync_point_entries, later);
+}
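+
+// Shape of the recovery scan above (the classification itself happens in
+// update_entries() and update_sync_points() in the generic pwl layer):
+//
+//   for (uint64_t i = first_valid; i != first_free;
+//        i = (i + 1) % total_entries) {
+//     // materialize slot i, link writes to their sync points, and note
+//     // sync-point gen numbers that are referenced but not present
+//   }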
+
+template <typename I>
+void WriteLog<I>::inc_allocated_cached_bytes(
+ std::shared_ptr<pwl::GenericLogEntry> log_entry) {
+ if (log_entry->is_write_entry()) {
+ this->m_bytes_allocated += std::max(log_entry->write_bytes(), MIN_WRITE_ALLOC_SIZE);
+ this->m_bytes_cached += log_entry->write_bytes();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::write_data_to_buffer(
+ std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+ WriteLogCacheEntry *pmem_entry) {
+ ws_entry->cache_buffer = D_RW(pmem_entry->write_data);
+}
+
+/**
+ * Retire up to frees_per_tx of the oldest log entries that are
+ * eligible to be retired. Returns true if anything was retired.
+ */
+template <typename I>
+bool WriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
+ CephContext *cct = m_image_ctx.cct;
+ GenericLogEntriesVector retiring_entries;
+ uint32_t initial_first_valid_entry;
+ uint32_t first_valid_entry;
+
+ std::lock_guard retire_locker(this->m_log_retire_lock);
+ ldout(cct, 20) << "Look for entries to retire" << dendl;
+ {
+ /* Entry readers can't be added while we hold m_entry_reader_lock */
+ RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock);
+ std::lock_guard locker(m_lock);
+ initial_first_valid_entry = this->m_first_valid_entry;
+ first_valid_entry = this->m_first_valid_entry;
+ auto entry = m_log_entries.front();
+ while (!m_log_entries.empty() &&
+ retiring_entries.size() < frees_per_tx &&
+ this->can_retire_entry(entry)) {
+ if (entry->log_entry_index != first_valid_entry) {
+ lderr(cct) << "Retiring entry index (" << entry->log_entry_index
+ << ") and first valid log entry index (" << first_valid_entry
+ << ") must be ==." << dendl;
+ }
+ ceph_assert(entry->log_entry_index == first_valid_entry);
+ first_valid_entry = (first_valid_entry + 1) % this->m_total_log_entries;
+ m_log_entries.pop_front();
+ retiring_entries.push_back(entry);
+ /* Remove entry from map so there will be no more readers */
+ if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) {
+ auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(entry);
+ if (gen_write_entry) {
+ this->m_blocks_to_log_entries.remove_log_entry(gen_write_entry);
+ }
+ }
+ entry = m_log_entries.front();
+ }
+ }
+
+ if (retiring_entries.size()) {
+ ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+ utime_t tx_start;
+ utime_t tx_end;
+ /* Advance first valid entry and release buffers */
+ {
+ uint64_t flushed_sync_gen;
+ std::lock_guard append_locker(this->m_log_append_lock);
+ {
+ std::lock_guard locker(m_lock);
+ flushed_sync_gen = this->m_flushed_sync_gen;
+ }
+
+ tx_start = ceph_clock_now();
+ TX_BEGIN(m_log_pool) {
+ if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
+ ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from "
+ << D_RO(pool_root)->flushed_sync_gen << " to "
+ << flushed_sync_gen << dendl;
+ D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
+ }
+ D_RW(pool_root)->first_valid_entry = first_valid_entry;
+ for (auto &entry: retiring_entries) {
+ if (entry->write_bytes()) {
+ ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo
+ << "." << entry->ram_entry.write_data.oid.off << dendl;
+ TX_FREE(entry->ram_entry.write_data);
+ } else {
+ ldout(cct, 20) << "Retiring non-write: " << *entry << dendl;
+ }
+ }
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(cct) << "failed to commit free of" << retiring_entries.size()
+ << " log entries (" << this->m_log_pool_name << ")" << dendl;
+ ceph_assert(false);
+ } TX_FINALLY {
+ } TX_END;
+ tx_end = ceph_clock_now();
+ }
+ m_perfcounter->tinc(l_librbd_pwl_retire_tx_t, tx_end - tx_start);
+ m_perfcounter->hinc(l_librbd_pwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(),
+ retiring_entries.size());
+
+ bool need_update_state = false;
+ /* Update runtime copy of first_valid, and free entries counts */
+ {
+ std::lock_guard locker(m_lock);
+
+ ceph_assert(this->m_first_valid_entry == initial_first_valid_entry);
+ this->m_first_valid_entry = first_valid_entry;
+ this->m_free_log_entries += retiring_entries.size();
+ if (!m_cache_state->empty && m_log_entries.empty()) {
+ m_cache_state->empty = true;
+ this->update_image_cache_state();
+ need_update_state = true;
+ }
+ for (auto &entry: retiring_entries) {
+ if (entry->write_bytes()) {
+ ceph_assert(this->m_bytes_cached >= entry->write_bytes());
+ this->m_bytes_cached -= entry->write_bytes();
+ uint64_t entry_allocation_size = entry->write_bytes();
+ if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) {
+ entry_allocation_size = MIN_WRITE_ALLOC_SIZE;
+ }
+ ceph_assert(this->m_bytes_allocated >= entry_allocation_size);
+ this->m_bytes_allocated -= entry_allocation_size;
+ }
+ }
+ this->m_alloc_failed_since_retire = false;
+ this->wake_up();
+ }
+ if (need_update_state) {
+ std::unique_lock locker(m_lock);
+ this->write_image_cache_state(locker);
+ }
+ } else {
+ ldout(cct, 20) << "Nothing to retire" << dendl;
+ return false;
+ }
+ return true;
+}
+
+template <typename I>
+void WriteLog<I>::construct_flush_entries(pwl::GenericLogEntries entries_to_flush,
+ DeferredContexts &post_unlock,
+ bool has_write_entry) {
+ bool invalidating = this->m_invalidating; // snapshot so we behave consistently
+
+ for (auto &log_entry : entries_to_flush) {
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, log_entry, invalidating]
+ (GuardedRequestFunctionContext &guard_ctx) {
+ log_entry->m_cell = guard_ctx.cell;
+ Context *ctx = this->construct_flush_entry(log_entry, invalidating);
+
+ if (!invalidating) {
+ ctx = new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ m_image_ctx.op_work_queue->queue(new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+ << " " << *log_entry << dendl;
+ log_entry->writeback(this->m_image_writeback, ctx);
+ }), 0);
+ });
+ }
+
+ ctx->complete(0);
+ });
+ this->detain_flush_guard_request(log_entry, guarded_ctx);
+ }
+}
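+
+// Illustrative sketch of the Context chaining used above (hypothetical
+// stages): LambdaContext adapts a callable taking the completion code into
+// a Context, so stages can be nested and the outermost complete() drives
+// the chain:
+//
+//   Context *writeback_stage = new LambdaContext([](int r) { /* flush */ });
+//   Context *queue_stage = new LambdaContext([writeback_stage](int r) {
+//     writeback_stage->complete(r);   // would normally go via a work queue
+//   });
+//   queue_stage->complete(0);
+//
+// The extra bounce through m_image_ctx.op_work_queue above keeps the
+// writeback off the thread that currently holds the flush guard.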
+
+const unsigned long int ops_flushed_together = 4;
+/*
+ * Performs the pmem buffer flush on all scheduled ops, then schedules
+ * the log event append operation for all of them.
+ */
+template <typename I>
+void WriteLog<I>::flush_then_append_scheduled_ops(void)
+{
+ GenericLogOperations ops;
+ bool ops_remain = false;
+ ldout(m_image_ctx.cct, 20) << dendl;
+ do {
+ {
+ ops.clear();
+ std::lock_guard locker(m_lock);
+ if (m_ops_to_flush.size()) {
+ auto last_in_batch = m_ops_to_flush.begin();
+ unsigned int ops_to_flush = m_ops_to_flush.size();
+ if (ops_to_flush > ops_flushed_together) {
+ ops_to_flush = ops_flushed_together;
+ }
+ ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl;
+ std::advance(last_in_batch, ops_to_flush);
+ ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch);
+ ops_remain = !m_ops_to_flush.empty();
+ ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", "
+ << m_ops_to_flush.size() << " remain" << dendl;
+ } else {
+ ops_remain = false;
+ }
+ }
+ if (ops_remain) {
+ enlist_op_flusher();
+ }
+
+ /* Ops subsequently scheduled for flush may finish before these,
+ * which is fine. We're unconcerned with completion order until we
+ * get to the log message append step. */
+ if (ops.size()) {
+ flush_pmem_buffer(ops);
+ schedule_append_ops(ops, nullptr);
+ }
+ } while (ops_remain);
+ append_scheduled_ops();
+}
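+
+// Batching idiom used above (hypothetical lists): std::list::splice() moves
+// nodes between lists in O(1) without copying the operations, so a batch
+// can be carved off under m_lock and processed after it is released:
+//
+//   std::list<int> pending{1, 2, 3, 4, 5}, batch;
+//   auto last = pending.begin();
+//   std::advance(last, 4);                         // at most 4 per batch
+//   batch.splice(batch.end(), pending, pending.begin(), last);
+//   // batch == {1,2,3,4}, pending == {5}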
+
+/*
+ * Performs the log event append operation for all of the scheduled
+ * events.
+ */
+template <typename I>
+void WriteLog<I>::append_scheduled_ops(void) {
+ GenericLogOperations ops;
+ int append_result = 0;
+ bool ops_remain = false;
+ bool appending = false; /* true if we set m_appending */
+ ldout(m_image_ctx.cct, 20) << dendl;
+ do {
+ ops.clear();
+ this->append_scheduled(ops, ops_remain, appending, true);
+
+ if (ops.size()) {
+ std::lock_guard locker(this->m_log_append_lock);
+ alloc_op_log_entries(ops);
+ append_result = append_op_log_entries(ops);
+ }
+
+ int num_ops = ops.size();
+ if (num_ops) {
+ /* New entries may be flushable. Completion will wake up flusher. */
+ this->complete_op_log_entries(std::move(ops), append_result);
+ }
+ } while (ops_remain);
+}
+
+template <typename I>
+void WriteLog<I>::enlist_op_flusher()
+{
+ this->m_async_flush_ops++;
+ this->m_async_op_tracker.start_op();
+ Context *flush_ctx = new LambdaContext([this](int r) {
+ flush_then_append_scheduled_ops();
+ this->m_async_flush_ops--;
+ this->m_async_op_tracker.finish_op();
+ });
+ this->m_work_queue.queue(flush_ctx);
+}
+
+template <typename I>
+void WriteLog<I>::setup_schedule_append(
+ pwl::GenericLogOperationsVector &ops, bool do_early_flush,
+ C_BlockIORequestT *req) {
+ if (do_early_flush) {
+ /* This caller is waiting for persist, so we'll use their thread to
+ * expedite it */
+ flush_pmem_buffer(ops);
+ this->schedule_append(ops);
+ } else {
+ /* This is probably not still the caller's thread, so do the payload
+ * flushing/replicating later. */
+ schedule_flush_and_append(ops);
+ }
+}
+
+/*
+ * Takes custody of ops. They'll all get their log entries appended,
+ * and have their on_write_persist contexts completed once they and
+ * all prior log entries are persisted everywhere.
+ */
+template <typename I>
+void WriteLog<I>::schedule_append_ops(GenericLogOperations &ops, C_BlockIORequestT *req)
+{
+ bool need_finisher;
+ GenericLogOperationsVector appending;
+
+ std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
+ {
+ std::lock_guard locker(m_lock);
+
+ need_finisher = this->m_ops_to_append.empty() && !this->m_appending;
+ this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops);
+ }
+
+ if (need_finisher) {
+ //enlist op appender
+ this->m_async_append_ops++;
+ this->m_async_op_tracker.start_op();
+ Context *append_ctx = new LambdaContext([this](int r) {
+ append_scheduled_ops();
+ this->m_async_append_ops--;
+ this->m_async_op_tracker.finish_op();
+ });
+ this->m_work_queue.queue(append_ctx);
+ }
+
+ for (auto &op : appending) {
+ op->appending();
+ }
+}
+
+/*
+ * Takes custody of ops. They'll all get their pmem blocks flushed,
+ * then get their log entries appended.
+ */
+template <typename I>
+void WriteLog<I>::schedule_flush_and_append(GenericLogOperationsVector &ops)
+{
+ GenericLogOperations to_flush(ops.begin(), ops.end());
+ bool need_finisher;
+ ldout(m_image_ctx.cct, 20) << dendl;
+ {
+ std::lock_guard locker(m_lock);
+
+ need_finisher = m_ops_to_flush.empty();
+ m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush);
+ }
+
+ if (need_finisher) {
+ enlist_op_flusher();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::process_work() {
+ CephContext *cct = m_image_ctx.cct;
+ int max_iterations = 4;
+ bool wake_up_requested = false;
+ uint64_t aggressive_high_water_bytes = this->m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER;
+ uint64_t high_water_bytes = this->m_bytes_allocated_cap * RETIRE_HIGH_WATER;
+ uint64_t low_water_bytes = this->m_bytes_allocated_cap * RETIRE_LOW_WATER;
+ uint64_t aggressive_high_water_entries = this->m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER;
+ uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER;
+ uint64_t low_water_entries = this->m_total_log_entries * RETIRE_LOW_WATER;
+
+ ldout(cct, 20) << dendl;
+
+ do {
+ {
+ std::lock_guard locker(m_lock);
+ this->m_wake_up_requested = false;
+ }
+ if (this->m_alloc_failed_since_retire || this->m_invalidating ||
+ this->m_bytes_allocated > high_water_bytes ||
+ (m_log_entries.size() > high_water_entries)) {
+ int retired = 0;
+ utime_t started = ceph_clock_now();
+ ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire
+ << ", allocated > high_water="
+ << (this->m_bytes_allocated > high_water_bytes)
+ << ", allocated_entries > high_water="
+ << (m_log_entries.size() > high_water_entries)
+ << dendl;
+ while (this->m_alloc_failed_since_retire || this->m_invalidating ||
+ (this->m_bytes_allocated > high_water_bytes) ||
+ (m_log_entries.size() > high_water_entries) ||
+ (((this->m_bytes_allocated > low_water_bytes) ||
+ (m_log_entries.size() > low_water_entries)) &&
+ (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) {
+ if (!retire_entries((this->m_shutting_down || this->m_invalidating ||
+ (this->m_bytes_allocated > aggressive_high_water_bytes) ||
+ (m_log_entries.size() > aggressive_high_water_entries) ||
+ this->m_alloc_failed_since_retire)
+ ? MAX_ALLOC_PER_TRANSACTION
+ : MAX_FREE_PER_TRANSACTION)) {
+ break;
+ }
+ retired++;
+ this->dispatch_deferred_writes();
+ this->process_writeback_dirty_entries();
+ }
+ ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl;
+ }
+ this->dispatch_deferred_writes();
+ this->process_writeback_dirty_entries();
+
+ {
+ std::lock_guard locker(m_lock);
+ wake_up_requested = this->m_wake_up_requested;
+ }
+ } while (wake_up_requested && --max_iterations > 0);
+
+ {
+ std::lock_guard locker(m_lock);
+ this->m_wake_up_scheduled = false;
+ /* Reschedule if it's still requested */
+ if (this->m_wake_up_requested) {
+ this->wake_up();
+ }
+ }
+}
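+
+// Worked example of the watermarks above, using hypothetical ratios (the
+// actual RETIRE_* constants are defined elsewhere in pwl): with
+// m_bytes_allocated_cap = 1 GiB, RETIRE_HIGH_WATER = 0.5 and
+// RETIRE_LOW_WATER = 0.4, retirement starts once allocations exceed
+// 512 MiB and continues, within RETIRE_BATCH_TIME_LIMIT_MS, until they
+// drop below ~410 MiB; past the aggressive mark each batch retires
+// MAX_ALLOC_PER_TRANSACTION entries instead of MAX_FREE_PER_TRANSACTION.
+// The entry-count watermarks are applied in parallel the same way.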
+
+/*
+ * Flush the pmem regions for the data blocks of a set of operations
+ *
+ * V is expected to be GenericLogOperations<I> or GenericLogOperationsVector<I>
+ */
+template <typename I>
+template <typename V>
+void WriteLog<I>::flush_pmem_buffer(V& ops)
+{
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ if (operation->reserved_allocated()) {
+ operation->buf_persist_start_time = now;
+ } else {
+ ldout(m_image_ctx.cct, 20) << "skipping non-write op: "
+ << *operation << dendl;
+ }
+ }
+
+ for (auto &operation : ops) {
+    if (operation->is_writing_op()) {
+ auto log_entry = static_pointer_cast<WriteLogEntry>(operation->get_log_entry());
+ pmemobj_flush(m_log_pool, log_entry->cache_buffer, log_entry->write_bytes());
+ }
+ }
+
+ /* Drain once for all */
+ pmemobj_drain(m_log_pool);
+
+ now = ceph_clock_now();
+ for (auto &operation : ops) {
+ if (operation->reserved_allocated()) {
+ operation->buf_persist_comp_time = now;
+ } else {
+ ldout(m_image_ctx.cct, 20) << "skipping non-write op: "
+ << *operation << dendl;
+ }
+ }
+}
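+
+// The flush/drain split above follows the libpmemobj persistence model:
+// pmemobj_flush() queues cache-line writeback for a range without waiting,
+// and one pmemobj_drain() afterwards acts as a barrier for all of them.
+// Minimal sketch, assuming an open pool `pop` and two persistent buffers:
+//
+//   pmemobj_flush(pop, buf_a, len_a);   // start writeback, don't block
+//   pmemobj_flush(pop, buf_b, len_b);
+//   pmemobj_drain(pop);                 // single wait for both ranges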
+
+/**
+ * Update/persist the last flushed sync point in the log
+ */
+template <typename I>
+void WriteLog<I>::persist_last_flushed_sync_gen()
+{
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ uint64_t flushed_sync_gen;
+
+ std::lock_guard append_locker(this->m_log_append_lock);
+ {
+ std::lock_guard locker(m_lock);
+ flushed_sync_gen = this->m_flushed_sync_gen;
+ }
+
+ if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
+ ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from "
+ << D_RO(pool_root)->flushed_sync_gen << " to "
+ << flushed_sync_gen << dendl;
+ TX_BEGIN(m_log_pool) {
+ D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl;
+ ceph_assert(false);
+ } TX_FINALLY {
+ } TX_END;
+ }
+}
+
+template <typename I>
+void WriteLog<I>::reserve_cache(C_BlockIORequestT *req,
+ bool &alloc_succeeds, bool &no_space) {
+ std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
+ for (auto &buffer : buffers) {
+ utime_t before_reserve = ceph_clock_now();
+ buffer.buffer_oid = pmemobj_reserve(m_log_pool,
+ &buffer.buffer_alloc_action,
+ buffer.allocation_size,
+ 0 /* Object type */);
+ buffer.allocation_lat = ceph_clock_now() - before_reserve;
+ if (TOID_IS_NULL(buffer.buffer_oid)) {
+ if (!req->has_io_waited_for_buffers()) {
+ req->set_io_waited_for_buffers(true);
+ }
+ ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: "
+ << pmemobj_errormsg() << ". "
+ << *req << dendl;
+ alloc_succeeds = false;
+ no_space = true; /* Entries need to be retired */
+
+ if (this->m_free_log_entries == this->m_total_log_entries - 1) {
+        /* The log is empty yet the allocation still failed, so the pool
+         * must be fragmented. Defragment it. */
+ pmemobj_defrag(m_log_pool, NULL, 0, NULL);
+ }
+ break;
+ } else {
+ buffer.allocated = true;
+ }
+ ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo
+ << "." << buffer.buffer_oid.oid.off
+ << ", size=" << buffer.allocation_size << dendl;
+ }
+}
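+
+// Lifecycle of a reservation made above (sketch, assuming an open pool
+// `pop`): pmemobj_reserve() hands back space plus an action record, and
+// the allocation becomes permanent only when that action is published
+// inside a transaction; otherwise it must be cancelled:
+//
+//   struct pobj_action act;
+//   PMEMoid oid = pmemobj_reserve(pop, &act, 4096, 0);
+//   if (OID_IS_NULL(oid)) { /* out of space: retire entries, retry */ }
+//   memcpy(pmemobj_direct(oid), src, 4096);         // fill the buffer
+//   pmemobj_flush(pop, pmemobj_direct(oid), 4096);  // persist the data
+//   TX_BEGIN(pop) {
+//     pmemobj_tx_publish(&act, 1);                  // commit the reservation
+//   } TX_END
+//   // on failure instead: pmemobj_cancel(pop, &act, 1);
+//
+// In this file the publish happens in append_op_log_entries() and the
+// cancel in alloc_resources() when allocation fails part-way.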
+
+template<typename I>
+void WriteLog<I>::copy_bl_to_buffer(
+ WriteRequestResources *resources, std::unique_ptr<WriteLogOperationSet> &op_set) {
+ auto allocation = resources->buffers.begin();
+ for (auto &operation : op_set->operations) {
+ operation->copy_bl_to_cache_buffer(allocation);
+ allocation++;
+ }
+}
+
+template <typename I>
+bool WriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
+ bool alloc_succeeds = true;
+ uint64_t bytes_allocated = 0;
+ uint64_t bytes_cached = 0;
+ uint64_t bytes_dirtied = 0;
+ uint64_t num_lanes = 0;
+ uint64_t num_unpublished_reserves = 0;
+ uint64_t num_log_entries = 0;
+
+ ldout(m_image_ctx.cct, 20) << dendl;
+  // Set up the buffers and count the resources this request will need
+ req->setup_buffer_resources(&bytes_cached, &bytes_dirtied, &bytes_allocated,
+ &num_lanes, &num_log_entries, &num_unpublished_reserves);
+
+ alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied,
+ bytes_allocated, num_lanes, num_log_entries,
+ num_unpublished_reserves);
+
+ std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
+ if (!alloc_succeeds) {
+ /* On alloc failure, free any buffers we did allocate */
+ for (auto &buffer : buffers) {
+ if (buffer.allocated) {
+ pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1);
+ }
+ }
+ }
+
+ req->set_allocated(alloc_succeeds);
+ return alloc_succeeds;
+}
+
+template <typename I>
+void WriteLog<I>::complete_user_request(Context *&user_req, int r) {
+ user_req->complete(r);
+  // Null out user_req since completing a Context deletes it
+ user_req = nullptr;
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::rwl::WriteLog<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/rwl/WriteLog.h b/src/librbd/cache/pwl/rwl/WriteLog.h
new file mode 100644
index 000000000..5083a2568
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/WriteLog.h
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+
+#include <functional>
+#include <libpmemobj.h>
+#include <list>
+#include "common/Timer.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/Utils.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/rwl/Builder.h"
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename ImageCtxT>
+class WriteLog : public AbstractWriteLog<ImageCtxT> {
+public:
+ WriteLog(
+ ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+ ImageWritebackInterface& image_writeback,
+ plugin::Api<ImageCtxT>& plugin_api);
+ ~WriteLog();
+ WriteLog(const WriteLog&) = delete;
+ WriteLog &operator=(const WriteLog&) = delete;
+
+ typedef io::Extent Extent;
+ using This = AbstractWriteLog<ImageCtxT>;
+ using C_WriteRequestT = pwl::C_WriteRequest<This>;
+ using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+
+ void copy_bl_to_buffer(
+ WriteRequestResources *resources, std::unique_ptr<WriteLogOperationSet> &op_set) override;
+ void complete_user_request(Context *&user_req, int r) override;
+private:
+ using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+ using C_FlushRequestT = pwl::C_FlushRequest<This>;
+ using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
+
+ PMEMobjpool *m_log_pool = nullptr;
+ Builder<This> *m_builderobj;
+ const char* m_pwl_pool_layout_name;
+ const uint64_t MAX_EXTENT_SIZE = 1048576;
+
+ Builder<This>* create_builder();
+ void remove_pool_file();
+ void load_existing_entries(pwl::DeferredContexts &later);
+ void alloc_op_log_entries(pwl::GenericLogOperations &ops);
+ int append_op_log_entries(pwl::GenericLogOperations &ops);
+ void flush_then_append_scheduled_ops(void);
+ void enlist_op_flusher();
+ void flush_op_log_entries(pwl::GenericLogOperationsVector &ops);
+ template <typename V>
+ void flush_pmem_buffer(V& ops);
+ void inc_allocated_cached_bytes(
+ std::shared_ptr<pwl::GenericLogEntry> log_entry) override;
+protected:
+ using AbstractWriteLog<ImageCtxT>::m_lock;
+ using AbstractWriteLog<ImageCtxT>::m_log_entries;
+ using AbstractWriteLog<ImageCtxT>::m_image_ctx;
+ using AbstractWriteLog<ImageCtxT>::m_perfcounter;
+ using AbstractWriteLog<ImageCtxT>::m_ops_to_flush;
+ using AbstractWriteLog<ImageCtxT>::m_cache_state;
+ using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
+ using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
+
+ void process_work() override;
+ void schedule_append_ops(pwl::GenericLogOperations &ops, C_BlockIORequestT *req) override;
+ void append_scheduled_ops(void) override;
+ void reserve_cache(C_BlockIORequestT *req,
+ bool &alloc_succeeds, bool &no_space) override;
+ void collect_read_extents(
+ uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+ Extent hit_extent, pwl::C_ReadRequest *read_ctx) override;
+ void complete_read(
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read, Context *ctx) override;
+ bool retire_entries(const unsigned long int frees_per_tx) override;
+ void persist_last_flushed_sync_gen() override;
+ bool alloc_resources(C_BlockIORequestT *req) override;
+ void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) override;
+ void setup_schedule_append(
+ pwl::GenericLogOperationsVector &ops, bool do_early_flush,
+ C_BlockIORequestT *req) override;
+ void construct_flush_entries(pwl::GenericLogEntries entries_to_flush,
+ DeferredContexts &post_unlock,
+ bool has_write_entry) override;
+ bool initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override;
+ void write_data_to_buffer(
+ std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+ pwl::WriteLogCacheEntry *pmem_entry) override;
+ uint64_t get_max_extent() override {
+ return MAX_EXTENT_SIZE;
+ }
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::rwl::WriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
diff --git a/src/librbd/cache/pwl/ssd/Builder.h b/src/librbd/cache/pwl/ssd/Builder.h
new file mode 100644
index 000000000..07b3fb869
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Builder.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
+
+#include <iostream>
+#include "LogEntry.h"
+#include "ReadRequest.h"
+#include "Request.h"
+#include "LogOperation.h"
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/Builder.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+template <typename T>
+class Builder : public pwl::Builder<T> {
+public:
+ std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes) override {
+ return std::make_shared<WriteLogEntry>(image_offset_bytes, write_bytes);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes) override {
+ return std::make_shared<WriteLogEntry>(
+ sync_point_entry, image_offset_bytes, write_bytes);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) override {
+ return std::make_shared<WriteSameLogEntry>(
+ image_offset_bytes, write_bytes, data_length);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) override {
+ return std::make_shared<WriteSameLogEntry>(
+ sync_point_entry, image_offset_bytes, write_bytes, data_length);
+ }
+ pwl::C_WriteRequest<T> *create_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req);
+ }
+ pwl::C_WriteSameRequest<T> *create_writesame_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_WriteSameRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req);
+ }
+ pwl::C_WriteRequest<T> *create_comp_and_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_CompAndWriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req);
+ }
+ std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> write_log_entry) {
+ return std::make_shared<WriteLogOperation>(
+ set, image_offset_bytes, write_bytes, cct, write_log_entry);
+ }
+ std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry) {
+ return std::make_shared<WriteLogOperation>(
+ set, image_offset_bytes, write_bytes, data_len, cct,
+ writesame_log_entry);
+ }
+ std::shared_ptr<pwl::DiscardLogOperation> create_discard_log_operation(
+ std::shared_ptr<SyncPoint> sync_point, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t discard_granularity_bytes,
+ utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct) {
+ return std::make_shared<DiscardLogOperation>(
+ sync_point, image_offset_bytes, write_bytes, discard_granularity_bytes,
+ dispatch_time, perfcounter, cct);
+ }
+ C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived,
+ PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) {
+ return new C_ReadRequest(cct, arrived, perfcounter, bl, on_finish);
+ }
+};
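+
+// This builder exists so the generic AbstractWriteLog can construct
+// backend-specific entries, operations and requests without knowing it is
+// running on the SSD backend. Usage sketch (hypothetical call site, with T
+// the concrete write log type):
+//
+//   pwl::Builder<T> *builder = new ssd::Builder<T>();
+//   auto entry = builder->create_write_log_entry(0 /* off */, 4096 /* len */);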
+
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
diff --git a/src/librbd/cache/pwl/ssd/LogEntry.cc b/src/librbd/cache/pwl/ssd/LogEntry.cc
new file mode 100644
index 000000000..0e6edd87b
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/LogEntry.cc
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/ssd/LogEntry.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::WriteLogEntry: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+void WriteLogEntry::init_cache_bl(
+ bufferlist &src_bl, uint64_t off, uint64_t len) {
+ cache_bl.clear();
+ cache_bl.substr_of(src_bl, off, len);
+}
+
+buffer::list& WriteLogEntry::get_cache_bl() {
+ return cache_bl;
+}
+
+void WriteLogEntry::copy_cache_bl(bufferlist *out) {
+ std::lock_guard locker(m_entry_bl_lock);
+ *out = cache_bl;
+}
+
+void WriteLogEntry::remove_cache_bl() {
+ std::lock_guard locker(m_entry_bl_lock);
+ cache_bl.clear();
+}
+
+unsigned int WriteLogEntry::get_aligned_data_size() const {
+ if (cache_bl.length()) {
+ return round_up_to(cache_bl.length(), MIN_WRITE_ALLOC_SSD_SIZE);
+ }
+ return round_up_to(write_bytes(), MIN_WRITE_ALLOC_SSD_SIZE);
+}
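+
+// Example of the alignment rule above (hypothetical unit size): with
+// MIN_WRITE_ALLOC_SSD_SIZE = 4096, a 10000-byte write occupies
+// round_up_to(10000, 4096) = 12288 bytes on the SSD (three allocation
+// units), while a 4096-byte write occupies exactly one.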
+
+void WriteLogEntry::writeback_bl(
+ librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist&& bl) {
+ image_writeback.aio_write({{ram_entry.image_offset_bytes,
+ ram_entry.write_bytes}},
+ std::move(bl), 0, ctx);
+}
+
+void WriteSameLogEntry::writeback_bl(
+ librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist &&bl) {
+ image_writeback.aio_writesame(ram_entry.image_offset_bytes,
+ ram_entry.write_bytes,
+ std::move(bl), 0, ctx);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/ssd/LogEntry.h b/src/librbd/cache/pwl/ssd/LogEntry.h
new file mode 100644
index 000000000..8e26f661f
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/LogEntry.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
+
+#include "librbd/cache/pwl/LogEntry.h"
+
+namespace librbd {
+namespace cache {
+class ImageWritebackInterface;
+namespace pwl {
+namespace ssd {
+
+class WriteLogEntry : public pwl::WriteLogEntry {
+public:
+ WriteLogEntry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes)
+ : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {}
+ WriteLogEntry(
+ uint64_t image_offset_bytes, uint64_t write_bytes)
+ : pwl::WriteLogEntry(image_offset_bytes, write_bytes) {}
+ WriteLogEntry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes,
+ write_bytes, data_length) {}
+ WriteLogEntry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : pwl::WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteLogEntry() {}
+ WriteLogEntry(const WriteLogEntry&) = delete;
+ WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+ void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist &&bl) override;
+ void init_cache_bl(bufferlist &src_bl, uint64_t off, uint64_t len) override;
+ buffer::list &get_cache_bl() override;
+ void copy_cache_bl(bufferlist *out) override;
+ void remove_cache_bl() override;
+ unsigned int get_aligned_data_size() const override;
+  void inc_bl_refs() { bl_refs++; }
+  void dec_bl_refs() { bl_refs--; }
+ unsigned int reader_count() const override {
+ return bl_refs;
+ }
+};
+
+class WriteSameLogEntry : public WriteLogEntry {
+public:
+ WriteSameLogEntry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(sync_point_entry, image_offset_bytes,
+ write_bytes, data_length) {}
+ WriteSameLogEntry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteSameLogEntry() {}
+ WriteSameLogEntry(const WriteSameLogEntry&) = delete;
+ WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
+ void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist &&bl) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
diff --git a/src/librbd/cache/pwl/ssd/LogOperation.cc b/src/librbd/cache/pwl/ssd/LogOperation.cc
new file mode 100644
index 000000000..c8080e37d
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/LogOperation.cc
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LogOperation.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::LogOperation: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+void DiscardLogOperation::init_op(
+ uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num, Context *write_persist,
+ Context *write_append) {
+ log_entry->init(current_sync_gen, persist_on_flush, last_op_sequence_num);
+ if (persist_on_flush) {
+ this->on_write_append = new LambdaContext(
+ [write_persist, write_append] (int r) {
+ write_append->complete(r);
+ write_persist->complete(r);
+ });
+ } else {
+ this->on_write_append = write_append;
+ this->on_write_persist = write_persist;
+ }
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/ssd/LogOperation.h b/src/librbd/cache/pwl/ssd/LogOperation.h
new file mode 100644
index 000000000..dbc89aa73
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/LogOperation.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_LOG_OPERATION_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_LOG_OPERATION_H
+
+#include "librbd/cache/pwl/LogOperation.h"
+
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+class DiscardLogOperation : public pwl::DiscardLogOperation {
+public:
+ DiscardLogOperation(
+ std::shared_ptr<SyncPoint> sync_point, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t discard_granularity_bytes,
+ utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct)
+ : pwl::DiscardLogOperation(sync_point, image_offset_bytes, write_bytes,
+ discard_granularity_bytes, dispatch_time,
+ perfcounter, cct) {}
+ void init_op(
+ uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num, Context *write_persist,
+ Context *write_append) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_LOG_OPERATION_H
diff --git a/src/librbd/cache/pwl/ssd/ReadRequest.cc b/src/librbd/cache/pwl/ssd/ReadRequest.cc
new file mode 100644
index 000000000..1a80a8d8c
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/ReadRequest.cc
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ReadRequest.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::ReadRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+void C_ReadRequest::finish(int r) {
+ ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl;
+ int hits = 0;
+ int misses = 0;
+ int hit_bytes = 0;
+ int miss_bytes = 0;
+ if (r >= 0) {
+ /*
+ * At this point the miss read has completed. We'll iterate through
+     * read_extents and produce *m_out_bl by assembling pieces of miss_bl
+ * and the individual hit extent bufs in the read extents that represent
+ * hits.
+ */
+ uint64_t miss_bl_offset = 0;
+ for (auto extent : read_extents) {
+ if (extent->m_bl.length()) {
+ /* This was a hit */
+ bufferlist data_bl;
+ if (extent->writesame) {
+ int data_len = extent->m_bl.length();
+ int read_buffer_offset = extent->truncate_offset;
+ if (extent->need_to_truncate && extent->truncate_offset >= data_len) {
+ read_buffer_offset = (extent->truncate_offset) % data_len;
+ }
+ // build data and truncate
+ bufferlist temp_bl;
+          /* signed: the final subtraction may overshoot below zero when
+           * the hit length is not a multiple of the pattern size */
+          int64_t total_left_bytes = read_buffer_offset + extent->second;
+ while (total_left_bytes > 0) {
+ temp_bl.append(extent->m_bl);
+ total_left_bytes = total_left_bytes - data_len;
+ }
+ data_bl.substr_of(temp_bl, read_buffer_offset, extent->second);
+ m_out_bl->claim_append(data_bl);
+ } else if (extent->need_to_truncate) {
+          ceph_assert(extent->m_bl.length() >= extent->truncate_offset + extent->second);
+ data_bl.substr_of(extent->m_bl, extent->truncate_offset, extent->second);
+ m_out_bl->claim_append(data_bl);
+ } else {
+          ceph_assert(extent->second == extent->m_bl.length());
+ m_out_bl->claim_append(extent->m_bl);
+ }
+ ++hits;
+ hit_bytes += extent->second;
+ } else {
+ /* This was a miss. */
+ ++misses;
+ miss_bytes += extent->second;
+ bufferlist miss_extent_bl;
+ miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent->second);
+ /* Add this read miss bufferlist to the output bufferlist */
+ m_out_bl->claim_append(miss_extent_bl);
+ /* Consume these bytes in the read miss bufferlist */
+ miss_bl_offset += extent->second;
+ }
+ }
+ }
+ ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl;
+ utime_t now = ceph_clock_now();
+ ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes);
+ m_on_finish->complete(r);
+ m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes);
+ m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes);
+ m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time);
+ if (!misses) {
+ m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1);
+ m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time);
+ } else {
+ if (hits) {
+ m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1);
+ }
+ }
+}
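+
+// Worked example of the writesame assembly above (hypothetical numbers):
+// with a 512-byte pattern (data_len), read_buffer_offset = 200 and a hit
+// length extent->second = 1300, the loop appends the pattern three times
+// (1536 >= 200 + 1300) and substr_of() then carves bytes [200, 1500) out
+// of temp_bl to produce the 1300-byte result.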
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/ssd/ReadRequest.h b/src/librbd/cache/pwl/ssd/ReadRequest.h
new file mode 100644
index 000000000..345c4aa65
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/ReadRequest.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
+
+#include "librbd/cache/pwl/ReadRequest.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+typedef std::vector<pwl::ImageExtentBuf> ImageExtentBufs;
+
+class C_ReadRequest : public pwl::C_ReadRequest {
+protected:
+ using pwl::C_ReadRequest::m_cct;
+ using pwl::C_ReadRequest::m_on_finish;
+ using pwl::C_ReadRequest::m_out_bl;
+ using pwl::C_ReadRequest::m_arrived_time;
+ using pwl::C_ReadRequest::m_perfcounter;
+public:
+ C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish)
+ : pwl::C_ReadRequest(cct, arrived, perfcounter, out_bl, on_finish) {}
+ void finish(int r) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
diff --git a/src/librbd/cache/pwl/ssd/Request.cc b/src/librbd/cache/pwl/ssd/Request.cc
new file mode 100644
index 000000000..e92e547c8
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Request.cc
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::Request: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+template <typename T>
+void C_WriteRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+
+ *bytes_cached = 0;
+ *bytes_allocated = 0;
+ *number_log_entries = this->image_extents.size();
+
+ for (auto &extent : this->image_extents) {
+ *bytes_cached += extent.second;
+ *bytes_allocated += round_up_to(extent.second, MIN_WRITE_ALLOC_SSD_SIZE);
+ }
+ *bytes_dirtied = *bytes_cached;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<T> &req) {
+ os << (C_WriteRequest<T>&)req
+ << "cmp_bl=" << req.cmp_bl << ", "
+ << "read_bl=" << req.read_bl << ", "
+ << "compare_succeeded=" << req.compare_succeeded << ", "
+ << "mismatch_offset=" << req.mismatch_offset;
+ return os;
+}
+
+template <typename T>
+void C_WriteSameRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+ ceph_assert(this->image_extents.size() == 1);
+ *number_log_entries = 1;
+ *bytes_dirtied = this->image_extents[0].second;
+ *bytes_cached = this->bl.length();
+ *bytes_allocated = round_up_to(*bytes_cached, MIN_WRITE_ALLOC_SSD_SIZE);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ssd::C_WriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::ssd::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::ssd::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
diff --git a/src/librbd/cache/pwl/ssd/Request.h b/src/librbd/cache/pwl/ssd/Request.h
new file mode 100644
index 000000000..9bb3e85b9
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Request.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_SSD_REQUEST_H
+#define CEPH_LIBRBD_CACHE_SSD_REQUEST_H
+
+#include "librbd/cache/pwl/Request.h"
+
+namespace librbd {
+class BlockGuardCell;
+
+namespace cache {
+namespace pwl {
+
+template<typename T>
+class AbstractWriteLog;
+
+namespace ssd {
+
+template <typename T>
+class C_WriteRequest : public pwl::C_WriteRequest<T> {
+public:
+ C_WriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ C_WriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req) {}
+protected:
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+};
+
+template <typename T>
+class C_CompAndWriteRequest : public C_WriteRequest<T> {
+public:
+ C_CompAndWriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset,fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ const char *get_name() const override {
+ return "C_CompAndWriteRequest";
+ }
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<U> &req);
+};
+
+template <typename T>
+class C_WriteSameRequest : public pwl::C_WriteSameRequest<T> {
+public:
+ C_WriteSameRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteSameRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_SSD_REQUEST_H
diff --git a/src/librbd/cache/pwl/ssd/Types.h b/src/librbd/cache/pwl/ssd/Types.h
new file mode 100644
index 000000000..3ebad1fd9
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Types.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_SSD_TYPES_H
+#define CEPH_LIBRBD_CACHE_SSD_TYPES_H
+
+#include "acconfig.h"
+
+#include "librbd/io/Types.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+struct SuperBlock {
+ WriteLogPoolRoot root;
+
+ DENC(SuperBlock, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.root, p);
+ DENC_FINISH(p);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_object("super", root);
+ }
+
+ static void generate_test_instances(list<SuperBlock*>& ls) {
+ ls.push_back(new SuperBlock());
+ ls.push_back(new SuperBlock());
+ ls.back()->root.layout_version = 3;
+ ls.back()->root.cur_sync_gen = 1;
+ ls.back()->root.pool_size = 10737418240;
+ ls.back()->root.flushed_sync_gen = 1;
+ ls.back()->root.block_size = 4096;
+ ls.back()->root.num_log_entries = 0;
+ ls.back()->root.first_free_entry = 30601;
+ ls.back()->root.first_valid_entry = 2;
+ }
+};
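+
+// Usage sketch (illustrative; mirrors WriteLog<I>::update_pool_root_sync()
+// in WriteLog.cc): the superblock is denc-encoded into the first device
+// block:
+//
+//   SuperBlock superblock;
+//   superblock.root = pool_root;
+//   bufferlist bl;
+//   encode(superblock, bl); // enabled by WRITE_CLASS_DENC below
+//   bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length());
+//   bdev->write(0, bl, false);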
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+WRITE_CLASS_DENC(librbd::cache::pwl::ssd::SuperBlock)
+
+#endif // CEPH_LIBRBD_CACHE_SSD_TYPES_H
diff --git a/src/librbd/cache/pwl/ssd/WriteLog.cc b/src/librbd/cache/pwl/ssd/WriteLog.cc
new file mode 100644
index 000000000..00626506a
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/WriteLog.cc
@@ -0,0 +1,1158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "WriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::WriteLog: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+using namespace librbd::cache::pwl;
+
+static bool is_valid_pool_root(const WriteLogPoolRoot& root) {
+ return root.pool_size % MIN_WRITE_ALLOC_SSD_SIZE == 0 &&
+ root.first_valid_entry >= DATA_RING_BUFFER_OFFSET &&
+ root.first_valid_entry < root.pool_size &&
+ root.first_valid_entry % MIN_WRITE_ALLOC_SSD_SIZE == 0 &&
+ root.first_free_entry >= DATA_RING_BUFFER_OFFSET &&
+ root.first_free_entry < root.pool_size &&
+ root.first_free_entry % MIN_WRITE_ALLOC_SSD_SIZE == 0;
+}
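+
+// Example (illustrative values; the real constants are in pwl/Types.h):
+// assuming MIN_WRITE_ALLOC_SSD_SIZE == 4096 and DATA_RING_BUFFER_OFFSET ==
+// 8192, a root with pool_size == 1 GiB, first_valid_entry == 8192 and
+// first_free_entry == 12288 is valid; an entry pointer below the ring
+// buffer offset, at or past pool_size, or not 4K-aligned is rejected.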
+
+template <typename I>
+Builder<AbstractWriteLog<I>>* WriteLog<I>::create_builder() {
+ m_builderobj = new Builder<This>();
+ return m_builderobj;
+}
+
+template <typename I>
+WriteLog<I>::WriteLog(
+ I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+ cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<I>& plugin_api)
+ : AbstractWriteLog<I>(image_ctx, cache_state, create_builder(),
+ image_writeback, plugin_api)
+{
+}
+
+template <typename I>
+WriteLog<I>::~WriteLog() {
+ delete m_builderobj;
+}
+
+template <typename I>
+void WriteLog<I>::collect_read_extents(
+ uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read,
+ uint64_t entry_hit_length, Extent hit_extent,
+ pwl::C_ReadRequest *read_ctx) {
+ // Make a bl for this hit extent. This will add references to the
+ // write_entry->cache_bl.
+ ldout(m_image_ctx.cct, 5) << dendl;
+ auto write_entry = static_pointer_cast<WriteLogEntry>(map_entry.log_entry);
+ buffer::list hit_bl;
+ write_entry->copy_cache_bl(&hit_bl);
+ bool writesame = write_entry->is_writesame_entry();
+ auto hit_extent_buf = std::make_shared<ImageExtentBuf>(
+ hit_extent, hit_bl, true, read_buffer_offset, writesame);
+ read_ctx->read_extents.push_back(hit_extent_buf);
+
+ if (!hit_bl.length()) {
+ ldout(m_image_ctx.cct, 5) << "didn't hit RAM" << dendl;
+ auto read_extent = read_ctx->read_extents.back();
+ write_entry->inc_bl_refs();
+ log_entries_to_read.push_back(std::move(write_entry));
+ bls_to_read.push_back(&read_extent->m_bl);
+ }
+}
+
+template <typename I>
+void WriteLog<I>::complete_read(
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read,
+ Context *ctx) {
+ if (!log_entries_to_read.empty()) {
+ aio_read_data_blocks(log_entries_to_read, bls_to_read, ctx);
+ } else {
+ ctx->complete(0);
+ }
+}
+
+template <typename I>
+int WriteLog<I>::create_and_open_bdev() {
+ CephContext *cct = m_image_ctx.cct;
+
+ bdev = BlockDevice::create(cct, this->m_log_pool_name, aio_cache_cb,
+ nullptr, nullptr, nullptr);
+ int r = bdev->open(this->m_log_pool_name);
+ if (r < 0) {
+ lderr(cct) << "failed to open bdev" << dendl;
+ delete bdev;
+ return r;
+ }
+
+ ceph_assert(this->m_log_pool_size % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+ if (bdev->get_size() != this->m_log_pool_size) {
+ lderr(cct) << "size mismatch: bdev size " << bdev->get_size()
+ << " (block size " << bdev->get_block_size()
+ << ") != pool size " << this->m_log_pool_size << dendl;
+ bdev->close();
+ delete bdev;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+template <typename I>
+bool WriteLog<I>::initialize_pool(Context *on_finish,
+ pwl::DeferredContexts &later) {
+ int r;
+ CephContext *cct = m_image_ctx.cct;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
+ int fd = ::open(this->m_log_pool_name.c_str(), O_RDWR|O_CREAT, 0644);
+ bool succeed = true;
+ if (fd >= 0) {
+ if (truncate(this->m_log_pool_name.c_str(),
+ this->m_log_pool_size) != 0) {
+ succeed = false;
+ }
+ ::close(fd);
+ } else {
+ succeed = false;
+ }
+ if (!succeed) {
+ m_cache_state->present = false;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ /* TODO: filter/replace errnos that are meaningless to the caller */
+ on_finish->complete(-errno);
+ return false;
+ }
+
+ r = create_and_open_bdev();
+ if (r < 0) {
+ on_finish->complete(r);
+ return false;
+ }
+ m_cache_state->present = true;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ /* new pool, calculate and store metadata */
+
+ /* Keep ring buffer at least MIN_WRITE_ALLOC_SSD_SIZE bytes free.
+ * In this way, when all ring buffer spaces are allocated,
+ * m_first_free_entry and m_first_valid_entry will not be equal.
+ * Equal only means the cache is empty. */
+ this->m_bytes_allocated_cap = this->m_log_pool_size -
+ DATA_RING_BUFFER_OFFSET - MIN_WRITE_ALLOC_SSD_SIZE;
+ /* Log ring empty */
+ m_first_free_entry = DATA_RING_BUFFER_OFFSET;
+ m_first_valid_entry = DATA_RING_BUFFER_OFFSET;
+
+ auto new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
+ new_root->layout_version = SSD_LAYOUT_VERSION;
+ new_root->pool_size = this->m_log_pool_size;
+ new_root->flushed_sync_gen = this->m_flushed_sync_gen;
+ new_root->block_size = MIN_WRITE_ALLOC_SSD_SIZE;
+ new_root->first_free_entry = m_first_free_entry;
+ new_root->first_valid_entry = m_first_valid_entry;
+ new_root->num_log_entries = 0;
+ pool_root = *new_root;
+
+ r = update_pool_root_sync(new_root);
+ if (r != 0) {
+ lderr(cct) << "failed to initialize pool ("
+ << this->m_log_pool_name << ")" << dendl;
+ bdev->close();
+ delete bdev;
+ on_finish->complete(r);
+ return false;
+ }
+ } else {
+ ceph_assert(m_cache_state->present);
+ r = create_and_open_bdev();
+ if (r < 0) {
+ on_finish->complete(r);
+ return false;
+ }
+
+ bufferlist bl;
+ SuperBlock superblock;
+ ::IOContext ioctx(cct, nullptr);
+ r = bdev->read(0, MIN_WRITE_ALLOC_SSD_SIZE, &bl, &ioctx, false);
+ if (r < 0) {
+ lderr(cct) << "Read ssd cache superblock failed " << dendl;
+ goto error_handle;
+ }
+ auto p = bl.cbegin();
+ decode(superblock, p);
+ pool_root = superblock.root;
+ ldout(cct, 1) << "Decoded root: pool_size=" << pool_root.pool_size
+ << " first_valid_entry=" << pool_root.first_valid_entry
+ << " first_free_entry=" << pool_root.first_free_entry
+ << " flushed_sync_gen=" << pool_root.flushed_sync_gen
+ << dendl;
+ ceph_assert(is_valid_pool_root(pool_root));
+ if (pool_root.layout_version != SSD_LAYOUT_VERSION) {
+ lderr(cct) << "Pool layout version is "
+ << pool_root.layout_version
+ << " expected " << SSD_LAYOUT_VERSION
+ << dendl;
+ goto error_handle;
+ }
+ if (pool_root.block_size != MIN_WRITE_ALLOC_SSD_SIZE) {
+ lderr(cct) << "Pool block size is " << pool_root.block_size
+ << " expected " << MIN_WRITE_ALLOC_SSD_SIZE
+ << dendl;
+ goto error_handle;
+ }
+
+ this->m_log_pool_size = pool_root.pool_size;
+ this->m_flushed_sync_gen = pool_root.flushed_sync_gen;
+ this->m_first_valid_entry = pool_root.first_valid_entry;
+ this->m_first_free_entry = pool_root.first_free_entry;
+ this->m_bytes_allocated_cap = this->m_log_pool_size -
+ DATA_RING_BUFFER_OFFSET -
+ MIN_WRITE_ALLOC_SSD_SIZE;
+
+ load_existing_entries(later);
+ m_cache_state->clean = this->m_dirty_log_entries.empty();
+ m_cache_state->empty = m_log_entries.empty();
+ }
+ return true;
+
+error_handle:
+ bdev->close();
+ delete bdev;
+ on_finish->complete(-EINVAL);
+ return false;
+}
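+
+/* Illustrative on-disk layout established above (constants are defined in
+ * pwl/Types.h):
+ *
+ *   [0, MIN_WRITE_ALLOC_SSD_SIZE)         superblock (encoded pool root)
+ *   [DATA_RING_BUFFER_OFFSET, pool_size)  ring buffer of 4K control blocks,
+ *                                         each followed by the 4K-aligned
+ *                                         data of the entries it describes
+ */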
+
+template <typename I>
+void WriteLog<I>::remove_pool_file() {
+ ceph_assert(bdev);
+ bdev->close();
+ delete bdev;
+ bdev = nullptr;
+ ldout(m_image_ctx.cct, 5) << "block device is closed" << dendl;
+
+ if (m_cache_state->clean) {
+ ldout(m_image_ctx.cct, 5) << "Removing empty pool file: "
+ << this->m_log_pool_name << dendl;
+ if (remove(this->m_log_pool_name.c_str()) != 0) {
+ lderr(m_image_ctx.cct) << "failed to remove empty pool \""
+ << this->m_log_pool_name << "\": " << dendl;
+ } else {
+ m_cache_state->present = false;
+ }
+ } else {
+ ldout(m_image_ctx.cct, 5) << "Not removing pool file: "
+ << this->m_log_pool_name << dendl;
+ }
+}
+
+template <typename I>
+void WriteLog<I>::load_existing_entries(pwl::DeferredContexts &later) {
+ CephContext *cct = m_image_ctx.cct;
+ std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
+ std::map<uint64_t, bool> missing_sync_points;
+
+ // Walk the ring buffer from the first valid entry to the first free
+ // entry: decode each 4K control block, append its log entries to the
+ // in-memory list, and advance past each entry's aligned write data
+ for (uint64_t next_log_pos = this->m_first_valid_entry;
+ next_log_pos != this->m_first_free_entry; ) {
+ // read the entries from SSD cache and decode
+ bufferlist bl_entries;
+ ::IOContext ioctx_entry(cct, nullptr);
+ bdev->read(next_log_pos, MIN_WRITE_ALLOC_SSD_SIZE, &bl_entries,
+ &ioctx_entry, false);
+ std::vector<WriteLogCacheEntry> ssd_log_entries;
+ auto pl = bl_entries.cbegin();
+ decode(ssd_log_entries, pl);
+ ldout(cct, 5) << "decoded ssd log entries" << dendl;
+ uint64_t curr_log_pos = next_log_pos;
+ std::shared_ptr<GenericLogEntry> log_entry = nullptr;
+
+ for (auto it = ssd_log_entries.begin(); it != ssd_log_entries.end(); ++it) {
+ this->update_entries(&log_entry, &*it, missing_sync_points,
+ sync_point_entries, curr_log_pos);
+ log_entry->ram_entry = *it;
+ log_entry->log_entry_index = curr_log_pos;
+ log_entry->completed = true;
+ m_log_entries.push_back(log_entry);
+ next_log_pos += round_up_to(it->write_bytes, MIN_WRITE_ALLOC_SSD_SIZE);
+ }
+ // along with the write_bytes, add control block size too
+ next_log_pos += MIN_WRITE_ALLOC_SSD_SIZE;
+ if (next_log_pos >= this->m_log_pool_size) {
+ next_log_pos = next_log_pos % this->m_log_pool_size + DATA_RING_BUFFER_OFFSET;
+ }
+ }
+ this->update_sync_points(missing_sync_points, sync_point_entries, later);
+ if (m_first_valid_entry > m_first_free_entry) {
+ m_bytes_allocated = this->m_log_pool_size - m_first_valid_entry +
+ m_first_free_entry - DATA_RING_BUFFER_OFFSET;
+ } else {
+ m_bytes_allocated = m_first_free_entry - m_first_valid_entry;
+ }
+}
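+
+// Worked example for the wrap-around branch above (illustrative numbers):
+// with pool_size == 100 MiB, DATA_RING_BUFFER_OFFSET == 8 KiB,
+// first_valid_entry == 90 MiB and first_free_entry == 10 MiB, the ring has
+// wrapped, so m_bytes_allocated == (100 - 90) MiB + 10 MiB - 8 KiB.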
+
+// For SSD, m_bytes_allocated is recalculated in load_existing_entries(), not here.
+template <typename I>
+void WriteLog<I>::inc_allocated_cached_bytes(
+ std::shared_ptr<pwl::GenericLogEntry> log_entry) {
+ if (log_entry->is_write_entry()) {
+ this->m_bytes_cached += log_entry->write_bytes();
+ }
+}
+
+template <typename I>
+bool WriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
+ bool alloc_succeeds = true;
+ uint64_t bytes_allocated = 0;
+ uint64_t bytes_cached = 0;
+ uint64_t bytes_dirtied = 0;
+ uint64_t num_lanes = 0;
+ uint64_t num_unpublished_reserves = 0;
+ uint64_t num_log_entries = 0;
+
+ // Setup buffer, and get all the number of required resources
+ req->setup_buffer_resources(&bytes_cached, &bytes_dirtied, &bytes_allocated,
+ &num_lanes, &num_log_entries,
+ &num_unpublished_reserves);
+
+ ceph_assert(!num_lanes);
+ if (num_log_entries) {
+ bytes_allocated += num_log_entries * MIN_WRITE_ALLOC_SSD_SIZE;
+ num_log_entries = 0;
+ }
+ ceph_assert(!num_unpublished_reserves);
+
+ alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied,
+ bytes_allocated, num_lanes,
+ num_log_entries,
+ num_unpublished_reserves);
+ req->set_allocated(alloc_succeeds);
+ return alloc_succeeds;
+}
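+
+// Example (illustrative): a write that caches 5000 bytes and needs one log
+// entry reserves round_up_to(5000, 4096) == 8192 data bytes (from
+// setup_buffer_resources()) plus MIN_WRITE_ALLOC_SSD_SIZE for its control
+// block. Because up to CONTROL_BLOCK_MAX_LOG_ENTRIES entries can later
+// share one control block, the surplus is credited back in append_ops().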
+
+template <typename I>
+bool WriteLog<I>::has_sync_point_logs(GenericLogOperations &ops) {
+ for (auto &op : ops) {
+ if (op->get_log_entry()->is_sync_point()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template<typename I>
+void WriteLog<I>::enlist_op_appender() {
+ this->m_async_append_ops++;
+ this->m_async_op_tracker.start_op();
+ Context *append_ctx = new LambdaContext([this](int r) {
+ append_scheduled_ops();
+ });
+ this->m_work_queue.queue(append_ctx);
+}
+
+/*
+ * Takes custody of ops. They'll all get their log entries appended,
+ * and have their on_write_persist contexts completed once they and
+ * all prior log entries are persisted everywhere.
+ */
+template<typename I>
+void WriteLog<I>::schedule_append_ops(GenericLogOperations &ops, C_BlockIORequestT *req) {
+ bool need_finisher = false;
+ GenericLogOperationsVector appending;
+
+ std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
+ {
+ std::lock_guard locker(m_lock);
+
+ bool persist_on_flush = this->get_persist_on_flush();
+ need_finisher = !this->m_appending &&
+ ((this->m_ops_to_append.size() >= CONTROL_BLOCK_MAX_LOG_ENTRIES) ||
+ !persist_on_flush);
+
+ // Only flush logs into SSD when there is internal/external flush request
+ if (!need_finisher) {
+ need_finisher = has_sync_point_logs(ops);
+ }
+ this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops);
+
+ // To preserve the order of overlapping IOs, release_cell() may be
+ // called only after the ops are added to m_ops_to_append.
+ // As soon as m_lock is released, the appended ops can be picked up
+ // by append_scheduled_ops() in another thread and req can be freed.
+ if (req != nullptr) {
+ if (persist_on_flush) {
+ req->complete_user_request(0);
+ }
+ req->release_cell();
+ }
+ }
+
+ if (need_finisher) {
+ this->enlist_op_appender();
+ }
+
+ for (auto &op : appending) {
+ op->appending();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::setup_schedule_append(pwl::GenericLogOperationsVector &ops,
+ bool do_early_flush,
+ C_BlockIORequestT *req) {
+ this->schedule_append(ops, req);
+}
+
+template <typename I>
+void WriteLog<I>::append_scheduled_ops(void) {
+ GenericLogOperations ops;
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ bool ops_remain = false; // no-op variable for SSD
+ bool appending = false; // no-op variable for SSD
+ this->append_scheduled(ops, ops_remain, appending);
+
+ if (ops.size()) {
+ alloc_op_log_entries(ops);
+ append_op_log_entries(ops);
+ } else {
+ this->m_async_append_ops--;
+ this->m_async_op_tracker.finish_op();
+ }
+}
+
+/*
+ * Write and persist the (already allocated) write log entries and
+ * data buffer allocations for a set of ops. The data buffer for each
+ * of these must already have been persisted to its reserved area.
+ */
+template <typename I>
+void WriteLog<I>::append_op_log_entries(GenericLogOperations &ops) {
+ ceph_assert(!ops.empty());
+ ldout(m_image_ctx.cct, 20) << dendl;
+ Context *ctx = new LambdaContext([this, ops](int r) {
+ ceph_assert(r == 0);
+ ldout(m_image_ctx.cct, 20) << "Finished root update" << dendl;
+
+ auto captured_ops = std::move(ops);
+ this->complete_op_log_entries(std::move(captured_ops), r);
+
+ bool need_finisher = false;
+ {
+ std::lock_guard locker1(m_lock);
+ bool persist_on_flush = this->get_persist_on_flush();
+ need_finisher = ((this->m_ops_to_append.size() >= CONTROL_BLOCK_MAX_LOG_ENTRIES) ||
+ !persist_on_flush);
+
+ if (!need_finisher) {
+ need_finisher = has_sync_point_logs(this->m_ops_to_append);
+ }
+ }
+
+ if (need_finisher) {
+ this->enlist_op_appender();
+ }
+ this->m_async_update_superblock--;
+ this->m_async_op_tracker.finish_op();
+ });
+ uint64_t *new_first_free_entry = new uint64_t;
+ Context *append_ctx = new LambdaContext(
+ [this, new_first_free_entry, ops, ctx](int r) {
+ std::shared_ptr<WriteLogPoolRoot> new_root;
+ {
+ ldout(m_image_ctx.cct, 20) << "Finished appending at "
+ << *new_first_free_entry << dendl;
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ operation->log_append_comp_time = now;
+ }
+
+ std::lock_guard locker(this->m_log_append_lock);
+ std::lock_guard locker1(m_lock);
+ ceph_assert(this->m_appending);
+ this->m_appending = false;
+ new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
+ pool_root.first_free_entry = *new_first_free_entry;
+ new_root->first_free_entry = *new_first_free_entry;
+ delete new_first_free_entry;
+ schedule_update_root(new_root, ctx);
+ }
+ this->m_async_append_ops--;
+ this->m_async_op_tracker.finish_op();
+ });
+ // Append logs and update first_free_entry
+ append_ops(ops, append_ctx, new_first_free_entry);
+
+ if (ops.size()) {
+ this->dispatch_deferred_writes();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::release_ram(std::shared_ptr<GenericLogEntry> log_entry) {
+ log_entry->remove_cache_bl();
+}
+
+template <typename I>
+void WriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops) {
+ std::unique_lock locker(m_lock);
+
+ for (auto &operation : ops) {
+ auto &log_entry = operation->get_log_entry();
+ log_entry->ram_entry.set_entry_valid(true);
+ m_log_entries.push_back(log_entry);
+ ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
+ }
+ if (m_cache_state->empty && !m_log_entries.empty()) {
+ m_cache_state->empty = false;
+ this->update_image_cache_state();
+ this->write_image_cache_state(locker);
+ }
+}
+
+template <typename I>
+void WriteLog<I>::construct_flush_entries(pwl::GenericLogEntries entries_to_flush,
+ DeferredContexts &post_unlock,
+ bool has_write_entry) {
+ // snapshot so we behave consistently
+ bool invalidating = this->m_invalidating;
+
+ if (invalidating || !has_write_entry) {
+ for (auto &log_entry : entries_to_flush) {
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, log_entry, invalidating]
+ (GuardedRequestFunctionContext &guard_ctx) {
+ log_entry->m_cell = guard_ctx.cell;
+ Context *ctx = this->construct_flush_entry(log_entry, invalidating);
+
+ if (!invalidating) {
+ ctx = new LambdaContext([this, log_entry, ctx](int r) {
+ m_image_ctx.op_work_queue->queue(new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+ << " " << *log_entry << dendl;
+ log_entry->writeback(this->m_image_writeback, ctx);
+ }), 0);
+ });
+ }
+ ctx->complete(0);
+ });
+ this->detain_flush_guard_request(log_entry, guarded_ctx);
+ }
+ } else {
+ int count = entries_to_flush.size();
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> write_entries;
+ std::vector<bufferlist *> read_bls;
+
+ write_entries.reserve(count);
+ read_bls.reserve(count);
+
+ for (auto &log_entry : entries_to_flush) {
+ if (log_entry->is_write_entry()) {
+ bufferlist *bl = new bufferlist;
+ auto write_entry = static_pointer_cast<WriteLogEntry>(log_entry);
+ write_entry->inc_bl_refs();
+ write_entries.push_back(write_entry);
+ read_bls.push_back(bl);
+ }
+ }
+
+ Context *ctx = new LambdaContext(
+ [this, entries_to_flush, read_bls](int r) {
+ int i = 0;
+ GuardedRequestFunctionContext *guarded_ctx = nullptr;
+
+ for (auto &log_entry : entries_to_flush) {
+ if (log_entry->is_write_entry()) {
+ bufferlist captured_entry_bl;
+ captured_entry_bl.claim_append(*read_bls[i]);
+ delete read_bls[i++];
+
+ guarded_ctx = new GuardedRequestFunctionContext([this, log_entry, captured_entry_bl]
+ (GuardedRequestFunctionContext &guard_ctx) {
+ log_entry->m_cell = guard_ctx.cell;
+ Context *ctx = this->construct_flush_entry(log_entry, false);
+
+ m_image_ctx.op_work_queue->queue(new LambdaContext(
+ [this, log_entry, entry_bl=std::move(captured_entry_bl), ctx](int r) {
+ auto captured_entry_bl = std::move(entry_bl);
+ ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+ << " " << *log_entry << dendl;
+ log_entry->writeback_bl(this->m_image_writeback, ctx,
+ std::move(captured_entry_bl));
+ }), 0);
+ });
+ } else {
+ guarded_ctx = new GuardedRequestFunctionContext([this, log_entry]
+ (GuardedRequestFunctionContext &guard_ctx) {
+ log_entry->m_cell = guard_ctx.cell;
+ Context *ctx = this->construct_flush_entry(log_entry, false);
+ m_image_ctx.op_work_queue->queue(new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+ << " " << *log_entry << dendl;
+ log_entry->writeback(this->m_image_writeback, ctx);
+ }), 0);
+ });
+ }
+ this->detain_flush_guard_request(log_entry, guarded_ctx);
+ }
+ });
+
+ aio_read_data_blocks(write_entries, read_bls, ctx);
+ }
+}
+
+template <typename I>
+void WriteLog<I>::process_work() {
+ CephContext *cct = m_image_ctx.cct;
+ int max_iterations = 4;
+ bool wake_up_requested = false;
+ uint64_t aggressive_high_water_bytes =
+ this->m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER;
+ uint64_t high_water_bytes = this->m_bytes_allocated_cap * RETIRE_HIGH_WATER;
+
+ ldout(cct, 20) << dendl;
+
+ do {
+ {
+ std::lock_guard locker(m_lock);
+ this->m_wake_up_requested = false;
+ }
+ if (this->m_alloc_failed_since_retire || (this->m_shutting_down) ||
+ this->m_invalidating || m_bytes_allocated > high_water_bytes) {
+ ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire
+ << ", allocated > high_water="
+ << (m_bytes_allocated > high_water_bytes)
+ << dendl;
+ retire_entries((this->m_shutting_down || this->m_invalidating ||
+ m_bytes_allocated > aggressive_high_water_bytes)
+ ? MAX_ALLOC_PER_TRANSACTION : MAX_FREE_PER_TRANSACTION);
+ }
+ this->dispatch_deferred_writes();
+ this->process_writeback_dirty_entries();
+ {
+ std::lock_guard locker(m_lock);
+ wake_up_requested = this->m_wake_up_requested;
+ }
+ } while (wake_up_requested && --max_iterations > 0);
+
+ {
+ std::lock_guard locker(m_lock);
+ this->m_wake_up_scheduled = false;
+ // Reschedule if it's still requested
+ if (this->m_wake_up_requested) {
+ this->wake_up();
+ }
+ }
+}
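+
+// Example (illustrative; the watermark fractions are defined in
+// pwl/Types.h): assuming RETIRE_HIGH_WATER == 0.5 and
+// AGGRESSIVE_RETIRE_HIGH_WATER == 0.75 with a 1 GiB allocation cap,
+// retirement starts above 512 MiB allocated and switches to the larger
+// MAX_ALLOC_PER_TRANSACTION batch size above 768 MiB.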
+
+/**
+ * Retire up to frees_per_tx of the oldest log entries that are
+ * eligible to be retired. Returns true if anything was retired.
+ */
+template <typename I>
+bool WriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
+ CephContext *cct = m_image_ctx.cct;
+ GenericLogEntriesVector retiring_entries;
+ uint64_t initial_first_valid_entry;
+ uint64_t first_valid_entry;
+
+ std::lock_guard retire_locker(this->m_log_retire_lock);
+ ldout(cct, 20) << "Look for entries to retire" << dendl;
+ {
+ // Entry readers can't be added while we hold m_entry_reader_lock
+ RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock);
+ std::lock_guard locker(m_lock);
+ initial_first_valid_entry = m_first_valid_entry;
+ first_valid_entry = m_first_valid_entry;
+ while (retiring_entries.size() < frees_per_tx && !m_log_entries.empty()) {
+ GenericLogEntriesVector retiring_subentries;
+ uint64_t control_block_pos = m_log_entries.front()->log_entry_index;
+ uint64_t data_length = 0;
+ for (auto it = m_log_entries.begin(); it != m_log_entries.end(); ++it) {
+ if (this->can_retire_entry(*it)) {
+ // log_entry_index is valid after appending to SSD
+ if ((*it)->log_entry_index != control_block_pos) {
+ ldout(cct, 20) << "Old log_entry_index is " << control_block_pos
+ << ",New log_entry_index is "
+ << (*it)->log_entry_index
+ << ",data length is " << data_length << dendl;
+ ldout(cct, 20) << "The log entry is " << *(*it) << dendl;
+ if ((*it)->log_entry_index < control_block_pos) {
+ ceph_assert((*it)->log_entry_index ==
+ (control_block_pos + data_length + MIN_WRITE_ALLOC_SSD_SIZE) %
+ this->m_log_pool_size + DATA_RING_BUFFER_OFFSET);
+ } else {
+ ceph_assert((*it)->log_entry_index == control_block_pos +
+ data_length + MIN_WRITE_ALLOC_SSD_SIZE);
+ }
+ break;
+ } else {
+ retiring_subentries.push_back(*it);
+ if ((*it)->is_write_entry()) {
+ data_length += (*it)->get_aligned_data_size();
+ }
+ }
+ } else {
+ retiring_subentries.clear();
+ break;
+ }
+ }
+ // SSD: retiring_subentries in a span
+ if (!retiring_subentries.empty()) {
+ for (auto it = retiring_subentries.begin();
+ it != retiring_subentries.end(); it++) {
+ ceph_assert(m_log_entries.front() == *it);
+ m_log_entries.pop_front();
+ if ((*it)->write_bytes() > 0 || (*it)->bytes_dirty() > 0) {
+ auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(*it);
+ if (gen_write_entry) {
+ this->m_blocks_to_log_entries.remove_log_entry(gen_write_entry);
+ }
+ }
+ }
+
+ ldout(cct, 20) << "span with " << retiring_subentries.size()
+ << " entries: control_block_pos=" << control_block_pos
+ << " data_length=" << data_length
+ << dendl;
+ retiring_entries.insert(
+ retiring_entries.end(), retiring_subentries.begin(),
+ retiring_subentries.end());
+
+ first_valid_entry = control_block_pos + data_length +
+ MIN_WRITE_ALLOC_SSD_SIZE;
+ if (first_valid_entry >= this->m_log_pool_size) {
+ first_valid_entry = first_valid_entry % this->m_log_pool_size +
+ DATA_RING_BUFFER_OFFSET;
+ }
+ } else {
+ break;
+ }
+ }
+ }
+ if (retiring_entries.size()) {
+ ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries"
+ << dendl;
+
+ // Advance first valid entry and release buffers
+ uint64_t flushed_sync_gen;
+ std::lock_guard append_locker(this->m_log_append_lock);
+ {
+ std::lock_guard locker(m_lock);
+ flushed_sync_gen = this->m_flushed_sync_gen;
+ }
+
+ ceph_assert(first_valid_entry != initial_first_valid_entry);
+ auto new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
+ new_root->flushed_sync_gen = flushed_sync_gen;
+ new_root->first_valid_entry = first_valid_entry;
+ pool_root.flushed_sync_gen = flushed_sync_gen;
+ pool_root.first_valid_entry = first_valid_entry;
+
+ Context *ctx = new LambdaContext(
+ [this, first_valid_entry, initial_first_valid_entry,
+ retiring_entries](int r) {
+ uint64_t allocated_bytes = 0;
+ uint64_t cached_bytes = 0;
+ uint64_t former_log_pos = 0;
+ for (auto &entry : retiring_entries) {
+ ceph_assert(entry->log_entry_index != 0);
+ if (entry->log_entry_index != former_log_pos) {
+ // Space for control blocks
+ allocated_bytes += MIN_WRITE_ALLOC_SSD_SIZE;
+ former_log_pos = entry->log_entry_index;
+ }
+ if (entry->is_write_entry()) {
+ cached_bytes += entry->write_bytes();
+ // space for userdata
+ allocated_bytes += entry->get_aligned_data_size();
+ }
+ }
+ bool need_update_state = false;
+ {
+ std::lock_guard locker(m_lock);
+ m_first_valid_entry = first_valid_entry;
+ ceph_assert(m_first_valid_entry % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+ ceph_assert(this->m_bytes_allocated >= allocated_bytes);
+ this->m_bytes_allocated -= allocated_bytes;
+ ceph_assert(this->m_bytes_cached >= cached_bytes);
+ this->m_bytes_cached -= cached_bytes;
+ if (!m_cache_state->empty && m_log_entries.empty()) {
+ m_cache_state->empty = true;
+ this->update_image_cache_state();
+ need_update_state = true;
+ }
+
+ ldout(m_image_ctx.cct, 20)
+ << "Finished root update: initial_first_valid_entry="
+ << initial_first_valid_entry << ", m_first_valid_entry="
+ << m_first_valid_entry << ", released space="
+ << allocated_bytes << ", m_bytes_allocated="
+ << m_bytes_allocated << ", released cached space="
+ << cached_bytes << ", m_bytes_cached="
+ << this->m_bytes_cached << dendl;
+
+ this->m_alloc_failed_since_retire = false;
+ this->wake_up();
+ }
+ if (need_update_state) {
+ std::unique_lock locker(m_lock);
+ this->write_image_cache_state(locker);
+ }
+
+ this->dispatch_deferred_writes();
+ this->process_writeback_dirty_entries();
+ m_async_update_superblock--;
+ this->m_async_op_tracker.finish_op();
+ });
+
+ std::lock_guard locker(m_lock);
+ schedule_update_root(new_root, ctx);
+ } else {
+ ldout(cct, 20) << "Nothing to retire" << dendl;
+ return false;
+ }
+ return true;
+}
+
+template <typename I>
+void WriteLog<I>::append_ops(GenericLogOperations &ops, Context *ctx,
+ uint64_t* new_first_free_entry) {
+ GenericLogEntriesVector log_entries;
+ CephContext *cct = m_image_ctx.cct;
+ uint64_t span_payload_len = 0;
+ uint64_t bytes_to_free = 0;
+ ldout(cct, 20) << "Appending " << ops.size() << " log entries." << dendl;
+
+ *new_first_free_entry = pool_root.first_free_entry;
+ AioTransContext* aio = new AioTransContext(cct, ctx);
+
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ operation->log_append_start_time = now;
+ auto log_entry = operation->get_log_entry();
+
+ if (log_entries.size() == CONTROL_BLOCK_MAX_LOG_ENTRIES ||
+ span_payload_len >= SPAN_MAX_DATA_LEN) {
+ if (log_entries.size() > 1) {
+ bytes_to_free += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE;
+ }
+ write_log_entries(log_entries, aio, new_first_free_entry);
+ log_entries.clear();
+ span_payload_len = 0;
+ }
+ log_entries.push_back(log_entry);
+ span_payload_len += log_entry->write_bytes();
+ }
+ if (!span_payload_len || !log_entries.empty()) {
+ if (log_entries.size() > 1) {
+ bytes_to_free += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE;
+ }
+ write_log_entries(log_entries, aio, new_first_free_entry);
+ }
+
+ {
+ std::lock_guard locker1(m_lock);
+ m_first_free_entry = *new_first_free_entry;
+ m_bytes_allocated -= bytes_to_free;
+ }
+
+ bdev->aio_submit(&aio->ioc);
+}
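+
+// Example of the bytes_to_free credit above (illustrative): if three log
+// entries end up sharing one control block, alloc_resources() charged
+// 3 * MIN_WRITE_ALLOC_SSD_SIZE for control blocks but only one is written,
+// so 2 * MIN_WRITE_ALLOC_SSD_SIZE is returned to m_bytes_allocated.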
+
+template <typename I>
+void WriteLog<I>::write_log_entries(GenericLogEntriesVector log_entries,
+ AioTransContext *aio, uint64_t *pos) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(m_image_ctx.cct, 20) << "pos=" << *pos << dendl;
+ ceph_assert(*pos >= DATA_RING_BUFFER_OFFSET &&
+ *pos < this->m_log_pool_size &&
+ *pos % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+
+ // The first block is for log entries
+ uint64_t control_block_pos = *pos;
+ *pos += MIN_WRITE_ALLOC_SSD_SIZE;
+ if (*pos == this->m_log_pool_size) {
+ *pos = DATA_RING_BUFFER_OFFSET;
+ }
+
+ std::vector<WriteLogCacheEntry> persist_log_entries;
+ bufferlist data_bl;
+ for (auto &log_entry : log_entries) {
+ log_entry->log_entry_index = control_block_pos;
+ // Append data buffer for write operations
+ if (log_entry->is_write_entry()) {
+ auto write_entry = static_pointer_cast<WriteLogEntry>(log_entry);
+ auto cache_bl = write_entry->get_cache_bl();
+ auto align_size = write_entry->get_aligned_data_size();
+ data_bl.append(cache_bl);
+ data_bl.append_zero(align_size - cache_bl.length());
+
+ write_entry->ram_entry.write_data_pos = *pos;
+ *pos += align_size;
+ if (*pos >= this->m_log_pool_size) {
+ *pos = *pos % this->m_log_pool_size + DATA_RING_BUFFER_OFFSET;
+ }
+ }
+ // push_back _after_ setting write_data_pos
+ persist_log_entries.push_back(log_entry->ram_entry);
+ }
+
+ // aio write
+ bufferlist bl;
+ encode(persist_log_entries, bl);
+ ceph_assert(bl.length() <= MIN_WRITE_ALLOC_SSD_SIZE);
+ bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length());
+ bl.append(data_bl);
+ ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+ if (control_block_pos + bl.length() > this->m_log_pool_size) {
+ // exceeds the pool boundary, need to split
+ uint64_t size = bl.length();
+ bufferlist bl1;
+ bl.splice(0, this->m_log_pool_size - control_block_pos, &bl1);
+ ceph_assert(bl.length() == (size - bl1.length()));
+
+ ldout(cct, 20) << "write " << control_block_pos << "~"
+ << size << " spans boundary, split into "
+ << control_block_pos << "~" << bl1.length()
+ << " and " << DATA_RING_BUFFER_OFFSET << "~"
+ << bl.length() << dendl;
+ bdev->aio_write(control_block_pos, bl1, &aio->ioc, false,
+ WRITE_LIFE_NOT_SET);
+ bdev->aio_write(DATA_RING_BUFFER_OFFSET, bl, &aio->ioc, false,
+ WRITE_LIFE_NOT_SET);
+ } else {
+ ldout(cct, 20) << "write " << control_block_pos << "~"
+ << bl.length() << dendl;
+ bdev->aio_write(control_block_pos, bl, &aio->ioc, false,
+ WRITE_LIFE_NOT_SET);
+ }
+}
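+
+// Worked example for the split above (illustrative): with pool_size ==
+// 1 GiB and control_block_pos == 1 GiB - 4 KiB, a 16 KiB span (4 KiB
+// control block plus 12 KiB of data) is written as [1 GiB - 4 KiB, 1 GiB)
+// and [DATA_RING_BUFFER_OFFSET, DATA_RING_BUFFER_OFFSET + 12 KiB).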
+
+template <typename I>
+void WriteLog<I>::schedule_update_root(
+ std::shared_ptr<WriteLogPoolRoot> root, Context *ctx) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 15) << "New root: pool_size=" << root->pool_size
+ << " first_valid_entry=" << root->first_valid_entry
+ << " first_free_entry=" << root->first_free_entry
+ << " flushed_sync_gen=" << root->flushed_sync_gen
+ << dendl;
+ ceph_assert(is_valid_pool_root(*root));
+
+ bool need_finisher;
+ {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ need_finisher = m_poolroot_to_update.empty() && !m_updating_pool_root;
+ std::shared_ptr<WriteLogPoolRootUpdate> entry =
+ std::make_shared<WriteLogPoolRootUpdate>(root, ctx);
+ this->m_async_update_superblock++;
+ this->m_async_op_tracker.start_op();
+ m_poolroot_to_update.emplace_back(entry);
+ }
+ if (need_finisher) {
+ enlist_op_update_root();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::enlist_op_update_root() {
+ Context *append_ctx = new LambdaContext([this](int r) {
+ update_root_scheduled_ops();
+ });
+ this->m_work_queue.queue(append_ctx);
+}
+
+template <typename I>
+void WriteLog<I>::update_root_scheduled_ops() {
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ std::shared_ptr<WriteLogPoolRoot> root;
+ WriteLogPoolRootUpdateList root_updates;
+ Context *ctx = nullptr;
+ {
+ std::lock_guard locker(m_lock);
+ if (m_updating_pool_root) {
+ /* Another thread is appending */
+ ldout(m_image_ctx.cct, 15) << "Another thread is updating pool root"
+ << dendl;
+ return;
+ }
+ if (m_poolroot_to_update.size()) {
+ m_updating_pool_root = true;
+ root_updates.swap(m_poolroot_to_update);
+ }
+ }
+ ceph_assert(!root_updates.empty());
+ ldout(m_image_ctx.cct, 15) << "Update root number: " << root_updates.size()
+ << dendl;
+ // We just update the last one, and call all the completions.
+ auto entry = root_updates.back();
+ root = entry->root;
+
+ ctx = new LambdaContext([this, updates = std::move(root_updates)](int r) {
+ ldout(m_image_ctx.cct, 15) << "Start to callback." << dendl;
+ for (auto it = updates.begin(); it != updates.end(); it++) {
+ Context *it_ctx = (*it)->ctx;
+ it_ctx->complete(r);
+ }
+ });
+ Context *append_ctx = new LambdaContext([this, ctx](int r) {
+ ldout(m_image_ctx.cct, 15) << "Finish the update of pool root." << dendl;
+ bool need_finisher = false;
+ ceph_assert(r == 0);
+ {
+ std::lock_guard locker(m_lock);
+ m_updating_pool_root = false;
+ need_finisher = !m_poolroot_to_update.empty();
+ }
+ if (need_finisher) {
+ enlist_op_update_root();
+ }
+ ctx->complete(r);
+ });
+ AioTransContext* aio = new AioTransContext(m_image_ctx.cct, append_ctx);
+ update_pool_root(root, aio);
+}
+
+template <typename I>
+void WriteLog<I>::update_pool_root(std::shared_ptr<WriteLogPoolRoot> root,
+ AioTransContext *aio) {
+ bufferlist bl;
+ SuperBlock superblock;
+ superblock.root = *root;
+ encode(superblock, bl);
+ bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length());
+ ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+ bdev->aio_write(0, bl, &aio->ioc, false, WRITE_LIFE_NOT_SET);
+ bdev->aio_submit(&aio->ioc);
+}
+
+template <typename I>
+int WriteLog<I>::update_pool_root_sync(
+ std::shared_ptr<WriteLogPoolRoot> root) {
+ bufferlist bl;
+ SuperBlock superblock;
+ superblock.root = *root;
+ encode(superblock, bl);
+ bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length());
+ ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+ return bdev->write(0, bl, false);
+}
+
+template <typename I>
+void WriteLog<I>::aio_read_data_block(std::shared_ptr<GenericWriteLogEntry> log_entry,
+ bufferlist *bl, Context *ctx) {
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> log_entries = {std::move(log_entry)};
+ std::vector<bufferlist *> bls {bl};
+ aio_read_data_blocks(log_entries, bls, ctx);
+}
+
+template <typename I>
+void WriteLog<I>::aio_read_data_blocks(
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries,
+ std::vector<bufferlist *> &bls, Context *ctx) {
+ ceph_assert(log_entries.size() == bls.size());
+
+ // get the valid part
+ Context *read_ctx = new LambdaContext(
+ [log_entries, bls, ctx](int r) {
+ for (unsigned int i = 0; i < log_entries.size(); i++) {
+ bufferlist valid_data_bl;
+ auto write_entry = static_pointer_cast<WriteLogEntry>(log_entries[i]);
+ auto length = write_entry->ram_entry.is_write() ? write_entry->ram_entry.write_bytes
+ : write_entry->ram_entry.ws_datalen;
+
+ valid_data_bl.substr_of(*bls[i], 0, length);
+ bls[i]->clear();
+ bls[i]->append(valid_data_bl);
+ write_entry->dec_bl_refs();
+ }
+ ctx->complete(r);
+ });
+
+ CephContext *cct = m_image_ctx.cct;
+ AioTransContext *aio = new AioTransContext(cct, read_ctx);
+ for (unsigned int i = 0; i < log_entries.size(); i++) {
+ WriteLogCacheEntry *log_entry = &log_entries[i]->ram_entry;
+
+ ceph_assert(log_entry->is_write() || log_entry->is_writesame());
+ uint64_t len = log_entry->is_write() ? log_entry->write_bytes :
+ log_entry->ws_datalen;
+ uint64_t align_len = round_up_to(len, MIN_WRITE_ALLOC_SSD_SIZE);
+
+ ldout(cct, 20) << "entry i=" << i << " " << log_entry->write_data_pos
+ << "~" << len << dendl;
+ ceph_assert(log_entry->write_data_pos >= DATA_RING_BUFFER_OFFSET &&
+ log_entry->write_data_pos < pool_root.pool_size);
+ ceph_assert(align_len);
+ if (log_entry->write_data_pos + align_len > pool_root.pool_size) {
+ // spans boundary, need to split
+ uint64_t len1 = pool_root.pool_size - log_entry->write_data_pos;
+ uint64_t len2 = align_len - len1;
+
+ ldout(cct, 20) << "read " << log_entry->write_data_pos << "~"
+ << align_len << " spans boundary, split into "
+ << log_entry->write_data_pos << "~" << len1
+ << " and " << DATA_RING_BUFFER_OFFSET << "~"
+ << len2 << dendl;
+ bdev->aio_read(log_entry->write_data_pos, len1, bls[i], &aio->ioc);
+ bdev->aio_read(DATA_RING_BUFFER_OFFSET, len2, bls[i], &aio->ioc);
+ } else {
+ ldout(cct, 20) << "read " << log_entry->write_data_pos << "~"
+ << align_len << dendl;
+ bdev->aio_read(log_entry->write_data_pos, align_len, bls[i], &aio->ioc);
+ }
+ }
+ bdev->aio_submit(&aio->ioc);
+}
+
+template <typename I>
+void WriteLog<I>::complete_user_request(Context *&user_req, int r) {
+ m_image_ctx.op_work_queue->queue(user_req, r);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ssd::WriteLog<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/ssd/WriteLog.h b/src/librbd/cache/pwl/ssd/WriteLog.h
new file mode 100644
index 000000000..69cc36662
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/WriteLog.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
+
+#include "blk/BlockDevice.h"
+#include "common/AsyncOpTracker.h"
+#include "common/Checksummer.h"
+#include "common/environment.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/ssd/Builder.h"
+#include "librbd/cache/pwl/ssd/Types.h"
+#include <functional>
+#include <list>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+template <typename ImageCtxT>
+class WriteLog : public AbstractWriteLog<ImageCtxT> {
+public:
+ WriteLog(ImageCtxT &image_ctx,
+ librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+ cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<ImageCtxT>& plugin_api);
+ ~WriteLog();
+ WriteLog(const WriteLog&) = delete;
+ WriteLog &operator=(const WriteLog&) = delete;
+
+ typedef io::Extent Extent;
+ using This = AbstractWriteLog<ImageCtxT>;
+ using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+ using C_WriteRequestT = pwl::C_WriteRequest<This>;
+ using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+
+ bool alloc_resources(C_BlockIORequestT *req) override;
+ void setup_schedule_append(
+ pwl::GenericLogOperationsVector &ops, bool do_early_flush,
+ C_BlockIORequestT *req) override;
+ void complete_user_request(Context *&user_req, int r) override;
+
+protected:
+ using AbstractWriteLog<ImageCtxT>::m_lock;
+ using AbstractWriteLog<ImageCtxT>::m_log_entries;
+ using AbstractWriteLog<ImageCtxT>::m_image_ctx;
+ using AbstractWriteLog<ImageCtxT>::m_cache_state;
+ using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
+ using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
+ using AbstractWriteLog<ImageCtxT>::m_bytes_allocated;
+
+ bool initialize_pool(Context *on_finish,
+ pwl::DeferredContexts &later) override;
+ void process_work() override;
+ void append_scheduled_ops(void) override;
+ void schedule_append_ops(pwl::GenericLogOperations &ops, C_BlockIORequestT *req) override;
+ void remove_pool_file() override;
+ void release_ram(std::shared_ptr<GenericLogEntry> log_entry) override;
+
+private:
+ class AioTransContext {
+ public:
+ Context *on_finish;
+ ::IOContext ioc;
+ explicit AioTransContext(CephContext* cct, Context *cb)
+ : on_finish(cb), ioc(cct, this) {}
+
+ ~AioTransContext() {}
+
+ void aio_finish() {
+ on_finish->complete(ioc.get_return_value());
+ delete this;
+ }
+ }; //class AioTransContext
+
+ struct WriteLogPoolRootUpdate {
+ std::shared_ptr<pwl::WriteLogPoolRoot> root;
+ Context *ctx;
+ WriteLogPoolRootUpdate(std::shared_ptr<pwl::WriteLogPoolRoot> r,
+ Context* c)
+ : root(r), ctx(c) {}
+ };
+
+ using WriteLogPoolRootUpdateList = std::list<std::shared_ptr<WriteLogPoolRootUpdate>>;
+ WriteLogPoolRootUpdateList m_poolroot_to_update; /* pool root list to update to SSD */
+ bool m_updating_pool_root = false;
+
+ std::atomic<int> m_async_update_superblock = {0};
+ BlockDevice *bdev = nullptr;
+ pwl::WriteLogPoolRoot pool_root;
+ Builder<This> *m_builderobj;
+
+ Builder<This>* create_builder();
+ int create_and_open_bdev();
+ void load_existing_entries(pwl::DeferredContexts &later);
+ void inc_allocated_cached_bytes(
+ std::shared_ptr<pwl::GenericLogEntry> log_entry) override;
+ void collect_read_extents(
+ uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+ Extent hit_extent, pwl::C_ReadRequest *read_ctx) override;
+ void complete_read(
+ std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+ std::vector<bufferlist*> &bls_to_read, Context *ctx) override;
+ void enlist_op_appender();
+ bool retire_entries(const unsigned long int frees_per_tx);
+ bool has_sync_point_logs(GenericLogOperations &ops);
+ void append_op_log_entries(GenericLogOperations &ops);
+ void alloc_op_log_entries(GenericLogOperations &ops);
+ void construct_flush_entries(pwl::GenericLogEntries entries_to_flush,
+ DeferredContexts &post_unlock,
+ bool has_write_entry) override;
+ void append_ops(GenericLogOperations &ops, Context *ctx,
+ uint64_t* new_first_free_entry);
+ void write_log_entries(GenericLogEntriesVector log_entries,
+ AioTransContext *aio, uint64_t *pos);
+ void schedule_update_root(std::shared_ptr<WriteLogPoolRoot> root,
+ Context *ctx);
+ void enlist_op_update_root();
+ void update_root_scheduled_ops();
+ int update_pool_root_sync(std::shared_ptr<pwl::WriteLogPoolRoot> root);
+ void update_pool_root(std::shared_ptr<WriteLogPoolRoot> root,
+ AioTransContext *aio);
+ void aio_read_data_block(std::shared_ptr<GenericWriteLogEntry> log_entry,
+ bufferlist *bl, Context *ctx);
+ void aio_read_data_blocks(std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries,
+ std::vector<bufferlist *> &bls, Context *ctx);
+ static void aio_cache_cb(void *priv, void *priv2) {
+ AioTransContext *c = static_cast<AioTransContext*>(priv2);
+ c->aio_finish();
+ }
+}; // class WriteLog
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ssd::WriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
diff --git a/src/librbd/crypto/BlockCrypto.cc b/src/librbd/crypto/BlockCrypto.cc
new file mode 100644
index 000000000..0b74db04a
--- /dev/null
+++ b/src/librbd/crypto/BlockCrypto.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/crypto/BlockCrypto.h"
+#include "include/byteorder.h"
+#include "include/ceph_assert.h"
+#include "include/scope_guard.h"
+
+#include <stdlib.h>
+
+namespace librbd {
+namespace crypto {
+
+template <typename T>
+BlockCrypto<T>::BlockCrypto(CephContext* cct, DataCryptor<T>* data_cryptor,
+ uint64_t block_size, uint64_t data_offset)
+ : m_cct(cct), m_data_cryptor(data_cryptor), m_block_size(block_size),
+ m_data_offset(data_offset), m_iv_size(data_cryptor->get_iv_size()) {
+ ceph_assert(isp2(block_size));
+ ceph_assert((block_size % data_cryptor->get_block_size()) == 0);
+ ceph_assert((block_size % 512) == 0);
+}
+
+template <typename T>
+BlockCrypto<T>::~BlockCrypto() {
+ if (m_data_cryptor != nullptr) {
+ delete m_data_cryptor;
+ m_data_cryptor = nullptr;
+ }
+}
+
+template <typename T>
+int BlockCrypto<T>::crypt(ceph::bufferlist* data, uint64_t image_offset,
+ CipherMode mode) {
+ if (image_offset % m_block_size != 0) {
+ lderr(m_cct) << "image offset: " << image_offset
+ << " not aligned to block size: " << m_block_size << dendl;
+ return -EINVAL;
+ }
+ if (data->length() % m_block_size != 0) {
+ lderr(m_cct) << "data length: " << data->length()
+ << " not aligned to block size: " << m_block_size << dendl;
+ return -EINVAL;
+ }
+
+ unsigned char* iv = (unsigned char*)alloca(m_iv_size);
+ memset(iv, 0, m_iv_size);
+
+ bufferlist src = *data;
+ data->clear();
+
+ auto ctx = m_data_cryptor->get_context(mode);
+ if (ctx == nullptr) {
+ lderr(m_cct) << "unable to get crypt context" << dendl;
+ return -EIO;
+ }
+
+ auto sg = make_scope_guard([&] {
+ m_data_cryptor->return_context(ctx, mode); });
+
+ auto sector_number = image_offset / 512;
+ auto appender = data->get_contiguous_appender(src.length());
+ unsigned char* out_buf_ptr = nullptr;
+ unsigned char* leftover_block = (unsigned char*)alloca(m_block_size);
+ uint32_t leftover_size = 0;
+ for (auto buf = src.buffers().begin(); buf != src.buffers().end(); ++buf) {
+ auto in_buf_ptr = reinterpret_cast<const unsigned char*>(buf->c_str());
+ auto remaining_buf_bytes = buf->length();
+ while (remaining_buf_bytes > 0) {
+ if (leftover_size == 0) {
+ auto block_offset_le = init_le64(sector_number);
+ memcpy(iv, &block_offset_le, sizeof(block_offset_le));
+ auto r = m_data_cryptor->init_context(ctx, iv, m_iv_size);
+ if (r != 0) {
+ lderr(m_cct) << "unable to init cipher's IV" << dendl;
+ return r;
+ }
+
+ out_buf_ptr = reinterpret_cast<unsigned char*>(
+ appender.get_pos_add(m_block_size));
+ sector_number += m_block_size / 512;
+ }
+
+ if (leftover_size > 0 || remaining_buf_bytes < m_block_size) {
+ auto copy_size = std::min(
+ (uint32_t)m_block_size - leftover_size, remaining_buf_bytes);
+ memcpy(leftover_block + leftover_size, in_buf_ptr, copy_size);
+ in_buf_ptr += copy_size;
+ leftover_size += copy_size;
+ remaining_buf_bytes -= copy_size;
+ }
+
+ int crypto_output_length = 0;
+ if (leftover_size == 0) {
+ crypto_output_length = m_data_cryptor->update_context(
+ ctx, in_buf_ptr, out_buf_ptr, m_block_size);
+
+ in_buf_ptr += m_block_size;
+ remaining_buf_bytes -= m_block_size;
+ } else if (leftover_size == m_block_size) {
+ crypto_output_length = m_data_cryptor->update_context(
+ ctx, leftover_block, out_buf_ptr, m_block_size);
+ leftover_size = 0;
+ }
+
+ if (crypto_output_length < 0) {
+ lderr(m_cct) << "crypt update failed" << dendl;
+ return crypto_output_length;
+ }
+
+ out_buf_ptr += crypto_output_length;
+ }
+ }
+
+ return 0;
+}
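+
+// The per-block IV built above is plain64-style (illustrative restatement
+// for a block starting at byte offset `off`):
+//
+//   uint64_t sector = off / 512;   // 512-byte sector number
+//   auto le = init_le64(sector);   // little-endian encoding
+//   memset(iv, 0, m_iv_size);      // zero-pad the tail
+//   memcpy(iv, &le, sizeof(le));
+//
+// so every m_block_size block is encrypted independently, keyed by its
+// starting sector.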
+
+template <typename T>
+int BlockCrypto<T>::encrypt(ceph::bufferlist* data, uint64_t image_offset) {
+ return crypt(data, image_offset, CipherMode::CIPHER_MODE_ENC);
+}
+
+template <typename T>
+int BlockCrypto<T>::decrypt(ceph::bufferlist* data, uint64_t image_offset) {
+ return crypt(data, image_offset, CipherMode::CIPHER_MODE_DEC);
+}
+
+} // namespace crypto
+} // namespace librbd
+
+template class librbd::crypto::BlockCrypto<EVP_CIPHER_CTX>;
diff --git a/src/librbd/crypto/BlockCrypto.h b/src/librbd/crypto/BlockCrypto.h
new file mode 100644
index 000000000..0bbdd2524
--- /dev/null
+++ b/src/librbd/crypto/BlockCrypto.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_BLOCK_CRYPTO_H
+#define CEPH_LIBRBD_CRYPTO_BLOCK_CRYPTO_H
+
+#include "include/Context.h"
+#include "librbd/crypto/CryptoInterface.h"
+#include "librbd/crypto/openssl/DataCryptor.h"
+
+namespace librbd {
+namespace crypto {
+
+template <typename T>
+class BlockCrypto : public CryptoInterface {
+
+public:
+ static BlockCrypto* create(CephContext* cct, DataCryptor<T>* data_cryptor,
+ uint64_t block_size, uint64_t data_offset) {
+ return new BlockCrypto(cct, data_cryptor, block_size, data_offset);
+ }
+ BlockCrypto(CephContext* cct, DataCryptor<T>* data_cryptor,
+ uint64_t block_size, uint64_t data_offset);
+ ~BlockCrypto();
+
+ int encrypt(ceph::bufferlist* data, uint64_t image_offset) override;
+ int decrypt(ceph::bufferlist* data, uint64_t image_offset) override;
+
+ uint64_t get_block_size() const override {
+ return m_block_size;
+ }
+
+ uint64_t get_data_offset() const override {
+ return m_data_offset;
+ }
+
+ const unsigned char* get_key() const override {
+ return m_data_cryptor->get_key();
+ }
+
+ int get_key_length() const override {
+ return m_data_cryptor->get_key_length();
+ }
+
+private:
+ CephContext* m_cct;
+ DataCryptor<T>* m_data_cryptor;
+ uint64_t m_block_size;
+ uint64_t m_data_offset;
+ uint32_t m_iv_size;
+
+ int crypt(ceph::bufferlist* data, uint64_t image_offset, CipherMode mode);
+};
+
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::BlockCrypto<EVP_CIPHER_CTX>;
+
+#endif // CEPH_LIBRBD_CRYPTO_BLOCK_CRYPTO_H
diff --git a/src/librbd/crypto/CryptoContextPool.cc b/src/librbd/crypto/CryptoContextPool.cc
new file mode 100644
index 000000000..b303a54ec
--- /dev/null
+++ b/src/librbd/crypto/CryptoContextPool.cc
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/crypto/CryptoContextPool.h"
+
+namespace librbd {
+namespace crypto {
+
+template <typename T>
+CryptoContextPool<T>::CryptoContextPool(DataCryptor<T>* data_cryptor,
+ uint32_t pool_size)
+ : m_data_cryptor(data_cryptor), m_encrypt_contexts(pool_size),
+ m_decrypt_contexts(pool_size) {
+}
+
+template <typename T>
+CryptoContextPool<T>::~CryptoContextPool() {
+ T* ctx;
+ while (m_encrypt_contexts.pop(ctx)) {
+ m_data_cryptor->return_context(ctx, CipherMode::CIPHER_MODE_ENC);
+ }
+ while (m_decrypt_contexts.pop(ctx)) {
+ m_data_cryptor->return_context(ctx, CipherMode::CIPHER_MODE_DEC);
+ }
+}
+
+template <typename T>
+T* CryptoContextPool<T>::get_context(CipherMode mode) {
+ T* ctx;
+ if (!get_contexts(mode).pop(ctx)) {
+ ctx = m_data_cryptor->get_context(mode);
+ }
+ return ctx;
+}
+
+template <typename T>
+void CryptoContextPool<T>::return_context(T* ctx, CipherMode mode) {
+ if (!get_contexts(mode).push(ctx)) {
+ m_data_cryptor->return_context(ctx, mode);
+ }
+}
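+
+// Usage sketch (illustrative; `cryptor` and the pool size are
+// hypothetical): the pool decorates a concrete cryptor so hot paths can
+// reuse cipher contexts without locking:
+//
+//   auto pool = new CryptoContextPool<EVP_CIPHER_CTX>(cryptor, 128);
+//   EVP_CIPHER_CTX* ctx = pool->get_context(CipherMode::CIPHER_MODE_ENC);
+//   pool->init_context(ctx, iv, iv_len);
+//   // ... pool->update_context(ctx, in, out, len) per block ...
+//   pool->return_context(ctx, CipherMode::CIPHER_MODE_ENC);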
+
+} // namespace crypto
+} // namespace librbd
diff --git a/src/librbd/crypto/CryptoContextPool.h b/src/librbd/crypto/CryptoContextPool.h
new file mode 100644
index 000000000..c0ebce0c2
--- /dev/null
+++ b/src/librbd/crypto/CryptoContextPool.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_CRYPTO_CONTEXT_POOL_H
+#define CEPH_LIBRBD_CRYPTO_CRYPTO_CONTEXT_POOL_H
+
+#include "librbd/crypto/DataCryptor.h"
+#include "common/allocator.h"
+#include "include/ceph_assert.h"
+#include <boost/lockfree/queue.hpp>
+
+namespace librbd {
+namespace crypto {
+
+template <typename T>
+class CryptoContextPool : public DataCryptor<T> {
+
+public:
+ CryptoContextPool(DataCryptor<T>* data_cryptor, uint32_t pool_size);
+ ~CryptoContextPool();
+
+ T* get_context(CipherMode mode) override;
+ void return_context(T* ctx, CipherMode mode) override;
+
+ inline uint32_t get_block_size() const override {
+ return m_data_cryptor->get_block_size();
+ }
+ inline uint32_t get_iv_size() const override {
+ return m_data_cryptor->get_iv_size();
+ }
+ inline int get_key_length() const override {
+ return m_data_cryptor->get_key_length();
+ }
+ inline const unsigned char* get_key() const override {
+ return m_data_cryptor->get_key();
+ }
+ inline int init_context(T* ctx, const unsigned char* iv,
+ uint32_t iv_length) const override {
+ return m_data_cryptor->init_context(ctx, iv, iv_length);
+ }
+ inline int update_context(T* ctx, const unsigned char* in,
+ unsigned char* out,
+ uint32_t len) const override {
+ return m_data_cryptor->update_context(ctx, in, out, len);
+ }
+
+ typedef boost::lockfree::queue<
+ T*,
+ boost::lockfree::allocator<ceph::allocator<void>>> ContextQueue;
+
+private:
+ DataCryptor<T>* m_data_cryptor;
+ ContextQueue m_encrypt_contexts;
+ ContextQueue m_decrypt_contexts;
+
+ inline ContextQueue& get_contexts(CipherMode mode) {
+ switch (mode) {
+ case CIPHER_MODE_ENC:
+ return m_encrypt_contexts;
+ case CIPHER_MODE_DEC:
+ return m_decrypt_contexts;
+ default:
+ ceph_assert(false);
+ }
+ }
+};
+
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_CRYPTO_CONTEXT_POOL_H
diff --git a/src/librbd/crypto/CryptoImageDispatch.cc b/src/librbd/crypto/CryptoImageDispatch.cc
new file mode 100644
index 000000000..15513bf55
--- /dev/null
+++ b/src/librbd/crypto/CryptoImageDispatch.cc
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/crypto/CryptoImageDispatch.h"
+
+namespace librbd {
+namespace crypto {
+
+CryptoImageDispatch::CryptoImageDispatch(
+ uint64_t data_offset) : m_data_offset(data_offset) {
+}
+
+void CryptoImageDispatch::remap_extents(
+ io::Extents& image_extents, io::ImageExtentsMapType type) {
+ if (type == io::IMAGE_EXTENTS_MAP_TYPE_LOGICAL_TO_PHYSICAL) {
+ for (auto& extent: image_extents) {
+ extent.first += m_data_offset;
+ }
+ } else if (type == io::IMAGE_EXTENTS_MAP_TYPE_PHYSICAL_TO_LOGICAL) {
+ for (auto& extent: image_extents) {
+ extent.first -= m_data_offset;
+ }
+ }
+}
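+
+// Example (illustrative; assuming m_data_offset == 4 MiB, e.g. the size
+// of an encryption header): a logical extent {0, 4096} remaps to physical
+// {4 MiB, 4096} under LOGICAL_TO_PHYSICAL and back again under
+// PHYSICAL_TO_LOGICAL.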
+
+} // namespace crypto
+} // namespace librbd
diff --git a/src/librbd/crypto/CryptoImageDispatch.h b/src/librbd/crypto/CryptoImageDispatch.h
new file mode 100644
index 000000000..dae3dac85
--- /dev/null
+++ b/src/librbd/crypto/CryptoImageDispatch.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_CRYPTO_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_CRYPTO_CRYPTO_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+
+namespace librbd {
+namespace crypto {
+
+class CryptoImageDispatch : public io::ImageDispatchInterface {
+public:
+ static CryptoImageDispatch* create(uint64_t data_offset) {
+ return new CryptoImageDispatch(data_offset);
+ }
+ CryptoImageDispatch(uint64_t data_offset);
+
+ io::ImageDispatchLayer get_dispatch_layer() const override {
+ return io::IMAGE_DISPATCH_LAYER_CRYPTO;
+ }
+
+ void shut_down(Context* on_finish) override {
+ on_finish->complete(0);
+ }
+
+ bool read(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ io::ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&bl, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool discard(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool write_same(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&bl, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool compare_and_write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool flush(
+ io::AioCompletion* aio_comp, io::FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool list_snaps(
+ io::AioCompletion* aio_comp, io::Extents&& image_extents,
+ io::SnapIds&& snap_ids, int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+ void remap_extents(io::Extents& image_extents,
+ io::ImageExtentsMapType type) override;
+
+private:
+ uint64_t m_data_offset;
+
+};
+
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_CRYPTO_IMAGE_DISPATCH_H
diff --git a/src/librbd/crypto/CryptoInterface.h b/src/librbd/crypto/CryptoInterface.h
new file mode 100644
index 000000000..170a5bf28
--- /dev/null
+++ b/src/librbd/crypto/CryptoInterface.h
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_CRYPTO_INTERFACE_H
+#define CEPH_LIBRBD_CRYPTO_CRYPTO_INTERFACE_H
+
+#include "common/RefCountedObj.h"
+#include "include/buffer.h"
+#include "include/intarith.h"
+#include "librbd/io/Types.h"
+
+namespace librbd {
+namespace crypto {
+
+class CryptoInterface : public RefCountedObject {
+
+public:
+ virtual int encrypt(ceph::bufferlist* data, uint64_t image_offset) = 0;
+ virtual int decrypt(ceph::bufferlist* data, uint64_t image_offset) = 0;
+ virtual uint64_t get_block_size() const = 0;
+ virtual uint64_t get_data_offset() const = 0;
+ virtual const unsigned char* get_key() const = 0;
+ virtual int get_key_length() const = 0;
+
+ inline std::pair<uint64_t, uint64_t> get_pre_and_post_align(
+ uint64_t off, uint64_t len) {
+ if (len == 0) {
+ return std::make_pair(0, 0);
+ }
+ auto block_size = get_block_size();
+ return std::make_pair(p2phase(off, block_size),
+ p2nphase(off + len, block_size));
+ }
+
+ inline std::pair<uint64_t, uint64_t> align(uint64_t off, uint64_t len) {
+ auto aligns = get_pre_and_post_align(off, len);
+ return std::make_pair(off - aligns.first,
+ len + aligns.first + aligns.second);
+ }
+
+ inline bool is_aligned(uint64_t off, uint64_t len) {
+ auto aligns = get_pre_and_post_align(off, len);
+ return aligns.first == 0 && aligns.second == 0;
+ }
+
+ inline bool is_aligned(const io::ReadExtents& extents) {
+ for (const auto& extent: extents) {
+ if (!is_aligned(extent.offset, extent.length)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline void align_extents(const io::ReadExtents& extents,
+ io::ReadExtents* aligned_extents) {
+ for (const auto& extent: extents) {
+ auto aligned = align(extent.offset, extent.length);
+ aligned_extents->emplace_back(aligned.first, aligned.second);
+ }
+ }
+
+ inline int decrypt_aligned_extent(io::ReadExtent& extent,
+ uint64_t image_offset) {
+ if (extent.length == 0 || extent.bl.length() == 0) {
+ return 0;
+ }
+
+ if (extent.extent_map.empty()) {
+ extent.extent_map.emplace_back(extent.offset, extent.bl.length());
+ }
+
+ ceph::bufferlist result_bl;
+ io::Extents result_extent_map;
+
+ ceph::bufferlist curr_block_bl;
+ auto curr_offset = extent.offset;
+ auto curr_block_start_offset = curr_offset;
+ auto curr_block_end_offset = curr_offset;
+
+ // append a zero-length sentinel extent past the end so the final loop
+ // iteration decrypts the last accumulated block
+ extent.extent_map.emplace_back(
+ extent.offset + extent.length + get_block_size(), 0);
+
+ for (auto [off, len]: extent.extent_map) {
+ auto [aligned_off, aligned_len] = align(off, len);
+ if (aligned_off > curr_block_end_offset) {
+ curr_block_bl.append_zero(curr_block_end_offset - curr_offset);
+ auto curr_block_length = curr_block_bl.length();
+ if (curr_block_length > 0) {
+ auto r = decrypt(
+ &curr_block_bl,
+ image_offset + curr_block_start_offset - extent.offset);
+ if (r != 0) {
+ return r;
+ }
+
+ curr_block_bl.splice(0, curr_block_length, &result_bl);
+ result_extent_map.emplace_back(
+ curr_block_start_offset, curr_block_length);
+ }
+
+ curr_block_start_offset = aligned_off;
+ curr_block_end_offset = aligned_off + aligned_len;
+ curr_offset = aligned_off;
+ }
+
+ curr_block_bl.append_zero(off - curr_offset);
+ extent.bl.splice(0, len, &curr_block_bl);
+ curr_offset = off + len;
+ curr_block_end_offset = aligned_off + aligned_len;
+ }
+
+ extent.bl = std::move(result_bl);
+ extent.extent_map = std::move(result_extent_map);
+
+ return 0;
+ }
+};
+
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_CRYPTO_INTERFACE_H
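[Editorial note: the alignment helpers above expand a byte range to crypto-block boundaries. p2phase(off, bs) is the distance back to the previous boundary; p2nphase(off + len, bs) is the distance forward to the next one. Illustrative numbers, assuming get_block_size() == 4096:]

  // get_pre_and_post_align(5000, 1000)
  //   pre  = p2phase(5000, 4096)         = 904    // back to offset 4096
  //   post = p2nphase(5000 + 1000, 4096) = 2192   // forward to offset 8192
  // align(5000, 1000) == {4096, 4096}             // exactly one 4 KiB block
  // is_aligned(5000, 1000) == false; is_aligned(4096, 4096) == true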
diff --git a/src/librbd/crypto/CryptoObjectDispatch.cc b/src/librbd/crypto/CryptoObjectDispatch.cc
new file mode 100644
index 000000000..244f52dec
--- /dev/null
+++ b/src/librbd/crypto/CryptoObjectDispatch.cc
@@ -0,0 +1,661 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/crypto/CryptoObjectDispatch.h"
+#include "include/ceph_assert.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/crypto/CryptoInterface.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::CryptoObjectDispatch: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+
+using librbd::util::create_context_callback;
+using librbd::util::data_object_name;
+
+template <typename I>
+struct C_AlignedObjectReadRequest : public Context {
+ I* image_ctx;
+ ceph::ref_t<CryptoInterface> crypto;
+ uint64_t object_no;
+ io::ReadExtents* extents;
+ IOContext io_context;
+ const ZTracer::Trace parent_trace;
+ uint64_t* version;
+ Context* on_finish;
+ io::ObjectDispatchSpec* req;
+ bool disable_read_from_parent;
+
+ C_AlignedObjectReadRequest(
+ I* image_ctx, ceph::ref_t<CryptoInterface> crypto,
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ Context* on_dispatched
+ ) : image_ctx(image_ctx), crypto(crypto), object_no(object_no),
+ extents(extents), io_context(io_context),
+ parent_trace(parent_trace), version(version),
+ on_finish(on_dispatched) {
+ disable_read_from_parent =
+ ((read_flags & io::READ_FLAG_DISABLE_READ_FROM_PARENT) != 0);
+ read_flags |= io::READ_FLAG_DISABLE_READ_FROM_PARENT;
+
+ auto ctx = create_context_callback<
+ C_AlignedObjectReadRequest<I>,
+ &C_AlignedObjectReadRequest<I>::handle_read>(this);
+
+ req = io::ObjectDispatchSpec::create_read(
+ image_ctx, io::OBJECT_DISPATCH_LAYER_CRYPTO, object_no,
+ extents, io_context, op_flags, read_flags, parent_trace,
+ version, ctx);
+ }
+
+ void send() {
+ req->send();
+ }
+
+ void finish(int r) override {
+ ldout(image_ctx->cct, 20) << "aligned read r=" << r << dendl;
+ on_finish->complete(r);
+ }
+
+ void handle_read(int r) {
+ auto cct = image_ctx->cct;
+ ldout(cct, 20) << "aligned read r=" << r << dendl;
+ if (r == 0) {
+ for (auto& extent: *extents) {
+ auto crypto_ret = crypto->decrypt_aligned_extent(
+ extent,
+ io::util::get_file_offset(
+ image_ctx, object_no, extent.offset));
+ if (crypto_ret != 0) {
+ ceph_assert(crypto_ret < 0);
+ r = crypto_ret;
+ break;
+ }
+ r += extent.length;
+ }
+ }
+
+ if (r == -ENOENT && !disable_read_from_parent) {
+ io::util::read_parent<I>(
+ image_ctx, object_no, extents,
+ io_context->read_snap().value_or(CEPH_NOSNAP),
+ parent_trace, this);
+ } else {
+ complete(r);
+ }
+ }
+};
+
+template <typename I>
+struct C_UnalignedObjectReadRequest : public Context {
+ CephContext* cct;
+ io::ReadExtents* extents;
+ Context* on_finish;
+ io::ReadExtents aligned_extents;
+ io::ObjectDispatchSpec* req;
+
+ C_UnalignedObjectReadRequest(
+ I* image_ctx, ceph::ref_t<CryptoInterface> crypto,
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ Context* on_dispatched) : cct(image_ctx->cct), extents(extents),
+ on_finish(on_dispatched) {
+ crypto->align_extents(*extents, &aligned_extents);
+
+ // send the aligned read back to get decrypted
+ req = io::ObjectDispatchSpec::create_read(
+ image_ctx,
+ io::util::get_previous_layer(io::OBJECT_DISPATCH_LAYER_CRYPTO),
+ object_no, &aligned_extents, io_context, op_flags, read_flags,
+ parent_trace, version, this);
+ }
+
+ void send() {
+ req->send();
+ }
+
+ void remove_alignment_data() {
+ for (uint64_t i = 0; i < extents->size(); ++i) {
+ auto& extent = (*extents)[i];
+ auto& aligned_extent = aligned_extents[i];
+ if (aligned_extent.extent_map.empty()) {
+ uint64_t cut_offset = extent.offset - aligned_extent.offset;
+ int64_t padding_count =
+ cut_offset + extent.length - aligned_extent.bl.length();
+ if (padding_count > 0) {
+ aligned_extent.bl.append_zero(padding_count);
+ }
+ aligned_extent.bl.splice(cut_offset, extent.length, &extent.bl);
+ } else {
+ for (auto [off, len]: aligned_extent.extent_map) {
+ ceph::bufferlist tmp;
+ aligned_extent.bl.splice(0, len, &tmp);
+
+ uint64_t bytes_to_skip = 0;
+ if (off < extent.offset) {
+ bytes_to_skip = extent.offset - off;
+ if (len <= bytes_to_skip) {
+ continue;
+ }
+ off += bytes_to_skip;
+ len -= bytes_to_skip;
+ }
+
+ len = std::min(len, extent.offset + extent.length - off);
+ if (len == 0) {
+ continue;
+ }
+
+ if (len > 0) {
+ tmp.splice(bytes_to_skip, len, &extent.bl);
+ extent.extent_map.emplace_back(off, len);
+ }
+ }
+ }
+ }
+ }
+
+ void finish(int r) override {
+ ldout(cct, 20) << "unaligned read r=" << r << dendl;
+ if (r >= 0) {
+ remove_alignment_data();
+
+ r = 0;
+ for (auto& extent: *extents) {
+ r += extent.length;
+ }
+ }
+ on_finish->complete(r);
+ }
+};
+
+template <typename I>
+struct C_UnalignedObjectWriteRequest : public Context {
+ I* image_ctx;
+ ceph::ref_t<CryptoInterface> crypto;
+ uint64_t object_no;
+ uint64_t object_off;
+ ceph::bufferlist data;
+ ceph::bufferlist cmp_data;
+ uint64_t* mismatch_offset;
+ IOContext io_context;
+ int op_flags;
+ int write_flags;
+ std::optional<uint64_t> assert_version;
+ const ZTracer::Trace parent_trace;
+ int* object_dispatch_flags;
+ uint64_t* journal_tid;
+ Context* on_finish;
+ bool may_copyup;
+ ceph::bufferlist aligned_data;
+ io::ReadExtents extents;
+ uint64_t version;
+ C_UnalignedObjectReadRequest<I>* read_req;
+ bool object_exists;
+
+ C_UnalignedObjectWriteRequest(
+ I* image_ctx, ceph::ref_t<CryptoInterface> crypto,
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ ceph::bufferlist&& cmp_data, uint64_t* mismatch_offset,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, Context* on_dispatched, bool may_copyup
+ ) : image_ctx(image_ctx), crypto(crypto), object_no(object_no),
+ object_off(object_off), data(data), cmp_data(cmp_data),
+ mismatch_offset(mismatch_offset), io_context(io_context),
+ op_flags(op_flags), write_flags(write_flags),
+ assert_version(assert_version), parent_trace(parent_trace),
+ object_dispatch_flags(object_dispatch_flags),
+ journal_tid(journal_tid), on_finish(on_dispatched),
+ may_copyup(may_copyup) {
+ // build read extents
+ auto [pre_align, post_align] = crypto->get_pre_and_post_align(
+ object_off, data.length());
+ if (pre_align != 0) {
+ extents.emplace_back(object_off - pre_align, pre_align);
+ }
+ if (post_align != 0) {
+ extents.emplace_back(object_off + data.length(), post_align);
+ }
+ if (cmp_data.length() != 0) {
+ extents.emplace_back(object_off, cmp_data.length());
+ }
+
+ auto ctx = create_context_callback<
+ C_UnalignedObjectWriteRequest<I>,
+ &C_UnalignedObjectWriteRequest<I>::handle_read>(this);
+
+ read_req = new C_UnalignedObjectReadRequest<I>(
+ image_ctx, crypto, object_no, &extents, io_context,
+ 0, io::READ_FLAG_DISABLE_READ_FROM_PARENT, parent_trace,
+ &version, 0, ctx);
+ }
+
+ void send() {
+ read_req->send();
+ }
+
+ bool check_cmp_data() {
+ if (cmp_data.length() == 0) {
+ return true;
+ }
+
+ auto& cmp_extent = extents.back();
+ io::util::unsparsify(image_ctx->cct, &cmp_extent.bl,
+ cmp_extent.extent_map, cmp_extent.offset,
+ cmp_extent.length);
+
+ std::optional<uint64_t> found_mismatch = std::nullopt;
+
+ auto it1 = cmp_data.cbegin();
+ auto it2 = cmp_extent.bl.cbegin();
+ for (uint64_t idx = 0; idx < cmp_data.length(); ++idx) {
+ if (*it1 != *it2) {
+ found_mismatch = std::make_optional(idx);
+ break;
+ }
+ ++it1;
+ ++it2;
+ }
+
+ extents.pop_back();
+
+ if (found_mismatch.has_value()) {
+ if (mismatch_offset != nullptr) {
+ *mismatch_offset = found_mismatch.value();
+ }
+ complete(-EILSEQ);
+ return false;
+ }
+
+ return true;
+ }
+
+ bool check_create_exclusive() {
+ bool exclusive =
+ ((write_flags & io::OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0);
+ if (exclusive && object_exists) {
+ complete(-EEXIST);
+ return false;
+ }
+ return true;
+ }
+
+ bool check_version() {
+ int r = 0;
+ if (assert_version.has_value()) {
+ if (!object_exists) {
+ r = -ENOENT;
+ } else if (assert_version.value() < version) {
+ r = -ERANGE;
+ } else if (assert_version.value() > version) {
+ r = -EOVERFLOW;
+ }
+ }
+
+ if (r != 0) {
+ complete(r);
+ return false;
+ }
+ return true;
+ }
+
+ void build_aligned_data() {
+ auto [pre_align, post_align] = crypto->get_pre_and_post_align(
+ object_off, data.length());
+ if (pre_align != 0) {
+ auto &extent = extents.front();
+ io::util::unsparsify(image_ctx->cct, &extent.bl, extent.extent_map,
+ extent.offset, extent.length);
+ extent.bl.splice(0, pre_align, &aligned_data);
+ }
+ aligned_data.append(data);
+ if (post_align != 0) {
+ auto &extent = extents.back();
+ io::util::unsparsify(image_ctx->cct, &extent.bl, extent.extent_map,
+ extent.offset, extent.length);
+ extent.bl.splice(0, post_align, &aligned_data);
+ }
+ }
+
+ void handle_copyup(int r) {
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+ if (r < 0) {
+ complete(r);
+ } else {
+ restart_request(false);
+ }
+ }
+
+ void handle_read(int r) {
+ ldout(image_ctx->cct, 20) << "unaligned write r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ if (may_copyup) {
+ auto ctx = create_context_callback<
+ C_UnalignedObjectWriteRequest<I>,
+ &C_UnalignedObjectWriteRequest<I>::handle_copyup>(this);
+ if (io::util::trigger_copyup(
+ image_ctx, object_no, io_context, ctx)) {
+ return;
+ }
+ delete ctx;
+ }
+ object_exists = false;
+ } else if (r < 0) {
+ complete(r);
+ return;
+ } else {
+ object_exists = true;
+ }
+
+ if (!check_create_exclusive() || !check_version() || !check_cmp_data()) {
+ return;
+ }
+
+ build_aligned_data();
+
+ auto aligned_off = crypto->align(object_off, data.length()).first;
+ auto new_write_flags = write_flags;
+ auto new_assert_version = std::make_optional(version);
+ if (!object_exists) {
+ new_write_flags |= io::OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE;
+ new_assert_version = std::nullopt;
+ }
+
+ auto ctx = create_context_callback<
+ C_UnalignedObjectWriteRequest<I>,
+ &C_UnalignedObjectWriteRequest<I>::handle_write>(this);
+
+ // send the aligned write back to get encrypted and committed
+ auto write_req = io::ObjectDispatchSpec::create_write(
+ image_ctx,
+ io::util::get_previous_layer(io::OBJECT_DISPATCH_LAYER_CRYPTO),
+ object_no, aligned_off, std::move(aligned_data), io_context,
+ op_flags, new_write_flags, new_assert_version,
+ journal_tid == nullptr ? 0 : *journal_tid, parent_trace, ctx);
+ write_req->send();
+ }
+
+ void restart_request(bool may_copyup) {
+ auto req = new C_UnalignedObjectWriteRequest<I>(
+ image_ctx, crypto, object_no, object_off,
+ std::move(data), std::move(cmp_data),
+ mismatch_offset, io_context, op_flags, write_flags,
+ assert_version, parent_trace,
+ object_dispatch_flags, journal_tid, this, may_copyup);
+ req->send();
+ }
+
+ void handle_write(int r) {
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+ bool exclusive = write_flags & io::OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE;
+ bool restart = false;
+ if (r == -ERANGE && !assert_version.has_value()) {
+ restart = true;
+ } else if (r == -EEXIST && !exclusive) {
+ restart = true;
+ }
+
+ if (restart) {
+ restart_request(may_copyup);
+ } else {
+ complete(r);
+ }
+ }
+
+ void finish(int r) override {
+ ldout(image_ctx->cct, 20) << "unaligned write r=" << r << dendl;
+ on_finish->complete(r);
+ }
+};
+
+template <typename I>
+CryptoObjectDispatch<I>::CryptoObjectDispatch(
+ I* image_ctx, ceph::ref_t<CryptoInterface> crypto)
+ : m_image_ctx(image_ctx), m_crypto(crypto) {
+}
+
+template <typename I>
+void CryptoObjectDispatch<I>::shut_down(Context* on_finish) {
+ if (m_crypto != nullptr) {
+ m_crypto->put();
+ m_crypto = nullptr;
+ }
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool CryptoObjectDispatch<I>::read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << *extents << dendl;
+ ceph_assert(m_crypto != nullptr);
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ if (m_crypto->is_aligned(*extents)) {
+ auto req = new C_AlignedObjectReadRequest<I>(
+ m_image_ctx, m_crypto, object_no, extents, io_context,
+ op_flags, read_flags, parent_trace, version, object_dispatch_flags,
+ on_dispatched);
+ req->send();
+ } else {
+ auto req = new C_UnalignedObjectReadRequest<I>(
+ m_image_ctx, m_crypto, object_no, extents, io_context,
+ op_flags, read_flags, parent_trace, version, object_dispatch_flags,
+ on_dispatched);
+ req->send();
+ }
+
+ return true;
+}
+
+template <typename I>
+bool CryptoObjectDispatch<I>::write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << data.length() << dendl;
+ ceph_assert(m_crypto != nullptr);
+
+ if (m_crypto->is_aligned(object_off, data.length())) {
+ auto r = m_crypto->encrypt(
+ &data,
+ io::util::get_file_offset(m_image_ctx, object_no, object_off));
+ *dispatch_result = r == 0 ? io::DISPATCH_RESULT_CONTINUE
+ : io::DISPATCH_RESULT_COMPLETE;
+ on_dispatched->complete(r);
+ } else {
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ auto req = new C_UnalignedObjectWriteRequest<I>(
+ m_image_ctx, m_crypto, object_no, object_off, std::move(data), {},
+ nullptr, io_context, op_flags, write_flags, assert_version,
+ parent_trace, object_dispatch_flags, journal_tid, on_dispatched,
+ true);
+ req->send();
+ }
+
+ return true;
+}
+
+template <typename I>
+bool CryptoObjectDispatch<I>::write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+ ceph_assert(m_crypto != nullptr);
+
+ // convert to regular write
+ io::LightweightObjectExtent extent(object_no, object_off, object_len, 0);
+ extent.buffer_extents = std::move(buffer_extents);
+
+ bufferlist ws_data;
+ io::util::assemble_write_same_extent(extent, data, &ws_data, true);
+
+ auto ctx = new LambdaContext(
+ [on_finish_ctx=on_dispatched](int r) {
+ on_finish_ctx->complete(r);
+ });
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ auto req = io::ObjectDispatchSpec::create_write(
+ m_image_ctx,
+ io::util::get_previous_layer(io::OBJECT_DISPATCH_LAYER_CRYPTO),
+ object_no, object_off, std::move(ws_data), io_context, op_flags, 0,
+ std::nullopt, 0, parent_trace, ctx);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool CryptoObjectDispatch<I>::compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << write_data.length()
+ << dendl;
+ ceph_assert(m_crypto != nullptr);
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ auto req = new C_UnalignedObjectWriteRequest<I>(
+ m_image_ctx, m_crypto, object_no, object_off, std::move(write_data),
+ std::move(cmp_data), mismatch_offset, io_context, op_flags, 0,
+ std::nullopt, parent_trace, object_dispatch_flags, journal_tid,
+ on_dispatched, true);
+ req->send();
+
+ return true;
+}
+
+template <typename I>
+bool CryptoObjectDispatch<I>::discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+ ceph_assert(m_crypto != nullptr);
+
+ // convert to write-same
+ auto ctx = new LambdaContext(
+ [on_finish_ctx=on_dispatched](int r) {
+ on_finish_ctx->complete(r);
+ });
+
+ bufferlist bl;
+ const int buffer_size = 4096;
+ bl.append_zero(buffer_size);
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ auto req = io::ObjectDispatchSpec::create_write_same(
+ m_image_ctx,
+ io::util::get_previous_layer(io::OBJECT_DISPATCH_LAYER_CRYPTO),
+ object_no, object_off, object_len, {{0, object_len}}, std::move(bl),
+ io_context, *object_dispatch_flags, 0, parent_trace, ctx);
+ req->send();
+ return true;
+}
+
+template <typename I>
+int CryptoObjectDispatch<I>::prepare_copyup(
+ uint64_t object_no,
+ io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) {
+ ceph::bufferlist current_bl;
+ current_bl.append_zero(m_image_ctx->get_object_size());
+
+ for (auto& [key, extent_map]: *snapshot_sparse_bufferlist) {
+ // update current_bl with data from extent_map
+ for (auto& extent : extent_map) {
+ auto &sbe = extent.get_val();
+ if (sbe.state == io::SPARSE_EXTENT_STATE_DATA) {
+ current_bl.begin(extent.get_off()).copy_in(extent.get_len(), sbe.bl);
+ } else if (sbe.state == io::SPARSE_EXTENT_STATE_ZEROED) {
+ ceph::bufferlist zeros;
+ zeros.append_zero(extent.get_len());
+ current_bl.begin(extent.get_off()).copy_in(extent.get_len(), zeros);
+ }
+ }
+
+ // encrypt
+ io::SparseBufferlist encrypted_sparse_bufferlist;
+ for (auto& extent : extent_map) {
+ auto [aligned_off, aligned_len] = m_crypto->align(
+ extent.get_off(), extent.get_len());
+
+ io::Extents image_extents;
+ io::util::extent_to_file(
+ m_image_ctx, object_no, aligned_off, aligned_len, image_extents);
+
+ ceph::bufferlist encrypted_bl;
+ uint64_t position = 0;
+ for (auto [image_offset, image_length]: image_extents) {
+ ceph::bufferlist aligned_bl;
+ aligned_bl.substr_of(current_bl, aligned_off + position, image_length);
+ aligned_bl.rebuild(); // deep-copy the buffer segments shared with current_bl
+ position += image_length;
+
+ auto r = m_crypto->encrypt(&aligned_bl, image_offset);
+ if (r != 0) {
+ return r;
+ }
+
+ encrypted_bl.append(aligned_bl);
+ }
+
+ encrypted_sparse_bufferlist.insert(
+ aligned_off, aligned_len, {io::SPARSE_EXTENT_STATE_DATA, aligned_len,
+ std::move(encrypted_bl)});
+ }
+
+ // replace original plaintext sparse bufferlist with encrypted one
+ extent_map.clear();
+ extent_map.insert(std::move(encrypted_sparse_bufferlist));
+ }
+
+ return 0;
+}
+
+} // namespace crypto
+} // namespace librbd
+
+template class librbd::crypto::CryptoObjectDispatch<librbd::ImageCtx>;
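[Editorial note: for unaligned writes, C_UnalignedObjectWriteRequest performs a read-modify-write: it reads the partial blocks at the edges, splices the new data between them, and resubmits one block-aligned write, with the version assertion in handle_write() guarding against racing writers. With 4096-byte crypto blocks, a write of 6000 bytes at object offset 1000 would proceed roughly as:]

  // read extents built in the constructor:
  //   pre  block tail: {offset = 0,    length = 1000}  // p2phase(1000, 4096)
  //   post block head: {offset = 7000, length = 1192}  // p2nphase(7000, 4096)
  // aligned write submitted from handle_read():
  //   offset 0, length 8192 (1000 + 6000 + 1192), i.e. two whole blocks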
diff --git a/src/librbd/crypto/CryptoObjectDispatch.h b/src/librbd/crypto/CryptoObjectDispatch.h
new file mode 100644
index 000000000..1c5a4646d
--- /dev/null
+++ b/src/librbd/crypto/CryptoObjectDispatch.h
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_CRYPTO_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_CRYPTO_CRYPTO_OBJECT_DISPATCH_H
+
+#include "librbd/crypto/CryptoInterface.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace crypto {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class CryptoObjectDispatch : public io::ObjectDispatchInterface {
+public:
+ static CryptoObjectDispatch* create(
+ ImageCtxT* image_ctx, ceph::ref_t<CryptoInterface> crypto) {
+ return new CryptoObjectDispatch(image_ctx, crypto);
+ }
+
+ CryptoObjectDispatch(ImageCtxT* image_ctx,
+ ceph::ref_t<CryptoInterface> crypto);
+
+ io::ObjectDispatchLayer get_dispatch_layer() const override {
+ return io::OBJECT_DISPATCH_LAYER_CRYPTO;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override {
+ return false;
+ }
+
+ bool list_snaps(
+ uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+ bool reset_existence_cache(Context* on_finish) override {
+ return false;
+ }
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) override {
+ }
+
+ int prepare_copyup(
+ uint64_t object_no,
+ io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override;
+
+private:
+ ImageCtxT* m_image_ctx;
+ ceph::ref_t<CryptoInterface> m_crypto;
+
+};
+
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::CryptoObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CRYPTO_CRYPTO_OBJECT_DISPATCH_H
diff --git a/src/librbd/crypto/DataCryptor.h b/src/librbd/crypto/DataCryptor.h
new file mode 100644
index 000000000..ffcc57ce4
--- /dev/null
+++ b/src/librbd/crypto/DataCryptor.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_DATA_CRYPTOR_H
+#define CEPH_LIBRBD_CRYPTO_DATA_CRYPTOR_H
+
+#include "include/int_types.h"
+#include "librbd/crypto/Types.h"
+
+namespace librbd {
+namespace crypto {
+
+template <typename T>
+class DataCryptor {
+
+public:
+
+ virtual ~DataCryptor() = default;
+
+ virtual uint32_t get_block_size() const = 0;
+ virtual uint32_t get_iv_size() const = 0;
+ virtual const unsigned char* get_key() const = 0;
+ virtual int get_key_length() const = 0;
+
+ virtual T* get_context(CipherMode mode) = 0;
+ virtual void return_context(T* ctx, CipherMode mode) = 0;
+
+ virtual int init_context(T* ctx, const unsigned char* iv,
+ uint32_t iv_length) const = 0;
+ virtual int update_context(T* ctx, const unsigned char* in,
+ unsigned char* out, uint32_t len) const = 0;
+};
+
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_DATA_CRYPTOR_H
diff --git a/src/librbd/crypto/EncryptionFormat.h b/src/librbd/crypto/EncryptionFormat.h
new file mode 100644
index 000000000..ba57a9252
--- /dev/null
+++ b/src/librbd/crypto/EncryptionFormat.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_ENCRYPTION_FORMAT_H
+#define CEPH_LIBRBD_CRYPTO_ENCRYPTION_FORMAT_H
+
+#include "common/ref.h"
+
+struct Context;
+
+namespace librbd {
+namespace crypto {
+
+struct CryptoInterface;
+
+template <typename ImageCtxT>
+struct EncryptionFormat {
+ virtual ~EncryptionFormat() {
+ }
+
+ virtual void format(ImageCtxT* ictx, Context* on_finish) = 0;
+ virtual void load(ImageCtxT* ictx, Context* on_finish) = 0;
+
+ virtual ceph::ref_t<CryptoInterface> get_crypto() = 0;
+};
+
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_ENCRYPTION_FORMAT_H
diff --git a/src/librbd/crypto/FormatRequest.cc b/src/librbd/crypto/FormatRequest.cc
new file mode 100644
index 000000000..53dda58aa
--- /dev/null
+++ b/src/librbd/crypto/FormatRequest.cc
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "FormatRequest.h"
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/crypto/ShutDownCryptoRequest.h"
+#include "librbd/crypto/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/Types.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::FormatRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+FormatRequest<I>::FormatRequest(
+ I* image_ctx, std::unique_ptr<EncryptionFormat<I>> format,
+ Context* on_finish) : m_image_ctx(image_ctx),
+ m_format(std::move(format)),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void FormatRequest<I>::send() {
+ if (m_image_ctx->test_features(RBD_FEATURE_JOURNALING)) {
+ lderr(m_image_ctx->cct) << "cannot use encryption with journal" << dendl;
+ finish(-ENOTSUP);
+ return;
+ }
+
+ if (m_image_ctx->crypto == nullptr) {
+ format();
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ FormatRequest<I>, &FormatRequest<I>::handle_shutdown_crypto>(this);
+ auto *req = ShutDownCryptoRequest<I>::create(m_image_ctx, ctx);
+ req->send();
+}
+
+template <typename I>
+void FormatRequest<I>::handle_shutdown_crypto(int r) {
+ if (r != 0) {
+ lderr(m_image_ctx->cct) << "unable to unload existing crypto: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ format();
+}
+
+template <typename I>
+void FormatRequest<I>::format() {
+ auto ctx = create_context_callback<
+ FormatRequest<I>, &FormatRequest<I>::handle_format>(this);
+ m_format->format(m_image_ctx, ctx);
+}
+
+template <typename I>
+void FormatRequest<I>::handle_format(int r) {
+ if (r != 0) {
+ lderr(m_image_ctx->cct) << "unable to format image: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ flush();
+}
+
+template <typename I>
+void FormatRequest<I>::flush() {
+ auto ctx = create_context_callback<
+ FormatRequest<I>, &FormatRequest<I>::handle_flush>(this);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ ctx, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ *m_image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, aio_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ req->send();
+}
+
+template <typename I>
+void FormatRequest<I>::handle_flush(int r) {
+ if (r != 0) {
+ lderr(m_image_ctx->cct) << "unable to flush image: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void FormatRequest<I>::finish(int r) {
+ if (r == 0) {
+ util::set_crypto(m_image_ctx, m_format->get_crypto());
+ }
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace crypto
+} // namespace librbd
+
+template class librbd::crypto::FormatRequest<librbd::ImageCtx>;
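[Editorial sketch of driving this state machine from caller code; ictx, format and on_finish are assumed to exist, and the request deletes itself in finish():]

  auto* req = librbd::crypto::FormatRequest<librbd::ImageCtx>::create(
      ictx, std::move(format), on_finish);  // shut down old crypto -> format
  req->send();                              // -> flush -> set_crypto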
diff --git a/src/librbd/crypto/FormatRequest.h b/src/librbd/crypto/FormatRequest.h
new file mode 100644
index 000000000..cfd7978d8
--- /dev/null
+++ b/src/librbd/crypto/FormatRequest.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_FORMAT_REQUEST_H
+#define CEPH_LIBRBD_CRYPTO_FORMAT_REQUEST_H
+
+#include "include/rbd/librbd.hpp"
+#include "librbd/crypto/EncryptionFormat.h"
+
+struct Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace crypto {
+
+template <typename I>
+class FormatRequest {
+public:
+ static FormatRequest* create(
+ I* image_ctx, std::unique_ptr<EncryptionFormat<I>> format,
+ Context* on_finish) {
+ return new FormatRequest(image_ctx, std::move(format), on_finish);
+ }
+
+ FormatRequest(I* image_ctx, std::unique_ptr<EncryptionFormat<I>> format,
+ Context* on_finish);
+ void send();
+ void handle_shutdown_crypto(int r);
+ void format();
+ void handle_format(int r);
+ void flush();
+ void handle_flush(int r);
+ void finish(int r);
+
+private:
+ I* m_image_ctx;
+
+ std::unique_ptr<EncryptionFormat<I>> m_format;
+ Context* m_on_finish;
+};
+
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::FormatRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CRYPTO_FORMAT_REQUEST_H
diff --git a/src/librbd/crypto/LoadRequest.cc b/src/librbd/crypto/LoadRequest.cc
new file mode 100644
index 000000000..c42011f62
--- /dev/null
+++ b/src/librbd/crypto/LoadRequest.cc
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LoadRequest.h"
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/crypto/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::LoadRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+LoadRequest<I>::LoadRequest(
+ I* image_ctx, std::unique_ptr<EncryptionFormat<I>> format,
+ Context* on_finish) : m_image_ctx(image_ctx),
+ m_format(std::move(format)),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void LoadRequest<I>::send() {
+ if (m_image_ctx->crypto != nullptr) {
+ lderr(m_image_ctx->cct) << "encryption already loaded" << dendl;
+ finish(-EEXIST);
+ return;
+ }
+
+ auto ictx = m_image_ctx;
+ while (ictx != nullptr) {
+ if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+ lderr(m_image_ctx->cct) << "cannot use encryption with journal."
+ << " image name: " << ictx->name << dendl;
+ finish(-ENOTSUP);
+ return;
+ }
+ ictx = ictx->parent;
+ }
+
+ auto ctx = create_context_callback<
+ LoadRequest<I>, &LoadRequest<I>::finish>(this);
+ m_format->load(m_image_ctx, ctx);
+}
+
+template <typename I>
+void LoadRequest<I>::finish(int r) {
+
+ if (r == 0) {
+ // attach the crypto layer to the image and all of its ancestors
+ auto crypto = m_format->get_crypto();
+ auto ictx = m_image_ctx;
+ while (ictx != nullptr) {
+ util::set_crypto(ictx, crypto);
+ ictx = ictx->parent;
+ }
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace crypto
+} // namespace librbd
+
+template class librbd::crypto::LoadRequest<librbd::ImageCtx>;
diff --git a/src/librbd/crypto/LoadRequest.h b/src/librbd/crypto/LoadRequest.h
new file mode 100644
index 000000000..50d1dad84
--- /dev/null
+++ b/src/librbd/crypto/LoadRequest.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_LOAD_REQUEST_H
+#define CEPH_LIBRBD_CRYPTO_LOAD_REQUEST_H
+
+#include "include/rbd/librbd.hpp"
+#include "librbd/crypto/CryptoInterface.h"
+#include "librbd/crypto/EncryptionFormat.h"
+
+struct Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace crypto {
+
+template <typename I>
+class LoadRequest {
+public:
+ static LoadRequest* create(
+ I* image_ctx, std::unique_ptr<EncryptionFormat<I>> format,
+ Context* on_finish) {
+ return new LoadRequest(image_ctx, std::move(format), on_finish);
+ }
+
+ LoadRequest(I* image_ctx, std::unique_ptr<EncryptionFormat<I>> format,
+ Context* on_finish);
+ void send();
+ void finish(int r);
+
+private:
+ I* m_image_ctx;
+ std::unique_ptr<EncryptionFormat<I>> m_format;
+ Context* m_on_finish;
+};
+
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::LoadRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CRYPTO_LOAD_REQUEST_H
diff --git a/src/librbd/crypto/ShutDownCryptoRequest.cc b/src/librbd/crypto/ShutDownCryptoRequest.cc
new file mode 100644
index 000000000..9277308e4
--- /dev/null
+++ b/src/librbd/crypto/ShutDownCryptoRequest.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ShutDownCryptoRequest.h"
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/crypto/CryptoImageDispatch.h"
+#include "librbd/crypto/CryptoObjectDispatch.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::ShutDownCryptoRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+ShutDownCryptoRequest<I>::ShutDownCryptoRequest(
+ I* image_ctx, Context* on_finish) : m_image_ctx(image_ctx),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void ShutDownCryptoRequest<I>::send() {
+ shut_down_object_dispatch();
+}
+
+template <typename I>
+void ShutDownCryptoRequest<I>::shut_down_object_dispatch() {
+ if (!m_image_ctx->io_object_dispatcher->exists(
+ io::OBJECT_DISPATCH_LAYER_CRYPTO)) {
+ finish(0);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ ShutDownCryptoRequest<I>,
+ &ShutDownCryptoRequest<I>::handle_shut_down_object_dispatch>(this);
+
+ m_image_ctx->io_object_dispatcher->shut_down_dispatch(
+ io::OBJECT_DISPATCH_LAYER_CRYPTO, ctx);
+}
+
+template <typename I>
+void ShutDownCryptoRequest<I>::handle_shut_down_object_dispatch(int r) {
+ if (r < 0) {
+ lderr(m_image_ctx->cct) << "failed to shut down object dispatch: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ shut_down_image_dispatch();
+}
+
+template <typename I>
+void ShutDownCryptoRequest<I>::shut_down_image_dispatch() {
+ if (!m_image_ctx->io_image_dispatcher->exists(
+ io::IMAGE_DISPATCH_LAYER_CRYPTO)) {
+ finish(0);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ ShutDownCryptoRequest<I>,
+ &ShutDownCryptoRequest<I>::handle_shut_down_image_dispatch>(this);
+ m_image_ctx->io_image_dispatcher->shut_down_dispatch(
+ io::IMAGE_DISPATCH_LAYER_CRYPTO, ctx);
+}
+
+template <typename I>
+void ShutDownCryptoRequest<I>::handle_shut_down_image_dispatch(int r) {
+ if (r < 0) {
+ lderr(m_image_ctx->cct) << "failed to shut down image dispatch: "
+ << cpp_strerror(r) << dendl;
+ }
+ finish(r);
+}
+
+template <typename I>
+void ShutDownCryptoRequest<I>::finish(int r) {
+ if (r == 0) {
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ m_image_ctx->crypto = nullptr;
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace crypto
+} // namespace librbd
+
+template class librbd::crypto::ShutDownCryptoRequest<librbd::ImageCtx>;
diff --git a/src/librbd/crypto/ShutDownCryptoRequest.h b/src/librbd/crypto/ShutDownCryptoRequest.h
new file mode 100644
index 000000000..cf402c1b4
--- /dev/null
+++ b/src/librbd/crypto/ShutDownCryptoRequest.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_SHUT_DOWN_CRYPTO_REQUEST_H
+#define CEPH_LIBRBD_CRYPTO_SHUT_DOWN_CRYPTO_REQUEST_H
+
+#include "librbd/crypto/CryptoInterface.h"
+
+struct Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace crypto {
+
+class CryptoInterface;
+
+template <typename I>
+class ShutDownCryptoRequest {
+public:
+ static ShutDownCryptoRequest* create(I* image_ctx, Context* on_finish) {
+ return new ShutDownCryptoRequest(image_ctx, on_finish);
+ }
+
+ ShutDownCryptoRequest(I* image_ctx, Context* on_finish);
+ void send();
+ void shut_down_object_dispatch();
+ void handle_shut_down_object_dispatch(int r);
+ void shut_down_image_dispatch();
+ void handle_shut_down_image_dispatch(int r);
+ void finish(int r);
+
+private:
+ I* m_image_ctx;
+ Context* m_on_finish;
+};
+
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::ShutDownCryptoRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CRYPTO_SHUT_DOWN_CRYPTO_REQUEST_H
diff --git a/src/librbd/crypto/Types.h b/src/librbd/crypto/Types.h
new file mode 100644
index 000000000..93d9c172c
--- /dev/null
+++ b/src/librbd/crypto/Types.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_TYPES_H
+#define CEPH_LIBRBD_CRYPTO_TYPES_H
+
+namespace librbd {
+namespace crypto {
+
+enum CipherMode {
+ CIPHER_MODE_ENC,
+ CIPHER_MODE_DEC,
+};
+
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_TYPES_H
diff --git a/src/librbd/crypto/Utils.cc b/src/librbd/crypto/Utils.cc
new file mode 100644
index 000000000..76cc9f1f5
--- /dev/null
+++ b/src/librbd/crypto/Utils.cc
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Utils.h"
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/crypto/BlockCrypto.h"
+#include "librbd/crypto/CryptoImageDispatch.h"
+#include "librbd/crypto/CryptoObjectDispatch.h"
+#include "librbd/crypto/openssl/DataCryptor.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::util: " << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+namespace util {
+
+template <typename I>
+void set_crypto(I *image_ctx, ceph::ref_t<CryptoInterface> crypto) {
+ {
+ std::unique_lock image_locker{image_ctx->image_lock};
+ ceph_assert(image_ctx->crypto == nullptr);
+ image_ctx->crypto = crypto.get();
+ }
+ auto object_dispatch = CryptoObjectDispatch<I>::create(image_ctx, crypto);
+ auto image_dispatch = CryptoImageDispatch::create(crypto->get_data_offset());
+ image_ctx->io_object_dispatcher->register_dispatch(object_dispatch);
+ image_ctx->io_image_dispatcher->register_dispatch(image_dispatch);
+}
+
+int build_crypto(
+ CephContext* cct, const unsigned char* key, uint32_t key_length,
+ uint64_t block_size, uint64_t data_offset,
+ ceph::ref_t<CryptoInterface>* result_crypto) {
+ const char* cipher_suite;
+ switch (key_length) {
+ case 32:
+ cipher_suite = "aes-128-xts";
+ break;
+ case 64:
+ cipher_suite = "aes-256-xts";
+ break;
+ default:
+ lderr(cct) << "unsupported key length: " << key_length << dendl;
+ return -ENOTSUP;
+ }
+
+ auto data_cryptor = new openssl::DataCryptor(cct);
+ int r = data_cryptor->init(cipher_suite, key, key_length);
+ if (r != 0) {
+ lderr(cct) << "error initializing data cryptor: " << cpp_strerror(r)
+ << dendl;
+ delete data_cryptor;
+ return r;
+ }
+
+ *result_crypto = BlockCrypto<EVP_CIPHER_CTX>::create(
+ cct, data_cryptor, block_size, data_offset);
+ return 0;
+}
+
+} // namespace util
+} // namespace crypto
+} // namespace librbd
+
+template void librbd::crypto::util::set_crypto(
+ librbd::ImageCtx *image_ctx, ceph::ref_t<CryptoInterface> crypto);
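[Editorial note: a 32-byte key selects "aes-128-xts" and a 64-byte key "aes-256-xts" because XTS consumes two equally sized AES keys, one for the data and one for the tweak. A sketch of calling build_crypto with a freshly generated volume key; the block size and data offset are illustrative:]

  #include <openssl/rand.h>
  #include "include/compat.h"   // ceph_memzero_s
  #include "librbd/crypto/Utils.h"

  int make_crypto(CephContext* cct,
                  ceph::ref_t<librbd::crypto::CryptoInterface>* crypto) {
    unsigned char key[64];                   // 64 bytes -> "aes-256-xts"
    if (RAND_bytes(key, sizeof(key)) != 1) {
      return -EAGAIN;                        // no entropy available
    }
    int r = librbd::crypto::util::build_crypto(
        cct, key, sizeof(key), 4096 /* block size */,
        4 << 20 /* data offset */, crypto);
    ceph_memzero_s(key, sizeof(key), sizeof(key));  // wipe key material
    return r;
  }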
diff --git a/src/librbd/crypto/Utils.h b/src/librbd/crypto/Utils.h
new file mode 100644
index 000000000..aed3d4767
--- /dev/null
+++ b/src/librbd/crypto/Utils.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_UTILS_H
+#define CEPH_LIBRBD_CRYPTO_UTILS_H
+
+#include "include/Context.h"
+#include "librbd/crypto/CryptoInterface.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace crypto {
+namespace util {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+void set_crypto(ImageCtxT *image_ctx, ceph::ref_t<CryptoInterface> crypto);
+
+int build_crypto(
+ CephContext* cct, const unsigned char* key, uint32_t key_length,
+ uint64_t block_size, uint64_t data_offset,
+ ceph::ref_t<CryptoInterface>* result_crypto);
+
+} // namespace util
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_UTILS_H
diff --git a/src/librbd/crypto/luks/EncryptionFormat.cc b/src/librbd/crypto/luks/EncryptionFormat.cc
new file mode 100644
index 000000000..8b1b1580c
--- /dev/null
+++ b/src/librbd/crypto/luks/EncryptionFormat.cc
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "EncryptionFormat.h"
+#include "include/compat.h"
+#include "librbd/crypto/luks/FormatRequest.h"
+#include "librbd/crypto/luks/LoadRequest.h"
+
+namespace librbd {
+namespace crypto {
+namespace luks {
+
+template <typename I>
+EncryptionFormat<I>::EncryptionFormat(
+ encryption_algorithm_t alg,
+ std::string&& passphrase) : m_alg(alg),
+ m_passphrase(std::move(passphrase)) {
+}
+
+template <typename I>
+EncryptionFormat<I>::~EncryptionFormat() {
+ ceph_memzero_s(
+ &m_passphrase[0], m_passphrase.capacity(), m_passphrase.size());
+}
+
+template <typename I>
+void EncryptionFormat<I>::format(I* image_ctx, Context* on_finish) {
+ auto req = luks::FormatRequest<I>::create(
+ image_ctx, get_format(), m_alg, std::move(m_passphrase), &m_crypto,
+ on_finish, false);
+ req->send();
+}
+
+template <typename I>
+void EncryptionFormat<I>::load(I* image_ctx, Context* on_finish) {
+ auto req = luks::LoadRequest<I>::create(
+ image_ctx, get_format(), std::move(m_passphrase), &m_crypto,
+ on_finish);
+ req->send();
+}
+
+} // namespace luks
+} // namespace crypto
+} // namespace librbd
+
+template class librbd::crypto::luks::EncryptionFormat<librbd::ImageCtx>;
+template class librbd::crypto::luks::LUKS1EncryptionFormat<librbd::ImageCtx>;
+template class librbd::crypto::luks::LUKS2EncryptionFormat<librbd::ImageCtx>;
diff --git a/src/librbd/crypto/luks/EncryptionFormat.h b/src/librbd/crypto/luks/EncryptionFormat.h
new file mode 100644
index 000000000..8c45cf9ca
--- /dev/null
+++ b/src/librbd/crypto/luks/EncryptionFormat.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_LUKS_ENCRYPTION_FORMAT_H
+#define CEPH_LIBRBD_CRYPTO_LUKS_ENCRYPTION_FORMAT_H
+
+#include "include/rbd/librbd.hpp"
+#include "librbd/crypto/EncryptionFormat.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace crypto {
+namespace luks {
+
+template <typename ImageCtxT>
+class EncryptionFormat : public crypto::EncryptionFormat<ImageCtxT> {
+
+public:
+ EncryptionFormat(encryption_algorithm_t alg, std::string&& passphrase);
+ ~EncryptionFormat();
+
+ void format(ImageCtxT* ictx, Context* on_finish) override;
+ void load(ImageCtxT* ictx, Context* on_finish) override;
+
+ ceph::ref_t<CryptoInterface> get_crypto() override {
+ return m_crypto;
+ }
+
+private:
+ virtual encryption_format_t get_format() = 0;
+
+ encryption_algorithm_t m_alg;
+ std::string m_passphrase;
+ ceph::ref_t<CryptoInterface> m_crypto;
+};
+
+template <typename ImageCtxT>
+class LUKS1EncryptionFormat : public EncryptionFormat<ImageCtxT> {
+ using EncryptionFormat<ImageCtxT>::EncryptionFormat;
+
+ encryption_format_t get_format() override {
+ return RBD_ENCRYPTION_FORMAT_LUKS1;
+ }
+};
+
+template <typename ImageCtxT>
+class LUKS2EncryptionFormat : public EncryptionFormat<ImageCtxT> {
+ using EncryptionFormat<ImageCtxT>::EncryptionFormat;
+
+ encryption_format_t get_format() override {
+ return RBD_ENCRYPTION_FORMAT_LUKS2;
+ }
+};
+
+} // namespace luks
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::luks::EncryptionFormat<librbd::ImageCtx>;
+extern template class librbd::crypto::luks::LUKS1EncryptionFormat<
+ librbd::ImageCtx>;
+extern template class librbd::crypto::luks::LUKS2EncryptionFormat<
+ librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CRYPTO_LUKS_ENCRYPTION_FORMAT_H
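[Editorial sketch: the concrete subclasses differ only in the get_format() selector, so choosing LUKS1 vs. LUKS2 is a type choice at construction. read_passphrase() is a hypothetical helper; the destructor zeroes the moved-in passphrase.]

  std::string passphrase = read_passphrase();   // hypothetical helper
  auto format = std::make_unique<
      librbd::crypto::luks::LUKS2EncryptionFormat<librbd::ImageCtx>>(
          RBD_ENCRYPTION_ALGORITHM_AES256, std::move(passphrase));
  // format->format(ictx, ctx) creates a new LUKS header;
  // format->load(ictx, ctx) opens an existing one.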
diff --git a/src/librbd/crypto/luks/FormatRequest.cc b/src/librbd/crypto/luks/FormatRequest.cc
new file mode 100644
index 000000000..be2452fd9
--- /dev/null
+++ b/src/librbd/crypto/luks/FormatRequest.cc
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "FormatRequest.h"
+
+#include <stdlib.h>
+#include <openssl/rand.h>
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "librbd/Utils.h"
+#include "librbd/crypto/Utils.h"
+#include "librbd/crypto/luks/Header.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::luks::FormatRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+namespace luks {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+FormatRequest<I>::FormatRequest(
+ I* image_ctx, encryption_format_t format, encryption_algorithm_t alg,
+ std::string&& passphrase, ceph::ref_t<CryptoInterface>* result_crypto,
+ Context* on_finish,
+ bool insecure_fast_mode) : m_image_ctx(image_ctx), m_format(format),
+ m_alg(alg),
+ m_passphrase(std::move(passphrase)),
+ m_result_crypto(result_crypto),
+ m_on_finish(on_finish),
+ m_insecure_fast_mode(insecure_fast_mode),
+ m_header(image_ctx->cct) {
+}
+
+template <typename I>
+void FormatRequest<I>::send() {
+ const char* type;
+ size_t sector_size;
+ switch (m_format) {
+ case RBD_ENCRYPTION_FORMAT_LUKS1:
+ type = CRYPT_LUKS1;
+ sector_size = 512;
+ break;
+ case RBD_ENCRYPTION_FORMAT_LUKS2:
+ type = CRYPT_LUKS2;
+ sector_size = 4096;
+ break;
+ default:
+ lderr(m_image_ctx->cct) << "unsupported format type: " << m_format
+ << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ const char* cipher;
+ size_t key_size;
+ switch (m_alg) {
+ case RBD_ENCRYPTION_ALGORITHM_AES128:
+ cipher = "aes";
+ key_size = 32;
+ break;
+ case RBD_ENCRYPTION_ALGORITHM_AES256:
+ cipher = "aes";
+ key_size = 64;
+ break;
+ default:
+ lderr(m_image_ctx->cct) << "unsupported cipher algorithm: " << m_alg
+ << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ // generate encryption key
+ unsigned char* key = (unsigned char*)alloca(key_size);
+ if (RAND_bytes((unsigned char *)key, key_size) != 1) {
+ lderr(m_image_ctx->cct) << "cannot generate random encryption key"
+ << dendl;
+ finish(-EAGAIN);
+ return;
+ }
+
+ // setup interface with libcryptsetup
+ auto r = m_header.init();
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ // format (create LUKS header)
+ r = m_header.format(type, cipher, reinterpret_cast<char*>(key), key_size,
+ "xts-plain64", sector_size,
+ m_image_ctx->get_object_size(), m_insecure_fast_mode);
+ if (r != 0) {
+ finish(r);
+ return;
+ }
+
+ m_image_ctx->image_lock.lock_shared();
+ uint64_t image_size = m_image_ctx->get_image_size(CEPH_NOSNAP);
+ m_image_ctx->image_lock.unlock_shared();
+
+ if (m_header.get_data_offset() >= image_size) {
+ lderr(m_image_ctx->cct) << "image is too small. format requires more than "
+ << m_header.get_data_offset() << " bytes" << dendl;
+ finish(-ENOSPC);
+ return;
+ }
+
+ // add keyslot (volume key encrypted with passphrase)
+ r = m_header.add_keyslot(m_passphrase.c_str(), m_passphrase.size());
+ if (r != 0) {
+ finish(r);
+ return;
+ }
+
+ r = util::build_crypto(m_image_ctx->cct, key, key_size,
+ m_header.get_sector_size(),
+ m_header.get_data_offset(), m_result_crypto);
+ ceph_memzero_s(key, key_size, key_size);
+ if (r != 0) {
+ finish(r);
+ return;
+ }
+
+ // read header from libcryptsetup interface
+ ceph::bufferlist bl;
+ r = m_header.read(&bl);
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ // write header to offset 0 of the image
+ auto ctx = create_context_callback<
+ FormatRequest<I>, &FormatRequest<I>::handle_write_header>(this);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ ctx, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_WRITE);
+
+ ZTracer::Trace trace;
+ auto req = io::ImageDispatchSpec::create_write(
+ *m_image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp,
+ {{0, bl.length()}}, std::move(bl),
+ m_image_ctx->get_data_io_context(), 0, trace);
+ req->send();
+}
+
+template <typename I>
+void FormatRequest<I>::handle_write_header(int r) {
+ if (r < 0) {
+ lderr(m_image_ctx->cct) << "error writing header to image: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void FormatRequest<I>::finish(int r) {
+ ceph_memzero_s(
+ &m_passphrase[0], m_passphrase.capacity(), m_passphrase.size());
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace luks
+} // namespace crypto
+} // namespace librbd
+
+template class librbd::crypto::luks::FormatRequest<librbd::ImageCtx>;
diff --git a/src/librbd/crypto/luks/FormatRequest.h b/src/librbd/crypto/luks/FormatRequest.h
new file mode 100644
index 000000000..a782dc818
--- /dev/null
+++ b/src/librbd/crypto/luks/FormatRequest.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_LUKS_FORMAT_REQUEST_H
+#define CEPH_LIBRBD_CRYPTO_LUKS_FORMAT_REQUEST_H
+
+#include "include/rbd/librbd.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/crypto/CryptoInterface.h"
+#include "librbd/crypto/luks/Header.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace crypto {
+namespace luks {
+
+template <typename I>
+class FormatRequest {
+public:
+ static FormatRequest* create(
+ I* image_ctx, encryption_format_t format,
+ encryption_algorithm_t alg, std::string&& passphrase,
+ ceph::ref_t<CryptoInterface>* result_crypto, Context* on_finish,
+ bool insecure_fast_mode) {
+ return new FormatRequest(image_ctx, format, alg, std::move(passphrase),
+ result_crypto, on_finish, insecure_fast_mode);
+ }
+
+ FormatRequest(I* image_ctx, encryption_format_t format,
+ encryption_algorithm_t alg, std::string&& passphrase,
+ ceph::ref_t<CryptoInterface>* result_crypto,
+ Context* on_finish, bool insecure_fast_mode);
+ void send();
+ void finish(int r);
+
+private:
+ I* m_image_ctx;
+
+ encryption_format_t m_format;
+ encryption_algorithm_t m_alg;
+ std::string m_passphrase;
+ ceph::ref_t<CryptoInterface>* m_result_crypto;
+ Context* m_on_finish;
+ bool m_insecure_fast_mode;
+ Header m_header;
+
+ void handle_write_header(int r);
+};
+
+} // namespace luks
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::luks::FormatRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CRYPTO_LUKS_FORMAT_REQUEST_H
diff --git a/src/librbd/crypto/luks/Header.cc b/src/librbd/crypto/luks/Header.cc
new file mode 100644
index 000000000..6d00074ef
--- /dev/null
+++ b/src/librbd/crypto/luks/Header.cc
@@ -0,0 +1,256 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Header.h"
+
+#include <errno.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include "common/dout.h"
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::luks::Header: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+namespace luks {
+
+Header::Header(CephContext* cct) : m_cct(cct), m_fd(-1), m_cd(nullptr) {
+}
+
+Header::~Header() {
+ if (m_fd != -1) {
+ close(m_fd);
+ m_fd = -1;
+ }
+ if (m_cd != nullptr) {
+ crypt_free(m_cd);
+ m_cd = nullptr;
+ }
+}
+
+void Header::libcryptsetup_log_wrapper(int level, const char* msg, void* header) {
+ ((Header*)header)->libcryptsetup_log(level, msg);
+}
+
+void Header::libcryptsetup_log(int level, const char* msg) {
+ switch (level) {
+ case CRYPT_LOG_NORMAL:
+ ldout(m_cct, 5) << "[libcryptsetup] " << msg << dendl;
+ break;
+ case CRYPT_LOG_ERROR:
+ lderr(m_cct) << "[libcryptsetup] " << msg << dendl;
+ break;
+ case CRYPT_LOG_VERBOSE:
+ ldout(m_cct, 10) << "[libcryptsetup] " << msg << dendl;
+ break;
+ case CRYPT_LOG_DEBUG:
+ ldout(m_cct, 20) << "[libcryptsetup] " << msg << dendl;
+ break;
+ }
+}
+
+int Header::init() {
+ // create anonymous file
+ m_fd = syscall(SYS_memfd_create, "LibcryptsetupInterface", 0);
+  if (m_fd == -1) {
+    auto r = -errno;
+    lderr(m_cct) << "error creating anonymous file: " << cpp_strerror(r)
+                 << dendl;
+    return r;
+  }
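+  // libcryptsetup operates on paths, so expose the in-memory fd via procfs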
+ std::string path =
+ "/proc/" + std::to_string(getpid()) + "/fd/" + std::to_string(m_fd);
+
+ if (m_cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
+ crypt_set_debug_level(CRYPT_DEBUG_ALL);
+ }
+
+ // init libcryptsetup handle
+ auto r = crypt_init(&m_cd, path.c_str());
+ if (r != 0) {
+ lderr(m_cct) << "crypt_init failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // redirect logging
+ crypt_set_log_callback(m_cd, &libcryptsetup_log_wrapper, this);
+
+ return 0;
+}
+
+int Header::write(const ceph::bufferlist& bl) {
+ ceph_assert(m_fd != -1);
+
+ auto r = bl.write_fd(m_fd);
+ if (r != 0) {
+ lderr(m_cct) << "error writing header: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+}
+
+ssize_t Header::read(ceph::bufferlist* bl) {
+ ceph_assert(m_fd != -1);
+
+ // get current header size
+ struct stat st;
+ ssize_t r = fstat(m_fd, &st);
+ if (r < 0) {
+ r = -errno;
+ lderr(m_cct) << "failed to stat anonymous file: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = bl->read_fd(m_fd, st.st_size);
+ if (r < 0) {
+ lderr(m_cct) << "error reading header: " << cpp_strerror(r) << dendl;
+ }
+
+ ldout(m_cct, 20) << "read size = " << r << dendl;
+ return r;
+}
+
+int Header::format(const char* type, const char* alg, const char* key,
+ size_t key_size, const char* cipher_mode,
+ uint32_t sector_size, uint32_t data_alignment,
+ bool insecure_fast_mode) {
+ ceph_assert(m_cd != nullptr);
+
+ ldout(m_cct, 20) << "sector size: " << sector_size << ", data alignment: "
+ << data_alignment << dendl;
+
+ // required for passing libcryptsetup device size check
+  if (ftruncate(m_fd, 4096) != 0) {
+    auto r = -errno;
+    lderr(m_cct) << "failed to truncate anonymous file: "
+                 << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+ struct crypt_params_luks1 luks1params;
+ struct crypt_params_luks2 luks2params;
+
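+  // convert data_alignment from bytes to sectors; the unit libcryptsetup
+  // expects differs across versions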
+#ifdef LIBCRYPTSETUP_LEGACY_DATA_ALIGNMENT
+ size_t converted_data_alignment = data_alignment / sector_size;
+#else
+ size_t converted_data_alignment = data_alignment / 512;
+#endif
+
+ void* params = nullptr;
+ if (strcmp(type, CRYPT_LUKS1) == 0) {
+ memset(&luks1params, 0, sizeof(luks1params));
+ luks1params.data_alignment = converted_data_alignment;
+ params = &luks1params;
+ } else if (strcmp(type, CRYPT_LUKS2) == 0) {
+ memset(&luks2params, 0, sizeof(luks2params));
+ luks2params.data_alignment = converted_data_alignment;
+ luks2params.sector_size = sector_size;
+ params = &luks2params;
+ }
+
+ // this mode should be used for testing only
+ if (insecure_fast_mode) {
+ struct crypt_pbkdf_type pbkdf;
+ memset(&pbkdf, 0, sizeof(pbkdf));
+ pbkdf.type = CRYPT_KDF_PBKDF2;
+ pbkdf.flags = CRYPT_PBKDF_NO_BENCHMARK;
+ pbkdf.hash = "sha256";
+ pbkdf.iterations = 1000;
+ pbkdf.time_ms = 1;
+ auto r = crypt_set_pbkdf_type(m_cd, &pbkdf);
+ if (r != 0) {
+ lderr(m_cct) << "crypt_set_pbkdf_type failed: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ auto r = crypt_format(
+ m_cd, type, alg, cipher_mode, NULL, key, key_size, params);
+ if (r != 0) {
+ lderr(m_cct) << "crypt_format failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int Header::add_keyslot(const char* passphrase, size_t passphrase_size) {
+ ceph_assert(m_cd != nullptr);
+
+ auto r = crypt_keyslot_add_by_volume_key(
+ m_cd, CRYPT_ANY_SLOT, NULL, 0, passphrase, passphrase_size);
+ if (r != 0) {
+ lderr(m_cct) << "crypt_keyslot_add_by_volume_key failed: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int Header::load(const char* type) {
+ ceph_assert(m_cd != nullptr);
+
+ // libcryptsetup checks if device size matches the header and keyslots size
+ // in LUKS2, 2 X 4MB header + 128MB keyslots
+  if (ftruncate(m_fd, 136 * 1024 * 1024) != 0) {
+    auto r = -errno;
+    lderr(m_cct) << "failed to truncate anonymous file: "
+                 << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+ auto r = crypt_load(m_cd, type, NULL);
+ if (r != 0) {
+ lderr(m_cct) << "crypt_load failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ ldout(m_cct, 20) << "sector size: " << get_sector_size() << ", data offset: "
+ << get_data_offset() << dendl;
+
+ return 0;
+}
+
+int Header::read_volume_key(const char* passphrase, size_t passphrase_size,
+ char* volume_key, size_t* volume_key_size) {
+ ceph_assert(m_cd != nullptr);
+
+ auto r = crypt_volume_key_get(
+ m_cd, CRYPT_ANY_SLOT, volume_key, volume_key_size, passphrase,
+ passphrase_size);
+ if (r != 0) {
+ lderr(m_cct) << "crypt_volume_key_get failed: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int Header::get_sector_size() {
+ ceph_assert(m_cd != nullptr);
+ return crypt_get_sector_size(m_cd);
+}
+
+uint64_t Header::get_data_offset() {
+ ceph_assert(m_cd != nullptr);
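+  // crypt_get_data_offset() returns 512-byte sectors; shift converts to bytes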
+ return crypt_get_data_offset(m_cd) << 9;
+}
+
+const char* Header::get_cipher() {
+ ceph_assert(m_cd != nullptr);
+ return crypt_get_cipher(m_cd);
+}
+
+const char* Header::get_cipher_mode() {
+ ceph_assert(m_cd != nullptr);
+ return crypt_get_cipher_mode(m_cd);
+}
+
+} // namespace luks
+} // namespace crypto
+} // namespace librbd
diff --git a/src/librbd/crypto/luks/Header.h b/src/librbd/crypto/luks/Header.h
new file mode 100644
index 000000000..cee80a8e4
--- /dev/null
+++ b/src/librbd/crypto/luks/Header.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_LUKS_HEADER_H
+#define CEPH_LIBRBD_CRYPTO_LUKS_HEADER_H
+
+#include <libcryptsetup.h>
+#include "common/ceph_context.h"
+#include "include/buffer.h"
+
+namespace librbd {
+namespace crypto {
+namespace luks {
+
+class Header {
+public:
+ Header(CephContext* cct);
+ ~Header();
+ int init();
+
+ int write(const ceph::bufferlist& bl);
+ ssize_t read(ceph::bufferlist* bl);
+
+ int format(const char* type, const char* alg, const char* key,
+ size_t key_size, const char* cipher_mode, uint32_t sector_size,
+ uint32_t data_alignment, bool insecure_fast_mode);
+ int add_keyslot(const char* passphrase, size_t passphrase_size);
+ int load(const char* type);
+ int read_volume_key(const char* passphrase, size_t passphrase_size,
+ char* volume_key, size_t* volume_key_size);
+
+ int get_sector_size();
+ uint64_t get_data_offset();
+ const char* get_cipher();
+ const char* get_cipher_mode();
+
+private:
+ void libcryptsetup_log(int level, const char* msg);
+ static void libcryptsetup_log_wrapper(int level, const char* msg,
+ void* header);
+
+ CephContext* m_cct;
+ int m_fd;
+ struct crypt_device *m_cd;
+};
+
+} // namespace luks
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_LUKS_HEADER_H
diff --git a/src/librbd/crypto/luks/LoadRequest.cc b/src/librbd/crypto/luks/LoadRequest.cc
new file mode 100644
index 000000000..10339469d
--- /dev/null
+++ b/src/librbd/crypto/luks/LoadRequest.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LoadRequest.h"
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/crypto/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ReadResult.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::luks::LoadRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+namespace luks {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+LoadRequest<I>::LoadRequest(
+ I* image_ctx, encryption_format_t format, std::string&& passphrase,
+ ceph::ref_t<CryptoInterface>* result_crypto,
+ Context* on_finish) : m_image_ctx(image_ctx),
+ m_format(format),
+ m_passphrase(std::move(passphrase)),
+ m_on_finish(on_finish),
+ m_result_crypto(result_crypto),
+ m_initial_read_size(DEFAULT_INITIAL_READ_SIZE),
+ m_header(image_ctx->cct), m_offset(0) {
+}
+
+template <typename I>
+void LoadRequest<I>::set_initial_read_size(uint64_t read_size) {
+ m_initial_read_size = read_size;
+}
+
+template <typename I>
+void LoadRequest<I>::send() {
+ // setup interface with libcryptsetup
+ auto r = m_header.init();
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ LoadRequest<I>, &LoadRequest<I>::handle_read_header>(this);
+ read(m_initial_read_size, ctx);
+}
+
+template <typename I>
+void LoadRequest<I>::read(uint64_t end_offset, Context* on_finish) {
+ auto length = end_offset - m_offset;
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, librbd::util::get_image_ctx(m_image_ctx),
+ io::AIO_TYPE_READ);
+ ZTracer::Trace trace;
+ auto req = io::ImageDispatchSpec::create_read(
+ *m_image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp,
+ {{m_offset, length}}, io::ReadResult{&m_bl},
+ m_image_ctx->get_data_io_context(), 0, 0, trace);
+ req->send();
+}
+
+template <typename I>
+bool LoadRequest<I>::handle_read(int r) {
+ if (r < 0) {
+ lderr(m_image_ctx->cct) << "error reading from image: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return false;
+ }
+
+ // write header to libcryptsetup interface
+ r = m_header.write(m_bl);
+ if (r < 0) {
+ finish(r);
+ return false;
+ }
+
+ m_offset += m_bl.length();
+ m_bl.clear();
+ return true;
+}
+
+template <typename I>
+void LoadRequest<I>::handle_read_header(int r) {
+ if (!handle_read(r)) {
+ return;
+ }
+
+ const char* type;
+ switch (m_format) {
+ case RBD_ENCRYPTION_FORMAT_LUKS1:
+ type = CRYPT_LUKS1;
+ break;
+ case RBD_ENCRYPTION_FORMAT_LUKS2:
+ type = CRYPT_LUKS2;
+ break;
+ default:
+ lderr(m_image_ctx->cct) << "unsupported format type: " << m_format
+ << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ // parse header via libcryptsetup
+ r = m_header.load(type);
+ if (r != 0) {
+ if (m_offset < MAXIMUM_HEADER_SIZE) {
+ // perhaps we did not feed the entire header to libcryptsetup, retry
+ auto ctx = create_context_callback<
+ LoadRequest<I>, &LoadRequest<I>::handle_read_header>(this);
+ read(MAXIMUM_HEADER_SIZE, ctx);
+ return;
+ }
+
+ finish(r);
+ return;
+ }
+
+ auto cipher = m_header.get_cipher();
+ if (strcmp(cipher, "aes") != 0) {
+ lderr(m_image_ctx->cct) << "unsupported cipher: " << cipher << dendl;
+ finish(-ENOTSUP);
+ return;
+ }
+
+ auto cipher_mode = m_header.get_cipher_mode();
+ if (strcmp(cipher_mode, "xts-plain64") != 0) {
+ lderr(m_image_ctx->cct) << "unsupported cipher mode: " << cipher_mode
+ << dendl;
+ finish(-ENOTSUP);
+ return;
+ }
+
+ read_volume_key();
+ return;
+}
+
+template <typename I>
+void LoadRequest<I>::handle_read_keyslots(int r) {
+ if (!handle_read(r)) {
+ return;
+ }
+
+ read_volume_key();
+}
+
+template <typename I>
+void LoadRequest<I>::read_volume_key() {
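+  // 64 bytes fits the largest supported volume key (AES-256 in XTS mode)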
+ char volume_key[64];
+ size_t volume_key_size = sizeof(volume_key);
+
+  auto r = m_header.read_volume_key(
+      m_passphrase.c_str(), m_passphrase.size(), volume_key,
+      &volume_key_size);
+ if (r != 0) {
+ auto keyslots_end_offset = m_header.get_data_offset();
+ if (m_offset < keyslots_end_offset) {
+      // perhaps we did not feed the necessary keyslots yet, retry
+ auto ctx = create_context_callback<
+ LoadRequest<I>, &LoadRequest<I>::handle_read_keyslots>(this);
+ read(keyslots_end_offset, ctx);
+ return;
+ }
+
+ finish(r);
+ return;
+ }
+
+  r = util::build_crypto(
+      m_image_ctx->cct, reinterpret_cast<unsigned char*>(volume_key),
+      volume_key_size, m_header.get_sector_size(),
+      m_header.get_data_offset(), m_result_crypto);
+  // scrub the volume key off the stack, as FormatRequest does for its key
+  ceph_memzero_s(volume_key, sizeof(volume_key), sizeof(volume_key));
+  finish(r);
+}
+
+template <typename I>
+void LoadRequest<I>::finish(int r) {
+ ceph_memzero_s(&m_passphrase[0], m_passphrase.size(), m_passphrase.size());
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace luks
+} // namespace crypto
+} // namespace librbd
+
+template class librbd::crypto::luks::LoadRequest<librbd::ImageCtx>;
diff --git a/src/librbd/crypto/luks/LoadRequest.h b/src/librbd/crypto/luks/LoadRequest.h
new file mode 100644
index 000000000..340e89503
--- /dev/null
+++ b/src/librbd/crypto/luks/LoadRequest.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_LUKS_LOAD_REQUEST_H
+#define CEPH_LIBRBD_CRYPTO_LUKS_LOAD_REQUEST_H
+
+#include "include/rbd/librbd.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/crypto/CryptoInterface.h"
+#include "librbd/crypto/luks/Header.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace crypto {
+namespace luks {
+
+// max header size in LUKS1/2 (excl. keyslots) is 4MB
+const uint64_t MAXIMUM_HEADER_SIZE = 4 * 1024 * 1024;
+// default initial read size: LUKS2 header (2 x 16KB) plus one 256KB keyslot
+const uint64_t DEFAULT_INITIAL_READ_SIZE = 288 * 1024;
+
+template <typename I>
+class LoadRequest {
+public:
+ static LoadRequest* create(
+ I* image_ctx, encryption_format_t format, std::string&& passphrase,
+ ceph::ref_t<CryptoInterface>* result_crypto, Context* on_finish) {
+ return new LoadRequest(image_ctx, format, std::move(passphrase),
+ result_crypto, on_finish);
+ }
+
+ LoadRequest(I* image_ctx, encryption_format_t format,
+ std::string&& passphrase,
+ ceph::ref_t<CryptoInterface>* result_crypto,
+ Context* on_finish);
+ void send();
+ void finish(int r);
+ void set_initial_read_size(uint64_t read_size);
+
+private:
+ I* m_image_ctx;
+ encryption_format_t m_format;
+ std::string m_passphrase;
+ Context* m_on_finish;
+ ceph::bufferlist m_bl;
+ ceph::ref_t<CryptoInterface>* m_result_crypto;
+ uint64_t m_initial_read_size;
+ Header m_header;
+ uint64_t m_offset;
+
+ void read(uint64_t end_offset, Context* on_finish);
+ bool handle_read(int r);
+ void handle_read_header(int r);
+ void handle_read_keyslots(int r);
+ void read_volume_key();
+};
+
+} // namespace luks
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::luks::LoadRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CRYPTO_LUKS_LOAD_REQUEST_H
diff --git a/src/librbd/crypto/openssl/DataCryptor.cc b/src/librbd/crypto/openssl/DataCryptor.cc
new file mode 100644
index 000000000..aa9427a79
--- /dev/null
+++ b/src/librbd/crypto/openssl/DataCryptor.cc
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/crypto/openssl/DataCryptor.h"
+#include <openssl/err.h>
+#include <string.h>
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+namespace librbd {
+namespace crypto {
+namespace openssl {
+
+int DataCryptor::init(const char* cipher_name, const unsigned char* key,
+ uint16_t key_length) {
+ if (m_key != nullptr) {
+ ceph_memzero_s(m_key, m_key_size, m_key_size);
+ delete [] m_key;
+ m_key = nullptr;
+ m_key_size = 0;
+ }
+ if (cipher_name == nullptr) {
+ lderr(m_cct) << "missing cipher name" << dendl;
+ return -EINVAL;
+ }
+ if (key == nullptr) {
+ lderr(m_cct) << "missing key" << dendl;
+ return -EINVAL;
+ }
+
+ m_cipher = EVP_get_cipherbyname(cipher_name);
+ if (m_cipher == nullptr) {
+ lderr(m_cct) << "EVP_get_cipherbyname failed. Cipher name: " << cipher_name
+ << dendl;
+ log_errors();
+ return -EINVAL;
+ }
+
+ auto expected_key_length = EVP_CIPHER_key_length(m_cipher);
+ if (expected_key_length != key_length) {
+ lderr(m_cct) << "cipher " << cipher_name << " expects key of "
+ << expected_key_length << " bytes. got: " << key_length
+ << dendl;
+ return -EINVAL;
+ }
+
+ m_key_size = key_length;
+ m_key = new unsigned char[key_length];
+ memcpy(m_key, key, key_length);
+ m_iv_size = static_cast<uint32_t>(EVP_CIPHER_iv_length(m_cipher));
+ return 0;
+}
+
+DataCryptor::~DataCryptor() {
+ if (m_key != nullptr) {
+ ceph_memzero_s(m_key, m_key_size, m_key_size);
+ delete [] m_key;
+ m_key = nullptr;
+ }
+}
+
+uint32_t DataCryptor::get_block_size() const {
+ return EVP_CIPHER_block_size(m_cipher);
+}
+
+uint32_t DataCryptor::get_iv_size() const {
+ return m_iv_size;
+}
+
+const unsigned char* DataCryptor::get_key() const {
+ return m_key;
+}
+
+int DataCryptor::get_key_length() const {
+ return EVP_CIPHER_key_length(m_cipher);
+}
+
+EVP_CIPHER_CTX* DataCryptor::get_context(CipherMode mode) {
+ int enc;
+  switch (mode) {
+ case CIPHER_MODE_ENC:
+ enc = 1;
+ break;
+ case CIPHER_MODE_DEC:
+ enc = 0;
+ break;
+ default:
+ lderr(m_cct) << "Invalid CipherMode:" << mode << dendl;
+ return nullptr;
+ }
+
+ auto ctx = EVP_CIPHER_CTX_new();
+ if (ctx == nullptr) {
+ lderr(m_cct) << "EVP_CIPHER_CTX_new failed" << dendl;
+ log_errors();
+ return nullptr;
+ }
+
+ if (1 != EVP_CipherInit_ex(ctx, m_cipher, nullptr, m_key, nullptr, enc)) {
+ lderr(m_cct) << "EVP_CipherInit_ex failed" << dendl;
+ log_errors();
+ return nullptr;
+ }
+
+ return ctx;
+}
+
+void DataCryptor::return_context(EVP_CIPHER_CTX* ctx, CipherMode mode) {
+ if (ctx != nullptr) {
+ EVP_CIPHER_CTX_free(ctx);
+ }
+}
+
+int DataCryptor::init_context(EVP_CIPHER_CTX* ctx, const unsigned char* iv,
+ uint32_t iv_length) const {
+ if (iv_length != m_iv_size) {
+ lderr(m_cct) << "cipher expects IV of " << m_iv_size << " bytes. got: "
+ << iv_length << dendl;
+ return -EINVAL;
+ }
+ if (1 != EVP_CipherInit_ex(ctx, nullptr, nullptr, nullptr, iv, -1)) {
+ lderr(m_cct) << "EVP_CipherInit_ex failed" << dendl;
+ log_errors();
+ return -EIO;
+ }
+ return 0;
+}
+
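+// returns the number of bytes written to 'out' on success, -EIO on failure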
+int DataCryptor::update_context(EVP_CIPHER_CTX* ctx, const unsigned char* in,
+ unsigned char* out, uint32_t len) const {
+ int out_length;
+ if (1 != EVP_CipherUpdate(ctx, out, &out_length, in, len)) {
+ lderr(m_cct) << "EVP_CipherUpdate failed. len=" << len << dendl;
+ log_errors();
+ return -EIO;
+ }
+ return out_length;
+}
+
+void DataCryptor::log_errors() const {
+ while (true) {
+ auto error = ERR_get_error();
+ if (error == 0) {
+ break;
+ }
+ lderr(m_cct) << "OpenSSL error: " << ERR_error_string(error, nullptr)
+ << dendl;
+ }
+}
+
+} // namespace openssl
+} // namespace crypto
+} // namespace librbd
diff --git a/src/librbd/crypto/openssl/DataCryptor.h b/src/librbd/crypto/openssl/DataCryptor.h
new file mode 100644
index 000000000..af6956883
--- /dev/null
+++ b/src/librbd/crypto/openssl/DataCryptor.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_OPENSSL_DATA_CRYPTOR_H
+#define CEPH_LIBRBD_CRYPTO_OPENSSL_DATA_CRYPTOR_H
+
+#include "librbd/crypto/DataCryptor.h"
+#include "include/Context.h"
+#include <openssl/evp.h>
+
+namespace librbd {
+namespace crypto {
+namespace openssl {
+
+class DataCryptor : public crypto::DataCryptor<EVP_CIPHER_CTX> {
+
+public:
+  DataCryptor(CephContext* cct) : m_cct(cct) {}
+ ~DataCryptor();
+
+ int init(const char* cipher_name, const unsigned char* key,
+ uint16_t key_length);
+ uint32_t get_block_size() const override;
+ uint32_t get_iv_size() const override;
+ const unsigned char* get_key() const override;
+ int get_key_length() const override;
+
+ EVP_CIPHER_CTX* get_context(CipherMode mode) override;
+ void return_context(EVP_CIPHER_CTX* ctx, CipherMode mode) override;
+ int init_context(EVP_CIPHER_CTX* ctx, const unsigned char* iv,
+ uint32_t iv_length) const override;
+ int update_context(EVP_CIPHER_CTX* ctx, const unsigned char* in,
+ unsigned char* out, uint32_t len) const override;
+
+private:
+ CephContext* m_cct;
+ unsigned char* m_key = nullptr;
+ uint16_t m_key_size = 0;
+ const EVP_CIPHER* m_cipher;
+ uint32_t m_iv_size;
+
+ void log_errors() const;
+};
+
+} // namespace openssl
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_OPENSSL_DATA_CRYPTOR_H
diff --git a/src/librbd/deep_copy/Handler.h b/src/librbd/deep_copy/Handler.h
new file mode 100644
index 000000000..fea553ee2
--- /dev/null
+++ b/src/librbd/deep_copy/Handler.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_HANDLER_H
+#define CEPH_LIBRBD_DEEP_COPY_HANDLER_H
+
+#include "include/int_types.h"
+#include "include/rbd/librbd.hpp"
+
+namespace librbd {
+namespace deep_copy {
+
+struct Handler {
+ virtual ~Handler() {}
+
+ virtual void handle_read(uint64_t bytes_read) = 0;
+
+ virtual int update_progress(uint64_t object_number,
+ uint64_t object_count) = 0;
+};
+
+struct NoOpHandler : public Handler {
+ void handle_read(uint64_t bytes_read) override {
+ }
+
+ int update_progress(uint64_t object_number,
+ uint64_t object_count) override {
+ return 0;
+ }
+};
+
+class ProgressHandler : public NoOpHandler {
+public:
+ ProgressHandler(ProgressContext* progress_ctx)
+ : m_progress_ctx(progress_ctx) {
+ }
+
+ int update_progress(uint64_t object_number,
+ uint64_t object_count) override {
+ return m_progress_ctx->update_progress(object_number, object_count);
+ }
+
+private:
+ librbd::ProgressContext* m_progress_ctx;
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_DEEP_COPY_HANDLER_H
diff --git a/src/librbd/deep_copy/ImageCopyRequest.cc b/src/librbd/deep_copy/ImageCopyRequest.cc
new file mode 100644
index 000000000..08e959dd5
--- /dev/null
+++ b/src/librbd/deep_copy/ImageCopyRequest.cc
@@ -0,0 +1,278 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ImageCopyRequest.h"
+#include "ObjectCopyRequest.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/deep_copy/Handler.h"
+#include "librbd/deep_copy/Utils.h"
+#include "librbd/object_map/DiffRequest.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::ImageCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+
+template <typename I>
+ImageCopyRequest<I>::ImageCopyRequest(I *src_image_ctx, I *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten,
+ const ObjectNumber &object_number,
+ const SnapSeqs &snap_seqs,
+ Handler *handler,
+ Context *on_finish)
+ : RefCountedObject(dst_image_ctx->cct), m_src_image_ctx(src_image_ctx),
+ m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start),
+ m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start),
+ m_flatten(flatten), m_object_number(object_number), m_snap_seqs(snap_seqs),
+ m_handler(handler), m_on_finish(on_finish), m_cct(dst_image_ctx->cct),
+ m_lock(ceph::make_mutex(unique_lock_name("ImageCopyRequest::m_lock", this))) {
+}
+
+template <typename I>
+void ImageCopyRequest<I>::send() {
+ m_dst_image_ctx->image_lock.lock_shared();
+ util::compute_snap_map(m_dst_image_ctx->cct, m_src_snap_id_start,
+ m_src_snap_id_end, m_dst_image_ctx->snaps, m_snap_seqs,
+ &m_snap_map);
+ m_dst_image_ctx->image_lock.unlock_shared();
+
+ if (m_snap_map.empty()) {
+ lderr(m_cct) << "failed to map snapshots within boundary" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ compute_diff();
+}
+
+template <typename I>
+void ImageCopyRequest<I>::cancel() {
+ std::lock_guard locker{m_lock};
+
+ ldout(m_cct, 20) << dendl;
+ m_canceled = true;
+}
+
+template <typename I>
+void ImageCopyRequest<I>::map_src_objects(uint64_t dst_object,
+ std::set<uint64_t> *src_objects) {
+ std::vector<std::pair<uint64_t, uint64_t>> image_extents;
+ Striper::extent_to_file(m_cct, &m_dst_image_ctx->layout, dst_object, 0,
+ m_dst_image_ctx->layout.object_size, image_extents);
+
+ for (auto &e : image_extents) {
+ std::map<object_t, std::vector<ObjectExtent>> src_object_extents;
+ Striper::file_to_extents(m_cct, m_src_image_ctx->format_string,
+ &m_src_image_ctx->layout, e.first, e.second, 0,
+ src_object_extents);
+ for (auto &p : src_object_extents) {
+ for (auto &s : p.second) {
+ src_objects->insert(s.objectno);
+ }
+ }
+ }
+
+ ceph_assert(!src_objects->empty());
+
+ ldout(m_cct, 20) << dst_object << " -> " << *src_objects << dendl;
+}
+
+template <typename I>
+void ImageCopyRequest<I>::compute_diff() {
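+  // flattening must copy every object (parent data included), so the
+  // fast-diff shortcut below cannot be used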
+ if (m_flatten) {
+ send_object_copies();
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = create_context_callback<
+ ImageCopyRequest<I>, &ImageCopyRequest<I>::handle_compute_diff>(this);
+ auto req = object_map::DiffRequest<I>::create(m_src_image_ctx, m_src_snap_id_start,
+ m_src_snap_id_end, &m_object_diff_state,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void ImageCopyRequest<I>::handle_compute_diff(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(m_cct, 10) << "fast-diff optimization disabled" << dendl;
+ m_object_diff_state.resize(0);
+ }
+
+ send_object_copies();
+}
+
+template <typename I>
+void ImageCopyRequest<I>::send_object_copies() {
+ m_object_no = 0;
+ if (m_object_number) {
+ m_object_no = *m_object_number + 1;
+ }
+
+ uint64_t size;
+ {
+ std::shared_lock image_locker{m_src_image_ctx->image_lock};
+ size = m_src_image_ctx->get_image_size(CEPH_NOSNAP);
+ for (auto snap_id : m_src_image_ctx->snaps) {
+ size = std::max(size, m_src_image_ctx->get_image_size(snap_id));
+ }
+ }
+ m_end_object_no = Striper::get_num_objects(m_dst_image_ctx->layout, size);
+
+ ldout(m_cct, 20) << "start_object=" << m_object_no << ", "
+ << "end_object=" << m_end_object_no << dendl;
+
+ bool complete;
+ {
+ std::lock_guard locker{m_lock};
+ auto max_ops = m_src_image_ctx->config.template get_val<uint64_t>(
+ "rbd_concurrent_management_ops");
+
+ // attempt to schedule at least 'max_ops' initial requests where
+ // some objects might be skipped if fast-diff notes no change
+ for (uint64_t i = 0; i < max_ops; i++) {
+ send_next_object_copy();
+ }
+
+ complete = (m_current_ops == 0) && !m_updating_progress;
+ }
+
+ if (complete) {
+ finish(m_ret_val);
+ }
+}
+
+template <typename I>
+void ImageCopyRequest<I>::send_next_object_copy() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ if (m_canceled && m_ret_val == 0) {
+ ldout(m_cct, 10) << "image copy canceled" << dendl;
+ m_ret_val = -ECANCELED;
+ }
+
+ if (m_ret_val < 0 || m_object_no >= m_end_object_no) {
+ return;
+ }
+
+ uint64_t ono = m_object_no++;
+ Context *ctx = new LambdaContext(
+ [this, ono](int r) {
+ handle_object_copy(ono, r);
+ });
+
+ ldout(m_cct, 20) << "object_num=" << ono << dendl;
+ ++m_current_ops;
+
+ uint8_t object_diff_state = object_map::DIFF_STATE_HOLE;
+ if (m_object_diff_state.size() > 0) {
+ std::set<uint64_t> src_objects;
+ map_src_objects(ono, &src_objects);
+
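+    // merge the diff states of all source objects that back this destination
+    // object, preferring data/updated states over plain holes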
+ for (auto src_ono : src_objects) {
+ if (src_ono >= m_object_diff_state.size()) {
+ object_diff_state = object_map::DIFF_STATE_DATA_UPDATED;
+ } else {
+ auto state = m_object_diff_state[src_ono];
+ if ((state == object_map::DIFF_STATE_HOLE_UPDATED &&
+ object_diff_state != object_map::DIFF_STATE_DATA_UPDATED) ||
+ (state == object_map::DIFF_STATE_DATA &&
+ object_diff_state == object_map::DIFF_STATE_HOLE) ||
+ (state == object_map::DIFF_STATE_DATA_UPDATED)) {
+ object_diff_state = state;
+ }
+ }
+ }
+
+ if (object_diff_state == object_map::DIFF_STATE_HOLE) {
+ ldout(m_cct, 20) << "skipping non-existent object " << ono << dendl;
+ create_async_context_callback(*m_src_image_ctx, ctx)->complete(0);
+ return;
+ }
+ }
+
+ uint32_t flags = 0;
+ if (m_flatten) {
+ flags |= OBJECT_COPY_REQUEST_FLAG_FLATTEN;
+ }
+ if (object_diff_state == object_map::DIFF_STATE_DATA) {
+ // no source objects have been updated and at least one has clean data
+ flags |= OBJECT_COPY_REQUEST_FLAG_EXISTS_CLEAN;
+ }
+
+ auto req = ObjectCopyRequest<I>::create(
+ m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_dst_snap_id_start,
+ m_snap_map, ono, flags, m_handler, ctx);
+ req->send();
+}
+
+template <typename I>
+void ImageCopyRequest<I>::handle_object_copy(uint64_t object_no, int r) {
+ ldout(m_cct, 20) << "object_no=" << object_no << ", r=" << r << dendl;
+
+ bool complete;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_current_ops > 0);
+ --m_current_ops;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "object copy failed: " << cpp_strerror(r) << dendl;
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ } else {
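+      // completions can arrive out of order; buffer them in a min-heap and
+      // report progress only for the next expected object number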
+ m_copied_objects.push(object_no);
+ while (!m_updating_progress && !m_copied_objects.empty() &&
+ m_copied_objects.top() ==
+ (m_object_number ? *m_object_number + 1 : 0)) {
+ m_object_number = m_copied_objects.top();
+ m_copied_objects.pop();
+ uint64_t progress_object_no = *m_object_number + 1;
+ m_updating_progress = true;
+ m_lock.unlock();
+ m_handler->update_progress(progress_object_no, m_end_object_no);
+ m_lock.lock();
+ ceph_assert(m_updating_progress);
+ m_updating_progress = false;
+ }
+ }
+
+ send_next_object_copy();
+ complete = (m_current_ops == 0) && !m_updating_progress;
+ }
+
+ if (complete) {
+ finish(m_ret_val);
+ }
+}
+
+template <typename I>
+void ImageCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ put();
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::ImageCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/ImageCopyRequest.h b/src/librbd/deep_copy/ImageCopyRequest.h
new file mode 100644
index 000000000..cb8b83781
--- /dev/null
+++ b/src/librbd/deep_copy/ImageCopyRequest.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/bit_vector.hpp"
+#include "common/ceph_mutex.h"
+#include "common/RefCountedObj.h"
+#include "librbd/Types.h"
+#include "librbd/deep_copy/Types.h"
+#include <functional>
+#include <map>
+#include <queue>
+#include <set>
+#include <vector>
+#include <boost/optional.hpp>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace deep_copy {
+
+class Handler;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageCopyRequest : public RefCountedObject {
+public:
+ static ImageCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten,
+ const ObjectNumber &object_number,
+ const SnapSeqs &snap_seqs,
+ Handler *handler,
+ Context *on_finish) {
+ return new ImageCopyRequest(src_image_ctx, dst_image_ctx, src_snap_id_start,
+ src_snap_id_end, dst_snap_id_start, flatten,
+ object_number, snap_seqs, handler, on_finish);
+ }
+
+ ImageCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten, const ObjectNumber &object_number,
+ const SnapSeqs &snap_seqs, Handler *handler,
+ Context *on_finish);
+
+ void send();
+ void cancel();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * COMPUTE_DIFF
+ * |
+ * | . . . . .
+ * | . . (parallel execution of
+ * v v . multiple objects at once)
+ * COPY_OBJECT . . . .
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ librados::snap_t m_src_snap_id_start;
+ librados::snap_t m_src_snap_id_end;
+ librados::snap_t m_dst_snap_id_start;
+ bool m_flatten;
+ ObjectNumber m_object_number;
+ SnapSeqs m_snap_seqs;
+ Handler *m_handler;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ ceph::mutex m_lock;
+ bool m_canceled = false;
+
+ uint64_t m_object_no = 0;
+ uint64_t m_end_object_no = 0;
+ uint64_t m_current_ops = 0;
+ std::priority_queue<
+ uint64_t, std::vector<uint64_t>, std::greater<uint64_t>> m_copied_objects;
+ bool m_updating_progress = false;
+ SnapMap m_snap_map;
+ int m_ret_val = 0;
+
+ BitVector<2> m_object_diff_state;
+
+ void map_src_objects(uint64_t dst_object, std::set<uint64_t> *src_objects);
+
+ void compute_diff();
+ void handle_compute_diff(int r);
+
+ void send_object_copies();
+ void send_next_object_copy();
+ void handle_object_copy(uint64_t object_no, int r);
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::ImageCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/MetadataCopyRequest.cc b/src/librbd/deep_copy/MetadataCopyRequest.cc
new file mode 100644
index 000000000..c584bea54
--- /dev/null
+++ b/src/librbd/deep_copy/MetadataCopyRequest.cc
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MetadataCopyRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/Utils.h"
+#include "librbd/image/GetMetadataRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::MetadataCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+namespace {
+
+const uint64_t MAX_METADATA_ITEMS = 128;
+
+} // anonymous namespace
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+MetadataCopyRequest<I>::MetadataCopyRequest(I *src_image_ctx, I *dst_image_ctx,
+ Context *on_finish)
+ : m_src_image_ctx(src_image_ctx), m_dst_image_ctx(dst_image_ctx),
+ m_on_finish(on_finish), m_cct(dst_image_ctx->cct) {
+}
+
+template <typename I>
+void MetadataCopyRequest<I>::send() {
+ list_src_metadata();
+}
+
+template <typename I>
+void MetadataCopyRequest<I>::list_src_metadata() {
+ ldout(m_cct, 20) << "start_key=" << m_last_metadata_key << dendl;
+
+ m_metadata.clear();
+ auto ctx = create_context_callback<
+ MetadataCopyRequest<I>,
+ &MetadataCopyRequest<I>::handle_list_src_metadata>(this);
+ auto req = image::GetMetadataRequest<I>::create(
+ m_src_image_ctx->md_ctx, m_src_image_ctx->header_oid, true, "",
+ m_last_metadata_key, MAX_METADATA_ITEMS, &m_metadata, ctx);
+ req->send();
+}
+
+template <typename I>
+void MetadataCopyRequest<I>::handle_list_src_metadata(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve metadata: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_metadata.empty()) {
+ finish(0);
+ return;
+ }
+
+ m_last_metadata_key = m_metadata.rbegin()->first;
+ m_more_metadata = (m_metadata.size() >= MAX_METADATA_ITEMS);
+ set_dst_metadata();
+}
+
+template <typename I>
+void MetadataCopyRequest<I>::set_dst_metadata() {
+ ldout(m_cct, 20) << "count=" << m_metadata.size() << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::metadata_set(&op, m_metadata);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ MetadataCopyRequest<I>,
+ &MetadataCopyRequest<I>::handle_set_dst_metadata>(this);
+ m_dst_image_ctx->md_ctx.aio_operate(m_dst_image_ctx->header_oid, aio_comp,
+ &op);
+ aio_comp->release();
+}
+
+template <typename I>
+void MetadataCopyRequest<I>::handle_set_dst_metadata(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to set metadata: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_more_metadata) {
+ list_src_metadata();
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void MetadataCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::MetadataCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/MetadataCopyRequest.h b/src/librbd/deep_copy/MetadataCopyRequest.h
new file mode 100644
index 000000000..8db55db96
--- /dev/null
+++ b/src/librbd/deep_copy/MetadataCopyRequest.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/ImageCtx.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+namespace deep_copy {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class MetadataCopyRequest {
+public:
+ static MetadataCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ Context *on_finish) {
+ return new MetadataCopyRequest(src_image_ctx, dst_image_ctx, on_finish);
+ }
+
+ MetadataCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * LIST_SRC_METADATA <------\
+ * | | (repeat if additional
+ * v | metadata)
+ * SET_DST_METADATA --------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ typedef std::map<std::string, bufferlist> Metadata;
+
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ bufferlist m_out_bl;
+
+ std::map<std::string, bufferlist> m_metadata;
+ std::string m_last_metadata_key;
+ bool m_more_metadata = false;
+
+ void list_src_metadata();
+ void handle_list_src_metadata(int r);
+
+ void set_dst_metadata();
+ void handle_set_dst_metadata(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::MetadataCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/ObjectCopyRequest.cc b/src/librbd/deep_copy/ObjectCopyRequest.cc
new file mode 100644
index 000000000..e86ed5ea1
--- /dev/null
+++ b/src/librbd/deep_copy/ObjectCopyRequest.cc
@@ -0,0 +1,845 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ObjectCopyRequest.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/errno.h"
+#include "librados/snap_set_diff.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/deep_copy/Handler.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Utils.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::ObjectCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+using librbd::util::get_image_ctx;
+
+template <typename I>
+ObjectCopyRequest<I>::ObjectCopyRequest(I *src_image_ctx,
+ I *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t dst_snap_id_start,
+ const SnapMap &snap_map,
+ uint64_t dst_object_number,
+ uint32_t flags, Handler* handler,
+ Context *on_finish)
+ : m_src_image_ctx(src_image_ctx),
+ m_dst_image_ctx(dst_image_ctx), m_cct(dst_image_ctx->cct),
+ m_src_snap_id_start(src_snap_id_start),
+ m_dst_snap_id_start(dst_snap_id_start), m_snap_map(snap_map),
+ m_dst_object_number(dst_object_number), m_flags(flags),
+ m_handler(handler), m_on_finish(on_finish) {
+ ceph_assert(src_image_ctx->data_ctx.is_valid());
+ ceph_assert(dst_image_ctx->data_ctx.is_valid());
+ ceph_assert(!m_snap_map.empty());
+
+ m_src_async_op = new io::AsyncOperation();
+ m_src_async_op->start_op(*get_image_ctx(m_src_image_ctx));
+
+ m_src_io_ctx.dup(m_src_image_ctx->data_ctx);
+ m_dst_io_ctx.dup(m_dst_image_ctx->data_ctx);
+
+ m_dst_oid = m_dst_image_ctx->get_object_name(dst_object_number);
+
+ ldout(m_cct, 20) << "dst_oid=" << m_dst_oid << ", "
+ << "src_snap_id_start=" << m_src_snap_id_start << ", "
+ << "dst_snap_id_start=" << m_dst_snap_id_start << ", "
+ << "snap_map=" << m_snap_map << dendl;
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::send() {
+ send_list_snaps();
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::send_list_snaps() {
+ // image extents are consistent across src and dst so compute once
+ io::util::extent_to_file(
+ m_dst_image_ctx, m_dst_object_number, 0,
+ m_dst_image_ctx->layout.object_size, m_image_extents);
+ ldout(m_cct, 20) << "image_extents=" << m_image_extents << dendl;
+
+ auto ctx = create_async_context_callback(
+ *m_src_image_ctx, create_context_callback<
+ ObjectCopyRequest, &ObjectCopyRequest<I>::handle_list_snaps>(this));
+ if ((m_flags & OBJECT_COPY_REQUEST_FLAG_EXISTS_CLEAN) != 0) {
+ // skip listing the snaps if we know the destination exists and is clean,
+ // but we do need to update the object-map
+ ctx->complete(0);
+ return;
+ }
+
+ io::SnapIds snap_ids;
+ snap_ids.reserve(1 + m_snap_map.size());
+ snap_ids.push_back(m_src_snap_id_start);
+ for (auto& [src_snap_id, _] : m_snap_map) {
+ if (m_src_snap_id_start < src_snap_id) {
+ snap_ids.push_back(src_snap_id);
+ }
+ }
+
+ auto list_snaps_flags = io::LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT;
+
+ m_snapshot_delta.clear();
+
+ auto aio_comp = io::AioCompletion::create_and_start(
+ ctx, get_image_ctx(m_src_image_ctx), io::AIO_TYPE_GENERIC);
+ auto req = io::ImageDispatchSpec::create_list_snaps(
+ *m_src_image_ctx, io::IMAGE_DISPATCH_LAYER_NONE, aio_comp,
+ io::Extents{m_image_extents}, std::move(snap_ids), list_snaps_flags,
+ &m_snapshot_delta, {});
+ req->send();
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::handle_list_snaps(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to list snaps: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ ldout(m_cct, 20) << "snapshot_delta=" << m_snapshot_delta << dendl;
+
+ compute_dst_object_may_exist();
+ compute_read_ops();
+
+ send_read();
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::send_read() {
+ if (m_read_snaps.empty()) {
+ // all snapshots have been read
+ merge_write_ops();
+ compute_zero_ops();
+
+ send_update_object_map();
+ return;
+ }
+
+ auto index = *m_read_snaps.begin();
+ auto& read_op = m_read_ops[index];
+ if (read_op.image_interval.empty()) {
+ // nothing written to this object for this snapshot (must be trunc/remove)
+ handle_read(0);
+ return;
+ }
+
+ auto io_context = m_src_image_ctx->duplicate_data_io_context();
+ io_context->read_snap(index.second);
+
+ io::Extents image_extents{read_op.image_interval.begin(),
+ read_op.image_interval.end()};
+ io::ReadResult read_result{&read_op.image_extent_map,
+ &read_op.out_bl};
+
+ ldout(m_cct, 20) << "read: src_snap_seq=" << index.second << ", "
+ << "image_extents=" << image_extents << dendl;
+
+ int op_flags = (LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+
+ int read_flags = 0;
+ if (index.second != m_src_image_ctx->snap_id) {
+ read_flags |= io::READ_FLAG_DISABLE_CLIPPING;
+ }
+
+ auto ctx = create_context_callback<
+ ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_read>(this);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ ctx, get_image_ctx(m_src_image_ctx), io::AIO_TYPE_READ);
+
+ auto req = io::ImageDispatchSpec::create_read(
+ *m_src_image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, aio_comp,
+ std::move(image_extents), std::move(read_result), io_context, op_flags,
+ read_flags, {});
+ req->send();
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::handle_read(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to read from source object: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_handler != nullptr) {
+ auto index = *m_read_snaps.begin();
+ auto& read_op = m_read_ops[index];
+ m_handler->handle_read(read_op.out_bl.length());
+ }
+
+ ceph_assert(!m_read_snaps.empty());
+ m_read_snaps.erase(m_read_snaps.begin());
+
+ send_read();
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::send_update_object_map() {
+ if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP) ||
+ m_dst_object_state.empty()) {
+ process_copyup();
+ return;
+ }
+
+ m_dst_image_ctx->owner_lock.lock_shared();
+ m_dst_image_ctx->image_lock.lock_shared();
+ if (m_dst_image_ctx->object_map == nullptr) {
+ // possible that exclusive lock was lost in background
+ lderr(m_cct) << "object map is not initialized" << dendl;
+
+ m_dst_image_ctx->image_lock.unlock_shared();
+ m_dst_image_ctx->owner_lock.unlock_shared();
+ finish(-EINVAL);
+ return;
+ }
+
+ auto &dst_object_state = *m_dst_object_state.begin();
+ auto it = m_snap_map.find(dst_object_state.first);
+ ceph_assert(it != m_snap_map.end());
+ auto dst_snap_id = it->second.front();
+ auto object_state = dst_object_state.second;
+ m_dst_object_state.erase(m_dst_object_state.begin());
+
+ ldout(m_cct, 20) << "dst_snap_id=" << dst_snap_id << ", object_state="
+ << static_cast<uint32_t>(object_state) << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ m_dst_image_ctx->image_lock.unlock_shared();
+ m_dst_image_ctx->owner_lock.unlock_shared();
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_update_object_map(r);
+ finish_op_ctx->complete(0);
+ });
+
+ auto dst_image_ctx = m_dst_image_ctx;
+ bool sent = dst_image_ctx->object_map->template aio_update<
+ Context, &Context::complete>(dst_snap_id, m_dst_object_number, object_state,
+ {}, {}, false, ctx);
+
+ // NOTE: state machine might complete before we reach here
+ dst_image_ctx->image_lock.unlock_shared();
+ dst_image_ctx->owner_lock.unlock_shared();
+ if (!sent) {
+ ceph_assert(dst_snap_id == CEPH_NOSNAP);
+ ctx->complete(0);
+ }
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::handle_update_object_map(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to update object map: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (!m_dst_object_state.empty()) {
+ send_update_object_map();
+ return;
+ }
+
+ process_copyup();
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::process_copyup() {
+ if (m_snapshot_sparse_bufferlist.empty()) {
+ // no data to copy or truncate/zero. only the copyup state machine cares
+ // about whether the object exists or not, and it always copies from
+ // snap id 0.
+ finish(m_src_snap_id_start > 0 ? 0 : -ENOENT);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ // let dispatch layers have a chance to process the data but
+ // assume that the dispatch layer will only touch the sparse bufferlist
+ auto r = m_dst_image_ctx->io_object_dispatcher->prepare_copyup(
+ m_dst_object_number, &m_snapshot_sparse_bufferlist);
+ if (r < 0) {
+ lderr(m_cct) << "failed to prepare copyup data: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ send_write_object();
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::send_write_object() {
+ ceph_assert(!m_snapshot_sparse_bufferlist.empty());
+ auto& sparse_bufferlist = m_snapshot_sparse_bufferlist.begin()->second;
+
+ m_src_image_ctx->image_lock.lock_shared();
+ bool hide_parent = (m_src_snap_id_start == 0 &&
+ m_src_image_ctx->parent != nullptr);
+ m_src_image_ctx->image_lock.unlock_shared();
+
+ // retrieve the destination snap context for the op
+ SnapIds dst_snap_ids;
+ librados::snap_t dst_snap_seq = 0;
+ librados::snap_t src_snap_seq = m_snapshot_sparse_bufferlist.begin()->first;
+ if (src_snap_seq != 0) {
+ auto snap_map_it = m_snap_map.find(src_snap_seq);
+ ceph_assert(snap_map_it != m_snap_map.end());
+
+ auto dst_snap_id = snap_map_it->second.front();
+ auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_id);
+ ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end());
+ if (!dst_may_exist_it->second && !sparse_bufferlist.empty()) {
+ // if the object cannot exist, the only valid op is to remove it
+ ldout(m_cct, 20) << "object DNE: src_snap_seq=" << src_snap_seq << dendl;
+ ceph_assert(sparse_bufferlist.ext_count() == 1U);
+ ceph_assert(sparse_bufferlist.begin().get_val().state ==
+ io::SPARSE_EXTENT_STATE_ZEROED &&
+ sparse_bufferlist.begin().get_off() == 0 &&
+ sparse_bufferlist.begin().get_len() ==
+ m_dst_image_ctx->layout.object_size);
+ }
+
+    // the snap context for the write must predate the snapshot being copied
+ ceph_assert(!snap_map_it->second.empty());
+ auto dst_snap_ids_it = snap_map_it->second.begin();
+ ++dst_snap_ids_it;
+
+ dst_snap_ids = SnapIds{dst_snap_ids_it, snap_map_it->second.end()};
+ if (!dst_snap_ids.empty()) {
+ dst_snap_seq = dst_snap_ids.front();
+ }
+ ceph_assert(dst_snap_seq != CEPH_NOSNAP);
+ }
+
+ ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", "
+ << "dst_snap_seq=" << dst_snap_seq << ", "
+ << "dst_snaps=" << dst_snap_ids << dendl;
+
+ librados::ObjectWriteOperation op;
+
+ bool migration = ((m_flags & OBJECT_COPY_REQUEST_FLAG_MIGRATION) != 0);
+ if (migration) {
+ ldout(m_cct, 20) << "assert_snapc_seq=" << dst_snap_seq << dendl;
+ cls_client::assert_snapc_seq(&op, dst_snap_seq,
+ cls::rbd::ASSERT_SNAPC_SEQ_GT_SNAPSET_SEQ);
+ }
+
+ for (auto& sbe : sparse_bufferlist) {
+ switch (sbe.get_val().state) {
+ case io::SPARSE_EXTENT_STATE_DATA:
+ ldout(m_cct, 20) << "write op: " << sbe.get_off() << "~"
+ << sbe.get_len() << dendl;
+ op.write(sbe.get_off(), std::move(sbe.get_val().bl));
+ op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+ break;
+ case io::SPARSE_EXTENT_STATE_ZEROED:
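+        // a zeroed extent reaching the object's end maps to truncate (or
+        // remove/create+truncate for the whole object); interior extents
+        // must be zeroed explicitly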
+ if (sbe.get_off() + sbe.get_len() ==
+ m_dst_image_ctx->layout.object_size) {
+ if (sbe.get_off() == 0) {
+ if (hide_parent) {
+ ldout(m_cct, 20) << "create+truncate op" << dendl;
+ op.create(false);
+ op.truncate(0);
+ } else {
+ ldout(m_cct, 20) << "remove op" << dendl;
+ op.remove();
+ }
+ } else {
+ ldout(m_cct, 20) << "trunc op: " << sbe.get_off() << dendl;
+ op.truncate(sbe.get_off());
+ }
+ } else {
+ ldout(m_cct, 20) << "zero op: " << sbe.get_off() << "~"
+ << sbe.get_len() << dendl;
+ op.zero(sbe.get_off(), sbe.get_len());
+ }
+ break;
+ default:
+ ceph_abort();
+ }
+ }
+
+ if (op.size() == (migration ? 1 : 0)) {
+ handle_write_object(0);
+ return;
+ }
+
+ int r;
+ Context *finish_op_ctx;
+ {
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r);
+ }
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_write_object(r);
+ finish_op_ctx->complete(0);
+ });
+ librados::AioCompletion *comp = create_rados_callback(ctx);
+ r = m_dst_io_ctx.aio_operate(m_dst_oid, comp, &op, dst_snap_seq, dst_snap_ids,
+ nullptr);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::handle_write_object(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ r = 0;
+ } else if (r == -ERANGE) {
+ ldout(m_cct, 10) << "concurrent deep copy" << dendl;
+ r = 0;
+ }
+ if (r < 0) {
+ lderr(m_cct) << "failed to write to destination object: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ m_snapshot_sparse_bufferlist.erase(m_snapshot_sparse_bufferlist.begin());
+ if (!m_snapshot_sparse_bufferlist.empty()) {
+ send_write_object();
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+Context *ObjectCopyRequest<I>::start_lock_op(ceph::shared_mutex &owner_lock,
+ int* r) {
+ ceph_assert(ceph_mutex_is_locked(m_dst_image_ctx->owner_lock));
+ if (m_dst_image_ctx->exclusive_lock == nullptr) {
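+    // exclusive lock is not in use: nothing to guard, return a no-op context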
+ return new LambdaContext([](int r) {});
+ }
+ return m_dst_image_ctx->exclusive_lock->start_op(r);
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::compute_read_ops() {
+ ldout(m_cct, 20) << dendl;
+
+ m_src_image_ctx->image_lock.lock_shared();
+ bool read_from_parent = (m_src_snap_id_start == 0 &&
+ m_src_image_ctx->parent != nullptr);
+ m_src_image_ctx->image_lock.unlock_shared();
+
+ bool only_dne_extents = true;
+ interval_set<uint64_t> dne_image_interval;
+
+ // compute read ops for any data sections or for any extents that we need to
+ // read from our parent
+ for (auto& [key, image_intervals] : m_snapshot_delta) {
+ io::WriteReadSnapIds write_read_snap_ids{key};
+
+ // advance the src write snap id to the first valid snap id
+ if (write_read_snap_ids.first > m_src_snap_id_start) {
+ // don't attempt to read from snapshots that shouldn't exist in
+ // case the OSD fails to give a correct snap list
+ auto snap_map_it = m_snap_map.find(write_read_snap_ids.first);
+ ceph_assert(snap_map_it != m_snap_map.end());
+ auto dst_snap_seq = snap_map_it->second.front();
+
+ auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_seq);
+ ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end());
+ if (!dst_may_exist_it->second) {
+ ldout(m_cct, 20) << "DNE snapshot: " << write_read_snap_ids.first
+ << dendl;
+ continue;
+ }
+ }
+
+ for (auto& image_interval : image_intervals) {
+ auto state = image_interval.get_val().state;
+ switch (state) {
+ case io::SPARSE_EXTENT_STATE_DNE:
+ if (write_read_snap_ids == io::INITIAL_WRITE_READ_SNAP_IDS &&
+ read_from_parent) {
+ // special case for DNE initial object extents: when flattening we
+ // need to read the data from the parent image's extents
+ ldout(m_cct, 20) << "DNE extent: "
+ << image_interval.get_off() << "~"
+ << image_interval.get_len() << dendl;
+ dne_image_interval.insert(
+ image_interval.get_off(), image_interval.get_len());
+ }
+ break;
+ case io::SPARSE_EXTENT_STATE_ZEROED:
+ only_dne_extents = false;
+ break;
+ case io::SPARSE_EXTENT_STATE_DATA:
+ ldout(m_cct, 20) << "read op: "
+ << "snap_ids=" << write_read_snap_ids << " "
+ << image_interval.get_off() << "~"
+ << image_interval.get_len() << dendl;
+ m_read_ops[write_read_snap_ids].image_interval.union_insert(
+ image_interval.get_off(), image_interval.get_len());
+ only_dne_extents = false;
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ }
+ }
+
+ bool flatten = ((m_flags & OBJECT_COPY_REQUEST_FLAG_FLATTEN) != 0);
+ if (!dne_image_interval.empty() && (!only_dne_extents || flatten)) {
+ auto snap_map_it = m_snap_map.begin();
+ ceph_assert(snap_map_it != m_snap_map.end());
+
+ auto src_snap_seq = snap_map_it->first;
+ WriteReadSnapIds write_read_snap_ids{src_snap_seq, src_snap_seq};
+
+ // prepare to prune the extents to the maximum parent overlap
+ m_src_image_ctx->image_lock.lock_shared();
+ uint64_t src_parent_overlap = 0;
+ int r = m_src_image_ctx->get_parent_overlap(src_snap_seq,
+ &src_parent_overlap);
+ m_src_image_ctx->image_lock.unlock_shared();
+
+ if (r < 0) {
+ ldout(m_cct, 5) << "failed getting parent overlap for snap_id: "
+ << src_snap_seq << ": " << cpp_strerror(r) << dendl;
+ } else {
+ ldout(m_cct, 20) << "parent overlap=" << src_parent_overlap << dendl;
+ for (auto& [image_offset, image_length] : dne_image_interval) {
+ auto end_image_offset = std::min(
+ image_offset + image_length, src_parent_overlap);
+ if (image_offset >= end_image_offset) {
+ // starting offset is beyond the end of the parent overlap
+ continue;
+ }
+
+ image_length = end_image_offset - image_offset;
+ ldout(m_cct, 20) << "parent read op: "
+ << "snap_ids=" << write_read_snap_ids << " "
+ << image_offset << "~" << image_length << dendl;
+ m_read_ops[write_read_snap_ids].image_interval.union_insert(
+ image_offset, image_length);
+ }
+ }
+ }
+
+ for (auto& [write_read_snap_ids, _] : m_read_ops) {
+ m_read_snaps.push_back(write_read_snap_ids);
+ }
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::merge_write_ops() {
+ ldout(m_cct, 20) << dendl;
+
+ for (auto& [write_read_snap_ids, read_op] : m_read_ops) {
+ auto src_snap_seq = write_read_snap_ids.first;
+
+ // convert the resulting sparse image extent map to an interval ...
+ auto& image_data_interval = m_dst_data_interval[src_snap_seq];
+ for (auto [image_offset, image_length] : read_op.image_extent_map) {
+ image_data_interval.union_insert(image_offset, image_length);
+ }
+
+ // ... and compute the difference between it and the image extents since
+ // that indicates zeroed extents
+ interval_set<uint64_t> intersection;
+ intersection.intersection_of(read_op.image_interval, image_data_interval);
+ read_op.image_interval.subtract(intersection);
+
+ for (auto& [image_offset, image_length] : read_op.image_interval) {
+ ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", "
+ << "inserting sparse-read zero " << image_offset << "~"
+ << image_length << dendl;
+ m_dst_zero_interval[src_snap_seq].union_insert(
+ image_offset, image_length);
+ }
+
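+ // out_bl holds the concatenated data for every extent in this read op;
+ // walk it once, carving one slice per destination object extent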
+ uint64_t buffer_offset = 0;
+ for (auto [image_offset, image_length] : read_op.image_extent_map) {
+ // convert image extents back to object extents for the write op
+ striper::LightweightObjectExtents object_extents;
+ io::util::file_to_extents(m_dst_image_ctx, image_offset,
+ image_length, buffer_offset, &object_extents);
+ for (auto& object_extent : object_extents) {
+ ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", "
+ << "object_offset=" << object_extent.offset << ", "
+ << "object_length=" << object_extent.length << dendl;
+
+ bufferlist sub_bl;
+ sub_bl.substr_of(read_op.out_bl, buffer_offset, object_extent.length);
+
+ m_snapshot_sparse_bufferlist[src_snap_seq].insert(
+ object_extent.offset, object_extent.length,
+ {io::SPARSE_EXTENT_STATE_DATA, object_extent.length,
+ std::move(sub_bl)});
+
+ buffer_offset += object_extent.length;
+ }
+ }
+ }
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::compute_zero_ops() {
+ ldout(m_cct, 20) << dendl;
+
+ m_src_image_ctx->image_lock.lock_shared();
+ bool hide_parent = (m_src_snap_id_start == 0 &&
+ m_src_image_ctx->parent != nullptr);
+ m_src_image_ctx->image_lock.unlock_shared();
+
+ // ensure we have a zeroed interval for each snapshot
+ for (auto& [src_snap_seq, _] : m_snap_map) {
+ if (m_src_snap_id_start < src_snap_seq) {
+ m_dst_zero_interval[src_snap_seq];
+ }
+ }
+
+ // the object is assumed to exist if we are copying from an arbitrary
+ // (non-zero) start snapshot and no delta was recorded in the start
+ // snapshot slot (a delta there would imply the object DNE)
+ bool object_exists = (
+ m_src_snap_id_start > 0 &&
+ m_snapshot_delta.count({m_src_snap_id_start, m_src_snap_id_start}) == 0);
+ bool fast_diff = m_dst_image_ctx->test_features(RBD_FEATURE_FAST_DIFF);
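+ // tracks the object's size as of the previous snapshot so truncates can
+ // be distinguished from plain zero extents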
+ uint64_t prev_end_size = 0;
+
+ // compute zero ops from the zeroed intervals
+ for (auto &it : m_dst_zero_interval) {
+ auto src_snap_seq = it.first;
+ auto &zero_interval = it.second;
+
+ auto snap_map_it = m_snap_map.find(src_snap_seq);
+ ceph_assert(snap_map_it != m_snap_map.end());
+ auto dst_snap_seq = snap_map_it->second.front();
+
+ auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_seq);
+ ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end());
+ if (!dst_may_exist_it->second && object_exists) {
+ ldout(m_cct, 5) << "object DNE for snap_id: " << dst_snap_seq << dendl;
+ m_snapshot_sparse_bufferlist[src_snap_seq].insert(
+ 0, m_dst_image_ctx->layout.object_size,
+ {io::SPARSE_EXTENT_STATE_ZEROED, m_dst_image_ctx->layout.object_size});
+ object_exists = false;
+ prev_end_size = 0;
+ continue;
+ }
+
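+ // the parent only needs to be hidden while the destination object still
+ // overlaps the parent at this snapshot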
+ if (hide_parent) {
+ std::shared_lock image_locker{m_dst_image_ctx->image_lock};
+ uint64_t parent_overlap = 0;
+ int r = m_dst_image_ctx->get_parent_overlap(dst_snap_seq,
+ &parent_overlap);
+ if (r < 0) {
+ ldout(m_cct, 5) << "failed getting parent overlap for snap_id: "
+ << dst_snap_seq << ": " << cpp_strerror(r) << dendl;
+ }
+ if (parent_overlap == 0) {
+ ldout(m_cct, 20) << "no parent overlap" << dendl;
+ hide_parent = false;
+ } else {
+ auto image_extents = m_image_extents;
+ uint64_t overlap = m_dst_image_ctx->prune_parent_extents(
+ image_extents, parent_overlap);
+ if (overlap == 0) {
+ ldout(m_cct, 20) << "no parent overlap" << dendl;
+ hide_parent = false;
+ }
+ }
+ }
+
+ // collect known zeroed extents from the snapshot delta for the current
+ // src snapshot. If this is the first snapshot, we might need to handle
+ // the whiteout case if it overlaps with the parent
+ auto first_src_snap_id = m_snap_map.begin()->first;
+ auto snapshot_delta_it = m_snapshot_delta.lower_bound(
+ {(hide_parent && src_snap_seq == first_src_snap_id ?
+ 0 : src_snap_seq), 0});
+ for (; snapshot_delta_it != m_snapshot_delta.end() &&
+ snapshot_delta_it->first.first <= src_snap_seq;
+ ++snapshot_delta_it) {
+ auto& write_read_snap_ids = snapshot_delta_it->first;
+ auto& image_intervals = snapshot_delta_it->second;
+ for (auto& image_interval : image_intervals) {
+ auto state = image_interval.get_val().state;
+ switch (state) {
+ case io::SPARSE_EXTENT_STATE_ZEROED:
+ if (write_read_snap_ids != io::INITIAL_WRITE_READ_SNAP_IDS) {
+ ldout(m_cct, 20) << "zeroed extent: "
+ << "src_snap_seq=" << src_snap_seq << " "
+ << image_interval.get_off() << "~"
+ << image_interval.get_len() << dendl;
+ zero_interval.union_insert(
+ image_interval.get_off(), image_interval.get_len());
+ } else if (hide_parent &&
+ write_read_snap_ids == io::INITIAL_WRITE_READ_SNAP_IDS) {
+ ldout(m_cct, 20) << "zeroed (hide parent) extent: "
+ << "src_snap_seq=" << src_snap_seq << " "
+ << image_interval.get_off() << "~"
+ << image_interval.get_len() << dendl;
+ zero_interval.union_insert(
+ image_interval.get_off(), image_interval.get_len());
+ }
+ break;
+ case io::SPARSE_EXTENT_STATE_DNE:
+ case io::SPARSE_EXTENT_STATE_DATA:
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ }
+ }
+
+ // subtract any data intervals from our zero intervals
+ auto& data_interval = m_dst_data_interval[src_snap_seq];
+ interval_set<uint64_t> intersection;
+ intersection.intersection_of(zero_interval, data_interval);
+ zero_interval.subtract(intersection);
+
+ // update end_size if there are writes into higher offsets
+ uint64_t end_size = prev_end_size;
+ auto iter = m_snapshot_sparse_bufferlist.find(src_snap_seq);
+ if (iter != m_snapshot_sparse_bufferlist.end()) {
+ for (auto &sparse_bufferlist : iter->second) {
+ object_exists = true;
+ end_size = std::max(
+ end_size, sparse_bufferlist.get_off() + sparse_bufferlist.get_len());
+ }
+ }
+
+ ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", "
+ << "dst_snap_seq=" << dst_snap_seq << ", "
+ << "zero_interval=" << zero_interval << ", "
+ << "end_size=" << end_size << dendl;
+ for (auto z = zero_interval.begin(); z != zero_interval.end(); ++z) {
+ // convert image extents back to object extents for the write op
+ striper::LightweightObjectExtents object_extents;
+ io::util::file_to_extents(m_dst_image_ctx, z.get_start(), z.get_len(), 0,
+ &object_extents);
+ for (auto& object_extent : object_extents) {
+ ceph_assert(object_extent.offset + object_extent.length <=
+ m_dst_image_ctx->layout.object_size);
+
+ if (object_extent.offset + object_extent.length >= end_size) {
+ // zero interval at the object end
+ if ((object_extent.offset == 0 && hide_parent) ||
+ (object_extent.offset < prev_end_size)) {
+ ldout(m_cct, 20) << "truncate " << object_extent.offset
+ << dendl;
+ auto length =
+ m_dst_image_ctx->layout.object_size - object_extent.offset;
+ m_snapshot_sparse_bufferlist[src_snap_seq].insert(
+ object_extent.offset, length,
+ {io::SPARSE_EXTENT_STATE_ZEROED, length});
+ }
+
+ object_exists = (object_extent.offset > 0 || hide_parent);
+ end_size = std::min(end_size, object_extent.offset);
+ } else {
+ // zero interval inside the object
+ ldout(m_cct, 20) << "zero "
+ << object_extent.offset << "~"
+ << object_extent.length << dendl;
+ m_snapshot_sparse_bufferlist[src_snap_seq].insert(
+ object_extent.offset, object_extent.length,
+ {io::SPARSE_EXTENT_STATE_ZEROED, object_extent.length});
+ object_exists = true;
+ }
+ }
+ }
+
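+ // with fast-diff, an object that exists but saw no writes at this
+ // snapshot is marked EXISTS_CLEAN instead of EXISTS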
+ uint8_t dst_object_map_state = OBJECT_NONEXISTENT;
+ if (object_exists) {
+ dst_object_map_state = OBJECT_EXISTS;
+ if (fast_diff && m_snapshot_sparse_bufferlist.count(src_snap_seq) == 0) {
+ dst_object_map_state = OBJECT_EXISTS_CLEAN;
+ }
+ m_dst_object_state[src_snap_seq] = dst_object_map_state;
+ }
+
+ ldout(m_cct, 20) << "dst_snap_seq=" << dst_snap_seq << ", "
+ << "end_size=" << end_size << ", "
+ << "dst_object_map_state="
+ << static_cast<uint32_t>(dst_object_map_state) << dendl;
+ prev_end_size = end_size;
+ }
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ // deleting this request closes its IoCtxs, so stash the completion
+ // callback and invoke it last
+ auto on_finish = m_on_finish;
+
+ m_src_async_op->finish_op();
+ delete m_src_async_op;
+ delete this;
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+void ObjectCopyRequest<I>::compute_dst_object_may_exist() {
+ std::shared_lock image_locker{m_dst_image_ctx->image_lock};
+
+ auto snap_ids = m_dst_image_ctx->snaps;
+ snap_ids.push_back(CEPH_NOSNAP);
+
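+ // the object can only exist at a snapshot if its object number falls
+ // within the object count implied by the image size at that snapshot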
+ for (auto snap_id : snap_ids) {
+ m_dst_object_may_exist[snap_id] =
+ (m_dst_object_number < m_dst_image_ctx->get_object_count(snap_id));
+ }
+
+ ldout(m_cct, 20) << "dst_object_may_exist=" << m_dst_object_may_exist
+ << dendl;
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::ObjectCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/ObjectCopyRequest.h b/src/librbd/deep_copy/ObjectCopyRequest.h
new file mode 100644
index 000000000..7a89e333e
--- /dev/null
+++ b/src/librbd/deep_copy/ObjectCopyRequest.h
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/interval_set.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/deep_copy/Types.h"
+#include "librbd/io/Types.h"
+#include <list>
+#include <map>
+#include <string>
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+namespace io { class AsyncOperation; }
+
+namespace deep_copy {
+
+struct Handler;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ObjectCopyRequest {
+public:
+ static ObjectCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t dst_snap_id_start,
+ const SnapMap &snap_map,
+ uint64_t object_number, uint32_t flags,
+ Handler* handler, Context *on_finish) {
+ return new ObjectCopyRequest(src_image_ctx, dst_image_ctx,
+ src_snap_id_start, dst_snap_id_start, snap_map,
+ object_number, flags, handler, on_finish);
+ }
+
+ ObjectCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t dst_snap_id_start, const SnapMap &snap_map,
+ uint64_t object_number, uint32_t flags, Handler* handler,
+ Context *on_finish);
+
+ void send();
+
+ // testing support
+ inline librados::IoCtx &get_src_io_ctx() {
+ return m_src_io_ctx;
+ }
+ inline librados::IoCtx &get_dst_io_ctx() {
+ return m_dst_io_ctx;
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * LIST_SNAPS
+ * |
+ * |/---------\
+ * | | (repeat for each snapshot)
+ * v |
+ * READ ---------/
+ * |
+ * | /-----------\
+ * | | | (repeat for each snapshot)
+ * v v |
+ * UPDATE_OBJECT_MAP ---/ (skip if object
+ * | map disabled)
+ * | /-----------\
+ * | | | (repeat for each snapshot)
+ * v v |
+ * WRITE_OBJECT --------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ enum WriteOpType {
+ WRITE_OP_TYPE_WRITE,
+ WRITE_OP_TYPE_ZERO
+ };
+
+ struct ReadOp {
+ interval_set<uint64_t> image_interval;
+ io::Extents image_extent_map;
+ bufferlist out_bl;
+ };
+
+ typedef std::pair<librados::snap_t, librados::snap_t> WriteReadSnapIds;
+
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ CephContext *m_cct;
+ librados::snap_t m_src_snap_id_start;
+ librados::snap_t m_dst_snap_id_start;
+ SnapMap m_snap_map;
+ uint64_t m_dst_object_number;
+ uint32_t m_flags;
+ Handler* m_handler;
+ Context *m_on_finish;
+
+ decltype(m_src_image_ctx->data_ctx) m_src_io_ctx;
+ decltype(m_dst_image_ctx->data_ctx) m_dst_io_ctx;
+ std::string m_dst_oid;
+
+ io::Extents m_image_extents;
+
+ io::SnapshotDelta m_snapshot_delta;
+
+ std::map<WriteReadSnapIds, ReadOp> m_read_ops;
+ std::list<WriteReadSnapIds> m_read_snaps;
+ io::SnapshotSparseBufferlist m_snapshot_sparse_bufferlist;
+
+ std::map<librados::snap_t, interval_set<uint64_t>> m_dst_data_interval;
+ std::map<librados::snap_t, interval_set<uint64_t>> m_dst_zero_interval;
+ std::map<librados::snap_t, uint8_t> m_dst_object_state;
+ std::map<librados::snap_t, bool> m_dst_object_may_exist;
+
+ io::AsyncOperation* m_src_async_op = nullptr;
+
+ void send_list_snaps();
+ void handle_list_snaps(int r);
+
+ void send_read();
+ void handle_read(int r);
+
+ void send_update_object_map();
+ void handle_update_object_map(int r);
+
+ void process_copyup();
+ void send_write_object();
+ void handle_write_object(int r);
+
+ Context *start_lock_op(ceph::shared_mutex &owner_lock, int* r);
+
+ void compute_read_ops();
+ void merge_write_ops();
+ void compute_zero_ops();
+
+ void compute_dst_object_may_exist();
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::ObjectCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/SetHeadRequest.cc b/src/librbd/deep_copy/SetHeadRequest.cc
new file mode 100644
index 000000000..1e056b958
--- /dev/null
+++ b/src/librbd/deep_copy/SetHeadRequest.cc
@@ -0,0 +1,223 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SetHeadRequest.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/Utils.h"
+#include "librbd/image/AttachParentRequest.h"
+#include "librbd/image/DetachParentRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::SetHeadRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+SetHeadRequest<I>::SetHeadRequest(I *image_ctx, uint64_t size,
+ const cls::rbd::ParentImageSpec &spec,
+ uint64_t parent_overlap,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_size(size), m_parent_spec(spec),
+ m_parent_overlap(parent_overlap), m_on_finish(on_finish),
+ m_cct(image_ctx->cct) {
+ ceph_assert(m_parent_overlap <= m_size);
+}
+
+template <typename I>
+void SetHeadRequest<I>::send() {
+ send_set_size();
+}
+
+template <typename I>
+void SetHeadRequest<I>::send_set_size() {
+ m_image_ctx->image_lock.lock_shared();
+ if (m_image_ctx->size == m_size) {
+ m_image_ctx->image_lock.unlock_shared();
+ send_detach_parent();
+ return;
+ }
+ m_image_ctx->image_lock.unlock_shared();
+
+ ldout(m_cct, 20) << dendl;
+
+ // Change the image size on disk so that the snapshot picks up
+ // the expected size. We can do this because the last snapshot
+ // we process is the sync snapshot, which was created to match the
+ // image size. We also don't need to worry about trimming because
+ // we track the highest possible object number within the sync record.
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::set_size(&op, m_size);
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_set_size(r);
+ finish_op_ctx->complete(0);
+ });
+ librados::AioCompletion *comp = create_rados_callback(ctx);
+ r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void SetHeadRequest<I>::handle_set_size(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to update image size: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ // adjust in-memory image size now that it's updated on disk
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ if (m_image_ctx->size > m_size) {
+ if (m_image_ctx->parent_md.spec.pool_id != -1 &&
+ m_image_ctx->parent_md.overlap > m_size) {
+ m_image_ctx->parent_md.overlap = m_size;
+ }
+ }
+ m_image_ctx->size = m_size;
+ }
+
+ send_detach_parent();
+}
+
+template <typename I>
+void SetHeadRequest<I>::send_detach_parent() {
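+ // nothing to detach if there is no parent or if the parent already
+ // matches the desired spec and overlap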
+ m_image_ctx->image_lock.lock_shared();
+ if (m_image_ctx->parent_md.spec.pool_id == -1 ||
+ (m_image_ctx->parent_md.spec == m_parent_spec &&
+ m_image_ctx->parent_md.overlap == m_parent_overlap)) {
+ m_image_ctx->image_lock.unlock_shared();
+ send_attach_parent();
+ return;
+ }
+ m_image_ctx->image_lock.unlock_shared();
+
+ ldout(m_cct, 20) << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_detach_parent(r);
+ finish_op_ctx->complete(0);
+ });
+ auto req = image::DetachParentRequest<I>::create(*m_image_ctx, ctx);
+ req->send();
+}
+
+template <typename I>
+void SetHeadRequest<I>::handle_detach_parent(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove parent: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ // adjust in-memory parent now that it's updated on disk
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ m_image_ctx->parent_md.spec = {};
+ m_image_ctx->parent_md.overlap = 0;
+ }
+
+ send_attach_parent();
+}
+
+template <typename I>
+void SetHeadRequest<I>::send_attach_parent() {
+ m_image_ctx->image_lock.lock_shared();
+ if (m_image_ctx->parent_md.spec == m_parent_spec &&
+ m_image_ctx->parent_md.overlap == m_parent_overlap) {
+ m_image_ctx->image_lock.unlock_shared();
+ finish(0);
+ return;
+ }
+ m_image_ctx->image_lock.unlock_shared();
+
+ ldout(m_cct, 20) << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_attach_parent(r);
+ finish_op_ctx->complete(0);
+ });
+ auto req = image::AttachParentRequest<I>::create(
+ *m_image_ctx, m_parent_spec, m_parent_overlap, false, ctx);
+ req->send();
+}
+
+template <typename I>
+void SetHeadRequest<I>::handle_attach_parent(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ // adjust in-memory parent now that it's updated on disk
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ m_image_ctx->parent_md.spec = m_parent_spec;
+ m_image_ctx->parent_md.overlap = m_parent_overlap;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+Context *SetHeadRequest<I>::start_lock_op(int* r) {
+ std::shared_lock owner_locker{m_image_ctx->owner_lock};
+ if (m_image_ctx->exclusive_lock == nullptr) {
+ return new LambdaContext([](int r) {});
+ }
+ return m_image_ctx->exclusive_lock->start_op(r);
+}
+
+template <typename I>
+void SetHeadRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::SetHeadRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/SetHeadRequest.h b/src/librbd/deep_copy/SetHeadRequest.h
new file mode 100644
index 000000000..9a17c9fd0
--- /dev/null
+++ b/src/librbd/deep_copy/SetHeadRequest.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <map>
+#include <set>
+#include <string>
+#include <tuple>
+
+class Context;
+
+namespace librbd {
+namespace deep_copy {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SetHeadRequest {
+public:
+ static SetHeadRequest* create(ImageCtxT *image_ctx, uint64_t size,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ uint64_t parent_overlap,
+ Context *on_finish) {
+ return new SetHeadRequest(image_ctx, size, parent_spec, parent_overlap,
+ on_finish);
+ }
+
+ SetHeadRequest(ImageCtxT *image_ctx, uint64_t size,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ uint64_t parent_overlap, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (skip if not needed)
+ * SET_SIZE
+ * |
+ * v (skip if not needed)
+ * DETACH_PARENT
+ * |
+ * v (skip if not needed)
+ * ATTACH_PARENT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_size;
+ cls::rbd::ParentImageSpec m_parent_spec;
+ uint64_t m_parent_overlap;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+
+ void send_set_size();
+ void handle_set_size(int r);
+
+ void send_detach_parent();
+ void handle_detach_parent(int r);
+
+ void send_attach_parent();
+ void handle_attach_parent(int r);
+
+ Context *start_lock_op(int* r);
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::SetHeadRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H
diff --git a/src/librbd/deep_copy/SnapshotCopyRequest.cc b/src/librbd/deep_copy/SnapshotCopyRequest.cc
new file mode 100644
index 000000000..9ff02644d
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCopyRequest.cc
@@ -0,0 +1,731 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SnapshotCopyRequest.h"
+#include "SetHeadRequest.h"
+#include "SnapshotCreateRequest.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::SnapshotCopyRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+namespace {
+
+template <typename I>
+const std::string &get_snapshot_name(I *image_ctx, librados::snap_t snap_id) {
+ auto snap_it = std::find_if(image_ctx->snap_ids.begin(),
+ image_ctx->snap_ids.end(),
+ [snap_id](
+ const std::pair<
+ std::pair<cls::rbd::SnapshotNamespace,
+ std::string>,
+ librados::snap_t> &pair) {
+ return pair.second == snap_id;
+ });
+ ceph_assert(snap_it != image_ctx->snap_ids.end());
+ return snap_it->first.second;
+}
+
+} // anonymous namespace
+
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+
+template <typename I>
+SnapshotCopyRequest<I>::SnapshotCopyRequest(I *src_image_ctx,
+ I *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten,
+ asio::ContextWQ *work_queue,
+ SnapSeqs *snap_seqs,
+ Context *on_finish)
+ : RefCountedObject(dst_image_ctx->cct), m_src_image_ctx(src_image_ctx),
+ m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start),
+ m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start),
+ m_flatten(flatten), m_work_queue(work_queue), m_snap_seqs_result(snap_seqs),
+ m_snap_seqs(*snap_seqs), m_on_finish(on_finish), m_cct(dst_image_ctx->cct),
+ m_lock(ceph::make_mutex(unique_lock_name("SnapshotCopyRequest::m_lock", this))) {
+ ceph_assert((m_src_snap_id_start == 0 && m_dst_snap_id_start == 0) ||
+ (m_src_snap_id_start > 0 && m_dst_snap_id_start > 0));
+
+ // snap ids ordered from oldest to newest
+ m_src_image_ctx->image_lock.lock_shared();
+ m_src_snap_ids.insert(src_image_ctx->snaps.begin(),
+ src_image_ctx->snaps.end());
+ m_src_image_ctx->image_lock.unlock_shared();
+
+ m_dst_image_ctx->image_lock.lock_shared();
+ m_dst_snap_ids.insert(dst_image_ctx->snaps.begin(),
+ dst_image_ctx->snaps.end());
+ m_dst_image_ctx->image_lock.unlock_shared();
+
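+ // ignore any source snapshots newer than the requested end snapshot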
+ if (m_src_snap_id_end != CEPH_NOSNAP) {
+ m_src_snap_ids.erase(m_src_snap_ids.upper_bound(m_src_snap_id_end),
+ m_src_snap_ids.end());
+ }
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::send() {
+ cls::rbd::ParentImageSpec src_parent_spec;
+ int r = validate_parent(m_src_image_ctx, &src_parent_spec);
+ if (r < 0) {
+ lderr(m_cct) << "source image parent spec mismatch" << dendl;
+ error(r);
+ return;
+ }
+
+ r = validate_parent(m_dst_image_ctx, &m_dst_parent_spec);
+ if (r < 0) {
+ lderr(m_cct) << "destination image parent spec mismatch" << dendl;
+ error(r);
+ return;
+ }
+
+ send_snap_unprotect();
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::cancel() {
+ std::lock_guard locker{m_lock};
+
+ ldout(m_cct, 20) << dendl;
+ m_canceled = true;
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::send_snap_unprotect() {
+
+ SnapIdSet::iterator snap_id_it = m_dst_snap_ids.begin();
+ if (m_prev_snap_id != CEPH_NOSNAP) {
+ snap_id_it = m_dst_snap_ids.upper_bound(m_prev_snap_id);
+ } else if (m_dst_snap_id_start > 0) {
+ snap_id_it = m_dst_snap_ids.upper_bound(m_dst_snap_id_start);
+ }
+
+ for (; snap_id_it != m_dst_snap_ids.end(); ++snap_id_it) {
+ librados::snap_t dst_snap_id = *snap_id_it;
+
+ m_dst_image_ctx->image_lock.lock_shared();
+
+ bool dst_unprotected;
+ int r = m_dst_image_ctx->is_snap_unprotected(dst_snap_id, &dst_unprotected);
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve destination snap unprotect status: "
+ << cpp_strerror(r) << dendl;
+ m_dst_image_ctx->image_lock.unlock_shared();
+ finish(r);
+ return;
+ }
+ m_dst_image_ctx->image_lock.unlock_shared();
+
+ if (dst_unprotected) {
+ // snap is already unprotected -- check next snap
+ continue;
+ }
+
+ // if destination snapshot is protected and (1) it isn't in our mapping
+ // table, or (2) the source snapshot isn't protected, unprotect it
+ auto snap_seq_it = std::find_if(
+ m_snap_seqs.begin(), m_snap_seqs.end(),
+ [dst_snap_id](const SnapSeqs::value_type& pair) {
+ return pair.second == dst_snap_id;
+ });
+
+ if (snap_seq_it != m_snap_seqs.end()) {
+ m_src_image_ctx->image_lock.lock_shared();
+ bool src_unprotected;
+ r = m_src_image_ctx->is_snap_unprotected(snap_seq_it->first,
+ &src_unprotected);
+ ldout(m_cct, 20) << "m_src_image_ctx->is_snap_unprotected("
+ << snap_seq_it->first << "): r=" << r
+ << ", src_unprotected=" << src_unprotected << dendl;
+ if (r == -ENOENT) {
+ src_unprotected = true;
+ r = 0;
+ }
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve source snap unprotect status: "
+ << cpp_strerror(r) << dendl;
+ m_src_image_ctx->image_lock.unlock_shared();
+ finish(r);
+ return;
+ }
+ m_src_image_ctx->image_lock.unlock_shared();
+
+ if (src_unprotected) {
+ // source is unprotected -- unprotect destination snap
+ break;
+ }
+ } else {
+ // source snapshot doesn't exist -- unprotect destination snap
+ break;
+ }
+ }
+
+ if (snap_id_it == m_dst_snap_ids.end()) {
+ // no destination snapshots to unprotect
+ m_prev_snap_id = CEPH_NOSNAP;
+ send_snap_remove();
+ return;
+ }
+
+ m_prev_snap_id = *snap_id_it;
+ m_snap_name = get_snapshot_name(m_dst_image_ctx, m_prev_snap_id);
+
+ ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", "
+ << "snap_id=" << m_prev_snap_id << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_snap_unprotect(r);
+ finish_op_ctx->complete(0);
+ });
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ m_dst_image_ctx->operations->execute_snap_unprotect(
+ cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx);
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::handle_snap_unprotect(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to unprotect snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ // avoid the need to refresh to delete the newly unprotected snapshot
+ std::shared_lock image_locker{m_dst_image_ctx->image_lock};
+ auto snap_info_it = m_dst_image_ctx->snap_info.find(m_prev_snap_id);
+ if (snap_info_it != m_dst_image_ctx->snap_info.end()) {
+ snap_info_it->second.protection_status =
+ RBD_PROTECTION_STATUS_UNPROTECTED;
+ }
+ }
+
+ if (handle_cancellation()) {
+ return;
+ }
+
+ send_snap_unprotect();
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::send_snap_remove() {
+ SnapIdSet::iterator snap_id_it = m_dst_snap_ids.begin();
+ if (m_prev_snap_id != CEPH_NOSNAP) {
+ snap_id_it = m_dst_snap_ids.upper_bound(m_prev_snap_id);
+ } else if (m_dst_snap_id_start > 0) {
+ snap_id_it = m_dst_snap_ids.upper_bound(m_dst_snap_id_start);
+ }
+
+ for (; snap_id_it != m_dst_snap_ids.end(); ++snap_id_it) {
+ librados::snap_t dst_snap_id = *snap_id_it;
+
+ cls::rbd::SnapshotNamespace snap_namespace;
+ m_dst_image_ctx->image_lock.lock_shared();
+ int r = m_dst_image_ctx->get_snap_namespace(dst_snap_id, &snap_namespace);
+ m_dst_image_ctx->image_lock.unlock_shared();
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve destination snap namespace: "
+ << m_snap_name << dendl;
+ finish(r);
+ return;
+ }
+
+ if (boost::get<cls::rbd::UserSnapshotNamespace>(&snap_namespace) ==
+ nullptr) {
+ continue;
+ }
+
+ // if the destination snapshot isn't in our mapping table, remove it
+ auto snap_seq_it = std::find_if(
+ m_snap_seqs.begin(), m_snap_seqs.end(),
+ [dst_snap_id](const SnapSeqs::value_type& pair) {
+ return pair.second == dst_snap_id;
+ });
+
+ if (snap_seq_it == m_snap_seqs.end()) {
+ break;
+ }
+ }
+
+ if (snap_id_it == m_dst_snap_ids.end()) {
+ // no destination snapshots to delete
+ m_prev_snap_id = CEPH_NOSNAP;
+ send_snap_create();
+ return;
+ }
+
+ m_prev_snap_id = *snap_id_it;
+ m_snap_name = get_snapshot_name(m_dst_image_ctx, m_prev_snap_id);
+
+ ldout(m_cct, 20) << ""
+ << "snap_name=" << m_snap_name << ", "
+ << "snap_id=" << m_prev_snap_id << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_snap_remove(r);
+ finish_op_ctx->complete(0);
+ });
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ m_dst_image_ctx->operations->execute_snap_remove(
+ cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx);
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::handle_snap_remove(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ if (handle_cancellation()) {
+ return;
+ }
+
+ send_snap_remove();
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::send_snap_create() {
+ SnapIdSet::iterator snap_id_it = m_src_snap_ids.begin();
+ if (m_prev_snap_id != CEPH_NOSNAP) {
+ snap_id_it = m_src_snap_ids.upper_bound(m_prev_snap_id);
+ } else if (m_src_snap_id_start > 0) {
+ snap_id_it = m_src_snap_ids.upper_bound(m_src_snap_id_start);
+ }
+
+ for (; snap_id_it != m_src_snap_ids.end(); ++snap_id_it) {
+ librados::snap_t src_snap_id = *snap_id_it;
+
+ cls::rbd::SnapshotNamespace snap_namespace;
+ m_src_image_ctx->image_lock.lock_shared();
+ int r = m_src_image_ctx->get_snap_namespace(src_snap_id, &snap_namespace);
+ m_src_image_ctx->image_lock.unlock_shared();
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve source snap namespace: "
+ << m_snap_name << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_snap_seqs.find(src_snap_id) == m_snap_seqs.end()) {
+ // the source snapshot is not in our mapping table, ...
+ if (boost::get<cls::rbd::UserSnapshotNamespace>(&snap_namespace) !=
+ nullptr) {
+ // ... create it since it's a user snapshot
+ break;
+ } else if (src_snap_id == m_src_snap_id_end) {
+ // ... map it to destination HEAD since it's not a user snapshot that we
+ // will create (e.g. MirrorSnapshotNamespace)
+ m_snap_seqs[src_snap_id] = CEPH_NOSNAP;
+ }
+ }
+ }
+
+ if (snap_id_it == m_src_snap_ids.end()) {
+ // no source snapshots to create
+ m_prev_snap_id = CEPH_NOSNAP;
+ send_snap_protect();
+ return;
+ }
+
+ m_prev_snap_id = *snap_id_it;
+ m_snap_name = get_snapshot_name(m_src_image_ctx, m_prev_snap_id);
+
+ m_src_image_ctx->image_lock.lock_shared();
+ auto snap_info_it = m_src_image_ctx->snap_info.find(m_prev_snap_id);
+ if (snap_info_it == m_src_image_ctx->snap_info.end()) {
+ m_src_image_ctx->image_lock.unlock_shared();
+ lderr(m_cct) << "failed to retrieve source snap info: " << m_snap_name
+ << dendl;
+ finish(-ENOENT);
+ return;
+ }
+
+ uint64_t size = snap_info_it->second.size;
+ m_snap_namespace = snap_info_it->second.snap_namespace;
+ cls::rbd::ParentImageSpec parent_spec;
+ uint64_t parent_overlap = 0;
+ if (!m_flatten && snap_info_it->second.parent.spec.pool_id != -1) {
+ parent_spec = m_dst_parent_spec;
+ parent_overlap = snap_info_it->second.parent.overlap;
+ }
+ m_src_image_ctx->image_lock.unlock_shared();
+
+ ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", "
+ << "snap_id=" << m_prev_snap_id << ", "
+ << "size=" << size << ", "
+ << "parent_info=["
+ << "pool_id=" << parent_spec.pool_id << ", "
+ << "image_id=" << parent_spec.image_id << ", "
+ << "snap_id=" << parent_spec.snap_id << ", "
+ << "overlap=" << parent_overlap << "]" << dendl;
+
+ int r;
+ Context *finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_snap_create(r);
+ finish_op_ctx->complete(0);
+ });
+ SnapshotCreateRequest<I> *req = SnapshotCreateRequest<I>::create(
+ m_dst_image_ctx, m_snap_name, m_snap_namespace, size, parent_spec,
+ parent_overlap, ctx);
+ req->send();
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::handle_snap_create(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to create snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ if (handle_cancellation()) {
+ return;
+ }
+
+ ceph_assert(m_prev_snap_id != CEPH_NOSNAP);
+
+ auto snap_it = m_dst_image_ctx->snap_ids.find(
+ {cls::rbd::UserSnapshotNamespace(), m_snap_name});
+ ceph_assert(snap_it != m_dst_image_ctx->snap_ids.end());
+ librados::snap_t dst_snap_id = snap_it->second;
+
+ ldout(m_cct, 20) << "mapping source snap id " << m_prev_snap_id << " to "
+ << dst_snap_id << dendl;
+ m_snap_seqs[m_prev_snap_id] = dst_snap_id;
+
+ send_snap_create();
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::send_snap_protect() {
+ SnapIdSet::iterator snap_id_it = m_src_snap_ids.begin();
+ if (m_prev_snap_id != CEPH_NOSNAP) {
+ snap_id_it = m_src_snap_ids.upper_bound(m_prev_snap_id);
+ } else if (m_src_snap_id_start > 0) {
+ snap_id_it = m_src_snap_ids.upper_bound(m_src_snap_id_start);
+ }
+
+ for (; snap_id_it != m_src_snap_ids.end(); ++snap_id_it) {
+ librados::snap_t src_snap_id = *snap_id_it;
+
+ m_src_image_ctx->image_lock.lock_shared();
+
+ bool src_protected;
+ int r = m_src_image_ctx->is_snap_protected(src_snap_id, &src_protected);
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve source snap protect status: "
+ << cpp_strerror(r) << dendl;
+ m_src_image_ctx->image_lock.unlock_shared();
+ finish(r);
+ return;
+ }
+ m_src_image_ctx->image_lock.unlock_shared();
+
+ if (!src_protected) {
+ // snap is not protected -- check next snap
+ continue;
+ }
+
+ // if destination snapshot is not protected, protect it
+ auto snap_seq_it = m_snap_seqs.find(src_snap_id);
+ ceph_assert(snap_seq_it != m_snap_seqs.end());
+ if (snap_seq_it->second == CEPH_NOSNAP) {
+ // implies src end snapshot is mapped to a non-copyable snapshot
+ ceph_assert(src_snap_id == m_src_snap_id_end);
+ break;
+ }
+
+ m_dst_image_ctx->image_lock.lock_shared();
+ bool dst_protected;
+ r = m_dst_image_ctx->is_snap_protected(snap_seq_it->second, &dst_protected);
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve destination snap protect status: "
+ << cpp_strerror(r) << dendl;
+ m_dst_image_ctx->image_lock.unlock_shared();
+ finish(r);
+ return;
+ }
+ m_dst_image_ctx->image_lock.unlock_shared();
+
+ if (!dst_protected) {
+ break;
+ }
+ }
+
+ if (snap_id_it == m_src_snap_ids.end()) {
+ // no destination snapshots to protect
+ m_prev_snap_id = CEPH_NOSNAP;
+ send_set_head();
+ return;
+ }
+
+ m_prev_snap_id = *snap_id_it;
+ m_snap_name = get_snapshot_name(m_src_image_ctx, m_prev_snap_id);
+
+ ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", "
+ << "snap_id=" << m_prev_snap_id << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_snap_protect(r);
+ finish_op_ctx->complete(0);
+ });
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ m_dst_image_ctx->operations->execute_snap_protect(
+ cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx);
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::handle_snap_protect(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to protect snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ if (handle_cancellation()) {
+ return;
+ }
+
+ send_snap_protect();
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::send_set_head() {
+ auto snap_seq_it = m_snap_seqs.find(m_src_snap_id_end);
+ if (m_src_snap_id_end != CEPH_NOSNAP &&
+ (snap_seq_it == m_snap_seqs.end() ||
+ snap_seq_it->second != CEPH_NOSNAP)) {
+ // not copying to src nor dst HEAD revision
+ finish(0);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ uint64_t size;
+ cls::rbd::ParentImageSpec parent_spec;
+ uint64_t parent_overlap = 0;
+ {
+ std::shared_lock src_locker{m_src_image_ctx->image_lock};
+ auto snap_info_it = m_src_image_ctx->snap_info.find(m_src_snap_id_end);
+ if (snap_info_it != m_src_image_ctx->snap_info.end()) {
+ auto& snap_info = snap_info_it->second;
+ size = snap_info.size;
+ if (!m_flatten && snap_info.parent.spec.pool_id != -1) {
+ parent_spec = m_dst_parent_spec;
+ parent_overlap = snap_info.parent.overlap;
+ }
+ } else {
+ size = m_src_image_ctx->size;
+ if (!m_flatten) {
+ parent_spec = m_dst_image_ctx->parent_md.spec;
+ parent_overlap = m_src_image_ctx->parent_md.overlap;
+ }
+ }
+ }
+
+ auto ctx = create_context_callback<
+ SnapshotCopyRequest<I>, &SnapshotCopyRequest<I>::handle_set_head>(this);
+ auto req = SetHeadRequest<I>::create(m_dst_image_ctx, size, parent_spec,
+ parent_overlap, ctx);
+ req->send();
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::handle_set_head(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to set head: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (handle_cancellation()) {
+ return;
+ }
+
+ send_resize_object_map();
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::send_resize_object_map() {
+ int r = 0;
+
+ if (m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ std::shared_lock image_locker{m_dst_image_ctx->image_lock};
+
+ if (m_dst_image_ctx->object_map != nullptr &&
+ Striper::get_num_objects(m_dst_image_ctx->layout,
+ m_dst_image_ctx->size) !=
+ m_dst_image_ctx->object_map->size()) {
+
+ ldout(m_cct, 20) << dendl;
+
+ auto finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r);
+ if (finish_op_ctx != nullptr) {
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_resize_object_map(r);
+ finish_op_ctx->complete(0);
+ });
+
+ m_dst_image_ctx->object_map->aio_resize(m_dst_image_ctx->size,
+ OBJECT_NONEXISTENT, ctx);
+ return;
+ }
+
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ }
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::handle_resize_object_map(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to resize object map: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+bool SnapshotCopyRequest<I>::handle_cancellation() {
+ {
+ std::lock_guard locker{m_lock};
+ if (!m_canceled) {
+ return false;
+ }
+ }
+ ldout(m_cct, 10) << "snapshot copy canceled" << dendl;
+ finish(-ECANCELED);
+ return true;
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::error(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
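+ // defer completion to the work queue to avoid invoking the callback in
+ // the caller's context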
+ m_work_queue->queue(new LambdaContext([this, r](int r1) { finish(r); }));
+}
+
+template <typename I>
+int SnapshotCopyRequest<I>::validate_parent(I *image_ctx,
+ cls::rbd::ParentImageSpec *spec) {
+ std::shared_lock owner_locker{image_ctx->owner_lock};
+ std::shared_lock image_locker{image_ctx->image_lock};
+
+ // ensure the image's parent specs are still consistent across all of
+ // its snapshots
+ *spec = image_ctx->parent_md.spec;
+ for (auto &snap_info_pair : image_ctx->snap_info) {
+ auto &parent_spec = snap_info_pair.second.parent.spec;
+ if (parent_spec.pool_id == -1) {
+ continue;
+ } else if (spec->pool_id == -1) {
+ *spec = parent_spec;
+ continue;
+ }
+
+ if (*spec != parent_spec) {
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+template <typename I>
+Context *SnapshotCopyRequest<I>::start_lock_op(int* r) {
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ return start_lock_op(m_dst_image_ctx->owner_lock, r);
+}
+
+template <typename I>
+Context *SnapshotCopyRequest<I>::start_lock_op(ceph::shared_mutex &owner_lock, int* r) {
+ ceph_assert(ceph_mutex_is_locked(m_dst_image_ctx->owner_lock));
+ if (m_dst_image_ctx->exclusive_lock == nullptr) {
+ return new LambdaContext([](int r) {});
+ }
+ return m_dst_image_ctx->exclusive_lock->start_op(r);
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
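+ // only publish the snap seq mapping back to the caller on success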
+ if (r == 0) {
+ *m_snap_seqs_result = m_snap_seqs;
+ }
+
+ m_on_finish->complete(r);
+ put();
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::SnapshotCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/SnapshotCopyRequest.h b/src/librbd/deep_copy/SnapshotCopyRequest.h
new file mode 100644
index 000000000..9c6abdf73
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCopyRequest.h
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/RefCountedObj.h"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <map>
+#include <set>
+#include <string>
+#include <tuple>
+
+class Context;
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace deep_copy {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SnapshotCopyRequest : public RefCountedObject {
+public:
+ static SnapshotCopyRequest* create(ImageCtxT *src_image_ctx,
+ ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten, asio::ContextWQ *work_queue,
+ SnapSeqs *snap_seqs, Context *on_finish) {
+ return new SnapshotCopyRequest(src_image_ctx, dst_image_ctx,
+ src_snap_id_start, src_snap_id_end,
+ dst_snap_id_start, flatten, work_queue,
+ snap_seqs, on_finish);
+ }
+
+ SnapshotCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ librados::snap_t dst_snap_id_start,
+ bool flatten, asio::ContextWQ *work_queue,
+ SnapSeqs *snap_seqs, Context *on_finish);
+
+ void send();
+ void cancel();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | /-----------\
+ * | | |
+ * v v | (repeat as needed)
+ * UNPROTECT_SNAP ----/
+ * |
+ * | /-----------\
+ * | | |
+ * v v | (repeat as needed)
+ * REMOVE_SNAP -------/
+ * |
+ * | /-----------\
+ * | | |
+ * v v | (repeat as needed)
+ * CREATE_SNAP -------/
+ * |
+ * | /-----------\
+ * | | |
+ * v v | (repeat as needed)
+ * PROTECT_SNAP ------/
+ * |
+ * v
+ * SET_HEAD (skip if not needed)
+ * |
+ * v
+ * RESIZE_OBJECT_MAP (skip if not needed)
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ typedef std::set<librados::snap_t> SnapIdSet;
+
+ ImageCtxT *m_src_image_ctx;
+ ImageCtxT *m_dst_image_ctx;
+ librados::snap_t m_src_snap_id_start;
+ librados::snap_t m_src_snap_id_end;
+ librados::snap_t m_dst_snap_id_start;
+ bool m_flatten;
+ asio::ContextWQ *m_work_queue;
+ SnapSeqs *m_snap_seqs_result;
+ SnapSeqs m_snap_seqs;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ SnapIdSet m_src_snap_ids;
+ SnapIdSet m_dst_snap_ids;
+ librados::snap_t m_prev_snap_id = CEPH_NOSNAP;
+
+ std::string m_snap_name;
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+
+ cls::rbd::ParentImageSpec m_dst_parent_spec;
+
+ ceph::mutex m_lock;
+ bool m_canceled = false;
+
+ void send_snap_unprotect();
+ void handle_snap_unprotect(int r);
+
+ void send_snap_remove();
+ void handle_snap_remove(int r);
+
+ void send_snap_create();
+ void handle_snap_create(int r);
+
+ void send_snap_protect();
+ void handle_snap_protect(int r);
+
+ void send_set_head();
+ void handle_set_head(int r);
+
+ void send_resize_object_map();
+ void handle_resize_object_map(int r);
+
+ bool handle_cancellation();
+
+ void error(int r);
+
+ int validate_parent(ImageCtxT *image_ctx, cls::rbd::ParentImageSpec *spec);
+
+ Context *start_lock_op(int* r);
+ Context *start_lock_op(ceph::shared_mutex &owner_lock, int* r);
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::SnapshotCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/SnapshotCreateRequest.cc b/src/librbd/deep_copy/SnapshotCreateRequest.cc
new file mode 100644
index 000000000..d437bd355
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCreateRequest.cc
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SetHeadRequest.h"
+#include "SnapshotCreateRequest.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::SnapshotCreateRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace deep_copy {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+SnapshotCreateRequest<I>::SnapshotCreateRequest(
+ I *dst_image_ctx, const std::string &snap_name,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ uint64_t size, const cls::rbd::ParentImageSpec &spec,
+ uint64_t parent_overlap, Context *on_finish)
+ : m_dst_image_ctx(dst_image_ctx), m_snap_name(snap_name),
+ m_snap_namespace(snap_namespace), m_size(size),
+ m_parent_spec(spec), m_parent_overlap(parent_overlap),
+ m_on_finish(on_finish), m_cct(dst_image_ctx->cct) {
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send() {
+ send_set_head();
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_set_head() {
+ ldout(m_cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ SnapshotCreateRequest<I>, &SnapshotCreateRequest<I>::handle_set_head>(this);
+ auto req = SetHeadRequest<I>::create(m_dst_image_ctx, m_size, m_parent_spec,
+ m_parent_overlap, ctx);
+ req->send();
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::handle_set_head(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to set head: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_create_snap();
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_create_snap() {
+ ldout(m_cct, 20) << "snap_name=" << m_snap_name << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_create_snap(r);
+ finish_op_ctx->complete(0);
+ });
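+ // the object map is initialized separately below and quiesce
+ // notifications are unnecessary for an internal deep-copy snapshot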
+ uint64_t flags = SNAP_CREATE_FLAG_SKIP_OBJECT_MAP |
+ SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE;
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ m_dst_image_ctx->operations->execute_snap_create(
+ m_snap_namespace, m_snap_name.c_str(), ctx, 0U, flags, m_prog_ctx);
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::handle_create_snap(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to create snapshot '" << m_snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_create_object_map();
+}
+template <typename I>
+void SnapshotCreateRequest<I>::send_create_object_map() {
+
+ if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+ finish(0);
+ return;
+ }
+
+ m_dst_image_ctx->image_lock.lock_shared();
+ auto snap_it = m_dst_image_ctx->snap_ids.find(
+ {cls::rbd::UserSnapshotNamespace(), m_snap_name});
+ if (snap_it == m_dst_image_ctx->snap_ids.end()) {
+ lderr(m_cct) << "failed to locate snap: " << m_snap_name << dendl;
+ m_dst_image_ctx->image_lock.unlock_shared();
+ finish(-ENOENT);
+ return;
+ }
+ librados::snap_t local_snap_id = snap_it->second;
+ m_dst_image_ctx->image_lock.unlock_shared();
+
+ std::string object_map_oid(librbd::ObjectMap<>::object_map_name(
+ m_dst_image_ctx->id, local_snap_id));
+ uint64_t object_count = Striper::get_num_objects(m_dst_image_ctx->layout,
+ m_size);
+ ldout(m_cct, 20) << "object_map_oid=" << object_map_oid << ", "
+ << "object_count=" << object_count << dendl;
+
+ // initialize an empty object map of the correct size (object sync
+ // will populate the object map)
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::object_map_resize(&op, object_count, OBJECT_NONEXISTENT);
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+ handle_create_object_map(r);
+ finish_op_ctx->complete(0);
+ });
+ librados::AioCompletion *comp = create_rados_callback(ctx);
+ r = m_dst_image_ctx->md_ctx.aio_operate(object_map_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::handle_create_object_map(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to create object map: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::start_lock_op(int* r) {
+ std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+ if (m_dst_image_ctx->exclusive_lock == nullptr) {
+ return new LambdaContext([](int r) {});
+ }
+ return m_dst_image_ctx->exclusive_lock->start_op(r);
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::SnapshotCreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/SnapshotCreateRequest.h b/src/librbd/deep_copy/SnapshotCreateRequest.h
new file mode 100644
index 000000000..41f7f54e4
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCreateRequest.h
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include "librbd/internal.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <tuple>
+
+class Context;
+
+namespace librbd {
+namespace deep_copy {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SnapshotCreateRequest {
+public:
+ static SnapshotCreateRequest* create(ImageCtxT *dst_image_ctx,
+ const std::string &snap_name,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ uint64_t size,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ uint64_t parent_overlap,
+ Context *on_finish) {
+ return new SnapshotCreateRequest(dst_image_ctx, snap_name, snap_namespace, size,
+ parent_spec, parent_overlap, on_finish);
+ }
+
+ SnapshotCreateRequest(ImageCtxT *dst_image_ctx,
+ const std::string &snap_name,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ uint64_t size,
+ const cls::rbd::ParentImageSpec &parent_spec,
+ uint64_t parent_overlap, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * SET_HEAD
+ * |
+ * v
+ * CREATE_SNAP
+ * |
+ * v (skip if not needed)
+ * CREATE_OBJECT_MAP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_dst_image_ctx;
+ std::string m_snap_name;
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ uint64_t m_size;
+ cls::rbd::ParentImageSpec m_parent_spec;
+ uint64_t m_parent_overlap;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ NoOpProgressContext m_prog_ctx;
+
+ void send_set_head();
+ void handle_set_head(int r);
+
+ void send_create_snap();
+ void handle_create_snap(int r);
+
+ void send_create_object_map();
+ void handle_create_object_map(int r);
+
+ Context *start_lock_op(int* r);
+
+ void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::SnapshotCreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H
diff --git a/src/librbd/deep_copy/Types.h b/src/librbd/deep_copy/Types.h
new file mode 100644
index 000000000..9cd4835b3
--- /dev/null
+++ b/src/librbd/deep_copy/Types.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_TYPES_H
+#define CEPH_LIBRBD_DEEP_COPY_TYPES_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include <boost/optional.hpp>
+
+namespace librbd {
+namespace deep_copy {
+
+enum {
+ OBJECT_COPY_REQUEST_FLAG_FLATTEN = 1U << 0,
+ OBJECT_COPY_REQUEST_FLAG_MIGRATION = 1U << 1,
+ OBJECT_COPY_REQUEST_FLAG_EXISTS_CLEAN = 1U << 2,
+};
+
+typedef std::vector<librados::snap_t> SnapIds;
+typedef std::map<librados::snap_t, SnapIds> SnapMap;
+
+typedef boost::optional<uint64_t> ObjectNumber;
+
+} // namespace deep_copy
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_DEEP_COPY_TYPES_H
diff --git a/src/librbd/deep_copy/Utils.cc b/src/librbd/deep_copy/Utils.cc
new file mode 100644
index 000000000..c2dd25020
--- /dev/null
+++ b/src/librbd/deep_copy/Utils.cc
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "Utils.h"
+#include <set>
+
+namespace librbd {
+namespace deep_copy {
+namespace util {
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::util::" << __func__ << ": "
+
+void compute_snap_map(CephContext* cct,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ const SnapIds& dst_snap_ids,
+ const SnapSeqs &snap_seqs,
+ SnapMap *snap_map) {
+ std::set<librados::snap_t> ordered_dst_snap_ids{
+ dst_snap_ids.begin(), dst_snap_ids.end()};
+ auto dst_snap_id_it = ordered_dst_snap_ids.begin();
+
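+  // the result maps each src snap id in [src_snap_id_start, src_snap_id_end]
+  // to the list of dst snap ids (newest first) in effect at that snapshot:
+  // e.g. snap_seqs={1->11, 3->13} with dst_snap_ids=[11, 13] yields
+  // snap_map={1->[11], 3->[13, 11]}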
+ SnapIds snap_ids;
+ for (auto &it : snap_seqs) {
+ // ensure all dst snap ids are included in the mapping table since
+ // deep copy will skip non-user snapshots
+ while (dst_snap_id_it != ordered_dst_snap_ids.end()) {
+ if (*dst_snap_id_it < it.second) {
+ snap_ids.insert(snap_ids.begin(), *dst_snap_id_it);
+ } else if (*dst_snap_id_it > it.second) {
+ break;
+ }
+ ++dst_snap_id_it;
+ }
+
+    // we should only have the HEAD revision in the last snap seq
+ ceph_assert(snap_ids.empty() || snap_ids[0] != CEPH_NOSNAP);
+ snap_ids.insert(snap_ids.begin(), it.second);
+
+ if (it.first < src_snap_id_start) {
+ continue;
+ } else if (it.first > src_snap_id_end) {
+ break;
+ }
+
+ (*snap_map)[it.first] = snap_ids;
+ }
+
+ ldout(cct, 10) << "src_snap_id_start=" << src_snap_id_start << ", "
+ << "src_snap_id_end=" << src_snap_id_end << ", "
+ << "dst_snap_ids=" << dst_snap_ids << ", "
+ << "snap_seqs=" << snap_seqs << ", "
+ << "snap_map=" << *snap_map << dendl;
+}
+
+} // namespace util
+} // namespace deep_copy
+} // namespace librbd
diff --git a/src/librbd/deep_copy/Utils.h b/src/librbd/deep_copy/Utils.h
new file mode 100644
index 000000000..268a39daf
--- /dev/null
+++ b/src/librbd/deep_copy/Utils.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_UTILS_H
+#define CEPH_LIBRBD_DEEP_COPY_UTILS_H
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+#include "librbd/deep_copy/Types.h"
+
+#include <boost/optional.hpp>
+
+namespace librbd {
+namespace deep_copy {
+namespace util {
+
+void compute_snap_map(CephContext* cct,
+ librados::snap_t src_snap_id_start,
+ librados::snap_t src_snap_id_end,
+ const SnapIds& dst_snap_ids,
+ const SnapSeqs &snap_seqs,
+ SnapMap *snap_map);
+
+} // namespace util
+} // namespace deep_copy
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_DEEP_COPY_UTILS_H
diff --git a/src/librbd/exclusive_lock/AutomaticPolicy.cc b/src/librbd/exclusive_lock/AutomaticPolicy.cc
new file mode 100644
index 000000000..bfaddc1b2
--- /dev/null
+++ b/src/librbd/exclusive_lock/AutomaticPolicy.cc
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/AutomaticPolicy.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ExclusiveLock.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ExclusiveLock::AutomaticPolicy "
+
+namespace librbd {
+namespace exclusive_lock {
+
+int AutomaticPolicy::lock_requested(bool force) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock));
+ ceph_assert(m_image_ctx->exclusive_lock != nullptr);
+
+ ldout(m_image_ctx->cct, 20) << this << " " << __func__ << ": force=" << force
+ << dendl;
+
+ // release the lock upon request (ignore forced requests)
+ m_image_ctx->exclusive_lock->release_lock(nullptr);
+ return 0;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
diff --git a/src/librbd/exclusive_lock/AutomaticPolicy.h b/src/librbd/exclusive_lock/AutomaticPolicy.h
new file mode 100644
index 000000000..12ba9b6c4
--- /dev/null
+++ b/src/librbd/exclusive_lock/AutomaticPolicy.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H
+
+#include "librbd/exclusive_lock/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace exclusive_lock {
+
+class AutomaticPolicy : public Policy {
+public:
+ AutomaticPolicy(ImageCtx *image_ctx) : m_image_ctx(image_ctx) {
+ }
+
+ bool may_auto_request_lock() override {
+ return true;
+ }
+
+ int lock_requested(bool force) override;
+
+private:
+ ImageCtx *m_image_ctx;
+
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H
diff --git a/src/librbd/exclusive_lock/ImageDispatch.cc b/src/librbd/exclusive_lock/ImageDispatch.cc
new file mode 100644
index 000000000..cd7f450f2
--- /dev/null
+++ b/src/librbd/exclusive_lock/ImageDispatch.cc
@@ -0,0 +1,322 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/ImageDispatch.h"
+#include "include/Context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::ImageDispatch: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_context_callback;
+
+template <typename I>
+ImageDispatch<I>::ImageDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx),
+ m_lock(ceph::make_shared_mutex(
+      util::unique_lock_name("librbd::exclusive_lock::ImageDispatch::m_lock",
+ this))) {
+}
+
+template <typename I>
+void ImageDispatch<I>::shut_down(Context* on_finish) {
+ // release any IO waiting on exclusive lock
+ Contexts on_dispatches;
+ {
+ std::unique_lock locker{m_lock};
+ std::swap(on_dispatches, m_on_dispatches);
+ }
+
+ for (auto ctx : on_dispatches) {
+ ctx->complete(0);
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void ImageDispatch<I>::set_require_lock(bool init_shutdown,
+ io::Direction direction,
+ Context* on_finish) {
+ // pause any matching IO from proceeding past this layer
+ set_require_lock(direction, true);
+
+ if (direction == io::DIRECTION_READ) {
+ on_finish->complete(0);
+ return;
+ }
+
+ // push through a flush for any in-flight writes at lower levels
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, util::get_image_ctx(m_image_ctx), io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ *m_image_ctx, io::IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK, aio_comp,
+ (init_shutdown ?
+ io::FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH :
+ io::FLUSH_SOURCE_EXCLUSIVE_LOCK), {});
+ req->send();
+}
+
+template <typename I>
+void ImageDispatch<I>::unset_require_lock(io::Direction direction) {
+ set_require_lock(direction, false);
+}
+
+template <typename I>
+bool ImageDispatch<I>::set_require_lock(io::Direction direction, bool enabled) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "direction=" << direction << ", enabled=" << enabled
+ << dendl;
+
+ std::unique_lock locker{m_lock};
+ auto prev_require_lock = (m_require_lock_on_read || m_require_lock_on_write);
+
+ switch (direction) {
+ case io::DIRECTION_READ:
+ m_require_lock_on_read = enabled;
+ break;
+ case io::DIRECTION_WRITE:
+ m_require_lock_on_write = enabled;
+ break;
+ case io::DIRECTION_BOTH:
+ m_require_lock_on_read = enabled;
+ m_require_lock_on_write = enabled;
+ break;
+ }
+
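+  // report whether the effective require-lock state transitioned as a
+  // result of this call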
+ bool require_lock = (m_require_lock_on_read || m_require_lock_on_write);
+ return ((enabled && !prev_require_lock && require_lock) ||
+ (!enabled && prev_require_lock && !require_lock));
+}
+
+template <typename I>
+bool ImageDispatch<I>::read(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ io::ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ if (needs_exclusive_lock(true, tid, dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::discard(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::write_same(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::compare_and_write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::flush(
+ io::AioCompletion* aio_comp, io::FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+  // don't attempt to grab the exclusive lock if we are just internally
+ // clearing out our in-flight IO queue
+ if (flush_source != io::FLUSH_SOURCE_USER) {
+ return false;
+ }
+
+ if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::is_lock_required(bool read_op) const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return ((read_op && m_require_lock_on_read) ||
+ (!read_op && m_require_lock_on_write));
+}
+
+template <typename I>
+bool ImageDispatch<I>::needs_exclusive_lock(bool read_op, uint64_t tid,
+ io::DispatchResult* dispatch_result,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ bool lock_required = false;
+ {
+ std::shared_lock locker{m_lock};
+ lock_required = is_lock_required(read_op);
+ }
+
+ if (lock_required) {
+ std::shared_lock owner_locker{m_image_ctx->owner_lock};
+ if (m_image_ctx->exclusive_lock == nullptr) {
+ // raced with the exclusive lock being disabled
+ return false;
+ }
+
+ ldout(cct, 5) << "exclusive lock required: delaying IO" << dendl;
+ if (!m_image_ctx->get_exclusive_lock_policy()->may_auto_request_lock()) {
+ lderr(cct) << "op requires exclusive lock" << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ on_dispatched->complete(
+ m_image_ctx->exclusive_lock->get_unlocked_op_error());
+ return true;
+ }
+
+ // block potential races with other incoming IOs
+ std::unique_lock locker{m_lock};
+ bool retesting_lock = (
+ !m_on_dispatches.empty() && m_on_dispatches.front() == on_dispatched);
+ if (!m_on_dispatches.empty() && !retesting_lock) {
+ *dispatch_result = io::DISPATCH_RESULT_RESTART;
+ m_on_dispatches.push_back(on_dispatched);
+ return true;
+ }
+
+ if (!is_lock_required(read_op)) {
+ return false;
+ }
+
+ ceph_assert(m_on_dispatches.empty() || retesting_lock);
+ m_on_dispatches.push_back(on_dispatched);
+ locker.unlock();
+
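+    // the blocked IO will be re-dispatched through this layer once the
+    // lock acquisition completes (see handle_acquire_lock)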
+ *dispatch_result = io::DISPATCH_RESULT_RESTART;
+ auto ctx = create_context_callback<
+ ImageDispatch<I>, &ImageDispatch<I>::handle_acquire_lock>(this);
+ m_image_ctx->exclusive_lock->acquire_lock(ctx);
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+void ImageDispatch<I>::handle_acquire_lock(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ std::unique_lock locker{m_lock};
+ ceph_assert(!m_on_dispatches.empty());
+
+ Context* failed_dispatch = nullptr;
+ Contexts on_dispatches;
+ if (r == -ESHUTDOWN) {
+ ldout(cct, 5) << "IO raced with exclusive lock shutdown" << dendl;
+ } else if (r < 0) {
+ lderr(cct) << "failed to acquire exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ failed_dispatch = m_on_dispatches.front();
+ m_on_dispatches.pop_front();
+ }
+
+ // re-test if lock is still required (i.e. it wasn't acquired/lost) via a
+ // restart dispatch
+ std::swap(on_dispatches, m_on_dispatches);
+ locker.unlock();
+
+ if (failed_dispatch != nullptr) {
+ failed_dispatch->complete(r);
+ }
+ for (auto ctx : on_dispatches) {
+ ctx->complete(0);
+ }
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::ImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/ImageDispatch.h b/src/librbd/exclusive_lock/ImageDispatch.h
new file mode 100644
index 000000000..77c101973
--- /dev/null
+++ b/src/librbd/exclusive_lock/ImageDispatch.h
@@ -0,0 +1,134 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/ceph_mutex.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include <atomic>
+#include <list>
+#include <unordered_set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+struct AioCompletion;
+}
+
+namespace exclusive_lock {
+
+template <typename ImageCtxT>
+class ImageDispatch : public io::ImageDispatchInterface {
+public:
+ static ImageDispatch* create(ImageCtxT* image_ctx) {
+ return new ImageDispatch(image_ctx);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ ImageDispatch(ImageCtxT* image_ctx);
+
+ io::ImageDispatchLayer get_dispatch_layer() const override {
+ return io::IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK;
+ }
+
+ void set_require_lock(bool init_shutdown,
+ io::Direction direction, Context* on_finish);
+ void unset_require_lock(io::Direction direction);
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ io::ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ io::AioCompletion* aio_comp, io::FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool list_snaps(
+ io::AioCompletion* aio_comp, io::Extents&& image_extents,
+ io::SnapIds&& snap_ids, int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+private:
+ typedef std::list<Context*> Contexts;
+ typedef std::unordered_set<uint64_t> Tids;
+
+ ImageCtxT* m_image_ctx;
+ mutable ceph::shared_mutex m_lock;
+
+ bool m_require_lock_on_read = false;
+ bool m_require_lock_on_write = false;
+
+ Contexts m_on_dispatches;
+
+ bool set_require_lock(io::Direction direction, bool enabled);
+
+ bool is_lock_required(bool read_op) const;
+
+ bool needs_exclusive_lock(bool read_op, uint64_t tid,
+ io::DispatchResult* dispatch_result,
+ Context* on_dispatched);
+
+ void handle_acquire_lock(int r);
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::ImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_IMAGE_DISPATCH_H
diff --git a/src/librbd/exclusive_lock/Policy.h b/src/librbd/exclusive_lock/Policy.h
new file mode 100644
index 000000000..7064a6515
--- /dev/null
+++ b/src/librbd/exclusive_lock/Policy.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
+
+namespace librbd {
+namespace exclusive_lock {
+
+enum OperationRequestType {
+ OPERATION_REQUEST_TYPE_GENERAL = 0,
+ OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE = 1,
+ OPERATION_REQUEST_TYPE_FORCE_PROMOTION = 2,
+};
+
+struct Policy {
+ virtual ~Policy() {
+ }
+
+ virtual bool may_auto_request_lock() = 0;
+ virtual int lock_requested(bool force) = 0;
+
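+  // whether an operation request of the given type may proceed while
+  // requests are otherwise being blocked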
+ virtual bool accept_blocked_request(OperationRequestType) {
+ return false;
+ }
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
diff --git a/src/librbd/exclusive_lock/PostAcquireRequest.cc b/src/librbd/exclusive_lock/PostAcquireRequest.cc
new file mode 100644
index 000000000..4553b2158
--- /dev/null
+++ b/src/librbd/exclusive_lock/PostAcquireRequest.cc
@@ -0,0 +1,370 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/PostAcquireRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/journal/Policy.h"
+#include "librbd/PluginRegistry.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::PostAcquireRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+PostAcquireRequest<I>* PostAcquireRequest<I>::create(I &image_ctx,
+ Context *on_acquire,
+ Context *on_finish) {
+ return new PostAcquireRequest(image_ctx, on_acquire, on_finish);
+}
+
+template <typename I>
+PostAcquireRequest<I>::PostAcquireRequest(I &image_ctx, Context *on_acquire,
+ Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_on_acquire(on_acquire),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_object_map(nullptr), m_journal(nullptr), m_error_result(0) {
+}
+
+template <typename I>
+PostAcquireRequest<I>::~PostAcquireRequest() {
+ if (!m_prepare_lock_completed) {
+ m_image_ctx.state->handle_prepare_lock_complete();
+ }
+ delete m_on_acquire;
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send() {
+ send_refresh();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send_refresh() {
+ if (!m_image_ctx.state->is_refresh_required()) {
+ send_open_object_map();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_async_context_callback(
+ m_image_ctx, create_context_callback<klass, &klass::handle_refresh>(this));
+
+ // ImageState is blocked waiting for lock to complete -- safe to directly
+ // refresh
+ image::RefreshRequest<I> *req = image::RefreshRequest<I>::create(
+ m_image_ctx, true, false, ctx);
+ req->send();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::handle_refresh(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r == -ERESTART) {
+ // next issued IO or op will (re)-refresh the image and shut down lock
+ ldout(cct, 5) << "exclusive lock dynamically disabled" << dendl;
+ r = 0;
+ } else if (r < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl;
+ save_result(r);
+ revert();
+ finish();
+ return;
+ }
+
+ send_open_object_map();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send_open_journal() {
+ // alert caller that we now own the exclusive lock
+ m_on_acquire->complete(0);
+ m_on_acquire = nullptr;
+
+ bool journal_enabled;
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ journal_enabled = (m_image_ctx.test_features(RBD_FEATURE_JOURNALING,
+ m_image_ctx.image_lock) &&
+ !m_image_ctx.get_journal_policy()->journal_disabled());
+ }
+ if (!journal_enabled) {
+ apply();
+ send_process_plugin_acquire_lock();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_open_journal>(
+ this);
+ m_journal = m_image_ctx.create_journal();
+
+ // journal playback requires object map (if enabled) and itself
+ apply();
+
+ m_journal->open(ctx);
+}
+
+template <typename I>
+void PostAcquireRequest<I>::handle_open_journal(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl;
+ send_close_journal();
+ return;
+ }
+
+ send_allocate_journal_tag();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send_allocate_journal_tag() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_allocate_journal_tag>(this, m_journal);
+ m_image_ctx.get_journal_policy()->allocate_tag_on_lock(ctx);
+}
+
+template <typename I>
+void PostAcquireRequest<I>::handle_allocate_journal_tag(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to allocate journal tag: " << cpp_strerror(r)
+ << dendl;
+ send_close_journal();
+ return;
+ }
+
+ send_process_plugin_acquire_lock();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send_process_plugin_acquire_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_process_plugin_acquire_lock>(this);
+ m_image_ctx.plugin_registry->acquired_exclusive_lock(ctx);
+}
+
+template <typename I>
+void PostAcquireRequest<I>::handle_process_plugin_acquire_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to process plugins: " << cpp_strerror(r)
+ << dendl;
+ send_process_plugin_release_lock();
+ return;
+ }
+
+ finish();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send_process_plugin_release_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_process_plugin_release_lock>(this);
+ m_image_ctx.plugin_registry->prerelease_exclusive_lock(ctx);
+}
+
+template <typename I>
+void PostAcquireRequest<I>::handle_process_plugin_release_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to release plugins: " << cpp_strerror(r)
+ << dendl;
+ }
+ send_close_journal();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send_close_journal() {
+ if (m_journal == nullptr) {
+ send_close_object_map();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_close_journal>(
+ this);
+ m_journal->close(ctx);
+}
+
+template <typename I>
+void PostAcquireRequest<I>::handle_close_journal(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to close journal: " << cpp_strerror(r) << dendl;
+ }
+
+ send_close_object_map();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send_open_object_map() {
+ if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+ send_open_journal();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_open_object_map>(
+ this);
+
+ m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP);
+ m_object_map->open(ctx);
+}
+
+template <typename I>
+void PostAcquireRequest<I>::handle_open_object_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to open object map: " << cpp_strerror(r) << dendl;
+ m_object_map->put();
+ m_object_map = nullptr;
+
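+    // -EFBIG indicates the object map is too large to load; continue
+    // acquiring the lock without it rather than failing outright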
+ if (r != -EFBIG) {
+ save_result(r);
+ revert();
+ finish();
+ return;
+ }
+ }
+
+ send_open_journal();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::send_close_object_map() {
+ if (m_object_map == nullptr) {
+ revert();
+ finish();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PostAcquireRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_close_object_map>(this);
+ m_object_map->close(ctx);
+}
+
+template <typename I>
+void PostAcquireRequest<I>::handle_close_object_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to close object map: " << cpp_strerror(r) << dendl;
+ }
+
+ revert();
+ finish();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::apply() {
+ {
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ ceph_assert(m_image_ctx.object_map == nullptr);
+ m_image_ctx.object_map = m_object_map;
+
+ ceph_assert(m_image_ctx.journal == nullptr);
+ m_image_ctx.journal = m_journal;
+ }
+
+ m_prepare_lock_completed = true;
+ m_image_ctx.state->handle_prepare_lock_complete();
+}
+
+template <typename I>
+void PostAcquireRequest<I>::revert() {
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ m_image_ctx.object_map = nullptr;
+ m_image_ctx.journal = nullptr;
+
+ if (m_object_map) {
+ m_object_map->put();
+ }
+ if (m_journal) {
+ m_journal->put();
+ }
+
+ ceph_assert(m_error_result < 0);
+}
+
+template <typename I>
+void PostAcquireRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::PostAcquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/PostAcquireRequest.h b/src/librbd/exclusive_lock/PostAcquireRequest.h
new file mode 100644
index 000000000..2f7efdf07
--- /dev/null
+++ b/src/librbd/exclusive_lock/PostAcquireRequest.h
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "librbd/ImageCtx.h"
+#include "msg/msg_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+namespace exclusive_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class PostAcquireRequest {
+public:
+ static PostAcquireRequest* create(ImageCtxT &image_ctx, Context *on_acquire,
+ Context *on_finish);
+
+ ~PostAcquireRequest();
+ void send();
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |
+ * v
+ * REFRESH (skip if not
+ * | needed)
+ * v
+ * OPEN_OBJECT_MAP (skip if
+ * | disabled)
+ * v
+ * OPEN_JOURNAL (skip if
+ * | * disabled)
+ * | *
+ * | * * * * * * * *
+ * v *
+ * ALLOCATE_JOURNAL_TAG *
+ * | * *
+ * | * *
+ * v * *
+ * PROCESS_PLUGIN_ACQUIRE*
+ * | * *
+ * | * *
+ * | v v v
+ * | PROCESS_PLUGIN_RELEASE
+ * | |
+ * | v
+ * | CLOSE_JOURNAL
+ * | |
+ * | v
+ * | CLOSE_OBJECT_MAP
+ * | |
+ * v |
+ * <finish> <----------/
+ *
+ * @endverbatim
+ */
+
+ PostAcquireRequest(ImageCtxT &image_ctx, Context *on_acquire,
+ Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_acquire;
+ Context *m_on_finish;
+
+ decltype(m_image_ctx.object_map) m_object_map;
+ decltype(m_image_ctx.journal) m_journal;
+
+ bool m_prepare_lock_completed = false;
+ int m_error_result;
+
+ void send_refresh();
+ void handle_refresh(int r);
+
+ void send_open_journal();
+ void handle_open_journal(int r);
+
+ void send_allocate_journal_tag();
+ void handle_allocate_journal_tag(int r);
+
+ void send_open_object_map();
+ void handle_open_object_map(int r);
+
+ void send_close_journal();
+ void handle_close_journal(int r);
+
+ void send_close_object_map();
+ void handle_close_object_map(int r);
+
+ void send_process_plugin_acquire_lock();
+ void handle_process_plugin_acquire_lock(int r);
+
+ void send_process_plugin_release_lock();
+ void handle_process_plugin_release_lock(int r);
+
+ void apply();
+ void revert();
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::PostAcquireRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H
diff --git a/src/librbd/exclusive_lock/PreAcquireRequest.cc b/src/librbd/exclusive_lock/PreAcquireRequest.cc
new file mode 100644
index 000000000..feb0913d7
--- /dev/null
+++ b/src/librbd/exclusive_lock/PreAcquireRequest.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/PreAcquireRequest.h"
+#include "librbd/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ImageState.h"
+#include "librbd/asio/ContextWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::PreAcquireRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+PreAcquireRequest<I>* PreAcquireRequest<I>::create(I &image_ctx,
+ Context *on_finish) {
+ return new PreAcquireRequest(image_ctx, on_finish);
+}
+
+template <typename I>
+PreAcquireRequest<I>::PreAcquireRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_error_result(0) {
+}
+
+template <typename I>
+PreAcquireRequest<I>::~PreAcquireRequest() {
+}
+
+template <typename I>
+void PreAcquireRequest<I>::send() {
+ send_prepare_lock();
+}
+
+template <typename I>
+void PreAcquireRequest<I>::send_prepare_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ // acquire the lock if the image is not busy performing other actions
+ Context *ctx = create_context_callback<
+ PreAcquireRequest<I>, &PreAcquireRequest<I>::handle_prepare_lock>(this);
+ m_image_ctx.state->prepare_lock(ctx);
+}
+
+template <typename I>
+void PreAcquireRequest<I>::handle_prepare_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ send_flush_notifies();
+}
+
+template <typename I>
+void PreAcquireRequest<I>::send_flush_notifies() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PreAcquireRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_flush_notifies>(
+ this);
+ m_image_ctx.image_watcher->flush(ctx);
+}
+
+template <typename I>
+void PreAcquireRequest<I>::handle_flush_notifies(int r) {
+ CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+ finish();
+}
+
+template <typename I>
+void PreAcquireRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::PreAcquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/PreAcquireRequest.h b/src/librbd/exclusive_lock/PreAcquireRequest.h
new file mode 100644
index 000000000..15d4b2c12
--- /dev/null
+++ b/src/librbd/exclusive_lock/PreAcquireRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "librbd/ImageCtx.h"
+#include "msg/msg_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+namespace exclusive_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class PreAcquireRequest {
+public:
+ static PreAcquireRequest* create(ImageCtxT &image_ctx, Context *on_finish);
+
+ ~PreAcquireRequest();
+ void send();
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * PREPARE_LOCK
+ * |
+ * v
+ * FLUSH_NOTIFIES
+ * |
+ * |
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ PreAcquireRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ int m_error_result;
+
+ void send_prepare_lock();
+ void handle_prepare_lock(int r);
+
+ void send_flush_notifies();
+ void handle_flush_notifies(int r);
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::PreAcquireRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H
diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.cc b/src/librbd/exclusive_lock/PreReleaseRequest.cc
new file mode 100644
index 000000000..a9cd1248a
--- /dev/null
+++ b/src/librbd/exclusive_lock/PreReleaseRequest.cc
@@ -0,0 +1,367 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/PreReleaseRequest.h"
+#include "common/AsyncOpTracker.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/exclusive_lock/ImageDispatch.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/Types.h"
+#include "librbd/PluginRegistry.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::PreReleaseRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+template <typename I>
+PreReleaseRequest<I>* PreReleaseRequest<I>::create(
+ I &image_ctx, ImageDispatch<I>* image_dispatch, bool shutting_down,
+ AsyncOpTracker &async_op_tracker, Context *on_finish) {
+ return new PreReleaseRequest(image_ctx, image_dispatch, shutting_down,
+ async_op_tracker, on_finish);
+}
+
+template <typename I>
+PreReleaseRequest<I>::PreReleaseRequest(I &image_ctx,
+ ImageDispatch<I>* image_dispatch,
+ bool shutting_down,
+ AsyncOpTracker &async_op_tracker,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_image_dispatch(image_dispatch),
+ m_shutting_down(shutting_down), m_async_op_tracker(async_op_tracker),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)) {
+}
+
+template <typename I>
+PreReleaseRequest<I>::~PreReleaseRequest() {
+ if (!m_shutting_down) {
+ m_image_ctx.state->handle_prepare_lock_complete();
+ }
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send() {
+ send_cancel_op_requests();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_cancel_op_requests() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PreReleaseRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_cancel_op_requests>(this);
+ m_image_ctx.cancel_async_requests(ctx);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_cancel_op_requests(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+
+ send_set_require_lock();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_set_require_lock() {
+ if (!m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+ // exclusive-lock was disabled, no need to block IOs
+ send_wait_for_ops();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PreReleaseRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_set_require_lock>(this);
+
+ // setting the lock as required will automatically cause the IO
+ // queue to re-request the lock if any IO is queued
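+  // reads are also blocked when they can mutate the image (copy-on-read) or
+  // depend on state only the lock owner can see (journal, dirty cache)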
+ if (m_image_ctx.clone_copy_on_read ||
+ m_image_ctx.test_features(RBD_FEATURE_JOURNALING) ||
+ m_image_ctx.test_features(RBD_FEATURE_DIRTY_CACHE)) {
+ m_image_dispatch->set_require_lock(m_shutting_down,
+ io::DIRECTION_BOTH, ctx);
+ } else {
+ m_image_dispatch->set_require_lock(m_shutting_down,
+ io::DIRECTION_WRITE, ctx);
+ }
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_set_require_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ // IOs are still flushed regardless of the error
+ lderr(cct) << "failed to set lock: " << cpp_strerror(r) << dendl;
+ }
+
+ send_wait_for_ops();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_wait_for_ops() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ Context *ctx = create_context_callback<
+ PreReleaseRequest<I>, &PreReleaseRequest<I>::handle_wait_for_ops>(this);
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_wait_for_ops(int r) {
+ CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+ send_prepare_lock();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_prepare_lock() {
+ if (m_shutting_down) {
+ send_process_plugin_release_lock();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ // release the lock if the image is not busy performing other actions
+ Context *ctx = create_context_callback<
+ PreReleaseRequest<I>, &PreReleaseRequest<I>::handle_prepare_lock>(this);
+ m_image_ctx.state->prepare_lock(ctx);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_prepare_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ send_process_plugin_release_lock();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_process_plugin_release_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ Context *ctx = create_async_context_callback(m_image_ctx, create_context_callback<
+ PreReleaseRequest<I>,
+ &PreReleaseRequest<I>::handle_process_plugin_release_lock>(this));
+ m_image_ctx.plugin_registry->prerelease_exclusive_lock(ctx);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_process_plugin_release_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to handle plugins before releasing lock: "
+ << cpp_strerror(r) << dendl;
+ m_image_dispatch->unset_require_lock(io::DIRECTION_BOTH);
+ save_result(r);
+ finish();
+ return;
+ }
+
+ send_invalidate_cache();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_invalidate_cache() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ Context *ctx = create_context_callback<
+ PreReleaseRequest<I>,
+ &PreReleaseRequest<I>::handle_invalidate_cache>(this);
+ m_image_ctx.io_image_dispatcher->invalidate_cache(ctx);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_invalidate_cache(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -EBLOCKLISTED && r != -EBUSY) {
+ lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r)
+ << dendl;
+ m_image_dispatch->unset_require_lock(io::DIRECTION_BOTH);
+ save_result(r);
+ finish();
+ return;
+ }
+
+ send_flush_io();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_flush_io() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ // ensure that all in-flight IO is flushed -- skipping the refresh layer
+ // since it should have been flushed when the lock was required and now
+ // refreshes are disabled / interlocked w/ this state machine.
+ auto ctx = create_context_callback<
+ PreReleaseRequest<I>, &PreReleaseRequest<I>::handle_flush_io>(this);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ ctx, util::get_image_ctx(&m_image_ctx), librbd::io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ m_image_ctx, io::IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK, aio_comp,
+ io::FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH, {});
+ req->send();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_flush_io(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to flush IO: " << cpp_strerror(r) << dendl;
+ }
+
+ send_flush_notifies();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_flush_notifies() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PreReleaseRequest<I>;
+ Context *ctx =
+ create_context_callback<klass, &klass::handle_flush_notifies>(this);
+ m_image_ctx.image_watcher->flush(ctx);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_flush_notifies(int r) {
+ CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+ send_close_journal();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_close_journal() {
+ {
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ std::swap(m_journal, m_image_ctx.journal);
+ }
+
+ if (m_journal == nullptr) {
+ send_close_object_map();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PreReleaseRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_close_journal>(
+ this);
+ m_journal->close(ctx);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_close_journal(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ // error implies some journal events were not flushed -- continue
+ lderr(cct) << "failed to close journal: " << cpp_strerror(r) << dendl;
+ }
+
+ m_journal->put();
+ m_journal = nullptr;
+
+ send_close_object_map();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_close_object_map() {
+ {
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ std::swap(m_object_map, m_image_ctx.object_map);
+ }
+
+ if (m_object_map == nullptr) {
+ send_unlock();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = PreReleaseRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_close_object_map>(this, m_object_map);
+ m_object_map->close(ctx);
+}
+
+template <typename I>
+void PreReleaseRequest<I>::handle_close_object_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to close object map: " << cpp_strerror(r) << dendl;
+ }
+ m_object_map->put();
+
+ send_unlock();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::send_unlock() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
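+  // the lock itself is released by the managed lock state machine once
+  // this pre-release request completes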
+ finish();
+}
+
+template <typename I>
+void PreReleaseRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::PreReleaseRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.h b/src/librbd/exclusive_lock/PreReleaseRequest.h
new file mode 100644
index 000000000..426337943
--- /dev/null
+++ b/src/librbd/exclusive_lock/PreReleaseRequest.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H
+
+#include "librbd/ImageCtx.h"
+#include <string>
+
+class AsyncOpTracker;
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace exclusive_lock {
+
+template <typename> struct ImageDispatch;
+
+template <typename ImageCtxT = ImageCtx>
+class PreReleaseRequest {
+public:
+ static PreReleaseRequest* create(ImageCtxT &image_ctx,
+ ImageDispatch<ImageCtxT>* image_dispatch,
+ bool shutting_down,
+ AsyncOpTracker &async_op_tracker,
+ Context *on_finish);
+
+ ~PreReleaseRequest();
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * CANCEL_OP_REQUESTS
+ * |
+ * v
+ * SET_REQUIRE_LOCK
+ * |
+ * v
+ * WAIT_FOR_OPS
+ * |
+ * v
+ * PREPARE_LOCK
+ * |
+ * v
+ * PROCESS_PLUGIN_RELEASE
+ * |
+ * v
+ * SHUT_DOWN_IMAGE_CACHE
+ * |
+ * v
+ * INVALIDATE_CACHE
+ * |
+ * v
+ * FLUSH_IO
+ * |
+ * v
+ * FLUSH_NOTIFIES . . . . . . . . . . . . . .
+ * | .
+ * v .
+ * CLOSE_JOURNAL .
+ * | (journal disabled, .
+ * v object map enabled) .
+ * CLOSE_OBJECT_MAP < . . . . . . . . . . . .
+ * | .
+ * v (object map disabled) .
+ * <finish> < . . . . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ PreReleaseRequest(ImageCtxT &image_ctx,
+ ImageDispatch<ImageCtxT>* image_dispatch,
+ bool shutting_down, AsyncOpTracker &async_op_tracker,
+ Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ ImageDispatch<ImageCtxT>* m_image_dispatch;
+ bool m_shutting_down;
+ AsyncOpTracker &m_async_op_tracker;
+ Context *m_on_finish;
+
+ int m_error_result = 0;
+
+ decltype(m_image_ctx.object_map) m_object_map = nullptr;
+ decltype(m_image_ctx.journal) m_journal = nullptr;
+
+ void send_cancel_op_requests();
+ void handle_cancel_op_requests(int r);
+
+ void send_set_require_lock();
+ void handle_set_require_lock(int r);
+
+ void send_wait_for_ops();
+ void handle_wait_for_ops(int r);
+
+ void send_prepare_lock();
+ void handle_prepare_lock(int r);
+
+ void send_process_plugin_release_lock();
+ void handle_process_plugin_release_lock(int r);
+
+ void send_invalidate_cache();
+ void handle_invalidate_cache(int r);
+
+ void send_flush_io();
+ void handle_flush_io(int r);
+
+ void send_flush_notifies();
+ void handle_flush_notifies(int r);
+
+ void send_close_journal();
+ void handle_close_journal(int r);
+
+ void send_close_object_map();
+ void handle_close_object_map(int r);
+
+ void send_unlock();
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H
diff --git a/src/librbd/exclusive_lock/StandardPolicy.cc b/src/librbd/exclusive_lock/StandardPolicy.cc
new file mode 100644
index 000000000..519e9618e
--- /dev/null
+++ b/src/librbd/exclusive_lock/StandardPolicy.cc
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/StandardPolicy.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ExclusiveLock.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ExclusiveLock::StandardPolicy "
+
+namespace librbd {
+namespace exclusive_lock {
+
+template <typename I>
+int StandardPolicy<I>::lock_requested(bool force) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock));
+ ceph_assert(m_image_ctx->exclusive_lock != nullptr);
+
+ ldout(m_image_ctx->cct, 20) << this << " " << __func__ << ": force=" << force
+ << dendl;
+
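+  // standard policy: never release the lock in response to a peer request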
+ return -EROFS;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::StandardPolicy<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/StandardPolicy.h b/src/librbd/exclusive_lock/StandardPolicy.h
new file mode 100644
index 000000000..dd4e19050
--- /dev/null
+++ b/src/librbd/exclusive_lock/StandardPolicy.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
+
+#include "librbd/exclusive_lock/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace exclusive_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class StandardPolicy : public Policy {
+public:
+ StandardPolicy(ImageCtxT* image_ctx) : m_image_ctx(image_ctx) {
+ }
+
+ bool may_auto_request_lock() override {
+ return false;
+ }
+
+ int lock_requested(bool force) override;
+
+private:
+ ImageCtxT* m_image_ctx;
+
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::StandardPolicy<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
diff --git a/src/librbd/image/AttachChildRequest.cc b/src/librbd/image/AttachChildRequest.cc
new file mode 100644
index 000000000..2f74191ed
--- /dev/null
+++ b/src/librbd/image/AttachChildRequest.cc
@@ -0,0 +1,264 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/AttachChildRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RefreshRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::AttachChildRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+AttachChildRequest<I>::AttachChildRequest(
+ I *image_ctx, I *parent_image_ctx, const librados::snap_t &parent_snap_id,
+ I *old_parent_image_ctx, const librados::snap_t &old_parent_snap_id,
+ uint32_t clone_format, Context* on_finish)
+ : m_image_ctx(image_ctx), m_parent_image_ctx(parent_image_ctx),
+ m_parent_snap_id(parent_snap_id),
+ m_old_parent_image_ctx(old_parent_image_ctx),
+ m_old_parent_snap_id(old_parent_snap_id), m_clone_format(clone_format),
+ m_on_finish(on_finish), m_cct(m_image_ctx->cct) {
+}
+
+template <typename I>
+void AttachChildRequest<I>::send() {
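+  // clone v1 registers the child in the pool-level RBD_CHILDREN object and
+  // requires a protected parent snapshot; clone v2 records the attachment
+  // in the parent image's header object instead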
+ if (m_clone_format == 1) {
+ v1_add_child();
+ } else {
+ v2_set_op_feature();
+ }
+}
+
+template <typename I>
+void AttachChildRequest<I>::v1_add_child() {
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::add_child(&op, {m_parent_image_ctx->md_ctx.get_id(), "",
+ m_parent_image_ctx->id,
+ m_parent_snap_id}, m_image_ctx->id);
+
+ using klass = AttachChildRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_v1_add_child>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(RBD_CHILDREN, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v1_add_child(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (r == -EEXIST && m_old_parent_image_ctx != nullptr) {
+ ldout(m_cct, 5) << "child already exists" << dendl;
+ } else {
+ lderr(m_cct) << "couldn't add child: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ }
+
+ v1_refresh();
+}
+
+template <typename I>
+void AttachChildRequest<I>::v1_refresh() {
+ ldout(m_cct, 15) << dendl;
+
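+  // re-read the parent header so the snapshot protection check that follows
+  // sees the current state rather than a stale cache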
+ using klass = AttachChildRequest<I>;
+ RefreshRequest<I> *req = RefreshRequest<I>::create(
+ *m_parent_image_ctx, false, false,
+ create_context_callback<klass, &klass::handle_v1_refresh>(this));
+ req->send();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v1_refresh(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ bool snap_protected = false;
+ if (r == 0) {
+ std::shared_lock image_locker{m_parent_image_ctx->image_lock};
+ r = m_parent_image_ctx->is_snap_protected(m_parent_snap_id,
+ &snap_protected);
+ }
+
+ if (r < 0 || !snap_protected) {
+ lderr(m_cct) << "validate protected failed" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ v1_remove_child_from_old_parent();
+}
+
+template <typename I>
+void AttachChildRequest<I>::v1_remove_child_from_old_parent() {
+ if (m_old_parent_image_ctx == nullptr) {
+ finish(0);
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::remove_child(&op, {m_old_parent_image_ctx->md_ctx.get_id(),
+ m_old_parent_image_ctx->md_ctx.get_namespace(),
+ m_old_parent_image_ctx->id,
+ m_old_parent_snap_id}, m_image_ctx->id);
+
+ using klass = AttachChildRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v1_remove_child_from_old_parent>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(RBD_CHILDREN, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v1_remove_child_from_old_parent(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "couldn't remove child: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void AttachChildRequest<I>::v2_set_op_feature() {
+ ldout(m_cct, 15) << dendl;
+
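+  // set the CLONE_CHILD op feature on the child's header (first argument is
+  // the feature bits, second the mask of bits being updated)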
+ librados::ObjectWriteOperation op;
+ cls_client::op_features_set(&op, RBD_OPERATION_FEATURE_CLONE_CHILD,
+ RBD_OPERATION_FEATURE_CLONE_CHILD);
+
+ using klass = AttachChildRequest<I>;
+ auto aio_comp = create_rados_callback<
+ klass, &klass::handle_v2_set_op_feature>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp,
+ &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v2_set_op_feature(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to enable clone v2: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ v2_child_attach();
+}
+
+template <typename I>
+void AttachChildRequest<I>::v2_child_attach() {
+ ldout(m_cct, 15) << dendl;
+
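+  // record this image as a child of the parent snapshot in the parent's
+  // header object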
+ librados::ObjectWriteOperation op;
+ cls_client::child_attach(&op, m_parent_snap_id,
+ {m_image_ctx->md_ctx.get_id(),
+ m_image_ctx->md_ctx.get_namespace(),
+ m_image_ctx->id});
+
+ using klass = AttachChildRequest<I>;
+ auto aio_comp = create_rados_callback<
+ klass, &klass::handle_v2_child_attach>(this);
+ int r = m_parent_image_ctx->md_ctx.aio_operate(m_parent_image_ctx->header_oid,
+ aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v2_child_attach(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (r == -EEXIST && m_old_parent_image_ctx != nullptr) {
+ ldout(m_cct, 5) << "child already exists" << dendl;
+ } else {
+ lderr(m_cct) << "failed to attach child image: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+ }
+
+ v2_child_detach_from_old_parent();
+}
+
+template <typename I>
+void AttachChildRequest<I>::v2_child_detach_from_old_parent() {
+ if (m_old_parent_image_ctx == nullptr) {
+ finish(0);
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::child_detach(&op, m_old_parent_snap_id,
+ {m_image_ctx->md_ctx.get_id(),
+ m_image_ctx->md_ctx.get_namespace(),
+ m_image_ctx->id});
+
+ using klass = AttachChildRequest<I>;
+ auto aio_comp = create_rados_callback<
+ klass, &klass::handle_v2_child_detach_from_old_parent>(this);
+ int r = m_old_parent_image_ctx->md_ctx.aio_operate(
+ m_old_parent_image_ctx->header_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void AttachChildRequest<I>::handle_v2_child_detach_from_old_parent(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to detach child image: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void AttachChildRequest<I>::finish(int r) {
+ ldout(m_cct, 5) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::AttachChildRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/AttachChildRequest.h b/src/librbd/image/AttachChildRequest.h
new file mode 100644
index 000000000..a40afaf54
--- /dev/null
+++ b/src/librbd/image/AttachChildRequest.h
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H
+
+#include "include/common_fwd.h"
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class AttachChildRequest {
+public:
+ static AttachChildRequest* create(ImageCtxT *image_ctx,
+ ImageCtxT *parent_image_ctx,
+ const librados::snap_t &parent_snap_id,
+ ImageCtxT *old_parent_image_ctx,
+ const librados::snap_t &old_parent_snap_id,
+ uint32_t clone_format,
+ Context* on_finish) {
+ return new AttachChildRequest(image_ctx, parent_image_ctx, parent_snap_id,
+ old_parent_image_ctx, old_parent_snap_id,
+ clone_format, on_finish);
+ }
+
+ AttachChildRequest(ImageCtxT *image_ctx,
+ ImageCtxT *parent_image_ctx,
+ const librados::snap_t &parent_snap_id,
+ ImageCtxT *old_parent_image_ctx,
+ const librados::snap_t &old_parent_snap_id,
+ uint32_t clone_format,
+ Context* on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * (clone v1) | (clone v2)
+ * /----------------/ \---------------\
+ * | |
+ * v v
+ * V1 ADD CHILD                  V2 SET OP FEATURE
+ * | |
+ * v v
+ * V1 VALIDATE PROTECTED V2 ATTACH CHILD
+ * | |
+ * | v
+ * V1 REMOVE CHILD FROM OLD PARENT V2 DETACH CHILD FROM OLD PARENT
+ * | |
+ * \----------------\ /---------------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ ImageCtxT *m_parent_image_ctx;
+ librados::snap_t m_parent_snap_id;
+ ImageCtxT *m_old_parent_image_ctx;
+ librados::snap_t m_old_parent_snap_id;
+ uint32_t m_clone_format;
+ Context* m_on_finish;
+
+ CephContext *m_cct;
+
+ void v1_add_child();
+ void handle_v1_add_child(int r);
+
+ void v1_refresh();
+ void handle_v1_refresh(int r);
+
+ void v1_remove_child_from_old_parent();
+ void handle_v1_remove_child_from_old_parent(int r);
+
+ void v2_set_op_feature();
+ void handle_v2_set_op_feature(int r);
+
+ void v2_child_attach();
+ void handle_v2_child_attach(int r);
+
+ void v2_child_detach_from_old_parent();
+ void handle_v2_child_detach_from_old_parent(int r);
+
+ void finish(int r);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::AttachChildRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H
diff --git a/src/librbd/image/AttachParentRequest.cc b/src/librbd/image/AttachParentRequest.cc
new file mode 100644
index 000000000..d0c35b6a9
--- /dev/null
+++ b/src/librbd/image/AttachParentRequest.cc
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/AttachParentRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::AttachParentRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_rados_callback;
+
+template <typename I>
+void AttachParentRequest<I>::send() {
+ attach_parent();
+}
+
+template <typename I>
+void AttachParentRequest<I>::attach_parent() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "parent_image_spec=" << m_parent_image_spec << dendl;
+
+ librados::ObjectWriteOperation op;
+ if (!m_legacy_parent) {
+ librbd::cls_client::parent_attach(&op, m_parent_image_spec,
+ m_parent_overlap, m_reattach);
+ } else {
+ librbd::cls_client::set_parent(&op, m_parent_image_spec, m_parent_overlap);
+ }
+
+ auto aio_comp = create_rados_callback<
+ AttachParentRequest<I>,
+ &AttachParentRequest<I>::handle_attach_parent>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void AttachParentRequest<I>::handle_attach_parent(int r) {
+ auto cct = m_image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
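+  // an OSD without the parent_attach class method returns -EOPNOTSUPP; fall
+  // back to the legacy set_parent call, which cannot encode a pool namespace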
+ if (!m_legacy_parent && r == -EOPNOTSUPP && !m_reattach) {
+ if (m_parent_image_spec.pool_namespace ==
+ m_image_ctx.md_ctx.get_namespace()) {
+ m_parent_image_spec.pool_namespace = "";
+ }
+ if (m_parent_image_spec.pool_namespace.empty()) {
+ ldout(cct, 10) << "retrying using legacy parent method" << dendl;
+ m_legacy_parent = true;
+ attach_parent();
+ return;
+ }
+
+ // namespaces require newer OSDs
+ r = -EXDEV;
+ }
+
+ if (r < 0) {
+ lderr(cct) << "attach parent encountered an error: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void AttachParentRequest<I>::finish(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::AttachParentRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/AttachParentRequest.h b/src/librbd/image/AttachParentRequest.h
new file mode 100644
index 000000000..482e03273
--- /dev/null
+++ b/src/librbd/image/AttachParentRequest.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class AttachParentRequest {
+public:
+ static AttachParentRequest* create(ImageCtxT& image_ctx,
+ const cls::rbd::ParentImageSpec& pspec,
+ uint64_t parent_overlap,
+ bool reattach,
+ Context* on_finish) {
+ return new AttachParentRequest(image_ctx, pspec, parent_overlap, reattach,
+ on_finish);
+ }
+
+ AttachParentRequest(ImageCtxT& image_ctx,
+ const cls::rbd::ParentImageSpec& pspec,
+ uint64_t parent_overlap, bool reattach,
+ Context* on_finish)
+ : m_image_ctx(image_ctx), m_parent_image_spec(pspec),
+ m_parent_overlap(parent_overlap), m_reattach(reattach),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | * * * * * *
+ * | * * -EOPNOTSUPP
+ * v v *
+ * ATTACH_PARENT * * *
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT& m_image_ctx;
+ cls::rbd::ParentImageSpec m_parent_image_spec;
+ uint64_t m_parent_overlap;
+ bool m_reattach;
+ Context* m_on_finish;
+
+ bool m_legacy_parent = false;
+
+ void attach_parent();
+ void handle_attach_parent(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::AttachParentRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H
diff --git a/src/librbd/image/CloneRequest.cc b/src/librbd/image/CloneRequest.cc
new file mode 100644
index 000000000..7a955f064
--- /dev/null
+++ b/src/librbd/image/CloneRequest.cc
@@ -0,0 +1,607 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/deep_copy/MetadataCopyRequest.h"
+#include "librbd/image/AttachChildRequest.h"
+#include "librbd/image/AttachParentRequest.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/CreateRequest.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/image/Types.h"
+#include "librbd/mirror/EnableRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CloneRequest: " << this << " " \
+ << __func__ << ": "
+
+#define MAX_KEYS 64
+
+namespace librbd {
+namespace image {
+
+using util::create_rados_callback;
+using util::create_context_callback;
+using util::create_async_context_callback;
+
+template <typename I>
+CloneRequest<I>::CloneRequest(
+ ConfigProxy& config,
+ IoCtx& parent_io_ctx,
+ const std::string& parent_image_id,
+ const std::string& parent_snap_name,
+ const cls::rbd::SnapshotNamespace& parent_snap_namespace,
+ uint64_t parent_snap_id,
+ IoCtx &c_ioctx,
+ const std::string &c_name,
+ const std::string &c_id,
+ ImageOptions c_options,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ asio::ContextWQ *op_work_queue, Context *on_finish)
+ : m_config(config), m_parent_io_ctx(parent_io_ctx),
+ m_parent_image_id(parent_image_id), m_parent_snap_name(parent_snap_name),
+ m_parent_snap_namespace(parent_snap_namespace),
+ m_parent_snap_id(parent_snap_id), m_ioctx(c_ioctx), m_name(c_name),
+ m_id(c_id), m_opts(c_options), m_mirror_image_mode(mirror_image_mode),
+ m_non_primary_global_image_id(non_primary_global_image_id),
+ m_primary_mirror_uuid(primary_mirror_uuid),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish),
+ m_use_p_features(true) {
+
+ m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+
+ bool default_format_set;
+ m_opts.is_set(RBD_IMAGE_OPTION_FORMAT, &default_format_set);
+ if (!default_format_set) {
+ m_opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2));
+ }
+
+ ldout(m_cct, 20) << "parent_pool_id=" << parent_io_ctx.get_id() << ", "
+ << "parent_image_id=" << parent_image_id << ", "
+ << "parent_snap=" << parent_snap_name << "/"
+ << parent_snap_id << " clone to "
+ << "pool_id=" << m_ioctx.get_id() << ", "
+ << "name=" << m_name << ", "
+ << "opts=" << m_opts << dendl;
+}
+
+template <typename I>
+void CloneRequest<I>::send() {
+ ldout(m_cct, 20) << dendl;
+ validate_options();
+}
+
+template <typename I>
+void CloneRequest<I>::validate_options() {
+ ldout(m_cct, 20) << dendl;
+
+ uint64_t format = 0;
+ m_opts.get(RBD_IMAGE_OPTION_FORMAT, &format);
+ if (format < 2) {
+ lderr(m_cct) << "format 2 or later required for clone" << dendl;
+ complete(-EINVAL);
+ return;
+ }
+
+ if (m_opts.get(RBD_IMAGE_OPTION_FEATURES, &m_features) == 0) {
+ if (m_features & ~RBD_FEATURES_ALL) {
+ lderr(m_cct) << "librbd does not support requested features" << dendl;
+ complete(-ENOSYS);
+ return;
+ }
+ m_use_p_features = false;
+ }
+
+ if (m_opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &m_clone_format) < 0) {
+ std::string default_clone_format = m_config.get_val<std::string>(
+ "rbd_default_clone_format");
+ if (default_clone_format == "1") {
+ m_clone_format = 1;
+ } else if (default_clone_format == "auto") {
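+      // "auto" keeps clone v1 only when the cluster may still admit
+      // pre-mimic clients, which cannot interpret clone v2 metadata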
+ librados::Rados rados(m_ioctx);
+ int8_t min_compat_client;
+ int8_t require_min_compat_client;
+ int r = rados.get_min_compatible_client(&min_compat_client,
+ &require_min_compat_client);
+ if (r < 0) {
+ complete(r);
+ return;
+ }
+ if (std::max(min_compat_client, require_min_compat_client) <
+ CEPH_RELEASE_MIMIC) {
+ m_clone_format = 1;
+ }
+ }
+ }
+
+ if (m_clone_format == 1 &&
+ m_parent_io_ctx.get_namespace() != m_ioctx.get_namespace()) {
+ ldout(m_cct, 1) << "clone v2 required for cross-namespace clones" << dendl;
+ complete(-EXDEV);
+ return;
+ }
+
+ open_parent();
+}
+
+template <typename I>
+void CloneRequest<I>::open_parent() {
+ ldout(m_cct, 20) << dendl;
+ ceph_assert(m_parent_snap_name.empty() ^ (m_parent_snap_id == CEPH_NOSNAP));
+
+ if (m_parent_snap_id != CEPH_NOSNAP) {
+ m_parent_image_ctx = I::create("", m_parent_image_id, m_parent_snap_id,
+ m_parent_io_ctx, true);
+ } else {
+ m_parent_image_ctx = I::create("", m_parent_image_id,
+ m_parent_snap_name.c_str(),
+ m_parent_io_ctx,
+ true);
+ m_parent_image_ctx->snap_namespace = m_parent_snap_namespace;
+ }
+
+ Context *ctx = create_context_callback<
+ CloneRequest<I>, &CloneRequest<I>::handle_open_parent>(this);
+ m_parent_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx);
+}
+
+template <typename I>
+void CloneRequest<I>::handle_open_parent(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_parent_image_ctx = nullptr;
+
+ lderr(m_cct) << "failed to open parent image: " << cpp_strerror(r) << dendl;
+ complete(r);
+ return;
+ }
+
+ m_parent_snap_id = m_parent_image_ctx->snap_id;
+ m_pspec = {m_parent_io_ctx.get_id(), m_parent_io_ctx.get_namespace(),
+ m_parent_image_id, m_parent_snap_id};
+ validate_parent();
+}
+
+template <typename I>
+void CloneRequest<I>::validate_parent() {
+ ldout(m_cct, 20) << dendl;
+
+ if (m_parent_image_ctx->operations_disabled) {
+ lderr(m_cct) << "image operations disabled due to unsupported op features"
+ << dendl;
+ m_r_saved = -EROFS;
+ close_parent();
+ return;
+ }
+
+ if (m_parent_image_ctx->snap_id == CEPH_NOSNAP) {
+ lderr(m_cct) << "image to be cloned must be a snapshot" << dendl;
+ m_r_saved = -EINVAL;
+ close_parent();
+ return;
+ }
+
+ if (m_parent_image_ctx->old_format) {
+ lderr(m_cct) << "parent image must be in new format" << dendl;
+ m_r_saved = -EINVAL;
+ close_parent();
+ return;
+ }
+
+ m_parent_image_ctx->image_lock.lock_shared();
+ uint64_t p_features = m_parent_image_ctx->features;
+ m_size = m_parent_image_ctx->get_image_size(m_parent_image_ctx->snap_id);
+
+ bool snap_protected;
+ int r = m_parent_image_ctx->is_snap_protected(m_parent_image_ctx->snap_id, &snap_protected);
+ m_parent_image_ctx->image_lock.unlock_shared();
+
+ if ((p_features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) {
+ lderr(m_cct) << "parent image must support layering" << dendl;
+ m_r_saved = -ENOSYS;
+ close_parent();
+ return;
+ }
+ if (m_use_p_features) {
+ m_features = p_features;
+ }
+
+ if (r < 0) {
+ lderr(m_cct) << "unable to locate parent's snapshot" << dendl;
+ m_r_saved = r;
+ close_parent();
+ return;
+ }
+
+ if (m_clone_format == 1 && !snap_protected) {
+ lderr(m_cct) << "parent snapshot must be protected" << dendl;
+ m_r_saved = -EINVAL;
+ close_parent();
+ return;
+ }
+
+ validate_child();
+}
+
+template <typename I>
+void CloneRequest<I>::validate_child() {
+ ldout(m_cct, 15) << dendl;
+
+ if ((m_features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) {
+ lderr(m_cct) << "cloning image must support layering" << dendl;
+ m_r_saved = -ENOSYS;
+ close_parent();
+ return;
+ }
+
+ using klass = CloneRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_validate_child>(this);
+
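+  // probe for an old-format header object under the target name; anything
+  // other than -ENOENT means the name is already claimed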
+ librados::ObjectReadOperation op;
+ op.stat(NULL, NULL, NULL);
+
+ int r = m_ioctx.aio_operate(util::old_header_name(m_name), comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void CloneRequest<I>::handle_validate_child(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r != -ENOENT) {
+ lderr(m_cct) << "rbd image " << m_name << " already exists" << dendl;
+ m_r_saved = r;
+ close_parent();
+ return;
+ }
+
+ create_child();
+}
+
+template <typename I>
+void CloneRequest<I>::create_child() {
+ ldout(m_cct, 15) << dendl;
+
+ uint64_t order = m_parent_image_ctx->order;
+ if (m_opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+ m_opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ }
+ m_opts.set(RBD_IMAGE_OPTION_FEATURES, m_features);
+
+ uint64_t stripe_unit = m_parent_image_ctx->stripe_unit;
+ if (m_opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
+ m_opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ }
+
+ uint64_t stripe_count = m_parent_image_ctx->stripe_count;
+ if (m_opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
+ m_opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ }
+
+ using klass = CloneRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_create_child>(this);
+
+ auto req = CreateRequest<I>::create(
+ m_config, m_ioctx, m_name, m_id, m_size, m_opts,
+ image::CREATE_FLAG_SKIP_MIRROR_ENABLE,
+ cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, m_non_primary_global_image_id,
+ m_primary_mirror_uuid, m_op_work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void CloneRequest<I>::handle_create_child(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
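+  // CreateRequest maps an image id collision to -EBADF (see
+  // handle_create_image) so it can be distinguished from a name collision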
+ if (r == -EBADF) {
+ ldout(m_cct, 5) << "image id already in-use" << dendl;
+ complete(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "error creating child: " << cpp_strerror(r) << dendl;
+ m_r_saved = r;
+ close_parent();
+ return;
+ }
+ open_child();
+}
+
+template <typename I>
+void CloneRequest<I>::open_child() {
+ ldout(m_cct, 15) << dendl;
+
+ m_imctx = I::create(m_name, "", nullptr, m_ioctx, false);
+
+ using klass = CloneRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_open_child>(this);
+
+ uint64_t flags = OPEN_FLAG_SKIP_OPEN_PARENT;
+ if ((m_features & RBD_FEATURE_MIGRATING) != 0) {
+ flags |= OPEN_FLAG_IGNORE_MIGRATING;
+ }
+
+ m_imctx->state->open(flags, ctx);
+}
+
+template <typename I>
+void CloneRequest<I>::handle_open_child(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_imctx = nullptr;
+
+ lderr(m_cct) << "Error opening new image: " << cpp_strerror(r) << dendl;
+ m_r_saved = r;
+ remove_child();
+ return;
+ }
+
+ attach_parent();
+}
+
+template <typename I>
+void CloneRequest<I>::attach_parent() {
+ ldout(m_cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ CloneRequest<I>, &CloneRequest<I>::handle_attach_parent>(this);
+ auto req = AttachParentRequest<I>::create(
+ *m_imctx, m_pspec, m_size, false, ctx);
+ req->send();
+}
+
+template <typename I>
+void CloneRequest<I>::handle_attach_parent(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl;
+ m_r_saved = r;
+ close_child();
+ return;
+ }
+
+ attach_child();
+}
+
+template <typename I>
+void CloneRequest<I>::attach_child() {
+ ldout(m_cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ CloneRequest<I>, &CloneRequest<I>::handle_attach_child>(this);
+ auto req = AttachChildRequest<I>::create(
+ m_imctx, m_parent_image_ctx, m_parent_image_ctx->snap_id, nullptr, 0,
+ m_clone_format, ctx);
+ req->send();
+}
+
+template <typename I>
+void CloneRequest<I>::handle_attach_child(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl;
+ m_r_saved = r;
+ close_child();
+ return;
+ }
+
+ copy_metadata();
+}
+
+template <typename I>
+void CloneRequest<I>::copy_metadata() {
+ ldout(m_cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ CloneRequest<I>, &CloneRequest<I>::handle_copy_metadata>(this);
+ auto req = deep_copy::MetadataCopyRequest<I>::create(
+ m_parent_image_ctx, m_imctx, ctx);
+ req->send();
+}
+
+template <typename I>
+void CloneRequest<I>::handle_copy_metadata(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
+ m_r_saved = r;
+ close_child();
+ return;
+ }
+
+ get_mirror_mode();
+}
+
+template <typename I>
+void CloneRequest<I>::get_mirror_mode() {
+ ldout(m_cct, 15) << dendl;
+
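+  // mirroring is enabled unconditionally for non-primary images and when an
+  // explicit per-image mode was requested; otherwise fall back to the pool
+  // mirror mode, skipping the query entirely if journaling is disabled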
+ uint64_t mirror_image_mode;
+ if (!m_non_primary_global_image_id.empty()) {
+ enable_mirror();
+ return;
+ } else if (m_opts.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE,
+ &mirror_image_mode) == 0) {
+ m_mirror_image_mode = static_cast<cls::rbd::MirrorImageMode>(
+ mirror_image_mode);
+ enable_mirror();
+ return;
+ } else if (!m_imctx->test_features(RBD_FEATURE_JOURNALING)) {
+ close_child();
+ return;
+ }
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_mode_get_start(&op);
+
+ using klass = CloneRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_get_mirror_mode>(this);
+ m_out_bl.clear();
+ m_imctx->md_ctx.aio_operate(RBD_MIRRORING,
+ comp, &op, &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+void CloneRequest<I>::handle_get_mirror_mode(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode);
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+ << dendl;
+
+ m_r_saved = r;
+ } else if (m_mirror_mode == cls::rbd::MIRROR_MODE_POOL) {
+ m_mirror_image_mode = cls::rbd::MIRROR_IMAGE_MODE_JOURNAL;
+ enable_mirror();
+ return;
+ }
+
+ close_child();
+}
+
+template <typename I>
+void CloneRequest<I>::enable_mirror() {
+ ldout(m_cct, 15) << dendl;
+
+ using klass = CloneRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_enable_mirror>(this);
+ auto req = mirror::EnableRequest<I>::create(
+ m_imctx, m_mirror_image_mode, m_non_primary_global_image_id, true, ctx);
+ req->send();
+}
+
+template <typename I>
+void CloneRequest<I>::handle_enable_mirror(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r)
+ << dendl;
+ m_r_saved = r;
+ }
+ close_child();
+}
+
+template <typename I>
+void CloneRequest<I>::close_child() {
+ ldout(m_cct, 15) << dendl;
+
+ ceph_assert(m_imctx != nullptr);
+
+ auto ctx = create_context_callback<
+ CloneRequest<I>, &CloneRequest<I>::handle_close_child>(this);
+ m_imctx->state->close(ctx);
+}
+
+template <typename I>
+void CloneRequest<I>::handle_close_child(int r) {
+  ldout(m_cct, 15) << "r=" << r << dendl;
+
+ m_imctx = nullptr;
+
+ if (r < 0) {
+ lderr(m_cct) << "couldn't close image: " << cpp_strerror(r) << dendl;
+ if (m_r_saved == 0) {
+ m_r_saved = r;
+ }
+ }
+
+ if (m_r_saved < 0) {
+ remove_child();
+ return;
+ }
+
+ close_parent();
+}
+
+template <typename I>
+void CloneRequest<I>::remove_child() {
+ ldout(m_cct, 15) << dendl;
+
+ using klass = CloneRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_remove_child>(this);
+
+ auto req = librbd::image::RemoveRequest<I>::create(
+ m_ioctx, m_name, m_id, false, false, m_no_op, m_op_work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void CloneRequest<I>::handle_remove_child(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "Error removing failed clone: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ close_parent();
+}
+
+template <typename I>
+void CloneRequest<I>::close_parent() {
+ ldout(m_cct, 20) << dendl;
+ ceph_assert(m_parent_image_ctx != nullptr);
+
+ auto ctx = create_context_callback<
+ CloneRequest<I>, &CloneRequest<I>::handle_close_parent>(this);
+ m_parent_image_ctx->state->close(ctx);
+}
+
+template <typename I>
+void CloneRequest<I>::handle_close_parent(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_parent_image_ctx = nullptr;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to close parent image: "
+ << cpp_strerror(r) << dendl;
+ if (m_r_saved == 0) {
+ m_r_saved = r;
+ }
+ }
+
+ complete(m_r_saved);
+}
+
+template <typename I>
+void CloneRequest<I>::complete(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} //namespace image
+} //namespace librbd
+
+template class librbd::image::CloneRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/CloneRequest.h b/src/librbd/image/CloneRequest.h
new file mode 100644
index 000000000..35d9cab17
--- /dev/null
+++ b/src/librbd/image/CloneRequest.h
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/config_fwd.h"
+#include "librbd/internal.h"
+#include "include/rbd/librbd.hpp"
+
+class Context;
+
+using librados::IoCtx;
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class CloneRequest {
+public:
+ static CloneRequest *create(
+ ConfigProxy& config, IoCtx& parent_io_ctx,
+ const std::string& parent_image_id,
+ const std::string& parent_snap_name,
+ const cls::rbd::SnapshotNamespace& parent_snap_namespace,
+ uint64_t parent_snap_id,
+ IoCtx &c_ioctx, const std::string &c_name,
+ const std::string &c_id, ImageOptions c_options,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ asio::ContextWQ *op_work_queue, Context *on_finish) {
+ return new CloneRequest(config, parent_io_ctx, parent_image_id,
+ parent_snap_name, parent_snap_namespace,
+ parent_snap_id, c_ioctx, c_name, c_id, c_options,
+ mirror_image_mode, non_primary_global_image_id,
+ primary_mirror_uuid, op_work_queue, on_finish);
+ }
+
+ CloneRequest(ConfigProxy& config, IoCtx& parent_io_ctx,
+ const std::string& parent_image_id,
+ const std::string& parent_snap_name,
+ const cls::rbd::SnapshotNamespace& parent_snap_namespace,
+ uint64_t parent_snap_id,
+ IoCtx &c_ioctx, const std::string &c_name,
+ const std::string &c_id, ImageOptions c_options,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ asio::ContextWQ *op_work_queue, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN PARENT
+ * |
+ * v
+ * VALIDATE CHILD <finish>
+ * | ^
+ * v |
+ * CREATE CHILD * * * * * * * * * > CLOSE PARENT
+ * | ^
+ * v |
+ * OPEN CHILD * * * * * * * * * * > REMOVE CHILD
+ * | ^
+ * v |
+ * ATTACH PARENT * * * * * * * * > CLOSE CHILD
+ * | ^
+ * v *
+ * ATTACH CHILD * * * * * * * * * * * *
+ * | *
+ * v *
+ * COPY META DATA * * * * * * * * * * ^
+ * | *
+ * v (skip if not needed) *
+ * GET MIRROR MODE * * * * * * * * * ^
+ * | *
+ * v (skip if not needed) *
+ * SET MIRROR ENABLED * * * * * * * * *
+ * |
+ * v
+ * CLOSE CHILD
+ * |
+ * v
+ * CLOSE PARENT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ConfigProxy& m_config;
+ IoCtx &m_parent_io_ctx;
+ std::string m_parent_image_id;
+ std::string m_parent_snap_name;
+ cls::rbd::SnapshotNamespace m_parent_snap_namespace;
+ uint64_t m_parent_snap_id;
+ ImageCtxT *m_parent_image_ctx;
+
+ IoCtx &m_ioctx;
+ std::string m_name;
+ std::string m_id;
+ ImageOptions m_opts;
+ cls::rbd::ParentImageSpec m_pspec;
+ ImageCtxT *m_imctx;
+ cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ cls::rbd::MirrorImageMode m_mirror_image_mode;
+ const std::string m_non_primary_global_image_id;
+ const std::string m_primary_mirror_uuid;
+ NoOpProgressContext m_no_op;
+ asio::ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ uint64_t m_clone_format = 2;
+ bool m_use_p_features;
+ uint64_t m_features;
+ bufferlist m_out_bl;
+ uint64_t m_size;
+ int m_r_saved = 0;
+
+ void validate_options();
+
+ void open_parent();
+ void handle_open_parent(int r);
+
+ void validate_parent();
+
+ void validate_child();
+ void handle_validate_child(int r);
+
+ void create_child();
+ void handle_create_child(int r);
+
+ void open_child();
+ void handle_open_child(int r);
+
+ void attach_parent();
+ void handle_attach_parent(int r);
+
+ void attach_child();
+ void handle_attach_child(int r);
+
+ void copy_metadata();
+ void handle_copy_metadata(int r);
+
+ void get_mirror_mode();
+ void handle_get_mirror_mode(int r);
+
+ void enable_mirror();
+ void handle_enable_mirror(int r);
+
+ void close_child();
+ void handle_close_child(int r);
+
+ void remove_child();
+ void handle_remove_child(int r);
+
+ void close_parent();
+ void handle_close_parent(int r);
+
+ void complete(int r);
+};
+
+} //namespace image
+} //namespace librbd
+
+extern template class librbd::image::CloneRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H
diff --git a/src/librbd/image/CloseRequest.cc b/src/librbd/image/CloseRequest.cc
new file mode 100644
index 000000000..7293687f5
--- /dev/null
+++ b/src/librbd/image/CloseRequest.cc
@@ -0,0 +1,350 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/CloseRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ConfigWatcher.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatcher.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CloseRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+template <typename I>
+CloseRequest<I>::CloseRequest(I *image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish), m_error_result(0),
+ m_exclusive_lock(nullptr) {
+ ceph_assert(image_ctx != nullptr);
+}
+
+template <typename I>
+void CloseRequest<I>::send() {
+ if (m_image_ctx->config_watcher != nullptr) {
+ m_image_ctx->config_watcher->shut_down();
+
+ delete m_image_ctx->config_watcher;
+ m_image_ctx->config_watcher = nullptr;
+ }
+
+ send_block_image_watcher();
+}
+
+template <typename I>
+void CloseRequest<I>::send_block_image_watcher() {
+ if (m_image_ctx->image_watcher == nullptr) {
+ send_shut_down_update_watchers();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // prevent incoming requests from our peers
+ m_image_ctx->image_watcher->block_notifies(create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_block_image_watcher>(this));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_block_image_watcher(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ send_shut_down_update_watchers();
+}
+
+template <typename I>
+void CloseRequest<I>::send_shut_down_update_watchers() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->state->shut_down_update_watchers(create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_shut_down_update_watchers>(this)));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_shut_down_update_watchers(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to shut down update watchers: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ send_flush();
+}
+
+template <typename I>
+void CloseRequest<I>::send_flush() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
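+  // flush in-flight IO through the dispatcher stack before any of the lower
+  // layers are torn down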
+ std::shared_lock owner_locker{m_image_ctx->owner_lock};
+ auto ctx = create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_flush>(this);
+ auto aio_comp = io::AioCompletion::create_and_start(ctx, m_image_ctx,
+ io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ *m_image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp,
+ io::FLUSH_SOURCE_SHUTDOWN, {});
+ req->send();
+}
+
+template <typename I>
+void CloseRequest<I>::handle_flush(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to flush IO: " << cpp_strerror(r) << dendl;
+ }
+
+ send_shut_down_exclusive_lock();
+}
+
+template <typename I>
+void CloseRequest<I>::send_shut_down_exclusive_lock() {
+ {
+ std::unique_lock owner_locker{m_image_ctx->owner_lock};
+ m_exclusive_lock = m_image_ctx->exclusive_lock;
+
+    // when opened at a snapshot the object map may exist without exclusive lock
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ if (m_exclusive_lock == nullptr && m_image_ctx->object_map) {
+ m_image_ctx->object_map->put();
+ m_image_ctx->object_map = nullptr;
+ }
+ }
+
+ if (m_exclusive_lock == nullptr) {
+ send_unregister_image_watcher();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // in-flight IO will be flushed and in-flight requests will be canceled
+ // before releasing lock
+ m_exclusive_lock->shut_down(create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_shut_down_exclusive_lock>(this));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_shut_down_exclusive_lock(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ {
+ std::shared_lock owner_locker{m_image_ctx->owner_lock};
+ ceph_assert(m_image_ctx->exclusive_lock == nullptr);
+
+ // object map and journal closed during exclusive lock shutdown
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ ceph_assert(m_image_ctx->journal == nullptr);
+ ceph_assert(m_image_ctx->object_map == nullptr);
+ }
+
+ m_exclusive_lock->put();
+ m_exclusive_lock = nullptr;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to shut down exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ send_unregister_image_watcher();
+}
+
+template <typename I>
+void CloseRequest<I>::send_unregister_image_watcher() {
+ if (m_image_ctx->image_watcher == nullptr) {
+ send_flush_readahead();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->image_watcher->unregister_watch(create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_unregister_image_watcher>(this));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_unregister_image_watcher(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to unregister image watcher: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ send_flush_readahead();
+}
+
+template <typename I>
+void CloseRequest<I>::send_flush_readahead() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->readahead.wait_for_pending(create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_flush_readahead>(this)));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_flush_readahead(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ send_shut_down_image_dispatcher();
+}
+
+template <typename I>
+void CloseRequest<I>::send_shut_down_image_dispatcher() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->io_image_dispatcher->shut_down(create_context_callback<
+ CloseRequest<I>,
+ &CloseRequest<I>::handle_shut_down_image_dispatcher>(this));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_shut_down_image_dispatcher(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to shut down image dispatcher: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ send_shut_down_object_dispatcher();
+}
+
+template <typename I>
+void CloseRequest<I>::send_shut_down_object_dispatcher() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->io_object_dispatcher->shut_down(create_context_callback<
+ CloseRequest<I>,
+ &CloseRequest<I>::handle_shut_down_object_dispatcher>(this));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_shut_down_object_dispatcher(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "failed to shut down object dispatcher: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ send_flush_op_work_queue();
+}
+
+template <typename I>
+void CloseRequest<I>::send_flush_op_work_queue() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->op_work_queue->queue(create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_flush_op_work_queue>(this), 0);
+}
+
+template <typename I>
+void CloseRequest<I>::handle_flush_op_work_queue(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+ send_close_parent();
+}
+
+template <typename I>
+void CloseRequest<I>::send_close_parent() {
+ if (m_image_ctx->parent == nullptr) {
+ send_flush_image_watcher();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->parent->state->close(create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_close_parent>(this)));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_close_parent(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ m_image_ctx->parent = nullptr;
+ save_result(r);
+ if (r < 0) {
+ lderr(cct) << "error closing parent image: " << cpp_strerror(r) << dendl;
+ }
+ send_flush_image_watcher();
+}
+
+template <typename I>
+void CloseRequest<I>::send_flush_image_watcher() {
+ if (m_image_ctx->image_watcher == nullptr) {
+ finish();
+ return;
+ }
+
+ m_image_ctx->image_watcher->flush(create_context_callback<
+ CloseRequest<I>, &CloseRequest<I>::handle_flush_image_watcher>(this));
+}
+
+template <typename I>
+void CloseRequest<I>::handle_flush_image_watcher(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "error flushing image watcher: " << cpp_strerror(r) << dendl;
+ }
+ save_result(r);
+ finish();
+}
+
+template <typename I>
+void CloseRequest<I>::finish() {
+ m_image_ctx->shutdown();
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::CloseRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/CloseRequest.h b/src/librbd/image/CloseRequest.h
new file mode 100644
index 000000000..ee298aa9d
--- /dev/null
+++ b/src/librbd/image/CloseRequest.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
+
+#include "librbd/ImageCtx.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class CloseRequest {
+public:
+ static CloseRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+ return new CloseRequest(image_ctx, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * BLOCK_IMAGE_WATCHER (skip if R/O)
+ * |
+ * v
+ * SHUT_DOWN_UPDATE_WATCHERS
+ * |
+ * v
+ * FLUSH
+ * |
+ * v (skip if disabled)
+ * SHUT_DOWN_EXCLUSIVE_LOCK
+ * |
+ * v
+ * UNREGISTER_IMAGE_WATCHER (skip if R/O)
+ * |
+ * v
+ * FLUSH_READAHEAD
+ * |
+ * v
+ * SHUT_DOWN_IMAGE_DISPATCHER
+ * |
+ * v
+ * SHUT_DOWN_OBJECT_DISPATCHER
+ * |
+ * v
+ * FLUSH_OP_WORK_QUEUE
+ * |
+ * v (skip if no parent)
+ * CLOSE_PARENT
+ * |
+ * v
+ * FLUSH_IMAGE_WATCHER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ CloseRequest(ImageCtxT *image_ctx, Context *on_finish);
+
+ ImageCtxT *m_image_ctx;
+ Context *m_on_finish;
+
+ int m_error_result;
+
+ decltype(m_image_ctx->exclusive_lock) m_exclusive_lock;
+
+ void send_block_image_watcher();
+ void handle_block_image_watcher(int r);
+
+ void send_shut_down_update_watchers();
+ void handle_shut_down_update_watchers(int r);
+
+ void send_flush();
+ void handle_flush(int r);
+
+ void send_shut_down_exclusive_lock();
+ void handle_shut_down_exclusive_lock(int r);
+
+ void send_unregister_image_watcher();
+ void handle_unregister_image_watcher(int r);
+
+ void send_flush_readahead();
+ void handle_flush_readahead(int r);
+
+ void send_shut_down_image_dispatcher();
+ void handle_shut_down_image_dispatcher(int r);
+
+ void send_shut_down_object_dispatcher();
+ void handle_shut_down_object_dispatcher(int r);
+
+ void send_flush_op_work_queue();
+ void handle_flush_op_work_queue(int r);
+
+ void send_close_parent();
+ void handle_close_parent(int r);
+
+ void send_flush_image_watcher();
+ void handle_flush_image_watcher(int r);
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::CloseRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
diff --git a/src/librbd/image/CreateRequest.cc b/src/librbd/image/CreateRequest.cc
new file mode 100644
index 000000000..3fc1aa613
--- /dev/null
+++ b/src/librbd/image/CreateRequest.cc
@@ -0,0 +1,835 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/CreateRequest.h"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/ceph_context.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "osdc/Striper.h"
+#include "librbd/Features.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/image/Types.h"
+#include "librbd/image/ValidatePoolRequest.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/journal/TypeTraits.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "journal/Journaler.h"
+
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CreateRequest: " << __func__ \
+ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_rados_callback;
+using util::create_context_callback;
+
+namespace {
+
+int validate_features(CephContext *cct, uint64_t features) {
+ if (features & ~RBD_FEATURES_ALL) {
+ lderr(cct) << "librbd does not support requested features." << dendl;
+ return -ENOSYS;
+ }
+ if ((features & RBD_FEATURES_INTERNAL) != 0) {
+ lderr(cct) << "cannot use internally controlled features" << dendl;
+ return -EINVAL;
+ }
+ if ((features & RBD_FEATURE_FAST_DIFF) != 0 &&
+ (features & RBD_FEATURE_OBJECT_MAP) == 0) {
+ lderr(cct) << "cannot use fast diff without object map" << dendl;
+ return -EINVAL;
+ }
+ if ((features & RBD_FEATURE_OBJECT_MAP) != 0 &&
+ (features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+ lderr(cct) << "cannot use object map without exclusive lock" << dendl;
+ return -EINVAL;
+ }
+ if ((features & RBD_FEATURE_JOURNALING) != 0 &&
+ (features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+ lderr(cct) << "cannot use journaling without exclusive lock" << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int validate_striping(CephContext *cct, uint8_t order, uint64_t stripe_unit,
+ uint64_t stripe_count) {
+ if ((stripe_unit && !stripe_count) ||
+ (!stripe_unit && stripe_count)) {
+ lderr(cct) << "must specify both (or neither) of stripe-unit and "
+ << "stripe-count" << dendl;
+ return -EINVAL;
+ } else if (stripe_unit && ((1ull << order) % stripe_unit || stripe_unit > (1ull << order))) {
+ lderr(cct) << "stripe unit is not a factor of the object size" << dendl;
+ return -EINVAL;
+ } else if (stripe_unit != 0 && stripe_unit < 512) {
+ lderr(cct) << "stripe unit must be at least 512 bytes" << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+bool validate_layout(CephContext *cct, uint64_t size, file_layout_t &layout) {
+ if (!librbd::ObjectMap<>::is_compatible(layout, size)) {
+ lderr(cct) << "image size not compatible with object map" << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+int get_image_option(const ImageOptions &image_options, int option,
+ uint8_t *value) {
+ uint64_t large_value;
+ int r = image_options.get(option, &large_value);
+ if (r < 0) {
+ return r;
+ }
+ *value = static_cast<uint8_t>(large_value);
+ return 0;
+}
+
+} // anonymous namespace
+
+template<typename I>
+int CreateRequest<I>::validate_order(CephContext *cct, uint8_t order) {
+ if (order > 25 || order < 12) {
+ lderr(cct) << "order must be in the range [12, 25]" << dendl;
+ return -EDOM;
+ }
+ return 0;
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CreateRequest: " << this << " " \
+ << __func__ << ": "
+
+template<typename I>
+CreateRequest<I>::CreateRequest(const ConfigProxy& config, IoCtx &ioctx,
+ const std::string &image_name,
+ const std::string &image_id, uint64_t size,
+ const ImageOptions &image_options,
+ uint32_t create_flags,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ asio::ContextWQ *op_work_queue,
+ Context *on_finish)
+ : m_config(config), m_image_name(image_name), m_image_id(image_id),
+ m_size(size), m_create_flags(create_flags),
+ m_mirror_image_mode(mirror_image_mode),
+ m_non_primary_global_image_id(non_primary_global_image_id),
+ m_primary_mirror_uuid(primary_mirror_uuid),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+
+ m_io_ctx.dup(ioctx);
+ m_cct = reinterpret_cast<CephContext *>(m_io_ctx.cct());
+
+ m_id_obj = util::id_obj_name(m_image_name);
+ m_header_obj = util::header_name(m_image_id);
+ m_objmap_name = ObjectMap<>::object_map_name(m_image_id, CEPH_NOSNAP);
+ if (!non_primary_global_image_id.empty() &&
+ (m_create_flags & CREATE_FLAG_MIRROR_ENABLE_MASK) == 0) {
+ m_create_flags |= CREATE_FLAG_FORCE_MIRROR_ENABLE;
+ }
+
+ if (image_options.get(RBD_IMAGE_OPTION_FEATURES, &m_features) != 0) {
+ m_features = librbd::rbd_features_from_string(
+ m_config.get_val<std::string>("rbd_default_features"), nullptr);
+ m_negotiate_features = true;
+ }
+
+ uint64_t features_clear = 0;
+ uint64_t features_set = 0;
+ image_options.get(RBD_IMAGE_OPTION_FEATURES_CLEAR, &features_clear);
+ image_options.get(RBD_IMAGE_OPTION_FEATURES_SET, &features_set);
+
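+  // a feature present in both the set and clear masks is contradictory, so
+  // drop it from both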
+ uint64_t features_conflict = features_clear & features_set;
+ features_clear &= ~features_conflict;
+ features_set &= ~features_conflict;
+ m_features |= features_set;
+ m_features &= ~features_clear;
+
+ m_features &= ~RBD_FEATURES_IMPLICIT_ENABLE;
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) == RBD_FEATURE_OBJECT_MAP) {
+ m_features |= RBD_FEATURE_FAST_DIFF;
+ }
+
+ if (image_options.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &m_stripe_unit) != 0 ||
+ m_stripe_unit == 0) {
+ m_stripe_unit = m_config.get_val<Option::size_t>("rbd_default_stripe_unit");
+ }
+ if (image_options.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &m_stripe_count) != 0 ||
+ m_stripe_count == 0) {
+ m_stripe_count = m_config.get_val<uint64_t>("rbd_default_stripe_count");
+ }
+ if (get_image_option(image_options, RBD_IMAGE_OPTION_ORDER, &m_order) != 0 ||
+ m_order == 0) {
+ m_order = config.get_val<uint64_t>("rbd_default_order");
+ }
+ if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_ORDER,
+ &m_journal_order) != 0) {
+ m_journal_order = m_config.get_val<uint64_t>("rbd_journal_order");
+ }
+ if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH,
+ &m_journal_splay_width) != 0) {
+ m_journal_splay_width = m_config.get_val<uint64_t>(
+ "rbd_journal_splay_width");
+ }
+ if (image_options.get(RBD_IMAGE_OPTION_JOURNAL_POOL, &m_journal_pool) != 0) {
+ m_journal_pool = m_config.get_val<std::string>("rbd_journal_pool");
+ }
+ if (image_options.get(RBD_IMAGE_OPTION_DATA_POOL, &m_data_pool) != 0) {
+ m_data_pool = m_config.get_val<std::string>("rbd_default_data_pool");
+ }
+
+ m_layout.object_size = 1ull << m_order;
+ if (m_stripe_unit == 0 || m_stripe_count == 0) {
+ m_layout.stripe_unit = m_layout.object_size;
+ m_layout.stripe_count = 1;
+ } else {
+ m_layout.stripe_unit = m_stripe_unit;
+ m_layout.stripe_count = m_stripe_count;
+ }
+
+ if (!m_data_pool.empty() && m_data_pool != ioctx.get_pool_name()) {
+ m_features |= RBD_FEATURE_DATA_POOL;
+ } else {
+ m_data_pool.clear();
+ }
+
+ if ((m_stripe_unit != 0 && m_stripe_unit != (1ULL << m_order)) ||
+ (m_stripe_count != 0 && m_stripe_count != 1)) {
+ m_features |= RBD_FEATURE_STRIPINGV2;
+ }
+
+ ldout(m_cct, 10) << "name=" << m_image_name << ", "
+ << "id=" << m_image_id << ", "
+ << "size=" << m_size << ", "
+ << "features=" << m_features << ", "
+ << "order=" << (uint64_t)m_order << ", "
+ << "stripe_unit=" << m_stripe_unit << ", "
+ << "stripe_count=" << m_stripe_count << ", "
+ << "journal_order=" << (uint64_t)m_journal_order << ", "
+ << "journal_splay_width="
+ << (uint64_t)m_journal_splay_width << ", "
+ << "journal_pool=" << m_journal_pool << ", "
+ << "data_pool=" << m_data_pool << dendl;
+}
+
+template<typename I>
+void CreateRequest<I>::send() {
+ ldout(m_cct, 20) << dendl;
+
+ int r = validate_features(m_cct, m_features);
+ if (r < 0) {
+ complete(r);
+ return;
+ }
+
+ r = validate_order(m_cct, m_order);
+ if (r < 0) {
+ complete(r);
+ return;
+ }
+
+ r = validate_striping(m_cct, m_order, m_stripe_unit, m_stripe_count);
+ if (r < 0) {
+ complete(r);
+ return;
+ }
+
+ if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) &&
+ (!validate_layout(m_cct, m_size, m_layout))) {
+ complete(-EINVAL);
+ return;
+ }
+
+ validate_data_pool();
+}
+
+template <typename I>
+void CreateRequest<I>::validate_data_pool() {
+ m_data_io_ctx = m_io_ctx;
+ if ((m_features & RBD_FEATURE_DATA_POOL) != 0) {
+ librados::Rados rados(m_io_ctx);
+ int r = rados.ioctx_create(m_data_pool.c_str(), m_data_io_ctx);
+ if (r < 0) {
+ lderr(m_cct) << "data pool " << m_data_pool << " does not exist" << dendl;
+ complete(r);
+ return;
+ }
+ m_data_pool_id = m_data_io_ctx.get_id();
+ m_data_io_ctx.set_namespace(m_io_ctx.get_namespace());
+ }
+
+ if (!m_config.get_val<bool>("rbd_validate_pool")) {
+ add_image_to_directory();
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ CreateRequest<I>, &CreateRequest<I>::handle_validate_data_pool>(this);
+ auto req = ValidatePoolRequest<I>::create(m_data_io_ctx, ctx);
+ req->send();
+}
+
+template <typename I>
+void CreateRequest<I>::handle_validate_data_pool(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r == -EINVAL) {
+ lderr(m_cct) << "pool does not support RBD images" << dendl;
+ complete(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to validate pool: " << cpp_strerror(r) << dendl;
+ complete(r);
+ return;
+ }
+
+ add_image_to_directory();
+}
+
+template<typename I>
+void CreateRequest<I>::add_image_to_directory() {
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ if (!m_io_ctx.get_namespace().empty()) {
+ cls_client::dir_state_assert(&op, cls::rbd::DIRECTORY_STATE_READY);
+ }
+ cls_client::dir_add_image(&op, m_image_name, m_image_id);
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_add_image_to_directory>(this);
+ int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_add_image_to_directory(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r == -EEXIST) {
+ ldout(m_cct, 5) << "directory entry for image " << m_image_name
+ << " already exists" << dendl;
+ complete(r);
+ return;
+ } else if (!m_io_ctx.get_namespace().empty() && r == -ENOENT) {
+ ldout(m_cct, 5) << "namespace " << m_io_ctx.get_namespace()
+ << " does not exist" << dendl;
+ complete(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "error adding image to directory: " << cpp_strerror(r)
+ << dendl;
+ complete(r);
+ return;
+ }
+
+ create_id_object();
+}
+
+template<typename I>
+void CreateRequest<I>::create_id_object() {
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ op.create(true);
+ cls_client::set_id(&op, m_image_id);
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_create_id_object>(this);
+ int r = m_io_ctx.aio_operate(m_id_obj, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_create_id_object(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r == -EEXIST) {
+ ldout(m_cct, 5) << "id object for " << m_image_name << " already exists"
+ << dendl;
+ m_r_saved = r;
+ remove_from_dir();
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "error creating RBD id object: " << cpp_strerror(r)
+ << dendl;
+ m_r_saved = r;
+ remove_from_dir();
+ return;
+ }
+
+ negotiate_features();
+}
+
+template<typename I>
+void CreateRequest<I>::negotiate_features() {
+ if (!m_negotiate_features) {
+ create_image();
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
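+ // ask the cluster (via the rbd_directory object) which feature bits the
+ // rbd object class understands; handle_negotiate_features() masks off
+ // any requested defaults the OSDs cannot support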
+ librados::ObjectReadOperation op;
+ cls_client::get_all_features_start(&op);
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_negotiate_features>(this);
+
+ m_outbl.clear();
+ int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_outbl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_negotiate_features(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ uint64_t all_features;
+ if (r >= 0) {
+ auto it = m_outbl.cbegin();
+ r = cls_client::get_all_features_finish(&it, &all_features);
+ }
+ if (r < 0) {
+ ldout(m_cct, 10) << "error retrieving server supported features set: "
+ << cpp_strerror(r) << dendl;
+ } else if ((m_features & all_features) != m_features) {
+ m_features &= all_features;
+ ldout(m_cct, 10) << "limiting default features set to server supported: "
+ << m_features << dendl;
+ }
+
+ create_image();
+}
+
+template<typename I>
+void CreateRequest<I>::create_image() {
+ ldout(m_cct, 15) << dendl;
+ ceph_assert(m_data_pool.empty() || m_data_pool_id != -1);
+
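+ // image ids are only unique within their metadata pool, so when a
+ // separate data pool is in use the metadata pool id is folded into the
+ // object prefix to keep prefixes distinct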
+ ostringstream oss;
+ oss << RBD_DATA_PREFIX;
+ if (m_data_pool_id != -1) {
+ oss << stringify(m_io_ctx.get_id()) << ".";
+ }
+ oss << m_image_id;
+ if (oss.str().length() > RBD_MAX_BLOCK_NAME_PREFIX_LENGTH) {
+ lderr(m_cct) << "object prefix '" << oss.str() << "' too large" << dendl;
+ m_r_saved = -EINVAL;
+ remove_id_object();
+ return;
+ }
+
+ librados::ObjectWriteOperation op;
+ op.create(true);
+ cls_client::create_image(&op, m_size, m_order, m_features, oss.str(),
+ m_data_pool_id);
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_create_image>(this);
+ int r = m_io_ctx.aio_operate(m_header_obj, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_create_image(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
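+ // the image id (not the name) already maps to a header object; report
+ // -EBADF so callers can tell an id collision apart from the -EEXIST
+ // returned for a duplicate name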
+ if (r == -EEXIST) {
+ ldout(m_cct, 5) << "image id already in-use" << dendl;
+ complete(-EBADF);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "error writing header: " << cpp_strerror(r) << dendl;
+ m_r_saved = r;
+ remove_id_object();
+ return;
+ }
+
+ set_stripe_unit_count();
+}
+
+template<typename I>
+void CreateRequest<I>::set_stripe_unit_count() {
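+ // the header already implies the default layout (a single stripe per
+ // object), so the stripe unit/count only needs to be stored explicitly
+ // for non-default striping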
+ if ((!m_stripe_unit && !m_stripe_count) ||
+ ((m_stripe_count == 1) && (m_stripe_unit == (1ull << m_order)))) {
+ object_map_resize();
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::set_stripe_unit_count(&op, m_stripe_unit, m_stripe_count);
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_set_stripe_unit_count>(this);
+ int r = m_io_ctx.aio_operate(m_header_obj, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_set_stripe_unit_count(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error setting stripe unit/count: "
+ << cpp_strerror(r) << dendl;
+ m_r_saved = r;
+ remove_header_object();
+ return;
+ }
+
+ object_map_resize();
+}
+
+template<typename I>
+void CreateRequest<I>::object_map_resize() {
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) {
+ fetch_mirror_mode();
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
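+ // pre-size the object map to track every object implied by the initial
+ // image size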
+ librados::ObjectWriteOperation op;
+ cls_client::object_map_resize(&op, Striper::get_num_objects(m_layout, m_size),
+ OBJECT_NONEXISTENT);
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_object_map_resize>(this);
+ int r = m_io_ctx.aio_operate(m_objmap_name, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_object_map_resize(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error creating initial object map: "
+ << cpp_strerror(r) << dendl;
+
+ m_r_saved = r;
+ remove_header_object();
+ return;
+ }
+
+ fetch_mirror_mode();
+}
+
+template<typename I>
+void CreateRequest<I>::fetch_mirror_mode() {
+ if ((m_features & RBD_FEATURE_JOURNALING) == 0) {
+ mirror_image_enable();
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_mode_get_start(&op);
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_fetch_mirror_mode>(this);
+ m_outbl.clear();
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_outbl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_fetch_mirror_mode(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if ((r < 0) && (r != -ENOENT)) {
+ lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+ << dendl;
+
+ m_r_saved = r;
+ remove_object_map();
+ return;
+ }
+
+ m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ if (r == 0) {
+ auto it = m_outbl.cbegin();
+ r = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode);
+ if (r < 0) {
+ lderr(m_cct) << "Failed to retrieve mirror mode" << dendl;
+
+ m_r_saved = r;
+ remove_object_map();
+ return;
+ }
+ }
+
+ journal_create();
+}
+
+template<typename I>
+void CreateRequest<I>::journal_create() {
+ ldout(m_cct, 15) << dendl;
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_journal_create>(
+ this);
+
+ // only link to remote primary mirror uuid if in journal-based
+ // mirroring mode
+ bool use_primary_mirror_uuid = (
+ !m_non_primary_global_image_id.empty() &&
+ m_mirror_image_mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL);
+
+ librbd::journal::TagData tag_data;
+ tag_data.mirror_uuid = (use_primary_mirror_uuid ? m_primary_mirror_uuid :
+ librbd::Journal<I>::LOCAL_MIRROR_UUID);
+
+ typename journal::TypeTraits<I>::ContextWQ* context_wq;
+ Journal<>::get_work_queue(m_cct, &context_wq);
+
+ auto req = librbd::journal::CreateRequest<I>::create(
+ m_io_ctx, m_image_id, m_journal_order, m_journal_splay_width,
+ m_journal_pool, cls::journal::Tag::TAG_CLASS_NEW, tag_data,
+ librbd::Journal<I>::IMAGE_CLIENT_ID, context_wq, ctx);
+ req->send();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_journal_create(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error creating journal: " << cpp_strerror(r)
+ << dendl;
+
+ m_r_saved = r;
+ remove_object_map();
+ return;
+ }
+
+ mirror_image_enable();
+}
+
+template<typename I>
+void CreateRequest<I>::mirror_image_enable() {
+ auto mirror_enable_flag = (m_create_flags & CREATE_FLAG_MIRROR_ENABLE_MASK);
+
+ if ((m_mirror_mode != cls::rbd::MIRROR_MODE_POOL &&
+ mirror_enable_flag != CREATE_FLAG_FORCE_MIRROR_ENABLE) ||
+ (mirror_enable_flag == CREATE_FLAG_SKIP_MIRROR_ENABLE)) {
+ complete(0);
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+ auto ctx = create_context_callback<
+ CreateRequest<I>, &CreateRequest<I>::handle_mirror_image_enable>(this);
+
+ auto req = mirror::EnableRequest<I>::create(
+ m_io_ctx, m_image_id, m_mirror_image_mode,
+ m_non_primary_global_image_id, true, m_op_work_queue, ctx);
+ req->send();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_mirror_image_enable(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "cannot enable mirroring: " << cpp_strerror(r)
+ << dendl;
+
+ m_r_saved = r;
+ journal_remove();
+ return;
+ }
+
+ complete(0);
+}
+
+template<typename I>
+void CreateRequest<I>::complete(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_data_io_ctx.close();
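+ // stash the callback and delete ourself before invoking it, since the
+ // callback may tear down state that owns this request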
+ auto on_finish = m_on_finish;
+ delete this;
+ on_finish->complete(r);
+}
+
+// cleanup
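+//
+// on error the request unwinds in reverse creation order (journal, object
+// map, header object, id object, directory entry) and completes with the
+// first failure preserved in m_r_saved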
+template<typename I>
+void CreateRequest<I>::journal_remove() {
+ if ((m_features & RBD_FEATURE_JOURNALING) == 0) {
+ remove_object_map();
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_journal_remove>(
+ this);
+
+ typename journal::TypeTraits<I>::ContextWQ* context_wq;
+ Journal<>::get_work_queue(m_cct, &context_wq);
+
+ librbd::journal::RemoveRequest<I> *req =
+ librbd::journal::RemoveRequest<I>::create(
+ m_io_ctx, m_image_id, librbd::Journal<I>::IMAGE_CLIENT_ID, context_wq,
+ ctx);
+ req->send();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_journal_remove(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error cleaning up journal after creation failed: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ remove_object_map();
+}
+
+template<typename I>
+void CreateRequest<I>::remove_object_map() {
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) {
+ remove_header_object();
+ return;
+ }
+
+ ldout(m_cct, 15) << dendl;
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_remove_object_map>(this);
+ int r = m_io_ctx.aio_remove(m_objmap_name, comp);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_remove_object_map(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error cleaning up object map after creation failed: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ remove_header_object();
+}
+
+template<typename I>
+void CreateRequest<I>::remove_header_object() {
+ ldout(m_cct, 15) << dendl;
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_remove_header_object>(this);
+ int r = m_io_ctx.aio_remove(m_header_obj, comp);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_remove_header_object(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error cleaning up image header after creation failed: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ remove_id_object();
+}
+
+template<typename I>
+void CreateRequest<I>::remove_id_object() {
+ ldout(m_cct, 15) << dendl;
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_remove_id_object>(this);
+ int r = m_io_ctx.aio_remove(m_id_obj, comp);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_remove_id_object(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error cleaning up id object after creation failed: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ remove_from_dir();
+}
+
+template<typename I>
+void CreateRequest<I>::remove_from_dir() {
+ ldout(m_cct, 15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::dir_remove_image(&op, m_image_name, m_image_id);
+
+ using klass = CreateRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_remove_from_dir>(this);
+ int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template<typename I>
+void CreateRequest<I>::handle_remove_from_dir(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error cleaning up image from rbd_directory object "
+ << "after creation failed: " << cpp_strerror(r) << dendl;
+ }
+
+ complete(m_r_saved);
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::CreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/CreateRequest.h b/src/librbd/image/CreateRequest.h
new file mode 100644
index 000000000..9cb0eec7c
--- /dev/null
+++ b/src/librbd/image/CreateRequest.h
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H
+
+#include "common/config_fwd.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+
+class Context;
+
+using librados::IoCtx;
+
+namespace journal { class Journaler; }
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class CreateRequest {
+public:
+ static CreateRequest *create(const ConfigProxy& config, IoCtx &ioctx,
+ const std::string &image_name,
+ const std::string &image_id, uint64_t size,
+ const ImageOptions &image_options,
+ uint32_t create_flags,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ asio::ContextWQ *op_work_queue,
+ Context *on_finish) {
+ return new CreateRequest(config, ioctx, image_name, image_id, size,
+ image_options, create_flags,
+ mirror_image_mode, non_primary_global_image_id,
+ primary_mirror_uuid, op_work_queue, on_finish);
+ }
+
+ static int validate_order(CephContext *cct, uint8_t order);
+
+ void send();
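+
+ // A minimal usage sketch (the surrounding names are assumptions, not
+ // part of this header):
+ //
+ // auto req = CreateRequest<>::create(config, io_ctx, "image", image_id,
+ // size, opts, 0U, mirror_image_mode,
+ // "", "", op_work_queue, on_finish);
+ // req->send(); // on_finish->complete(r) fires when creation ends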
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> . . . . > . . . . .
+ * | .
+ * v .
+ * VALIDATE DATA POOL v (pool validation
+ * | . disabled)
+ * v .
+ * (error: bottom up) ADD IMAGE TO DIRECTORY < . . . .
+ * _______<_______ |
+ * | | v
+ * | | CREATE ID OBJECT
+ * | | / |
+ * | REMOVE FROM DIR <-------/ v
+ * | | NEGOTIATE FEATURES (when using default features)
+ * | | |
+ * | | v (stripingv2 disabled)
+ * | | CREATE IMAGE. . . . > . . . .
+ * v | / | .
+ * | REMOVE ID OBJ <---------/ v .
+ * | | SET STRIPE UNIT COUNT .
+ * | | / | \ . . . . . > . . . .
+ * | REMOVE HEADER OBJ<------/ v /. (object-map
+ * | |\ OBJECT MAP RESIZE . . < . . * v disabled)
+ * | | \ / | \ . . . . . > . . . .
+ * | | *<-----------/ v /. (journaling
+ * | | FETCH MIRROR MODE. . < . . * v disabled)
+ * | | / | .
+ * | REMOVE OBJECT MAP<--------/ v .
+ * | |\ JOURNAL CREATE .
+ * | | \ / | .
+ * v | *<------------/ v .
+ * | | MIRROR IMAGE ENABLE .
+ * | | / | .
+ * | JOURNAL REMOVE*<-------/ | .
+ * | v .
+ * |_____________>___________________<finish> . . . . < . . . .
+ *
+ * @endverbatim
+ */
+
+ CreateRequest(const ConfigProxy& config, IoCtx &ioctx,
+ const std::string &image_name,
+ const std::string &image_id, uint64_t size,
+ const ImageOptions &image_options,
+ uint32_t create_flags,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ asio::ContextWQ *op_work_queue, Context *on_finish);
+
+ const ConfigProxy& m_config;
+ IoCtx m_io_ctx;
+ IoCtx m_data_io_ctx;
+ std::string m_image_name;
+ std::string m_image_id;
+ uint64_t m_size;
+ uint8_t m_order = 0;
+ uint64_t m_features = 0;
+ uint64_t m_stripe_unit = 0;
+ uint64_t m_stripe_count = 0;
+ uint8_t m_journal_order = 0;
+ uint8_t m_journal_splay_width = 0;
+ std::string m_journal_pool;
+ std::string m_data_pool;
+ int64_t m_data_pool_id = -1;
+ uint32_t m_create_flags;
+ cls::rbd::MirrorImageMode m_mirror_image_mode;
+ const std::string m_non_primary_global_image_id;
+ const std::string m_primary_mirror_uuid;
+ bool m_negotiate_features = false;
+
+ asio::ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ int m_r_saved = 0; // used to return actual error after cleanup
+ file_layout_t m_layout;
+ std::string m_id_obj, m_header_obj, m_objmap_name;
+
+ bufferlist m_outbl;
+ cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ cls::rbd::MirrorImage m_mirror_image_internal;
+
+ void validate_data_pool();
+ void handle_validate_data_pool(int r);
+
+ void add_image_to_directory();
+ void handle_add_image_to_directory(int r);
+
+ void create_id_object();
+ void handle_create_id_object(int r);
+
+ void negotiate_features();
+ void handle_negotiate_features(int r);
+
+ void create_image();
+ void handle_create_image(int r);
+
+ void set_stripe_unit_count();
+ void handle_set_stripe_unit_count(int r);
+
+ void object_map_resize();
+ void handle_object_map_resize(int r);
+
+ void fetch_mirror_mode();
+ void handle_fetch_mirror_mode(int r);
+
+ void journal_create();
+ void handle_journal_create(int r);
+
+ void mirror_image_enable();
+ void handle_mirror_image_enable(int r);
+
+ void complete(int r);
+
+ // cleanup
+ void journal_remove();
+ void handle_journal_remove(int r);
+
+ void remove_object_map();
+ void handle_remove_object_map(int r);
+
+ void remove_header_object();
+ void handle_remove_header_object(int r);
+
+ void remove_id_object();
+ void handle_remove_id_object(int r);
+
+ void remove_from_dir();
+ void handle_remove_from_dir(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::CreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H
diff --git a/src/librbd/image/DetachChildRequest.cc b/src/librbd/image/DetachChildRequest.cc
new file mode 100644
index 000000000..ab39dbcd7
--- /dev/null
+++ b/src/librbd/image/DetachChildRequest.cc
@@ -0,0 +1,392 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/DetachChildRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/DisabledPolicy.h"
+#include "librbd/trash/RemoveRequest.h"
+#include <string>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::DetachChildRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+DetachChildRequest<I>::~DetachChildRequest() {
+ ceph_assert(m_parent_image_ctx == nullptr);
+}
+
+template <typename I>
+void DetachChildRequest<I>::send() {
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+
+ // use oldest snapshot or HEAD for parent spec
+ if (!m_image_ctx.snap_info.empty()) {
+ m_parent_spec = m_image_ctx.snap_info.begin()->second.parent.spec;
+ } else {
+ m_parent_spec = m_image_ctx.parent_md.spec;
+ }
+ }
+
+ if (m_parent_spec.pool_id == -1) {
+ // ignore potential race with parent disappearing
+ m_image_ctx.op_work_queue->queue(create_context_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::finish>(this), 0);
+ return;
+ } else if (!m_image_ctx.test_op_features(RBD_OPERATION_FEATURE_CLONE_CHILD)) {
+ clone_v1_remove_child();
+ return;
+ }
+
+ clone_v2_child_detach();
+}
+
+template <typename I>
+void DetachChildRequest<I>::clone_v2_child_detach() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::child_detach(&op, m_parent_spec.snap_id,
+ {m_image_ctx.md_ctx.get_id(),
+ m_image_ctx.md_ctx.get_namespace(),
+ m_image_ctx.id});
+
+ int r = util::create_ioctx(m_image_ctx.md_ctx, "parent image",
+ m_parent_spec.pool_id,
+ m_parent_spec.pool_namespace, &m_parent_io_ctx);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ finish(r);
+ return;
+ }
+
+ m_parent_header_name = util::header_name(m_parent_spec.image_id);
+
+ auto aio_comp = create_rados_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::handle_clone_v2_child_detach>(this);
+ r = m_parent_io_ctx.aio_operate(m_parent_header_name, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void DetachChildRequest<I>::handle_clone_v2_child_detach(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error detaching child from parent: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ clone_v2_get_snapshot();
+}
+
+template <typename I>
+void DetachChildRequest<I>::clone_v2_get_snapshot() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::snapshot_get_start(&op, m_parent_spec.snap_id);
+
+ m_out_bl.clear();
+ auto aio_comp = create_rados_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::handle_clone_v2_get_snapshot>(this);
+ int r = m_parent_io_ctx.aio_operate(m_parent_header_name, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void DetachChildRequest<I>::handle_clone_v2_get_snapshot(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ bool remove_snapshot = false;
+ if (r == 0) {
+ cls::rbd::SnapshotInfo snap_info;
+ auto it = m_out_bl.cbegin();
+ r = cls_client::snapshot_get_finish(&it, &snap_info);
+ if (r == 0) {
+ m_parent_snap_namespace = snap_info.snapshot_namespace;
+ m_parent_snap_name = snap_info.name;
+
+ if (cls::rbd::get_snap_namespace_type(m_parent_snap_namespace) ==
+ cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH &&
+ snap_info.child_count == 0) {
+ // snapshot is in trash w/ zero children, so remove it
+ remove_snapshot = true;
+ }
+ }
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ ldout(cct, 5) << "failed to retrieve snapshot: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ if (!remove_snapshot) {
+ finish(0);
+ return;
+ }
+
+ clone_v2_open_parent();
+}
+
+template<typename I>
+void DetachChildRequest<I>::clone_v2_open_parent() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ m_parent_image_ctx = I::create("", m_parent_spec.image_id, nullptr,
+ m_parent_io_ctx, false);
+
+ // ensure non-primary images can be modified
+ m_parent_image_ctx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+
+ auto ctx = create_context_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::handle_clone_v2_open_parent>(this);
+ m_parent_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx);
+}
+
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v2_open_parent(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 5) << "failed to open parent for read/write: "
+ << cpp_strerror(r) << dendl;
+ m_parent_image_ctx = nullptr;
+ finish(0);
+ return;
+ }
+
+ // do not attempt to open the parent journal when removing the trash
+ // snapshot, because the parent may not be promoted
+ if (m_parent_image_ctx->test_features(RBD_FEATURE_JOURNALING)) {
+ std::unique_lock image_locker{m_parent_image_ctx->image_lock};
+ m_parent_image_ctx->set_journal_policy(new journal::DisabledPolicy());
+ }
+
+ // disallow any proxied maintenance operations
+ {
+ std::shared_lock owner_locker{m_parent_image_ctx->owner_lock};
+ if (m_parent_image_ctx->exclusive_lock != nullptr) {
+ m_parent_image_ctx->exclusive_lock->block_requests(0);
+ }
+ }
+
+ clone_v2_remove_snapshot();
+}
+
+template<typename I>
+void DetachChildRequest<I>::clone_v2_remove_snapshot() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = create_context_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::handle_clone_v2_remove_snapshot>(this);
+ m_parent_image_ctx->operations->snap_remove(m_parent_snap_namespace,
+ m_parent_snap_name, ctx);
+}
+
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v2_remove_snapshot(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ ldout(cct, 5) << "failed to remove trashed clone snapshot: "
+ << cpp_strerror(r) << dendl;
+ clone_v2_close_parent();
+ return;
+ }
+
+ if (m_parent_image_ctx->snaps.empty()) {
+ clone_v2_get_parent_trash_entry();
+ } else {
+ clone_v2_close_parent();
+ }
+}
+
+template<typename I>
+void DetachChildRequest<I>::clone_v2_get_parent_trash_entry() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::trash_get_start(&op, m_parent_image_ctx->id);
+
+ m_out_bl.clear();
+ auto aio_comp = create_rados_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::handle_clone_v2_get_parent_trash_entry>(this);
+ int r = m_parent_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v2_get_parent_trash_entry(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ ldout(cct, 5) << "failed to get parent trash entry: " << cpp_strerror(r)
+ << dendl;
+ clone_v2_close_parent();
+ return;
+ }
+
+ bool in_trash = false;
+
+ if (r == 0) {
+ cls::rbd::TrashImageSpec trash_spec;
+ auto it = m_out_bl.cbegin();
+ r = cls_client::trash_get_finish(&it, &trash_spec);
+
+ if (r == 0 &&
+ trash_spec.source == cls::rbd::TRASH_IMAGE_SOURCE_USER_PARENT &&
+ trash_spec.state == cls::rbd::TRASH_IMAGE_STATE_NORMAL &&
+ trash_spec.deferment_end_time <= ceph_clock_now()) {
+ in_trash = true;
+ }
+ }
+
+ if (in_trash) {
+ clone_v2_remove_parent_from_trash();
+ } else {
+ clone_v2_close_parent();
+ }
+}
+
+template<typename I>
+void DetachChildRequest<I>::clone_v2_remove_parent_from_trash() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = create_context_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::handle_clone_v2_remove_parent_from_trash>(this);
+ auto req = librbd::trash::RemoveRequest<I>::create(
+ m_parent_io_ctx, m_parent_image_ctx, m_image_ctx.op_work_queue, false,
+ m_no_op, ctx);
+ req->send();
+}
+
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v2_remove_parent_from_trash(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 5) << "failed to remove parent image:" << cpp_strerror(r)
+ << dendl;
+ }
+
+ m_parent_image_ctx = nullptr;
+ finish(0);
+}
+
+template<typename I>
+void DetachChildRequest<I>::clone_v2_close_parent() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = create_context_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::handle_clone_v2_close_parent>(this);
+ m_parent_image_ctx->state->close(ctx);
+}
+
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v2_close_parent(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 5) << "failed to close parent image:" << cpp_strerror(r)
+ << dendl;
+ }
+
+ m_parent_image_ctx = nullptr;
+ finish(0);
+}
+
+template<typename I>
+void DetachChildRequest<I>::clone_v1_remove_child() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
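+ // v1 clones are tracked in the legacy rbd_children object, which
+ // predates pool namespaces, so the namespace is cleared from the parent
+ // spec before building the op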
+ m_parent_spec.pool_namespace = "";
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id);
+
+ auto aio_comp = create_rados_callback<
+ DetachChildRequest<I>,
+ &DetachChildRequest<I>::handle_clone_v1_remove_child>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template<typename I>
+void DetachChildRequest<I>::handle_clone_v1_remove_child(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ r = 0;
+ } else if (r < 0) {
+ lderr(cct) << "failed to remove child from children list: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void DetachChildRequest<I>::finish(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::DetachChildRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/DetachChildRequest.h b/src/librbd/image/DetachChildRequest.h
new file mode 100644
index 000000000..646b7ec62
--- /dev/null
+++ b/src/librbd/image/DetachChildRequest.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+#include "librbd/internal.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class DetachChildRequest {
+public:
+ static DetachChildRequest* create(ImageCtxT& image_ctx, Context* on_finish) {
+ return new DetachChildRequest(image_ctx, on_finish);
+ }
+
+ DetachChildRequest(ImageCtxT& image_ctx, Context* on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+ }
+ ~DetachChildRequest();
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * (v1) | (v2)
+ * /--------------/ \--------------\
+ * | |
+ * v v
+ * REMOVE_CHILD CHILD_DETACH
+ * | |
+ * | v
+ * | GET_SNAPSHOT
+ * | (snapshot in-use) . |
+ * |/. . . . . . . . . . . . . . . |
+ * | v
+ * | OPEN_PARENT
+ * | |
+ * | v (has more children)
+ * | REMOVE_SNAPSHOT ---------------\
+ * | | |
+ * | v (noent) |
+ * | (auto-delete when GET_PARENT_TRASH_ENTRY . . . .\|
+ * | last child detached) | |
+ * | v v
+ * | REMOVE_PARENT_FROM_TRASH CLOSE_PARENT
+ * | | |
+ * |/------------------------------/--------------------------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT& m_image_ctx;
+ Context* m_on_finish;
+
+ librados::IoCtx m_parent_io_ctx;
+ cls::rbd::ParentImageSpec m_parent_spec;
+ std::string m_parent_header_name;
+
+ cls::rbd::SnapshotNamespace m_parent_snap_namespace;
+ std::string m_parent_snap_name;
+
+ ImageCtxT* m_parent_image_ctx = nullptr;
+
+ ceph::bufferlist m_out_bl;
+ NoOpProgressContext m_no_op;
+
+ void clone_v2_child_detach();
+ void handle_clone_v2_child_detach(int r);
+
+ void clone_v2_get_snapshot();
+ void handle_clone_v2_get_snapshot(int r);
+
+ void clone_v2_open_parent();
+ void handle_clone_v2_open_parent(int r);
+
+ void clone_v2_remove_snapshot();
+ void handle_clone_v2_remove_snapshot(int r);
+
+ void clone_v2_get_parent_trash_entry();
+ void handle_clone_v2_get_parent_trash_entry(int r);
+
+ void clone_v2_remove_parent_from_trash();
+ void handle_clone_v2_remove_parent_from_trash(int r);
+
+ void clone_v2_close_parent();
+ void handle_clone_v2_close_parent(int r);
+
+ void clone_v1_remove_child();
+ void handle_clone_v1_remove_child(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::DetachChildRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H
diff --git a/src/librbd/image/DetachParentRequest.cc b/src/librbd/image/DetachParentRequest.cc
new file mode 100644
index 000000000..74b1b0f67
--- /dev/null
+++ b/src/librbd/image/DetachParentRequest.cc
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/DetachParentRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::DetachParentRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+void DetachParentRequest<I>::send() {
+ detach_parent();
+}
+
+template <typename I>
+void DetachParentRequest<I>::detach_parent() {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
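+ // prefer the newer parent_detach class method; if the OSDs reject it
+ // with -EOPNOTSUPP, handle_detach_parent() retries with the legacy
+ // remove_parent call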
+ librados::ObjectWriteOperation op;
+ if (!m_legacy_parent) {
+ librbd::cls_client::parent_detach(&op);
+ } else {
+ librbd::cls_client::remove_parent(&op);
+ }
+
+ auto aio_comp = create_rados_callback<
+ DetachParentRequest<I>,
+ &DetachParentRequest<I>::handle_detach_parent>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void DetachParentRequest<I>::handle_detach_parent(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (!m_legacy_parent && r == -EOPNOTSUPP) {
+ ldout(cct, 10) << "retrying using legacy parent method" << dendl;
+ m_legacy_parent = true;
+ detach_parent();
+ return;
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "detach parent encountered an error: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void DetachParentRequest<I>::finish(int r) {
+ auto cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::DetachParentRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/DetachParentRequest.h b/src/librbd/image/DetachParentRequest.h
new file mode 100644
index 000000000..17c86aaac
--- /dev/null
+++ b/src/librbd/image/DetachParentRequest.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class DetachParentRequest {
+public:
+ static DetachParentRequest* create(ImageCtxT& image_ctx, Context* on_finish) {
+ return new DetachParentRequest(image_ctx, on_finish);
+ }
+
+ DetachParentRequest(ImageCtxT& image_ctx, Context* on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | * * * * * *
+ * | * * -EOPNOTSUPP
+ * v v *
+ * DETACH_PARENT * * *
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT& m_image_ctx;
+ Context* m_on_finish;
+
+ bool m_legacy_parent = false;
+
+ void detach_parent();
+ void handle_detach_parent(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::DetachParentRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H
diff --git a/src/librbd/image/GetMetadataRequest.cc b/src/librbd/image/GetMetadataRequest.cc
new file mode 100644
index 000000000..1410c9005
--- /dev/null
+++ b/src/librbd/image/GetMetadataRequest.cc
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/GetMetadataRequest.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::GetMetadataRequest: " \
+ << this << " " << __func__ << ": "
+
+#define MAX_KEYS 64U
+
+namespace librbd {
+namespace image {
+namespace {
+
+static const std::string INTERNAL_KEY_PREFIX{".rbd"};
+
+} // anonymous namespace
+
+using util::create_rados_callback;
+
+template <typename I>
+GetMetadataRequest<I>::GetMetadataRequest(
+ IoCtx &io_ctx, const std::string &oid, bool filter_internal,
+ const std::string& filter_key_prefix, const std::string& last_key,
+ uint32_t max_results, KeyValues* key_values, Context *on_finish)
+ : m_io_ctx(io_ctx), m_oid(oid), m_filter_internal(filter_internal),
+ m_filter_key_prefix(filter_key_prefix), m_last_key(last_key),
+ m_max_results(max_results), m_key_values(key_values),
+ m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext*>(m_io_ctx.cct())) {
+}
+
+template <typename I>
+void GetMetadataRequest<I>::send() {
+ metadata_list();
+}
+
+template <typename I>
+void GetMetadataRequest<I>::metadata_list() {
+ ldout(m_cct, 15) << "start_key=" << m_last_key << dendl;
+
+ m_expected_results = MAX_KEYS;
+ if (m_max_results > 0) {
+ m_expected_results = std::min<uint32_t>(
+ m_expected_results, m_max_results - m_key_values->size());
+ }
+
+ librados::ObjectReadOperation op;
+ cls_client::metadata_list_start(&op, m_last_key, m_expected_results);
+
+ auto aio_comp = create_rados_callback<
+ GetMetadataRequest<I>, &GetMetadataRequest<I>::handle_metadata_list>(this);
+ m_out_bl.clear();
+ m_io_ctx.aio_operate(m_oid, aio_comp, &op, &m_out_bl);
+ aio_comp->release();
+}
+
+template <typename I>
+void GetMetadataRequest<I>::handle_metadata_list(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ KeyValues metadata;
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = cls_client::metadata_list_finish(&it, &metadata);
+ }
+
+ if (r == -ENOENT || r == -EOPNOTSUPP) {
+ finish(0);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to retrieve image metadata: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ for (auto it = metadata.begin(); it != metadata.end(); ++it) {
+ if (m_filter_internal &&
+ boost::starts_with(it->first, INTERNAL_KEY_PREFIX)) {
+ continue;
+ } else if (!m_filter_key_prefix.empty() &&
+ !boost::starts_with(it->first, m_filter_key_prefix)) {
+ continue;
+ }
+ m_key_values->insert({it->first, std::move(it->second)});
+ }
+ if (!metadata.empty()) {
+ m_last_key = metadata.rbegin()->first;
+ }
+
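+ // a full page suggests more keys may remain: keep paging from the last
+ // returned key until a short page arrives or the caller's cap is reached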
+ if (metadata.size() == m_expected_results &&
+ (m_max_results == 0 || m_key_values->size() < m_max_results)) {
+ metadata_list();
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void GetMetadataRequest<I>::finish(int r) {
+ ldout(m_cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::GetMetadataRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/GetMetadataRequest.h b/src/librbd/image/GetMetadataRequest.h
new file mode 100644
index 000000000..08fc2de71
--- /dev/null
+++ b/src/librbd/image/GetMetadataRequest.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_GET_METADATA_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_GET_METADATA_REQUEST_H
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include <string>
+#include <map>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class GetMetadataRequest {
+public:
+ typedef std::map<std::string, bufferlist> KeyValues;
+
+ static GetMetadataRequest* create(
+ IoCtx &io_ctx, const std::string &oid, bool filter_internal,
+ const std::string& filter_key_prefix, const std::string& last_key,
+ uint32_t max_results, KeyValues* key_values, Context *on_finish) {
+ return new GetMetadataRequest(io_ctx, oid, filter_internal,
+ filter_key_prefix, last_key, max_results,
+ key_values, on_finish);
+ }
+
+ GetMetadataRequest(
+ IoCtx &io_ctx, const std::string &oid, bool filter_internal,
+ const std::string& filter_key_prefix, const std::string& last_key,
+ uint32_t max_results, KeyValues* key_values, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | /-------\
+ * | | |
+ * v v |
+ * METADATA_LIST ---/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ librados::IoCtx m_io_ctx;
+ std::string m_oid;
+ bool m_filter_internal;
+ std::string m_filter_key_prefix;
+ std::string m_last_key;
+ uint32_t m_max_results;
+ KeyValues* m_key_values;
+ Context* m_on_finish;
+
+ CephContext* m_cct;
+ bufferlist m_out_bl;
+ uint32_t m_expected_results = 0;
+
+ void metadata_list();
+ void handle_metadata_list(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::GetMetadataRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_GET_METADATA_REQUEST_H
diff --git a/src/librbd/image/ListWatchersRequest.cc b/src/librbd/image/ListWatchersRequest.cc
new file mode 100644
index 000000000..7ccbd136f
--- /dev/null
+++ b/src/librbd/image/ListWatchersRequest.cc
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ListWatchersRequest.h"
+#include "common/RWLock.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Utils.h"
+
+#include <algorithm>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::ListWatchersRequest: " << this \
+ << " " << __func__ << ": "
+
+static std::ostream& operator<<(std::ostream& os, const obj_watch_t& watch) {
+ os << "{addr=" << watch.addr << ", "
+ << "watcher_id=" << watch.watcher_id << ", "
+ << "cookie=" << watch.cookie << "}";
+ return os;
+}
+
+namespace librbd {
+namespace image {
+
+using librados::IoCtx;
+using util::create_rados_callback;
+
+template<typename I>
+ListWatchersRequest<I>::ListWatchersRequest(I &image_ctx, int flags,
+ std::list<obj_watch_t> *watchers,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_flags(flags), m_watchers(watchers),
+ m_on_finish(on_finish), m_cct(m_image_ctx.cct) {
+ ceph_assert((m_flags & LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES) == 0 ||
+ (m_flags & LIST_WATCHERS_MIRROR_INSTANCES_ONLY) == 0);
+}
+
+template<typename I>
+void ListWatchersRequest<I>::send() {
+ ldout(m_cct, 20) << dendl;
+
+ list_image_watchers();
+}
+
+template<typename I>
+void ListWatchersRequest<I>::list_image_watchers() {
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ op.list_watchers(&m_object_watchers, &m_ret_val);
+
+ using klass = ListWatchersRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_list_image_watchers>(this);
+
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
+ rados_completion, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void ListWatchersRequest<I>::handle_list_image_watchers(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r == 0 && m_ret_val < 0) {
+ r = m_ret_val;
+ }
+ if (r < 0) {
+ lderr(m_cct) << "error listing image watchers: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ ldout(m_cct, 20) << "object_watchers=" << m_object_watchers << dendl;
+ list_mirror_watchers();
+}
+
+template<typename I>
+void ListWatchersRequest<I>::list_mirror_watchers() {
+ if ((m_object_watchers.empty()) ||
+ (m_flags & (LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES |
+ LIST_WATCHERS_MIRROR_INSTANCES_ONLY)) == 0) {
+ finish(0);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ op.list_watchers(&m_mirror_watchers, &m_ret_val);
+
+ using klass = ListWatchersRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_list_mirror_watchers>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, rados_completion,
+ &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void ListWatchersRequest<I>::handle_list_mirror_watchers(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r == 0 && m_ret_val < 0) {
+ r = m_ret_val;
+ }
+ if (r < 0 && r != -ENOENT) {
+ ldout(m_cct, 1) << "error listing mirror watchers: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ ldout(m_cct, 20) << "mirror_watchers=" << m_mirror_watchers << dendl;
+ finish(0);
+}
+
+template<typename I>
+void ListWatchersRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r == 0) {
+ m_watchers->clear();
+
+ if (!m_object_watchers.empty()) {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ uint64_t watch_handle = m_image_ctx.image_watcher != nullptr ?
+ m_image_ctx.image_watcher->get_watch_handle() : 0;
+
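+ // compare each watcher's address against the rbd_mirroring watchers to
+ // decide whether it is a mirror daemon instance or a regular client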
+ for (auto &w : m_object_watchers) {
+ if ((m_flags & LIST_WATCHERS_FILTER_OUT_MY_INSTANCE) != 0) {
+ if (w.cookie == watch_handle) {
+ ldout(m_cct, 20) << "filtering out my instance: " << w << dendl;
+ continue;
+ }
+ }
+ auto it = std::find_if(m_mirror_watchers.begin(),
+ m_mirror_watchers.end(),
+ [w] (obj_watch_t &watcher) {
+ return (strncmp(w.addr, watcher.addr,
+ sizeof(w.addr)) == 0);
+ });
+ if ((m_flags & LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES) != 0) {
+ if (it != m_mirror_watchers.end()) {
+ ldout(m_cct, 20) << "filtering out mirror instance: " << w << dendl;
+ continue;
+ }
+ } else if ((m_flags & LIST_WATCHERS_MIRROR_INSTANCES_ONLY) != 0) {
+ if (it == m_mirror_watchers.end()) {
+ ldout(m_cct, 20) << "filtering out non-mirror instance: " << w
+ << dendl;
+ continue;
+ }
+ }
+ m_watchers->push_back(w);
+ }
+ }
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::ListWatchersRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/ListWatchersRequest.h b/src/librbd/image/ListWatchersRequest.h
new file mode 100644
index 000000000..2c77254a7
--- /dev/null
+++ b/src/librbd/image/ListWatchersRequest.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H
+
+#include "include/rados/rados_types.hpp"
+
+#include <list>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+enum {
+ LIST_WATCHERS_FILTER_OUT_MY_INSTANCE = 1 << 0,
+ LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES = 1 << 1,
+ LIST_WATCHERS_MIRROR_INSTANCES_ONLY = 1 << 3,
+};
+
+template<typename ImageCtxT = ImageCtx>
+class ListWatchersRequest {
+public:
+ static ListWatchersRequest *create(ImageCtxT &image_ctx, int flags,
+ std::list<obj_watch_t> *watchers,
+ Context *on_finish) {
+ return new ListWatchersRequest(image_ctx, flags, watchers, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * LIST_IMAGE_WATCHERS
+ * |
+ * v
+ * LIST_MIRROR_WATCHERS (skip if not needed)
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ListWatchersRequest(ImageCtxT &image_ctx, int flags, std::list<obj_watch_t> *watchers,
+ Context *on_finish);
+
+ ImageCtxT& m_image_ctx;
+ int m_flags;
+ std::list<obj_watch_t> *m_watchers;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ int m_ret_val;
+ bufferlist m_out_bl;
+ std::list<obj_watch_t> m_object_watchers;
+ std::list<obj_watch_t> m_mirror_watchers;
+
+ void list_image_watchers();
+ void handle_list_image_watchers(int r);
+
+ void list_mirror_watchers();
+ void handle_list_mirror_watchers(int r);
+
+ void finish(int r);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::ListWatchersRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H
diff --git a/src/librbd/image/OpenRequest.cc b/src/librbd/image/OpenRequest.cc
new file mode 100644
index 000000000..70008d712
--- /dev/null
+++ b/src/librbd/image/OpenRequest.cc
@@ -0,0 +1,727 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/OpenRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ConfigWatcher.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/PluginRegistry.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/ObjectCacherObjectDispatch.h"
+#include "librbd/cache/WriteAroundObjectDispatch.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/image/SetSnapRequest.h"
+#include "librbd/io/SimpleSchedulerObjectDispatch.h"
+#include <boost/algorithm/string/predicate.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::OpenRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+OpenRequest<I>::OpenRequest(I *image_ctx, uint64_t flags,
+ Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_skip_open_parent_image(flags & OPEN_FLAG_SKIP_OPEN_PARENT),
+ m_on_finish(on_finish), m_error_result(0) {
+ if ((flags & OPEN_FLAG_OLD_FORMAT) != 0) {
+ m_image_ctx->old_format = true;
+ }
+ if ((flags & OPEN_FLAG_IGNORE_MIGRATING) != 0) {
+ m_image_ctx->ignore_migrating = true;
+ }
+}
+
+template <typename I>
+void OpenRequest<I>::send() {
+ if (m_image_ctx->old_format) {
+ send_v1_detect_header();
+ } else {
+ send_v2_detect_header();
+ }
+}
+
+template <typename I>
+void OpenRequest<I>::send_v1_detect_header() {
+ librados::ObjectReadOperation op;
+ op.stat(NULL, NULL, NULL);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_v1_detect_header>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(util::old_header_name(m_image_ctx->name),
+ comp, &op, &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v1_detect_header(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ if (*result != -ENOENT) {
+ lderr(cct) << "failed to stat image header: " << cpp_strerror(*result)
+ << dendl;
+ }
+ send_close_image(*result);
+ } else {
+ ldout(cct, 1) << "RBD image format 1 is deprecated. "
+ << "Please copy this image to image format 2." << dendl;
+
+ m_image_ctx->old_format = true;
+ m_image_ctx->header_oid = util::old_header_name(m_image_ctx->name);
+ m_image_ctx->apply_metadata({}, true);
+
+ send_refresh();
+ }
+ return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_detect_header() {
+ if (m_image_ctx->id.empty()) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ op.stat(NULL, NULL, NULL);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_v2_detect_header>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name),
+ comp, &op, &m_out_bl);
+ comp->release();
+ } else {
+ send_v2_get_name();
+ }
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_detect_header(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
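+ // a missing v2 id object may simply mean this is a format 1 image, so
+ // fall back to probing for the legacy <name>.rbd header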
+ if (*result == -ENOENT) {
+ send_v1_detect_header();
+ } else if (*result < 0) {
+ lderr(cct) << "failed to stat v2 image header: " << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ } else {
+ m_image_ctx->old_format = false;
+ send_v2_get_id();
+ }
+ return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_id() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::get_id_start(&op);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_v2_get_id>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name),
+ comp, &op, &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_id(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::get_id_finish(&it, &m_image_ctx->id);
+ }
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve image id: " << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ } else {
+ send_v2_get_initial_metadata();
+ }
+ return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_name() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::dir_get_name_start(&op, m_image_ctx->id);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_name>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_name(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::dir_get_name_finish(&it, &m_image_ctx->name);
+ }
+ if (*result < 0 && *result != -ENOENT) {
+ lderr(cct) << "failed to retrieve name: "
+ << cpp_strerror(*result) << dendl;
+ send_close_image(*result);
+ } else if (*result == -ENOENT) {
+ // image does not exist in directory, look in the trash bin
+ ldout(cct, 10) << "image id " << m_image_ctx->id << " does not exist in "
+ << "rbd directory, searching in rbd trash..." << dendl;
+ send_v2_get_name_from_trash();
+ } else {
+ send_v2_get_initial_metadata();
+ }
+ return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_name_from_trash() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::trash_get_start(&op, m_image_ctx->id);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_name_from_trash>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(RBD_TRASH, comp, &op, &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_name_from_trash(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ cls::rbd::TrashImageSpec trash_spec;
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::trash_get_finish(&it, &trash_spec);
+ m_image_ctx->name = trash_spec.name;
+ }
+ if (*result < 0) {
+ if (*result == -EOPNOTSUPP) {
+ *result = -ENOENT;
+ }
+ if (*result == -ENOENT) {
+ ldout(cct, 5) << "failed to retrieve name for image id "
+ << m_image_ctx->id << dendl;
+ } else {
+ lderr(cct) << "failed to retrieve name from trash: "
+ << cpp_strerror(*result) << dendl;
+ }
+ send_close_image(*result);
+ } else {
+ send_v2_get_initial_metadata();
+ }
+
+ return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_initial_metadata() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->old_format = false;
+ m_image_ctx->header_oid = util::header_name(m_image_ctx->id);
+
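+ // batch the size, object prefix, and feature queries into a single read
+ // so the basic image shape comes back in one round trip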
+ librados::ObjectReadOperation op;
+ cls_client::get_size_start(&op, CEPH_NOSNAP);
+ cls_client::get_object_prefix_start(&op);
+ cls_client::get_features_start(&op, true);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_initial_metadata>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+ &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_initial_metadata(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ auto it = m_out_bl.cbegin();
+ if (*result >= 0) {
+ uint64_t size;
+ *result = cls_client::get_size_finish(&it, &size, &m_image_ctx->order);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_object_prefix_finish(&it,
+ &m_image_ctx->object_prefix);
+ }
+
+ if (*result >= 0) {
+ uint64_t incompatible_features;
+ *result = cls_client::get_features_finish(&it, &m_image_ctx->features,
+ &incompatible_features);
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve initial metadata: "
+ << cpp_strerror(*result) << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ if (m_image_ctx->test_features(RBD_FEATURE_STRIPINGV2)) {
+ send_v2_get_stripe_unit_count();
+ } else {
+ send_v2_get_create_timestamp();
+ }
+
+ return nullptr;
+}
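+
+// Batching note: a single ObjectReadOperation may carry several class
+// method calls (size, object prefix and features above); the OSD
+// executes them in order and the replies are decoded from one shared
+// bufferlist, which is why the handler reuses a single iterator.
+// Illustrative sketch (io_ctx and header_oid are assumed locals,
+// error handling elided):
+//
+//   librados::ObjectReadOperation op;
+//   cls_client::get_size_start(&op, CEPH_NOSNAP);
+//   cls_client::get_object_prefix_start(&op);
+//
+//   bufferlist out_bl;
+//   io_ctx.operate(header_oid, &op, &out_bl);
+//
+//   auto it = out_bl.cbegin();
+//   uint64_t size;
+//   uint8_t order;
+//   std::string prefix;
+//   cls_client::get_size_finish(&it, &size, &order);
+//   cls_client::get_object_prefix_finish(&it, &prefix);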
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_stripe_unit_count() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::get_stripe_unit_count_start(&op);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_stripe_unit_count>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+ &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_stripe_unit_count(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::get_stripe_unit_count_finish(
+ &it, &m_image_ctx->stripe_unit, &m_image_ctx->stripe_count);
+ }
+
+ if (*result == -ENOEXEC || *result == -EINVAL) {
+ *result = 0;
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to read striping metadata: " << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ send_v2_get_create_timestamp();
+ return nullptr;
+}
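+
+// Note: -ENOEXEC above means the OSD predates the striping class
+// method and -EINVAL means fancy striping was never configured for
+// this image; both cases fall back to default striping (stripe unit
+// equal to the object size, stripe count of 1) instead of failing
+// the open.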
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_create_timestamp() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::get_create_timestamp_start(&op);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_create_timestamp>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+ &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_create_timestamp(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::get_create_timestamp_finish(&it,
+ &m_image_ctx->create_timestamp);
+ }
+ if (*result < 0 && *result != -EOPNOTSUPP) {
+ lderr(cct) << "failed to retrieve create_timestamp: "
+ << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ send_v2_get_access_modify_timestamp();
+ return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_access_modify_timestamp() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::get_access_timestamp_start(&op);
+ cls_client::get_modify_timestamp_start(&op);
+ // TODO: merge with the create timestamp query once Luminous is EOLed
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_access_modify_timestamp>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+ &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_access_modify_timestamp(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::get_access_timestamp_finish(&it,
+ &m_image_ctx->access_timestamp);
+ if (*result == 0)
+ *result = cls_client::get_modify_timestamp_finish(&it,
+ &m_image_ctx->modify_timestamp);
+ }
+ if (*result < 0 && *result != -EOPNOTSUPP) {
+ lderr(cct) << "failed to retrieve access/modify_timestamp: "
+ << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ send_v2_get_data_pool();
+ return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_v2_get_data_pool() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::get_data_pool_start(&op);
+
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_data_pool>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+ &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_data_pool(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ int64_t data_pool_id = -1;
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::get_data_pool_finish(&it, &data_pool_id);
+ } else if (*result == -EOPNOTSUPP) {
+ *result = 0;
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to read data pool: " << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ if (data_pool_id != -1) {
+ *result = util::create_ioctx(m_image_ctx->md_ctx, "data pool", data_pool_id,
+ {}, &m_image_ctx->data_ctx);
+ if (*result < 0) {
+ if (*result != -ENOENT) {
+ send_close_image(*result);
+ return nullptr;
+ }
+ m_image_ctx->data_ctx.close();
+ } else {
+ m_image_ctx->rebuild_data_io_context();
+ }
+ } else {
+ data_pool_id = m_image_ctx->md_ctx.get_id();
+ }
+
+ m_image_ctx->init_layout(data_pool_id);
+ send_refresh();
+ return nullptr;
+}
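+
+// Data pool note: an image may keep its data objects in a pool other
+// than the metadata pool (typically an erasure-coded pool), in which
+// case a second IoCtx is opened above. A hedged sketch of creating
+// such an image through the public API ("ecpool" is an illustrative
+// pool name):
+//
+//   librbd::RBD rbd;
+//   librbd::ImageOptions opts;
+//   opts.set(RBD_IMAGE_OPTION_DATA_POOL, "ecpool");
+//   rbd.create4(io_ctx, "image", 1ULL << 30, opts);  // 1 GiB image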
+
+template <typename I>
+void OpenRequest<I>::send_refresh() {
+ m_image_ctx->init();
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_image_ctx->config_watcher = ConfigWatcher<I>::create(*m_image_ctx);
+ m_image_ctx->config_watcher->init();
+
+ using klass = OpenRequest<I>;
+ RefreshRequest<I> *req = RefreshRequest<I>::create(
+ *m_image_ctx, false, m_skip_open_parent_image,
+ create_context_callback<klass, &klass::handle_refresh>(this));
+ req->send();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_refresh(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ send_init_plugin_registry();
+ return nullptr;
+}
+
+template <typename I>
+void OpenRequest<I>::send_init_plugin_registry() {
+ CephContext *cct = m_image_ctx->cct;
+
+ auto plugins = m_image_ctx->config.template get_val<std::string>(
+ "rbd_plugins");
+ ldout(cct, 10) << __func__ << ": plugins=" << plugins << dendl;
+
+ auto ctx = create_context_callback<
+ OpenRequest<I>, &OpenRequest<I>::handle_init_plugin_registry>(this);
+ m_image_ctx->plugin_registry->init(plugins, ctx);
+}
+
+template <typename I>
+Context* OpenRequest<I>::handle_init_plugin_registry(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to initialize plugin registry: "
+ << cpp_strerror(*result) << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ return send_init_cache(result);
+}
+
+template <typename I>
+Context *OpenRequest<I>::send_init_cache(int *result) {
+ if (!m_image_ctx->cache || m_image_ctx->child != nullptr ||
+ !m_image_ctx->data_ctx.is_valid()) {
+ return send_register_watch(result);
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ size_t max_dirty = m_image_ctx->config.template get_val<Option::size_t>(
+ "rbd_cache_max_dirty");
+ auto writethrough_until_flush = m_image_ctx->config.template get_val<bool>(
+ "rbd_cache_writethrough_until_flush");
+ auto cache_policy = m_image_ctx->config.template get_val<std::string>(
+ "rbd_cache_policy");
+ if (cache_policy == "writearound") {
+ auto cache = cache::WriteAroundObjectDispatch<I>::create(
+ m_image_ctx, max_dirty, writethrough_until_flush);
+ cache->init();
+
+ m_image_ctx->readahead.set_max_readahead_size(0);
+ } else if (cache_policy == "writethrough" || cache_policy == "writeback") {
+ if (cache_policy == "writethrough") {
+ max_dirty = 0;
+ }
+
+ auto cache = cache::ObjectCacherObjectDispatch<I>::create(
+ m_image_ctx, max_dirty, writethrough_until_flush);
+ cache->init();
+
+ // readahead requires the object cacher cache
+ m_image_ctx->readahead.set_trigger_requests(
+ m_image_ctx->config.template get_val<uint64_t>("rbd_readahead_trigger_requests"));
+ m_image_ctx->readahead.set_max_readahead_size(
+ m_image_ctx->config.template get_val<Option::size_t>("rbd_readahead_max_bytes"));
+ }
+ return send_register_watch(result);
+}
+
+template <typename I>
+Context *OpenRequest<I>::send_register_watch(int *result) {
+ if ((m_image_ctx->read_only_flags & IMAGE_READ_ONLY_FLAG_USER) != 0U) {
+ return send_set_snap(result);
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ using klass = OpenRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_register_watch>(this);
+ m_image_ctx->register_watch(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_register_watch(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == -EPERM) {
+ ldout(cct, 5) << "user does not have write permission" << dendl;
+ send_close_image(*result);
+ return nullptr;
+ } else if (*result < 0) {
+ lderr(cct) << "failed to register watch: " << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ return send_set_snap(result);
+}
+
+template <typename I>
+Context *OpenRequest<I>::send_set_snap(int *result) {
+ if (m_image_ctx->snap_name.empty() &&
+ m_image_ctx->open_snap_id == CEPH_NOSNAP) {
+ *result = 0;
+ return finalize(*result);
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ uint64_t snap_id = CEPH_NOSNAP;
+ std::swap(m_image_ctx->open_snap_id, snap_id);
+ if (snap_id == CEPH_NOSNAP) {
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ snap_id = m_image_ctx->get_snap_id(m_image_ctx->snap_namespace,
+ m_image_ctx->snap_name);
+ }
+ if (snap_id == CEPH_NOSNAP) {
+ lderr(cct) << "failed to find snapshot " << m_image_ctx->snap_name << dendl;
+ send_close_image(-ENOENT);
+ return nullptr;
+ }
+
+ using klass = OpenRequest<I>;
+ SetSnapRequest<I> *req = SetSnapRequest<I>::create(
+ *m_image_ctx, snap_id,
+ create_context_callback<klass, &klass::handle_set_snap>(this));
+ req->send();
+ return nullptr;
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_set_snap(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to set image snapshot: " << cpp_strerror(*result)
+ << dendl;
+ send_close_image(*result);
+ return nullptr;
+ }
+
+ return finalize(*result);
+}
+
+template <typename I>
+Context *OpenRequest<I>::finalize(int r) {
+ if (r == 0) {
+ auto io_scheduler_cfg =
+ m_image_ctx->config.template get_val<std::string>("rbd_io_scheduler");
+
+ if (io_scheduler_cfg == "simple" && !m_image_ctx->read_only) {
+ auto io_scheduler =
+ io::SimpleSchedulerObjectDispatch<I>::create(m_image_ctx);
+ io_scheduler->init();
+ }
+ }
+
+ return m_on_finish;
+}
+
+template <typename I>
+void OpenRequest<I>::send_close_image(int error_result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_error_result = error_result;
+
+ using klass = OpenRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_close_image>(
+ this);
+ CloseRequest<I> *req = CloseRequest<I>::create(m_image_ctx, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_close_image(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to close image: " << cpp_strerror(*result) << dendl;
+ }
+ if (m_error_result < 0) {
+ *result = m_error_result;
+ }
+ return m_on_finish;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::OpenRequest<librbd::ImageCtx>;
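+
+// Usage sketch (illustrative): callers normally reach this state
+// machine through ImageState::open() rather than instantiating it
+// directly, but the direct form mirrors the other request classes in
+// this directory:
+//
+//   auto req = librbd::image::OpenRequest<librbd::ImageCtx>::create(
+//       image_ctx, 0U /* flags */, on_finish);
+//   req->send();  // on_finish completes with the final open result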
diff --git a/src/librbd/image/OpenRequest.h b/src/librbd/image/OpenRequest.h
new file mode 100644
index 000000000..0fe218a39
--- /dev/null
+++ b/src/librbd/image/OpenRequest.h
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
+
+#include "include/buffer.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class OpenRequest {
+public:
+ static OpenRequest *create(ImageCtxT *image_ctx, uint64_t flags,
+ Context *on_finish) {
+ return new OpenRequest(image_ctx, flags, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | (v1)
+ * |-----> V1_DETECT_HEADER
+ * | |
+ * | \-------------------------------\
+ * | (v2) |
+ * \-----> V2_DETECT_HEADER |
+ * | |
+ * v |
+ * V2_GET_ID|NAME |
+ * | |
+ * v (skip if name known) |
+ * V2_GET_NAME_FROM_TRASH |
+ * | |
+ * v |
+ * V2_GET_INITIAL_METADATA |
+ * | |
+ * v |
+ * V2_GET_STRIPE_UNIT_COUNT (skip if |
+ * | disabled) |
+ * v |
+ * V2_GET_CREATE_TIMESTAMP |
+ * | |
+ * v |
+ * V2_GET_ACCESS_MODIFY_TIMESTAMP |
+ * | |
+ * v |
+ * V2_GET_DATA_POOL --------------> REFRESH
+ * |
+ * v
+ * INIT_PLUGIN_REGISTRY
+ * |
+ * v
+ * INIT_CACHE
+ * |
+ * v
+ * REGISTER_WATCH (skip if
+ * | read-only)
+ * v
+ * SET_SNAP (skip if no snap)
+ * |
+ * v
+ * <finish>
+ * ^
+ * (on error) |
+ * * * * * * * > CLOSE ------------------------/
+ *
+ * @endverbatim
+ */
+
+ OpenRequest(ImageCtxT *image_ctx, uint64_t flags, Context *on_finish);
+
+ ImageCtxT *m_image_ctx;
+ bool m_skip_open_parent_image;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+ int m_error_result;
+
+ void send_v1_detect_header();
+ Context *handle_v1_detect_header(int *result);
+
+ void send_v2_detect_header();
+ Context *handle_v2_detect_header(int *result);
+
+ void send_v2_get_id();
+ Context *handle_v2_get_id(int *result);
+
+ void send_v2_get_name();
+ Context *handle_v2_get_name(int *result);
+
+ void send_v2_get_name_from_trash();
+ Context *handle_v2_get_name_from_trash(int *result);
+
+ void send_v2_get_initial_metadata();
+ Context *handle_v2_get_initial_metadata(int *result);
+
+ void send_v2_get_stripe_unit_count();
+ Context *handle_v2_get_stripe_unit_count(int *result);
+
+ void send_v2_get_create_timestamp();
+ Context *handle_v2_get_create_timestamp(int *result);
+
+ void send_v2_get_access_modify_timestamp();
+ Context *handle_v2_get_access_modify_timestamp(int *result);
+
+ void send_v2_get_data_pool();
+ Context *handle_v2_get_data_pool(int *result);
+
+ void send_refresh();
+ Context *handle_refresh(int *result);
+
+ void send_init_plugin_registry();
+ Context* handle_init_plugin_registry(int *result);
+
+ Context *send_init_cache(int *result);
+
+ Context *send_register_watch(int *result);
+ Context *handle_register_watch(int *result);
+
+ Context *send_set_snap(int *result);
+ Context *handle_set_snap(int *result);
+
+ Context *finalize(int r);
+
+ void send_close_image(int error_result);
+ Context *handle_close_image(int *result);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::OpenRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
diff --git a/src/librbd/image/PreRemoveRequest.cc b/src/librbd/image/PreRemoveRequest.cc
new file mode 100644
index 000000000..fa4141834
--- /dev/null
+++ b/src/librbd/image/PreRemoveRequest.cc
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/PreRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/Utils.h"
+#include "librbd/exclusive_lock/StandardPolicy.h"
+#include "librbd/image/ListWatchersRequest.h"
+#include "librbd/journal/DisabledPolicy.h"
+#include "librbd/operation/SnapshotRemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::PreRemoveRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+namespace {
+
+bool auto_delete_snapshot(const SnapInfo& snap_info) {
+ auto snap_namespace_type = cls::rbd::get_snap_namespace_type(
+ snap_info.snap_namespace);
+ switch (snap_namespace_type) {
+ case cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool ignore_snapshot(const SnapInfo& snap_info) {
+ auto snap_namespace_type = cls::rbd::get_snap_namespace_type(
+ snap_info.snap_namespace);
+ switch (snap_namespace_type) {
+ case cls::rbd::SNAPSHOT_NAMESPACE_TYPE_MIRROR:
+ return true;
+ default:
+ return false;
+ }
+}
+
+} // anonymous namespace
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+void PreRemoveRequest<I>::send() {
+ auto cct = m_image_ctx->cct;
+ if (m_image_ctx->operations_disabled) {
+ lderr(cct) << "image operations disabled due to unsupported op features"
+ << dendl;
+ finish(-EROFS);
+ return;
+ }
+
+ acquire_exclusive_lock();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::acquire_exclusive_lock() {
+ // lock for write for set_exclusive_lock_policy()
+ std::unique_lock owner_locker{m_image_ctx->owner_lock};
+ if (m_image_ctx->exclusive_lock == nullptr) {
+ owner_locker.unlock();
+ validate_image_removal();
+ return;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ // refuse to release the exclusive lock while the image is being
+ // removed
+ m_image_ctx->set_exclusive_lock_policy(
+ new exclusive_lock::StandardPolicy<I>(m_image_ctx));
+
+ // do not attempt to open the journal when removing the image in case
+ // it's corrupt
+ if (m_image_ctx->test_features(RBD_FEATURE_JOURNALING)) {
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ m_image_ctx->set_journal_policy(new journal::DisabledPolicy());
+ }
+
+ m_exclusive_lock = m_image_ctx->exclusive_lock;
+
+ auto ctx = create_context_callback<
+ PreRemoveRequest<I>,
+ &PreRemoveRequest<I>::handle_exclusive_lock>(this, m_exclusive_lock);
+ m_exclusive_lock->acquire_lock(ctx);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_exclusive_lock(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 || !m_image_ctx->exclusive_lock->is_lock_owner()) {
+ if (!m_force) {
+ lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl;
+ finish(-EBUSY);
+ } else {
+ ldout(cct, 5) << "cannot obtain exclusive lock - "
+ << "proceeding due to force flag set" << dendl;
+ shut_down_exclusive_lock();
+ }
+ return;
+ }
+
+ validate_image_removal();
+}
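+
+// Behaviour summary: without the force flag a lock that cannot be
+// acquired aborts the removal with -EBUSY; with it, the local
+// exclusive lock is shut down so removal can proceed, although the
+// subsequent watcher check can still veto it.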
+
+template <typename I>
+void PreRemoveRequest<I>::shut_down_exclusive_lock() {
+ std::shared_lock owner_locker{m_image_ctx->owner_lock};
+ if (m_image_ctx->exclusive_lock == nullptr) {
+ owner_locker.unlock();
+ validate_image_removal();
+ return;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = create_context_callback<
+ PreRemoveRequest<I>,
+ &PreRemoveRequest<I>::handle_shut_down_exclusive_lock>(this);
+
+ m_exclusive_lock = m_image_ctx->exclusive_lock;
+ m_exclusive_lock->shut_down(ctx);
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_shut_down_exclusive_lock(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ m_exclusive_lock->put();
+ m_exclusive_lock = nullptr;
+
+ if (r < 0) {
+ lderr(cct) << "error shutting down exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ ceph_assert(m_image_ctx->exclusive_lock == nullptr);
+ validate_image_removal();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::validate_image_removal() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ if (!m_image_ctx->ignore_migrating &&
+ m_image_ctx->test_features(RBD_FEATURE_MIGRATING)) {
+ lderr(cct) << "image in migration state - not removing" << dendl;
+ finish(-EBUSY);
+ return;
+ }
+
+ check_image_snaps();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::check_image_snaps() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ m_image_ctx->image_lock.lock_shared();
+ for (auto& snap_info : m_image_ctx->snap_info) {
+ if (auto_delete_snapshot(snap_info.second)) {
+ m_snap_infos.insert(snap_info);
+ } else if (!ignore_snapshot(snap_info.second)) {
+ m_image_ctx->image_lock.unlock_shared();
+
+ ldout(cct, 5) << "image has snapshots - not removing" << dendl;
+ finish(-ENOTEMPTY);
+ return;
+ }
+ }
+ m_image_ctx->image_lock.unlock_shared();
+
+ list_image_watchers();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::list_image_watchers() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ int flags = LIST_WATCHERS_FILTER_OUT_MY_INSTANCE |
+ LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES;
+ auto ctx = create_context_callback<
+ PreRemoveRequest<I>,
+ &PreRemoveRequest<I>::handle_list_image_watchers>(this);
+ auto req = ListWatchersRequest<I>::create(*m_image_ctx, flags, &m_watchers,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_list_image_watchers(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "error listing image watchers: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ check_image_watchers();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::check_image_watchers() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ if (!m_watchers.empty()) {
+ lderr(cct) << "image has watchers - not removing" << dendl;
+ finish(-EBUSY);
+ return;
+ }
+
+ check_group();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::check_group() {
+ if (m_image_ctx->old_format) {
+ finish(0);
+ return;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::image_group_get_start(&op);
+
+ auto rados_completion = create_rados_callback<
+ PreRemoveRequest<I>, &PreRemoveRequest<I>::handle_check_group>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid,
+ rados_completion, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_check_group(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ cls::rbd::GroupSpec s;
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::image_group_get_finish(&it, &s);
+ }
+ if (r < 0 && r != -EOPNOTSUPP) {
+ lderr(cct) << "error fetching group for image: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (s.is_valid()) {
+ lderr(cct) << "image is in a group - not removing" << dendl;
+ finish(-EMLINK);
+ return;
+ }
+
+ remove_snapshot();
+}
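+
+// -EMLINK is deliberately reused here to signal "image still belongs
+// to a group"; the caller must detach the image from its group before
+// deleting it. -EOPNOTSUPP (an OSD that predates group support) is
+// treated as "not in a group".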
+
+template <typename I>
+void PreRemoveRequest<I>::remove_snapshot() {
+ if (m_snap_infos.empty()) {
+ finish(0);
+ return;
+ }
+
+ auto cct = m_image_ctx->cct;
+ auto snap_id = m_snap_infos.begin()->first;
+ auto& snap_info = m_snap_infos.begin()->second;
+ ldout(cct, 20) << "snap_id=" << snap_id << ", "
+ << "snap_name=" << snap_info.name << dendl;
+
+ std::shared_lock owner_lock{m_image_ctx->owner_lock};
+ auto ctx = create_context_callback<
+ PreRemoveRequest<I>, &PreRemoveRequest<I>::handle_remove_snapshot>(this);
+ auto req = librbd::operation::SnapshotRemoveRequest<I>::create(
+ *m_image_ctx, snap_info.snap_namespace, snap_info.name,
+ snap_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void PreRemoveRequest<I>::handle_remove_snapshot(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == -EBUSY) {
+ ldout(cct, 5) << "skipping attached child" << dendl;
+ if (m_ret_val == 0) {
+ m_ret_val = -ECHILD;
+ }
+ } else if (r < 0 && r != -ENOENT) {
+ auto snap_id = m_snap_infos.begin()->first;
+ lderr(cct) << "failed to auto-prune snapshot " << snap_id << ": "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ ceph_assert(!m_snap_infos.empty());
+ m_snap_infos.erase(m_snap_infos.begin());
+
+ remove_snapshot();
+}
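+
+// remove_snapshot()/handle_remove_snapshot() form a simple
+// asynchronous loop: each completion erases the front entry of
+// m_snap_infos and re-enters remove_snapshot() until the map is
+// empty, at which point finish() delivers 0 or the deferred -ECHILD.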
+
+template <typename I>
+void PreRemoveRequest<I>::finish(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+
+ m_on_finish->complete(m_ret_val);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::PreRemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/PreRemoveRequest.h b/src/librbd/image/PreRemoveRequest.h
new file mode 100644
index 000000000..06b3bf2f8
--- /dev/null
+++ b/src/librbd/image/PreRemoveRequest.h
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+#include "librbd/ImageCtx.h"
+#include <list>
+#include <map>
+
+class Context;
+
+namespace librbd {
+namespace image {
+
+template <typename ImageCtxT>
+class PreRemoveRequest {
+public:
+
+ static PreRemoveRequest *create(ImageCtxT *image_ctx, bool force,
+ Context *on_finish) {
+ return new PreRemoveRequest(image_ctx, force, on_finish);
+ }
+
+ PreRemoveRequest(ImageCtxT *image_ctx, bool force, Context *on_finish)
+ : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | (skip if
+ * v not needed) (error)
+ * ACQUIRE EXCLUSIVE LOCK * * * * * * > SHUT DOWN EXCLUSIVE LOCK
+ * | |
+ * v |
+ * CHECK IMAGE WATCHERS <------------------/
+ * |
+ * v
+ * CHECK GROUP
+ * |
+ * | /------\
+ * | | |
+ * v v |
+ * REMOVE SNAPS ----/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT* m_image_ctx;
+ bool m_force;
+ Context* m_on_finish;
+
+ decltype(m_image_ctx->exclusive_lock) m_exclusive_lock = nullptr;
+
+ bufferlist m_out_bl;
+ std::list<obj_watch_t> m_watchers;
+
+ std::map<uint64_t, SnapInfo> m_snap_infos;
+ int m_ret_val = 0;
+
+ void acquire_exclusive_lock();
+ void handle_exclusive_lock(int r);
+
+ void shut_down_exclusive_lock();
+ void handle_shut_down_exclusive_lock(int r);
+
+ void validate_image_removal();
+ void check_image_snaps();
+
+ void list_image_watchers();
+ void handle_list_image_watchers(int r);
+
+ void check_image_watchers();
+
+ void check_group();
+ void handle_check_group(int r);
+
+ void remove_snapshot();
+ void handle_remove_snapshot(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::PreRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H
diff --git a/src/librbd/image/RefreshParentRequest.cc b/src/librbd/image/RefreshParentRequest.cc
new file mode 100644
index 000000000..348226c39
--- /dev/null
+++ b/src/librbd/image/RefreshParentRequest.cc
@@ -0,0 +1,244 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/RefreshParentRequest.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/migration/OpenSourceImageRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::RefreshParentRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+template <typename I>
+RefreshParentRequest<I>::RefreshParentRequest(
+ I &child_image_ctx, const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info, Context *on_finish)
+ : m_child_image_ctx(child_image_ctx), m_parent_md(parent_md),
+ m_migration_info(migration_info), m_on_finish(on_finish),
+ m_parent_image_ctx(nullptr), m_parent_snap_id(CEPH_NOSNAP),
+ m_error_result(0) {
+}
+
+template <typename I>
+bool RefreshParentRequest<I>::is_refresh_required(
+ I &child_image_ctx, const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info) {
+ ceph_assert(ceph_mutex_is_locked(child_image_ctx.image_lock));
+ return (is_open_required(child_image_ctx, parent_md, migration_info) ||
+ is_close_required(child_image_ctx, parent_md, migration_info));
+}
+
+template <typename I>
+bool RefreshParentRequest<I>::is_close_required(
+ I &child_image_ctx, const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info) {
+ return (child_image_ctx.parent != nullptr &&
+ !does_parent_exist(child_image_ctx, parent_md, migration_info));
+}
+
+template <typename I>
+bool RefreshParentRequest<I>::is_open_required(
+ I &child_image_ctx, const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info) {
+ return (does_parent_exist(child_image_ctx, parent_md, migration_info) &&
+ (child_image_ctx.parent == nullptr ||
+ child_image_ctx.parent->md_ctx.get_id() != parent_md.spec.pool_id ||
+ child_image_ctx.parent->md_ctx.get_namespace() !=
+ parent_md.spec.pool_namespace ||
+ child_image_ctx.parent->id != parent_md.spec.image_id ||
+ child_image_ctx.parent->snap_id != parent_md.spec.snap_id));
+}
+
+template <typename I>
+bool RefreshParentRequest<I>::does_parent_exist(
+ I &child_image_ctx, const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info) {
+ if (child_image_ctx.child != nullptr &&
+ child_image_ctx.child->migration_info.empty() && parent_md.overlap == 0) {
+ // intermediate, non-migrating images should only open their parent if they
+ // overlap
+ return false;
+ }
+
+ return (parent_md.spec.pool_id > -1 && parent_md.overlap > 0) ||
+ !migration_info.empty();
+}
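+
+// A parent is considered to exist when the parent spec references a
+// valid pool and the clone still overlaps it, or when the image is a
+// migration target (non-empty migration_info). Zero-overlap
+// intermediate clones skip opening the parent entirely, since no read
+// could ever be redirected to it.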
+
+template <typename I>
+void RefreshParentRequest<I>::send() {
+ if (is_open_required(m_child_image_ctx, m_parent_md, m_migration_info)) {
+ send_open_parent();
+ } else {
+ // parent will be closed (if necessary) during finalize
+ send_complete(0);
+ }
+}
+
+template <typename I>
+void RefreshParentRequest<I>::apply() {
+ ceph_assert(ceph_mutex_is_wlocked(m_child_image_ctx.image_lock));
+ std::swap(m_child_image_ctx.parent, m_parent_image_ctx);
+}
+
+template <typename I>
+void RefreshParentRequest<I>::finalize(Context *on_finish) {
+ CephContext *cct = m_child_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_on_finish = on_finish;
+ if (m_parent_image_ctx != nullptr) {
+ send_close_parent();
+ } else {
+ send_complete(0);
+ }
+}
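+
+// Lifecycle note: the refresh state machine drives this object in
+// three phases: send() opens the replacement parent in the
+// background, apply() swaps it into the child's parent pointer, and
+// finalize() closes whichever parent context was swapped out. A
+// hedged sketch of the calling sequence (ictx and the callbacks are
+// illustrative):
+//
+//   if (RefreshParentRequest<I>::is_refresh_required(ictx, parent_md,
+//                                                    migration_info)) {
+//     auto req = RefreshParentRequest<I>::create(ictx, parent_md,
+//                                                migration_info, on_open);
+//     req->send();
+//     // later, with ictx.image_lock held for write:
+//     req->apply();
+//     req->finalize(on_close);  // closes the displaced parent
+//   }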
+
+template <typename I>
+void RefreshParentRequest<I>::send_open_parent() {
+ ceph_assert(m_parent_md.spec.pool_id >= 0);
+
+ CephContext *cct = m_child_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ if (!m_migration_info.empty()) {
+ auto ctx = create_async_context_callback(
+ m_child_image_ctx, create_context_callback<
+ RefreshParentRequest<I>,
+ &RefreshParentRequest<I>::handle_open_parent, false>(this));
+ auto req = migration::OpenSourceImageRequest<I>::create(
+ m_child_image_ctx.md_ctx, &m_child_image_ctx, m_parent_md.spec.snap_id,
+ m_migration_info, &m_parent_image_ctx, ctx);
+ req->send();
+ return;
+ }
+
+ librados::IoCtx parent_io_ctx;
+ int r = util::create_ioctx(m_child_image_ctx.md_ctx, "parent image",
+ m_parent_md.spec.pool_id,
+ m_parent_md.spec.pool_namespace, &parent_io_ctx);
+ if (r < 0) {
+ send_complete(r);
+ return;
+ }
+
+ m_parent_image_ctx = new I("", m_parent_md.spec.image_id,
+ m_parent_md.spec.snap_id, parent_io_ctx, true);
+ m_parent_image_ctx->child = &m_child_image_ctx;
+
+ // set rados flags for reading the parent image
+ if (m_child_image_ctx.config.template get_val<bool>("rbd_balance_parent_reads")) {
+ m_parent_image_ctx->set_read_flag(librados::OPERATION_BALANCE_READS);
+ } else if (m_child_image_ctx.config.template get_val<bool>("rbd_localize_parent_reads")) {
+ m_parent_image_ctx->set_read_flag(librados::OPERATION_LOCALIZE_READS);
+ }
+
+ auto ctx = create_async_context_callback(
+ m_child_image_ctx, create_context_callback<
+ RefreshParentRequest<I>,
+ &RefreshParentRequest<I>::handle_open_parent, false>(this));
+ m_parent_image_ctx->state->open(0U, ctx);
+}
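+
+// rbd_balance_parent_reads spreads parent reads across replicas and
+// rbd_localize_parent_reads prefers the nearest replica; relaxing
+// read consistency is safe here because the parent is opened at an
+// immutable snapshot.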
+
+template <typename I>
+Context *RefreshParentRequest<I>::handle_open_parent(int *result) {
+ CephContext *cct = m_child_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+ save_result(result);
+ if (*result < 0) {
+ lderr(cct) << "failed to open parent image: " << cpp_strerror(*result)
+ << dendl;
+
+ // image already closed by open state machine
+ m_parent_image_ctx = nullptr;
+ }
+
+ return m_on_finish;
+}
+
+template <typename I>
+void RefreshParentRequest<I>::send_close_parent() {
+ ceph_assert(m_parent_image_ctx != nullptr);
+
+ CephContext *cct = m_child_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ auto ctx = create_async_context_callback(
+ m_child_image_ctx, create_context_callback<
+ RefreshParentRequest<I>,
+ &RefreshParentRequest<I>::handle_close_parent, false>(this));
+ m_parent_image_ctx->state->close(ctx);
+}
+
+template <typename I>
+Context *RefreshParentRequest<I>::handle_close_parent(int *result) {
+ CephContext *cct = m_child_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+ m_parent_image_ctx = nullptr;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to close parent image: " << cpp_strerror(*result)
+ << dendl;
+ }
+
+ send_reset_existence_cache();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshParentRequest<I>::send_reset_existence_cache() {
+ CephContext *cct = m_child_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_async_context_callback(
+ m_child_image_ctx, create_context_callback<
+ RefreshParentRequest<I>,
+ &RefreshParentRequest<I>::handle_reset_existence_cache, false>(this));
+ m_child_image_ctx.io_object_dispatcher->reset_existence_cache(ctx);
+}
+
+template <typename I>
+Context *RefreshParentRequest<I>::handle_reset_existence_cache(int *result) {
+ CephContext *cct = m_child_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to reset object existence cache: "
+ << cpp_strerror(*result) << dendl;
+ }
+
+ if (m_error_result < 0) {
+ // propagate errors from opening the image
+ *result = m_error_result;
+ } else {
+ *result = 0;
+ }
+ return m_on_finish;
+}
+
+template <typename I>
+void RefreshParentRequest<I>::send_complete(int r) {
+ CephContext *cct = m_child_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_on_finish->complete(r);
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RefreshParentRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RefreshParentRequest.h b/src/librbd/image/RefreshParentRequest.h
new file mode 100644
index 000000000..086d8ec1b
--- /dev/null
+++ b/src/librbd/image/RefreshParentRequest.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/Types.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class RefreshParentRequest {
+public:
+ static RefreshParentRequest *create(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info,
+ Context *on_finish) {
+ return new RefreshParentRequest(child_image_ctx, parent_md, migration_info,
+ on_finish);
+ }
+
+ static bool is_refresh_required(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info);
+
+ void send();
+ void apply();
+ void finalize(Context *on_finish);
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | (open required)
+ * |----------------> OPEN_PARENT * * * * * * * * * * * * * * *
+ * | | *
+ * | v (on error) *
+ * \----------------> <apply> *
+ * | *
+ * | (close required) *
+ * |-----------------> CLOSE_PARENT *
+ * | | *
+ * | v *
+ * | RESET_EXISTENCE *
+ * | | *
+ * | v *
+ * \-----------------> <finish> < * * * *
+ *
+ * @endverbatim
+ */
+
+ RefreshParentRequest(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info, Context *on_finish);
+
+ ImageCtxT &m_child_image_ctx;
+ ParentImageInfo m_parent_md;
+ MigrationInfo m_migration_info;
+ Context *m_on_finish;
+
+ ImageCtxT *m_parent_image_ctx;
+ uint64_t m_parent_snap_id;
+
+ int m_error_result;
+
+ static bool is_close_required(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info);
+ static bool is_open_required(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info);
+ static bool does_parent_exist(ImageCtxT &child_image_ctx,
+ const ParentImageInfo &parent_md,
+ const MigrationInfo &migration_info);
+
+ void send_open_parent();
+ Context *handle_open_parent(int *result);
+
+ void send_close_parent();
+ Context *handle_close_parent(int *result);
+
+ void send_reset_existence_cache();
+ Context *handle_reset_existence_cache(int *result);
+
+ void send_complete(int r);
+
+ void save_result(int *result) {
+ if (m_error_result == 0 && *result < 0) {
+ m_error_result = *result;
+ }
+ }
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::RefreshParentRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
diff --git a/src/librbd/image/RefreshRequest.cc b/src/librbd/image/RefreshRequest.cc
new file mode 100644
index 000000000..24159c55b
--- /dev/null
+++ b/src/librbd/image/RefreshRequest.cc
@@ -0,0 +1,1575 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/ceph_assert.h"
+
+#include "librbd/image/RefreshRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/Utils.h"
+#include "librbd/image/GetMetadataRequest.h"
+#include "librbd/image/RefreshParentRequest.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/journal/Policy.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::RefreshRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_rados_callback;
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+template <typename I>
+RefreshRequest<I>::RefreshRequest(I &image_ctx, bool acquiring_lock,
+ bool skip_open_parent, Context *on_finish)
+ : m_image_ctx(image_ctx), m_acquiring_lock(acquiring_lock),
+ m_skip_open_parent_image(skip_open_parent),
+ m_on_finish(create_async_context_callback(m_image_ctx, on_finish)),
+ m_error_result(0), m_flush_aio(false), m_exclusive_lock(nullptr),
+ m_object_map(nullptr), m_journal(nullptr), m_refresh_parent(nullptr) {
+ m_pool_metadata_io_ctx.dup(image_ctx.md_ctx);
+ m_pool_metadata_io_ctx.set_namespace("");
+}
+
+template <typename I>
+RefreshRequest<I>::~RefreshRequest() {
+ // these require state machine to close
+ ceph_assert(m_exclusive_lock == nullptr);
+ ceph_assert(m_object_map == nullptr);
+ ceph_assert(m_journal == nullptr);
+ ceph_assert(m_refresh_parent == nullptr);
+ ceph_assert(!m_blocked_writes);
+}
+
+template <typename I>
+void RefreshRequest<I>::send() {
+ if (m_image_ctx.old_format) {
+ send_v1_read_header();
+ } else {
+ send_v2_get_mutable_metadata();
+ }
+}
+
+template <typename I>
+void RefreshRequest<I>::send_get_migration_header() {
+ if (m_image_ctx.ignore_migrating) {
+ m_migration_spec = {};
+ if (m_image_ctx.old_format) {
+ send_v1_get_snapshots();
+ } else {
+ send_v2_get_metadata();
+ }
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::migration_get_start(&op);
+
+ using klass = RefreshRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_get_migration_header>(this);
+ m_out_bl.clear();
+ m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+ &m_out_bl);
+ comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_get_migration_header(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result >= 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::migration_get_finish(&it, &m_migration_spec);
+ } else if (*result == -ENOENT) {
+ ldout(cct, 5) << this << " " << __func__ << ": no migration header found"
+ << ", retrying" << dendl;
+ send();
+ return nullptr;
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve migration header: "
+ << cpp_strerror(*result) << dendl;
+ return m_on_finish;
+ }
+
+ switch (m_migration_spec.header_type) {
+ case cls::rbd::MIGRATION_HEADER_TYPE_SRC:
+ if (!m_read_only) {
+ lderr(cct) << "image being migrated" << dendl;
+ *result = -EROFS;
+ return m_on_finish;
+ }
+ ldout(cct, 1) << this << " " << __func__ << ": migrating to: "
+ << m_migration_spec << dendl;
+ break;
+ case cls::rbd::MIGRATION_HEADER_TYPE_DST:
+ ldout(cct, 1) << this << " " << __func__ << ": migrating from: "
+ << m_migration_spec << dendl;
+ switch (m_migration_spec.state) {
+ case cls::rbd::MIGRATION_STATE_PREPARING:
+ ldout(cct, 5) << this << " " << __func__ << ": current migration state: "
+ << m_migration_spec.state << ", retrying" << dendl;
+ send();
+ return nullptr;
+ case cls::rbd::MIGRATION_STATE_PREPARED:
+ case cls::rbd::MIGRATION_STATE_EXECUTING:
+ case cls::rbd::MIGRATION_STATE_EXECUTED:
+ break;
+ case cls::rbd::MIGRATION_STATE_ABORTING:
+ if (!m_read_only) {
+ lderr(cct) << this << " " << __func__ << ": migration is being aborted"
+ << dendl;
+ *result = -EROFS;
+ return m_on_finish;
+ }
+ break;
+ default:
+ lderr(cct) << this << " " << __func__ << ": migration is in an "
+ << "unexpected state" << dendl;
+ *result = -EINVAL;
+ return m_on_finish;
+ }
+ break;
+ default:
+ ldout(cct, 1) << this << " " << __func__ << ": migration type "
+ << m_migration_spec.header_type << dendl;
+ *result = -EBADMSG;
+ return m_on_finish;
+ }
+
+ if (m_image_ctx.old_format) {
+ send_v1_get_snapshots();
+ } else {
+ send_v2_get_metadata();
+ }
+ return nullptr;
+}
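+
+// Migration state summary: a source header, or a destination whose
+// migration is being aborted, forces -EROFS unless the image was
+// opened read-only; a destination still in the PREPARING state
+// retries the whole refresh, while PREPARED/EXECUTING/EXECUTED
+// proceed normally.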
+
+template <typename I>
+void RefreshRequest<I>::send_v1_read_header() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ op.read(0, 0, nullptr, nullptr);
+
+ using klass = RefreshRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v1_read_header>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_read_header(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl;
+
+ rbd_obj_header_ondisk v1_header;
+ bool migrating = false;
+ if (*result < 0) {
+ return m_on_finish;
+ } else if (m_out_bl.length() < sizeof(v1_header)) {
+ lderr(cct) << "v1 header too small" << dendl;
+ *result = -EIO;
+ return m_on_finish;
+ } else if (memcmp(RBD_HEADER_TEXT, m_out_bl.c_str(),
+ sizeof(RBD_HEADER_TEXT)) != 0) {
+ if (memcmp(RBD_MIGRATE_HEADER_TEXT, m_out_bl.c_str(),
+ sizeof(RBD_MIGRATE_HEADER_TEXT)) == 0) {
+ ldout(cct, 1) << this << " " << __func__ << ": migration v1 header detected"
+ << dendl;
+ migrating = true;
+ } else {
+ lderr(cct) << "unrecognized v1 header" << dendl;
+ *result = -ENXIO;
+ return m_on_finish;
+ }
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ m_read_only = m_image_ctx.read_only;
+ m_read_only_flags = m_image_ctx.read_only_flags;
+ }
+
+ memcpy(&v1_header, m_out_bl.c_str(), sizeof(v1_header));
+ m_order = v1_header.options.order;
+ m_size = v1_header.image_size;
+ m_object_prefix = v1_header.block_name;
+ if (migrating) {
+ send_get_migration_header();
+ } else {
+ m_migration_spec = {};
+ send_v1_get_snapshots();
+ }
+ return nullptr;
+}
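+
+// Format 1 images store a fixed rbd_obj_header_ondisk struct in the
+// header object; the text signature at offset 0 distinguishes a
+// normal header (RBD_HEADER_TEXT) from one rewritten by live
+// migration (RBD_MIGRATE_HEADER_TEXT). Only the order, size and
+// block-name prefix are extracted here; snapshots and locks follow
+// in separate steps.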
+
+template <typename I>
+void RefreshRequest<I>::send_v1_get_snapshots() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::old_snapshot_list_start(&op);
+
+ using klass = RefreshRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v1_get_snapshots>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_get_snapshots(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl;
+
+ std::vector<std::string> snap_names;
+ std::vector<uint64_t> snap_sizes;
+ if (*result >= 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::old_snapshot_list_finish(&it, &snap_names,
+ &snap_sizes, &m_snapc);
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve v1 snapshots: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ if (!m_snapc.is_valid()) {
+ lderr(cct) << "v1 image snap context is invalid" << dendl;
+ *result = -EIO;
+ return m_on_finish;
+ }
+
+ m_snap_infos.clear();
+ for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+ m_snap_infos.push_back({m_snapc.snaps[i],
+ {cls::rbd::UserSnapshotNamespace{}},
+ snap_names[i], snap_sizes[i], {}, 0});
+ }
+
+ send_v1_get_locks();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v1_get_locks() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME);
+
+ using klass = RefreshRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v1_get_locks>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_get_locks(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "r=" << *result << dendl;
+
+ if (*result >= 0) {
+ auto it = m_out_bl.cbegin();
+ ClsLockType lock_type;
+ *result = rados::cls::lock::get_lock_info_finish(&it, &m_lockers,
+ &lock_type, &m_lock_tag);
+ if (*result >= 0) {
+ m_exclusive_locked = (lock_type == ClsLockType::EXCLUSIVE);
+ }
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve locks: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ send_v1_apply();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v1_apply() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // ensure we are not in a rados callback when applying updates
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v1_apply>(this);
+ m_image_ctx.op_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_apply(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ apply();
+ return send_flush_aio();
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_get_mutable_metadata() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ uint64_t snap_id;
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ snap_id = m_image_ctx.snap_id;
+ m_read_only = m_image_ctx.read_only;
+ m_read_only_flags = m_image_ctx.read_only_flags;
+ }
+
+ // mask out the non-primary read-only flag since its state can change
+ bool read_only = (
+ ((m_read_only_flags & ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY) != 0) ||
+ (snap_id != CEPH_NOSNAP));
+ librados::ObjectReadOperation op;
+ cls_client::get_size_start(&op, CEPH_NOSNAP);
+ cls_client::get_features_start(&op, read_only);
+ cls_client::get_flags_start(&op, CEPH_NOSNAP);
+ cls_client::get_snapcontext_start(&op);
+ rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME);
+
+ using klass = RefreshRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_mutable_metadata>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_mutable_metadata(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "r=" << *result << dendl;
+
+ auto it = m_out_bl.cbegin();
+ if (*result >= 0) {
+ uint8_t order;
+ *result = cls_client::get_size_finish(&it, &m_size, &order);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_features_finish(&it, &m_features,
+ &m_incompatible_features);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_flags_finish(&it, &m_flags);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_snapcontext_finish(&it, &m_snapc);
+ }
+
+ if (*result >= 0) {
+ ClsLockType lock_type;
+ *result = rados::cls::lock::get_lock_info_finish(&it, &m_lockers,
+ &lock_type, &m_lock_tag);
+ if (*result >= 0) {
+ m_exclusive_locked = (lock_type == ClsLockType::EXCLUSIVE);
+ }
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve mutable metadata: "
+ << cpp_strerror(*result) << dendl;
+ return m_on_finish;
+ }
+
+ uint64_t unsupported = m_incompatible_features & ~RBD_FEATURES_ALL;
+ if (unsupported != 0ULL) {
+ lderr(cct) << "Image uses unsupported features: " << unsupported << dendl;
+ *result = -ENOSYS;
+ return m_on_finish;
+ }
+
+ if (!m_snapc.is_valid()) {
+ lderr(cct) << "image snap context is invalid!" << dendl;
+ *result = -EIO;
+ return m_on_finish;
+ }
+
+ if (m_acquiring_lock && (m_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+ ldout(cct, 5) << "ignoring dynamically disabled exclusive lock" << dendl;
+ m_features |= RBD_FEATURE_EXCLUSIVE_LOCK;
+ m_incomplete_update = true;
+ } else {
+ m_incomplete_update = false;
+ }
+
+ if (((m_incompatible_features & RBD_FEATURE_NON_PRIMARY) != 0U) &&
+ ((m_read_only_flags & IMAGE_READ_ONLY_FLAG_NON_PRIMARY) == 0U) &&
+ ((m_image_ctx.read_only_mask & IMAGE_READ_ONLY_FLAG_NON_PRIMARY) != 0U)) {
+ // implies we opened a non-primary image in R/W mode
+ ldout(cct, 5) << "adding non-primary read-only image flag" << dendl;
+ m_read_only_flags |= IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ } else if ((((m_incompatible_features & RBD_FEATURE_NON_PRIMARY) == 0U) ||
+ ((m_image_ctx.read_only_mask &
+ IMAGE_READ_ONLY_FLAG_NON_PRIMARY) == 0U)) &&
+ ((m_read_only_flags & IMAGE_READ_ONLY_FLAG_NON_PRIMARY) != 0U)) {
+ ldout(cct, 5) << "removing non-primary read-only image flag" << dendl;
+ m_read_only_flags &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ }
+ m_read_only = (m_read_only_flags != 0U);
+
+ m_legacy_parent = false;
+ send_v2_get_parent();
+ return nullptr;
+}
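+
+// Non-primary handling: RBD_FEATURE_NON_PRIMARY arrives through the
+// incompatible-features mask, so a mirrored image flips between
+// read-only and read/write on refresh after demotion or promotion
+// without being reopened; read_only_mask lets callers such as
+// rbd-mirror opt out of the automatic flag.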
+
+template <typename I>
+void RefreshRequest<I>::send_v2_get_parent() {
+ // NOTE: remove support when Mimic is EOLed
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": legacy=" << m_legacy_parent
+ << dendl;
+
+ librados::ObjectReadOperation op;
+ if (!m_legacy_parent) {
+ cls_client::parent_get_start(&op);
+ cls_client::parent_overlap_get_start(&op, CEPH_NOSNAP);
+ } else {
+ cls_client::get_parent_start(&op, CEPH_NOSNAP);
+ }
+
+ auto aio_comp = create_rados_callback<
+ RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_parent>(this);
+ m_out_bl.clear();
+ m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op,
+ &m_out_bl);
+ aio_comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_parent(int *result) {
+ // NOTE: remove support when Mimic is EOLed
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ auto it = m_out_bl.cbegin();
+ if (!m_legacy_parent) {
+ if (*result >= 0) {
+ *result = cls_client::parent_get_finish(&it, &m_parent_md.spec);
+ }
+
+ std::optional<uint64_t> parent_overlap;
+ if (*result >= 0) {
+ *result = cls_client::parent_overlap_get_finish(&it, &parent_overlap);
+ }
+
+ if (*result >= 0) {
+ if (parent_overlap) {
+ m_parent_md.overlap = *parent_overlap;
+ m_head_parent_overlap = true;
+ } else {
+ m_parent_md.overlap = 0;
+ m_head_parent_overlap = false;
+ }
+ }
+ } else if (*result >= 0) {
+ *result = cls_client::get_parent_finish(&it, &m_parent_md.spec,
+ &m_parent_md.overlap);
+ m_head_parent_overlap = true;
+ }
+
+ if (*result == -EOPNOTSUPP && !m_legacy_parent) {
+ ldout(cct, 10) << "retrying using legacy parent method" << dendl;
+ m_legacy_parent = true;
+ send_v2_get_parent();
+ return nullptr;
+ } else if (*result < 0) {
+ lderr(cct) << "failed to retrieve parent: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ if ((m_features & RBD_FEATURE_MIGRATING) != 0) {
+ ldout(cct, 1) << "migrating feature set" << dendl;
+ send_get_migration_header();
+ } else {
+ m_migration_spec = {};
+ send_v2_get_metadata();
+ }
+ return nullptr;
+}
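+
+// Fallback note: parent_get/parent_overlap_get is the newer split
+// interface in which the overlap is optional and tracked separately;
+// -EOPNOTSUPP from an older OSD flips m_legacy_parent and retries
+// with the original get_parent call, which returns the spec and
+// overlap together.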
+
+template <typename I>
+void RefreshRequest<I>::send_v2_get_metadata() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ auto ctx = create_context_callback<
+ RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_metadata>(this);
+ m_metadata.clear();
+ auto req = GetMetadataRequest<I>::create(
+ m_image_ctx.md_ctx, m_image_ctx.header_oid, true,
+ ImageCtx::METADATA_CONF_PREFIX, ImageCtx::METADATA_CONF_PREFIX, 0U,
+ &m_metadata, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_metadata(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve metadata: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ send_v2_get_pool_metadata();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_get_pool_metadata() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ auto ctx = create_context_callback<
+ RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_pool_metadata>(this);
+ auto req = GetMetadataRequest<I>::create(
+ m_pool_metadata_io_ctx, RBD_INFO, true, ImageCtx::METADATA_CONF_PREFIX,
+ ImageCtx::METADATA_CONF_PREFIX, 0U, &m_metadata, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_pool_metadata(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve pool metadata: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ bool thread_safe = m_image_ctx.image_watcher->is_unregistered();
+ m_image_ctx.apply_metadata(m_metadata, thread_safe);
+
+ send_v2_get_op_features();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_get_op_features() {
+ if ((m_features & RBD_FEATURE_OPERATIONS) == 0LL) {
+ m_op_features = 0;
+ send_v2_get_group();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::op_features_get_start(&op);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_op_features>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_op_features(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "r=" << *result << dendl;
+
+  // an -EOPNOTSUPP handler isn't required since the feature bit implies
+  // the OSD supports the method
+ if (*result >= 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::op_features_get_finish(&it, &m_op_features);
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve op features: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ send_v2_get_group();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_get_group() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::image_group_get_start(&op);
+
+ using klass = RefreshRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_group>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_group(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": "
+ << "r=" << *result << dendl;
+
+ if (*result >= 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::image_group_get_finish(&it, &m_group_spec);
+ }
+
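+  // older OSDs don't support image groups; treat the image as not
+  // belonging to a group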
+ if (*result == -EOPNOTSUPP) {
+ m_group_spec = {};
+ } else if (*result < 0) {
+ lderr(cct) << "failed to retrieve group: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ m_legacy_snapshot = LEGACY_SNAPSHOT_DISABLED;
+ send_v2_get_snapshots();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_get_snapshots() {
+ m_snap_infos.resize(m_snapc.snaps.size());
+ m_snap_flags.resize(m_snapc.snaps.size());
+ m_snap_parents.resize(m_snapc.snaps.size());
+ m_snap_protection.resize(m_snapc.snaps.size());
+
+ if (m_snapc.snaps.empty()) {
+ send_v2_refresh_parent();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
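+  // batch all per-snapshot queries into a single read op; the *_finish
+  // calls in handle_v2_get_snapshots() must decode the results in the
+  // same order they are queued here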
+ librados::ObjectReadOperation op;
+ for (auto snap_id : m_snapc.snaps) {
+ if (m_legacy_snapshot != LEGACY_SNAPSHOT_DISABLED) {
+ /// NOTE: remove after Luminous is retired
+ cls_client::get_snapshot_name_start(&op, snap_id);
+ cls_client::get_size_start(&op, snap_id);
+ if (m_legacy_snapshot != LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP) {
+ cls_client::get_snapshot_timestamp_start(&op, snap_id);
+ }
+ } else {
+ cls_client::snapshot_get_start(&op, snap_id);
+ }
+
+ if (m_legacy_parent) {
+ cls_client::get_parent_start(&op, snap_id);
+ } else {
+ cls_client::parent_overlap_get_start(&op, snap_id);
+ }
+
+ cls_client::get_flags_start(&op, snap_id);
+ cls_client::get_protection_status_start(&op, snap_id);
+ }
+
+ using klass = RefreshRequest<I>;
+ librados::AioCompletion *comp = create_rados_callback<
+ klass, &klass::handle_v2_get_snapshots>(this);
+ m_out_bl.clear();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_snapshots(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl;
+
+ auto it = m_out_bl.cbegin();
+ for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+ if (m_legacy_snapshot != LEGACY_SNAPSHOT_DISABLED) {
+ /// NOTE: remove after Luminous is retired
+ std::string snap_name;
+ if (*result >= 0) {
+ *result = cls_client::get_snapshot_name_finish(&it, &snap_name);
+ }
+
+ uint64_t snap_size;
+ if (*result >= 0) {
+ uint8_t order;
+ *result = cls_client::get_size_finish(&it, &snap_size, &order);
+ }
+
+ utime_t snap_timestamp;
+ if (*result >= 0 &&
+ m_legacy_snapshot != LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP) {
+ /// NOTE: remove after Jewel is retired
+ *result = cls_client::get_snapshot_timestamp_finish(&it,
+ &snap_timestamp);
+ }
+
+ if (*result >= 0) {
+ m_snap_infos[i] = {m_snapc.snaps[i],
+ {cls::rbd::UserSnapshotNamespace{}},
+ snap_name, snap_size, snap_timestamp, 0};
+ }
+ } else if (*result >= 0) {
+ *result = cls_client::snapshot_get_finish(&it, &m_snap_infos[i]);
+ }
+
+ if (*result >= 0) {
+ if (m_legacy_parent) {
+ *result = cls_client::get_parent_finish(&it, &m_snap_parents[i].spec,
+ &m_snap_parents[i].overlap);
+ } else {
+ std::optional<uint64_t> parent_overlap;
+ *result = cls_client::parent_overlap_get_finish(&it, &parent_overlap);
+ if (*result >= 0) {
+ if (parent_overlap && m_parent_md.spec.pool_id > -1) {
+ m_snap_parents[i].spec = m_parent_md.spec;
+ m_snap_parents[i].overlap = *parent_overlap;
+ } else {
+ m_snap_parents[i] = {};
+ }
+ }
+ }
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_flags_finish(&it, &m_snap_flags[i]);
+ }
+
+ if (*result >= 0) {
+ *result = cls_client::get_protection_status_finish(
+ &it, &m_snap_protection[i]);
+ }
+
+ if (*result < 0) {
+ break;
+ }
+ }
+
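+  // a snapshot may have been created or removed between reading the snap
+  // context and reading the per-snapshot state; restart the refresh
+  // (bounded by MAX_ENOENT_RETRIES)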
+ if (*result == -ENOENT && m_enoent_retries++ < MAX_ENOENT_RETRIES) {
+ ldout(cct, 10) << "out-of-sync snapshot state detected, retrying" << dendl;
+ send_v2_get_mutable_metadata();
+ return nullptr;
+ } else if (m_legacy_snapshot == LEGACY_SNAPSHOT_DISABLED &&
+ *result == -EOPNOTSUPP) {
+ ldout(cct, 10) << "retrying using legacy snapshot methods" << dendl;
+ m_legacy_snapshot = LEGACY_SNAPSHOT_ENABLED;
+ send_v2_get_snapshots();
+ return nullptr;
+ } else if (m_legacy_snapshot == LEGACY_SNAPSHOT_ENABLED &&
+ *result == -EOPNOTSUPP) {
+ ldout(cct, 10) << "retrying using legacy snapshot methods (jewel)" << dendl;
+ m_legacy_snapshot = LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP;
+ send_v2_get_snapshots();
+ return nullptr;
+ } else if (*result < 0) {
+ lderr(cct) << "failed to retrieve snapshots: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ send_v2_refresh_parent();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_refresh_parent() {
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+
+ ParentImageInfo parent_md;
+ MigrationInfo migration_info;
+ int r = get_parent_info(m_image_ctx.snap_id, &parent_md, &migration_info);
+ if (!m_skip_open_parent_image && (r < 0 ||
+ RefreshParentRequest<I>::is_refresh_required(m_image_ctx, parent_md,
+ migration_info))) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_refresh_parent>(this);
+ m_refresh_parent = RefreshParentRequest<I>::create(
+ m_image_ctx, parent_md, migration_info, ctx);
+ }
+ }
+
+ if (m_refresh_parent != nullptr) {
+ m_refresh_parent->send();
+ } else {
+ send_v2_init_exclusive_lock();
+ }
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_refresh_parent(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == -ENOENT && m_enoent_retries++ < MAX_ENOENT_RETRIES) {
+ ldout(cct, 10) << "out-of-sync parent info detected, retrying" << dendl;
+ ceph_assert(m_refresh_parent != nullptr);
+ delete m_refresh_parent;
+ m_refresh_parent = nullptr;
+ send_v2_get_mutable_metadata();
+ return nullptr;
+ } else if (*result < 0) {
+ lderr(cct) << "failed to refresh parent image: " << cpp_strerror(*result)
+ << dendl;
+ save_result(result);
+ send_v2_apply();
+ return nullptr;
+ }
+
+ send_v2_init_exclusive_lock();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_init_exclusive_lock() {
+ if ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0 ||
+ m_read_only || !m_image_ctx.snap_name.empty() ||
+ m_image_ctx.exclusive_lock != nullptr) {
+ send_v2_open_object_map();
+ return;
+ }
+
+ // implies exclusive lock dynamically enabled or image open in-progress
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // TODO need safe shut down
+ m_exclusive_lock = m_image_ctx.create_exclusive_lock();
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_init_exclusive_lock>(this);
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ m_exclusive_lock->init(m_features, ctx);
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_init_exclusive_lock(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to initialize exclusive lock: "
+ << cpp_strerror(*result) << dendl;
+ save_result(result);
+ }
+
+ // object map and journal will be opened when exclusive lock is
+ // acquired (if features are enabled)
+ send_v2_apply();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_open_journal() {
+ bool journal_disabled = (
+ (m_features & RBD_FEATURE_JOURNALING) == 0 ||
+ m_read_only ||
+ !m_image_ctx.snap_name.empty() ||
+ m_image_ctx.journal != nullptr ||
+ m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->is_lock_owner());
+ bool journal_disabled_by_policy;
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ journal_disabled_by_policy = (
+ !journal_disabled &&
+ m_image_ctx.get_journal_policy()->journal_disabled());
+ }
+
+ if (journal_disabled || journal_disabled_by_policy) {
+    // journal was dynamically enabled, but this client doesn't own the
+    // exclusive lock
+ if ((m_features & RBD_FEATURE_JOURNALING) != 0 &&
+ !journal_disabled_by_policy &&
+ m_image_ctx.exclusive_lock != nullptr &&
+ m_image_ctx.journal == nullptr) {
+ auto ctx = new LambdaContext([this](int) {
+ send_v2_block_writes();
+ });
+ m_image_ctx.exclusive_lock->set_require_lock(
+ true, librbd::io::DIRECTION_BOTH, ctx);
+ return;
+ }
+
+ send_v2_block_writes();
+ return;
+ }
+
+ // implies journal dynamically enabled since ExclusiveLock will init
+ // the journal upon acquiring the lock
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_open_journal>(this);
+
+ // TODO need safe close
+ m_journal = m_image_ctx.create_journal();
+ m_journal->open(ctx);
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_open_journal(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to initialize journal: " << cpp_strerror(*result)
+ << dendl;
+ save_result(result);
+ }
+
+ send_v2_block_writes();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_block_writes() {
+ bool disabled_journaling = false;
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ disabled_journaling = ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 &&
+ (m_features & RBD_FEATURE_JOURNALING) == 0 &&
+ m_image_ctx.journal != nullptr);
+ }
+
+ if (!disabled_journaling) {
+ send_v2_apply();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // we need to block writes temporarily to avoid in-flight journal
+ // writes
+ m_blocked_writes = true;
+ Context *ctx = create_context_callback<
+ RefreshRequest<I>, &RefreshRequest<I>::handle_v2_block_writes>(this);
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ m_image_ctx.io_image_dispatcher->block_writes(ctx);
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_block_writes(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to block writes: " << cpp_strerror(*result)
+ << dendl;
+ save_result(result);
+ }
+ send_v2_apply();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_open_object_map() {
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0 ||
+ m_image_ctx.object_map != nullptr ||
+ (m_image_ctx.snap_name.empty() &&
+ (m_read_only ||
+ m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->is_lock_owner()))) {
+ send_v2_open_journal();
+ return;
+ }
+
+ // implies object map dynamically enabled or image open in-progress
+ // since SetSnapRequest loads the object map for a snapshot and
+ // ExclusiveLock loads the object map for HEAD
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ if (m_image_ctx.snap_name.empty()) {
+ m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP);
+ } else {
+ for (size_t snap_idx = 0; snap_idx < m_snap_infos.size(); ++snap_idx) {
+ if (m_snap_infos[snap_idx].name == m_image_ctx.snap_name) {
+ m_object_map = m_image_ctx.create_object_map(
+ m_snapc.snaps[snap_idx].val);
+ break;
+ }
+ }
+
+ if (m_object_map == nullptr) {
+ lderr(cct) << "failed to locate snapshot: " << m_image_ctx.snap_name
+ << dendl;
+ send_v2_open_journal();
+ return;
+ }
+ }
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_open_object_map>(this);
+ m_object_map->open(ctx);
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_open_object_map(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to open object map: " << cpp_strerror(*result)
+ << dendl;
+ m_object_map->put();
+ m_object_map = nullptr;
+
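+    // -EFBIG indicates the object map is too large to load; proceed
+    // without it rather than failing the refresh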
+ if (*result != -EFBIG) {
+ save_result(result);
+ }
+ }
+
+ send_v2_open_journal();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_v2_apply() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // ensure we are not in a rados callback when applying updates
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_apply>(this);
+ m_image_ctx.op_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_apply(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ apply();
+
+ return send_v2_finalize_refresh_parent();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::send_v2_finalize_refresh_parent() {
+ if (m_refresh_parent == nullptr) {
+ return send_v2_shut_down_exclusive_lock();
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_finalize_refresh_parent>(this);
+ m_refresh_parent->finalize(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_finalize_refresh_parent(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ ceph_assert(m_refresh_parent != nullptr);
+ delete m_refresh_parent;
+ m_refresh_parent = nullptr;
+
+ return send_v2_shut_down_exclusive_lock();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::send_v2_shut_down_exclusive_lock() {
+ if (m_exclusive_lock == nullptr) {
+ return send_v2_close_journal();
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // exclusive lock feature was dynamically disabled. in-flight IO will be
+ // flushed and in-flight requests will be canceled before releasing lock
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_shut_down_exclusive_lock>(this);
+ m_exclusive_lock->shut_down(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_shut_down_exclusive_lock(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to shut down exclusive lock: "
+ << cpp_strerror(*result) << dendl;
+ save_result(result);
+ }
+
+ {
+ std::unique_lock owner_locker{m_image_ctx.owner_lock};
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr);
+ }
+
+ ceph_assert(m_exclusive_lock != nullptr);
+ m_exclusive_lock->put();
+ m_exclusive_lock = nullptr;
+
+ return send_v2_close_journal();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::send_v2_close_journal() {
+ if (m_journal == nullptr) {
+ return send_v2_close_object_map();
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // journal feature was dynamically disabled
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_close_journal>(this);
+ m_journal->close(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_close_journal(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ save_result(result);
+ lderr(cct) << "failed to close journal: " << cpp_strerror(*result)
+ << dendl;
+ }
+
+ ceph_assert(m_journal != nullptr);
+ m_journal->put();
+ m_journal = nullptr;
+
+ ceph_assert(m_blocked_writes);
+ m_blocked_writes = false;
+
+ m_image_ctx.io_image_dispatcher->unblock_writes();
+ return send_v2_close_object_map();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::send_v2_close_object_map() {
+ if (m_object_map == nullptr) {
+ return send_flush_aio();
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ // object map was dynamically disabled
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_v2_close_object_map>(this);
+ m_object_map->close(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_close_object_map(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to close object map: " << cpp_strerror(*result)
+ << dendl;
+ }
+
+ ceph_assert(m_object_map != nullptr);
+
+ m_object_map->put();
+ m_object_map = nullptr;
+
+ return send_flush_aio();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::send_flush_aio() {
+ if (m_incomplete_update && m_error_result == 0) {
+ // if this was a partial refresh, notify ImageState
+ m_error_result = -ERESTART;
+ }
+
+ if (m_flush_aio) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ auto ctx = create_context_callback<
+ RefreshRequest<I>, &RefreshRequest<I>::handle_flush_aio>(this);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ ctx, util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ m_image_ctx, io::IMAGE_DISPATCH_LAYER_REFRESH, aio_comp,
+ io::FLUSH_SOURCE_REFRESH, {});
+ req->send();
+ return nullptr;
+ } else if (m_error_result < 0) {
+ // propagate saved error back to caller
+ Context *ctx = create_context_callback<
+ RefreshRequest<I>, &RefreshRequest<I>::handle_error>(this);
+ m_image_ctx.op_work_queue->queue(ctx, 0);
+ return nullptr;
+ }
+
+ return m_on_finish;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_flush_aio(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to flush pending AIO: " << cpp_strerror(*result)
+ << dendl;
+ }
+
+ return handle_error(result);
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_error(int *result) {
+ if (m_error_result < 0) {
+ *result = m_error_result;
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+ }
+ return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::apply() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ std::scoped_lock locker{m_image_ctx.owner_lock, m_image_ctx.image_lock};
+
+ m_image_ctx.read_only_flags = m_read_only_flags;
+ m_image_ctx.read_only = m_read_only;
+ m_image_ctx.size = m_size;
+ m_image_ctx.lockers = m_lockers;
+ m_image_ctx.lock_tag = m_lock_tag;
+ m_image_ctx.exclusive_locked = m_exclusive_locked;
+
+ std::map<uint64_t, uint64_t> migration_reverse_snap_seq;
+
+ if (m_image_ctx.old_format) {
+ m_image_ctx.order = m_order;
+ m_image_ctx.features = 0;
+ m_image_ctx.flags = 0;
+ m_image_ctx.op_features = 0;
+ m_image_ctx.operations_disabled = false;
+ m_image_ctx.object_prefix = std::move(m_object_prefix);
+ m_image_ctx.init_layout(m_image_ctx.md_ctx.get_id());
+ } else {
+    // the HEAD revision doesn't have a defined overlap, so the overlap
+    // is only applicable to snapshots
+ if (!m_head_parent_overlap) {
+ m_parent_md = {};
+ }
+
+ m_image_ctx.features = m_features;
+ m_image_ctx.flags = m_flags;
+ m_image_ctx.op_features = m_op_features;
+ m_image_ctx.operations_disabled = (
+ (m_op_features & ~RBD_OPERATION_FEATURES_ALL) != 0ULL);
+ m_image_ctx.group_spec = m_group_spec;
+
+ bool migration_info_valid;
+ int r = get_migration_info(&m_image_ctx.parent_md,
+ &m_image_ctx.migration_info,
+ &migration_info_valid);
+ ceph_assert(r == 0); // validated in refresh parent step
+
+ if (migration_info_valid) {
+ for (auto it : m_image_ctx.migration_info.snap_map) {
+ migration_reverse_snap_seq[it.second.front()] = it.first;
+ }
+ } else {
+ m_image_ctx.parent_md = m_parent_md;
+ m_image_ctx.migration_info = {};
+ }
+
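+    // sparse copyup can only be enabled once every OSD in the cluster is
+    // running Octopus or later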
+ librados::Rados rados(m_image_ctx.md_ctx);
+ int8_t require_osd_release;
+ r = rados.get_min_compatible_osd(&require_osd_release);
+ if (r == 0 && require_osd_release >= CEPH_RELEASE_OCTOPUS) {
+ m_image_ctx.enable_sparse_copyup = true;
+ }
+ }
+
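+  // detect newly created snapshots so that in-flight IO can be flushed
+  // against the updated snap context before the refresh completes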
+ for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+ std::vector<librados::snap_t>::const_iterator it = std::find(
+ m_image_ctx.snaps.begin(), m_image_ctx.snaps.end(),
+ m_snapc.snaps[i].val);
+ if (it == m_image_ctx.snaps.end()) {
+ m_flush_aio = true;
+ ldout(cct, 20) << "new snapshot id=" << m_snapc.snaps[i].val
+ << " name=" << m_snap_infos[i].name
+ << " size=" << m_snap_infos[i].image_size
+ << dendl;
+ }
+ }
+
+ m_image_ctx.snaps.clear();
+ m_image_ctx.snap_info.clear();
+ m_image_ctx.snap_ids.clear();
+ auto overlap = m_image_ctx.parent_md.overlap;
+ for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+ uint64_t flags = m_image_ctx.old_format ? 0 : m_snap_flags[i];
+ uint8_t protection_status = m_image_ctx.old_format ?
+ static_cast<uint8_t>(RBD_PROTECTION_STATUS_UNPROTECTED) :
+ m_snap_protection[i];
+ ParentImageInfo parent;
+ if (!m_image_ctx.old_format) {
+ if (!m_image_ctx.migration_info.empty()) {
+ parent = m_image_ctx.parent_md;
+ auto it = migration_reverse_snap_seq.find(m_snapc.snaps[i].val);
+ if (it != migration_reverse_snap_seq.end()) {
+ parent.spec.snap_id = it->second;
+ parent.overlap = m_snap_infos[i].image_size;
+ } else {
+ overlap = std::min(overlap, m_snap_infos[i].image_size);
+ parent.overlap = overlap;
+ }
+ } else {
+ parent = m_snap_parents[i];
+ }
+ }
+ m_image_ctx.add_snap(m_snap_infos[i].snapshot_namespace,
+ m_snap_infos[i].name, m_snapc.snaps[i].val,
+ m_snap_infos[i].image_size, parent,
+ protection_status, flags,
+ m_snap_infos[i].timestamp);
+ }
+ m_image_ctx.parent_md.overlap = std::min(overlap, m_image_ctx.size);
+ m_image_ctx.snapc = m_snapc;
+
+ if (m_image_ctx.snap_id != CEPH_NOSNAP &&
+ m_image_ctx.get_snap_id(m_image_ctx.snap_namespace,
+ m_image_ctx.snap_name) != m_image_ctx.snap_id) {
+ lderr(cct) << "tried to read from a snapshot that no longer exists: "
+ << m_image_ctx.snap_name << dendl;
+ m_image_ctx.snap_exists = false;
+ }
+
+ if (m_refresh_parent != nullptr) {
+ m_refresh_parent->apply();
+ }
+ if (m_image_ctx.data_ctx.is_valid()) {
+ m_image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(m_image_ctx.snapc.seq,
+ m_image_ctx.snaps);
+ m_image_ctx.rebuild_data_io_context();
+ }
+
+ // handle dynamically enabled / disabled features
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ !m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK,
+ m_image_ctx.image_lock)) {
+ // disabling exclusive lock will automatically handle closing
+ // object map and journaling
+ ceph_assert(m_exclusive_lock == nullptr);
+ m_exclusive_lock = m_image_ctx.exclusive_lock;
+ } else {
+ if (m_exclusive_lock != nullptr) {
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr);
+ std::swap(m_exclusive_lock, m_image_ctx.exclusive_lock);
+ }
+ if (!m_image_ctx.test_features(RBD_FEATURE_JOURNALING,
+ m_image_ctx.image_lock)) {
+ if (!m_image_ctx.clone_copy_on_read && m_image_ctx.journal != nullptr) {
+ m_image_ctx.exclusive_lock->unset_require_lock(io::DIRECTION_READ);
+ }
+ std::swap(m_journal, m_image_ctx.journal);
+ } else if (m_journal != nullptr) {
+ std::swap(m_journal, m_image_ctx.journal);
+ }
+ if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+ m_image_ctx.image_lock) ||
+ m_object_map != nullptr) {
+ std::swap(m_object_map, m_image_ctx.object_map);
+ }
+ }
+}
+
+template <typename I>
+int RefreshRequest<I>::get_parent_info(uint64_t snap_id,
+ ParentImageInfo *parent_md,
+ MigrationInfo *migration_info) {
+ bool migration_info_valid;
+ int r = get_migration_info(parent_md, migration_info, &migration_info_valid);
+ if (r < 0) {
+ return r;
+ }
+
+ if (migration_info_valid) {
+ return 0;
+ } else if (snap_id == CEPH_NOSNAP) {
+ *parent_md = m_parent_md;
+ *migration_info = {};
+ return 0;
+ } else {
+ for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+ if (m_snapc.snaps[i].val == snap_id) {
+ *parent_md = m_snap_parents[i];
+ *migration_info = {};
+ return 0;
+ }
+ }
+ }
+ return -ENOENT;
+}
+
+template <typename I>
+int RefreshRequest<I>::get_migration_info(ParentImageInfo *parent_md,
+ MigrationInfo *migration_info,
+ bool* migration_info_valid) {
+ CephContext *cct = m_image_ctx.cct;
+ if (m_migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST ||
+ (m_migration_spec.state != cls::rbd::MIGRATION_STATE_PREPARED &&
+ m_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTING &&
+ m_migration_spec.state != cls::rbd::MIGRATION_STATE_ABORTING)) {
+ if (m_migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_SRC &&
+ m_migration_spec.pool_id != -1 &&
+ m_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTED) {
+ lderr(cct) << this << " " << __func__ << ": invalid migration spec"
+ << dendl;
+ return -EINVAL;
+ }
+
+ *migration_info_valid = false;
+ return 0;
+ }
+
+ if (!m_migration_spec.source_spec.empty()) {
+ // use special pool id just to indicate a parent (migration source image)
+ // exists
+ parent_md->spec.pool_id = std::numeric_limits<int64_t>::max();
+ parent_md->spec.pool_namespace = "";
+ parent_md->spec.image_id = "";
+ } else {
+ parent_md->spec.pool_id = m_migration_spec.pool_id;
+ parent_md->spec.pool_namespace = m_migration_spec.pool_namespace;
+ parent_md->spec.image_id = m_migration_spec.image_id;
+ }
+ parent_md->spec.snap_id = CEPH_NOSNAP;
+ parent_md->overlap = std::min(m_size, m_migration_spec.overlap);
+
+ auto snap_seqs = m_migration_spec.snap_seqs;
+  // If new snapshots have been created on the destination image after
+  // the migration started, map the source CEPH_NOSNAP to the earliest of
+ // these snapshots.
+ snapid_t snap_id = snap_seqs.empty() ? 0 : snap_seqs.rbegin()->second;
+ auto it = std::upper_bound(m_snapc.snaps.rbegin(), m_snapc.snaps.rend(),
+ snap_id);
+ if (it != m_snapc.snaps.rend()) {
+ snap_seqs[CEPH_NOSNAP] = *it;
+ } else {
+ snap_seqs[CEPH_NOSNAP] = CEPH_NOSNAP;
+ }
+
+ std::set<uint64_t> snap_ids;
+ for (auto& it : snap_seqs) {
+ snap_ids.insert(it.second);
+ }
+ uint64_t overlap = snap_ids.find(CEPH_NOSNAP) != snap_ids.end() ?
+ parent_md->overlap : 0;
+ for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+ if (snap_ids.find(m_snapc.snaps[i].val) != snap_ids.end()) {
+ overlap = std::max(overlap, m_snap_infos[i].image_size);
+ }
+ }
+
+ *migration_info = {m_migration_spec.pool_id, m_migration_spec.pool_namespace,
+ m_migration_spec.image_name, m_migration_spec.image_id,
+ m_migration_spec.source_spec, {}, overlap,
+ m_migration_spec.flatten};
+ *migration_info_valid = true;
+
+ deep_copy::util::compute_snap_map(m_image_ctx.cct, 0, CEPH_NOSNAP, {},
+ snap_seqs, &migration_info->snap_map);
+ return 0;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RefreshRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RefreshRequest.h b/src/librbd/image/RefreshRequest.h
new file mode 100644
index 000000000..42f4b4669
--- /dev/null
+++ b/src/librbd/image/RefreshRequest.h
@@ -0,0 +1,275 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/utime.h"
+#include "common/snap_types.h"
+#include "cls/lock/cls_lock_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <string>
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template<typename> class RefreshParentRequest;
+
+template<typename ImageCtxT = ImageCtx>
+class RefreshRequest {
+public:
+ static constexpr int MAX_ENOENT_RETRIES = 10;
+
+ static RefreshRequest *create(ImageCtxT &image_ctx, bool acquiring_lock,
+ bool skip_open_parent, Context *on_finish) {
+ return new RefreshRequest(image_ctx, acquiring_lock, skip_open_parent,
+ on_finish);
+ }
+
+ RefreshRequest(ImageCtxT &image_ctx, bool acquiring_lock,
+ bool skip_open_parent, Context *on_finish);
+ ~RefreshRequest();
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> < * * * * * * * * * * * * * * * * * * * * * * * * * * (ENOENT)
+ * ^ | *
+ * * | (v1) *
+ * * |-----> V1_READ_HEADER -------------> GET_MIGRATION_HEADER (skip if not
+ * * | | migrating)
+ * * | (v2) v
+ * * \-----> V2_GET_MUTABLE_METADATA V1_GET_SNAPSHOTS
+ * * * | |
+ * * * | -EOPNOTSUPP v
+ * * * | * * * V1_GET_LOCKS
+ * * * | * * |
+ * * * v v * v
+ * * * V2_GET_PARENT <apply>
+ * * * | |
+ * * v |
+ * * * * * * GET_MIGRATION_HEADER (skip if not |
+ * (ENOENT) | migrating) |
+ * v |
+ * * V2_GET_METADATA |
+ * * | |
+ * * v |
+ * * V2_GET_POOL_METADATA |
+ * * | |
+ * * v (skip if not enabled) |
+ * * V2_GET_OP_FEATURES |
+ * * | |
+ * * v |
+ * * V2_GET_GROUP |
+ * * | |
+ * * | -EOPNOTSUPP |
+ * * | * * * |
+ * * | * * |
+ * * v v * |
+ * * * V2_GET_SNAPSHOTS (skip if no snaps) |
+ * (ENOENT) | |
+ * * v |
+ * * * V2_REFRESH_PARENT (skip if no parent or |
+ * (ENOENT) | refresh not needed) |
+ * v |
+ * V2_INIT_EXCLUSIVE_LOCK (skip if lock |
+ * | active or disabled) |
+ * v |
+ * V2_OPEN_OBJECT_MAP (skip if map |
+ * | active or disabled) |
+ * v |
+ * V2_OPEN_JOURNAL (skip if journal |
+ * | active or disabled) |
+ * v |
+ * V2_BLOCK_WRITES (skip if journal not |
+ * | disabled) |
+ * v |
+ * <apply> |
+ * | |
+ * v |
+ * V2_FINALIZE_REFRESH_PARENT (skip if refresh |
+ * | not needed) |
+ * (error) v |
+ * * * * * > V2_SHUT_DOWN_EXCLUSIVE_LOCK (skip if lock |
+ * | active or enabled) |
+ * v |
+ * V2_CLOSE_JOURNAL (skip if journal inactive |
+ * | or enabled) |
+ * v |
+ * V2_CLOSE_OBJECT_MAP (skip if map inactive |
+ * | or enabled) |
+ * | |
+ * \-------------------\/--------------------/
+ * |
+ * v
+ * FLUSH (skip if no new
+ * | snapshots)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ enum LegacySnapshot {
+ LEGACY_SNAPSHOT_DISABLED,
+ LEGACY_SNAPSHOT_ENABLED,
+ LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP
+ };
+
+ ImageCtxT &m_image_ctx;
+ bool m_acquiring_lock;
+ bool m_skip_open_parent_image;
+ Context *m_on_finish;
+
+ cls::rbd::MigrationSpec m_migration_spec;
+ int m_error_result;
+ bool m_flush_aio;
+ decltype(m_image_ctx.exclusive_lock) m_exclusive_lock;
+ decltype(m_image_ctx.object_map) m_object_map;
+ decltype(m_image_ctx.journal) m_journal;
+ RefreshParentRequest<ImageCtxT> *m_refresh_parent;
+
+ bufferlist m_out_bl;
+
+ bool m_legacy_parent = false;
+ LegacySnapshot m_legacy_snapshot = LEGACY_SNAPSHOT_DISABLED;
+
+ int m_enoent_retries = 0;
+
+ uint8_t m_order = 0;
+ uint64_t m_size = 0;
+ uint64_t m_features = 0;
+ uint64_t m_incompatible_features = 0;
+ uint64_t m_flags = 0;
+ uint64_t m_op_features = 0;
+ uint32_t m_read_only_flags = 0U;
+ bool m_read_only = false;
+
+ librados::IoCtx m_pool_metadata_io_ctx;
+ std::map<std::string, bufferlist> m_metadata;
+
+ std::string m_object_prefix;
+ ParentImageInfo m_parent_md;
+ bool m_head_parent_overlap = false;
+ cls::rbd::GroupSpec m_group_spec;
+
+ ::SnapContext m_snapc;
+ std::vector<cls::rbd::SnapshotInfo> m_snap_infos;
+ std::vector<ParentImageInfo> m_snap_parents;
+ std::vector<uint8_t> m_snap_protection;
+ std::vector<uint64_t> m_snap_flags;
+
+ std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> m_lockers;
+ std::string m_lock_tag;
+ bool m_exclusive_locked = false;
+
+ bool m_blocked_writes = false;
+ bool m_incomplete_update = false;
+
+ void send_get_migration_header();
+ Context *handle_get_migration_header(int *result);
+
+ void send_v1_read_header();
+ Context *handle_v1_read_header(int *result);
+
+ void send_v1_get_snapshots();
+ Context *handle_v1_get_snapshots(int *result);
+
+ void send_v1_get_locks();
+ Context *handle_v1_get_locks(int *result);
+
+ void send_v1_apply();
+ Context *handle_v1_apply(int *result);
+
+ void send_v2_get_mutable_metadata();
+ Context *handle_v2_get_mutable_metadata(int *result);
+
+ void send_v2_get_parent();
+ Context *handle_v2_get_parent(int *result);
+
+ void send_v2_get_metadata();
+ Context *handle_v2_get_metadata(int *result);
+
+ void send_v2_get_pool_metadata();
+ Context *handle_v2_get_pool_metadata(int *result);
+
+ void send_v2_get_op_features();
+ Context *handle_v2_get_op_features(int *result);
+
+ void send_v2_get_group();
+ Context *handle_v2_get_group(int *result);
+
+ void send_v2_get_snapshots();
+ Context *handle_v2_get_snapshots(int *result);
+
+ void send_v2_get_snapshots_legacy();
+ Context *handle_v2_get_snapshots_legacy(int *result);
+
+ void send_v2_refresh_parent();
+ Context *handle_v2_refresh_parent(int *result);
+
+ void send_v2_init_exclusive_lock();
+ Context *handle_v2_init_exclusive_lock(int *result);
+
+ void send_v2_open_journal();
+ Context *handle_v2_open_journal(int *result);
+
+ void send_v2_block_writes();
+ Context *handle_v2_block_writes(int *result);
+
+ void send_v2_open_object_map();
+ Context *handle_v2_open_object_map(int *result);
+
+ void send_v2_apply();
+ Context *handle_v2_apply(int *result);
+
+ Context *send_v2_finalize_refresh_parent();
+ Context *handle_v2_finalize_refresh_parent(int *result);
+
+ Context *send_v2_shut_down_exclusive_lock();
+ Context *handle_v2_shut_down_exclusive_lock(int *result);
+
+ Context *send_v2_close_journal();
+ Context *handle_v2_close_journal(int *result);
+
+ Context *send_v2_close_object_map();
+ Context *handle_v2_close_object_map(int *result);
+
+ Context *send_flush_aio();
+ Context *handle_flush_aio(int *result);
+
+ Context *handle_error(int *result);
+
+ void save_result(int *result) {
+ if (m_error_result == 0 && *result < 0) {
+ m_error_result = *result;
+ }
+ }
+
+ void apply();
+ int get_parent_info(uint64_t snap_id, ParentImageInfo *parent_md,
+ MigrationInfo *migration_info);
+ int get_migration_info(ParentImageInfo *parent_md,
+ MigrationInfo *migration_info,
+ bool* migration_info_valid);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::RefreshRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
diff --git a/src/librbd/image/RemoveRequest.cc b/src/librbd/image/RemoveRequest.cc
new file mode 100644
index 000000000..42af593b1
--- /dev/null
+++ b/src/librbd/image/RemoveRequest.cc
@@ -0,0 +1,617 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/RemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/internal.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/image/DetachChildRequest.h"
+#include "librbd/image/PreRemoveRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/journal/TypeTraits.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/operation/TrimRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::RemoveRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+using librados::IoCtx;
+using util::create_context_callback;
+using util::create_async_context_callback;
+using util::create_rados_callback;
+
+template<typename I>
+RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, const std::string &image_name,
+ const std::string &image_id, bool force,
+ bool from_trash_remove,
+ ProgressContext &prog_ctx,
+ ContextWQ *op_work_queue, Context *on_finish)
+ : m_ioctx(ioctx), m_image_name(image_name), m_image_id(image_id),
+ m_force(force), m_from_trash_remove(from_trash_remove),
+ m_prog_ctx(prog_ctx), m_op_work_queue(op_work_queue),
+ m_on_finish(on_finish) {
+ m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+}
+
+template<typename I>
+RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, I *image_ctx, bool force,
+ bool from_trash_remove,
+ ProgressContext &prog_ctx,
+ ContextWQ *op_work_queue, Context *on_finish)
+ : m_ioctx(ioctx), m_image_name(image_ctx->name), m_image_id(image_ctx->id),
+ m_image_ctx(image_ctx), m_force(force),
+ m_from_trash_remove(from_trash_remove), m_prog_ctx(prog_ctx),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish),
+ m_cct(image_ctx->cct), m_header_oid(image_ctx->header_oid),
+ m_old_format(image_ctx->old_format), m_unknown_format(false) {
+}
+
+template<typename I>
+void RemoveRequest<I>::send() {
+ ldout(m_cct, 20) << dendl;
+
+ open_image();
+}
+
+template<typename I>
+void RemoveRequest<I>::open_image() {
+ if (m_image_ctx != nullptr) {
+ pre_remove_image();
+ return;
+ }
+
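+  // open by image id when it's known; only fall back to the image name
+  // otherwise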
+ m_image_ctx = I::create(m_image_id.empty() ? m_image_name : "", m_image_id,
+ nullptr, m_ioctx, false);
+
+ ldout(m_cct, 20) << dendl;
+
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_open_image>(
+ this);
+
+ m_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx);
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_open_image(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_image_ctx = nullptr;
+
+ if (r != -ENOENT) {
+ lderr(m_cct) << "error opening image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ remove_image();
+ return;
+ }
+
+ m_image_id = m_image_ctx->id;
+ m_image_name = m_image_ctx->name;
+ m_header_oid = m_image_ctx->header_oid;
+ m_old_format = m_image_ctx->old_format;
+ m_unknown_format = false;
+
+ pre_remove_image();
+}
+
+template<typename I>
+void RemoveRequest<I>::pre_remove_image() {
+ ldout(m_cct, 5) << dendl;
+
+ auto ctx = create_context_callback<
+ RemoveRequest<I>, &RemoveRequest<I>::handle_pre_remove_image>(this);
+ auto req = PreRemoveRequest<I>::create(m_image_ctx, m_force, ctx);
+ req->send();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_pre_remove_image(int r) {
+ ldout(m_cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
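+    // surface the internal -ECHILD result to callers as -ENOTEMPTY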
+ if (r == -ECHILD) {
+ r = -ENOTEMPTY;
+ }
+ send_close_image(r);
+ return;
+ }
+
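+  // nothing to trim if the data pool is unavailable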
+ if (!m_image_ctx->data_ctx.is_valid()) {
+ detach_child();
+ return;
+ }
+
+ trim_image();
+}
+
+template<typename I>
+void RemoveRequest<I>::trim_image() {
+ ldout(m_cct, 20) << dendl;
+
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ klass, &klass::handle_trim_image>(this));
+
+ std::shared_lock owner_lock{m_image_ctx->owner_lock};
+ auto req = librbd::operation::TrimRequest<I>::create(
+ *m_image_ctx, ctx, m_image_ctx->size, 0, m_prog_ctx);
+ req->send();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_trim_image(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove some object(s): "
+ << cpp_strerror(r) << dendl;
+ send_close_image(r);
+ return;
+ }
+
+ if (m_old_format) {
+ send_close_image(r);
+ return;
+ }
+
+ detach_child();
+}
+
+template<typename I>
+void RemoveRequest<I>::detach_child() {
+ ldout(m_cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ RemoveRequest<I>, &RemoveRequest<I>::handle_detach_child>(this);
+ auto req = DetachChildRequest<I>::create(*m_image_ctx, ctx);
+ req->send();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_detach_child(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to detach child from parent: "
+ << cpp_strerror(r) << dendl;
+ send_close_image(r);
+ return;
+ }
+
+ send_disable_mirror();
+}
+
+template<typename I>
+void RemoveRequest<I>::send_disable_mirror() {
+ ldout(m_cct, 20) << dendl;
+
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_disable_mirror>(this);
+
+ mirror::DisableRequest<I> *req =
+ mirror::DisableRequest<I>::create(m_image_ctx, m_force, !m_force, ctx);
+ req->send();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_disable_mirror(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ r = 0;
+ } else if (r < 0) {
+ lderr(m_cct) << "error disabling image mirroring: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ // one last chance to ensure all snapshots have been deleted
+ m_image_ctx->image_lock.lock_shared();
+ if (!m_image_ctx->snap_info.empty()) {
+ ldout(m_cct, 5) << "image has snapshots - not removing" << dendl;
+ m_ret_val = -ENOTEMPTY;
+ }
+ m_image_ctx->image_lock.unlock_shared();
+
+ send_close_image(r);
+}
+
+template<typename I>
+void RemoveRequest<I>::send_close_image(int r) {
+ ldout(m_cct, 20) << dendl;
+
+ m_ret_val = r;
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_send_close_image>(this);
+
+ m_image_ctx->state->close(ctx);
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_send_close_image(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "error encountered while closing image: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ m_image_ctx = nullptr;
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ finish(r);
+ return;
+ }
+
+ remove_header();
+}
+
+template<typename I>
+void RemoveRequest<I>::remove_header() {
+ ldout(m_cct, 20) << dendl;
+
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_remove_header>(this);
+ int r = m_ioctx.aio_remove(m_header_oid, rados_completion);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_remove_header(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error removing header: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ }
+
+ remove_image();
+}
+
+template<typename I>
+void RemoveRequest<I>::remove_header_v2() {
+ ldout(m_cct, 20) << dendl;
+
+ if (m_header_oid.empty()) {
+ m_header_oid = util::header_name(m_image_id);
+ }
+
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_remove_header_v2>(this);
+ int r = m_ioctx.aio_remove(m_header_oid, rados_completion);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_remove_header_v2(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error removing header: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_journal_remove();
+}
+
+template<typename I>
+void RemoveRequest<I>::send_journal_remove() {
+ ldout(m_cct, 20) << dendl;
+
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_journal_remove>(this);
+
+ typename journal::TypeTraits<I>::ContextWQ* context_wq;
+ Journal<I>::get_work_queue(m_cct, &context_wq);
+
+ journal::RemoveRequest<I> *req = journal::RemoveRequest<I>::create(
+ m_ioctx, m_image_id, Journal<>::IMAGE_CLIENT_ID, context_wq, ctx);
+ req->send();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_journal_remove(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to remove image journal: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ } else {
+ r = 0;
+ }
+
+ send_object_map_remove();
+}
+
+template<typename I>
+void RemoveRequest<I>::send_object_map_remove() {
+ ldout(m_cct, 20) << dendl;
+
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_object_map_remove>(this);
+
+ int r = ObjectMap<>::aio_remove(m_ioctx,
+ m_image_id,
+ rados_completion);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_object_map_remove(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to remove image journal: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ } else {
+ r = 0;
+ }
+
+ mirror_image_remove();
+}
+
+template<typename I>
+void RemoveRequest<I>::mirror_image_remove() {
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::mirror_image_remove(&op, m_image_id);
+
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_mirror_image_remove>(this);
+ int r = m_ioctx.aio_operate(RBD_MIRRORING, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_mirror_image_remove(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) {
+ lderr(m_cct) << "failed to remove mirror image state: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_from_trash_remove) {
+ // both the id object and the directory entry have been removed in
+ // a previous call to trash_move.
+ finish(0);
+ return;
+ }
+
+ remove_id_object();
+}
+
+template<typename I>
+void RemoveRequest<I>::remove_image() {
+ ldout(m_cct, 20) << dendl;
+
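+  // when the format is unknown, try the v1 path first; it falls through
+  // to the v2 path if the image isn't listed in the v1 directory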
+ if (m_old_format || m_unknown_format) {
+ remove_v1_image();
+ } else {
+ remove_v2_image();
+ }
+}
+
+template<typename I>
+void RemoveRequest<I>::remove_v1_image() {
+ ldout(m_cct, 20) << dendl;
+
+ Context *ctx = new LambdaContext([this] (int r) {
+ r = tmap_rm(m_ioctx, m_image_name);
+ handle_remove_v1_image(r);
+ });
+
+ m_op_work_queue->queue(ctx, 0);
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_remove_v1_image(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
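+  // a successful tmap removal confirms (and finishes) a v1 removal; an
+  // error is propagated if the format was already known, otherwise fall
+  // through to the v2 removal path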
+ m_old_format = (r == 0);
+ if (r == 0 || (r < 0 && !m_unknown_format)) {
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error removing image from v1 directory: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+ return;
+ }
+
+ if (!m_old_format) {
+ remove_v2_image();
+ }
+}
+
+template<typename I>
+void RemoveRequest<I>::remove_v2_image() {
+ ldout(m_cct, 20) << dendl;
+
+ if (m_image_id.empty()) {
+ dir_get_image_id();
+ return;
+ } else if (m_image_name.empty()) {
+ dir_get_image_name();
+ return;
+ }
+
+ remove_header_v2();
+ return;
+}
+
+template<typename I>
+void RemoveRequest<I>::dir_get_image_id() {
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::dir_get_id_start(&op, m_image_name);
+
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_dir_get_image_id>(this);
+ m_out_bl.clear();
+ int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_dir_get_image_id(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error fetching image id: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::dir_get_id_finish(&iter, &m_image_id);
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+ }
+
+ remove_header_v2();
+}
+
+template<typename I>
+void RemoveRequest<I>::dir_get_image_name() {
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::dir_get_name_start(&op, m_image_id);
+
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_dir_get_image_name>(this);
+ m_out_bl.clear();
+ int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_dir_get_image_name(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error fetching image name: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::dir_get_name_finish(&iter, &m_image_name);
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+ }
+
+ remove_header_v2();
+}
+
+template<typename I>
+void RemoveRequest<I>::remove_id_object() {
+ ldout(m_cct, 20) << dendl;
+
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_remove_id_object>(this);
+ int r = m_ioctx.aio_remove(util::id_obj_name(m_image_name), rados_completion);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_remove_id_object(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error removing id object: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ dir_remove_image();
+}
+
+template<typename I>
+void RemoveRequest<I>::dir_remove_image() {
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::dir_remove_image(&op, m_image_name, m_image_id);
+
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_dir_remove_image>(this);
+ int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template<typename I>
+void RemoveRequest<I>::handle_dir_remove_image(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error removing image from v2 directory: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ finish(r);
+}
+
+template<typename I>
+void RemoveRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RemoveRequest.h b/src/librbd/image/RemoveRequest.h
new file mode 100644
index 000000000..b03f8fc7c
--- /dev/null
+++ b/src/librbd/image/RemoveRequest.h
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/image/TypeTraits.h"
+#include "common/Timer.h"
+
+#include <list>
+
+class Context;
+
+namespace librbd {
+
+class ProgressContext;
+
+namespace image {
+
+template<typename ImageCtxT = ImageCtx>
+class RemoveRequest {
+private:
+ // mock unit testing support
+ typedef ::librbd::image::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::ContextWQ ContextWQ;
+public:
+ static RemoveRequest *create(librados::IoCtx &ioctx,
+ const std::string &image_name,
+ const std::string &image_id,
+ bool force, bool from_trash_remove,
+ ProgressContext &prog_ctx,
+ ContextWQ *op_work_queue,
+ Context *on_finish) {
+ return new RemoveRequest(ioctx, image_name, image_id, force,
+ from_trash_remove, prog_ctx, op_work_queue,
+ on_finish);
+ }
+
+ static RemoveRequest *create(librados::IoCtx &ioctx, ImageCtxT *image_ctx,
+ bool force, bool from_trash_remove,
+ ProgressContext &prog_ctx,
+ ContextWQ *op_work_queue,
+ Context *on_finish) {
+ return new RemoveRequest(ioctx, image_ctx, force, from_trash_remove,
+ prog_ctx, op_work_queue, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * (skip if already opened) OPEN IMAGE------------------\
+ * | |
+ * v |
+ * PRE REMOVE IMAGE * * * |
+ * | * |
+ * v * |
+ * (skip if invalid data pool) TRIM IMAGE * * * * * |
+ * | * |
+ * v * |
+ * DETACH CHILD * |
+ * | * |
+ * v * v
+ * CLOSE IMAGE < * * * * |
+ * | |
+ * error v |
+ * /------<--------\ REMOVE HEADER<--------------/
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * | | REMOVE JOURNAL
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * v ^ REMOVE OBJECTMAP
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * | | REMOVE MIRROR IMAGE
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * | | REMOVE ID OBJECT
+ * | | / |
+ * | |-------<-------/ |
+ * | | v
+ * | | REMOVE IMAGE
+ * | | / |
+ * | \-------<-------/ |
+ * | v
+ * \------------------>------------<finish>
+ *
+ * @endverbatim
+ */
+
+ RemoveRequest(librados::IoCtx &ioctx, const std::string &image_name,
+ const std::string &image_id, bool force, bool from_trash_remove,
+ ProgressContext &prog_ctx, ContextWQ *op_work_queue,
+ Context *on_finish);
+
+ RemoveRequest(librados::IoCtx &ioctx, ImageCtxT *image_ctx, bool force,
+ bool from_trash_remove, ProgressContext &prog_ctx,
+ ContextWQ *op_work_queue, Context *on_finish);
+
+ librados::IoCtx &m_ioctx;
+ std::string m_image_name;
+ std::string m_image_id;
+ ImageCtxT *m_image_ctx = nullptr;
+ bool m_force;
+ bool m_from_trash_remove;
+ ProgressContext &m_prog_ctx;
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ std::string m_header_oid;
+ bool m_old_format = false;
+ bool m_unknown_format = true;
+
+ librados::IoCtx m_parent_io_ctx;
+
+ decltype(m_image_ctx->exclusive_lock) m_exclusive_lock = nullptr;
+
+ int m_ret_val = 0;
+ bufferlist m_out_bl;
+ std::list<obj_watch_t> m_watchers;
+
+ std::map<uint64_t, SnapInfo> m_snap_infos;
+
+ void open_image();
+ void handle_open_image(int r);
+
+ void send_journal_remove();
+ void handle_journal_remove(int r);
+
+ void send_object_map_remove();
+ void handle_object_map_remove(int r);
+
+ void mirror_image_remove();
+ void handle_mirror_image_remove(int r);
+
+ void pre_remove_image();
+ void handle_pre_remove_image(int r);
+
+ void trim_image();
+ void handle_trim_image(int r);
+
+ void detach_child();
+ void handle_detach_child(int r);
+
+ void send_disable_mirror();
+ void handle_disable_mirror(int r);
+
+ void send_close_image(int r);
+ void handle_send_close_image(int r);
+
+ void remove_header();
+ void handle_remove_header(int r);
+
+ void remove_header_v2();
+ void handle_remove_header_v2(int r);
+
+ void remove_image();
+
+ void remove_v1_image();
+ void handle_remove_v1_image(int r);
+
+ void remove_v2_image();
+
+ void dir_get_image_id();
+ void handle_dir_get_image_id(int r);
+
+ void dir_get_image_name();
+ void handle_dir_get_image_name(int r);
+
+ void remove_id_object();
+ void handle_remove_id_object(int r);
+
+ void dir_remove_image();
+ void handle_dir_remove_image(int r);
+
+ void finish(int r);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H
diff --git a/src/librbd/image/SetFlagsRequest.cc b/src/librbd/image/SetFlagsRequest.cc
new file mode 100644
index 000000000..fa00ed981
--- /dev/null
+++ b/src/librbd/image/SetFlagsRequest.cc
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/SetFlagsRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::SetFlagsRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+SetFlagsRequest<I>::SetFlagsRequest(I *image_ctx, uint64_t flags,
+ uint64_t mask, Context *on_finish)
+ : m_image_ctx(image_ctx), m_flags(flags), m_mask(mask),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void SetFlagsRequest<I>::send() {
+ send_set_flags();
+}
+
+template <typename I>
+void SetFlagsRequest<I>::send_set_flags() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ std::vector<uint64_t> snap_ids;
+ snap_ids.push_back(CEPH_NOSNAP);
+ for (auto it : m_image_ctx->snap_info) {
+ snap_ids.push_back(it.first);
+ }
+
+ Context *ctx = create_context_callback<
+ SetFlagsRequest<I>, &SetFlagsRequest<I>::handle_set_flags>(this);
+ C_Gather *gather_ctx = new C_Gather(cct, ctx);
+
+ for (auto snap_id : snap_ids) {
+ librados::ObjectWriteOperation op;
+ cls_client::set_flags(&op, snap_id, m_flags, m_mask);
+
+ librados::AioCompletion *comp =
+ create_rados_callback(gather_ctx->new_sub());
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+ gather_ctx->activate();
+}
+
+template <typename I>
+Context *SetFlagsRequest<I>::handle_set_flags(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "set_flags failed: " << cpp_strerror(*result)
+ << dendl;
+ }
+ return m_on_finish;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::SetFlagsRequest<librbd::ImageCtx>;
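
send_set_flags() above fans a single logical update out to every snapshot with C_Gather: each aio_operate() receives a sub-completion from new_sub(), and the gathered callback fires once after activate() and all subs have completed, carrying the first non-zero result. A reduced sketch of the same pattern (C_Gather comes from include/Context.h; completing the subs inline is a stand-in for the real async callbacks):

    // Minimal C_Gather fan-out: on_finish runs once after all subs complete.
    #include "include/Context.h"

    void fan_out(CephContext* cct, Context* on_finish, size_t num_ops) {
      C_Gather* gather = new C_Gather(cct, on_finish);
      for (size_t i = 0; i < num_ops; ++i) {
        Context* sub = gather->new_sub();  // one completion per async op
        sub->complete(0);                  // stand-in for the real callback
      }
      gather->activate();                  // arm: fire once all subs are done
    }
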
diff --git a/src/librbd/image/SetFlagsRequest.h b/src/librbd/image/SetFlagsRequest.h
new file mode 100644
index 000000000..be67e176a
--- /dev/null
+++ b/src/librbd/image/SetFlagsRequest.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H
+
+#include "include/buffer.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class SetFlagsRequest {
+public:
+ static SetFlagsRequest *create(ImageCtxT *image_ctx, uint64_t flags,
+ uint64_t mask, Context *on_finish) {
+ return new SetFlagsRequest(image_ctx, flags, mask, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . .
+ * v v .
+ * SET_FLAGS . (for every snapshot)
+ * | . .
+ * v . . .
+   * <finish>
+ *
+ * @endverbatim
+ */
+
+ SetFlagsRequest(ImageCtxT *image_ctx, uint64_t flags, uint64_t mask,
+ Context *on_finish);
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_flags;
+ uint64_t m_mask;
+ Context *m_on_finish;
+
+ void send_set_flags();
+ Context *handle_set_flags(int *result);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::SetFlagsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H
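
The flags/mask pair behaves like a read-modify-write: only the bits set in mask are changed, and they take their new value from the corresponding bits of flags. For example (RBD_FLAG_OBJECT_MAP_INVALID is one such flag bit, from include/rbd/librbd.h; shown purely for illustration):

    // Illustrative: set one flag bit without disturbing the others.
    uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID;  // new value for masked bits
    uint64_t mask  = RBD_FLAG_OBJECT_MAP_INVALID;  // which bits to modify
    // SetFlagsRequest<>::create(image_ctx, flags, mask, on_finish)->send();
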
diff --git a/src/librbd/image/SetSnapRequest.cc b/src/librbd/image/SetSnapRequest.cc
new file mode 100644
index 000000000..fbc234aef
--- /dev/null
+++ b/src/librbd/image/SetSnapRequest.cc
@@ -0,0 +1,368 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/SetSnapRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RefreshParentRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::SetSnapRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+
+template <typename I>
+SetSnapRequest<I>::SetSnapRequest(I &image_ctx, uint64_t snap_id,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_snap_id(snap_id), m_on_finish(on_finish),
+ m_exclusive_lock(nullptr), m_object_map(nullptr), m_refresh_parent(nullptr),
+ m_writes_blocked(false) {
+}
+
+template <typename I>
+SetSnapRequest<I>::~SetSnapRequest() {
+ ceph_assert(!m_writes_blocked);
+ delete m_refresh_parent;
+ if (m_object_map) {
+ m_object_map->put();
+ }
+ if (m_exclusive_lock) {
+ m_exclusive_lock->put();
+ }
+}
+
+template <typename I>
+void SetSnapRequest<I>::send() {
+ if (m_snap_id == CEPH_NOSNAP) {
+ send_init_exclusive_lock();
+ } else {
+ send_block_writes();
+ }
+}
+
+template <typename I>
+void SetSnapRequest<I>::send_init_exclusive_lock() {
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ if (m_image_ctx.exclusive_lock != nullptr) {
+ ceph_assert(m_image_ctx.snap_id == CEPH_NOSNAP);
+ send_complete();
+ return;
+ }
+ }
+
+ if (m_image_ctx.read_only ||
+ !m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+ int r = 0;
+ if (send_refresh_parent(&r) != nullptr) {
+ send_complete();
+ }
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << dendl;
+
+ m_exclusive_lock = ExclusiveLock<I>::create(m_image_ctx);
+
+ using klass = SetSnapRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_init_exclusive_lock>(this);
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ m_exclusive_lock->init(m_image_ctx.features, ctx);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_init_exclusive_lock(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to initialize exclusive lock: "
+ << cpp_strerror(*result) << dendl;
+ finalize();
+ return m_on_finish;
+ }
+ return send_refresh_parent(result);
+}
+
+template <typename I>
+void SetSnapRequest<I>::send_block_writes() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << dendl;
+
+ m_writes_blocked = true;
+
+ using klass = SetSnapRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_block_writes>(this);
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ m_image_ctx.io_image_dispatcher->block_writes(ctx);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_block_writes(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to block writes: " << cpp_strerror(*result)
+ << dendl;
+ finalize();
+ return m_on_finish;
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ auto it = m_image_ctx.snap_info.find(m_snap_id);
+ if (it == m_image_ctx.snap_info.end()) {
+ ldout(cct, 5) << "failed to locate snapshot '" << m_snap_id << "'"
+ << dendl;
+
+ *result = -ENOENT;
+ finalize();
+ return m_on_finish;
+ }
+ }
+
+ return send_shut_down_exclusive_lock(result);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::send_shut_down_exclusive_lock(int *result) {
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ m_exclusive_lock = m_image_ctx.exclusive_lock;
+ }
+
+ if (m_exclusive_lock == nullptr) {
+ return send_refresh_parent(result);
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << dendl;
+
+ using klass = SetSnapRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_shut_down_exclusive_lock>(this);
+ m_exclusive_lock->shut_down(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_shut_down_exclusive_lock(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to shut down exclusive lock: "
+ << cpp_strerror(*result) << dendl;
+ finalize();
+ return m_on_finish;
+ }
+
+ return send_refresh_parent(result);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::send_refresh_parent(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ParentImageInfo parent_md;
+ bool refresh_parent;
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+
+ const auto parent_info = m_image_ctx.get_parent_info(m_snap_id);
+ if (parent_info == nullptr) {
+ *result = -ENOENT;
+ lderr(cct) << "failed to retrieve snapshot parent info" << dendl;
+ finalize();
+ return m_on_finish;
+ }
+
+ parent_md = *parent_info;
+ refresh_parent = RefreshParentRequest<I>::is_refresh_required(
+ m_image_ctx, parent_md, m_image_ctx.migration_info);
+ }
+
+ if (!refresh_parent) {
+ if (m_snap_id == CEPH_NOSNAP) {
+ // object map is loaded when exclusive lock is acquired
+ *result = apply();
+ finalize();
+ return m_on_finish;
+ } else {
+ // load snapshot object map
+ return send_open_object_map(result);
+ }
+ }
+
+ ldout(cct, 10) << __func__ << dendl;
+
+ using klass = SetSnapRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_refresh_parent>(this);
+ m_refresh_parent = RefreshParentRequest<I>::create(m_image_ctx, parent_md,
+ m_image_ctx.migration_info,
+ ctx);
+ m_refresh_parent->send();
+ return nullptr;
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_refresh_parent(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to refresh snapshot parent: " << cpp_strerror(*result)
+ << dendl;
+ finalize();
+ return m_on_finish;
+ }
+
+ if (m_snap_id == CEPH_NOSNAP) {
+ // object map is loaded when exclusive lock is acquired
+ *result = apply();
+ if (*result < 0) {
+ finalize();
+ return m_on_finish;
+ }
+
+ return send_finalize_refresh_parent(result);
+ } else {
+ // load snapshot object map
+ return send_open_object_map(result);
+ }
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::send_open_object_map(int *result) {
+ if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+ *result = apply();
+ if (*result < 0) {
+ finalize();
+ return m_on_finish;
+ }
+
+ return send_finalize_refresh_parent(result);
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << dendl;
+
+ using klass = SetSnapRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_open_object_map>(this);
+ m_object_map = ObjectMap<I>::create(m_image_ctx, m_snap_id);
+ m_object_map->open(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_open_object_map(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to open object map: " << cpp_strerror(*result)
+ << dendl;
+ m_object_map->put();
+ m_object_map = nullptr;
+ }
+
+ *result = apply();
+ if (*result < 0) {
+ finalize();
+ return m_on_finish;
+ }
+
+ return send_finalize_refresh_parent(result);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::send_finalize_refresh_parent(int *result) {
+ if (m_refresh_parent == nullptr) {
+ finalize();
+ return m_on_finish;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ using klass = SetSnapRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_finalize_refresh_parent>(this);
+ m_refresh_parent->finalize(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_finalize_refresh_parent(int *result) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to close parent image: " << cpp_strerror(*result)
+ << dendl;
+ }
+ finalize();
+ return m_on_finish;
+}
+
+template <typename I>
+int SetSnapRequest<I>::apply() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << __func__ << dendl;
+
+ std::scoped_lock locker{m_image_ctx.owner_lock, m_image_ctx.image_lock};
+ if (m_snap_id != CEPH_NOSNAP) {
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr);
+ int r = m_image_ctx.snap_set(m_snap_id);
+ if (r < 0) {
+ return r;
+ }
+ } else {
+ std::swap(m_image_ctx.exclusive_lock, m_exclusive_lock);
+ m_image_ctx.snap_unset();
+ }
+
+ if (m_refresh_parent != nullptr) {
+ m_refresh_parent->apply();
+ }
+
+ std::swap(m_object_map, m_image_ctx.object_map);
+ return 0;
+}
+
+template <typename I>
+void SetSnapRequest<I>::finalize() {
+ if (m_writes_blocked) {
+ m_image_ctx.io_image_dispatcher->unblock_writes();
+ m_writes_blocked = false;
+ }
+}
+
+template <typename I>
+void SetSnapRequest<I>::send_complete() {
+ finalize();
+ m_on_finish->complete(0);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::SetSnapRequest<librbd::ImageCtx>;
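
Note the calling convention: unlike RemoveRequest's void handlers, SetSnapRequest's handlers return a Context*. Returning nullptr signals that another asynchronous step was issued, while returning m_on_finish hands the user callback back to the generated wrapper, which completes it with *result and tears the request down. A schematic of that contract, using a hypothetical Example class:

    // Schematic only: the Context*-returning handler contract.
    Context* Example::handle_step_a(int* result) {
      if (*result < 0) {
        return m_on_finish;  // stop: wrapper completes on_finish(*result)
      }
      send_step_b();         // issued the next async operation ...
      return nullptr;        // ... so there is nothing to complete yet
    }
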
diff --git a/src/librbd/image/SetSnapRequest.h b/src/librbd/image/SetSnapRequest.h
new file mode 100644
index 000000000..c12ea9f27
--- /dev/null
+++ b/src/librbd/image/SetSnapRequest.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
+
+#include "cls/rbd/cls_rbd_client.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+template <typename> class ExclusiveLock;
+class ImageCtx;
+template <typename> class ObjectMap;
+
+namespace image {
+
+template <typename> class RefreshParentRequest;
+
+template <typename ImageCtxT = ImageCtx>
+class SetSnapRequest {
+public:
+ static SetSnapRequest *create(ImageCtxT &image_ctx, uint64_t snap_id,
+ Context *on_finish) {
+ return new SetSnapRequest(image_ctx, snap_id, on_finish);
+ }
+
+ ~SetSnapRequest();
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | (set snap)
+ * |-----------> BLOCK_WRITES
+ * | |
+ * | v
+ * | SHUTDOWN_EXCLUSIVE_LOCK (skip if lock inactive
+ * | | or disabled)
+ * | v
+ * | REFRESH_PARENT (skip if no parent
+ * | | or refresh not needed)
+ * | v
+ * | OPEN_OBJECT_MAP (skip if map disabled)
+ * | |
+ * | v
+ * | <apply>
+ * | |
+ * | v
+ * | FINALIZE_REFRESH_PARENT (skip if no parent
+ * | | or refresh not needed)
+ * | v
+ * | <finish>
+ * |
+ * \-----------> INIT_EXCLUSIVE_LOCK (skip if active or
+ * | disabled)
+ * v
+ * REFRESH_PARENT (skip if no parent
+ * | or refresh not needed)
+ * v
+ * <apply>
+ * |
+ * v
+ * FINALIZE_REFRESH_PARENT (skip if no parent
+ * | or refresh not needed)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ SetSnapRequest(ImageCtxT &image_ctx, uint64_t snap_id, Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ uint64_t m_snap_id;
+ Context *m_on_finish;
+
+ ExclusiveLock<ImageCtxT> *m_exclusive_lock;
+ ObjectMap<ImageCtxT> *m_object_map;
+ RefreshParentRequest<ImageCtxT> *m_refresh_parent;
+
+ bool m_writes_blocked;
+
+ void send_block_writes();
+ Context *handle_block_writes(int *result);
+
+ void send_init_exclusive_lock();
+ Context *handle_init_exclusive_lock(int *result);
+
+ Context *send_shut_down_exclusive_lock(int *result);
+ Context *handle_shut_down_exclusive_lock(int *result);
+
+ Context *send_refresh_parent(int *result);
+ Context *handle_refresh_parent(int *result);
+
+ Context *send_open_object_map(int *result);
+ Context *handle_open_object_map(int *result);
+
+ Context *send_finalize_refresh_parent(int *result);
+ Context *handle_finalize_refresh_parent(int *result);
+
+ int apply();
+ void finalize();
+ void send_complete();
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::SetSnapRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
diff --git a/src/librbd/image/TypeTraits.h b/src/librbd/image/TypeTraits.h
new file mode 100644
index 000000000..2989e30b5
--- /dev/null
+++ b/src/librbd/image/TypeTraits.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
+#define CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace image {
+
+template <typename ImageCtxT>
+struct TypeTraits {
+ typedef asio::ContextWQ ContextWQ;
+};
+
+} // namespace image
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
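
The indirection lets templated state machines refer to typename TypeTraits<ImageCtxT>::ContextWQ instead of naming asio::ContextWQ directly, so unit tests can re-point the typedef at a mock. A hypothetical test-side specialization (MockImageCtx and MockContextWQ are assumed names, not part of this patch):

    // Hypothetical: route request work queues to a mock under test.
    namespace librbd {
    namespace image {

    template <>
    struct TypeTraits<MockImageCtx> {
      typedef MockContextWQ ContextWQ;
    };

    } // namespace image
    } // namespace librbd
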
diff --git a/src/librbd/image/Types.h b/src/librbd/image/Types.h
new file mode 100644
index 000000000..44c66e227
--- /dev/null
+++ b/src/librbd/image/Types.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_IMAGE_TYPES_H
+#define LIBRBD_IMAGE_TYPES_H
+
+namespace librbd {
+namespace image {
+
+enum {
+ CREATE_FLAG_SKIP_MIRROR_ENABLE = 1 << 0,
+ CREATE_FLAG_FORCE_MIRROR_ENABLE = 1 << 1,
+ CREATE_FLAG_MIRROR_ENABLE_MASK = (CREATE_FLAG_SKIP_MIRROR_ENABLE |
+ CREATE_FLAG_FORCE_MIRROR_ENABLE),
+};
+
+} // namespace image
+} // librbd
+
+#endif // LIBRBD_IMAGE_TYPES_H
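
These are ordinary bit flags: callers combine them with bitwise OR and use the mask to isolate the mirror-enable bits, as create() in internal.cc below does when choosing between the SKIP and FORCE behaviors. Illustrative handling:

    // Illustrative flag handling using the enum above.
    uint32_t create_flags = CREATE_FLAG_SKIP_MIRROR_ENABLE;
    bool skip_mirror =
      (create_flags & CREATE_FLAG_MIRROR_ENABLE_MASK) ==
        CREATE_FLAG_SKIP_MIRROR_ENABLE;
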
diff --git a/src/librbd/image/ValidatePoolRequest.cc b/src/librbd/image/ValidatePoolRequest.cc
new file mode 100644
index 000000000..6f2872e25
--- /dev/null
+++ b/src/librbd/image/ValidatePoolRequest.cc
@@ -0,0 +1,234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/ValidatePoolRequest.h"
+#include "include/rados/librados.hpp"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::ValidatePoolRequest: " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace image {
+
+namespace {
+
+const std::string OVERWRITE_VALIDATED("overwrite validated");
+const std::string VALIDATE("validate");
+
+} // anonymous namespace
+
+using util::create_rados_callback;
+using util::create_context_callback;
+using util::create_async_context_callback;
+
+template <typename I>
+ValidatePoolRequest<I>::ValidatePoolRequest(librados::IoCtx& io_ctx,
+ Context *on_finish)
+ : m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+ m_on_finish(on_finish) {
+  // validation should occur in the default namespace
+ m_io_ctx.dup(io_ctx);
+ m_io_ctx.set_namespace("");
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::send() {
+ read_rbd_info();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::read_rbd_info() {
+ ldout(m_cct, 5) << dendl;
+
+ auto comp = create_rados_callback<
+ ValidatePoolRequest<I>,
+ &ValidatePoolRequest<I>::handle_read_rbd_info>(this);
+
+ librados::ObjectReadOperation op;
+ op.read(0, 0, nullptr, nullptr);
+
+ m_out_bl.clear();
+ int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::handle_read_rbd_info(int r) {
+ ldout(m_cct, 5) << "r=" << r << dendl;
+
+ if (r >= 0) {
+ bufferlist validated_bl;
+ validated_bl.append(OVERWRITE_VALIDATED);
+
+ bufferlist validate_bl;
+ validate_bl.append(VALIDATE);
+
+ if (m_out_bl.contents_equal(validated_bl)) {
+ // already validated pool
+ finish(0);
+ return;
+ } else if (m_out_bl.contents_equal(validate_bl)) {
+      // implies the validation snapshot was already successfully created
+ overwrite_rbd_info();
+ return;
+ }
+ } else if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to read RBD info: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ create_snapshot();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::create_snapshot() {
+ ldout(m_cct, 5) << dendl;
+
+  // allocate a self-managed snapshot id if this is a new pool to force
+ // self-managed snapshot mode
+ auto comp = create_rados_callback<
+ ValidatePoolRequest<I>,
+ &ValidatePoolRequest<I>::handle_create_snapshot>(this);
+ m_io_ctx.aio_selfmanaged_snap_create(&m_snap_id, comp);
+ comp->release();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::handle_create_snapshot(int r) {
+ ldout(m_cct, 5) << "r=" << r << dendl;
+
+ if (r == -EINVAL) {
+ lderr(m_cct) << "pool not configured for self-managed RBD snapshot support"
+ << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to allocate self-managed snapshot: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ write_rbd_info();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::write_rbd_info() {
+ ldout(m_cct, 5) << dendl;
+
+ bufferlist bl;
+ bl.append(VALIDATE);
+
+ librados::ObjectWriteOperation op;
+ op.create(true);
+ op.write(0, bl);
+
+ auto comp = create_rados_callback<
+ ValidatePoolRequest<I>,
+ &ValidatePoolRequest<I>::handle_write_rbd_info>(this);
+ int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::handle_write_rbd_info(int r) {
+ ldout(m_cct, 5) << "r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ lderr(m_cct) << "pool missing required overwrite support" << dendl;
+ m_ret_val = -EINVAL;
+ } else if (r < 0 && r != -EEXIST) {
+ lderr(m_cct) << "failed to write RBD info: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ }
+
+ remove_snapshot();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::remove_snapshot() {
+ ldout(m_cct, 5) << dendl;
+
+ auto comp = create_rados_callback<
+ ValidatePoolRequest<I>,
+ &ValidatePoolRequest<I>::handle_remove_snapshot>(this);
+ m_io_ctx.aio_selfmanaged_snap_remove(m_snap_id, comp);
+ comp->release();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::handle_remove_snapshot(int r) {
+ ldout(m_cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ // not a fatal error
+ lderr(m_cct) << "failed to remove validation snapshot: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ if (m_ret_val < 0) {
+ finish(m_ret_val);
+ return;
+ }
+
+ overwrite_rbd_info();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::overwrite_rbd_info() {
+ ldout(m_cct, 5) << dendl;
+
+ bufferlist bl;
+ bl.append(OVERWRITE_VALIDATED);
+
+ librados::ObjectWriteOperation op;
+ op.write(0, bl);
+
+ auto comp = create_rados_callback<
+ ValidatePoolRequest<I>,
+ &ValidatePoolRequest<I>::handle_overwrite_rbd_info>(this);
+ int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::handle_overwrite_rbd_info(int r) {
+ ldout(m_cct, 5) << "r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ lderr(m_cct) << "pool missing required overwrite support" << dendl;
+ finish(-EINVAL);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to validate overwrite support: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void ValidatePoolRequest<I>::finish(int r) {
+ ldout(m_cct, 5) << "r=" << r << dendl;
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::ValidatePoolRequest<librbd::ImageCtx>;
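
ValidatePoolRequest is the asynchronous counterpart of the synchronous validate_pool() helper in internal.cc below, and it can be driven to completion in the same C_SaferCond style. A minimal sketch, assuming an initialized io_ctx:

    // Sketch: run the pool-validation state machine synchronously.
    #include "common/Cond.h"
    #include "librbd/image/ValidatePoolRequest.h"

    int validate_pool_sync(librados::IoCtx& io_ctx) {
      C_SaferCond cond;
      auto* req = librbd::image::ValidatePoolRequest<librbd::ImageCtx>::create(
        io_ctx, &cond);
      req->send();          // deletes itself in finish()
      return cond.wait();   // -EINVAL if overwrites or self-managed
                            // snapshots are unsupported
    }
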
diff --git a/src/librbd/image/ValidatePoolRequest.h b/src/librbd/image/ValidatePoolRequest.h
new file mode 100644
index 000000000..74f384417
--- /dev/null
+++ b/src/librbd/image/ValidatePoolRequest.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+
+namespace image {
+
+template <typename ImageCtxT>
+class ValidatePoolRequest {
+public:
+ static ValidatePoolRequest* create(librados::IoCtx& io_ctx,
+ Context *on_finish) {
+ return new ValidatePoolRequest(io_ctx, on_finish);
+ }
+
+ ValidatePoolRequest(librados::IoCtx& io_ctx, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+   * <start>
+   *    |
+   *    v             (overwrites validated)
+   * READ RBD INFO . . . . . . . . . . . . . . .
+   *    |                 .                    .
+   *    |                 . (snapshots         .
+   *    |                 .  validated)        .
+   *    v                 .                    .
+   * CREATE SNAPSHOT      .                    .
+   *    |                 .                    .
+   *    v                 .                    .
+   * WRITE RBD INFO       .                    .
+   *    |                 .                    .
+   *    v                 .                    .
+   * REMOVE SNAPSHOT      .                    .
+   *    |                 .                    .
+   *    v                 .                    .
+   * OVERWRITE RBD INFO < .                    .
+   *    |                                      .
+   *    v                                      .
+   * <finish> < . . . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx m_io_ctx;
+ CephContext* m_cct;
+ Context* m_on_finish;
+
+ int m_ret_val = 0;
+ bufferlist m_out_bl;
+ uint64_t m_snap_id = 0;
+
+ void read_rbd_info();
+ void handle_read_rbd_info(int r);
+
+ void create_snapshot();
+ void handle_create_snapshot(int r);
+
+ void write_rbd_info();
+ void handle_write_rbd_info(int r);
+
+ void remove_snapshot();
+ void handle_remove_snapshot(int r);
+
+ void overwrite_rbd_info();
+ void handle_overwrite_rbd_info(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::ValidatePoolRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H
diff --git a/src/librbd/image_watcher/NotifyLockOwner.cc b/src/librbd/image_watcher/NotifyLockOwner.cc
new file mode 100644
index 000000000..e37fb597e
--- /dev/null
+++ b/src/librbd/image_watcher/NotifyLockOwner.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image_watcher/NotifyLockOwner.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/WatchNotifyTypes.h"
+#include "librbd/watcher/Notifier.h"
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image_watcher::NotifyLockOwner: " \
+ << this << " " << __func__
+
+namespace librbd {
+
+namespace image_watcher {
+
+using namespace watch_notify;
+using util::create_context_callback;
+
+NotifyLockOwner::NotifyLockOwner(ImageCtx &image_ctx,
+ watcher::Notifier &notifier,
+ bufferlist &&bl, Context *on_finish)
+ : m_image_ctx(image_ctx), m_notifier(notifier), m_bl(std::move(bl)),
+ m_on_finish(on_finish) {
+}
+
+void NotifyLockOwner::send() {
+ send_notify();
+}
+
+void NotifyLockOwner::send_notify() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ m_notifier.notify(m_bl, &m_notify_response, create_context_callback<
+ NotifyLockOwner, &NotifyLockOwner::handle_notify>(this));
+}
+
+void NotifyLockOwner::handle_notify(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": r=" << r << dendl;
+
+ if (r < 0 && r != -ETIMEDOUT) {
+ lderr(cct) << ": lock owner notification failed: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ bufferlist response;
+ bool lock_owner_responded = false;
+ for (auto &it : m_notify_response.acks) {
+ if (it.second.length() > 0) {
+ if (lock_owner_responded) {
+ lderr(cct) << ": duplicate lock owners detected" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+ lock_owner_responded = true;
+ response = std::move(it.second);
+ }
+ }
+
+ if (!lock_owner_responded) {
+ ldout(cct, 1) << ": no lock owners detected" << dendl;
+ finish(-ETIMEDOUT);
+ return;
+ }
+
+ try {
+ auto iter = response.cbegin();
+
+ ResponseMessage response_message;
+ using ceph::decode;
+ decode(response_message, iter);
+
+ r = response_message.result;
+ } catch (const buffer::error &err) {
+ r = -EINVAL;
+ }
+ finish(r);
+}
+
+void NotifyLockOwner::finish(int r) {
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_watcher
+} // namespace librbd
diff --git a/src/librbd/image_watcher/NotifyLockOwner.h b/src/librbd/image_watcher/NotifyLockOwner.h
new file mode 100644
index 000000000..6249bc128
--- /dev/null
+++ b/src/librbd/image_watcher/NotifyLockOwner.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H
+#define CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H
+
+#include "include/buffer.h"
+#include "librbd/watcher/Types.h"
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace watcher { class Notifier; }
+
+namespace image_watcher {
+
+class NotifyLockOwner {
+public:
+ static NotifyLockOwner *create(ImageCtx &image_ctx,
+ watcher::Notifier &notifier,
+ bufferlist &&bl, Context *on_finish) {
+ return new NotifyLockOwner(image_ctx, notifier, std::move(bl), on_finish);
+ }
+
+ NotifyLockOwner(ImageCtx &image_ctx, watcher::Notifier &notifier,
+ bufferlist &&bl, Context *on_finish);
+
+ void send();
+
+private:
+ ImageCtx &m_image_ctx;
+ watcher::Notifier &m_notifier;
+
+ bufferlist m_bl;
+ watcher::NotifyResponse m_notify_response;
+ Context *m_on_finish;
+
+ void send_notify();
+ void handle_notify(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H
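
handle_notify() in the .cc treats a non-empty ack payload as the lock owner's reply and fails with -EINVAL if more than one watcher responds. On the responding side, the owner encodes a watch_notify::ResponseMessage carrying the operation's result code. A sketch of that encoding, assuming ResponseMessage's int-result constructor from WatchNotifyTypes.h:

    // Sketch of the lock owner's side of the exchange.
    #include "librbd/WatchNotifyTypes.h"

    void encode_op_result(int r, ceph::bufferlist* ack_bl) {
      using ceph::encode;
      encode(librbd::watch_notify::ResponseMessage(r), *ack_bl);
    }
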
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
new file mode 100644
index 000000000..f9ba5474b
--- /dev/null
+++ b/src/librbd/internal.cc
@@ -0,0 +1,1750 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "include/int_types.h"
+
+#include <errno.h>
+#include <limits.h>
+
+#include "include/types.h"
+#include "include/uuid.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "common/event_socket.h"
+#include "common/perf_counters.h"
+#include "osdc/Striper.h"
+#include "include/stringify.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rbd/cls_rbd.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/journal/cls_journal_client.h"
+
+#include "librbd/AsioEngine.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Operations.h"
+#include "librbd/PluginRegistry.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Io.h"
+#include "librbd/cache/Utils.h"
+#include "librbd/exclusive_lock/AutomaticPolicy.h"
+#include "librbd/exclusive_lock/StandardPolicy.h"
+#include "librbd/deep_copy/MetadataCopyRequest.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/CreateRequest.h"
+#include "librbd/image/GetMetadataRequest.h"
+#include "librbd/image/Types.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/journal/Types.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "librbd/operation/TrimRequest.h"
+
+#include "journal/Journaler.h"
+
+#include <boost/scope_exit.hpp>
+#include <boost/variant.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd: "
+
+#define rbd_howmany(x, y) (((x) + (y) - 1) / (y))
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+// list binds to list() here, so std::list is explicitly used below
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+using librados::Rados;
+
+namespace librbd {
+
+namespace {
+
+int validate_pool(IoCtx &io_ctx, CephContext *cct) {
+ if (!cct->_conf.get_val<bool>("rbd_validate_pool")) {
+ return 0;
+ }
+
+ int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL);
+ if (r == 0) {
+ return 0;
+ } else if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+  // allocate a self-managed snapshot id if this is a new pool to force
+ // self-managed snapshot mode
+ uint64_t snap_id;
+ r = io_ctx.selfmanaged_snap_create(&snap_id);
+ if (r == -EINVAL) {
+ lderr(cct) << "pool not configured for self-managed RBD snapshot support"
+ << dendl;
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to allocate self-managed snapshot: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = io_ctx.selfmanaged_snap_remove(snap_id);
+ if (r < 0) {
+ lderr(cct) << "failed to release self-managed snapshot " << snap_id
+ << ": " << cpp_strerror(r) << dendl;
+ }
+ return 0;
+}
+
+} // anonymous namespace
+
+ int detect_format(IoCtx &io_ctx, const string &name,
+ bool *old_format, uint64_t *size)
+ {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ if (old_format)
+ *old_format = true;
+ int r = io_ctx.stat(util::old_header_name(name), size, NULL);
+ if (r == -ENOENT) {
+ if (old_format)
+ *old_format = false;
+ r = io_ctx.stat(util::id_obj_name(name), size, NULL);
+ if (r < 0)
+ return r;
+ } else if (r < 0) {
+ return r;
+ }
+
+ ldout(cct, 20) << "detect format of " << name << " : "
+ << (old_format ? (*old_format ? "old" : "new") :
+ "don't care") << dendl;
+ return 0;
+ }
+
+ bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap)
+ {
+ return (parent_pool_id != -1 && off <= overlap);
+ }
+
+ void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
+ uint64_t size, int order, uint64_t bid)
+ {
+ uint32_t hi = bid >> 32;
+ uint32_t lo = bid & 0xFFFFFFFF;
+ uint32_t extra = rand() % 0xFFFFFFFF;
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&ondisk, 0, sizeof(ondisk));
+
+ memcpy(&ondisk.text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT));
+ memcpy(&ondisk.signature, RBD_HEADER_SIGNATURE,
+ sizeof(RBD_HEADER_SIGNATURE));
+ memcpy(&ondisk.version, RBD_HEADER_VERSION, sizeof(RBD_HEADER_VERSION));
+
+ snprintf(ondisk.block_name, sizeof(ondisk.block_name), "rb.%x.%x.%x",
+ hi, lo, extra);
+
+ ondisk.image_size = size;
+ ondisk.options.order = order;
+ ondisk.options.crypt_type = RBD_CRYPT_NONE;
+ ondisk.options.comp_type = RBD_COMP_NONE;
+ ondisk.snap_seq = 0;
+ ondisk.snap_count = 0;
+ ondisk.reserved = 0;
+ ondisk.snap_names_len = 0;
+ }
+
+ void image_info(ImageCtx *ictx, image_info_t& info, size_t infosize)
+ {
+ int obj_order = ictx->order;
+ {
+ std::shared_lock locker{ictx->image_lock};
+ info.size = ictx->get_effective_image_size(ictx->snap_id);
+ }
+ info.obj_size = 1ULL << obj_order;
+ info.num_objs = Striper::get_num_objects(ictx->layout, info.size);
+ info.order = obj_order;
+ strncpy(info.block_name_prefix, ictx->object_prefix.c_str(),
+ RBD_MAX_BLOCK_NAME_SIZE);
+ info.block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE - 1] = '\0';
+
+ // clear deprecated fields
+ info.parent_pool = -1L;
+ info.parent_name[0] = '\0';
+ }
+
+ uint64_t oid_to_object_no(const string& oid, const string& object_prefix)
+ {
+    std::istringstream iss(oid);
+ // skip object prefix and separator
+ iss.ignore(object_prefix.length() + 1);
+ uint64_t num;
+ iss >> std::hex >> num;
+ return num;
+ }
+
+ void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx)
+ {
+ ceph_assert(ceph_mutex_is_locked(ictx->owner_lock));
+ ceph_assert(ictx->exclusive_lock == nullptr ||
+ ictx->exclusive_lock->is_lock_owner());
+
+ C_SaferCond ctx;
+ ictx->image_lock.lock_shared();
+ operation::TrimRequest<> *req = operation::TrimRequest<>::create(
+ *ictx, &ctx, ictx->size, newsize, prog_ctx);
+ ictx->image_lock.unlock_shared();
+ req->send();
+
+ int r = ctx.wait();
+ if (r < 0) {
+ lderr(ictx->cct) << "warning: failed to remove some object(s): "
+ << cpp_strerror(r) << dendl;
+ }
+ }
+
+ int read_header_bl(IoCtx& io_ctx, const string& header_oid,
+ bufferlist& header, uint64_t *ver)
+ {
+ int r;
+ uint64_t off = 0;
+#define READ_SIZE 4096
+ do {
+ bufferlist bl;
+ r = io_ctx.read(header_oid, bl, READ_SIZE, off);
+ if (r < 0)
+ return r;
+ header.claim_append(bl);
+ off += r;
+ } while (r == READ_SIZE);
+
+ static_assert(sizeof(RBD_HEADER_TEXT) == sizeof(RBD_MIGRATE_HEADER_TEXT),
+ "length of rbd headers must be the same");
+
+ if (header.length() < sizeof(RBD_HEADER_TEXT) ||
+ (memcmp(RBD_HEADER_TEXT, header.c_str(),
+ sizeof(RBD_HEADER_TEXT)) != 0 &&
+ memcmp(RBD_MIGRATE_HEADER_TEXT, header.c_str(),
+ sizeof(RBD_MIGRATE_HEADER_TEXT)) != 0)) {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ lderr(cct) << "unrecognized header format" << dendl;
+ return -ENXIO;
+ }
+
+ if (ver)
+ *ver = io_ctx.get_last_version();
+
+ return 0;
+ }
+
+ int read_header(IoCtx& io_ctx, const string& header_oid,
+ struct rbd_obj_header_ondisk *header, uint64_t *ver)
+ {
+ bufferlist header_bl;
+ int r = read_header_bl(io_ctx, header_oid, header_bl, ver);
+ if (r < 0)
+ return r;
+ if (header_bl.length() < (int)sizeof(*header))
+ return -EIO;
+ memcpy(header, header_bl.c_str(), sizeof(*header));
+
+ return 0;
+ }
+
+ int tmap_set(IoCtx& io_ctx, const string& imgname)
+ {
+ bufferlist cmdbl, emptybl;
+ __u8 c = CEPH_OSD_TMAP_SET;
+ encode(c, cmdbl);
+ encode(imgname, cmdbl);
+ encode(emptybl, cmdbl);
+ return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
+ }
+
+ int tmap_rm(IoCtx& io_ctx, const string& imgname)
+ {
+ bufferlist cmdbl;
+ __u8 c = CEPH_OSD_TMAP_RM;
+ encode(c, cmdbl);
+ encode(imgname, cmdbl);
+ return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
+ }
+
+ typedef boost::variant<std::string,uint64_t> image_option_value_t;
+ typedef std::map<int,image_option_value_t> image_options_t;
+ typedef std::shared_ptr<image_options_t> image_options_ref;
+
+ enum image_option_type_t {
+ STR,
+ UINT64,
+ };
+
+ const std::map<int, image_option_type_t> IMAGE_OPTIONS_TYPE_MAPPING = {
+ {RBD_IMAGE_OPTION_FORMAT, UINT64},
+ {RBD_IMAGE_OPTION_FEATURES, UINT64},
+ {RBD_IMAGE_OPTION_ORDER, UINT64},
+ {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64},
+ {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64},
+ {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64},
+ {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64},
+ {RBD_IMAGE_OPTION_JOURNAL_POOL, STR},
+ {RBD_IMAGE_OPTION_FEATURES_SET, UINT64},
+ {RBD_IMAGE_OPTION_FEATURES_CLEAR, UINT64},
+ {RBD_IMAGE_OPTION_DATA_POOL, STR},
+ {RBD_IMAGE_OPTION_FLATTEN, UINT64},
+ {RBD_IMAGE_OPTION_CLONE_FORMAT, UINT64},
+ {RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE, UINT64},
+ };
+
+ std::string image_option_name(int optname) {
+ switch (optname) {
+ case RBD_IMAGE_OPTION_FORMAT:
+ return "format";
+ case RBD_IMAGE_OPTION_FEATURES:
+ return "features";
+ case RBD_IMAGE_OPTION_ORDER:
+ return "order";
+ case RBD_IMAGE_OPTION_STRIPE_UNIT:
+ return "stripe_unit";
+ case RBD_IMAGE_OPTION_STRIPE_COUNT:
+ return "stripe_count";
+ case RBD_IMAGE_OPTION_JOURNAL_ORDER:
+ return "journal_order";
+ case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH:
+ return "journal_splay_width";
+ case RBD_IMAGE_OPTION_JOURNAL_POOL:
+ return "journal_pool";
+ case RBD_IMAGE_OPTION_FEATURES_SET:
+ return "features_set";
+ case RBD_IMAGE_OPTION_FEATURES_CLEAR:
+ return "features_clear";
+ case RBD_IMAGE_OPTION_DATA_POOL:
+ return "data_pool";
+ case RBD_IMAGE_OPTION_FLATTEN:
+ return "flatten";
+ case RBD_IMAGE_OPTION_CLONE_FORMAT:
+ return "clone_format";
+ case RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE:
+ return "mirror_image_mode";
+ default:
+ return "unknown (" + stringify(optname) + ")";
+ }
+ }
+
+ void image_options_create(rbd_image_options_t* opts)
+ {
+ image_options_ref* opts_ = new image_options_ref(new image_options_t());
+
+ *opts = static_cast<rbd_image_options_t>(opts_);
+ }
+
+ void image_options_create_ref(rbd_image_options_t* opts,
+ rbd_image_options_t orig)
+ {
+ image_options_ref* orig_ = static_cast<image_options_ref*>(orig);
+ image_options_ref* opts_ = new image_options_ref(*orig_);
+
+ *opts = static_cast<rbd_image_options_t>(opts_);
+ }
+
+ void image_options_copy(rbd_image_options_t* opts,
+ const ImageOptions &orig)
+ {
+ image_options_ref* opts_ = new image_options_ref(new image_options_t());
+
+ *opts = static_cast<rbd_image_options_t>(opts_);
+
+ std::string str_val;
+ uint64_t uint64_val;
+ for (auto &i : IMAGE_OPTIONS_TYPE_MAPPING) {
+ switch (i.second) {
+ case STR:
+ if (orig.get(i.first, &str_val) == 0) {
+ image_options_set(*opts, i.first, str_val);
+ }
+ continue;
+ case UINT64:
+ if (orig.get(i.first, &uint64_val) == 0) {
+ image_options_set(*opts, i.first, uint64_val);
+ }
+ continue;
+ }
+ }
+ }
+
+ void image_options_destroy(rbd_image_options_t opts)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ delete opts_;
+ }
+
+ int image_options_set(rbd_image_options_t opts, int optname,
+ const std::string& optval)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ std::map<int, image_option_type_t>::const_iterator i =
+ IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+ if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
+ return -EINVAL;
+ }
+
+ (*opts_->get())[optname] = optval;
+ return 0;
+ }
+
+ int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ std::map<int, image_option_type_t>::const_iterator i =
+ IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+ if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
+ return -EINVAL;
+ }
+
+ (*opts_->get())[optname] = optval;
+ return 0;
+ }
+
+ int image_options_get(rbd_image_options_t opts, int optname,
+ std::string* optval)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ std::map<int, image_option_type_t>::const_iterator i =
+ IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+ if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
+ return -EINVAL;
+ }
+
+ image_options_t::const_iterator j = (*opts_)->find(optname);
+
+ if (j == (*opts_)->end()) {
+ return -ENOENT;
+ }
+
+ *optval = boost::get<std::string>(j->second);
+ return 0;
+ }
+
+ int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ std::map<int, image_option_type_t>::const_iterator i =
+ IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+ if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
+ return -EINVAL;
+ }
+
+ image_options_t::const_iterator j = (*opts_)->find(optname);
+
+ if (j == (*opts_)->end()) {
+ return -ENOENT;
+ }
+
+ *optval = boost::get<uint64_t>(j->second);
+ return 0;
+ }
+
+ int image_options_is_set(rbd_image_options_t opts, int optname,
+ bool* is_set)
+ {
+ if (IMAGE_OPTIONS_TYPE_MAPPING.find(optname) ==
+ IMAGE_OPTIONS_TYPE_MAPPING.end()) {
+ return -EINVAL;
+ }
+
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+ *is_set = ((*opts_)->find(optname) != (*opts_)->end());
+ return 0;
+ }
+
+ int image_options_unset(rbd_image_options_t opts, int optname)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ std::map<int, image_option_type_t>::const_iterator i =
+ IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+ if (i == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
+ ceph_assert((*opts_)->find(optname) == (*opts_)->end());
+ return -EINVAL;
+ }
+
+ image_options_t::const_iterator j = (*opts_)->find(optname);
+
+ if (j == (*opts_)->end()) {
+ return -ENOENT;
+ }
+
+ (*opts_)->erase(j);
+ return 0;
+ }
+
+ void image_options_clear(rbd_image_options_t opts)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ (*opts_)->clear();
+ }
+
+ bool image_options_is_empty(rbd_image_options_t opts)
+ {
+ image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+ return (*opts_)->empty();
+ }
+
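
The rbd_image_options_t handle used throughout this file is just a shared_ptr to a map from option id to a boost::variant<std::string, uint64_t>; the typed setters and getters above enforce the STR/UINT64 schema recorded in IMAGE_OPTIONS_TYPE_MAPPING. A round-trip through that API:

    // Round-trip an option through the opaque-handle API defined above.
    rbd_image_options_t opts;
    librbd::image_options_create(&opts);
    librbd::image_options_set(opts, RBD_IMAGE_OPTION_ORDER, uint64_t(22));

    uint64_t order = 0;
    if (librbd::image_options_get(opts, RBD_IMAGE_OPTION_ORDER, &order) == 0) {
      // order == 22; using the std::string overload here would return -EINVAL
    }
    librbd::image_options_destroy(opts);
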
+ int create_v1(IoCtx& io_ctx, const char *imgname, uint64_t size, int order)
+ {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+
+ ldout(cct, 20) << __func__ << " " << &io_ctx << " name = " << imgname
+ << " size = " << size << " order = " << order << dendl;
+ int r = validate_pool(io_ctx, cct);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!io_ctx.get_namespace().empty()) {
+ lderr(cct) << "attempting to add v1 image to namespace" << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 2) << "adding rbd image to directory..." << dendl;
+ r = tmap_set(io_ctx, imgname);
+ if (r < 0) {
+ lderr(cct) << "error adding image to directory: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ Rados rados(io_ctx);
+ uint64_t bid = rados.get_instance_id();
+
+ ldout(cct, 2) << "creating rbd image..." << dendl;
+ struct rbd_obj_header_ondisk header;
+ init_rbd_header(header, size, order, bid);
+
+ bufferlist bl;
+ bl.append((const char *)&header, sizeof(header));
+
+ string header_oid = util::old_header_name(imgname);
+ r = io_ctx.write(header_oid, bl, bl.length(), 0);
+ if (r < 0) {
+ lderr(cct) << "Error writing image header: " << cpp_strerror(r)
+ << dendl;
+ int remove_r = tmap_rm(io_ctx, imgname);
+ if (remove_r < 0) {
+ lderr(cct) << "Could not remove image from directory after "
+ << "header creation failed: "
+ << cpp_strerror(remove_r) << dendl;
+ }
+ return r;
+ }
+
+ ldout(cct, 2) << "done." << dendl;
+ return 0;
+ }
+
+ int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
+ int *order)
+ {
+ uint64_t order_ = *order;
+ ImageOptions opts;
+
+ int r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
+ ceph_assert(r == 0);
+
+ r = create(io_ctx, imgname, "", size, opts, "", "", false);
+
+ int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
+ ceph_assert(r1 == 0);
+ *order = order_;
+
+ return r;
+ }
+
+ int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
+ bool old_format, uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count)
+ {
+ if (!order)
+ return -EINVAL;
+
+ uint64_t order_ = *order;
+ uint64_t format = old_format ? 1 : 2;
+ ImageOptions opts;
+ int r;
+
+ r = opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+ ceph_assert(r == 0);
+ r = opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+ ceph_assert(r == 0);
+ r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
+ ceph_assert(r == 0);
+ r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ ceph_assert(r == 0);
+ r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ ceph_assert(r == 0);
+
+ r = create(io_ctx, imgname, "", size, opts, "", "", false);
+
+ int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
+ ceph_assert(r1 == 0);
+ *order = order_;
+
+ return r;
+ }
+
+ int create(IoCtx& io_ctx, const std::string &image_name,
+ const std::string &image_id, uint64_t size,
+ ImageOptions& opts,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ bool skip_mirror_enable)
+ {
+ std::string id(image_id);
+ if (id.empty()) {
+ id = util::generate_image_id(io_ctx);
+ }
+
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ uint64_t option;
+ if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
+ lderr(cct) << "create does not support 'flatten' image option" << dendl;
+ return -EINVAL;
+ }
+ if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
+ lderr(cct) << "create does not support 'clone_format' image option"
+ << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 10) << __func__ << " name=" << image_name << ", "
+                   << "id=" << id << ", "
+ << "size=" << size << ", opts=" << opts << dendl;
+
+ uint64_t format;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0)
+ format = cct->_conf.get_val<uint64_t>("rbd_default_format");
+ bool old_format = format == 1;
+
+ // make sure it doesn't already exist, in either format
+ int r = detect_format(io_ctx, image_name, NULL, NULL);
+ if (r != -ENOENT) {
+ if (r) {
+ lderr(cct) << "Could not tell if " << image_name << " already exists"
+ << dendl;
+ return r;
+ }
+ lderr(cct) << "rbd image " << image_name << " already exists" << dendl;
+ return -EEXIST;
+ }
+
+ uint64_t order = 0;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
+ order = cct->_conf.get_val<uint64_t>("rbd_default_order");
+ }
+ r = image::CreateRequest<>::validate_order(cct, order);
+ if (r < 0) {
+ return r;
+ }
+
+ if (old_format) {
+      if (!getenv("RBD_FORCE_ALLOW_V1")) {
+        lderr(cct) << "Format 1 image creation unsupported." << dendl;
+        return -EINVAL;
+      }
+      lderr(cct) << "Forced V1 image creation." << dendl;
+ r = create_v1(io_ctx, image_name.c_str(), size, order);
+ } else {
+ AsioEngine asio_engine(io_ctx);
+
+ ConfigProxy config{cct->_conf};
+ api::Config<>::apply_pool_overrides(io_ctx, &config);
+
+ uint32_t create_flags = 0U;
+ uint64_t mirror_image_mode = RBD_MIRROR_IMAGE_MODE_JOURNAL;
+ if (skip_mirror_enable) {
+ create_flags = image::CREATE_FLAG_SKIP_MIRROR_ENABLE;
+ } else if (opts.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE,
+ &mirror_image_mode) == 0) {
+ create_flags = image::CREATE_FLAG_FORCE_MIRROR_ENABLE;
+ }
+
+ C_SaferCond cond;
+ image::CreateRequest<> *req = image::CreateRequest<>::create(
+ config, io_ctx, image_name, id, size, opts, create_flags,
+ static_cast<cls::rbd::MirrorImageMode>(mirror_image_mode),
+ non_primary_global_image_id, primary_mirror_uuid,
+ asio_engine.get_work_queue(), &cond);
+ req->send();
+
+ r = cond.wait();
+ }
+
+ int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ ceph_assert(r1 == 0);
+
+ return r;
+ }
+
+ /*
+ * Parent may be in different pool, hence different IoCtx
+ */
+ int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+ IoCtx& c_ioctx, const char *c_name,
+ uint64_t features, int *c_order,
+ uint64_t stripe_unit, int stripe_count)
+ {
+ uint64_t order = *c_order;
+
+ ImageOptions opts;
+ opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+ opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+
+ int r = clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, nullptr,
+ c_name, opts, "", "");
+ opts.get(RBD_IMAGE_OPTION_ORDER, &order);
+ *c_order = order;
+ return r;
+ }
+
+ int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name,
+ const char *p_snap_name, IoCtx& c_ioctx, const char *c_id,
+ const char *c_name, ImageOptions& c_opts,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid)
+ {
+ ceph_assert((p_id == nullptr) ^ (p_name == nullptr));
+
+ CephContext *cct = (CephContext *)p_ioctx.cct();
+ if (p_snap_name == nullptr) {
+ lderr(cct) << "image to be cloned must be a snapshot" << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t flatten;
+ if (c_opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
+ lderr(cct) << "clone does not support 'flatten' image option" << dendl;
+ return -EINVAL;
+ }
+
+ int r;
+ std::string parent_id;
+ if (p_id == nullptr) {
+ r = cls_client::dir_get_id(&p_ioctx, RBD_DIRECTORY, p_name,
+ &parent_id);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ lderr(cct) << "failed to retrieve parent image id: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ } else {
+ parent_id = p_id;
+ }
+
+ std::string clone_id;
+ if (c_id == nullptr) {
+ clone_id = util::generate_image_id(c_ioctx);
+ } else {
+ clone_id = c_id;
+ }
+
+ ldout(cct, 10) << __func__ << " "
+ << "c_name=" << c_name << ", "
+                   << "c_id=" << clone_id << ", "
+ << "c_opts=" << c_opts << dendl;
+
+ ConfigProxy config{reinterpret_cast<CephContext *>(c_ioctx.cct())->_conf};
+ api::Config<>::apply_pool_overrides(c_ioctx, &config);
+
+ AsioEngine asio_engine(p_ioctx);
+
+ C_SaferCond cond;
+ auto *req = image::CloneRequest<>::create(
+ config, p_ioctx, parent_id, p_snap_name,
+ {cls::rbd::UserSnapshotNamespace{}}, CEPH_NOSNAP, c_ioctx, c_name,
+ clone_id, c_opts, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL,
+ non_primary_global_image_id, primary_mirror_uuid,
+ asio_engine.get_work_queue(), &cond);
+ req->send();
+
+ r = cond.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+ }
+
+ int rename(IoCtx& io_ctx, const char *srcname, const char *dstname)
+ {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> "
+ << dstname << dendl;
+
+ ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
+ int r = ictx->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ BOOST_SCOPE_EXIT((ictx)) {
+ ictx->state->close();
+ } BOOST_SCOPE_EXIT_END
+
+ return ictx->operations->rename(dstname);
+ }
+
+ int info(ImageCtx *ictx, image_info_t& info, size_t infosize)
+ {
+ ldout(ictx->cct, 20) << "info " << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ image_info(ictx, info, infosize);
+ return 0;
+ }
+
+ int get_old_format(ImageCtx *ictx, uint8_t *old)
+ {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+ *old = ictx->old_format;
+ return 0;
+ }
+
+ int get_size(ImageCtx *ictx, uint64_t *size)
+ {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+ std::shared_lock l2{ictx->image_lock};
+ *size = ictx->get_effective_image_size(ictx->snap_id);
+ return 0;
+ }
+
+ int get_features(ImageCtx *ictx, uint64_t *features)
+ {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+ std::shared_lock l{ictx->image_lock};
+ *features = ictx->features;
+ return 0;
+ }
+
+ int get_overlap(ImageCtx *ictx, uint64_t *overlap)
+ {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+ std::shared_lock image_locker{ictx->image_lock};
+ return ictx->get_parent_overlap(ictx->snap_id, overlap);
+ }
+
+ int get_flags(ImageCtx *ictx, uint64_t *flags)
+ {
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ std::shared_lock l2{ictx->image_lock};
+ return ictx->get_flags(ictx->snap_id, flags);
+ }
+
+ int set_image_notification(ImageCtx *ictx, int fd, int type)
+ {
+ CephContext *cct = ictx->cct;
+    ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type "
+                   << type << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ if (ictx->event_socket.is_valid())
+ return -EINVAL;
+ return ictx->event_socket.init(fd, type);
+ }
+
+ int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
+ *is_owner = false;
+
+ std::shared_lock owner_locker{ictx->owner_lock};
+ if (ictx->exclusive_lock == nullptr) {
+ return 0;
+ }
+
+ // might have been blocklisted by peer -- ensure we still own
+ // the lock by pinging the OSD
+ int r = ictx->exclusive_lock->assert_header_locked();
+ if (r == -EBUSY || r == -ENOENT) {
+ return 0;
+ } else if (r < 0) {
+ return r;
+ }
+
+ *is_owner = true;
+ return 0;
+ }
+
+ int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
+ << "lock_mode=" << lock_mode << dendl;
+
+ if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
+ return -EOPNOTSUPP;
+ }
+
+ C_SaferCond lock_ctx;
+ {
+ std::unique_lock l{ictx->owner_lock};
+
+ if (ictx->exclusive_lock == nullptr) {
+ lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ if (ictx->get_exclusive_lock_policy()->may_auto_request_lock()) {
+ ictx->set_exclusive_lock_policy(
+ new exclusive_lock::StandardPolicy(ictx));
+ }
+
+ if (ictx->exclusive_lock->is_lock_owner()) {
+ return 0;
+ }
+
+ ictx->exclusive_lock->acquire_lock(&lock_ctx);
+ }
+
+ int r = lock_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::shared_lock l{ictx->owner_lock};
+ if (ictx->exclusive_lock == nullptr) {
+ return -EINVAL;
+ } else if (!ictx->exclusive_lock->is_lock_owner()) {
+ lderr(cct) << "failed to acquire exclusive lock" << dendl;
+ return ictx->exclusive_lock->get_unlocked_op_error();
+ }
+
+ return 0;
+ }
+
+ int lock_release(ImageCtx *ictx)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
+
+ C_SaferCond lock_ctx;
+ {
+ std::unique_lock l{ictx->owner_lock};
+
+ if (ictx->exclusive_lock == nullptr ||
+ !ictx->exclusive_lock->is_lock_owner()) {
+ lderr(cct) << "not exclusive lock owner" << dendl;
+ return -EINVAL;
+ }
+
+ ictx->exclusive_lock->release_lock(&lock_ctx);
+ }
+
+ int r = lock_ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ return 0;
+ }
+
+ int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode,
+ std::list<std::string> *lock_owners)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
+
+ managed_lock::Locker locker;
+ C_SaferCond get_owner_ctx;
+ {
+ std::shared_lock owner_locker{ictx->owner_lock};
+
+ if (ictx->exclusive_lock == nullptr) {
+ lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
+ }
+
+ int r = get_owner_ctx.wait();
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to determine current lock owner: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ *lock_mode = RBD_LOCK_MODE_EXCLUSIVE;
+ lock_owners->clear();
+ lock_owners->emplace_back(locker.address);
+ return 0;
+ }
+
+ int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode,
+ const std::string &lock_owner) {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
+ << "lock_mode=" << lock_mode << ", "
+ << "lock_owner=" << lock_owner << dendl;
+
+ if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
+ return -EOPNOTSUPP;
+ }
+
+ if (ictx->read_only) {
+ return -EROFS;
+ }
+
+ managed_lock::Locker locker;
+ C_SaferCond get_owner_ctx;
+ {
+ std::shared_lock l{ictx->owner_lock};
+
+ if (ictx->exclusive_lock == nullptr) {
+ lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
+ }
+ int r = get_owner_ctx.wait();
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to determine current lock owner: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (locker.address != lock_owner) {
+ return -EBUSY;
+ }
+
+ C_SaferCond break_ctx;
+ {
+ std::shared_lock l{ictx->owner_lock};
+
+ if (ictx->exclusive_lock == nullptr) {
+ lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
+ return -EINVAL;
+ }
+
+ ictx->exclusive_lock->break_lock(locker, true, &break_ctx);
+ }
+ r = break_ctx.wait();
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+ }
+
+ int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size)
+ {
+ CephContext *cct = (CephContext *)dest_md_ctx.cct();
+ uint64_t option;
+ if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
+ lderr(cct) << "copy does not support 'flatten' image option" << dendl;
+ return -EINVAL;
+ }
+ if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
+ lderr(cct) << "copy does not support 'clone_format' image option"
+ << dendl;
+ return -EINVAL;
+ }
+
+ ldout(cct, 20) << "copy " << src->name
+ << (src->snap_name.length() ? "@" + src->snap_name : "")
+ << " -> " << destname << " opts = " << opts << dendl;
+
+ src->image_lock.lock_shared();
+ uint64_t features = src->features;
+ uint64_t src_size = src->get_image_size(src->snap_id);
+ src->image_lock.unlock_shared();
+ uint64_t format = 2;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+ }
+ uint64_t stripe_unit = src->stripe_unit;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ }
+ uint64_t stripe_count = src->stripe_count;
+ if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
+ opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ }
+ uint64_t order = src->order;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+ opts.set(RBD_IMAGE_OPTION_ORDER, order);
+ }
+ if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+ }
+ if (features & ~RBD_FEATURES_ALL) {
+ lderr(cct) << "librbd does not support requested features" << dendl;
+ return -ENOSYS;
+ }
+
+ int r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false);
+ if (r < 0) {
+ lderr(cct) << "header creation failed" << dendl;
+ return r;
+ }
+ opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
+
+ ImageCtx *dest = new librbd::ImageCtx(destname, "", nullptr, dest_md_ctx,
+ false);
+ r = dest->state->open(0);
+ if (r < 0) {
+ lderr(cct) << "failed to read newly created header" << dendl;
+ return r;
+ }
+
+ r = copy(src, dest, prog_ctx, sparse_size);
+
+ int close_r = dest->state->close();
+ if (r == 0 && close_r < 0) {
+ r = close_r;
+ }
+ return r;
+ }
+
+ class C_CopyWrite : public Context {
+ public:
+ C_CopyWrite(bufferlist *bl, Context* ctx)
+ : m_bl(bl), m_ctx(ctx) {}
+ void finish(int r) override {
+ delete m_bl;
+ m_ctx->complete(r);
+ }
+ private:
+ bufferlist *m_bl;
+ Context *m_ctx;
+ };
+
+ class C_CopyRead : public Context {
+ public:
+ C_CopyRead(SimpleThrottle *throttle, ImageCtx *dest, uint64_t offset,
+ bufferlist *bl, size_t sparse_size)
+ : m_throttle(throttle), m_dest(dest), m_offset(offset), m_bl(bl),
+ m_sparse_size(sparse_size) {
+ m_throttle->start_op();
+ }
+ void finish(int r) override {
+ if (r < 0) {
+ lderr(m_dest->cct) << "error reading from source image at offset "
+ << m_offset << ": " << cpp_strerror(r) << dendl;
+ delete m_bl;
+ m_throttle->end_op(r);
+ return;
+ }
+ ceph_assert(m_bl->length() == (size_t)r);
+
+ if (m_bl->is_zero()) {
+ delete m_bl;
+ m_throttle->end_op(r);
+ return;
+ }
+
+ if (!m_sparse_size) {
+ m_sparse_size = (1 << m_dest->order);
+ }
+
+ auto *throttle = m_throttle;
+ auto *end_op_ctx = new LambdaContext([throttle](int r) {
+ throttle->end_op(r);
+ });
+ auto gather_ctx = new C_Gather(m_dest->cct, end_op_ctx);
+
+ m_bl->rebuild(buffer::ptr_node::create(m_bl->length()));
+ size_t write_offset = 0;
+ size_t write_length = 0;
+ size_t offset = 0;
+ size_t length = m_bl->length();
+ const auto& m_ptr = m_bl->front();
+ while (offset < length) {
+ if (util::calc_sparse_extent(m_ptr,
+ m_sparse_size,
+ length,
+ &write_offset,
+ &write_length,
+ &offset)) {
+ bufferlist *write_bl = new bufferlist();
+ write_bl->push_back(
+ buffer::ptr_node::create(m_ptr, write_offset, write_length));
+ Context *ctx = new C_CopyWrite(write_bl, gather_ctx->new_sub());
+ auto comp = io::AioCompletion::create(ctx);
+
+ // coordinate through AIO WQ to ensure lock is acquired if needed
+ api::Io<>::aio_write(*m_dest, comp, m_offset + write_offset,
+ write_length, std::move(*write_bl),
+ LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
+ std::move(read_trace));
+ write_offset = offset;
+ write_length = 0;
+ }
+ }
+ delete m_bl;
+ ceph_assert(gather_ctx->get_sub_created_count() > 0);
+ gather_ctx->activate();
+ }
+
+ ZTracer::Trace read_trace;
+
+ private:
+ SimpleThrottle *m_throttle;
+ ImageCtx *m_dest;
+ uint64_t m_offset;
+ bufferlist *m_bl;
+ size_t m_sparse_size;
+ };
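+
+  // Note on the sparse-copy loop above (hedged explanation): on each
+  // iteration util::calc_sparse_extent() advances through the buffer in
+  // m_sparse_size chunks, coalescing runs of non-zero data into
+  // {write_offset, write_length}; only those runs are written, so
+  // all-zero chunks are skipped and remain unallocated on the
+  // destination. Illustrative numbers only: with sparse_size=4096 and a
+  // buffer whose first 4 KiB is zero and next 4 KiB is data, a single
+  // aio_write of {offset=4096, length=4096} is issued.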
+
+ int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size)
+ {
+ src->image_lock.lock_shared();
+ uint64_t src_size = src->get_image_size(src->snap_id);
+ src->image_lock.unlock_shared();
+
+ dest->image_lock.lock_shared();
+ uint64_t dest_size = dest->get_image_size(dest->snap_id);
+ dest->image_lock.unlock_shared();
+
+ CephContext *cct = src->cct;
+ if (dest_size < src_size) {
+ lderr(cct) << " src size " << src_size << " > dest size "
+ << dest_size << dendl;
+ return -EINVAL;
+ }
+
+ // ensure previous writes are visible to dest
+ C_SaferCond flush_ctx;
+ {
+ auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, src,
+ io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ *src, io::IMAGE_DISPATCH_LAYER_INTERNAL_START,
+ aio_comp, io::FLUSH_SOURCE_INTERNAL, {});
+ req->send();
+ }
+ int r = flush_ctx.wait();
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond ctx;
+ auto req = deep_copy::MetadataCopyRequest<>::create(
+ src, dest, &ctx);
+ req->send();
+
+ r = ctx.wait();
+ if (r < 0) {
+ lderr(cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ ZTracer::Trace trace;
+ if (src->blkin_trace_all) {
+ trace.init("copy", &src->trace_endpoint);
+ }
+
+ SimpleThrottle throttle(src->config.get_val<uint64_t>("rbd_concurrent_management_ops"), false);
+ uint64_t period = src->get_stripe_period();
+ unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ uint64_t object_id = 0;
+ for (uint64_t offset = 0; offset < src_size; offset += period) {
+ if (throttle.pending_error()) {
+ return throttle.wait_for_ret();
+ }
+
+ {
+ std::shared_lock image_locker{src->image_lock};
+ if (src->object_map != nullptr) {
+ bool skip = true;
+          // each period spans src->stripe_count objects; check them all
+ for (uint64_t i=0; i < src->stripe_count; i++) {
+ if (object_id < src->object_map->size() &&
+ src->object_map->object_may_exist(object_id)) {
+ skip = false;
+ }
+ ++object_id;
+ }
+
+ if (skip) continue;
+ } else {
+ object_id += src->stripe_count;
+ }
+ }
+
+ uint64_t len = min(period, src_size - offset);
+ bufferlist *bl = new bufferlist();
+ auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size);
+ auto comp = io::AioCompletion::create_and_start<Context>(
+ ctx, src, io::AIO_TYPE_READ);
+ auto req = io::ImageDispatchSpec::create_read(
+ *src, io::IMAGE_DISPATCH_LAYER_NONE, comp,
+ {{offset, len}}, io::ReadResult{bl},
+ src->get_data_io_context(), fadvise_flags, 0, trace);
+
+ ctx->read_trace = trace;
+ req->send();
+
+ prog_ctx.update_progress(offset, src_size);
+ }
+
+ r = throttle.wait_for_ret();
+ if (r >= 0)
+ prog_ctx.update_progress(src_size, src_size);
+ return r;
+ }
+
+ int list_lockers(ImageCtx *ictx,
+ std::list<locker_t> *lockers,
+ bool *exclusive,
+ string *tag)
+ {
+ ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ std::shared_lock locker{ictx->image_lock};
+ if (exclusive)
+ *exclusive = ictx->exclusive_locked;
+ if (tag)
+ *tag = ictx->lock_tag;
+ if (lockers) {
+ lockers->clear();
+ map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t>::const_iterator it;
+ for (it = ictx->lockers.begin(); it != ictx->lockers.end(); ++it) {
+ locker_t locker;
+ locker.client = stringify(it->first.locker);
+ locker.cookie = it->first.cookie;
+ locker.address = it->second.addr.get_legacy_str();
+ lockers->push_back(locker);
+ }
+ }
+
+ return 0;
+ }
+
+ int lock(ImageCtx *ictx, bool exclusive, const string& cookie,
+ const string& tag)
+ {
+ ldout(ictx->cct, 20) << "lock image " << ictx << " exclusive=" << exclusive
+ << " cookie='" << cookie << "' tag='" << tag << "'"
+ << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ /**
+     * If we wanted, we could do something more intelligent here, such
+     * as local checks to predict whether the lock attempt will succeed.
+     * But for now, let's not duplicate that code.
+ */
+ {
+ std::shared_lock locker{ictx->image_lock};
+ r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME,
+ exclusive ? ClsLockType::EXCLUSIVE : ClsLockType::SHARED,
+ cookie, tag, "", utime_t(), 0);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ ictx->notify_update();
+ return 0;
+ }
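+
+  // Illustrative sketch (hypothetical caller): the cookie is an
+  // arbitrary client-chosen string that must be presented again to
+  // unlock, e.g.:
+  //
+  //   librbd::lock(ictx, true, "my-cookie", "");  // exclusive advisory lock
+  //   // ... exclusive access ...
+  //   librbd::unlock(ictx, "my-cookie");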
+
+ int unlock(ImageCtx *ictx, const string& cookie)
+ {
+ ldout(ictx->cct, 20) << "unlock image " << ictx
+ << " cookie='" << cookie << "'" << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ {
+ std::shared_lock locker{ictx->image_lock};
+ r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid,
+ RBD_LOCK_NAME, cookie);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ ictx->notify_update();
+ return 0;
+ }
+
+ int break_lock(ImageCtx *ictx, const string& client,
+ const string& cookie)
+ {
+ ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client
+ << "' cookie='" << cookie << "'" << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ entity_name_t lock_client;
+ if (!lock_client.parse(client)) {
+ lderr(ictx->cct) << "Unable to parse client '" << client
+ << "'" << dendl;
+ return -EINVAL;
+ }
+
+ if (ictx->config.get_val<bool>("rbd_blocklist_on_break_lock")) {
+ typedef std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> Lockers;
+ Lockers lockers;
+ ClsLockType lock_type;
+ std::string lock_tag;
+ r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid,
+ RBD_LOCK_NAME, &lockers, &lock_type,
+ &lock_tag);
+ if (r < 0) {
+ lderr(ictx->cct) << "unable to retrieve lock info: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::string client_address;
+ for (Lockers::iterator it = lockers.begin();
+ it != lockers.end(); ++it) {
+ if (it->first.locker == lock_client) {
+ client_address = it->second.addr.get_legacy_str();
+ break;
+ }
+ }
+ if (client_address.empty()) {
+ return -ENOENT;
+ }
+
+ librados::Rados rados(ictx->md_ctx);
+ r = rados.blocklist_add(
+ client_address,
+ ictx->config.get_val<uint64_t>("rbd_blocklist_expire_seconds"));
+ if (r < 0) {
+ lderr(ictx->cct) << "unable to blocklist client: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ r = rados::cls::lock::break_lock(&ictx->md_ctx, ictx->header_oid,
+ RBD_LOCK_NAME, cookie, lock_client);
+ if (r < 0)
+ return r;
+ ictx->notify_update();
+ return 0;
+ }
+
+ void rbd_ctx_cb(completion_t cb, void *arg)
+ {
+ Context *ctx = reinterpret_cast<Context *>(arg);
+ auto comp = reinterpret_cast<io::AioCompletion *>(cb);
+ ctx->complete(comp->get_return_value());
+ comp->release();
+ }
+
+ int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg)
+ {
+ coarse_mono_time start_time;
+ ceph::timespan elapsed;
+
+ ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off
+ << " len = " << len << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0)
+ return r;
+
+ uint64_t mylen = len;
+ ictx->image_lock.lock_shared();
+ r = clip_io(ictx, off, &mylen);
+ ictx->image_lock.unlock_shared();
+ if (r < 0)
+ return r;
+
+ int64_t total_read = 0;
+ uint64_t period = ictx->get_stripe_period();
+ uint64_t left = mylen;
+
+ ZTracer::Trace trace;
+ if (ictx->blkin_trace_all) {
+ trace.init("read_iterate", &ictx->trace_endpoint);
+ }
+
+ std::shared_lock owner_locker{ictx->owner_lock};
+ start_time = coarse_mono_clock::now();
+ while (left > 0) {
+ uint64_t period_off = off - (off % period);
+ uint64_t read_len = min(period_off + period - off, left);
+
+ bufferlist bl;
+
+ C_SaferCond ctx;
+ auto c = io::AioCompletion::create_and_start(&ctx, ictx,
+ io::AIO_TYPE_READ);
+ auto req = io::ImageDispatchSpec::create_read(
+ *ictx, io::IMAGE_DISPATCH_LAYER_NONE, c,
+ {{off, read_len}}, io::ReadResult{&bl},
+ ictx->get_data_io_context(), 0, 0, trace);
+ req->send();
+
+ int ret = ctx.wait();
+ if (ret < 0) {
+ return ret;
+ }
+
+ r = cb(total_read, ret, bl.c_str(), arg);
+ if (r < 0) {
+ return r;
+ }
+
+ total_read += ret;
+ left -= ret;
+ off += ret;
+ }
+
+ elapsed = coarse_mono_clock::now() - start_time;
+ ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed);
+ ictx->perfcounter->inc(l_librbd_rd);
+ ictx->perfcounter->inc(l_librbd_rd_bytes, mylen);
+ return total_read;
+ }
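+
+  // Minimal callback sketch (assumption: caller-supplied, not part of
+  // this patch). read_iterate() invokes cb once per internal chunk with
+  // an offset relative to 'off'; a negative return aborts the iteration:
+  //
+  //   static int copy_cb(uint64_t ofs, size_t len, const char *buf,
+  //                      void *arg) {
+  //     memcpy(static_cast<char *>(arg) + ofs, buf, len);
+  //     return 0;
+  //   }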
+
+ // validate extent against image size; clip to image size if necessary
+ int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len)
+ {
+ ceph_assert(ceph_mutex_is_locked(ictx->image_lock));
+
+ if (ictx->snap_id != CEPH_NOSNAP &&
+ ictx->get_snap_info(ictx->snap_id) == nullptr) {
+ return -ENOENT;
+ }
+ uint64_t image_size = ictx->get_effective_image_size(ictx->snap_id);
+
+ // special-case "len == 0" requests: always valid
+ if (*len == 0)
+ return 0;
+
+ // can't start past end
+ if (off >= image_size)
+ return -EINVAL;
+
+ // clip requests that extend past end to just end
+ if ((off + *len) > image_size)
+ *len = (size_t)(image_size - off);
+
+ return 0;
+ }
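+
+  // Worked example: with an effective image size of 100 bytes,
+  // {off=90, len=20} is clipped to len=10; {off=100, len=1} returns
+  // -EINVAL; and {off=150, len=0} succeeds as a no-op because
+  // zero-length requests are accepted before the bounds check.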
+
+ int invalidate_cache(ImageCtx *ictx)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond ctx;
+ {
+ ictx->io_image_dispatcher->invalidate_cache(&ctx);
+ }
+ r = ctx.wait();
+
+ if (r < 0) {
+ ldout(cct, 20) << "failed to invalidate image cache" << dendl;
+ return r;
+ }
+
+ ictx->perfcounter->inc(l_librbd_invalidate_cache);
+
+  // delete the writeback cache if it was never initialized
+  // (i.e. this client does not own the exclusive lock)
+ if ((!ictx->exclusive_lock ||
+ !ictx->exclusive_lock->is_lock_owner()) &&
+ ictx->test_features(RBD_FEATURE_DIRTY_CACHE)) {
+ C_SaferCond ctx3;
+ ictx->plugin_registry->discard(&ctx3);
+ r = ctx3.wait();
+ }
+ return r;
+ }
+
+ int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp)
+ {
+ if (numcomp <= 0)
+ return -EINVAL;
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp
+ << dendl;
+ int i = 0;
+ while (i < numcomp && ictx->event_socket_completions.pop(comps[i])) {
+ ++i;
+ }
+
+ return i;
+ }
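+
+  // Hedged flow sketch (hypothetical caller): combined with
+  // set_image_notification(), this enables event-driven completion
+  // reaping. Assuming Linux and the EVENT_TYPE_EVENTFD constant from the
+  // public librbd.h:
+  //
+  //   int fd = eventfd(0, EFD_NONBLOCK);
+  //   librbd::set_image_notification(ictx, fd, EVENT_TYPE_EVENTFD);
+  //   // ... issue AIO; once poll(2) reports fd readable:
+  //   io::AioCompletion *comps[16];
+  //   int n = librbd::poll_io_events(ictx, comps, 16);
+  //   for (int i = 0; i < n; ++i) {
+  //     // consume comps[i]->get_return_value(), then:
+  //     comps[i]->release();
+  //   }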
+
+ int metadata_get(ImageCtx *ictx, const string &key, string *value)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value);
+ }
+
+ int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map<string, bufferlist> *pairs)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "metadata_list " << ictx << dendl;
+
+ int r = ictx->state->refresh_if_required();
+ if (r < 0) {
+ return r;
+ }
+
+ C_SaferCond ctx;
+ auto req = image::GetMetadataRequest<>::create(
+ ictx->md_ctx, ictx->header_oid, false, "", start, max, pairs, &ctx);
+ req->send();
+
+ return ctx.wait();
+ }
+
+ int list_watchers(ImageCtx *ictx,
+ std::list<librbd::image_watcher_t> &watchers)
+ {
+ int r;
+ std::string header_oid;
+ std::list<obj_watch_t> obj_watchers;
+
+ if (ictx->old_format) {
+ header_oid = util::old_header_name(ictx->name);
+ } else {
+ header_oid = util::header_name(ictx->id);
+ }
+
+ r = ictx->md_ctx.list_watchers(header_oid, &obj_watchers);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto i = obj_watchers.begin(); i != obj_watchers.end(); ++i) {
+ librbd::image_watcher_t watcher;
+ watcher.addr = i->addr;
+ watcher.id = i->watcher_id;
+ watcher.cookie = i->cookie;
+
+ watchers.push_back(watcher);
+ }
+
+ return 0;
+ }
+
+}
+
+std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts) {
+ os << "[";
+
+ const char *delimiter = "";
+ for (auto &i : librbd::IMAGE_OPTIONS_TYPE_MAPPING) {
+ if (i.second == librbd::STR) {
+ std::string val;
+ if (opts.get(i.first, &val) == 0) {
+ os << delimiter << librbd::image_option_name(i.first) << "=" << val;
+ delimiter = ", ";
+ }
+ } else if (i.second == librbd::UINT64) {
+ uint64_t val;
+ if (opts.get(i.first, &val) == 0) {
+ os << delimiter << librbd::image_option_name(i.first) << "=" << val;
+ delimiter = ", ";
+ }
+ }
+ }
+
+ os << "]";
+
+ return os;
+}
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
new file mode 100644
index 000000000..9bd03228a
--- /dev/null
+++ b/src/librbd/internal.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_INTERNAL_H
+#define CEPH_LIBRBD_INTERNAL_H
+
+#include "include/int_types.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "include/buffer_fwd.h"
+#include "include/rbd/librbd.hpp"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/ceph_time.h"
+#include "librbd/Types.h"
+
+namespace librbd {
+
+ struct ImageCtx;
+ namespace io { struct AioCompletion; }
+
+ class NoOpProgressContext : public ProgressContext
+ {
+ public:
+ NoOpProgressContext()
+ {
+ }
+ int update_progress(uint64_t offset, uint64_t src_size) override
+ {
+ return 0;
+ }
+ };
+
+ int detect_format(librados::IoCtx &io_ctx, const std::string &name,
+ bool *old_format, uint64_t *size);
+
+ bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap);
+
+ std::string image_option_name(int optname);
+ void image_options_create(rbd_image_options_t* opts);
+ void image_options_create_ref(rbd_image_options_t* opts,
+ rbd_image_options_t orig);
+ void image_options_copy(rbd_image_options_t *opts,
+ const ImageOptions &orig);
+ void image_options_destroy(rbd_image_options_t opts);
+ int image_options_set(rbd_image_options_t opts, int optname,
+ const std::string& optval);
+ int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval);
+ int image_options_get(rbd_image_options_t opts, int optname,
+ std::string* optval);
+ int image_options_get(rbd_image_options_t opts, int optname,
+ uint64_t* optval);
+ int image_options_is_set(rbd_image_options_t opts, int optname,
+ bool* is_set);
+ int image_options_unset(rbd_image_options_t opts, int optname);
+ void image_options_clear(rbd_image_options_t opts);
+ bool image_options_is_empty(rbd_image_options_t opts);
+
+ int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
+ int *order);
+ int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
+ bool old_format, uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count);
+ int create(IoCtx& io_ctx, const std::string &image_name,
+ const std::string &image_id, uint64_t size, ImageOptions& opts,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid,
+ bool skip_mirror_enable);
+ int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+ IoCtx& c_ioctx, const char *c_name,
+ uint64_t features, int *c_order,
+ uint64_t stripe_unit, int stripe_count);
+ int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name,
+ const char *p_snap_name, IoCtx& c_ioctx, const char *c_id,
+ const char *c_name, ImageOptions& c_opts,
+ const std::string &non_primary_global_image_id,
+ const std::string &primary_mirror_uuid);
+ int rename(librados::IoCtx& io_ctx, const char *srcname, const char *dstname);
+ int info(ImageCtx *ictx, image_info_t& info, size_t image_size);
+ int get_old_format(ImageCtx *ictx, uint8_t *old);
+ int get_size(ImageCtx *ictx, uint64_t *size);
+ int get_features(ImageCtx *ictx, uint64_t *features);
+ int get_overlap(ImageCtx *ictx, uint64_t *overlap);
+ int get_flags(ImageCtx *ictx, uint64_t *flags);
+ int set_image_notification(ImageCtx *ictx, int fd, int type);
+ int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner);
+ int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode);
+ int lock_release(ImageCtx *ictx);
+ int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode,
+ std::list<std::string> *lock_owners);
+ int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode,
+ const std::string &lock_owner);
+
+ int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size);
+ int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size);
+
+ /* cooperative locking */
+ int list_lockers(ImageCtx *ictx,
+ std::list<locker_t> *locks,
+ bool *exclusive,
+ std::string *tag);
+ int lock(ImageCtx *ictx, bool exclusive, const std::string& cookie,
+ const std::string& tag);
+ int lock_shared(ImageCtx *ictx, const std::string& cookie,
+ const std::string& tag);
+ int unlock(ImageCtx *ictx, const std::string& cookie);
+ int break_lock(ImageCtx *ictx, const std::string& client,
+ const std::string& cookie);
+
+ void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx);
+
+ int read_header_bl(librados::IoCtx& io_ctx, const std::string& md_oid,
+ ceph::bufferlist& header, uint64_t *ver);
+ int read_header(librados::IoCtx& io_ctx, const std::string& md_oid,
+ struct rbd_obj_header_ondisk *header, uint64_t *ver);
+ int tmap_set(librados::IoCtx& io_ctx, const std::string& imgname);
+ int tmap_rm(librados::IoCtx& io_ctx, const std::string& imgname);
+ void image_info(const ImageCtx *ictx, image_info_t& info, size_t info_size);
+ uint64_t oid_to_object_no(const std::string& oid,
+ const std::string& object_prefix);
+ int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len);
+ void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
+ uint64_t size, int order, uint64_t bid);
+
+ int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg);
+
+ int invalidate_cache(ImageCtx *ictx);
+ int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp);
+  int metadata_list(ImageCtx *ictx, const std::string &start, uint64_t max,
+                    std::map<std::string, bufferlist> *pairs);
+ int metadata_get(ImageCtx *ictx, const std::string &key, std::string *value);
+
+ int list_watchers(ImageCtx *ictx, std::list<librbd::image_watcher_t> &watchers);
+}
+
+std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts);
+
+#endif
diff --git a/src/librbd/io/AioCompletion.cc b/src/librbd/io/AioCompletion.cc
new file mode 100644
index 000000000..c04b80770
--- /dev/null
+++ b/src/librbd/io/AioCompletion.cc
@@ -0,0 +1,294 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/AioCompletion.h"
+#include <errno.h>
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/Types.h"
+#include <boost/asio/dispatch.hpp>
+#include <boost/asio/post.hpp>
+
+#ifdef WITH_LTTNG
+#include "tracing/librbd.h"
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::AioCompletion: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+int AioCompletion::wait_for_complete() {
+ tracepoint(librbd, aio_wait_for_complete_enter, this);
+ {
+ std::unique_lock<std::mutex> locker(lock);
+ while (state != AIO_STATE_COMPLETE) {
+ cond.wait(locker);
+ }
+ }
+ tracepoint(librbd, aio_wait_for_complete_exit, 0);
+ return 0;
+}
+
+void AioCompletion::finalize() {
+ ceph_assert(ictx != nullptr);
+ CephContext *cct = ictx->cct;
+
+ // finalize any pending error results since we won't be
+ // atomically incrementing rval anymore
+ int err_r = error_rval;
+ if (err_r < 0) {
+ rval = err_r;
+ }
+
+ ssize_t r = rval;
+ ldout(cct, 20) << "r=" << r << dendl;
+ if (r >= 0 && aio_type == AIO_TYPE_READ) {
+ read_result.assemble_result(cct);
+ }
+}
+
+void AioCompletion::complete() {
+ ceph_assert(ictx != nullptr);
+
+ ssize_t r = rval;
+ if ((aio_type == AIO_TYPE_CLOSE) || (aio_type == AIO_TYPE_OPEN && r < 0)) {
+ ictx = nullptr;
+ external_callback = false;
+ } else {
+ CephContext *cct = ictx->cct;
+
+ tracepoint(librbd, aio_complete_enter, this, r);
+ if (ictx->perfcounter != nullptr) {
+ ceph::timespan elapsed = coarse_mono_clock::now() - start_time;
+ switch (aio_type) {
+ case AIO_TYPE_GENERIC:
+ case AIO_TYPE_OPEN:
+ break;
+ case AIO_TYPE_READ:
+ ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); break;
+ case AIO_TYPE_WRITE:
+ ictx->perfcounter->tinc(l_librbd_wr_latency, elapsed); break;
+ case AIO_TYPE_DISCARD:
+ ictx->perfcounter->tinc(l_librbd_discard_latency, elapsed); break;
+ case AIO_TYPE_FLUSH:
+ ictx->perfcounter->tinc(l_librbd_flush_latency, elapsed); break;
+ case AIO_TYPE_WRITESAME:
+ ictx->perfcounter->tinc(l_librbd_ws_latency, elapsed); break;
+ case AIO_TYPE_COMPARE_AND_WRITE:
+ ictx->perfcounter->tinc(l_librbd_cmp_latency, elapsed); break;
+ default:
+ lderr(cct) << "completed invalid aio_type: " << aio_type << dendl;
+ break;
+ }
+ }
+ }
+
+ state = AIO_STATE_CALLBACK;
+ if (complete_cb) {
+ if (external_callback) {
+ complete_external_callback();
+ } else {
+ complete_cb(rbd_comp, complete_arg);
+ complete_event_socket();
+ notify_callbacks_complete();
+ }
+ } else {
+ complete_event_socket();
+ notify_callbacks_complete();
+ }
+
+ tracepoint(librbd, aio_complete_exit);
+}
+
+void AioCompletion::init_time(ImageCtx *i, aio_type_t t) {
+ if (ictx == nullptr) {
+ ictx = i;
+ aio_type = t;
+ start_time = coarse_mono_clock::now();
+ }
+}
+
+void AioCompletion::start_op() {
+ ceph_assert(ictx != nullptr);
+
+ if (aio_type == AIO_TYPE_OPEN || aio_type == AIO_TYPE_CLOSE) {
+ // no need to track async open/close operations
+ return;
+ }
+
+ ceph_assert(!async_op.started());
+ async_op.start_op(*ictx);
+}
+
+void AioCompletion::queue_complete() {
+ uint32_t zero = 0;
+ pending_count.compare_exchange_strong(zero, 1);
+ ceph_assert(zero == 0);
+
+ add_request();
+
+ // ensure completion fires in clean lock context
+ boost::asio::post(ictx->asio_engine->get_api_strand(), [this]() {
+ complete_request(0);
+ });
+}
+
+void AioCompletion::block(CephContext* cct) {
+ ldout(cct, 20) << dendl;
+ ceph_assert(!was_armed);
+
+ get();
+ ++pending_count;
+}
+
+void AioCompletion::unblock(CephContext* cct) {
+ ldout(cct, 20) << dendl;
+ ceph_assert(was_armed);
+
+ uint32_t previous_pending_count = pending_count--;
+ ceph_assert(previous_pending_count > 0);
+
+ if (previous_pending_count == 1) {
+ queue_complete();
+ }
+ put();
+}
+
+void AioCompletion::fail(int r)
+{
+ ceph_assert(ictx != nullptr);
+ ceph_assert(r < 0);
+
+ bool queue_required = true;
+ if (aio_type == AIO_TYPE_CLOSE || aio_type == AIO_TYPE_OPEN) {
+    // executing from a safe context and the ImageCtx has already been destroyed
+ queue_required = false;
+ } else {
+ CephContext *cct = ictx->cct;
+ lderr(cct) << cpp_strerror(r) << dendl;
+ }
+
+ ceph_assert(!was_armed);
+ was_armed = true;
+
+ rval = r;
+
+ uint32_t previous_pending_count = pending_count.load();
+ if (previous_pending_count == 0) {
+ if (queue_required) {
+ queue_complete();
+ } else {
+ complete();
+ }
+ }
+}
+
+void AioCompletion::set_request_count(uint32_t count) {
+ ceph_assert(ictx != nullptr);
+ CephContext *cct = ictx->cct;
+
+ ceph_assert(!was_armed);
+ was_armed = true;
+
+ ldout(cct, 20) << "pending=" << count << dendl;
+ uint32_t previous_pending_count = pending_count.fetch_add(count);
+ if (previous_pending_count == 0 && count == 0) {
+ queue_complete();
+ }
+}
+
+void AioCompletion::complete_request(ssize_t r)
+{
+ ceph_assert(ictx != nullptr);
+ CephContext *cct = ictx->cct;
+
+ if (r > 0) {
+ rval += r;
+ } else if (r < 0 && r != -EEXIST) {
+ // might race w/ another thread setting an error code but
+ // first one wins
+ int zero = 0;
+ error_rval.compare_exchange_strong(zero, r);
+ }
+
+ uint32_t previous_pending_count = pending_count--;
+ ceph_assert(previous_pending_count > 0);
+ auto pending_count = previous_pending_count - 1;
+
+ ldout(cct, 20) << "cb=" << complete_cb << ", "
+ << "pending=" << pending_count << dendl;
+ if (pending_count == 0) {
+ finalize();
+ complete();
+ }
+ put();
+}
+
+bool AioCompletion::is_complete() {
+ tracepoint(librbd, aio_is_complete_enter, this);
+ bool done = (this->state != AIO_STATE_PENDING);
+ tracepoint(librbd, aio_is_complete_exit, done);
+ return done;
+}
+
+ssize_t AioCompletion::get_return_value() {
+ tracepoint(librbd, aio_get_return_value_enter, this);
+ ssize_t r = rval;
+ tracepoint(librbd, aio_get_return_value_exit, r);
+ return r;
+}
+
+void AioCompletion::complete_external_callback() {
+ get();
+
+ // ensure librbd external users never experience concurrent callbacks
+ // from multiple librbd-internal threads.
+ boost::asio::dispatch(ictx->asio_engine->get_api_strand(), [this]() {
+ complete_cb(rbd_comp, complete_arg);
+ complete_event_socket();
+ notify_callbacks_complete();
+ put();
+ });
+}
+
+void AioCompletion::complete_event_socket() {
+ if (ictx != nullptr && event_notify && ictx->event_socket.is_valid()) {
+ ictx->event_socket_completions.push(this);
+ ictx->event_socket.notify();
+ }
+}
+
+void AioCompletion::notify_callbacks_complete() {
+ state = AIO_STATE_COMPLETE;
+
+ {
+ std::unique_lock<std::mutex> locker(lock);
+ cond.notify_all();
+ }
+
+ if (image_dispatcher_ctx != nullptr) {
+ image_dispatcher_ctx->complete(rval);
+ }
+
+ // note: possible for image to be closed after op marked finished
+ if (async_op.started()) {
+ async_op.finish_op();
+ }
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/AioCompletion.h b/src/librbd/io/AioCompletion.h
new file mode 100644
index 000000000..4ae93fe36
--- /dev/null
+++ b/src/librbd/io/AioCompletion.h
@@ -0,0 +1,203 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_AIO_COMPLETION_H
+#define CEPH_LIBRBD_IO_AIO_COMPLETION_H
+
+#include "common/ceph_time.h"
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include "include/utime.h"
+#include "include/rbd/librbd.hpp"
+
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+/**
+ * AioCompletion is the overall completion for a single
+ * rbd I/O request. It may be composed of many AioObjectRequests,
+ * which each go to a single object.
+ *
+ * The retrying of individual requests is handled at a lower level,
+ * so all AioCompletion cares about is the count of outstanding
+ * requests. The number of expected individual requests should be
+ * set initially using set_request_count() prior to issuing the
+ * requests. This ensures that the completion will not be completed
+ * within the caller's thread of execution (instead via a librados
+ * context or via a thread pool context for cache read hits).
+ */
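+// Illustrative lifecycle sketch (hedged; internal callers vary):
+//
+//   auto comp = AioCompletion::create_and_start(owner, ictx,
+//                                               AIO_TYPE_WRITE);
+//   comp->set_request_count(2);        // arm for two object requests
+//   // each object request eventually calls comp->complete_request(r);
+//   // when the pending count hits zero, finalize() and complete() run
+//   // and the registered callback fires exactly once.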
+struct AioCompletion {
+ typedef enum {
+ AIO_STATE_PENDING = 0,
+ AIO_STATE_CALLBACK,
+ AIO_STATE_COMPLETE,
+ } aio_state_t;
+
+ mutable std::mutex lock;
+ std::condition_variable cond;
+
+ callback_t complete_cb = nullptr;
+ void *complete_arg = nullptr;
+ rbd_completion_t rbd_comp = nullptr;
+
+ /// note: only using atomic for built-in memory barrier
+ std::atomic<aio_state_t> state{AIO_STATE_PENDING};
+
+ std::atomic<ssize_t> rval{0};
+ std::atomic<int> error_rval{0};
+ std::atomic<uint32_t> ref{1};
+ std::atomic<uint32_t> pending_count{0}; ///< number of requests/blocks
+ std::atomic<bool> released{false};
+
+ ImageCtx *ictx = nullptr;
+ coarse_mono_time start_time;
+ aio_type_t aio_type = AIO_TYPE_NONE;
+
+ ReadResult read_result;
+
+ AsyncOperation async_op;
+
+ bool event_notify = false;
+ bool was_armed = false;
+ bool external_callback = false;
+
+ Context* image_dispatcher_ctx = nullptr;
+
+ template <typename T, void (T::*MF)(int)>
+ static void callback_adapter(completion_t cb, void *arg) {
+ AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb);
+ T *t = reinterpret_cast<T *>(arg);
+ (t->*MF)(comp->get_return_value());
+ comp->release();
+ }
+
+ static AioCompletion *create(void *cb_arg, callback_t cb_complete,
+ rbd_completion_t rbd_comp) {
+ AioCompletion *comp = new AioCompletion();
+ comp->set_complete_cb(cb_arg, cb_complete);
+ comp->rbd_comp = (rbd_comp != nullptr ? rbd_comp : comp);
+ return comp;
+ }
+
+ template <typename T, void (T::*MF)(int) = &T::complete>
+ static AioCompletion *create(T *obj) {
+ AioCompletion *comp = new AioCompletion();
+ comp->set_complete_cb(obj, &callback_adapter<T, MF>);
+ comp->rbd_comp = comp;
+ return comp;
+ }
+
+ template <typename T, void (T::*MF)(int) = &T::complete>
+ static AioCompletion *create_and_start(T *obj, ImageCtx *image_ctx,
+ aio_type_t type) {
+ AioCompletion *comp = create<T, MF>(obj);
+ comp->init_time(image_ctx, type);
+ comp->start_op();
+ return comp;
+ }
+
+ AioCompletion() {
+ }
+
+ ~AioCompletion() {
+ }
+
+ int wait_for_complete();
+
+ void finalize();
+
+ inline bool is_initialized(aio_type_t type) const {
+ std::unique_lock<std::mutex> locker(lock);
+ return ((ictx != nullptr) && (aio_type == type));
+ }
+ inline bool is_started() const {
+ std::unique_lock<std::mutex> locker(lock);
+ return async_op.started();
+ }
+
+ void block(CephContext* cct);
+ void unblock(CephContext* cct);
+
+ void init_time(ImageCtx *i, aio_type_t t);
+ void start_op();
+ void fail(int r);
+
+ void complete();
+
+ void set_complete_cb(void *cb_arg, callback_t cb) {
+ complete_cb = cb;
+ complete_arg = cb_arg;
+ }
+
+ void set_request_count(uint32_t num);
+ void add_request() {
+ ceph_assert(pending_count > 0);
+ get();
+ }
+ void complete_request(ssize_t r);
+
+ bool is_complete();
+
+ ssize_t get_return_value();
+
+ void get() {
+ ceph_assert(ref > 0);
+ ++ref;
+ }
+ void release() {
+ bool previous_released = released.exchange(true);
+ ceph_assert(!previous_released);
+ put();
+ }
+ void put() {
+ uint32_t previous_ref = ref--;
+ ceph_assert(previous_ref > 0);
+
+ if (previous_ref == 1) {
+ delete this;
+ }
+ }
+
+ void set_event_notify(bool s) {
+ event_notify = s;
+ }
+
+ void *get_arg() {
+ return complete_arg;
+ }
+
+private:
+ void queue_complete();
+ void complete_external_callback();
+ void complete_event_socket();
+ void notify_callbacks_complete();
+};
+
+class C_AioRequest : public Context {
+public:
+ C_AioRequest(AioCompletion *completion) : m_completion(completion) {
+ m_completion->add_request();
+ }
+ ~C_AioRequest() override {}
+ void finish(int r) override {
+ m_completion->complete_request(r);
+ }
+protected:
+ AioCompletion *m_completion;
+};
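+
+// Usage note (hedged): object requests typically wrap the completion in
+// C_AioRequest; the constructor takes a reference via add_request() and
+// finish() funnels each per-request result into complete_request(),
+// keeping the pending and reference counts balanced.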
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_AIO_COMPLETION_H
diff --git a/src/librbd/io/AsyncOperation.cc b/src/librbd/io/AsyncOperation.cc
new file mode 100644
index 000000000..18db2410e
--- /dev/null
+++ b/src/librbd/io/AsyncOperation.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/AsyncOperation.h"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::AsyncOperation: "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+struct C_CompleteFlushes : public Context {
+ ImageCtx *image_ctx;
+ std::list<Context *> flush_contexts;
+
+ explicit C_CompleteFlushes(ImageCtx *image_ctx,
+ std::list<Context *> &&flush_contexts)
+ : image_ctx(image_ctx), flush_contexts(std::move(flush_contexts)) {
+ }
+ void finish(int r) override {
+ std::shared_lock owner_locker{image_ctx->owner_lock};
+ while (!flush_contexts.empty()) {
+ Context *flush_ctx = flush_contexts.front();
+ flush_contexts.pop_front();
+
+ ldout(image_ctx->cct, 20) << "completed flush: " << flush_ctx << dendl;
+ flush_ctx->complete(0);
+ }
+ }
+};
+
+} // anonymous namespace
+
+void AsyncOperation::start_op(ImageCtx &image_ctx) {
+ ceph_assert(m_image_ctx == NULL);
+ m_image_ctx = &image_ctx;
+
+ ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl;
+ std::lock_guard l{m_image_ctx->async_ops_lock};
+ m_image_ctx->async_ops.push_front(&m_xlist_item);
+}
+
+void AsyncOperation::finish_op() {
+ ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl;
+
+ {
+ std::lock_guard l{m_image_ctx->async_ops_lock};
+ xlist<AsyncOperation *>::iterator iter(&m_xlist_item);
+ ++iter;
+ ceph_assert(m_xlist_item.remove_myself());
+
+    // the linked list stores ops newest -> oldest
+ if (!iter.end() && !m_flush_contexts.empty()) {
+ ldout(m_image_ctx->cct, 20) << "moving flush contexts to previous op: "
+ << *iter << dendl;
+ (*iter)->m_flush_contexts.insert((*iter)->m_flush_contexts.end(),
+ m_flush_contexts.begin(),
+ m_flush_contexts.end());
+ return;
+ }
+ }
+
+ if (!m_flush_contexts.empty()) {
+ C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx,
+ std::move(m_flush_contexts));
+ m_image_ctx->asio_engine->post(ctx, 0);
+ }
+}
+
+void AsyncOperation::flush(Context* on_finish) {
+ {
+ std::lock_guard locker{m_image_ctx->async_ops_lock};
+ xlist<AsyncOperation *>::iterator iter(&m_xlist_item);
+ ++iter;
+
+    // the linked list stores ops newest -> oldest
+ if (!iter.end()) {
+ (*iter)->m_flush_contexts.push_back(on_finish);
+ return;
+ }
+ }
+
+ m_image_ctx->asio_engine->post(on_finish, 0);
+}
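+
+// Ordering note (hedged): async_ops holds ops newest -> oldest, so a
+// flush context attaches to the next-older in-flight op; as each op
+// finishes, finish_op() migrates any pending flush contexts to the op
+// issued before it, so they fire only once every older op has completed.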
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/AsyncOperation.h b/src/librbd/io/AsyncOperation.h
new file mode 100644
index 000000000..b0a37c4b8
--- /dev/null
+++ b/src/librbd/io/AsyncOperation.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_IO_ASYNC_OPERATION_H
+#define LIBRBD_IO_ASYNC_OPERATION_H
+
+#include "include/ceph_assert.h"
+#include "include/xlist.h"
+#include <list>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+class AsyncOperation {
+public:
+
+ AsyncOperation()
+ : m_image_ctx(NULL), m_xlist_item(this)
+ {
+ }
+
+ ~AsyncOperation()
+ {
+ ceph_assert(!m_xlist_item.is_on_list());
+ }
+
+ inline bool started() const {
+ return m_xlist_item.is_on_list();
+ }
+
+ void start_op(ImageCtx &image_ctx);
+ void finish_op();
+
+ void flush(Context *on_finish);
+
+private:
+
+ ImageCtx *m_image_ctx;
+ xlist<AsyncOperation *>::item m_xlist_item;
+ std::list<Context *> m_flush_contexts;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // LIBRBD_IO_ASYNC_OPERATION_H
diff --git a/src/librbd/io/CopyupRequest.cc b/src/librbd/io/CopyupRequest.cc
new file mode 100644
index 000000000..d70851409
--- /dev/null
+++ b/src/librbd/io/CopyupRequest.cc
@@ -0,0 +1,774 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/CopyupRequest.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/ceph_context.h"
+#include "common/ceph_mutex.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/deep_copy/ObjectCopyRequest.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Utils.h"
+
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::CopyupRequest: " << this \
+ << " " << __func__ << ": " \
+ << data_object_name(m_image_ctx, m_object_no) << " "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+
+namespace {
+
+template <typename I>
+class C_UpdateObjectMap : public C_AsyncObjectThrottle<I> {
+public:
+ C_UpdateObjectMap(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ uint64_t object_no, uint8_t head_object_map_state,
+ const std::vector<uint64_t> *snap_ids,
+ bool first_snap_is_clean, const ZTracer::Trace &trace,
+ size_t snap_id_idx)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no),
+ m_head_object_map_state(head_object_map_state), m_snap_ids(*snap_ids),
+ m_first_snap_is_clean(first_snap_is_clean), m_trace(trace),
+ m_snap_id_idx(snap_id_idx)
+ {
+ }
+
+ int send() override {
+ auto& image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ if (image_ctx.exclusive_lock == nullptr) {
+ return 1;
+ }
+ ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.object_map == nullptr) {
+ return 1;
+ }
+
+ uint64_t snap_id = m_snap_ids[m_snap_id_idx];
+ if (snap_id == CEPH_NOSNAP) {
+ return update_head();
+ } else {
+ return update_snapshot(snap_id);
+ }
+ }
+
+ int update_head() {
+ auto& image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+ bool sent = image_ctx.object_map->template aio_update<Context>(
+ CEPH_NOSNAP, m_object_no, m_head_object_map_state, {}, m_trace, false,
+ this);
+ return (sent ? 0 : 1);
+ }
+
+ int update_snapshot(uint64_t snap_id) {
+ auto& image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+ uint8_t state = OBJECT_EXISTS;
+ if (image_ctx.test_features(RBD_FEATURE_FAST_DIFF, image_ctx.image_lock) &&
+ (m_snap_id_idx > 0 || m_first_snap_is_clean)) {
+ // first snapshot should be exists+dirty since it contains
+ // the copyup data -- later snapshots inherit the data.
+ state = OBJECT_EXISTS_CLEAN;
+ }
+
+ bool sent = image_ctx.object_map->template aio_update<Context>(
+ snap_id, m_object_no, state, {}, m_trace, true, this);
+ ceph_assert(sent);
+ return 0;
+ }
+
+private:
+ uint64_t m_object_no;
+ uint8_t m_head_object_map_state;
+ const std::vector<uint64_t> &m_snap_ids;
+ bool m_first_snap_is_clean;
+ const ZTracer::Trace &m_trace;
+ size_t m_snap_id_idx;
+};
+
+} // anonymous namespace
+
+template <typename I>
+CopyupRequest<I>::CopyupRequest(I *ictx, uint64_t objectno,
+ Extents &&image_extents,
+ const ZTracer::Trace &parent_trace)
+ : m_image_ctx(ictx), m_object_no(objectno), m_image_extents(image_extents),
+ m_trace(librbd::util::create_trace(*m_image_ctx, "copy-up", parent_trace))
+{
+ ceph_assert(m_image_ctx->data_ctx.is_valid());
+ m_async_op.start_op(*librbd::util::get_image_ctx(m_image_ctx));
+}
+
+template <typename I>
+CopyupRequest<I>::~CopyupRequest() {
+ ceph_assert(m_pending_requests.empty());
+ m_async_op.finish_op();
+}
+
+template <typename I>
+void CopyupRequest<I>::append_request(AbstractObjectWriteRequest<I> *req,
+ const Extents& object_extents) {
+ std::lock_guard locker{m_lock};
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_request=" << req << ", "
+ << "append=" << m_append_request_permitted << dendl;
+ if (m_append_request_permitted) {
+ m_pending_requests.push_back(req);
+
+ for (auto [offset, length] : object_extents) {
+ if (length > 0) {
+ m_write_object_extents.union_insert(offset, length);
+ }
+ }
+ } else {
+ m_restart_requests.push_back(req);
+ }
+}
+
+template <typename I>
+void CopyupRequest<I>::send() {
+ read_from_parent();
+}
+
+template <typename I>
+void CopyupRequest<I>::read_from_parent() {
+ auto cct = m_image_ctx->cct;
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+
+ if (m_image_ctx->parent == nullptr) {
+ ldout(cct, 5) << "parent detached" << dendl;
+
+ m_image_ctx->asio_engine->post(
+ [this]() { handle_read_from_parent(-ENOENT); });
+ return;
+ } else if (is_deep_copy()) {
+ deep_copy();
+ return;
+ }
+
+ auto comp = AioCompletion::create_and_start<
+ CopyupRequest<I>,
+ &CopyupRequest<I>::handle_read_from_parent>(
+ this, librbd::util::get_image_ctx(m_image_ctx->parent), AIO_TYPE_READ);
+
+ ldout(cct, 20) << "completion=" << comp << ", "
+ << "extents=" << m_image_extents
+ << dendl;
+ auto req = io::ImageDispatchSpec::create_read(
+ *m_image_ctx->parent, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, comp,
+ std::move(m_image_extents),
+ ReadResult{&m_copyup_extent_map, &m_copyup_data},
+ m_image_ctx->parent->get_data_io_context(), 0, 0, m_trace);
+ req->send();
+}
+
+template <typename I>
+void CopyupRequest<I>::handle_read_from_parent(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ m_lock.lock();
+ disable_append_requests();
+ m_lock.unlock();
+
+ lderr(cct) << "error reading from parent: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ convert_copyup_extent_map();
+
+ m_image_ctx->image_lock.lock_shared();
+ m_lock.lock();
+ disable_append_requests();
+
+ r = prepare_copyup_data();
+ if (r < 0) {
+ m_lock.unlock();
+ m_image_ctx->image_lock.unlock_shared();
+
+ lderr(m_image_ctx->cct) << "failed to prepare copyup data: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_copyup_is_zero = m_copyup_data.is_zero();
+ m_copyup_required = is_copyup_required();
+ if (!m_copyup_required) {
+ m_lock.unlock();
+ m_image_ctx->image_lock.unlock_shared();
+
+ ldout(cct, 20) << "no-op, skipping" << dendl;
+ finish(0);
+ return;
+ }
+
+ // copyup() will affect snapshots only if parent data is not all
+ // zeros.
+ if (!m_copyup_is_zero) {
+ m_snap_ids.insert(m_snap_ids.end(), m_image_ctx->snaps.rbegin(),
+ m_image_ctx->snaps.rend());
+ }
+
+ m_lock.unlock();
+ m_image_ctx->image_lock.unlock_shared();
+
+ update_object_maps();
+}
+
+template <typename I>
+void CopyupRequest<I>::deep_copy() {
+ auto cct = m_image_ctx->cct;
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+ ceph_assert(m_image_ctx->parent != nullptr);
+
+ m_lock.lock();
+ m_deep_copied = true;
+ m_flatten = is_copyup_required() ? true : m_image_ctx->migration_info.flatten;
+ m_lock.unlock();
+
+ ldout(cct, 20) << "flatten=" << m_flatten << dendl;
+
+ uint32_t flags = deep_copy::OBJECT_COPY_REQUEST_FLAG_MIGRATION;
+ if (m_flatten) {
+ flags |= deep_copy::OBJECT_COPY_REQUEST_FLAG_FLATTEN;
+ }
+
+ auto ctx = librbd::util::create_context_callback<
+ CopyupRequest<I>, &CopyupRequest<I>::handle_deep_copy>(this);
+ auto req = deep_copy::ObjectCopyRequest<I>::create(
+ m_image_ctx->parent, m_image_ctx, 0, 0,
+ m_image_ctx->migration_info.snap_map, m_object_no, flags, nullptr, ctx);
+
+ req->send();
+}
+
+template <typename I>
+void CopyupRequest<I>::handle_deep_copy(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_image_ctx->image_lock.lock_shared();
+ m_lock.lock();
+ m_copyup_required = is_copyup_required();
+ if (r == -ENOENT && !m_flatten && m_copyup_required) {
+ m_lock.unlock();
+ m_image_ctx->image_lock.unlock_shared();
+
+ ldout(cct, 10) << "restart deep-copy with flatten" << dendl;
+ send();
+ return;
+ }
+
+ disable_append_requests();
+
+ if (r < 0 && r != -ENOENT) {
+ m_lock.unlock();
+ m_image_ctx->image_lock.unlock_shared();
+
+ lderr(cct) << "error encountered during deep-copy: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (!m_copyup_required && !is_update_object_map_required(r)) {
+ m_lock.unlock();
+ m_image_ctx->image_lock.unlock_shared();
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+
+ ldout(cct, 20) << "skipping" << dendl;
+ finish(r);
+ return;
+ }
+
+ // For deep-copy, copyup() will never affect snapshots. However,
+ // this state machine is responsible for updating object maps for
+  // snapshots that have been created on the destination image after
+  // the migration started.
+ if (r != -ENOENT) {
+ compute_deep_copy_snap_ids();
+ }
+
+ m_lock.unlock();
+ m_image_ctx->image_lock.unlock_shared();
+
+ update_object_maps();
+}
+
+template <typename I>
+void CopyupRequest<I>::update_object_maps() {
+ std::shared_lock owner_locker{m_image_ctx->owner_lock};
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ if (m_image_ctx->object_map == nullptr) {
+ image_locker.unlock();
+ owner_locker.unlock();
+
+ copyup();
+ return;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ bool copy_on_read = m_pending_requests.empty();
+ uint8_t head_object_map_state = OBJECT_EXISTS;
+ if (copy_on_read && !m_snap_ids.empty() &&
+ m_image_ctx->test_features(RBD_FEATURE_FAST_DIFF,
+ m_image_ctx->image_lock)) {
+ // HEAD is non-dirty since data is tied to first snapshot
+ head_object_map_state = OBJECT_EXISTS_CLEAN;
+ }
+
+ auto r_it = m_pending_requests.rbegin();
+ if (r_it != m_pending_requests.rend()) {
+ // last write-op determines the final object map state
+ head_object_map_state = (*r_it)->get_pre_write_object_map_state();
+ }
+
+ if ((*m_image_ctx->object_map)[m_object_no] != head_object_map_state) {
+ // (maybe) need to update the HEAD object map state
+ m_snap_ids.push_back(CEPH_NOSNAP);
+ }
+ image_locker.unlock();
+
+ ceph_assert(m_image_ctx->exclusive_lock->is_lock_owner());
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_UpdateObjectMap<I>>(),
+ boost::lambda::_1, m_image_ctx, m_object_no, head_object_map_state,
+ &m_snap_ids, m_first_snap_is_clean, m_trace, boost::lambda::_2));
+ auto ctx = librbd::util::create_context_callback<
+ CopyupRequest<I>, &CopyupRequest<I>::handle_update_object_maps>(this);
+ auto throttle = new AsyncObjectThrottle<I>(
+ nullptr, *m_image_ctx, context_factory, ctx, nullptr, 0, m_snap_ids.size());
+ throttle->start_ops(
+ m_image_ctx->config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+void CopyupRequest<I>::handle_update_object_maps(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_image_ctx->cct) << "failed to update object map: "
+ << cpp_strerror(r) << dendl;
+
+ finish(r);
+ return;
+ }
+
+ copyup();
+}
+
+template <typename I>
+void CopyupRequest<I>::copyup() {
+ auto cct = m_image_ctx->cct;
+ m_image_ctx->image_lock.lock_shared();
+ auto snapc = m_image_ctx->snapc;
+ auto io_context = m_image_ctx->get_data_io_context();
+ m_image_ctx->image_lock.unlock_shared();
+
+ m_lock.lock();
+ if (!m_copyup_required) {
+ m_lock.unlock();
+
+ ldout(cct, 20) << "skipping copyup" << dendl;
+ finish(0);
+ return;
+ }
+
+ ldout(cct, 20) << dendl;
+
+ bool copy_on_read = m_pending_requests.empty() && !m_deep_copied;
+ bool deep_copyup = !snapc.snaps.empty() && !m_copyup_is_zero;
+ if (m_copyup_is_zero) {
+ m_copyup_data.clear();
+ m_copyup_extent_map.clear();
+ }
+
+ neorados::WriteOp copyup_op;
+ neorados::WriteOp write_op;
+ neorados::WriteOp* op;
+ if (copy_on_read || deep_copyup) {
+ // copyup-op will use its own request issued to the initial object revision
+ op = &copyup_op;
+ ++m_pending_copyups;
+ } else {
+ // copyup-op can be combined with the write-ops (if any)
+ op = &write_op;
+ }
+
+ if (m_image_ctx->enable_sparse_copyup) {
+ cls_client::sparse_copyup(op, m_copyup_extent_map, m_copyup_data);
+ } else {
+ // convert the sparse read back into a standard (thick) read
+ Striper::StripedReadResult destriper;
+ destriper.add_partial_sparse_result(
+ cct, std::move(m_copyup_data), m_copyup_extent_map, 0,
+ {{0, m_image_ctx->layout.object_size}});
+
+ bufferlist thick_bl;
+ destriper.assemble_result(cct, thick_bl, false);
+ cls_client::copyup(op, thick_bl);
+ }
+ ObjectRequest<I>::add_write_hint(*m_image_ctx, op);
+
+ if (!copy_on_read) {
+ // merge all pending write ops into this single RADOS op
+ for (auto req : m_pending_requests) {
+ ldout(cct, 20) << "add_copyup_ops " << req << dendl;
+ req->add_copyup_ops(&write_op);
+ }
+
+ if (write_op.size() > 0) {
+ ++m_pending_copyups;
+ }
+ }
+ m_lock.unlock();
+
+ // issue librados ops at the end to simplify test cases
+ auto object = neorados::Object{data_object_name(m_image_ctx, m_object_no)};
+ if (copyup_op.size() > 0) {
+ // send only the copyup request with a blank snapshot context so that
+ // all snapshots are detected from the parent for this object. If
+ // this is a CoW request, a second request will be created for the
+ // actual modification.
+ ldout(cct, 20) << "copyup with empty snapshot context" << dendl;
+
+ auto copyup_io_context = *io_context;
+ copyup_io_context.write_snap_context({});
+
+ m_image_ctx->rados_api.execute(
+ object, copyup_io_context, std::move(copyup_op),
+ librbd::asio::util::get_callback_adapter(
+ [this](int r) { handle_copyup(r); }), nullptr,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+ }
+
+ if (write_op.size() > 0) {
+ // compare-and-write doesn't add any write ops (copyup+cmpext+write
+ // can't be executed in the same RADOS op because, unless the object
+ // was already present in the clone, cmpext wouldn't see it)
+ ldout(cct, 20) << (!deep_copyup && write_op.size() > 2 ?
+ "copyup + ops" : !deep_copyup ? "copyup" : "ops")
+ << " with current snapshot context" << dendl;
+
+ m_image_ctx->rados_api.execute(
+ object, *io_context, std::move(write_op),
+ librbd::asio::util::get_callback_adapter(
+ [this](int r) { handle_copyup(r); }), nullptr,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+ }
+}
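+
+// Scenario note (hedged): for a plain copy-on-write with no snapshots,
+// copyup_op stays empty and the copyup is merged into write_op as a
+// single RADOS op; for copy-on-read or when snapshots exist
+// (deep_copyup), a dedicated copyup_op is issued first with an empty
+// snapshot context so the OSD clones the object for all snapshots.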
+
+template <typename I>
+void CopyupRequest<I>::handle_copyup(int r) {
+ auto cct = m_image_ctx->cct;
+ unsigned pending_copyups;
+ int copyup_ret_val = r;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_pending_copyups > 0);
+ pending_copyups = --m_pending_copyups;
+ if (m_copyup_ret_val < 0) {
+ copyup_ret_val = m_copyup_ret_val;
+ } else if (r < 0) {
+ m_copyup_ret_val = r;
+ }
+ }
+
+ ldout(cct, 20) << "r=" << r << ", "
+ << "pending=" << pending_copyups << dendl;
+
+ if (pending_copyups == 0) {
+ if (copyup_ret_val < 0 && copyup_ret_val != -ENOENT) {
+ lderr(cct) << "failed to copyup object: " << cpp_strerror(copyup_ret_val)
+ << dendl;
+ complete_requests(false, copyup_ret_val);
+ }
+
+ finish(0);
+ }
+}
+
+template <typename I>
+void CopyupRequest<I>::finish(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ complete_requests(true, r);
+ delete this;
+}
+
+template <typename I>
+void CopyupRequest<I>::complete_requests(bool override_restart_retval, int r) {
+ auto cct = m_image_ctx->cct;
+ remove_from_list();
+
+ while (!m_pending_requests.empty()) {
+ auto it = m_pending_requests.begin();
+ auto req = *it;
+ ldout(cct, 20) << "completing request " << req << dendl;
+ req->handle_copyup(r);
+ m_pending_requests.erase(it);
+ }
+
+ if (override_restart_retval) {
+ r = -ERESTART;
+ }
+
+ while (!m_restart_requests.empty()) {
+ auto it = m_restart_requests.begin();
+ auto req = *it;
+ ldout(cct, 20) << "restarting request " << req << dendl;
+ req->handle_copyup(r);
+ m_restart_requests.erase(it);
+ }
+}
+
+template <typename I>
+void CopyupRequest<I>::disable_append_requests() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ m_append_request_permitted = false;
+}
+
+template <typename I>
+void CopyupRequest<I>::remove_from_list() {
+ std::lock_guard copyup_list_locker{m_image_ctx->copyup_list_lock};
+
+ auto it = m_image_ctx->copyup_list.find(m_object_no);
+ if (it != m_image_ctx->copyup_list.end()) {
+ m_image_ctx->copyup_list.erase(it);
+ }
+}
+
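+// The copyup can be skipped only when the parent read returned all zeroes
+// *and* every pending write op is an empty (no-op) write; under
+// copy-on-read there are no pending writes, so the copyup always proceeds.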
+template <typename I>
+bool CopyupRequest<I>::is_copyup_required() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ bool copy_on_read = m_pending_requests.empty();
+ if (copy_on_read) {
+ // always force a copyup if CoR enabled
+ return true;
+ }
+
+ if (!m_copyup_is_zero) {
+ return true;
+ }
+
+ for (auto req : m_pending_requests) {
+ if (!req->is_empty_write_op()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template <typename I>
+bool CopyupRequest<I>::is_deep_copy() const {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+ return !m_image_ctx->migration_info.empty();
+}
+
+template <typename I>
+bool CopyupRequest<I>::is_update_object_map_required(int r) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+
+ if (r < 0) {
+ return false;
+ }
+
+ if (m_image_ctx->object_map == nullptr) {
+ return false;
+ }
+
+ if (m_image_ctx->migration_info.empty()) {
+ // migration might have completed while IO was in-flight,
+ // assume worst-case and perform an object map update
+ return true;
+ }
+
+ auto it = m_image_ctx->migration_info.snap_map.find(CEPH_NOSNAP);
+ ceph_assert(it != m_image_ctx->migration_info.snap_map.end());
+ return it->second[0] != CEPH_NOSNAP;
+}
+
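+// Note: ImageCtx::snaps is ordered newest-first, so the reverse iteration
+// below builds m_snap_ids oldest-first.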
+template <typename I>
+void CopyupRequest<I>::compute_deep_copy_snap_ids() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+
+  // skip ids for snaps that were already updated by the object deep-copy
+  // or whose extents don't overlap the parent
+ std::set<uint64_t> deep_copied;
+ for (auto &it : m_image_ctx->migration_info.snap_map) {
+ if (it.first != CEPH_NOSNAP) {
+ deep_copied.insert(it.second.front());
+ }
+ }
+ ldout(m_image_ctx->cct, 15) << "deep_copied=" << deep_copied << dendl;
+
+ std::copy_if(m_image_ctx->snaps.rbegin(), m_image_ctx->snaps.rend(),
+ std::back_inserter(m_snap_ids),
+ [this, cct=m_image_ctx->cct, &deep_copied](uint64_t snap_id) {
+ if (deep_copied.count(snap_id)) {
+ m_first_snap_is_clean = true;
+ return false;
+ }
+
+ uint64_t parent_overlap = 0;
+ int r = m_image_ctx->get_parent_overlap(snap_id, &parent_overlap);
+ if (r < 0) {
+ ldout(cct, 5) << "failed getting parent overlap for snap_id: "
+ << snap_id << ": " << cpp_strerror(r) << dendl;
+ }
+ if (parent_overlap == 0) {
+ return false;
+ }
+ std::vector<std::pair<uint64_t, uint64_t>> extents;
+ util::extent_to_file(m_image_ctx, m_object_no, 0,
+ m_image_ctx->layout.object_size, extents);
+ auto overlap = m_image_ctx->prune_parent_extents(
+ extents, parent_overlap);
+ return overlap > 0;
+ });
+}
+
+template <typename I>
+void CopyupRequest<I>::convert_copyup_extent_map() {
+ auto cct = m_image_ctx->cct;
+
+ Extents image_extent_map;
+ image_extent_map.swap(m_copyup_extent_map);
+ m_copyup_extent_map.reserve(image_extent_map.size());
+
+ // convert the image-extent extent map to object-extents
+ for (auto [image_offset, image_length] : image_extent_map) {
+ striper::LightweightObjectExtents object_extents;
+ util::file_to_extents(
+ m_image_ctx, image_offset, image_length, 0, &object_extents);
+ for (auto& object_extent : object_extents) {
+ m_copyup_extent_map.emplace_back(
+ object_extent.offset, object_extent.length);
+ }
+ }
+
+ ldout(cct, 20) << "image_extents=" << image_extent_map << ", "
+ << "object_extents=" << m_copyup_extent_map << dendl;
+}
+
+template <typename I>
+int CopyupRequest<I>::prepare_copyup_data() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+ auto cct = m_image_ctx->cct;
+
+ SnapshotSparseBufferlist snapshot_sparse_bufferlist;
+ auto& sparse_bufferlist = snapshot_sparse_bufferlist[0];
+
+ bool copy_on_read = m_pending_requests.empty();
+ bool maybe_deep_copyup = !m_image_ctx->snapc.snaps.empty();
+ if (copy_on_read || maybe_deep_copyup) {
+    // stand-alone copyup that will not be overwritten until the HEAD revision
+ ldout(cct, 20) << "processing full copy-up" << dendl;
+
+ uint64_t buffer_offset = 0;
+ for (auto [object_offset, object_length] : m_copyup_extent_map) {
+ bufferlist sub_bl;
+ sub_bl.substr_of(m_copyup_data, buffer_offset, object_length);
+ buffer_offset += object_length;
+
+ sparse_bufferlist.insert(
+ object_offset, object_length,
+ {SPARSE_EXTENT_STATE_DATA, object_length, std::move(sub_bl)});
+ }
+ } else {
+    // copyup that will be written to the HEAD revision concurrently with
+    // the associated write-ops, so only process the partial extents
+ uint64_t buffer_offset = 0;
+ for (auto [object_offset, object_length] : m_copyup_extent_map) {
+ interval_set<uint64_t> copyup_object_extents;
+ copyup_object_extents.insert(object_offset, object_length);
+
+ interval_set<uint64_t> intersection;
+ intersection.intersection_of(copyup_object_extents,
+ m_write_object_extents);
+
+ // extract only portions of the parent copyup data that have not
+ // been overwritten by write-ops
+ copyup_object_extents.subtract(intersection);
+ for (auto [copyup_offset, copyup_length] : copyup_object_extents) {
+ bufferlist sub_bl;
+ sub_bl.substr_of(
+ m_copyup_data, buffer_offset + (copyup_offset - object_offset),
+ copyup_length);
+ ceph_assert(sub_bl.length() == copyup_length);
+
+ sparse_bufferlist.insert(
+ copyup_offset, copyup_length,
+ {SPARSE_EXTENT_STATE_DATA, copyup_length, std::move(sub_bl)});
+ }
+ buffer_offset += object_length;
+ }
+
+ ldout(cct, 20) << "processing partial copy-up: " << sparse_bufferlist
+ << dendl;
+ }
+
+ // Let dispatch layers have a chance to process the data
+ auto r = m_image_ctx->io_object_dispatcher->prepare_copyup(
+ m_object_no, &snapshot_sparse_bufferlist);
+ if (r < 0) {
+ return r;
+ }
+
+  // Convert the sparse extents back to an extent map
+ m_copyup_data.clear();
+ m_copyup_extent_map.clear();
+ m_copyup_extent_map.reserve(sparse_bufferlist.ext_count());
+ for (auto& extent : sparse_bufferlist) {
+ auto& sbe = extent.get_val();
+ if (sbe.state == SPARSE_EXTENT_STATE_DATA) {
+ m_copyup_extent_map.emplace_back(extent.get_off(), extent.get_len());
+ m_copyup_data.append(sbe.bl);
+ }
+ }
+
+ return 0;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::CopyupRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/CopyupRequest.h b/src/librbd/io/CopyupRequest.h
new file mode 100644
index 000000000..a6a20294c
--- /dev/null
+++ b/src/librbd/io/CopyupRequest.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_COPYUP_REQUEST_H
+#define CEPH_LIBRBD_IO_COPYUP_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/interval_set.h"
+#include "common/ceph_mutex.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/Types.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace ZTracer { struct Trace; }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+template <typename I> class AbstractObjectWriteRequest;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class CopyupRequest {
+public:
+ static CopyupRequest* create(ImageCtxT *ictx, uint64_t objectno,
+ Extents &&image_extents,
+ const ZTracer::Trace &parent_trace) {
+ return new CopyupRequest(ictx, objectno, std::move(image_extents),
+ parent_trace);
+ }
+
+ CopyupRequest(ImageCtxT *ictx, uint64_t objectno, Extents &&image_extents,
+ const ZTracer::Trace &parent_trace);
+ ~CopyupRequest();
+
+ void append_request(AbstractObjectWriteRequest<ImageCtxT> *req,
+ const Extents& object_extents);
+
+ void send();
+
+private:
+ /**
+ * Copyup requests go through the following state machine to read from the
+ * parent image, update the object map, and copyup the object:
+ *
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * /---------/ \---------\
+ * | |
+ * v v
+ * READ_FROM_PARENT DEEP_COPY
+ * | |
+ * \---------\ /---------/
+ * |
+ * v (skip if not needed)
+ * UPDATE_OBJECT_MAPS
+ * |
+ * v (skip if not needed)
+ * COPYUP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+   * The UPDATE_OBJECT_MAPS state is skipped if the object map isn't
+   * enabled or if an object map update isn't required. The COPYUP state
+   * is skipped if no data was read from the parent *and* there are no
+   * additional ops.
+ */
+
+ typedef std::vector<AbstractObjectWriteRequest<ImageCtxT> *> WriteRequests;
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_object_no;
+ Extents m_image_extents;
+ ZTracer::Trace m_trace;
+
+ bool m_flatten = false;
+ bool m_copyup_required = true;
+ bool m_copyup_is_zero = true;
+ bool m_deep_copied = false;
+
+ Extents m_copyup_extent_map;
+ ceph::bufferlist m_copyup_data;
+
+ AsyncOperation m_async_op;
+
+ std::vector<uint64_t> m_snap_ids;
+ bool m_first_snap_is_clean = false;
+
+ ceph::mutex m_lock = ceph::make_mutex("CopyupRequest", false);
+ WriteRequests m_pending_requests;
+ unsigned m_pending_copyups = 0;
+ int m_copyup_ret_val = 0;
+
+ WriteRequests m_restart_requests;
+ bool m_append_request_permitted = true;
+
+ interval_set<uint64_t> m_write_object_extents;
+
+ void read_from_parent();
+ void handle_read_from_parent(int r);
+
+ void deep_copy();
+ void handle_deep_copy(int r);
+
+ void update_object_maps();
+ void handle_update_object_maps(int r);
+
+ void copyup();
+ void handle_copyup(int r);
+
+ void finish(int r);
+ void complete_requests(bool override_restart_retval, int r);
+
+ void disable_append_requests();
+ void remove_from_list();
+
+ bool is_copyup_required();
+ bool is_update_object_map_required(int r);
+ bool is_deep_copy() const;
+
+ void compute_deep_copy_snap_ids();
+ void convert_copyup_extent_map();
+ int prepare_copyup_data();
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::CopyupRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_COPYUP_REQUEST_H
diff --git a/src/librbd/io/Dispatcher.h b/src/librbd/io/Dispatcher.h
new file mode 100644
index 000000000..cb64e11b2
--- /dev/null
+++ b/src/librbd/io/Dispatcher.h
@@ -0,0 +1,252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_DISPATCHER_H
+#define CEPH_LIBRBD_IO_DISPATCHER_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "common/ceph_mutex.h"
+#include "common/dout.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/Utils.h"
+#include "librbd/io/DispatcherInterface.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::Dispatcher: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename ImageCtxT, typename DispatchInterfaceT>
+class Dispatcher : public DispatchInterfaceT {
+public:
+ typedef typename DispatchInterfaceT::Dispatch Dispatch;
+ typedef typename DispatchInterfaceT::DispatchLayer DispatchLayer;
+ typedef typename DispatchInterfaceT::DispatchSpec DispatchSpec;
+
+ Dispatcher(ImageCtxT* image_ctx)
+ : m_image_ctx(image_ctx),
+ m_lock(ceph::make_shared_mutex(
+ librbd::util::unique_lock_name("librbd::io::Dispatcher::lock",
+ this))) {
+ }
+
+ virtual ~Dispatcher() {
+ ceph_assert(m_dispatches.empty());
+ }
+
+ void shut_down(Context* on_finish) override {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ std::map<DispatchLayer, DispatchMeta> dispatches;
+ {
+ std::unique_lock locker{m_lock};
+ std::swap(dispatches, m_dispatches);
+ }
+
+ for (auto it : dispatches) {
+ shut_down_dispatch(it.second, &on_finish);
+ }
+ on_finish->complete(0);
+ }
+
+ void register_dispatch(Dispatch* dispatch) override {
+ auto cct = m_image_ctx->cct;
+ auto type = dispatch->get_dispatch_layer();
+ ldout(cct, 5) << "dispatch_layer=" << type << dendl;
+
+ std::unique_lock locker{m_lock};
+
+ auto result = m_dispatches.insert(
+ {type, {dispatch, new AsyncOpTracker()}});
+ ceph_assert(result.second);
+ }
+
+ bool exists(DispatchLayer dispatch_layer) override {
+ std::unique_lock locker{m_lock};
+ return m_dispatches.find(dispatch_layer) != m_dispatches.end();
+ }
+
+ void shut_down_dispatch(DispatchLayer dispatch_layer,
+ Context* on_finish) override {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "dispatch_layer=" << dispatch_layer << dendl;
+
+ DispatchMeta dispatch_meta;
+ {
+ std::unique_lock locker{m_lock};
+ auto it = m_dispatches.find(dispatch_layer);
+ if (it == m_dispatches.end()) {
+ on_finish->complete(0);
+ return;
+ }
+
+ dispatch_meta = it->second;
+ m_dispatches.erase(it);
+ }
+
+ shut_down_dispatch(dispatch_meta, &on_finish);
+ on_finish->complete(0);
+ }
+
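+  // Walks the dispatch layers in ascending order, starting just above the
+  // spec's current layer. A layer whose send_dispatch() returns true has
+  // taken ownership of the IO and will resume the walk by re-invoking
+  // send() through the dispatcher context; returning false falls through
+  // to the next layer.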
+ void send(DispatchSpec* dispatch_spec) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "dispatch_spec=" << dispatch_spec << dendl;
+
+ auto dispatch_layer = dispatch_spec->dispatch_layer;
+
+ // apply the IO request to all layers -- this method will be re-invoked
+ // by the dispatch layer if continuing / restarting the IO
+ while (true) {
+ m_lock.lock_shared();
+ dispatch_layer = dispatch_spec->dispatch_layer;
+ auto it = m_dispatches.upper_bound(dispatch_layer);
+ if (it == m_dispatches.end()) {
+ // the request is complete if handled by all layers
+ dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE;
+ m_lock.unlock_shared();
+ break;
+ }
+
+ auto& dispatch_meta = it->second;
+ auto dispatch = dispatch_meta.dispatch;
+ auto async_op_tracker = dispatch_meta.async_op_tracker;
+ dispatch_spec->dispatch_result = DISPATCH_RESULT_INVALID;
+
+ // prevent recursive locking back into the dispatcher while handling IO
+ async_op_tracker->start_op();
+ m_lock.unlock_shared();
+
+ // advance to next layer in case we skip or continue
+ dispatch_spec->dispatch_layer = dispatch->get_dispatch_layer();
+
+ bool handled = send_dispatch(dispatch, dispatch_spec);
+ async_op_tracker->finish_op();
+
+ // handled ops will resume when the dispatch ctx is invoked
+ if (handled) {
+ return;
+ }
+ }
+
+ // skipped through to the last layer
+ dispatch_spec->dispatcher_ctx.complete(0);
+ }
+
+protected:
+ struct DispatchMeta {
+ Dispatch* dispatch = nullptr;
+ AsyncOpTracker* async_op_tracker = nullptr;
+
+ DispatchMeta() {
+ }
+ DispatchMeta(Dispatch* dispatch, AsyncOpTracker* async_op_tracker)
+ : dispatch(dispatch), async_op_tracker(async_op_tracker) {
+ }
+ };
+
+ ImageCtxT* m_image_ctx;
+
+ ceph::shared_mutex m_lock;
+ std::map<DispatchLayer, DispatchMeta> m_dispatches;
+
+ virtual bool send_dispatch(Dispatch* dispatch,
+ DispatchSpec* dispatch_spec) = 0;
+
+protected:
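+  // Walks the dispatch layers above start_layer, invoking execute() on
+  // each until one takes ownership (returns true); that layer's completion
+  // re-enters complete() to resume the walk. Used to broadcast maintenance
+  // calls such as invalidate_cache() to every layer.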
+ struct C_LayerIterator : public Context {
+ Dispatcher* dispatcher;
+ Context* on_finish;
+ DispatchLayer dispatch_layer;
+
+ C_LayerIterator(Dispatcher* dispatcher,
+ DispatchLayer start_layer,
+ Context* on_finish)
+ : dispatcher(dispatcher), on_finish(on_finish), dispatch_layer(start_layer) {
+ }
+
+ void complete(int r) override {
+ while (true) {
+ dispatcher->m_lock.lock_shared();
+ auto it = dispatcher->m_dispatches.upper_bound(dispatch_layer);
+ if (it == dispatcher->m_dispatches.end()) {
+ dispatcher->m_lock.unlock_shared();
+ Context::complete(r);
+ return;
+ }
+
+ auto& dispatch_meta = it->second;
+ auto dispatch = dispatch_meta.dispatch;
+
+ // prevent recursive locking back into the dispatcher while handling IO
+ dispatch_meta.async_op_tracker->start_op();
+ dispatcher->m_lock.unlock_shared();
+
+ // next loop should start after current layer
+ dispatch_layer = dispatch->get_dispatch_layer();
+
+ auto handled = execute(dispatch, this);
+ dispatch_meta.async_op_tracker->finish_op();
+
+ if (handled) {
+ break;
+ }
+ }
+ }
+
+ void finish(int r) override {
+ on_finish->complete(0);
+ }
+ virtual bool execute(Dispatch* dispatch,
+ Context* on_finish) = 0;
+ };
+
+ struct C_InvalidateCache : public C_LayerIterator {
+ C_InvalidateCache(Dispatcher* dispatcher, DispatchLayer start_layer, Context* on_finish)
+ : C_LayerIterator(dispatcher, start_layer, on_finish) {
+ }
+
+ bool execute(Dispatch* dispatch,
+ Context* on_finish) override {
+ return dispatch->invalidate_cache(on_finish);
+ }
+ };
+
+private:
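+  // Chains the shutdown of a single dispatch layer onto *on_finish. The
+  // contexts are built innermost-first, so on completion the order is:
+  // wait for in-flight ops tracked by the AsyncOpTracker, shut down the
+  // dispatch layer, then delete the dispatch and tracker before invoking
+  // the original callback.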
+ void shut_down_dispatch(DispatchMeta& dispatch_meta,
+ Context** on_finish) {
+ auto dispatch = dispatch_meta.dispatch;
+ auto async_op_tracker = dispatch_meta.async_op_tracker;
+
+ auto ctx = *on_finish;
+ ctx = new LambdaContext(
+ [dispatch, async_op_tracker, ctx](int r) {
+ delete dispatch;
+ delete async_op_tracker;
+
+ ctx->complete(r);
+ });
+ ctx = new LambdaContext([dispatch, ctx](int r) {
+ dispatch->shut_down(ctx);
+ });
+ *on_finish = new LambdaContext([async_op_tracker, ctx](int r) {
+ async_op_tracker->wait_for_ops(ctx);
+ });
+ }
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#undef dout_subsys
+#undef dout_prefix
+#define dout_prefix *_dout
+
+#endif // CEPH_LIBRBD_IO_DISPATCHER_H
diff --git a/src/librbd/io/DispatcherInterface.h b/src/librbd/io/DispatcherInterface.h
new file mode 100644
index 000000000..2bac9ee75
--- /dev/null
+++ b/src/librbd/io/DispatcherInterface.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H
+#define CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H
+
+#include "include/int_types.h"
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+template <typename DispatchT>
+struct DispatcherInterface {
+public:
+ typedef DispatchT Dispatch;
+ typedef typename DispatchT::DispatchLayer DispatchLayer;
+ typedef typename DispatchT::DispatchSpec DispatchSpec;
+
+ virtual ~DispatcherInterface() {
+ }
+
+ virtual void shut_down(Context* on_finish) = 0;
+
+ virtual void register_dispatch(Dispatch* dispatch) = 0;
+ virtual bool exists(DispatchLayer dispatch_layer) = 0;
+ virtual void shut_down_dispatch(DispatchLayer dispatch_layer,
+ Context* on_finish) = 0;
+
+ virtual void send(DispatchSpec* dispatch_spec) = 0;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H
diff --git a/src/librbd/io/FlushTracker.cc b/src/librbd/io/FlushTracker.cc
new file mode 100644
index 000000000..b6e2ed658
--- /dev/null
+++ b/src/librbd/io/FlushTracker.cc
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/FlushTracker.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::FlushTracker: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+FlushTracker<I>::FlushTracker(I* image_ctx)
+ : m_image_ctx(image_ctx),
+ m_lock(ceph::make_shared_mutex(
+ util::unique_lock_name("librbd::io::FlushTracker::m_lock", this))) {
+}
+
+template <typename I>
+FlushTracker<I>::~FlushTracker() {
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_flush_contexts.empty());
+}
+
+template <typename I>
+void FlushTracker<I>::shut_down() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ std::unique_lock locker{m_lock};
+ Contexts flush_ctxs;
+ for (auto& [flush_tid, ctxs] : m_flush_contexts) {
+ flush_ctxs.insert(flush_ctxs.end(), ctxs.begin(), ctxs.end());
+ }
+ m_flush_contexts.clear();
+ locker.unlock();
+
+ for (auto ctx : flush_ctxs) {
+ ctx->complete(0);
+ }
+}
+
+template <typename I>
+uint64_t FlushTracker<I>::start_io(uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+
+ std::unique_lock locker{m_lock};
+ auto [it, inserted] = m_tid_to_flush_tid.insert({tid, ++m_next_flush_tid});
+ auto flush_tid = it->second;
+ m_in_flight_flush_tids.insert(flush_tid);
+ locker.unlock();
+
+ ldout(cct, 20) << "tid=" << tid << ", flush_tid=" << flush_tid << dendl;
+ return flush_tid;
+}
+
+template <typename I>
+void FlushTracker<I>::finish_io(uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+
+ std::unique_lock locker{m_lock};
+ auto tid_to_flush_tid_it = m_tid_to_flush_tid.find(tid);
+ if (tid_to_flush_tid_it == m_tid_to_flush_tid.end()) {
+ return;
+ }
+
+ auto flush_tid = tid_to_flush_tid_it->second;
+ m_tid_to_flush_tid.erase(tid_to_flush_tid_it);
+ m_in_flight_flush_tids.erase(flush_tid);
+
+ ldout(cct, 20) << "tid=" << tid << ", flush_tid=" << flush_tid << dendl;
+ auto oldest_flush_tid = std::numeric_limits<uint64_t>::max();
+ if (!m_in_flight_flush_tids.empty()) {
+ oldest_flush_tid = *m_in_flight_flush_tids.begin();
+ }
+
+ // all flushes tagged before the oldest tid should be completed
+ Contexts flush_ctxs;
+ auto flush_contexts_it = m_flush_contexts.begin();
+ while (flush_contexts_it != m_flush_contexts.end()) {
+ if (flush_contexts_it->first >= oldest_flush_tid) {
+ ldout(cct, 20) << "pending IOs: [" << m_in_flight_flush_tids << "], "
+ << "pending flushes=" << m_flush_contexts << dendl;
+ break;
+ }
+
+ auto& ctxs = flush_contexts_it->second;
+ flush_ctxs.insert(flush_ctxs.end(), ctxs.begin(), ctxs.end());
+ flush_contexts_it = m_flush_contexts.erase(flush_contexts_it);
+ }
+ locker.unlock();
+
+ if (!flush_ctxs.empty()) {
+ ldout(cct, 20) << "completing flushes: " << flush_ctxs << dendl;
+ for (auto ctx : flush_ctxs) {
+ ctx->complete(0);
+ }
+ }
+}
+
+template <typename I>
+void FlushTracker<I>::flush(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+
+ std::unique_lock locker{m_lock};
+ if (m_in_flight_flush_tids.empty()) {
+ locker.unlock();
+ on_finish->complete(0);
+ return;
+ }
+
+ auto flush_tid = *m_in_flight_flush_tids.rbegin();
+ m_flush_contexts[flush_tid].push_back(on_finish);
+ ldout(cct, 20) << "flush_tid=" << flush_tid << ", ctx=" << on_finish << ", "
+ << "flush_contexts=" << m_flush_contexts << dendl;
+}
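+
+// Typical usage (sketch; `ictx`, `tid` and `on_flush` are caller-provided):
+//
+//   FlushTracker<ImageCtx> tracker(ictx);
+//   tracker.start_io(tid);    // tag a write as in-flight
+//   tracker.flush(on_flush);  // fires once all in-flight writes finish
+//   tracker.finish_io(tid);   // completes flushes waiting on this write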
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::FlushTracker<librbd::ImageCtx>;
diff --git a/src/librbd/io/FlushTracker.h b/src/librbd/io/FlushTracker.h
new file mode 100644
index 000000000..cc7fcd9ae
--- /dev/null
+++ b/src/librbd/io/FlushTracker.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_FLUSH_TRACKER_H
+#define CEPH_LIBRBD_IO_FLUSH_TRACKER_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include <atomic>
+#include <list>
+#include <map>
+#include <set>
+#include <unordered_map>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+template <typename ImageCtxT>
+class FlushTracker {
+public:
+ FlushTracker(ImageCtxT* image_ctx);
+ ~FlushTracker();
+
+ void shut_down();
+
+ uint64_t start_io(uint64_t tid);
+ void finish_io(uint64_t tid);
+
+ void flush(Context* on_finish);
+
+private:
+ typedef std::list<Context*> Contexts;
+ typedef std::map<uint64_t, Contexts> FlushContexts;
+ typedef std::set<uint64_t> Tids;
+ typedef std::unordered_map<uint64_t, uint64_t> TidToFlushTid;
+
+ ImageCtxT* m_image_ctx;
+
+ std::atomic<uint32_t> m_next_flush_tid{0};
+
+ mutable ceph::shared_mutex m_lock;
+ TidToFlushTid m_tid_to_flush_tid;
+
+ Tids m_in_flight_flush_tids;
+ FlushContexts m_flush_contexts;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::FlushTracker<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_FLUSH_TRACKER_H
diff --git a/src/librbd/io/ImageDispatch.cc b/src/librbd/io/ImageDispatch.cc
new file mode 100644
index 000000000..cc8519abe
--- /dev/null
+++ b/src/librbd/io/ImageDispatch.cc
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageDispatch.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageDispatch: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+void start_in_flight_io(AioCompletion* aio_comp) {
+ // TODO remove AsyncOperation from AioCompletion
+ if (!aio_comp->async_op.started()) {
+ aio_comp->start_op();
+ }
+}
+
+} // anonymous namespace
+
+template <typename I>
+void ImageDispatch<I>::shut_down(Context* on_finish) {
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool ImageDispatch<I>::read(
+ AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_read(
+ m_image_ctx, aio_comp, std::move(image_extents), std::move(read_result),
+ io_context, op_flags, read_flags, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_write(
+ m_image_ctx, aio_comp, std::move(image_extents), std::move(bl),
+ io_context, op_flags, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_discard(
+ m_image_ctx, aio_comp, std::move(image_extents), discard_granularity_bytes,
+ io_context, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_writesame(
+ m_image_ctx, aio_comp, std::move(image_extents), std::move(bl),
+ io_context, op_flags, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_compare_and_write(
+ m_image_ctx, aio_comp, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, io_context, op_flags, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_flush(m_image_ctx, aio_comp, flush_source, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageListSnapsRequest<I> req(
+ *m_image_ctx, aio_comp, std::move(image_extents), std::move(snap_ids),
+ list_snaps_flags, snapshot_delta, parent_trace);
+ req.send();
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::invalidate_cache(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ std::shared_lock owner_lock{m_image_ctx->owner_lock};
+ m_image_ctx->io_object_dispatcher->invalidate_cache(on_finish);
+ return true;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageDispatch.h b/src/librbd/io/ImageDispatch.h
new file mode 100644
index 000000000..3d302e9a6
--- /dev/null
+++ b/src/librbd/io/ImageDispatch.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+template <typename ImageCtxT>
+class ImageDispatch : public ImageDispatchInterface {
+public:
+ ImageDispatch(ImageCtxT* image_ctx) : m_image_ctx(image_ctx) {
+ }
+
+ ImageDispatchLayer get_dispatch_layer() const override {
+ return IMAGE_DISPATCH_LAYER_CORE;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool invalidate_cache(Context* on_finish) override;
+
+private:
+ ImageCtxT* m_image_ctx;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/ImageDispatchInterface.h b/src/librbd/io/ImageDispatchInterface.h
new file mode 100644
index 000000000..64cea8612
--- /dev/null
+++ b/src/librbd/io/ImageDispatchInterface.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include <atomic>
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+struct AioCompletion;
+struct ImageDispatchSpec;
+
+struct ImageDispatchInterface {
+ typedef ImageDispatchLayer DispatchLayer;
+ typedef ImageDispatchSpec DispatchSpec;
+
+ virtual ~ImageDispatchInterface() {
+ }
+
+ virtual ImageDispatchLayer get_dispatch_layer() const = 0;
+
+ virtual void shut_down(Context* on_finish) = 0;
+
+ virtual bool read(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+ virtual bool write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+ virtual bool discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes,
+ IOContext io_context, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+ virtual bool write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+ virtual bool compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+ virtual bool flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool invalidate_cache(Context* on_finish) = 0;
+
+ virtual void remap_extents(Extents& image_extents,
+ ImageExtentsMapType type) {}
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H
diff --git a/src/librbd/io/ImageDispatchSpec.cc b/src/librbd/io/ImageDispatchSpec.cc
new file mode 100644
index 000000000..95d8224ae
--- /dev/null
+++ b/src/librbd/io/ImageDispatchSpec.cc
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include <boost/variant.hpp>
+
+namespace librbd {
+namespace io {
+
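+// Invoked when a dispatch layer finishes with this IO: RESTART rewinds one
+// layer and falls through to CONTINUE; CONTINUE re-queues the spec (or
+// fails it if the layer reported an error); COMPLETE destroys the spec via
+// finish().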
+void ImageDispatchSpec::C_Dispatcher::complete(int r) {
+ switch (image_dispatch_spec->dispatch_result) {
+ case DISPATCH_RESULT_RESTART:
+ ceph_assert(image_dispatch_spec->dispatch_layer != 0);
+ image_dispatch_spec->dispatch_layer = static_cast<ImageDispatchLayer>(
+ image_dispatch_spec->dispatch_layer - 1);
+ [[fallthrough]];
+ case DISPATCH_RESULT_CONTINUE:
+ if (r < 0) {
+ // bubble dispatch failure through AioCompletion
+ image_dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE;
+ image_dispatch_spec->fail(r);
+ return;
+ }
+
+ image_dispatch_spec->send();
+ break;
+ case DISPATCH_RESULT_COMPLETE:
+ finish(r);
+ break;
+ case DISPATCH_RESULT_INVALID:
+ ceph_abort();
+ break;
+ }
+}
+
+void ImageDispatchSpec::C_Dispatcher::finish(int r) {
+ delete image_dispatch_spec;
+}
+
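+// The spec holds a reference on its AioCompletion (taken in the
+// constructor, released in the destructor) and deletes itself from
+// C_Dispatcher::finish() once a layer reports DISPATCH_RESULT_COMPLETE.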
+void ImageDispatchSpec::send() {
+ image_dispatcher->send(this);
+}
+
+void ImageDispatchSpec::fail(int r) {
+ dispatch_result = DISPATCH_RESULT_COMPLETE;
+ aio_comp->fail(r);
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/ImageDispatchSpec.h b/src/librbd/io/ImageDispatchSpec.h
new file mode 100644
index 000000000..ee95f21be
--- /dev/null
+++ b/src/librbd/io/ImageDispatchSpec.h
@@ -0,0 +1,243 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ReadResult.h"
+#include <boost/variant/variant.hpp>
+#include <atomic>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+struct ImageDispatcherInterface;
+
+class ImageDispatchSpec {
+private:
+  // helper to avoid an extra heap allocation per image IO
+ struct C_Dispatcher : public Context {
+ ImageDispatchSpec* image_dispatch_spec;
+
+ C_Dispatcher(ImageDispatchSpec* image_dispatch_spec)
+ : image_dispatch_spec(image_dispatch_spec) {
+ }
+
+ void complete(int r) override;
+ void finish(int r) override;
+ };
+
+public:
+ struct Read {
+ ReadResult read_result;
+ int read_flags;
+
+ Read(ReadResult &&read_result, int read_flags)
+ : read_result(std::move(read_result)), read_flags(read_flags) {
+ }
+ };
+
+ struct Discard {
+ uint32_t discard_granularity_bytes;
+
+ Discard(uint32_t discard_granularity_bytes)
+ : discard_granularity_bytes(discard_granularity_bytes) {
+ }
+ };
+
+ struct Write {
+ bufferlist bl;
+
+ Write(bufferlist&& bl) : bl(std::move(bl)) {
+ }
+ };
+
+ struct WriteSame {
+ bufferlist bl;
+
+ WriteSame(bufferlist&& bl) : bl(std::move(bl)) {
+ }
+ };
+
+ struct CompareAndWrite {
+ bufferlist cmp_bl;
+ bufferlist bl;
+ uint64_t *mismatch_offset;
+
+ CompareAndWrite(bufferlist&& cmp_bl, bufferlist&& bl,
+ uint64_t *mismatch_offset)
+ : cmp_bl(std::move(cmp_bl)), bl(std::move(bl)),
+ mismatch_offset(mismatch_offset) {
+ }
+ };
+
+ struct Flush {
+ FlushSource flush_source;
+
+ Flush(FlushSource flush_source) : flush_source(flush_source) {
+ }
+ };
+
+ struct ListSnaps {
+ SnapIds snap_ids;
+ int list_snaps_flags;
+ SnapshotDelta* snapshot_delta;
+
+ ListSnaps(SnapIds&& snap_ids, int list_snaps_flags,
+ SnapshotDelta* snapshot_delta)
+ : snap_ids(std::move(snap_ids)), list_snaps_flags(list_snaps_flags),
+ snapshot_delta(snapshot_delta) {
+ }
+ };
+
+ typedef boost::variant<Read,
+ Discard,
+ Write,
+ WriteSame,
+ CompareAndWrite,
+ Flush,
+ ListSnaps> Request;
+
+ C_Dispatcher dispatcher_ctx;
+
+ ImageDispatcherInterface* image_dispatcher;
+ ImageDispatchLayer dispatch_layer;
+ std::atomic<uint32_t> image_dispatch_flags = 0;
+ DispatchResult dispatch_result = DISPATCH_RESULT_INVALID;
+
+ AioCompletion* aio_comp;
+ Extents image_extents;
+ Request request;
+ IOContext io_context;
+ int op_flags;
+ ZTracer::Trace parent_trace;
+ uint64_t tid = 0;
+
+ template <typename ImageCtxT = ImageCtx>
+ static ImageDispatchSpec* create_read(
+ ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+ AioCompletion *aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace) {
+ return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+ image_dispatch_layer, aio_comp,
+ std::move(image_extents),
+ Read{std::move(read_result), read_flags},
+ io_context, op_flags, parent_trace);
+ }
+
+ template <typename ImageCtxT = ImageCtx>
+ static ImageDispatchSpec* create_discard(
+ ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+ AioCompletion *aio_comp, uint64_t off, uint64_t len,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace) {
+ return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+ image_dispatch_layer, aio_comp, {{off, len}},
+ Discard{discard_granularity_bytes},
+ io_context, 0, parent_trace);
+ }
+
+ template <typename ImageCtxT = ImageCtx>
+ static ImageDispatchSpec* create_write(
+ ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+ AioCompletion *aio_comp, Extents &&image_extents,
+ bufferlist &&bl, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+ image_dispatch_layer, aio_comp,
+ std::move(image_extents), Write{std::move(bl)},
+ io_context, op_flags, parent_trace);
+ }
+
+ template <typename ImageCtxT = ImageCtx>
+ static ImageDispatchSpec* create_write_same(
+ ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+ AioCompletion *aio_comp, uint64_t off, uint64_t len,
+ bufferlist &&bl, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+ image_dispatch_layer, aio_comp,
+ {{off, len}}, WriteSame{std::move(bl)},
+ io_context, op_flags, parent_trace);
+ }
+
+ template <typename ImageCtxT = ImageCtx>
+ static ImageDispatchSpec* create_compare_and_write(
+ ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+ AioCompletion *aio_comp, Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace) {
+ return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+ image_dispatch_layer, aio_comp,
+ std::move(image_extents),
+ CompareAndWrite{std::move(cmp_bl),
+ std::move(bl),
+ mismatch_offset},
+ io_context, op_flags, parent_trace);
+ }
+
+ template <typename ImageCtxT = ImageCtx>
+ static ImageDispatchSpec* create_flush(
+ ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+ AioCompletion *aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace) {
+ return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+ image_dispatch_layer, aio_comp, {},
+ Flush{flush_source}, {}, 0, parent_trace);
+ }
+
+ template <typename ImageCtxT = ImageCtx>
+ static ImageDispatchSpec* create_list_snaps(
+ ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+ AioCompletion *aio_comp, Extents &&image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace) {
+ return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+ image_dispatch_layer, aio_comp,
+ std::move(image_extents),
+ ListSnaps{std::move(snap_ids),
+ list_snaps_flags, snapshot_delta},
+ {}, 0, parent_trace);
+ }
+
+ ~ImageDispatchSpec() {
+ aio_comp->put();
+ }
+
+ void send();
+ void fail(int r);
+
+private:
+ struct SendVisitor;
+ struct IsWriteOpVisitor;
+ struct TokenRequestedVisitor;
+
+ ImageDispatchSpec(ImageDispatcherInterface* image_dispatcher,
+ ImageDispatchLayer image_dispatch_layer,
+ AioCompletion* aio_comp, Extents&& image_extents,
+ Request&& request, IOContext io_context, int op_flags,
+ const ZTracer::Trace& parent_trace)
+ : dispatcher_ctx(this), image_dispatcher(image_dispatcher),
+ dispatch_layer(image_dispatch_layer), aio_comp(aio_comp),
+ image_extents(std::move(image_extents)), request(std::move(request)),
+ io_context(io_context), op_flags(op_flags), parent_trace(parent_trace) {
+ ceph_assert(aio_comp->image_dispatcher_ctx == nullptr);
+ aio_comp->image_dispatcher_ctx = &dispatcher_ctx;
+ aio_comp->get();
+ }
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
diff --git a/src/librbd/io/ImageDispatcher.cc b/src/librbd/io/ImageDispatcher.cc
new file mode 100644
index 000000000..7060f8328
--- /dev/null
+++ b/src/librbd/io/ImageDispatcher.cc
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageDispatcher.h"
+#include "include/Context.h"
+#include "common/AsyncOpTracker.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/ImageDispatch.h"
+#include "librbd/io/ImageDispatchInterface.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/QueueImageDispatch.h"
+#include "librbd/io/QosImageDispatch.h"
+#include "librbd/io/RefreshImageDispatch.h"
+#include "librbd/io/Utils.h"
+#include "librbd/io/WriteBlockImageDispatch.h"
+#include <boost/variant.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageDispatcher: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+struct ImageDispatcher<I>::SendVisitor : public boost::static_visitor<bool> {
+ ImageDispatchInterface* image_dispatch;
+ ImageDispatchSpec* image_dispatch_spec;
+
+ SendVisitor(ImageDispatchInterface* image_dispatch,
+ ImageDispatchSpec* image_dispatch_spec)
+ : image_dispatch(image_dispatch),
+ image_dispatch_spec(image_dispatch_spec) {
+ }
+
+ bool operator()(ImageDispatchSpec::Read& read) const {
+ return image_dispatch->read(
+ image_dispatch_spec->aio_comp,
+ std::move(image_dispatch_spec->image_extents),
+ std::move(read.read_result), image_dispatch_spec->io_context,
+ image_dispatch_spec->op_flags, read.read_flags,
+ image_dispatch_spec->parent_trace, image_dispatch_spec->tid,
+ &image_dispatch_spec->image_dispatch_flags,
+ &image_dispatch_spec->dispatch_result,
+ &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+ &image_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ImageDispatchSpec::Discard& discard) const {
+ return image_dispatch->discard(
+ image_dispatch_spec->aio_comp,
+ std::move(image_dispatch_spec->image_extents),
+ discard.discard_granularity_bytes, image_dispatch_spec->io_context,
+ image_dispatch_spec->parent_trace, image_dispatch_spec->tid,
+ &image_dispatch_spec->image_dispatch_flags,
+ &image_dispatch_spec->dispatch_result,
+ &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+ &image_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ImageDispatchSpec::Write& write) const {
+ return image_dispatch->write(
+ image_dispatch_spec->aio_comp,
+ std::move(image_dispatch_spec->image_extents), std::move(write.bl),
+ image_dispatch_spec->io_context, image_dispatch_spec->op_flags,
+ image_dispatch_spec->parent_trace, image_dispatch_spec->tid,
+ &image_dispatch_spec->image_dispatch_flags,
+ &image_dispatch_spec->dispatch_result,
+ &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+ &image_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ImageDispatchSpec::WriteSame& write_same) const {
+ return image_dispatch->write_same(
+ image_dispatch_spec->aio_comp,
+ std::move(image_dispatch_spec->image_extents), std::move(write_same.bl),
+ image_dispatch_spec->io_context, image_dispatch_spec->op_flags,
+ image_dispatch_spec->parent_trace, image_dispatch_spec->tid,
+ &image_dispatch_spec->image_dispatch_flags,
+ &image_dispatch_spec->dispatch_result,
+ &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+ &image_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(
+ ImageDispatchSpec::CompareAndWrite& compare_and_write) const {
+ return image_dispatch->compare_and_write(
+ image_dispatch_spec->aio_comp,
+ std::move(image_dispatch_spec->image_extents),
+ std::move(compare_and_write.cmp_bl), std::move(compare_and_write.bl),
+ compare_and_write.mismatch_offset, image_dispatch_spec->io_context,
+ image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace,
+ image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags,
+ &image_dispatch_spec->dispatch_result,
+ &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+ &image_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ImageDispatchSpec::Flush& flush) const {
+ return image_dispatch->flush(
+ image_dispatch_spec->aio_comp, flush.flush_source,
+ image_dispatch_spec->parent_trace, image_dispatch_spec->tid,
+ &image_dispatch_spec->image_dispatch_flags,
+ &image_dispatch_spec->dispatch_result,
+ &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+ &image_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ImageDispatchSpec::ListSnaps& list_snaps) const {
+ return image_dispatch->list_snaps(
+ image_dispatch_spec->aio_comp,
+ std::move(image_dispatch_spec->image_extents),
+ std::move(list_snaps.snap_ids), list_snaps.list_snaps_flags,
+ list_snaps.snapshot_delta, image_dispatch_spec->parent_trace,
+ image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags,
+ &image_dispatch_spec->dispatch_result,
+ &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+ &image_dispatch_spec->dispatcher_ctx);
+ }
+};
+
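+// Runs once per IO, when it is first assigned a tid and before any layer
+// sees it: clips the request to the image size (reads can opt out via
+// READ_FLAG_DISABLE_CLIPPING, list-snaps passes through untouched) and
+// fails writes with -EROFS when the image is a snapshot or opened
+// read-only.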
+template <typename I>
+struct ImageDispatcher<I>::PreprocessVisitor
+ : public boost::static_visitor<bool> {
+ ImageDispatcher<I>* image_dispatcher;
+ ImageDispatchSpec* image_dispatch_spec;
+
+ PreprocessVisitor(ImageDispatcher<I>* image_dispatcher,
+ ImageDispatchSpec* image_dispatch_spec)
+ : image_dispatcher(image_dispatcher),
+ image_dispatch_spec(image_dispatch_spec) {
+ }
+
+ bool clip_request() const {
+ int r = util::clip_request(image_dispatcher->m_image_ctx,
+ &image_dispatch_spec->image_extents);
+ if (r < 0) {
+ image_dispatch_spec->fail(r);
+ return true;
+ }
+ return false;
+ }
+
+ bool operator()(ImageDispatchSpec::Read& read) const {
+ if ((read.read_flags & READ_FLAG_DISABLE_CLIPPING) != 0) {
+ return false;
+ }
+ return clip_request();
+ }
+
+ bool operator()(ImageDispatchSpec::Flush&) const {
+ return clip_request();
+ }
+
+ bool operator()(ImageDispatchSpec::ListSnaps&) const {
+ return false;
+ }
+
+ template <typename T>
+ bool operator()(T&) const {
+ if (clip_request()) {
+ return true;
+ }
+
+ std::shared_lock image_locker{image_dispatcher->m_image_ctx->image_lock};
+ if (image_dispatcher->m_image_ctx->snap_id != CEPH_NOSNAP ||
+ image_dispatcher->m_image_ctx->read_only) {
+ image_dispatch_spec->fail(-EROFS);
+ return true;
+ }
+ return false;
+ }
+};
+
+template <typename I>
+ImageDispatcher<I>::ImageDispatcher(I* image_ctx)
+ : Dispatcher<I, ImageDispatcherInterface>(image_ctx) {
+ // configure the core image dispatch handler on startup
+ auto image_dispatch = new ImageDispatch(image_ctx);
+ this->register_dispatch(image_dispatch);
+
+ auto queue_image_dispatch = new QueueImageDispatch(image_ctx);
+ this->register_dispatch(queue_image_dispatch);
+
+ m_qos_image_dispatch = new QosImageDispatch<I>(image_ctx);
+ this->register_dispatch(m_qos_image_dispatch);
+
+ auto refresh_image_dispatch = new RefreshImageDispatch(image_ctx);
+ this->register_dispatch(refresh_image_dispatch);
+
+ m_write_block_dispatch = new WriteBlockImageDispatch<I>(image_ctx);
+ this->register_dispatch(m_write_block_dispatch);
+}
+
+template <typename I>
+void ImageDispatcher<I>::invalidate_cache(Context* on_finish) {
+ auto image_ctx = this->m_image_ctx;
+ auto cct = image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = new C_InvalidateCache(
+ this, IMAGE_DISPATCH_LAYER_NONE, on_finish);
+ ctx->complete(0);
+}
+
+template <typename I>
+void ImageDispatcher<I>::shut_down(Context* on_finish) {
+ // TODO ensure all IOs are executed via a dispatcher
+ // ensure read-ahead / copy-on-read ops are finished since they are
+ // currently outside dispatcher tracking
+ auto async_op = new AsyncOperation();
+
+ on_finish = new LambdaContext([async_op, on_finish](int r) {
+ async_op->finish_op();
+ delete async_op;
+ on_finish->complete(0);
+ });
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ Dispatcher<I, ImageDispatcherInterface>::shut_down(on_finish);
+ });
+ async_op->start_op(*this->m_image_ctx);
+ async_op->flush(on_finish);
+}
+
+template <typename I>
+void ImageDispatcher<I>::apply_qos_schedule_tick_min(uint64_t tick) {
+ m_qos_image_dispatch->apply_qos_schedule_tick_min(tick);
+}
+
+template <typename I>
+void ImageDispatcher<I>::apply_qos_limit(uint64_t flag, uint64_t limit,
+ uint64_t burst, uint64_t burst_seconds) {
+ m_qos_image_dispatch->apply_qos_limit(flag, limit, burst, burst_seconds);
+}
+
+template <typename I>
+bool ImageDispatcher<I>::writes_blocked() const {
+ return m_write_block_dispatch->writes_blocked();
+}
+
+template <typename I>
+int ImageDispatcher<I>::block_writes() {
+ return m_write_block_dispatch->block_writes();
+}
+
+template <typename I>
+void ImageDispatcher<I>::block_writes(Context *on_blocked) {
+ m_write_block_dispatch->block_writes(on_blocked);
+}
+
+template <typename I>
+void ImageDispatcher<I>::unblock_writes() {
+ m_write_block_dispatch->unblock_writes();
+}
+
+template <typename I>
+void ImageDispatcher<I>::wait_on_writes_unblocked(Context *on_unblocked) {
+ m_write_block_dispatch->wait_on_writes_unblocked(on_unblocked);
+}
+
+template <typename I>
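+// Dispatch layers may remap image extents (e.g. an encryption layer
+// shifting data past its on-disk header). Logical-to-physical applies the
+// layers in dispatch order; physical-to-logical walks them in reverse so
+// mappings are undone in the opposite order they were applied.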
+void ImageDispatcher<I>::remap_extents(Extents& image_extents,
+ ImageExtentsMapType type) {
+ auto loop = [&image_extents, type](auto begin, auto end) {
+ for (auto it = begin; it != end; ++it) {
+ auto& image_dispatch_meta = it->second;
+ auto image_dispatch = image_dispatch_meta.dispatch;
+ image_dispatch->remap_extents(image_extents, type);
+ }
+ };
+
+ std::shared_lock locker{this->m_lock};
+ if (type == IMAGE_EXTENTS_MAP_TYPE_LOGICAL_TO_PHYSICAL) {
+ loop(this->m_dispatches.cbegin(), this->m_dispatches.cend());
+ } else if (type == IMAGE_EXTENTS_MAP_TYPE_PHYSICAL_TO_LOGICAL) {
+ loop(this->m_dispatches.crbegin(), this->m_dispatches.crend());
+ }
+}
+
+template <typename I>
+bool ImageDispatcher<I>::send_dispatch(
+ ImageDispatchInterface* image_dispatch,
+ ImageDispatchSpec* image_dispatch_spec) {
+ if (image_dispatch_spec->tid == 0) {
+ image_dispatch_spec->tid = ++m_next_tid;
+
+ bool finished = preprocess(image_dispatch_spec);
+ if (finished) {
+ return true;
+ }
+ }
+
+ return boost::apply_visitor(
+ SendVisitor{image_dispatch, image_dispatch_spec},
+ image_dispatch_spec->request);
+}
+
+template <typename I>
+bool ImageDispatcher<I>::preprocess(
+ ImageDispatchSpec* image_dispatch_spec) {
+ return boost::apply_visitor(
+ PreprocessVisitor{this, image_dispatch_spec},
+ image_dispatch_spec->request);
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ImageDispatcher<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageDispatcher.h b/src/librbd/io/ImageDispatcher.h
new file mode 100644
index 000000000..c7ea56f73
--- /dev/null
+++ b/src/librbd/io/ImageDispatcher.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include "librbd/io/Dispatcher.h"
+#include "librbd/io/ImageDispatchInterface.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/Types.h"
+#include <atomic>
+#include <map>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+template <typename> struct QosImageDispatch;
+template <typename> struct WriteBlockImageDispatch;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageDispatcher : public Dispatcher<ImageCtxT, ImageDispatcherInterface> {
+public:
+ ImageDispatcher(ImageCtxT* image_ctx);
+
+ void invalidate_cache(Context* on_finish) override;
+
+ void shut_down(Context* on_finish) override;
+
+ void apply_qos_schedule_tick_min(uint64_t tick) override;
+ void apply_qos_limit(uint64_t flag, uint64_t limit, uint64_t burst,
+ uint64_t burst_seconds) override;
+
+ bool writes_blocked() const override;
+ int block_writes() override;
+ void block_writes(Context *on_blocked) override;
+
+ void unblock_writes() override;
+ void wait_on_writes_unblocked(Context *on_unblocked) override;
+
+ void remap_extents(Extents& image_extents,
+ ImageExtentsMapType type) override;
+
+protected:
+ bool send_dispatch(
+ ImageDispatchInterface* image_dispatch,
+ ImageDispatchSpec* image_dispatch_spec) override;
+
+private:
+ struct SendVisitor;
+ struct PreprocessVisitor;
+
+ using typename Dispatcher<ImageCtxT, ImageDispatcherInterface>::C_InvalidateCache;
+
+ std::atomic<uint64_t> m_next_tid{0};
+
+ QosImageDispatch<ImageCtxT>* m_qos_image_dispatch = nullptr;
+ WriteBlockImageDispatch<ImageCtxT>* m_write_block_dispatch = nullptr;
+
+ bool preprocess(ImageDispatchSpec* image_dispatch_spec);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ImageDispatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H
diff --git a/src/librbd/io/ImageDispatcherInterface.h b/src/librbd/io/ImageDispatcherInterface.h
new file mode 100644
index 000000000..7c25734f9
--- /dev/null
+++ b/src/librbd/io/ImageDispatcherInterface.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H
+
+#include "include/int_types.h"
+#include "librbd/io/DispatcherInterface.h"
+#include "librbd/io/ImageDispatchInterface.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+struct ImageDispatcherInterface
+ : public DispatcherInterface<ImageDispatchInterface> {
+public:
+ virtual void apply_qos_schedule_tick_min(uint64_t tick) = 0;
+ virtual void apply_qos_limit(uint64_t flag, uint64_t limit,
+ uint64_t burst, uint64_t burst_seconds) = 0;
+
+ virtual bool writes_blocked() const = 0;
+ virtual int block_writes() = 0;
+ virtual void block_writes(Context *on_blocked) = 0;
+
+ virtual void unblock_writes() = 0;
+ virtual void wait_on_writes_unblocked(Context *on_unblocked) = 0;
+
+ virtual void invalidate_cache(Context* on_finish) = 0;
+ virtual void remap_extents(Extents& image_extents,
+ ImageExtentsMapType type) = 0;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H
diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc
new file mode 100644
index 000000000..5f8d2f0da
--- /dev/null
+++ b/src/librbd/io/ImageRequest.cc
@@ -0,0 +1,881 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/Utils.h"
+#include "librbd/journal/Types.h"
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "osdc/Striper.h"
+#include <algorithm>
+#include <functional>
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageRequest: " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+using librbd::util::get_image_ctx;
+
+namespace {
+
+template <typename I>
+struct C_AssembleSnapshotDeltas : public C_AioRequest {
+ I* image_ctx;
+ SnapshotDelta* snapshot_delta;
+
+ ceph::mutex lock = ceph::make_mutex(
+ "librbd::io::C_AssembleSnapshotDeltas::lock", false);
+ std::map<uint64_t, SnapshotDelta> object_snapshot_delta;
+
+ C_AssembleSnapshotDeltas(I* image_ctx, AioCompletion* aio_comp,
+ SnapshotDelta* snapshot_delta)
+ : C_AioRequest(aio_comp),
+ image_ctx(image_ctx), snapshot_delta(snapshot_delta) {
+ }
+
+ SnapshotDelta* get_snapshot_delta(uint64_t object_no) {
+ std::unique_lock locker{lock};
+ return &object_snapshot_delta[object_no];
+ }
+
+ void finish(int r) override {
+ auto cct = image_ctx->cct;
+
+ if (r < 0) {
+ lderr(cct) << "C_AssembleSnapshotDeltas: list snaps failed: "
+ << cpp_strerror(r) << dendl;
+ C_AioRequest::finish(r);
+ return;
+ }
+
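+    // merge each object's delta into the assembled image-extent delta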
+ std::unique_lock locker{lock};
+ *snapshot_delta = {};
+ for (auto& [object_no, object_snapshot_delta] : object_snapshot_delta) {
+ SnapshotDelta image_snapshot_delta;
+ object_to_image_intervals(object_no, object_snapshot_delta,
+ &image_snapshot_delta, snapshot_delta);
+
+ ldout(cct, 20) << "object_no=" << object_no << ", "
+ << "object_snapshot_delta="
+ << object_snapshot_delta << ", "
+ << "image_snapshot_delta=" << image_snapshot_delta
+ << dendl;
+ }
+
+ ldout(cct, 20) << "snapshot_delta=" << *snapshot_delta << dendl;
+ C_AioRequest::finish(0);
+ }
+
+ void object_to_image_intervals(
+ uint64_t object_no, const SnapshotDelta& object_snapshot_delta,
+ SnapshotDelta* image_snapshot_delta,
+ SnapshotDelta* assembled_image_snapshot_delta) {
+ for (auto& [key, object_extents] : object_snapshot_delta) {
+ for (auto& object_extent : object_extents) {
+ Extents image_extents;
+ io::util::extent_to_file(image_ctx, object_no, object_extent.get_off(),
+ object_extent.get_len(), image_extents);
+
+ auto& intervals = (*image_snapshot_delta)[key];
+ auto& assembled_intervals = (*assembled_image_snapshot_delta)[key];
+ for (auto [image_offset, image_length] : image_extents) {
+ SparseExtent sparse_extent{object_extent.get_val().state,
+ image_length};
+ intervals.insert(image_offset, image_length, sparse_extent);
+ assembled_intervals.insert(image_offset, image_length,
+ sparse_extent);
+ }
+ }
+ }
+ }
+};
+
+template <typename I>
+struct C_RBD_Readahead : public Context {
+ I *ictx;
+ uint64_t object_no;
+ io::ReadExtents extents;
+
+ C_RBD_Readahead(I *ictx, uint64_t object_no, uint64_t offset, uint64_t length)
+ : ictx(ictx), object_no(object_no), extents({{offset, length}}) {
+ ictx->readahead.inc_pending();
+ }
+
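+  // the read payload is discarded on completion; the request exists to
+  // warm any caching layers along the object dispatch path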
+ void finish(int r) override {
+ ceph_assert(extents.size() == 1);
+ auto& extent = extents.front();
+ ldout(ictx->cct, 20) << "C_RBD_Readahead on "
+ << data_object_name(ictx, object_no) << ": "
+ << extent.offset << "~" << extent.length << dendl;
+ ictx->readahead.dec_pending();
+ }
+};
+
+template <typename I>
+void readahead(I *ictx, const Extents& image_extents, IOContext io_context) {
+ uint64_t total_bytes = 0;
+ for (auto& image_extent : image_extents) {
+ total_bytes += image_extent.second;
+ }
+
+ ictx->image_lock.lock_shared();
+ auto total_bytes_read = ictx->total_bytes_read.fetch_add(total_bytes);
+ bool abort = (
+ ictx->readahead_disable_after_bytes != 0 &&
+ total_bytes_read > ictx->readahead_disable_after_bytes);
+ if (abort) {
+ ictx->image_lock.unlock_shared();
+ return;
+ }
+
+ uint64_t image_size = ictx->get_effective_image_size(ictx->snap_id);
+ ictx->image_lock.unlock_shared();
+
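+  // ask the Readahead tracker whether recent access warrants prefetching
+  // another window, and how large it should be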
+ auto readahead_extent = ictx->readahead.update(image_extents, image_size);
+ uint64_t readahead_offset = readahead_extent.first;
+ uint64_t readahead_length = readahead_extent.second;
+
+ if (readahead_length > 0) {
+ ldout(ictx->cct, 20) << "(readahead logical) " << readahead_offset << "~"
+ << readahead_length << dendl;
+ LightweightObjectExtents readahead_object_extents;
+ io::util::file_to_extents(ictx, readahead_offset, readahead_length, 0,
+ &readahead_object_extents);
+ for (auto& object_extent : readahead_object_extents) {
+ ldout(ictx->cct, 20) << "(readahead) "
+ << data_object_name(ictx,
+ object_extent.object_no) << " "
+ << object_extent.offset << "~"
+ << object_extent.length << dendl;
+
+ auto req_comp = new C_RBD_Readahead<I>(ictx, object_extent.object_no,
+ object_extent.offset,
+ object_extent.length);
+ auto req = io::ObjectDispatchSpec::create_read(
+ ictx, io::OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+ &req_comp->extents, io_context, 0, 0, {}, nullptr, req_comp);
+ req->send();
+ }
+
+ ictx->perfcounter->inc(l_librbd_readahead);
+ ictx->perfcounter->inc(l_librbd_readahead_bytes, readahead_length);
+ }
+}
+
+template <typename I>
+struct C_UpdateTimestamp : public Context {
+public:
+ I& m_image_ctx;
+  bool m_modify; // if true, the modify timestamp is updated;
+                 // otherwise the access timestamp is updated
+ AsyncOperation m_async_op;
+
+ C_UpdateTimestamp(I& ictx, bool m) : m_image_ctx(ictx), m_modify(m) {
+ m_async_op.start_op(*get_image_ctx(&m_image_ctx));
+ }
+ ~C_UpdateTimestamp() override {
+ m_async_op.finish_op();
+ }
+
+ void send() {
+ librados::ObjectWriteOperation op;
+ if (m_modify) {
+ cls_client::set_modify_timestamp(&op);
+ } else {
+ cls_client::set_access_timestamp(&op);
+ }
+
+ auto comp = librbd::util::create_rados_callback(this);
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+
+ void finish(int r) override {
+ // ignore errors updating timestamp
+ }
+};
+
+bool should_update_timestamp(const utime_t& now, const utime_t& timestamp,
+ uint64_t interval) {
+ return (interval &&
+ (static_cast<uint64_t>(now.sec()) >= interval + timestamp));
+}
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageRequest: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+void ImageRequest<I>::aio_read(I *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context,
+ int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageReadRequest<I> req(*ictx, c, std::move(image_extents),
+ std::move(read_result), io_context, op_flags,
+ read_flags, parent_trace);
+ req.send();
+}
+
+template <typename I>
+void ImageRequest<I>::aio_write(I *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageWriteRequest<I> req(*ictx, c, std::move(image_extents), std::move(bl),
+ io_context, op_flags, parent_trace);
+ req.send();
+}
+
+template <typename I>
+void ImageRequest<I>::aio_discard(I *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ uint32_t discard_granularity_bytes,
+ IOContext io_context,
+ const ZTracer::Trace &parent_trace) {
+ ImageDiscardRequest<I> req(*ictx, c, std::move(image_extents),
+ discard_granularity_bytes, io_context,
+ parent_trace);
+ req.send();
+}
+
+template <typename I>
+void ImageRequest<I>::aio_flush(I *ictx, AioCompletion *c,
+ FlushSource flush_source,
+ const ZTracer::Trace &parent_trace) {
+ ImageFlushRequest<I> req(*ictx, c, flush_source, parent_trace);
+ req.send();
+}
+
+template <typename I>
+void ImageRequest<I>::aio_writesame(I *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ bufferlist &&bl, IOContext io_context,
+ int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageWriteSameRequest<I> req(*ictx, c, std::move(image_extents),
+ std::move(bl), io_context, op_flags,
+ parent_trace);
+ req.send();
+}
+
+template <typename I>
+void ImageRequest<I>::aio_compare_and_write(I *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ bufferlist &&cmp_bl,
+ bufferlist &&bl,
+ uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageCompareAndWriteRequest<I> req(*ictx, c, std::move(image_extents),
+ std::move(cmp_bl), std::move(bl),
+ mismatch_offset, io_context, op_flags,
+ parent_trace);
+ req.send();
+}
+
+template <typename I>
+void ImageRequest<I>::send() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(m_aio_comp->is_initialized(get_aio_type()));
+ ceph_assert(m_aio_comp->is_started());
+
+ CephContext *cct = image_ctx.cct;
+ AioCompletion *aio_comp = this->m_aio_comp;
+ ldout(cct, 20) << get_request_type() << ": ictx=" << &image_ctx << ", "
+ << "completion=" << aio_comp << dendl;
+
+ update_timestamp();
+ send_request();
+}
+
+template <typename I>
+void ImageRequest<I>::update_timestamp() {
+ bool modify = (get_aio_type() != AIO_TYPE_READ);
+ uint64_t update_interval;
+ if (modify) {
+ update_interval = m_image_ctx.mtime_update_interval;
+ } else {
+ update_interval = m_image_ctx.atime_update_interval;
+ }
+
+ if (update_interval == 0) {
+ return;
+ }
+
+ utime_t (I::*get_timestamp_fn)() const;
+ void (I::*set_timestamp_fn)(utime_t);
+ if (modify) {
+ get_timestamp_fn = &I::get_modify_timestamp;
+ set_timestamp_fn = &I::set_modify_timestamp;
+ } else {
+ get_timestamp_fn = &I::get_access_timestamp;
+ set_timestamp_fn = &I::set_access_timestamp;
+ }
+
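+  // double-checked locking: test under the shared lock first, then re-test
+  // under the exclusive lock before updating the in-memory timestamp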
+ utime_t ts = ceph_clock_now();
+ {
+ std::shared_lock timestamp_locker{m_image_ctx.timestamp_lock};
+ if(!should_update_timestamp(ts, std::invoke(get_timestamp_fn, m_image_ctx),
+ update_interval)) {
+ return;
+ }
+ }
+
+ {
+ std::unique_lock timestamp_locker{m_image_ctx.timestamp_lock};
+ bool update = should_update_timestamp(
+ ts, std::invoke(get_timestamp_fn, m_image_ctx), update_interval);
+ if (!update) {
+ return;
+ }
+
+ std::invoke(set_timestamp_fn, m_image_ctx, ts);
+ }
+
+ // TODO we fire and forget this outside the IO path to prevent
+ // potential race conditions with librbd client IO callbacks
+ // between different threads (e.g. librados and object cacher)
+ ldout(m_image_ctx.cct, 10) << get_request_type() << dendl;
+ auto req = new C_UpdateTimestamp<I>(m_image_ctx, modify);
+ req->send();
+}
+
+template <typename I>
+ImageReadRequest<I>::ImageReadRequest(I &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents,
+ ReadResult &&read_result,
+ IOContext io_context, int op_flags,
+ int read_flags,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents),
+ io_context, "read", parent_trace),
+ m_op_flags(op_flags), m_read_flags(read_flags) {
+ aio_comp->read_result = std::move(read_result);
+}
+
+template <typename I>
+void ImageReadRequest<I>::send_request() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ auto &image_extents = this->m_image_extents;
+ if (image_ctx.cache && image_ctx.readahead_max_bytes > 0 &&
+ !(m_op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) {
+ readahead(get_image_ctx(&image_ctx), image_extents, this->m_io_context);
+ }
+
+ // map image extents to object extents
+ LightweightObjectExtents object_extents;
+ uint64_t buffer_ofs = 0;
+ for (auto &extent : image_extents) {
+ if (extent.second == 0) {
+ continue;
+ }
+
+ util::file_to_extents(&image_ctx, extent.first, extent.second, buffer_ofs,
+ &object_extents);
+ buffer_ofs += extent.second;
+ }
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ aio_comp->read_result.set_image_extents(image_extents);
+
+ // issue the requests
+ aio_comp->set_request_count(object_extents.size());
+ for (auto &oe : object_extents) {
+ ldout(cct, 20) << data_object_name(&image_ctx, oe.object_no) << " "
+ << oe.offset << "~" << oe.length << " from "
+ << oe.buffer_extents << dendl;
+
+ auto req_comp = new io::ReadResult::C_ObjectReadRequest(
+ aio_comp, {{oe.offset, oe.length, std::move(oe.buffer_extents)}});
+ auto req = ObjectDispatchSpec::create_read(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, oe.object_no,
+ &req_comp->extents, this->m_io_context, m_op_flags, m_read_flags,
+ this->m_trace, nullptr, req_comp);
+ req->send();
+ }
+
+ image_ctx.perfcounter->inc(l_librbd_rd);
+ image_ctx.perfcounter->inc(l_librbd_rd_bytes, buffer_ofs);
+}
+
+template <typename I>
+void AbstractImageWriteRequest<I>::send_request() {
+ I &image_ctx = this->m_image_ctx;
+
+ bool journaling = false;
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ {
+    // prevent the image size from changing between computing the clip and
+    // recording the pending async operation
+ std::shared_lock image_locker{image_ctx.image_lock};
+ journaling = (image_ctx.journal != nullptr &&
+ image_ctx.journal->is_journal_appending());
+ }
+
+ uint64_t clip_len = 0;
+ LightweightObjectExtents object_extents;
+ for (auto &extent : this->m_image_extents) {
+ if (extent.second == 0) {
+ continue;
+ }
+
+ // map to object extents
+ io::util::file_to_extents(&image_ctx, extent.first, extent.second, clip_len,
+ &object_extents);
+ clip_len += extent.second;
+ }
+
+ int ret = prune_object_extents(&object_extents);
+ if (ret < 0) {
+ aio_comp->fail(ret);
+ return;
+ }
+
+ aio_comp->set_request_count(object_extents.size());
+ if (!object_extents.empty()) {
+ uint64_t journal_tid = 0;
+ if (journaling) {
+ // in-flight ops are flushed prior to closing the journal
+ ceph_assert(image_ctx.journal != NULL);
+ journal_tid = append_journal_event(m_synchronous);
+ }
+
+ send_object_requests(object_extents, this->m_io_context, journal_tid);
+ }
+
+ update_stats(clip_len);
+}
+
+template <typename I>
+void AbstractImageWriteRequest<I>::send_object_requests(
+ const LightweightObjectExtents &object_extents, IOContext io_context,
+ uint64_t journal_tid) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ bool single_extent = (object_extents.size() == 1);
+ for (auto& oe : object_extents) {
+ ldout(cct, 20) << data_object_name(&image_ctx, oe.object_no) << " "
+ << oe.offset << "~" << oe.length << " from "
+ << oe.buffer_extents << dendl;
+ C_AioRequest *req_comp = new C_AioRequest(aio_comp);
+ auto request = create_object_request(oe, io_context, journal_tid,
+ single_extent, req_comp);
+ request->send();
+ }
+}
+
+template <typename I>
+void ImageWriteRequest<I>::assemble_extent(
+ const LightweightObjectExtent &object_extent, bufferlist *bl) {
+ for (auto q = object_extent.buffer_extents.begin();
+ q != object_extent.buffer_extents.end(); ++q) {
+ bufferlist sub_bl;
+ sub_bl.substr_of(m_bl, q->first, q->second);
+ bl->claim_append(sub_bl);
+ }
+}
+
+template <typename I>
+uint64_t ImageWriteRequest<I>::append_journal_event(bool synchronous) {
+ I &image_ctx = this->m_image_ctx;
+
+ uint64_t tid = 0;
+ uint64_t buffer_offset = 0;
+ ceph_assert(!this->m_image_extents.empty());
+ for (auto &extent : this->m_image_extents) {
+ bufferlist sub_bl;
+ sub_bl.substr_of(m_bl, buffer_offset, extent.second);
+ buffer_offset += extent.second;
+
+ tid = image_ctx.journal->append_write_event(extent.first, extent.second,
+ sub_bl, synchronous);
+ }
+
+ return tid;
+}
+
+template <typename I>
+ObjectDispatchSpec *ImageWriteRequest<I>::create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) {
+ I &image_ctx = this->m_image_ctx;
+
+ bufferlist bl;
+ if (single_extent && object_extent.buffer_extents.size() == 1 &&
+ m_bl.length() == object_extent.length) {
+ // optimization for single object/buffer extent writes
+ bl = std::move(m_bl);
+ } else {
+ assemble_extent(object_extent, &bl);
+ }
+
+ auto req = ObjectDispatchSpec::create_write(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+ object_extent.offset, std::move(bl), io_context, m_op_flags, 0,
+ std::nullopt, journal_tid, this->m_trace, on_finish);
+ return req;
+}
+
+template <typename I>
+void ImageWriteRequest<I>::update_stats(size_t length) {
+ I &image_ctx = this->m_image_ctx;
+ image_ctx.perfcounter->inc(l_librbd_wr);
+ image_ctx.perfcounter->inc(l_librbd_wr_bytes, length);
+}
+
+template <typename I>
+uint64_t ImageDiscardRequest<I>::append_journal_event(bool synchronous) {
+ I &image_ctx = this->m_image_ctx;
+
+ uint64_t tid = 0;
+ ceph_assert(!this->m_image_extents.empty());
+ for (auto &extent : this->m_image_extents) {
+ journal::EventEntry event_entry(
+ journal::AioDiscardEvent(extent.first,
+ extent.second,
+ this->m_discard_granularity_bytes));
+ tid = image_ctx.journal->append_io_event(std::move(event_entry),
+ extent.first, extent.second,
+ synchronous, 0);
+ }
+
+ return tid;
+}
+
+template <typename I>
+ObjectDispatchSpec *ImageDiscardRequest<I>::create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) {
+ I &image_ctx = this->m_image_ctx;
+ auto req = ObjectDispatchSpec::create_discard(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+ object_extent.offset, object_extent.length, io_context,
+ OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE, journal_tid, this->m_trace,
+ on_finish);
+ return req;
+}
+
+template <typename I>
+void ImageDiscardRequest<I>::update_stats(size_t length) {
+ I &image_ctx = this->m_image_ctx;
+ image_ctx.perfcounter->inc(l_librbd_discard);
+ image_ctx.perfcounter->inc(l_librbd_discard_bytes, length);
+}
+
+template <typename I>
+int ImageDiscardRequest<I>::prune_object_extents(
+ LightweightObjectExtents* object_extents) const {
+ if (m_discard_granularity_bytes == 0) {
+ return 0;
+ }
+
+  // Align the range to the discard_granularity_bytes boundary and skip
+  // any discards that are too small to free up space.
+ //
+ // discard_granularity_bytes >= object_size && tail truncation
+ // is a special case for filestore
+ bool prune_required = false;
+ auto object_size = this->m_image_ctx.layout.object_size;
+ auto discard_granularity_bytes = std::min(m_discard_granularity_bytes,
+ object_size);
+ auto xform_lambda =
+ [discard_granularity_bytes, object_size, &prune_required]
+ (LightweightObjectExtent& object_extent) {
+ auto& offset = object_extent.offset;
+ auto& length = object_extent.length;
+ auto next_offset = offset + length;
+
+ if ((discard_granularity_bytes < object_size) ||
+ (next_offset < object_size)) {
+ offset = p2roundup<uint64_t>(offset, discard_granularity_bytes);
+ next_offset = p2align<uint64_t>(next_offset, discard_granularity_bytes);
+ if (offset >= next_offset) {
+ prune_required = true;
+ length = 0;
+ } else {
+ length = next_offset - offset;
+ }
+ }
+ };
+ std::for_each(object_extents->begin(), object_extents->end(),
+ xform_lambda);
+
+ if (prune_required) {
+ // one or more object extents were skipped
+ auto remove_lambda =
+ [](const LightweightObjectExtent& object_extent) {
+ return (object_extent.length == 0);
+ };
+ object_extents->erase(
+ std::remove_if(object_extents->begin(), object_extents->end(),
+ remove_lambda),
+ object_extents->end());
+ }
+ return 0;
+}
+
+template <typename I>
+void ImageFlushRequest<I>::send_request() {
+ I &image_ctx = this->m_image_ctx;
+
+ bool journaling = false;
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ journaling = (m_flush_source == FLUSH_SOURCE_USER &&
+ image_ctx.journal != nullptr &&
+ image_ctx.journal->is_journal_appending());
+ }
+
+ AioCompletion *aio_comp = this->m_aio_comp;
+ aio_comp->set_request_count(1);
+
+ Context *ctx = new C_AioRequest(aio_comp);
+
+ // ensure no locks are held when flush is complete
+ ctx = librbd::util::create_async_context_callback(image_ctx, ctx);
+
+ uint64_t journal_tid = 0;
+ if (journaling) {
+ // in-flight ops are flushed prior to closing the journal
+ ceph_assert(image_ctx.journal != NULL);
+ journal_tid = image_ctx.journal->append_io_event(
+ journal::EventEntry(journal::AioFlushEvent()), 0, 0, false, 0);
+ image_ctx.journal->user_flushed();
+ }
+
+ auto object_dispatch_spec = ObjectDispatchSpec::create_flush(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, m_flush_source, journal_tid,
+ this->m_trace, ctx);
+ ctx = new LambdaContext([object_dispatch_spec](int r) {
+ object_dispatch_spec->send();
+ });
+
+  // writeback flushes are dispatched immediately; all other flush sources
+  // wait for in-flight IOs to settle first
+ if (m_flush_source == FLUSH_SOURCE_WRITEBACK) {
+ ctx->complete(0);
+ } else {
+ aio_comp->async_op.flush(ctx);
+ }
+
+ // might be flushing during image shutdown
+ if (image_ctx.perfcounter != nullptr) {
+ image_ctx.perfcounter->inc(l_librbd_flush);
+ }
+}
+
+template <typename I>
+uint64_t ImageWriteSameRequest<I>::append_journal_event(bool synchronous) {
+ I &image_ctx = this->m_image_ctx;
+
+ uint64_t tid = 0;
+ ceph_assert(!this->m_image_extents.empty());
+ for (auto &extent : this->m_image_extents) {
+ journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first,
+ extent.second,
+ m_data_bl));
+ tid = image_ctx.journal->append_io_event(std::move(event_entry),
+ extent.first, extent.second,
+ synchronous, 0);
+ }
+
+ return tid;
+}
+
+template <typename I>
+ObjectDispatchSpec *ImageWriteSameRequest<I>::create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) {
+ I &image_ctx = this->m_image_ctx;
+
+ bufferlist bl;
+ ObjectDispatchSpec *req;
+
+ if (util::assemble_write_same_extent(object_extent, m_data_bl, &bl, false)) {
+ auto buffer_extents{object_extent.buffer_extents};
+
+ req = ObjectDispatchSpec::create_write_same(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+ object_extent.offset, object_extent.length, std::move(buffer_extents),
+ std::move(bl), io_context, m_op_flags, journal_tid,
+ this->m_trace, on_finish);
+ return req;
+ }
+ req = ObjectDispatchSpec::create_write(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+ object_extent.offset, std::move(bl), io_context, m_op_flags, 0,
+ std::nullopt, journal_tid, this->m_trace, on_finish);
+ return req;
+}
+
+template <typename I>
+void ImageWriteSameRequest<I>::update_stats(size_t length) {
+ I &image_ctx = this->m_image_ctx;
+ image_ctx.perfcounter->inc(l_librbd_ws);
+ image_ctx.perfcounter->inc(l_librbd_ws_bytes, length);
+}
+
+template <typename I>
+uint64_t ImageCompareAndWriteRequest<I>::append_journal_event(
+ bool synchronous) {
+ I &image_ctx = this->m_image_ctx;
+
+ uint64_t tid = 0;
+ ceph_assert(this->m_image_extents.size() == 1);
+ auto &extent = this->m_image_extents.front();
+ journal::EventEntry event_entry(
+ journal::AioCompareAndWriteEvent(extent.first, extent.second, m_cmp_bl,
+ m_bl));
+ tid = image_ctx.journal->append_io_event(std::move(event_entry),
+ extent.first, extent.second,
+ synchronous, -EILSEQ);
+
+ return tid;
+}
+
+template <typename I>
+void ImageCompareAndWriteRequest<I>::assemble_extent(
+ const LightweightObjectExtent &object_extent, bufferlist *bl) {
+ for (auto q = object_extent.buffer_extents.begin();
+ q != object_extent.buffer_extents.end(); ++q) {
+ bufferlist sub_bl;
+ sub_bl.substr_of(m_bl, q->first, q->second);
+ bl->claim_append(sub_bl);
+ }
+}
+
+template <typename I>
+ObjectDispatchSpec *ImageCompareAndWriteRequest<I>::create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) {
+ I &image_ctx = this->m_image_ctx;
+
+ // NOTE: safe to move m_cmp_bl since we only support this op against
+ // a single object
+ bufferlist bl;
+ assemble_extent(object_extent, &bl);
+ auto req = ObjectDispatchSpec::create_compare_and_write(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+ object_extent.offset, std::move(m_cmp_bl), std::move(bl), io_context,
+ m_mismatch_offset, m_op_flags, journal_tid, this->m_trace, on_finish);
+ return req;
+}
+
+template <typename I>
+void ImageCompareAndWriteRequest<I>::update_stats(size_t length) {
+ I &image_ctx = this->m_image_ctx;
+ image_ctx.perfcounter->inc(l_librbd_cmp);
+ image_ctx.perfcounter->inc(l_librbd_cmp_bytes, length);
+}
+
+template <typename I>
+int ImageCompareAndWriteRequest<I>::prune_object_extents(
+ LightweightObjectExtents* object_extents) const {
+ if (object_extents->size() > 1)
+ return -EINVAL;
+
+ I &image_ctx = this->m_image_ctx;
+ uint64_t sector_size = 512ULL;
+ uint64_t su = image_ctx.layout.stripe_unit;
+ auto& object_extent = object_extents->front();
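+  // reject any request that spans a 512-byte sector or stripe-unit boundary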
+ if (object_extent.offset % sector_size + object_extent.length > sector_size ||
+ (su != 0 && (object_extent.offset % su + object_extent.length > su)))
+ return -EINVAL;
+
+ return 0;
+}
+
+template <typename I>
+ImageListSnapsRequest<I>::ImageListSnapsRequest(
+ I& image_ctx, AioCompletion* aio_comp, Extents&& image_extents,
+ SnapIds&& snap_ids, int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace& parent_trace)
+ : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents),
+ image_ctx.get_data_io_context(), "list-snaps",
+ parent_trace),
+ m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags),
+ m_snapshot_delta(snapshot_delta) {
+}
+
+template <typename I>
+void ImageListSnapsRequest<I>::send_request() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ // map image extents to object extents
+ auto &image_extents = this->m_image_extents;
+ std::map<uint64_t, Extents> object_number_extents;
+ for (auto& image_extent : image_extents) {
+ if (image_extent.second == 0) {
+ continue;
+ }
+
+ striper::LightweightObjectExtents object_extents;
+ io::util::file_to_extents(&image_ctx, image_extent.first,
+ image_extent.second, 0, &object_extents);
+ for (auto& object_extent : object_extents) {
+ object_number_extents[object_extent.object_no].emplace_back(
+ object_extent.offset, object_extent.length);
+ }
+ }
+
+ // reassemble the deltas back into image-extents when complete
+ auto aio_comp = this->m_aio_comp;
+ aio_comp->set_request_count(1);
+ auto assemble_ctx = new C_AssembleSnapshotDeltas<I>(
+ &image_ctx, aio_comp, m_snapshot_delta);
+ auto sub_aio_comp = AioCompletion::create_and_start<
+ Context, &Context::complete>(assemble_ctx, get_image_ctx(&image_ctx),
+ AIO_TYPE_GENERIC);
+
+ // issue the requests
+ sub_aio_comp->set_request_count(object_number_extents.size());
+ for (auto& oe : object_number_extents) {
+ ldout(cct, 20) << data_object_name(&image_ctx, oe.first) << " "
+ << oe.second << dendl;
+ auto ctx = new C_AioRequest(sub_aio_comp);
+ auto req = ObjectDispatchSpec::create_list_snaps(
+ &image_ctx, OBJECT_DISPATCH_LAYER_NONE, oe.first, std::move(oe.second),
+ SnapIds{m_snap_ids}, m_list_snaps_flags, this->m_trace,
+ assemble_ctx->get_snapshot_delta(oe.first), ctx);
+ req->send();
+ }
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ImageRequest<librbd::ImageCtx>;
+template class librbd::io::ImageReadRequest<librbd::ImageCtx>;
+template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ImageWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>;
+template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
+template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ImageListSnapsRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageRequest.h b/src/librbd/io/ImageRequest.h
new file mode 100644
index 000000000..2c05c3847
--- /dev/null
+++ b/src/librbd/io/ImageRequest.h
@@ -0,0 +1,386 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_REQUEST_H
+#define CEPH_LIBRBD_IO_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "common/zipkin_trace.h"
+#include "osd/osd_types.h"
+#include "librbd/Utils.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <list>
+#include <utility>
+#include <vector>
+
+namespace librbd {
+class ImageCtx;
+
+namespace io {
+
+class AioCompletion;
+class ObjectDispatchSpec;
+class ReadResult;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageRequest {
+public:
+ typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+
+ virtual ~ImageRequest() {
+ m_trace.event("finish");
+ }
+
+ static void aio_read(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace);
+ static void aio_write(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace);
+ static void aio_discard(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ uint32_t discard_granularity_bytes,
+ IOContext io_context,
+ const ZTracer::Trace &parent_trace);
+ static void aio_flush(ImageCtxT *ictx, AioCompletion *c,
+ FlushSource flush_source,
+ const ZTracer::Trace &parent_trace);
+ static void aio_writesame(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace);
+
+ static void aio_compare_and_write(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents,
+ bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace);
+
+ void send();
+
+ inline const ZTracer::Trace &get_trace() const {
+ return m_trace;
+ }
+
+protected:
+ typedef std::list<ObjectDispatchSpec*> ObjectRequests;
+
+ ImageCtxT &m_image_ctx;
+ AioCompletion *m_aio_comp;
+ Extents m_image_extents;
+ IOContext m_io_context;
+ ZTracer::Trace m_trace;
+
+ ImageRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, IOContext io_context,
+ const char *trace_name,
+ const ZTracer::Trace &parent_trace)
+ : m_image_ctx(image_ctx), m_aio_comp(aio_comp),
+ m_image_extents(std::move(image_extents)), m_io_context(io_context),
+ m_trace(librbd::util::create_trace(image_ctx, trace_name, parent_trace)) {
+ m_trace.event("start");
+ }
+
+ virtual void update_timestamp();
+ virtual void send_request() = 0;
+
+ virtual aio_type_t get_aio_type() const = 0;
+ virtual const char *get_request_type() const = 0;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageReadRequest : public ImageRequest<ImageCtxT> {
+public:
+ using typename ImageRequest<ImageCtxT>::Extents;
+
+ ImageReadRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace);
+
+protected:
+ void send_request() override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_READ;
+ }
+ const char *get_request_type() const override {
+ return "aio_read";
+ }
+private:
+ int m_op_flags;
+ int m_read_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class AbstractImageWriteRequest : public ImageRequest<ImageCtxT> {
+public:
+ inline void flag_synchronous() {
+ m_synchronous = true;
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+ using typename ImageRequest<ImageCtxT>::Extents;
+
+ AbstractImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, IOContext io_context,
+ const char *trace_name,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<ImageCtxT>(image_ctx, aio_comp, std::move(image_extents),
+ io_context, trace_name, parent_trace),
+ m_synchronous(false) {
+ }
+
+ void send_request() override;
+
+ virtual int prune_object_extents(
+ LightweightObjectExtents* object_extents) const {
+ return 0;
+ }
+
+ void send_object_requests(const LightweightObjectExtents &object_extents,
+ IOContext io_context, uint64_t journal_tid);
+ virtual ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) = 0;
+
+ virtual uint64_t append_journal_event(bool synchronous) = 0;
+ virtual void update_stats(size_t length) = 0;
+
+private:
+ bool m_synchronous;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageWriteRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ using typename ImageRequest<ImageCtxT>::Extents;
+
+ ImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), io_context, "write",
+ parent_trace),
+ m_bl(std::move(bl)), m_op_flags(op_flags) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_WRITE;
+ }
+ const char *get_request_type() const override {
+ return "aio_write";
+ }
+
+ void assemble_extent(const LightweightObjectExtent &object_extent,
+ bufferlist *bl);
+
+ ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+private:
+ bufferlist m_bl;
+ int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageDiscardRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ ImageDiscardRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents&& image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), io_context, "discard",
+ parent_trace),
+ m_discard_granularity_bytes(discard_granularity_bytes) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_DISCARD;
+ }
+ const char *get_request_type() const override {
+ return "aio_discard";
+ }
+
+ ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+ int prune_object_extents(
+ LightweightObjectExtents* object_extents) const override;
+
+private:
+ uint32_t m_discard_granularity_bytes;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageFlushRequest : public ImageRequest<ImageCtxT> {
+public:
+ ImageFlushRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ FlushSource flush_source,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<ImageCtxT>(image_ctx, aio_comp, {}, {}, "flush",
+ parent_trace),
+ m_flush_source(flush_source) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ void update_timestamp() override {
+ }
+ void send_request() override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_FLUSH;
+ }
+ const char *get_request_type() const override {
+ return "aio_flush";
+ }
+
+private:
+ FlushSource m_flush_source;
+
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageWriteSameRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ ImageWriteSameRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents&& image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), io_context, "writesame",
+ parent_trace),
+ m_data_bl(std::move(bl)), m_op_flags(op_flags) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_WRITESAME;
+ }
+ const char *get_request_type() const override {
+ return "aio_writesame";
+ }
+
+ ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+private:
+ bufferlist m_data_bl;
+ int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageCompareAndWriteRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ ImageCompareAndWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), io_context,
+ "compare_and_write", parent_trace),
+ m_cmp_bl(std::move(cmp_bl)), m_bl(std::move(bl)),
+ m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+ }
+
+protected:
+ void assemble_extent(const LightweightObjectExtent &object_extent,
+ bufferlist *bl);
+
+ ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_COMPARE_AND_WRITE;
+ }
+ const char *get_request_type() const override {
+ return "aio_compare_and_write";
+ }
+
+ int prune_object_extents(
+ LightweightObjectExtents* object_extents) const override;
+
+private:
+ bufferlist m_cmp_bl;
+ bufferlist m_bl;
+ uint64_t *m_mismatch_offset;
+ int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageListSnapsRequest : public ImageRequest<ImageCtxT> {
+public:
+ using typename ImageRequest<ImageCtxT>::Extents;
+
+ ImageListSnapsRequest(
+ ImageCtxT& image_ctx, AioCompletion* aio_comp,
+ Extents&& image_extents, SnapIds&& snap_ids, int list_snaps_flags,
+ SnapshotDelta* snapshot_delta, const ZTracer::Trace& parent_trace);
+
+protected:
+ void update_timestamp() override {}
+ void send_request() override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_GENERIC;
+ }
+ const char *get_request_type() const override {
+ return "list-snaps";
+ }
+
+private:
+ SnapIds m_snap_ids;
+ int m_list_snaps_flags;
+ SnapshotDelta* m_snapshot_delta;
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ImageRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageReadRequest<librbd::ImageCtx>;
+extern template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageListSnapsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_H
diff --git a/src/librbd/io/ObjectDispatch.cc b/src/librbd/io/ObjectDispatch.cc
new file mode 100644
index 000000000..a31cc74ea
--- /dev/null
+++ b/src/librbd/io/ObjectDispatch.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatch.h"
+#include "common/dout.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ObjectRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+
+template <typename I>
+ObjectDispatch<I>::ObjectDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx) {
+}
+
+template <typename I>
+void ObjectDispatch<I>::shut_down(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
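+  // the core layer has no state to tear down; complete asynchronously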
+ m_image_ctx->asio_engine->post(on_finish, 0);
+}
+
+template <typename I>
+bool ObjectDispatch<I>::read(
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << " " << *extents << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectReadRequest<I>(m_image_ctx, object_no, extents,
+ io_context, op_flags, read_flags,
+ parent_trace, version, on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectDiscardRequest<I>(m_image_ctx, object_no, object_off,
+ object_len, io_context, discard_flags,
+ parent_trace, on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << data.length() << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectWriteRequest<I>(m_image_ctx, object_no, object_off,
+ std::move(data), io_context, op_flags,
+ write_flags, assert_version,
+ parent_trace, on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectWriteSameRequest<I>(m_image_ctx, object_no,
+ object_off, object_len,
+ std::move(data), io_context,
+ op_flags, parent_trace,
+ on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << write_data.length() << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectCompareAndWriteRequest<I>(m_image_ctx, object_no,
+ object_off,
+ std::move(cmp_data),
+ std::move(write_data),
+ io_context, mismatch_offset,
+ op_flags, parent_trace,
+ on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::list_snaps(
+ uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << "extents=" << extents << ", "
+ << "snap_ids=" << snap_ids << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = ObjectListSnapsRequest<I>::create(
+ m_image_ctx, object_no, std::move(extents), std::move(snap_ids),
+ list_snap_flags, parent_trace, snapshot_delta, on_dispatched);
+ req->send();
+ return true;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatch.h b/src/librbd/io/ObjectDispatch.h
new file mode 100644
index 000000000..dd1f7261d
--- /dev/null
+++ b/src/librbd/io/ObjectDispatch.h
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ObjectDispatch : public ObjectDispatchInterface {
+public:
+ ObjectDispatch(ImageCtxT* image_ctx);
+
+ ObjectDispatchLayer get_dispatch_layer() const override {
+ return OBJECT_DISPATCH_LAYER_CORE;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool flush(
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override {
+ return false;
+ }
+
+ bool list_snaps(
+ uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+ bool reset_existence_cache(Context* on_finish) override {
+ return false;
+ }
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) override {
+ }
+
+ int prepare_copyup(
+ uint64_t object_no,
+ SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override {
+ return 0;
+ }
+
+private:
+ ImageCtxT* m_image_ctx;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
diff --git a/src/librbd/io/ObjectDispatchInterface.h b/src/librbd/io/ObjectDispatchInterface.h
new file mode 100644
index 000000000..2e9dd1300
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchInterface.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+struct RWLock;
+
+namespace librbd {
+namespace io {
+
+struct AioCompletion;
+struct ObjectDispatchInterface;
+struct ObjectDispatchSpec;
+
+struct ObjectDispatchInterface {
+ typedef ObjectDispatchInterface Dispatch;
+ typedef ObjectDispatchLayer DispatchLayer;
+ typedef ObjectDispatchSpec DispatchSpec;
+
+ virtual ~ObjectDispatchInterface() {
+ }
+
+ virtual ObjectDispatchLayer get_dispatch_layer() const = 0;
+
+ virtual void shut_down(Context* on_finish) = 0;
+
+ virtual bool read(
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) = 0;
+
+ virtual bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) = 0;
+
+ virtual bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context**on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool flush(
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) = 0;
+
+ virtual bool list_snaps(
+ uint64_t object_no, Extents&& extents, SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool invalidate_cache(Context* on_finish) = 0;
+ virtual bool reset_existence_cache(Context* on_finish) = 0;
+
+ virtual void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) = 0;
+
+ virtual int prepare_copyup(
+ uint64_t object_no,
+ SnapshotSparseBufferlist* snapshot_sparse_bufferlist) = 0;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
diff --git a/src/librbd/io/ObjectDispatchSpec.cc b/src/librbd/io/ObjectDispatchSpec.cc
new file mode 100644
index 000000000..3efff9774
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchSpec.cc
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "include/Context.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include <boost/variant.hpp>
+
+namespace librbd {
+namespace io {
+
+void ObjectDispatchSpec::C_Dispatcher::complete(int r) {
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
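+  // the dispatch layer recorded its outcome in dispatch_result before
+  // completing this context: CONTINUE re-dispatches the spec so it resumes
+  // at the next layer, while COMPLETE finishes the request here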
+ switch (object_dispatch_spec->dispatch_result) {
+ case DISPATCH_RESULT_CONTINUE:
+ object_dispatch_spec->send();
+ break;
+ case DISPATCH_RESULT_COMPLETE:
+ finish(r);
+ break;
+ case DISPATCH_RESULT_INVALID:
+ case DISPATCH_RESULT_RESTART:
+ ceph_abort();
+ break;
+ }
+}
+
+void ObjectDispatchSpec::C_Dispatcher::finish(int r) {
+ on_finish->complete(r);
+ delete object_dispatch_spec;
+}
+
+void ObjectDispatchSpec::send() {
+ object_dispatcher->send(this);
+}
+
+void ObjectDispatchSpec::fail(int r) {
+ ceph_assert(r < 0);
+ dispatcher_ctx.complete(r);
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/ObjectDispatchSpec.h b/src/librbd/io/ObjectDispatchSpec.h
new file mode 100644
index 000000000..a0d4b49a4
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchSpec.h
@@ -0,0 +1,295 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <boost/variant/variant.hpp>
+
+namespace librbd {
+namespace io {
+
+struct ObjectDispatcherInterface;
+
+struct ObjectDispatchSpec {
+private:
+ // helper to avoid extra heap allocation per object IO
+ struct C_Dispatcher : public Context {
+ ObjectDispatchSpec* object_dispatch_spec;
+ Context* on_finish;
+
+ C_Dispatcher(ObjectDispatchSpec* object_dispatch_spec, Context* on_finish)
+ : object_dispatch_spec(object_dispatch_spec), on_finish(on_finish) {
+ }
+
+ void complete(int r) override;
+ void finish(int r) override;
+ };
+
+public:
+ struct RequestBase {
+ uint64_t object_no;
+
+ RequestBase(uint64_t object_no)
+ : object_no(object_no) {
+ }
+ };
+
+ struct ReadRequest : public RequestBase {
+ ReadExtents* extents;
+ int read_flags;
+ uint64_t* version;
+
+ ReadRequest(uint64_t object_no, ReadExtents* extents, int read_flags,
+ uint64_t* version)
+ : RequestBase(object_no), extents(extents), read_flags(read_flags),
+ version(version) {
+ }
+ };
+
+ struct WriteRequestBase : public RequestBase {
+ uint64_t object_off;
+ uint64_t journal_tid;
+
+ WriteRequestBase(uint64_t object_no, uint64_t object_off,
+ uint64_t journal_tid)
+ : RequestBase(object_no), object_off(object_off),
+ journal_tid(journal_tid) {
+ }
+ };
+
+ struct DiscardRequest : public WriteRequestBase {
+ uint64_t object_len;
+ int discard_flags;
+
+ DiscardRequest(uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ int discard_flags, uint64_t journal_tid)
+ : WriteRequestBase(object_no, object_off, journal_tid),
+ object_len(object_len), discard_flags(discard_flags) {
+ }
+ };
+
+ struct WriteRequest : public WriteRequestBase {
+ ceph::bufferlist data;
+ int write_flags;
+ std::optional<uint64_t> assert_version;
+
+ WriteRequest(uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, int write_flags,
+ std::optional<uint64_t> assert_version, uint64_t journal_tid)
+ : WriteRequestBase(object_no, object_off, journal_tid),
+ data(std::move(data)), write_flags(write_flags),
+ assert_version(assert_version) {
+ }
+ };
+
+ struct WriteSameRequest : public WriteRequestBase {
+ uint64_t object_len;
+ LightweightBufferExtents buffer_extents;
+ ceph::bufferlist data;
+
+ WriteSameRequest(uint64_t object_no, uint64_t object_off,
+ uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents,
+ ceph::bufferlist&& data, uint64_t journal_tid)
+ : WriteRequestBase(object_no, object_off, journal_tid),
+ object_len(object_len), buffer_extents(std::move(buffer_extents)),
+ data(std::move(data)) {
+ }
+ };
+
+ struct CompareAndWriteRequest : public WriteRequestBase {
+ ceph::bufferlist cmp_data;
+ ceph::bufferlist data;
+ uint64_t* mismatch_offset;
+
+ CompareAndWriteRequest(uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& data,
+ uint64_t* mismatch_offset,
+ uint64_t journal_tid)
+ : WriteRequestBase(object_no, object_off, journal_tid),
+ cmp_data(std::move(cmp_data)), data(std::move(data)),
+ mismatch_offset(mismatch_offset) {
+ }
+ };
+
+ struct FlushRequest {
+ FlushSource flush_source;
+ uint64_t journal_tid;
+
+ FlushRequest(FlushSource flush_source, uint64_t journal_tid)
+ : flush_source(flush_source), journal_tid(journal_tid) {
+ }
+ };
+
+ struct ListSnapsRequest : public RequestBase {
+ Extents extents;
+ SnapIds snap_ids;
+ int list_snaps_flags;
+ SnapshotDelta* snapshot_delta;
+
+ ListSnapsRequest(uint64_t object_no, Extents&& extents,
+ SnapIds&& snap_ids, int list_snaps_flags,
+ SnapshotDelta* snapshot_delta)
+ : RequestBase(object_no), extents(std::move(extents)),
+      snap_ids(std::move(snap_ids)), list_snaps_flags(list_snaps_flags),
+ snapshot_delta(snapshot_delta) {
+ }
+ };
+
+ typedef boost::variant<ReadRequest,
+ DiscardRequest,
+ WriteRequest,
+ WriteSameRequest,
+ CompareAndWriteRequest,
+ FlushRequest,
+ ListSnapsRequest> Request;
+
+ C_Dispatcher dispatcher_ctx;
+
+ ObjectDispatcherInterface* object_dispatcher;
+ ObjectDispatchLayer dispatch_layer;
+ int object_dispatch_flags = 0;
+ DispatchResult dispatch_result = DISPATCH_RESULT_INVALID;
+
+ Request request;
+ IOContext io_context;
+ int op_flags;
+ ZTracer::Trace parent_trace;
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_read(
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, Context* on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ ReadRequest{object_no, extents,
+ read_flags, version},
+ io_context, op_flags, parent_trace,
+ on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_discard(
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags, uint64_t journal_tid,
+ const ZTracer::Trace &parent_trace, Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ DiscardRequest{object_no, object_off,
+ object_len, discard_flags,
+ journal_tid},
+ io_context, 0, parent_trace, on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_write(
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version, uint64_t journal_tid,
+ const ZTracer::Trace &parent_trace, Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ WriteRequest{object_no, object_off,
+ std::move(data), write_flags,
+ assert_version, journal_tid},
+ io_context, op_flags, parent_trace,
+ on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_write_same(
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, uint64_t journal_tid,
+ const ZTracer::Trace &parent_trace, Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ WriteSameRequest{object_no, object_off,
+ object_len,
+ std::move(buffer_extents),
+ std::move(data),
+ journal_tid},
+ io_context, op_flags, parent_trace,
+ on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_compare_and_write(
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context,
+ uint64_t *mismatch_offset, int op_flags, uint64_t journal_tid,
+ const ZTracer::Trace &parent_trace, Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ CompareAndWriteRequest{object_no,
+ object_off,
+ std::move(cmp_data),
+ std::move(write_data),
+ mismatch_offset,
+ journal_tid},
+ io_context, op_flags, parent_trace,
+ on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_flush(
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ FlushSource flush_source, uint64_t journal_tid,
+ const ZTracer::Trace &parent_trace, Context *on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ FlushRequest{flush_source, journal_tid},
+ {}, 0, parent_trace, on_finish);
+ }
+
+ template <typename ImageCtxT>
+ static ObjectDispatchSpec* create_list_snaps(
+ ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+ uint64_t object_no, Extents&& extents, SnapIds&& snap_ids,
+ int list_snaps_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, Context* on_finish) {
+ return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+ object_dispatch_layer,
+ ListSnapsRequest{object_no,
+ std::move(extents),
+ std::move(snap_ids),
+ list_snaps_flags,
+ snapshot_delta},
+ {}, 0, parent_trace, on_finish);
+ }
+
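+  // Illustrative usage sketch (image_ctx, object_no, extents, io_context,
+  // parent_trace and on_finish are assumed to be supplied by the caller;
+  // flag values are arbitrary for the example):
+  //
+  //   auto* spec = ObjectDispatchSpec::create_read(
+  //     image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_no, &extents,
+  //     io_context, 0, 0, parent_trace, nullptr, on_finish);
+  //   spec->send();  // the spec deletes itself after on_finish completes
+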
+ void send();
+ void fail(int r);
+
+private:
+ template <typename> friend class ObjectDispatcher;
+
+ ObjectDispatchSpec(ObjectDispatcherInterface* object_dispatcher,
+ ObjectDispatchLayer object_dispatch_layer,
+ Request&& request, IOContext io_context, int op_flags,
+ const ZTracer::Trace& parent_trace, Context* on_finish)
+ : dispatcher_ctx(this, on_finish), object_dispatcher(object_dispatcher),
+ dispatch_layer(object_dispatch_layer), request(std::move(request)),
+ io_context(io_context), op_flags(op_flags), parent_trace(parent_trace) {
+ }
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
diff --git a/src/librbd/io/ObjectDispatcher.cc b/src/librbd/io/ObjectDispatcher.cc
new file mode 100644
index 000000000..b66c6bb18
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcher.cc
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatcher.h"
+#include "include/Context.h"
+#include "common/AsyncOpTracker.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/ObjectDispatch.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include <boost/variant.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectDispatcher: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+struct ObjectDispatcher<I>::C_ResetExistenceCache : public C_LayerIterator {
+ C_ResetExistenceCache(ObjectDispatcher* object_dispatcher, Context* on_finish)
+ : C_LayerIterator(object_dispatcher, OBJECT_DISPATCH_LAYER_NONE, on_finish) {
+ }
+
+ bool execute(ObjectDispatchInterface* object_dispatch,
+ Context* on_finish) override {
+ return object_dispatch->reset_existence_cache(on_finish);
+ }
+};
+
+template <typename I>
+struct ObjectDispatcher<I>::SendVisitor : public boost::static_visitor<bool> {
+ ObjectDispatchInterface* object_dispatch;
+ ObjectDispatchSpec* object_dispatch_spec;
+
+ SendVisitor(ObjectDispatchInterface* object_dispatch,
+ ObjectDispatchSpec* object_dispatch_spec)
+ : object_dispatch(object_dispatch),
+ object_dispatch_spec(object_dispatch_spec) {
+ }
+
+ bool operator()(ObjectDispatchSpec::ReadRequest& read) const {
+ return object_dispatch->read(
+ read.object_no, read.extents, object_dispatch_spec->io_context,
+ object_dispatch_spec->op_flags, read.read_flags,
+ object_dispatch_spec->parent_trace, read.version,
+ &object_dispatch_spec->object_dispatch_flags,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ObjectDispatchSpec::DiscardRequest& discard) const {
+ return object_dispatch->discard(
+ discard.object_no, discard.object_off, discard.object_len,
+ object_dispatch_spec->io_context, discard.discard_flags,
+ object_dispatch_spec->parent_trace,
+ &object_dispatch_spec->object_dispatch_flags, &discard.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ObjectDispatchSpec::WriteRequest& write) const {
+ return object_dispatch->write(
+ write.object_no, write.object_off, std::move(write.data),
+ object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+ write.write_flags, write.assert_version,
+ object_dispatch_spec->parent_trace,
+ &object_dispatch_spec->object_dispatch_flags, &write.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ObjectDispatchSpec::WriteSameRequest& write_same) const {
+ return object_dispatch->write_same(
+ write_same.object_no, write_same.object_off, write_same.object_len,
+ std::move(write_same.buffer_extents), std::move(write_same.data),
+ object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+ object_dispatch_spec->parent_trace,
+ &object_dispatch_spec->object_dispatch_flags, &write_same.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(
+ ObjectDispatchSpec::CompareAndWriteRequest& compare_and_write) const {
+ return object_dispatch->compare_and_write(
+ compare_and_write.object_no, compare_and_write.object_off,
+ std::move(compare_and_write.cmp_data), std::move(compare_and_write.data),
+ object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+ object_dispatch_spec->parent_trace, compare_and_write.mismatch_offset,
+ &object_dispatch_spec->object_dispatch_flags,
+ &compare_and_write.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ObjectDispatchSpec::FlushRequest& flush) const {
+ return object_dispatch->flush(
+ flush.flush_source, object_dispatch_spec->parent_trace,
+ &flush.journal_tid,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+
+ bool operator()(ObjectDispatchSpec::ListSnapsRequest& list_snaps) const {
+ return object_dispatch->list_snaps(
+ list_snaps.object_no, std::move(list_snaps.extents),
+ std::move(list_snaps.snap_ids), list_snaps.list_snaps_flags,
+ object_dispatch_spec->parent_trace, list_snaps.snapshot_delta,
+ &object_dispatch_spec->object_dispatch_flags,
+ &object_dispatch_spec->dispatch_result,
+ &object_dispatch_spec->dispatcher_ctx.on_finish,
+ &object_dispatch_spec->dispatcher_ctx);
+ }
+};
+
+template <typename I>
+ObjectDispatcher<I>::ObjectDispatcher(I* image_ctx)
+ : Dispatcher<I, ObjectDispatcherInterface>(image_ctx) {
+ // configure the core object dispatch handler on startup
+ auto object_dispatch = new ObjectDispatch(image_ctx);
+ this->register_dispatch(object_dispatch);
+}
+
+template <typename I>
+void ObjectDispatcher<I>::invalidate_cache(Context* on_finish) {
+ auto image_ctx = this->m_image_ctx;
+ auto cct = image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ on_finish = util::create_async_context_callback(*image_ctx, on_finish);
+ auto ctx = new C_InvalidateCache(
+ this, OBJECT_DISPATCH_LAYER_NONE, on_finish);
+ ctx->complete(0);
+}
+
+template <typename I>
+void ObjectDispatcher<I>::reset_existence_cache(Context* on_finish) {
+ auto image_ctx = this->m_image_ctx;
+ auto cct = image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ on_finish = util::create_async_context_callback(*image_ctx, on_finish);
+ auto ctx = new C_ResetExistenceCache(this, on_finish);
+ ctx->complete(0);
+}
+
+template <typename I>
+void ObjectDispatcher<I>::extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) {
+ auto cct = this->m_image_ctx->cct;
+ ldout(cct, 20) << object_no << " " << object_off << "~" << object_len
+ << dendl;
+
+ std::shared_lock locker{this->m_lock};
+ for (auto it : this->m_dispatches) {
+ auto& object_dispatch_meta = it.second;
+ auto object_dispatch = object_dispatch_meta.dispatch;
+ object_dispatch->extent_overwritten(object_no, object_off, object_len,
+ journal_tid, new_journal_tid);
+ }
+}
+
+template <typename I>
+int ObjectDispatcher<I>::prepare_copyup(
+ uint64_t object_no,
+ SnapshotSparseBufferlist* snapshot_sparse_bufferlist) {
+ auto cct = this->m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << dendl;
+
+ std::shared_lock locker{this->m_lock};
+ for (auto it : this->m_dispatches) {
+ auto& object_dispatch_meta = it.second;
+ auto object_dispatch = object_dispatch_meta.dispatch;
+ auto r = object_dispatch->prepare_copyup(
+ object_no, snapshot_sparse_bufferlist);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+template <typename I>
+bool ObjectDispatcher<I>::send_dispatch(
+ ObjectDispatchInterface* object_dispatch,
+ ObjectDispatchSpec* object_dispatch_spec) {
+ return boost::apply_visitor(
+ SendVisitor{object_dispatch, object_dispatch_spec},
+ object_dispatch_spec->request);
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatcher.h b/src/librbd/io/ObjectDispatcher.h
new file mode 100644
index 000000000..1e5e78d8b
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcher.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include "librbd/io/Dispatcher.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectDispatcher
+ : public Dispatcher<ImageCtxT, ObjectDispatcherInterface> {
+public:
+ ObjectDispatcher(ImageCtxT* image_ctx);
+
+ void invalidate_cache(Context* on_finish) override;
+ void reset_existence_cache(Context* on_finish) override;
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) override;
+
+ int prepare_copyup(
+ uint64_t object_no,
+ SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override;
+
+ using typename Dispatcher<ImageCtxT, ObjectDispatcherInterface>::C_LayerIterator;
+
+ using typename Dispatcher<ImageCtxT, ObjectDispatcherInterface>::C_InvalidateCache;
+
+protected:
+ bool send_dispatch(ObjectDispatchInterface* object_dispatch,
+ ObjectDispatchSpec* object_dispatch_spec) override;
+
+private:
+ struct C_ResetExistenceCache;
+ struct SendVisitor;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
diff --git a/src/librbd/io/ObjectDispatcherInterface.h b/src/librbd/io/ObjectDispatcherInterface.h
new file mode 100644
index 000000000..0f3d33330
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcherInterface.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
+
+#include "include/int_types.h"
+#include "librbd/io/DispatcherInterface.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+struct ObjectDispatcherInterface
+ : public DispatcherInterface<ObjectDispatchInterface> {
+public:
+ virtual void invalidate_cache(Context* on_finish) = 0;
+ virtual void reset_existence_cache(Context* on_finish) = 0;
+
+ virtual void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) = 0;
+
+ virtual int prepare_copyup(
+ uint64_t object_no,
+ SnapshotSparseBufferlist* snapshot_sparse_bufferlist) = 0;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
diff --git a/src/librbd/io/ObjectRequest.cc b/src/librbd/io/ObjectRequest.cc
new file mode 100644
index 000000000..87c3cd7dd
--- /dev/null
+++ b/src/librbd/io/ObjectRequest.cc
@@ -0,0 +1,1075 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectRequest.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/err.h"
+#include "include/neorados/RADOS.hpp"
+#include "osd/osd_types.h"
+#include "librados/snap_set_diff.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/CopyupRequest.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/Utils.h"
+
+#include <boost/optional.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \
+ << " " << __func__ << ": " \
+ << data_object_name(this->m_ictx, \
+ this->m_object_no) << " "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+using librbd::util::create_context_callback;
+using librbd::util::create_trace;
+
+namespace {
+
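+// copy-on-read is only performed when it is enabled for the image, the
+// image is writable, the read targets the head revision (not a snapshot),
+// and this client owns the exclusive lock if one is in use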
+template <typename I>
+inline bool is_copy_on_read(I *ictx, const IOContext& io_context) {
+ std::shared_lock image_locker{ictx->image_lock};
+ return (ictx->clone_copy_on_read && !ictx->read_only &&
+ io_context->read_snap().value_or(CEPH_NOSNAP) == CEPH_NOSNAP &&
+ (ictx->exclusive_lock == nullptr ||
+ ictx->exclusive_lock->is_lock_owner()));
+}
+
+template <typename S, typename D>
+void convert_snap_set(const S& src_snap_set,
+ D* dst_snap_set) {
+ dst_snap_set->seq = src_snap_set.seq;
+ dst_snap_set->clones.reserve(src_snap_set.clones.size());
+ for (auto& src_clone : src_snap_set.clones) {
+ dst_snap_set->clones.emplace_back();
+ auto& dst_clone = dst_snap_set->clones.back();
+ dst_clone.cloneid = src_clone.cloneid;
+ dst_clone.snaps = src_clone.snaps;
+ dst_clone.overlap = src_clone.overlap;
+ dst_clone.size = src_clone.size;
+ }
+}
+
+} // anonymous namespace
+
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_write(
+ I *ictx, uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, Context *completion) {
+ return new ObjectWriteRequest<I>(ictx, object_no, object_off,
+ std::move(data), io_context, op_flags,
+ write_flags, assert_version,
+ parent_trace, completion);
+}
+
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_discard(
+ I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, Context *completion) {
+ return new ObjectDiscardRequest<I>(ictx, object_no, object_off,
+ object_len, io_context, discard_flags,
+ parent_trace, completion);
+}
+
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_write_same(
+ I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ ceph::bufferlist&& data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *completion) {
+ return new ObjectWriteSameRequest<I>(ictx, object_no, object_off,
+ object_len, std::move(data), io_context,
+ op_flags, parent_trace, completion);
+}
+
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_compare_and_write(
+ I *ictx, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+ IOContext io_context, uint64_t *mismatch_offset, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *completion) {
+ return new ObjectCompareAndWriteRequest<I>(ictx, object_no, object_off,
+ std::move(cmp_data),
+ std::move(write_data), io_context,
+ mismatch_offset, op_flags,
+ parent_trace, completion);
+}
+
+template <typename I>
+ObjectRequest<I>::ObjectRequest(
+ I *ictx, uint64_t objectno, IOContext io_context,
+ const char *trace_name, const ZTracer::Trace &trace, Context *completion)
+ : m_ictx(ictx), m_object_no(objectno), m_io_context(io_context),
+ m_completion(completion),
+ m_trace(create_trace(*ictx, "", trace)) {
+ ceph_assert(m_ictx->data_ctx.is_valid());
+ if (m_trace.valid()) {
+ m_trace.copy_name(trace_name + std::string(" ") +
+ data_object_name(ictx, objectno));
+ m_trace.event("start");
+ }
+}
+
+template <typename I>
+void ObjectRequest<I>::add_write_hint(I& image_ctx, neorados::WriteOp* wr) {
+ auto alloc_hint_flags = static_cast<neorados::alloc_hint::alloc_hint_t>(
+ image_ctx.alloc_hint_flags);
+ if (image_ctx.enable_alloc_hint) {
+ wr->set_alloc_hint(image_ctx.get_object_size(),
+ image_ctx.get_object_size(),
+ alloc_hint_flags);
+ } else if (image_ctx.alloc_hint_flags != 0U) {
+ wr->set_alloc_hint(0, 0, alloc_hint_flags);
+ }
+}
+
+template <typename I>
+bool ObjectRequest<I>::compute_parent_extents(Extents *parent_extents,
+ bool read_request) {
+ ceph_assert(ceph_mutex_is_locked(m_ictx->image_lock));
+
+ m_has_parent = false;
+ parent_extents->clear();
+
+ uint64_t parent_overlap;
+ int r = m_ictx->get_parent_overlap(
+ m_io_context->read_snap().value_or(CEPH_NOSNAP), &parent_overlap);
+ if (r < 0) {
+ // NOTE: it's possible for a snapshot to be deleted while we are
+ // still reading from it
+ lderr(m_ictx->cct) << "failed to retrieve parent overlap: "
+ << cpp_strerror(r) << dendl;
+ return false;
+ }
+
+ if (!read_request && !m_ictx->migration_info.empty()) {
+ parent_overlap = m_ictx->migration_info.overlap;
+ }
+
+ if (parent_overlap == 0) {
+ return false;
+ }
+
+ io::util::extent_to_file(m_ictx, m_object_no, 0, m_ictx->layout.object_size,
+ *parent_extents);
+ uint64_t object_overlap = m_ictx->prune_parent_extents(*parent_extents,
+ parent_overlap);
+ if (object_overlap > 0) {
+ ldout(m_ictx->cct, 20) << "overlap " << parent_overlap << " "
+ << "extents " << *parent_extents << dendl;
+ m_has_parent = !parent_extents->empty();
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+void ObjectRequest<I>::async_finish(int r) {
+ ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+ m_ictx->asio_engine->post([this, r]() { finish(r); });
+}
+
+template <typename I>
+void ObjectRequest<I>::finish(int r) {
+ ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+ m_completion->complete(r);
+ delete this;
+}
+
+/** read **/
+
+template <typename I>
+ObjectReadRequest<I>::ObjectReadRequest(
+ I *ictx, uint64_t objectno, ReadExtents* extents,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* version,
+ Context *completion)
+ : ObjectRequest<I>(ictx, objectno, io_context, "read", parent_trace,
+ completion),
+    m_extents(extents), m_op_flags(op_flags), m_read_flags(read_flags),
+ m_version(version) {
+}
+
+template <typename I>
+void ObjectReadRequest<I>::send() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ read_object();
+}
+
+template <typename I>
+void ObjectReadRequest<I>::read_object() {
+ I *image_ctx = this->m_ictx;
+
+ std::shared_lock image_locker{image_ctx->image_lock};
+ auto read_snap_id = this->m_io_context->read_snap().value_or(CEPH_NOSNAP);
+ if (read_snap_id == image_ctx->snap_id &&
+ image_ctx->object_map != nullptr &&
+ !image_ctx->object_map->object_may_exist(this->m_object_no)) {
+ image_ctx->asio_engine->post([this]() { read_parent(); });
+ return;
+ }
+ image_locker.unlock();
+
+ ldout(image_ctx->cct, 20) << "snap_id=" << read_snap_id << dendl;
+
+ neorados::ReadOp read_op;
+ for (auto& extent: *this->m_extents) {
+ if (extent.length >= image_ctx->sparse_read_threshold_bytes) {
+ read_op.sparse_read(extent.offset, extent.length, &extent.bl,
+ &extent.extent_map);
+ } else {
+ read_op.read(extent.offset, extent.length, &extent.bl);
+ }
+ }
+ util::apply_op_flags(
+ m_op_flags, image_ctx->get_read_flags(read_snap_id), &read_op);
+
+ image_ctx->rados_api.execute(
+ {data_object_name(this->m_ictx, this->m_object_no)},
+ *this->m_io_context, std::move(read_op), nullptr,
+ librbd::asio::util::get_callback_adapter(
+ [this](int r) { handle_read_object(r); }), m_version,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+template <typename I>
+void ObjectReadRequest<I>::handle_read_object(int r) {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+ if (m_version != nullptr) {
+ ldout(image_ctx->cct, 20) << "version=" << *m_version << dendl;
+ }
+
+ if (r == -ENOENT) {
+ read_parent();
+ return;
+ } else if (r < 0) {
+ lderr(image_ctx->cct) << "failed to read from object: "
+ << cpp_strerror(r) << dendl;
+ this->finish(r);
+ return;
+ }
+
+ this->finish(0);
+}
+
+template <typename I>
+void ObjectReadRequest<I>::read_parent() {
+ if ((m_read_flags & READ_FLAG_DISABLE_READ_FROM_PARENT) != 0) {
+ this->finish(-ENOENT);
+ return;
+ }
+
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_parent>(this);
+
+ io::util::read_parent<I>(
+ image_ctx, this->m_object_no, this->m_extents,
+ this->m_io_context->read_snap().value_or(CEPH_NOSNAP), this->m_trace,
+ ctx);
+}
+
+template <typename I>
+void ObjectReadRequest<I>::handle_read_parent(int r) {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ this->finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(image_ctx->cct) << "failed to read parent extents: "
+ << cpp_strerror(r) << dendl;
+ this->finish(r);
+ return;
+ }
+
+ copyup();
+}
+
+template <typename I>
+void ObjectReadRequest<I>::copyup() {
+ I *image_ctx = this->m_ictx;
+ if (!is_copy_on_read(image_ctx, this->m_io_context)) {
+ this->finish(0);
+ return;
+ }
+
+ image_ctx->owner_lock.lock_shared();
+ image_ctx->image_lock.lock_shared();
+ Extents parent_extents;
+ if (!this->compute_parent_extents(&parent_extents, true) ||
+ (image_ctx->exclusive_lock != nullptr &&
+ !image_ctx->exclusive_lock->is_lock_owner())) {
+ image_ctx->image_lock.unlock_shared();
+ image_ctx->owner_lock.unlock_shared();
+ this->finish(0);
+ return;
+ }
+
+ ldout(image_ctx->cct, 20) << dendl;
+
+ image_ctx->copyup_list_lock.lock();
+ auto it = image_ctx->copyup_list.find(this->m_object_no);
+ if (it == image_ctx->copyup_list.end()) {
+ // create and kick off a CopyupRequest
+ auto new_req = CopyupRequest<I>::create(
+ image_ctx, this->m_object_no, std::move(parent_extents), this->m_trace);
+
+ image_ctx->copyup_list[this->m_object_no] = new_req;
+ image_ctx->copyup_list_lock.unlock();
+ image_ctx->image_lock.unlock_shared();
+ new_req->send();
+ } else {
+ image_ctx->copyup_list_lock.unlock();
+ image_ctx->image_lock.unlock_shared();
+ }
+
+ image_ctx->owner_lock.unlock_shared();
+ this->finish(0);
+}
+
+/** write **/
+
+template <typename I>
+AbstractObjectWriteRequest<I>::AbstractObjectWriteRequest(
+ I *ictx, uint64_t object_no, uint64_t object_off, uint64_t len,
+ IOContext io_context, const char *trace_name,
+ const ZTracer::Trace &parent_trace, Context *completion)
+ : ObjectRequest<I>(ictx, object_no, io_context, trace_name, parent_trace,
+ completion),
+ m_object_off(object_off), m_object_len(len)
+{
+ if (this->m_object_off == 0 &&
+ this->m_object_len == ictx->get_object_size()) {
+ m_full_object = true;
+ }
+
+ compute_parent_info();
+
+ ictx->image_lock.lock_shared();
+ if (!ictx->migration_info.empty()) {
+ m_guarding_migration_write = true;
+ }
+ ictx->image_lock.unlock_shared();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::compute_parent_info() {
+ I *image_ctx = this->m_ictx;
+ std::shared_lock image_locker{image_ctx->image_lock};
+
+ this->compute_parent_extents(&m_parent_extents, false);
+
+ if (!this->has_parent() ||
+ (m_full_object &&
+ !this->m_io_context->write_snap_context() &&
+ !is_post_copyup_write_required())) {
+ m_copyup_enabled = false;
+ }
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::add_write_hint(
+ neorados::WriteOp *wr) {
+ I *image_ctx = this->m_ictx;
+ std::shared_lock image_locker{image_ctx->image_lock};
+ if (image_ctx->object_map == nullptr || !this->m_object_may_exist ||
+ image_ctx->alloc_hint_flags != 0U) {
+ ObjectRequest<I>::add_write_hint(*image_ctx, wr);
+ }
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::send() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << this->get_op_type() << " "
+ << this->m_object_off << "~" << this->m_object_len
+ << dendl;
+ {
+ std::shared_lock image_lock{image_ctx->image_lock};
+ if (image_ctx->object_map == nullptr) {
+ m_object_may_exist = true;
+ } else {
+ // should have been flushed prior to releasing lock
+ ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+ m_object_may_exist = image_ctx->object_map->object_may_exist(
+ this->m_object_no);
+ }
+ }
+
+ if (!m_object_may_exist && is_no_op_for_nonexistent_object()) {
+ ldout(image_ctx->cct, 20) << "skipping no-op on nonexistent object"
+ << dendl;
+ this->async_finish(0);
+ return;
+ }
+
+ pre_write_object_map_update();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::pre_write_object_map_update() {
+ I *image_ctx = this->m_ictx;
+
+ image_ctx->image_lock.lock_shared();
+ if (image_ctx->object_map == nullptr || !is_object_map_update_enabled()) {
+ image_ctx->image_lock.unlock_shared();
+ write_object();
+ return;
+ }
+
+ if (!m_object_may_exist && m_copyup_enabled) {
+    // optimization: the object does not exist, so skip the pre-write
+    // object map update and proceed directly to copyup
+ image_ctx->image_lock.unlock_shared();
+ copyup();
+ return;
+ }
+
+ uint8_t new_state = this->get_pre_write_object_map_state();
+ ldout(image_ctx->cct, 20) << this->m_object_off << "~" << this->m_object_len
+ << dendl;
+
+ if (image_ctx->object_map->template aio_update<
+ AbstractObjectWriteRequest<I>,
+ &AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update>(
+ CEPH_NOSNAP, this->m_object_no, new_state, {}, this->m_trace, false,
+ this)) {
+ image_ctx->image_lock.unlock_shared();
+ return;
+ }
+
+ image_ctx->image_lock.unlock_shared();
+ write_object();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update(int r) {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+ if (r < 0) {
+ lderr(image_ctx->cct) << "failed to update object map: "
+ << cpp_strerror(r) << dendl;
+ this->finish(r);
+ return;
+ }
+
+ write_object();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::write_object() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ neorados::WriteOp write_op;
+ if (m_copyup_enabled) {
+ if (m_guarding_migration_write) {
+ auto snap_seq = (this->m_io_context->write_snap_context() ?
+ this->m_io_context->write_snap_context()->first : 0);
+ ldout(image_ctx->cct, 20) << "guarding write: snap_seq=" << snap_seq
+ << dendl;
+
+ cls_client::assert_snapc_seq(
+ &write_op, snap_seq, cls::rbd::ASSERT_SNAPC_SEQ_LE_SNAPSET_SEQ);
+ } else {
+ ldout(image_ctx->cct, 20) << "guarding write" << dendl;
+ write_op.assert_exists();
+ }
+ }
+
+ add_write_hint(&write_op);
+ add_write_ops(&write_op);
+ ceph_assert(write_op.size() != 0);
+
+ image_ctx->rados_api.execute(
+ {data_object_name(this->m_ictx, this->m_object_no)},
+ *this->m_io_context, std::move(write_op),
+ librbd::asio::util::get_callback_adapter(
+ [this](int r) { handle_write_object(r); }), nullptr,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_write_object(int r) {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+ r = filter_write_result(r);
+ if (r == -ENOENT) {
+ if (m_copyup_enabled) {
+ copyup();
+ return;
+ }
+ } else if (r == -ERANGE && m_guarding_migration_write) {
+ image_ctx->image_lock.lock_shared();
+ m_guarding_migration_write = !image_ctx->migration_info.empty();
+ image_ctx->image_lock.unlock_shared();
+
+ if (m_guarding_migration_write) {
+ copyup();
+ } else {
+ ldout(image_ctx->cct, 10) << "migration parent gone, restart io" << dendl;
+ compute_parent_info();
+ write_object();
+ }
+ return;
+ } else if (r == -EILSEQ) {
+ ldout(image_ctx->cct, 10) << "failed to write object" << dendl;
+ this->finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(image_ctx->cct) << "failed to write object: " << cpp_strerror(r)
+ << dendl;
+ this->finish(r);
+ return;
+ }
+
+ post_write_object_map_update();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::copyup() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ ceph_assert(!m_copyup_in_progress);
+ m_copyup_in_progress = true;
+
+ image_ctx->copyup_list_lock.lock();
+ auto it = image_ctx->copyup_list.find(this->m_object_no);
+ if (it == image_ctx->copyup_list.end()) {
+ auto new_req = CopyupRequest<I>::create(
+ image_ctx, this->m_object_no, std::move(this->m_parent_extents),
+ this->m_trace);
+ this->m_parent_extents.clear();
+
+ // make sure to wait on this CopyupRequest
+ new_req->append_request(this, std::move(get_copyup_overwrite_extents()));
+ image_ctx->copyup_list[this->m_object_no] = new_req;
+
+ image_ctx->copyup_list_lock.unlock();
+ new_req->send();
+ } else {
+ it->second->append_request(this, std::move(get_copyup_overwrite_extents()));
+ image_ctx->copyup_list_lock.unlock();
+ }
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_copyup(int r) {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+ ceph_assert(m_copyup_in_progress);
+ m_copyup_in_progress = false;
+
+ if (r < 0 && r != -ERESTART) {
+ lderr(image_ctx->cct) << "failed to copyup object: " << cpp_strerror(r)
+ << dendl;
+ this->finish(r);
+ return;
+ }
+
+ if (r == -ERESTART || is_post_copyup_write_required()) {
+ write_object();
+ return;
+ }
+
+ post_write_object_map_update();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::post_write_object_map_update() {
+ I *image_ctx = this->m_ictx;
+
+ image_ctx->image_lock.lock_shared();
+ if (image_ctx->object_map == nullptr || !is_object_map_update_enabled() ||
+ !is_non_existent_post_write_object_map_state()) {
+ image_ctx->image_lock.unlock_shared();
+ this->finish(0);
+ return;
+ }
+
+ ldout(image_ctx->cct, 20) << dendl;
+
+ // should have been flushed prior to releasing lock
+ ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+ if (image_ctx->object_map->template aio_update<
+ AbstractObjectWriteRequest<I>,
+ &AbstractObjectWriteRequest<I>::handle_post_write_object_map_update>(
+ CEPH_NOSNAP, this->m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING,
+ this->m_trace, false, this)) {
+ image_ctx->image_lock.unlock_shared();
+ return;
+ }
+
+ image_ctx->image_lock.unlock_shared();
+ this->finish(0);
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_post_write_object_map_update(int r) {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+ if (r < 0) {
+ lderr(image_ctx->cct) << "failed to update object map: "
+ << cpp_strerror(r) << dendl;
+ this->finish(r);
+ return;
+ }
+
+ this->finish(0);
+}
+
+template <typename I>
+void ObjectWriteRequest<I>::add_write_hint(neorados::WriteOp* wr) {
+ if ((m_write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) {
+ wr->create(true);
+ } else if (m_assert_version.has_value()) {
+ wr->assert_version(m_assert_version.value());
+ }
+ AbstractObjectWriteRequest<I>::add_write_hint(wr);
+}
+
+template <typename I>
+void ObjectWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+ if (this->m_full_object) {
+ wr->write_full(bufferlist{m_write_data});
+ } else {
+ wr->write(this->m_object_off, bufferlist{m_write_data});
+ }
+ util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+template <typename I>
+void ObjectDiscardRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+ switch (m_discard_action) {
+ case DISCARD_ACTION_REMOVE:
+ wr->remove();
+ break;
+ case DISCARD_ACTION_REMOVE_TRUNCATE:
+ wr->create(false);
+ // fall through
+ case DISCARD_ACTION_TRUNCATE:
+ wr->truncate(this->m_object_off);
+ break;
+ case DISCARD_ACTION_ZERO:
+ wr->zero(this->m_object_off, this->m_object_len);
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+}
+
+template <typename I>
+void ObjectWriteSameRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+ wr->writesame(this->m_object_off, this->m_object_len,
+ bufferlist{m_write_data});
+ util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+template <typename I>
+void ObjectCompareAndWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+ wr->cmpext(this->m_object_off, bufferlist{m_cmp_bl}, nullptr);
+
+ if (this->m_full_object) {
+ wr->write_full(bufferlist{m_write_bl});
+ } else {
+ wr->write(this->m_object_off, bufferlist{m_write_bl});
+ }
+ util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+template <typename I>
+int ObjectCompareAndWriteRequest<I>::filter_write_result(int r) const {
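+  // a cmpext mismatch is reported by encoding the offset of the first
+  // mismatched byte within the object into the error code as
+  // -MAX_ERRNO - offset; decode it and translate back to an image offset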
+ if (r <= -MAX_ERRNO) {
+ I *image_ctx = this->m_ictx;
+ Extents image_extents;
+
+ // object extent compare mismatch
+ uint64_t offset = -MAX_ERRNO - r;
+ io::util::extent_to_file(image_ctx, this->m_object_no, offset,
+ this->m_object_len, image_extents);
+ ceph_assert(image_extents.size() == 1);
+
+ if (m_mismatch_offset) {
+ *m_mismatch_offset = image_extents[0].first;
+ }
+ r = -EILSEQ;
+ }
+ return r;
+}
+
+template <typename I>
+ObjectListSnapsRequest<I>::ObjectListSnapsRequest(
+ I *ictx, uint64_t objectno, Extents&& object_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, Context *completion)
+ : ObjectRequest<I>(
+ ictx, objectno, ictx->duplicate_data_io_context(), "snap_list",
+ parent_trace, completion),
+ m_object_extents(std::move(object_extents)),
+ m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags),
+ m_snapshot_delta(snapshot_delta) {
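+  // issue the snap listing against the object's snapshot directory so that
+  // all clones of the object are visible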
+ this->m_io_context->read_snap(CEPH_SNAPDIR);
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::send() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ if (m_snap_ids.size() < 2) {
+ lderr(image_ctx->cct) << "invalid snap ids: " << m_snap_ids << dendl;
+ this->async_finish(-EINVAL);
+ return;
+ }
+
+ list_snaps();
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::list_snaps() {
+ I *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << dendl;
+
+ neorados::ReadOp read_op;
+ read_op.list_snaps(&m_snap_set, &m_ec);
+
+ image_ctx->rados_api.execute(
+ {data_object_name(this->m_ictx, this->m_object_no)},
+ *this->m_io_context, std::move(read_op), nullptr,
+ librbd::asio::util::get_callback_adapter(
+ [this](int r) { handle_list_snaps(r); }), nullptr,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::handle_list_snaps(int r) {
+ I *image_ctx = this->m_ictx;
+ auto cct = image_ctx->cct;
+
+ if (r >= 0) {
+ r = -m_ec.value();
+ }
+
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_snapshot_delta->clear();
+ auto& snapshot_delta = *m_snapshot_delta;
+
+ ceph_assert(!m_snap_ids.empty());
+ librados::snap_t start_snap_id = 0;
+ librados::snap_t first_snap_id = *m_snap_ids.begin();
+ librados::snap_t last_snap_id = *m_snap_ids.rbegin();
+
+ if (r == -ENOENT) {
+ // the object does not exist -- mark the missing extents
+ zero_extent(first_snap_id, true);
+ list_from_parent();
+ return;
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve object snapshot list: " << cpp_strerror(r)
+ << dendl;
+ this->finish(r);
+ return;
+ }
+
+ // helper function requires the librados legacy data structure
+ librados::snap_set_t snap_set;
+ convert_snap_set(m_snap_set, &snap_set);
+
+ bool initial_extents_written = false;
+
+ interval_set<uint64_t> object_interval;
+ for (auto& object_extent : m_object_extents) {
+ object_interval.insert(object_extent.first, object_extent.second);
+ }
+ ldout(cct, 20) << "object_interval=" << object_interval << dendl;
+
+ // loop through all expected snapshots and build interval sets for
+ // data and zeroed ranges for each snapshot
+ uint64_t prev_end_size = 0;
+ interval_set<uint64_t> initial_written_extents;
+ for (auto end_snap_id : m_snap_ids) {
+ if (start_snap_id == end_snap_id) {
+ continue;
+ } else if (end_snap_id > last_snap_id) {
+ break;
+ }
+
+ interval_set<uint64_t> diff;
+ uint64_t end_size;
+ bool exists;
+ librados::snap_t clone_end_snap_id;
+ bool read_whole_object;
+ calc_snap_set_diff(cct, snap_set, start_snap_id,
+ end_snap_id, &diff, &end_size, &exists,
+ &clone_end_snap_id, &read_whole_object);
+
+ if (read_whole_object ||
+ (!diff.empty() &&
+ ((m_list_snaps_flags & LIST_SNAPS_FLAG_WHOLE_OBJECT) != 0))) {
+ ldout(cct, 1) << "need to read full object" << dendl;
+ diff.clear();
+ diff.insert(0, image_ctx->layout.object_size);
+ end_size = image_ctx->layout.object_size;
+ clone_end_snap_id = end_snap_id;
+ } else if (!exists) {
+ end_size = 0;
+ }
+
+ if (exists) {
+ // reads should be issued against the newest (existing) snapshot within
+ // the associated snapshot object clone. writes should be issued
+ // against the oldest snapshot in the snap_map.
+ ceph_assert(clone_end_snap_id >= end_snap_id);
+ if (clone_end_snap_id > last_snap_id) {
+ // do not read past the copy point snapshot
+ clone_end_snap_id = last_snap_id;
+ }
+ }
+
+ // clip diff to current object extent
+ interval_set<uint64_t> diff_interval;
+ diff_interval.intersection_of(object_interval, diff);
+
+ // clip diff to size of object (in case it was truncated)
+ interval_set<uint64_t> zero_interval;
+ if (end_size < prev_end_size) {
+ zero_interval.insert(end_size, prev_end_size - end_size);
+ zero_interval.intersection_of(object_interval);
+
+ interval_set<uint64_t> trunc_interval;
+ trunc_interval.intersection_of(zero_interval, diff_interval);
+ if (!trunc_interval.empty()) {
+ diff_interval.subtract(trunc_interval);
+ ldout(cct, 20) << "clearing truncate diff: " << trunc_interval << dendl;
+ }
+ }
+
+ ldout(cct, 20) << "start_snap_id=" << start_snap_id << ", "
+ << "end_snap_id=" << end_snap_id << ", "
+ << "clone_end_snap_id=" << clone_end_snap_id << ", "
+ << "diff=" << diff << ", "
+ << "diff_interval=" << diff_interval<< ", "
+ << "zero_interval=" << zero_interval<< ", "
+ << "end_size=" << end_size << ", "
+ << "prev_end_size=" << prev_end_size << ", "
+ << "exists=" << exists << ", "
+ << "whole_object=" << read_whole_object << dendl;
+
+    // check if the object exists prior to the start of the incremental snap
+    // delta so that we don't mark the object DNE when no additional deltas
+    // exist
+ if (exists && start_snap_id == 0 &&
+ (!diff_interval.empty() || !zero_interval.empty())) {
+ ldout(cct, 20) << "object exists at snap id " << end_snap_id << dendl;
+ initial_extents_written = true;
+ }
+
+ prev_end_size = end_size;
+ start_snap_id = end_snap_id;
+
+ if (end_snap_id <= first_snap_id) {
+      // don't include deltas from the starting snapshot, but iterate over it
+      // to track its existence and size
+      ldout(cct, 20) << "skipping prior snapshot" << dendl;
+ continue;
+ }
+
+ if (exists) {
+ for (auto& interval : diff_interval) {
+ snapshot_delta[{end_snap_id, clone_end_snap_id}].insert(
+ interval.first, interval.second,
+ SparseExtent(SPARSE_EXTENT_STATE_DATA, interval.second));
+ }
+ } else {
+ zero_interval.union_of(diff_interval);
+ }
+
+ if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) {
+ for (auto& interval : zero_interval) {
+ snapshot_delta[{end_snap_id, end_snap_id}].insert(
+ interval.first, interval.second,
+ SparseExtent(SPARSE_EXTENT_STATE_ZEROED, interval.second));
+ }
+ }
+ }
+
+ bool snapshot_delta_empty = snapshot_delta.empty();
+ if (!initial_extents_written) {
+ zero_extent(first_snap_id, first_snap_id > 0);
+ }
+ ldout(cct, 20) << "snapshot_delta=" << snapshot_delta << dendl;
+
+ if (snapshot_delta_empty) {
+ list_from_parent();
+ return;
+ }
+
+ this->finish(0);
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::list_from_parent() {
+ I *image_ctx = this->m_ictx;
+ auto cct = image_ctx->cct;
+
+ ceph_assert(!m_snap_ids.empty());
+ librados::snap_t snap_id_start = *m_snap_ids.begin();
+ librados::snap_t snap_id_end = *m_snap_ids.rbegin();
+
+ std::unique_lock image_locker{image_ctx->image_lock};
+ if ((snap_id_start > 0) || (image_ctx->parent == nullptr) ||
+ ((m_list_snaps_flags & LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT) != 0)) {
+ image_locker.unlock();
+
+ this->finish(0);
+ return;
+ }
+
+ // calculate reverse mapping onto the parent image
+ Extents parent_image_extents;
+ for (auto [object_off, object_len]: m_object_extents) {
+ io::util::extent_to_file(image_ctx, this->m_object_no, object_off,
+ object_len, parent_image_extents);
+ }
+
+ uint64_t parent_overlap = 0;
+ uint64_t object_overlap = 0;
+ int r = image_ctx->get_parent_overlap(snap_id_end, &parent_overlap);
+ if (r == 0) {
+ object_overlap = image_ctx->prune_parent_extents(parent_image_extents,
+ parent_overlap);
+ }
+
+ if (object_overlap == 0) {
+ image_locker.unlock();
+
+ this->finish(0);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ ObjectListSnapsRequest<I>,
+ &ObjectListSnapsRequest<I>::handle_list_from_parent>(this);
+ auto aio_comp = AioCompletion::create_and_start(
+ ctx, librbd::util::get_image_ctx(image_ctx->parent), AIO_TYPE_GENERIC);
+ ldout(cct, 20) << "aio_comp=" << aio_comp<< ", "
+ << "parent_image_extents " << parent_image_extents << dendl;
+
+ auto list_snaps_flags = (
+ m_list_snaps_flags | LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS);
+
+ ImageListSnapsRequest<I> req(
+ *image_ctx->parent, aio_comp, std::move(parent_image_extents),
+ {0, image_ctx->parent->snap_id}, list_snaps_flags, &m_parent_snapshot_delta,
+ this->m_trace);
+ req.send();
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::handle_list_from_parent(int r) {
+ I *image_ctx = this->m_ictx;
+ auto cct = image_ctx->cct;
+
+ ldout(cct, 20) << "r=" << r << ", "
+ << "parent_snapshot_delta=" << m_parent_snapshot_delta
+ << dendl;
+
+  // special case: the parent reported no data extents (zeroed extents are
+  // ignored), so there is nothing to merge
+ if (m_parent_snapshot_delta.empty()) {
+ this->finish(0);
+ return;
+ }
+
+  // the write/read snapshot id key is not useful for parent images, so map
+  // to the special-case INITIAL_WRITE_READ_SNAP_IDS key
+ *m_snapshot_delta = {};
+ auto& intervals = (*m_snapshot_delta)[INITIAL_WRITE_READ_SNAP_IDS];
+ for (auto& [key, image_extents] : m_parent_snapshot_delta) {
+ for (auto image_extent : image_extents) {
+ auto state = image_extent.get_val().state;
+
+ // map image-extents back to this object
+ striper::LightweightObjectExtents object_extents;
+ io::util::file_to_extents(image_ctx, image_extent.get_off(),
+ image_extent.get_len(), 0, &object_extents);
+ for (auto& object_extent : object_extents) {
+ ceph_assert(object_extent.object_no == this->m_object_no);
+ intervals.insert(
+ object_extent.offset, object_extent.length,
+ {state, object_extent.length});
+ }
+ }
+ }
+
+ ldout(cct, 20) << "snapshot_delta=" << *m_snapshot_delta << dendl;
+ this->finish(0);
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::zero_extent(uint64_t snap_id, bool dne) {
+ I *image_ctx = this->m_ictx;
+ auto cct = image_ctx->cct;
+
+ // the object does not exist or is (partially) under whiteout -- mark the
+ // missing extents which would be any portion of the object that does not
+ // have data in the initial snapshot set
+ if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) {
+ interval_set<uint64_t> interval;
+ for (auto [object_offset, object_length] : m_object_extents) {
+ interval.insert(object_offset, object_length);
+ }
+
+ for (auto [offset, length] : interval) {
+ ldout(cct, 20) << "snapshot " << snap_id << ": "
+ << (dne ? "DNE" : "zeroed") << " extent "
+ << offset << "~" << length << dendl;
+ (*m_snapshot_delta)[{snap_id, snap_id}].insert(
+ offset, length,
+ SparseExtent(
+ (dne ? SPARSE_EXTENT_STATE_DNE : SPARSE_EXTENT_STATE_ZEROED),
+ length));
+ }
+ }
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
+template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectRequest.h b/src/librbd/io/ObjectRequest.h
new file mode 100644
index 000000000..89ca224cc
--- /dev/null
+++ b/src/librbd/io/ObjectRequest.h
@@ -0,0 +1,502 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_REQUEST_H
+#define CEPH_LIBRBD_IO_OBJECT_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/neorados/RADOS.hpp"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+class Context;
+class ObjectExtent;
+
+namespace neorados { struct WriteOp; }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> class CopyupRequest;
+
+/**
+ * This class represents an I/O operation to a single RBD data object.
+ * Its subclasses encapsulate logic for dealing with special cases
+ * for I/O due to layering.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectRequest {
+public:
+ static ObjectRequest* create_write(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, IOContext io_context, int op_flags,
+ int write_flags, std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, Context *completion);
+ static ObjectRequest* create_discard(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, Context *completion);
+ static ObjectRequest* create_write_same(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, ceph::bufferlist&& data, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, Context *completion);
+ static ObjectRequest* create_compare_and_write(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+ IOContext io_context, uint64_t *mismatch_offset, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *completion);
+
+ ObjectRequest(ImageCtxT *ictx, uint64_t objectno, IOContext io_context,
+ const char *trace_name, const ZTracer::Trace &parent_trace,
+ Context *completion);
+ virtual ~ObjectRequest() {
+ m_trace.event("finish");
+ }
+
+ static void add_write_hint(ImageCtxT& image_ctx,
+ neorados::WriteOp *wr);
+
+ virtual void send() = 0;
+
+ bool has_parent() const {
+ return m_has_parent;
+ }
+
+ virtual const char *get_op_type() const = 0;
+
+protected:
+ bool compute_parent_extents(Extents *parent_extents, bool read_request);
+
+ ImageCtxT *m_ictx;
+ uint64_t m_object_no;
+ IOContext m_io_context;
+ Context *m_completion;
+ ZTracer::Trace m_trace;
+
+ void async_finish(int r);
+ void finish(int r);
+
+private:
+ bool m_has_parent = false;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectReadRequest : public ObjectRequest<ImageCtxT> {
+public:
+ static ObjectReadRequest* create(
+ ImageCtxT *ictx, uint64_t objectno, ReadExtents* extents,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* version,
+ Context *completion) {
+ return new ObjectReadRequest(ictx, objectno, extents, io_context, op_flags,
+ read_flags, parent_trace, version, completion);
+ }
+
+ ObjectReadRequest(
+ ImageCtxT *ictx, uint64_t objectno, ReadExtents* extents,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* version,
+ Context *completion);
+
+ void send() override;
+
+ const char *get_op_type() const override {
+ return "read";
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |
+ * v
+ * READ_OBJECT
+ * |
+ * v (skip if not needed)
+ * READ_PARENT
+ * |
+ * v (skip if not needed)
+ * COPYUP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ReadExtents* m_extents;
+ int m_op_flags;
+ int m_read_flags;
+ uint64_t* m_version;
+
+ void read_object();
+ void handle_read_object(int r);
+
+ void read_parent();
+ void handle_read_parent(int r);
+
+ void copyup();
+};
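The state diagram above is realized as a chain of callbacks: each handle_* method decides whether to fall through to the parent read or the copy-up step. A minimal sketch of that control-flow shape only, with std::function standing in for Ceph's Context and the actual reads stubbed out:

```cpp
// Sketch of the callback-chained state machine shape used by
// ObjectReadRequest (READ_OBJECT -> READ_PARENT -> COPYUP -> finish).
// The "reads" are stubs; only the transition structure is illustrated.
#include <cerrno>
#include <functional>
#include <iostream>

class ReadFlow {
 public:
  explicit ReadFlow(std::function<void(int)> on_finish)
      : m_on_finish(std::move(on_finish)) {}

  void send() { read_object(); }

 private:
  std::function<void(int)> m_on_finish;

  void read_object() {
    int r = -ENOENT;  // stub: object does not exist in the child image
    if (r == -ENOENT) {
      read_parent();  // fall through to the parent image
      return;
    }
    finish(r);
  }

  void read_parent() {
    int r = 0;  // stub: parent satisfied the read
    if (r >= 0 /* && copyup required */) {
      copyup();
      return;
    }
    finish(r);
  }

  void copyup() {
    // stub: schedule a copy-up of the parent data into the child object
    finish(0);
  }

  void finish(int r) { m_on_finish(r); }
};

int main() {
  ReadFlow flow([](int r) { std::cout << "finished r=" << r << "\n"; });
  flow.send();
}
```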
+
+template <typename ImageCtxT = ImageCtx>
+class AbstractObjectWriteRequest : public ObjectRequest<ImageCtxT> {
+public:
+ AbstractObjectWriteRequest(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, uint64_t len,
+ IOContext io_context, const char *trace_name,
+ const ZTracer::Trace &parent_trace, Context *completion);
+
+ virtual bool is_empty_write_op() const {
+ return false;
+ }
+
+ virtual uint8_t get_pre_write_object_map_state() const {
+ return OBJECT_EXISTS;
+ }
+
+ virtual void add_copyup_ops(neorados::WriteOp *wr) {
+ add_write_ops(wr);
+ }
+
+ void handle_copyup(int r);
+
+ void send() override;
+
+protected:
+ uint64_t m_object_off;
+ uint64_t m_object_len;
+ bool m_full_object = false;
+ bool m_copyup_enabled = true;
+
+ virtual bool is_no_op_for_nonexistent_object() const {
+ return false;
+ }
+ virtual bool is_object_map_update_enabled() const {
+ return true;
+ }
+ virtual bool is_post_copyup_write_required() const {
+ return false;
+ }
+ virtual bool is_non_existent_post_write_object_map_state() const {
+ return false;
+ }
+
+ virtual void add_write_hint(neorados::WriteOp *wr);
+ virtual void add_write_ops(neorados::WriteOp *wr) = 0;
+
+ virtual int filter_write_result(int r) const {
+ return r;
+ }
+
+ virtual Extents get_copyup_overwrite_extents() const {
+ return {{m_object_off, m_object_len}};
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (no-op write request)
+ * DETECT_NO_OP . . . . . . . . . . . . . . . . . . .
+ * | .
+ * v (skip if not required/disabled) .
+ * PRE_UPDATE_OBJECT_MAP .
+ * | . .
+ * | . (child dne) .
+ * | . . . . . . . . . .
+ * | . .
+ * | (post-copyup write) . .
+ * | . . . . . . . . . . . . . .
+ * | . . . .
+ * v v . v .
+ * WRITE . . . . . . . . > COPYUP (if required) .
+ * | | .
+ * |/----------------------/ .
+ * | .
+ * v (skip if not required/disabled) .
+ * POST_UPDATE_OBJECT_MAP .
+ * | .
+ * v .
+ * <finish> < . . . . . . . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ Extents m_parent_extents;
+ bool m_object_may_exist = false;
+ bool m_copyup_in_progress = false;
+ bool m_guarding_migration_write = false;
+
+ void compute_parent_info();
+
+ void pre_write_object_map_update();
+ void handle_pre_write_object_map_update(int r);
+
+ void write_object();
+ void handle_write_object(int r);
+
+ void copyup();
+
+ void post_write_object_map_update();
+ void handle_post_write_object_map_update(int r);
+
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+ ObjectWriteRequest(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, IOContext io_context, int op_flags,
+ int write_flags, std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, Context *completion)
+ : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+ data.length(), io_context, "write",
+ parent_trace, completion),
+ m_write_data(std::move(data)), m_op_flags(op_flags),
+ m_write_flags(write_flags), m_assert_version(assert_version) {
+ }
+
+ bool is_empty_write_op() const override {
+ return (m_write_data.length() == 0);
+ }
+
+ const char *get_op_type() const override {
+ return "write";
+ }
+
+protected:
+ void add_write_ops(neorados::WriteOp *wr) override;
+ void add_write_hint(neorados::WriteOp *wr) override;
+
+private:
+ ceph::bufferlist m_write_data;
+ int m_op_flags;
+ int m_write_flags;
+ std::optional<uint64_t> m_assert_version;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectDiscardRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+ ObjectDiscardRequest(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, Context *completion)
+ : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+ object_len, io_context, "discard",
+ parent_trace, completion),
+ m_discard_flags(discard_flags) {
+ if (this->m_full_object) {
+ if ((m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE) != 0 &&
+ this->has_parent()) {
+ if (!this->m_copyup_enabled) {
+          // need to hide the parent object instead of the child object
+ m_discard_action = DISCARD_ACTION_REMOVE_TRUNCATE;
+ } else {
+ m_discard_action = DISCARD_ACTION_TRUNCATE;
+ }
+ } else {
+ m_discard_action = DISCARD_ACTION_REMOVE;
+ }
+ } else if (object_off + object_len == ictx->layout.object_size) {
+ m_discard_action = DISCARD_ACTION_TRUNCATE;
+ } else {
+ m_discard_action = DISCARD_ACTION_ZERO;
+ }
+ }
+
+ const char* get_op_type() const override {
+ switch (m_discard_action) {
+ case DISCARD_ACTION_REMOVE:
+ return "remove";
+ case DISCARD_ACTION_REMOVE_TRUNCATE:
+ return "remove (create+truncate)";
+ case DISCARD_ACTION_TRUNCATE:
+ return "truncate";
+ case DISCARD_ACTION_ZERO:
+ return "zero";
+ }
+ ceph_abort();
+ return nullptr;
+ }
+
+ uint8_t get_pre_write_object_map_state() const override {
+ if (m_discard_action == DISCARD_ACTION_REMOVE) {
+ return OBJECT_PENDING;
+ }
+ return OBJECT_EXISTS;
+ }
+
+protected:
+ bool is_no_op_for_nonexistent_object() const override {
+ return (!this->has_parent());
+ }
+ bool is_object_map_update_enabled() const override {
+ return (
+ (m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE) == 0);
+ }
+ bool is_non_existent_post_write_object_map_state() const override {
+ return (m_discard_action == DISCARD_ACTION_REMOVE);
+ }
+
+ void add_write_hint(neorados::WriteOp *wr) override {
+ // no hint for discard
+ }
+
+ void add_write_ops(neorados::WriteOp *wr) override;
+
+private:
+ enum DiscardAction {
+ DISCARD_ACTION_REMOVE,
+ DISCARD_ACTION_REMOVE_TRUNCATE,
+ DISCARD_ACTION_TRUNCATE,
+ DISCARD_ACTION_ZERO
+ };
+
+ DiscardAction m_discard_action;
+ int m_discard_flags;
+
+};
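The constructor above reduces a discard to one of four actions based on the extent geometry and flags. The same decision as a standalone sketch, with the librbd members flattened into plain parameters; the flag bit value here is hypothetical:

```cpp
// Sketch of the discard-action selection performed in the
// ObjectDiscardRequest constructor above, as a pure function.
#include <cstdint>
#include <iostream>

enum class DiscardAction { Remove, RemoveTruncate, Truncate, Zero };

constexpr int DISABLE_CLONE_REMOVE = 1 << 0;  // hypothetical bit value

DiscardAction choose_discard_action(uint64_t object_off, uint64_t object_len,
                                    uint64_t object_size, int discard_flags,
                                    bool has_parent, bool copyup_enabled) {
  if (object_off == 0 && object_len == object_size) {  // full-object discard
    if ((discard_flags & DISABLE_CLONE_REMOVE) != 0 && has_parent) {
      // cannot simply remove: that would expose the parent's data again
      return copyup_enabled ? DiscardAction::Truncate
                            : DiscardAction::RemoveTruncate;
    }
    return DiscardAction::Remove;
  }
  if (object_off + object_len == object_size) {
    return DiscardAction::Truncate;  // tail of the object: shrink it
  }
  return DiscardAction::Zero;        // interior range: zero in place
}

int main() {
  auto a = choose_discard_action(0, 4 << 20, 4 << 20, 0, false, true);
  std::cout << (a == DiscardAction::Remove) << "\n";  // 1: full object, no clone
}
```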
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectWriteSameRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+ ObjectWriteSameRequest(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, ceph::bufferlist&& data, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, Context *completion)
+ : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+ object_len, io_context, "writesame",
+ parent_trace, completion),
+ m_write_data(std::move(data)), m_op_flags(op_flags) {
+ }
+
+ const char *get_op_type() const override {
+ return "writesame";
+ }
+
+protected:
+ void add_write_ops(neorados::WriteOp *wr) override;
+
+private:
+ ceph::bufferlist m_write_data;
+ int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectCompareAndWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+ ObjectCompareAndWriteRequest(
+ ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& cmp_bl, ceph::bufferlist&& write_bl,
+ IOContext io_context, uint64_t *mismatch_offset, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *completion)
+ : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+ cmp_bl.length(), io_context,
+ "compare_and_write", parent_trace,
+ completion),
+ m_cmp_bl(std::move(cmp_bl)), m_write_bl(std::move(write_bl)),
+ m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+ }
+
+ const char *get_op_type() const override {
+ return "compare_and_write";
+ }
+
+ void add_copyup_ops(neorados::WriteOp *wr) override {
+ // no-op on copyup
+ }
+
+protected:
+  bool is_post_copyup_write_required() const override {
+ return true;
+ }
+
+ void add_write_ops(neorados::WriteOp *wr) override;
+
+ int filter_write_result(int r) const override;
+
+ Extents get_copyup_overwrite_extents() const override {
+ return {};
+ }
+
+private:
+ ceph::bufferlist m_cmp_bl;
+ ceph::bufferlist m_write_bl;
+ uint64_t *m_mismatch_offset;
+ int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectListSnapsRequest : public ObjectRequest<ImageCtxT> {
+public:
+ static ObjectListSnapsRequest* create(
+ ImageCtxT *ictx, uint64_t objectno, Extents&& object_extents,
+ SnapIds&& snap_ids, int list_snaps_flags,
+ const ZTracer::Trace &parent_trace, SnapshotDelta* snapshot_delta,
+ Context *completion) {
+ return new ObjectListSnapsRequest(ictx, objectno,
+ std::move(object_extents),
+ std::move(snap_ids), list_snaps_flags,
+ parent_trace, snapshot_delta, completion);
+ }
+
+ ObjectListSnapsRequest(
+ ImageCtxT *ictx, uint64_t objectno, Extents&& object_extents,
+ SnapIds&& snap_ids, int list_snaps_flags,
+ const ZTracer::Trace &parent_trace, SnapshotDelta* snapshot_delta,
+ Context *completion);
+
+ void send() override;
+
+ const char *get_op_type() const override {
+ return "snap_list";
+ }
+
+private:
+ Extents m_object_extents;
+ SnapIds m_snap_ids;
+ int m_list_snaps_flags;
+ SnapshotDelta* m_snapshot_delta;
+
+ neorados::SnapSet m_snap_set;
+ boost::system::error_code m_ec;
+
+ SnapshotDelta m_parent_snapshot_delta;
+
+ void list_snaps();
+ void handle_list_snaps(int r);
+
+ void list_from_parent();
+ void handle_list_from_parent(int r);
+
+ void zero_extent(uint64_t snap_id, bool dne);
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
+extern template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_REQUEST_H
diff --git a/src/librbd/io/QosImageDispatch.cc b/src/librbd/io/QosImageDispatch.cc
new file mode 100644
index 000000000..9ca88ac19
--- /dev/null
+++ b/src/librbd/io/QosImageDispatch.cc
@@ -0,0 +1,305 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/QosImageDispatch.h"
+#include "common/dout.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/FlushTracker.h"
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::QosImageDispatch: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+uint64_t get_extent_length(const Extents& extents) {
+ uint64_t length = 0;
+ for (auto& extent : extents) {
+ length += extent.second;
+ }
+ return length;
+}
+
+uint64_t calculate_tokens(bool read_op, uint64_t extent_length, uint64_t flag) {
+ if (read_op && ((flag & IMAGE_DISPATCH_FLAG_QOS_WRITE_MASK) != 0)) {
+ return 0;
+ } else if (!read_op && ((flag & IMAGE_DISPATCH_FLAG_QOS_READ_MASK) != 0)) {
+ return 0;
+ }
+
+ return (((flag & IMAGE_DISPATCH_FLAG_QOS_BPS_MASK) != 0) ? extent_length : 1);
+}
+
+static std::map<uint64_t, std::string> throttle_flags = {
+ {IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE, "rbd_qos_iops_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE, "rbd_qos_bps_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE, "rbd_qos_read_iops_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE, "rbd_qos_write_iops_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE, "rbd_qos_read_bps_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE, "rbd_qos_write_bps_throttle" }
+};
+
+} // anonymous namespace
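For each throttle, calculate_tokens() charges either the byte length (BPS throttles) or a single token (IOPS throttles), and charges nothing when the throttle applies to the opposite I/O direction. A self-contained sketch with hypothetical flag bit assignments (librbd defines its own values and masks):

```cpp
// Sketch: how a request is charged against each QoS throttle type.
// The bit assignments below are illustrative, not librbd's real values.
#include <cstdint>
#include <iostream>

constexpr uint64_t QOS_IOPS       = 1 << 0;  // both directions, per-op
constexpr uint64_t QOS_BPS        = 1 << 1;  // both directions, per-byte
constexpr uint64_t QOS_READ_IOPS  = 1 << 2;  // read-only, per-op
constexpr uint64_t QOS_WRITE_BPS  = 1 << 3;  // write-only, per-byte

constexpr uint64_t QOS_READ_MASK  = QOS_READ_IOPS;
constexpr uint64_t QOS_WRITE_MASK = QOS_WRITE_BPS;
constexpr uint64_t QOS_BPS_MASK   = QOS_BPS | QOS_WRITE_BPS;

uint64_t calculate_tokens(bool read_op, uint64_t extent_length,
                          uint64_t flag) {
  if (read_op && (flag & QOS_WRITE_MASK) != 0) return 0;  // wrong direction
  if (!read_op && (flag & QOS_READ_MASK) != 0) return 0;  // wrong direction
  return (flag & QOS_BPS_MASK) != 0 ? extent_length : 1;  // bytes vs one op
}

int main() {
  // a 64 KiB read charged against three throttle types
  std::cout << calculate_tokens(true, 65536, QOS_IOPS) << "\n";      // 1
  std::cout << calculate_tokens(true, 65536, QOS_BPS) << "\n";       // 65536
  std::cout << calculate_tokens(true, 65536, QOS_WRITE_BPS) << "\n"; // 0
}
```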
+
+template <typename I>
+QosImageDispatch<I>::QosImageDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx), m_flush_tracker(new FlushTracker<I>(image_ctx)) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+
+ SafeTimer *timer;
+ ceph::mutex *timer_lock;
+ ImageCtx::get_timer_instance(cct, &timer, &timer_lock);
+ for (auto flag : throttle_flags) {
+ m_throttles.push_back(make_pair(
+ flag.first,
+ new TokenBucketThrottle(cct, flag.second, 0, 0, timer, timer_lock)));
+ }
+}
+
+template <typename I>
+QosImageDispatch<I>::~QosImageDispatch() {
+ for (auto t : m_throttles) {
+ delete t.second;
+ }
+ delete m_flush_tracker;
+}
+
+template <typename I>
+void QosImageDispatch<I>::shut_down(Context* on_finish) {
+ m_flush_tracker->shut_down();
+ on_finish->complete(0);
+}
+
+template <typename I>
+void QosImageDispatch<I>::apply_qos_schedule_tick_min(uint64_t tick) {
+ for (auto pair : m_throttles) {
+ pair.second->set_schedule_tick_min(tick);
+ }
+}
+
+template <typename I>
+void QosImageDispatch<I>::apply_qos_limit(uint64_t flag, uint64_t limit,
+ uint64_t burst, uint64_t burst_seconds) {
+ auto cct = m_image_ctx->cct;
+ TokenBucketThrottle *throttle = nullptr;
+ for (auto pair : m_throttles) {
+ if (flag == pair.first) {
+ throttle = pair.second;
+ break;
+ }
+ }
+ ceph_assert(throttle != nullptr);
+
+ int r = throttle->set_limit(limit, burst, burst_seconds);
+ if (r < 0) {
+ lderr(cct) << throttle->get_name() << ": invalid qos parameter: "
+ << "burst(" << burst << ") is less than "
+ << "limit(" << limit << ")" << dendl;
+    // if applying the burst failed, at least make sure the base limit works.
+ throttle->set_limit(limit, 0, 1);
+ }
+
+ if (limit) {
+ m_qos_enabled_flag |= flag;
+ } else {
+ m_qos_enabled_flag &= ~flag;
+ }
+}
+
+template <typename I>
+bool QosImageDispatch<I>::read(
+ AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_throttle(true, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_throttle(false, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_throttle(false, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_throttle(false, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_throttle(false, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+ m_flush_tracker->flush(on_dispatched);
+ return true;
+}
+
+template <typename I>
+void QosImageDispatch<I>::handle_finished(int r, uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ m_flush_tracker->finish_io(tid);
+}
+
+template <typename I>
+bool QosImageDispatch<I>::set_throttle_flag(
+ std::atomic<uint32_t>* image_dispatch_flags, uint32_t flag) {
+ uint32_t expected = image_dispatch_flags->load();
+ uint32_t desired;
+ do {
+ desired = expected | flag;
+ } while (!image_dispatch_flags->compare_exchange_weak(expected, desired));
+
+ return ((desired & IMAGE_DISPATCH_FLAG_QOS_MASK) ==
+ IMAGE_DISPATCH_FLAG_QOS_MASK);
+}
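set_throttle_flag() relies on a lock-free compare-exchange loop: every throttle callback ORs in its own bit, and exactly one caller observes the QoS mask become complete, which is the signal to dispatch the request. A minimal sketch of the pattern with a three-bit example mask:

```cpp
// Sketch of the lock-free "last setter dispatches" pattern used by
// set_throttle_flag(): each participant ORs in its bit with a CAS loop,
// and only the caller whose update completes the mask returns true.
#include <atomic>
#include <cstdint>
#include <iostream>

constexpr uint32_t FULL_MASK = 0b111;  // three cooperating flags (example)

bool set_flag(std::atomic<uint32_t>* flags, uint32_t flag) {
  uint32_t expected = flags->load();
  uint32_t desired;
  do {
    desired = expected | flag;
  } while (!flags->compare_exchange_weak(expected, desired));
  return (desired & FULL_MASK) == FULL_MASK;
}

int main() {
  std::atomic<uint32_t> flags{0};
  std::cout << set_flag(&flags, 0b001) << "\n";  // 0: mask incomplete
  std::cout << set_flag(&flags, 0b010) << "\n";  // 0: mask incomplete
  std::cout << set_flag(&flags, 0b100) << "\n";  // 1: this call completed it
}
```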
+
+template <typename I>
+bool QosImageDispatch<I>::needs_throttle(
+ bool read_op, const Extents& image_extents, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ auto extent_length = get_extent_length(image_extents);
+ bool all_qos_flags_set = false;
+
+ if (!read_op) {
+ m_flush_tracker->start_io(tid);
+ *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) {
+ handle_finished(r, tid);
+ on_finish->complete(r);
+ });
+ }
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+
+ auto qos_enabled_flag = m_qos_enabled_flag;
+ for (auto [flag, throttle] : m_throttles) {
+ if ((qos_enabled_flag & flag) == 0) {
+ all_qos_flags_set = set_throttle_flag(image_dispatch_flags, flag);
+ continue;
+ }
+
+ auto tokens = calculate_tokens(read_op, extent_length, flag);
+ if (tokens > 0 &&
+ throttle->get(tokens, this, &QosImageDispatch<I>::handle_throttle_ready,
+ Tag{image_dispatch_flags, on_dispatched}, flag)) {
+ ldout(cct, 15) << "on_dispatched=" << on_dispatched << ", "
+ << "flag=" << flag << dendl;
+ all_qos_flags_set = false;
+ } else {
+ all_qos_flags_set = set_throttle_flag(image_dispatch_flags, flag);
+ }
+ }
+ return !all_qos_flags_set;
+}
+
+template <typename I>
+void QosImageDispatch<I>::handle_throttle_ready(Tag&& tag, uint64_t flag) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 15) << "on_dispatched=" << tag.on_dispatched << ", "
+ << "flag=" << flag << dendl;
+
+ if (set_throttle_flag(tag.image_dispatch_flags, flag)) {
+ // timer_lock is held -- so dispatch from outside the timer thread
+ m_image_ctx->asio_engine->post(tag.on_dispatched, 0);
+ }
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::QosImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/QosImageDispatch.h b/src/librbd/io/QosImageDispatch.h
new file mode 100644
index 000000000..baf16da02
--- /dev/null
+++ b/src/librbd/io/QosImageDispatch.h
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include <list>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> class FlushTracker;
+
+template <typename ImageCtxT>
+class QosImageDispatch : public ImageDispatchInterface {
+public:
+ struct Tag {
+ std::atomic<uint32_t>* image_dispatch_flags;
+ Context* on_dispatched;
+
+ Tag(std::atomic<uint32_t>* image_dispatch_flags, Context* on_dispatched)
+ : image_dispatch_flags(image_dispatch_flags),
+ on_dispatched(on_dispatched) {
+ }
+ };
+
+ QosImageDispatch(ImageCtxT* image_ctx);
+ ~QosImageDispatch() override;
+
+ ImageDispatchLayer get_dispatch_layer() const override {
+ return IMAGE_DISPATCH_LAYER_QOS;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ void apply_qos_schedule_tick_min(uint64_t tick);
+ void apply_qos_limit(uint64_t flag, uint64_t limit, uint64_t burst,
+ uint64_t burst_seconds);
+
+ bool read(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+private:
+ ImageCtxT* m_image_ctx;
+
+ std::list<std::pair<uint64_t, TokenBucketThrottle*> > m_throttles;
+ uint64_t m_qos_enabled_flag = 0;
+
+ FlushTracker<ImageCtxT>* m_flush_tracker;
+
+ void handle_finished(int r, uint64_t tid);
+
+ bool set_throttle_flag(std::atomic<uint32_t>* image_dispatch_flags,
+ uint32_t flag);
+ bool needs_throttle(bool read_op, const Extents& image_extents, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched);
+ void handle_throttle_ready(Tag&& tag, uint64_t flag);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::QosImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/QueueImageDispatch.cc b/src/librbd/io/QueueImageDispatch.cc
new file mode 100644
index 000000000..e80d39e44
--- /dev/null
+++ b/src/librbd/io/QueueImageDispatch.cc
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/QueueImageDispatch.h"
+#include "common/dout.h"
+#include "common/Cond.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/FlushTracker.h"
+#include "librbd/io/ImageDispatchSpec.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::QueueImageDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+QueueImageDispatch<I>::QueueImageDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx), m_flush_tracker(new FlushTracker<I>(image_ctx)) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+}
+
+template <typename I>
+QueueImageDispatch<I>::~QueueImageDispatch() {
+ delete m_flush_tracker;
+}
+
+template <typename I>
+void QueueImageDispatch<I>::shut_down(Context* on_finish) {
+ m_flush_tracker->shut_down();
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::read(
+ AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(true, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(false, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(false, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(false, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(false, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+ m_flush_tracker->flush(on_dispatched);
+ return true;
+}
+
+template <typename I>
+void QueueImageDispatch<I>::handle_finished(int r, uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ m_flush_tracker->finish_io(tid);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::enqueue(
+ bool read_op, uint64_t tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (!m_image_ctx->non_blocking_aio) {
+ return false;
+ }
+
+ if (!read_op) {
+ m_flush_tracker->start_io(tid);
+ *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) {
+ handle_finished(r, tid);
+ on_finish->complete(r);
+ });
+ }
+
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+ m_image_ctx->asio_engine->post(on_dispatched, 0);
+ return true;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::QueueImageDispatch<librbd::ImageCtx>;
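The key behavior in enqueue() above is that the continuation is never run inline: it is posted to the image's AsioEngine, so the thread calling the librbd API returns immediately. A rough sketch of that hand-off, with a toy single-threaded queue standing in for the asio engine:

```cpp
// Sketch of the non-blocking hand-off performed by enqueue(): the
// continuation is posted to an executor rather than run inline.
// MiniExecutor is a toy stand-in for librbd's AsioEngine.
#include <functional>
#include <iostream>
#include <queue>
#include <utility>

class MiniExecutor {
 public:
  void post(std::function<void(int)> fn, int r) {
    m_queue.push([fn = std::move(fn), r] { fn(r); });
  }
  void run() {  // drained later, e.g. by a dedicated I/O thread
    while (!m_queue.empty()) {
      m_queue.front()();
      m_queue.pop();
    }
  }
 private:
  std::queue<std::function<void()>> m_queue;
};

bool enqueue(MiniExecutor& engine, bool non_blocking_aio,
             std::function<void(int)> on_dispatched) {
  if (!non_blocking_aio) {
    return false;  // let the dispatcher continue synchronously
  }
  engine.post(std::move(on_dispatched), 0);
  return true;     // dispatch resumes asynchronously
}

int main() {
  MiniExecutor engine;
  enqueue(engine, true,
          [](int r) { std::cout << "dispatched r=" << r << "\n"; });
  std::cout << "caller returned without blocking\n";
  engine.run();
}
```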
diff --git a/src/librbd/io/QueueImageDispatch.h b/src/librbd/io/QueueImageDispatch.h
new file mode 100644
index 000000000..60ee46750
--- /dev/null
+++ b/src/librbd/io/QueueImageDispatch.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include <list>
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> class FlushTracker;
+
+template <typename ImageCtxT>
+class QueueImageDispatch : public ImageDispatchInterface {
+public:
+ QueueImageDispatch(ImageCtxT* image_ctx);
+  ~QueueImageDispatch() override;
+
+ ImageDispatchLayer get_dispatch_layer() const override {
+ return IMAGE_DISPATCH_LAYER_QUEUE;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+private:
+ ImageCtxT* m_image_ctx;
+
+ FlushTracker<ImageCtxT>* m_flush_tracker;
+
+ void handle_finished(int r, uint64_t tid);
+
+ bool enqueue(bool read_op, uint64_t tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::QueueImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/ReadResult.cc b/src/librbd/io/ReadResult.cc
new file mode 100644
index 000000000..c4053fee6
--- /dev/null
+++ b/src/librbd/io/ReadResult.cc
@@ -0,0 +1,262 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ReadResult.h"
+#include "include/buffer.h"
+#include "common/dout.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/Utils.h"
+#include <boost/variant/apply_visitor.hpp>
+#include <boost/variant/static_visitor.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ReadResult: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+struct ReadResult::SetImageExtentsVisitor : public boost::static_visitor<void> {
+ Extents image_extents;
+
+ explicit SetImageExtentsVisitor(const Extents& image_extents)
+ : image_extents(image_extents) {
+ }
+
+ void operator()(Linear &linear) const {
+ uint64_t length = util::get_extents_length(image_extents);
+
+ ceph_assert(length <= linear.buf_len);
+ linear.buf_len = length;
+ }
+
+ void operator()(SparseBufferlist &sbl) const {
+ sbl.image_extents = image_extents;
+ }
+
+ template <typename T>
+ void operator()(T &t) const {
+ }
+};
+
+struct ReadResult::AssembleResultVisitor : public boost::static_visitor<void> {
+ CephContext *cct;
+ Striper::StripedReadResult &destriper;
+
+ AssembleResultVisitor(CephContext *cct, Striper::StripedReadResult &destriper)
+ : cct(cct), destriper(destriper) {
+ }
+
+ void operator()(Empty &empty) const {
+ ldout(cct, 20) << "dropping read result" << dendl;
+ }
+
+ void operator()(Linear &linear) const {
+ ldout(cct, 20) << "copying resulting bytes to "
+ << reinterpret_cast<void*>(linear.buf) << dendl;
+ destriper.assemble_result(cct, linear.buf, linear.buf_len);
+ }
+
+ void operator()(Vector &vector) const {
+ bufferlist bl;
+ destriper.assemble_result(cct, bl, true);
+
+ ldout(cct, 20) << "copying resulting " << bl.length() << " bytes to iovec "
+ << reinterpret_cast<const void*>(vector.iov) << dendl;
+
+ bufferlist::iterator it = bl.begin();
+ size_t length = bl.length();
+ size_t offset = 0;
+ int idx = 0;
+ for (; offset < length && idx < vector.iov_count; idx++) {
+ size_t len = std::min(vector.iov[idx].iov_len, length - offset);
+ it.copy(len, static_cast<char *>(vector.iov[idx].iov_base));
+ offset += len;
+ }
+ ceph_assert(offset == bl.length());
+ }
+
+ void operator()(Bufferlist &bufferlist) const {
+ bufferlist.bl->clear();
+ destriper.assemble_result(cct, *bufferlist.bl, true);
+
+ ldout(cct, 20) << "moved resulting " << bufferlist.bl->length() << " "
+ << "bytes to bl " << reinterpret_cast<void*>(bufferlist.bl)
+ << dendl;
+ }
+
+ void operator()(SparseBufferlist &sparse_bufferlist) const {
+ sparse_bufferlist.bl->clear();
+
+ ExtentMap buffer_extent_map;
+ auto buffer_extents_length = destriper.assemble_result(
+ cct, &buffer_extent_map, sparse_bufferlist.bl);
+
+ ldout(cct, 20) << "image_extents="
+ << sparse_bufferlist.image_extents << ", "
+ << "buffer_extent_map=" << buffer_extent_map << dendl;
+
+ sparse_bufferlist.extent_map->clear();
+ sparse_bufferlist.extent_map->reserve(buffer_extent_map.size());
+
+    // The extent-map is logically addressed by buffer-extents, not image-
+    // or object-extents. Translate this address mapping to image-extent
+    // logical addressing since it's tied to an image-extent read.
+ uint64_t buffer_offset = 0;
+ auto bem_it = buffer_extent_map.begin();
+ for (auto [image_offset, image_length] : sparse_bufferlist.image_extents) {
+ while (bem_it != buffer_extent_map.end()) {
+ auto [buffer_extent_offset, buffer_extent_length] = *bem_it;
+
+ if (buffer_offset + image_length <= buffer_extent_offset) {
+ // skip any image extent that is not included in the results
+ break;
+ }
+
+ // current buffer-extent should be within the current image-extent
+ ceph_assert(buffer_offset <= buffer_extent_offset &&
+ buffer_offset + image_length >=
+ buffer_extent_offset + buffer_extent_length);
+ auto image_extent_offset =
+ image_offset + (buffer_extent_offset - buffer_offset);
+ ldout(cct, 20) << "mapping buffer extent " << buffer_extent_offset
+ << "~" << buffer_extent_length << " to image extent "
+ << image_extent_offset << "~" << buffer_extent_length
+ << dendl;
+ sparse_bufferlist.extent_map->emplace_back(
+ image_extent_offset, buffer_extent_length);
+ ++bem_it;
+ }
+
+ buffer_offset += image_length;
+ }
+ ceph_assert(buffer_offset == buffer_extents_length);
+ ceph_assert(bem_it == buffer_extent_map.end());
+
+ ldout(cct, 20) << "moved resulting " << *sparse_bufferlist.extent_map
+ << " extents of total " << sparse_bufferlist.bl->length()
+ << " bytes to bl "
+ << reinterpret_cast<void*>(sparse_bufferlist.bl) << dendl;
+ }
+};
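The translation loop in the sparse-bufferlist visitor walks the buffer-extent map and the image extents in tandem: buffer space is simply the image extents laid end to end, so each buffer extent maps back to image offsets by a per-extent delta. The same walk on plain vectors, with pairs standing in for the Ceph extent types:

```cpp
// Sketch of the address translation in AssembleResultVisitor above:
// extents reported against the linear read buffer are remapped to image
// offsets by walking both lists in tandem.
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using Extent = std::pair<uint64_t, uint64_t>;  // offset, length

std::vector<Extent> buffer_to_image_extents(
    const std::vector<Extent>& image_extents,
    const std::vector<Extent>& buffer_extent_map) {
  std::vector<Extent> out;
  uint64_t buffer_offset = 0;
  auto it = buffer_extent_map.begin();
  for (auto [image_off, image_len] : image_extents) {
    while (it != buffer_extent_map.end()) {
      auto [buf_off, buf_len] = *it;
      if (buffer_offset + image_len <= buf_off) {
        break;  // buffer extent belongs to a later image extent
      }
      out.emplace_back(image_off + (buf_off - buffer_offset), buf_len);
      ++it;
    }
    buffer_offset += image_len;  // buffer = image extents laid end to end
  }
  return out;
}

int main() {
  // two image extents read into one buffer; data came back for bytes
  // 0-511 and 4096-4607 of the buffer
  auto res = buffer_to_image_extents({{1048576, 4096}, {8388608, 4096}},
                                     {{0, 512}, {4096, 512}});
  for (auto [off, len] : res) {
    std::cout << off << "~" << len << "\n";  // 1048576~512, 8388608~512
  }
}
```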
+
+ReadResult::C_ImageReadRequest::C_ImageReadRequest(
+ AioCompletion *aio_completion, uint64_t buffer_offset,
+ const Extents image_extents)
+ : aio_completion(aio_completion), buffer_offset(buffer_offset),
+ image_extents(image_extents) {
+ aio_completion->add_request();
+}
+
+void ReadResult::C_ImageReadRequest::finish(int r) {
+ CephContext *cct = aio_completion->ictx->cct;
+ ldout(cct, 10) << "C_ImageReadRequest: r=" << r
+ << dendl;
+ if (r >= 0 || (ignore_enoent && r == -ENOENT)) {
+ striper::LightweightBufferExtents buffer_extents;
+ size_t length = 0;
+ for (auto &image_extent : image_extents) {
+ buffer_extents.emplace_back(buffer_offset + length, image_extent.second);
+ length += image_extent.second;
+ }
+ ceph_assert(r == -ENOENT || length == bl.length());
+
+ aio_completion->lock.lock();
+ aio_completion->read_result.m_destriper.add_partial_result(
+ cct, std::move(bl), buffer_extents);
+ aio_completion->lock.unlock();
+ r = length;
+ }
+
+ aio_completion->complete_request(r);
+}
+
+ReadResult::C_ObjectReadRequest::C_ObjectReadRequest(
+ AioCompletion *aio_completion, ReadExtents&& extents)
+ : aio_completion(aio_completion), extents(std::move(extents)) {
+ aio_completion->add_request();
+}
+
+void ReadResult::C_ObjectReadRequest::finish(int r) {
+ CephContext *cct = aio_completion->ictx->cct;
+ ldout(cct, 10) << "C_ObjectReadRequest: r=" << r
+ << dendl;
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r >= 0) {
+ uint64_t object_len = 0;
+ aio_completion->lock.lock();
+ for (auto& extent: extents) {
+ ldout(cct, 10) << " got " << extent.extent_map
+ << " for " << extent.buffer_extents
+ << " bl " << extent.bl.length() << dendl;
+
+ aio_completion->read_result.m_destriper.add_partial_sparse_result(
+ cct, std::move(extent.bl), extent.extent_map, extent.offset,
+ extent.buffer_extents);
+
+ object_len += extent.length;
+ }
+ aio_completion->lock.unlock();
+ r = object_len;
+ }
+
+ aio_completion->complete_request(r);
+}
+
+ReadResult::C_ObjectReadMergedExtents::C_ObjectReadMergedExtents(
+ CephContext* cct, ReadExtents* extents, Context* on_finish)
+ : cct(cct), extents(extents), on_finish(on_finish) {
+}
+
+void ReadResult::C_ObjectReadMergedExtents::finish(int r) {
+ if (r >= 0) {
+ for (auto& extent: *extents) {
+ if (bl.length() < extent.length) {
+ lderr(cct) << "Merged extents length is less than expected" << dendl;
+ r = -EIO;
+ break;
+ }
+ bl.splice(0, extent.length, &extent.bl);
+ }
+ if (bl.length() != 0) {
+ lderr(cct) << "Merged extents length is greater than expected" << dendl;
+ r = -EIO;
+ }
+ }
+ on_finish->complete(r);
+}
+
+ReadResult::ReadResult() : m_buffer(Empty()) {
+}
+
+ReadResult::ReadResult(char *buf, size_t buf_len)
+ : m_buffer(Linear(buf, buf_len)) {
+}
+
+ReadResult::ReadResult(const struct iovec *iov, int iov_count)
+ : m_buffer(Vector(iov, iov_count)) {
+}
+
+ReadResult::ReadResult(ceph::bufferlist *bl)
+ : m_buffer(Bufferlist(bl)) {
+}
+
+ReadResult::ReadResult(Extents* extent_map, ceph::bufferlist* bl)
+ : m_buffer(SparseBufferlist(extent_map, bl)) {
+}
+
+void ReadResult::set_image_extents(const Extents& image_extents) {
+ boost::apply_visitor(SetImageExtentsVisitor(image_extents), m_buffer);
+}
+
+void ReadResult::assemble_result(CephContext *cct) {
+ boost::apply_visitor(AssembleResultVisitor(cct, m_destriper), m_buffer);
+}
+
+} // namespace io
+} // namespace librbd
+
diff --git a/src/librbd/io/ReadResult.h b/src/librbd/io/ReadResult.h
new file mode 100644
index 000000000..12a1e78cc
--- /dev/null
+++ b/src/librbd/io/ReadResult.h
@@ -0,0 +1,129 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_READ_RESULT_H
+#define CEPH_LIBRBD_IO_READ_RESULT_H
+
+#include "include/common_fwd.h"
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "librbd/io/Types.h"
+#include "osdc/Striper.h"
+#include <sys/uio.h>
+#include <boost/variant/variant.hpp>
+
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> struct ObjectReadRequest;
+
+class ReadResult {
+public:
+ struct C_ImageReadRequest : public Context {
+ AioCompletion *aio_completion;
+ uint64_t buffer_offset = 0;
+ Extents image_extents;
+ bufferlist bl;
+ bool ignore_enoent = false;
+
+ C_ImageReadRequest(AioCompletion *aio_completion,
+ uint64_t buffer_offset,
+ const Extents image_extents);
+
+ void finish(int r) override;
+ };
+
+ struct C_ObjectReadRequest : public Context {
+ AioCompletion *aio_completion;
+ ReadExtents extents;
+
+ C_ObjectReadRequest(AioCompletion *aio_completion, ReadExtents&& extents);
+
+ void finish(int r) override;
+ };
+
+ struct C_ObjectReadMergedExtents : public Context {
+ CephContext* cct;
+ ReadExtents* extents;
+ Context *on_finish;
+ bufferlist bl;
+
+ C_ObjectReadMergedExtents(CephContext* cct, ReadExtents* extents,
+ Context* on_finish);
+
+ void finish(int r) override;
+ };
+
+ ReadResult();
+ ReadResult(char *buf, size_t buf_len);
+ ReadResult(const struct iovec *iov, int iov_count);
+ ReadResult(ceph::bufferlist *bl);
+ ReadResult(Extents* extent_map, ceph::bufferlist* bl);
+
+ void set_image_extents(const Extents& image_extents);
+
+ void assemble_result(CephContext *cct);
+
+private:
+ struct Empty {
+ };
+
+ struct Linear {
+ char *buf;
+ size_t buf_len;
+
+ Linear(char *buf, size_t buf_len) : buf(buf), buf_len(buf_len) {
+ }
+ };
+
+ struct Vector {
+ const struct iovec *iov;
+ int iov_count;
+
+ Vector(const struct iovec *iov, int iov_count)
+ : iov(iov), iov_count(iov_count) {
+ }
+ };
+
+ struct Bufferlist {
+ ceph::bufferlist *bl;
+
+ Bufferlist(ceph::bufferlist *bl) : bl(bl) {
+ }
+ };
+
+ struct SparseBufferlist {
+ Extents *extent_map;
+ ceph::bufferlist *bl;
+
+ Extents image_extents;
+
+ SparseBufferlist(Extents* extent_map, ceph::bufferlist* bl)
+ : extent_map(extent_map), bl(bl) {
+ }
+ };
+
+ typedef boost::variant<Empty,
+ Linear,
+ Vector,
+ Bufferlist,
+ SparseBufferlist> Buffer;
+ struct SetImageExtentsVisitor;
+ struct AssembleResultVisitor;
+
+ Buffer m_buffer;
+ Striper::StripedReadResult m_destriper;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_READ_RESULT_H
+
diff --git a/src/librbd/io/RefreshImageDispatch.cc b/src/librbd/io/RefreshImageDispatch.cc
new file mode 100644
index 000000000..5199f01d8
--- /dev/null
+++ b/src/librbd/io/RefreshImageDispatch.cc
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/RefreshImageDispatch.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::RefreshImageDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+RefreshImageDispatch<I>::RefreshImageDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+}
+
+template <typename I>
+void RefreshImageDispatch<I>::shut_down(Context* on_finish) {
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool RefreshImageDispatch<I>::read(
+ AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_refresh(dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool RefreshImageDispatch<I>::write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_refresh(dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool RefreshImageDispatch<I>::discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_refresh(dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool RefreshImageDispatch<I>::write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_refresh(dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool RefreshImageDispatch<I>::compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (needs_refresh(dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool RefreshImageDispatch<I>::flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+  // The refresh state machine can initiate a flush, and it can
+  // enable the exclusive-lock, which will also attempt to flush.
+ if (flush_source == FLUSH_SOURCE_REFRESH ||
+ flush_source == FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH ||
+ flush_source == FLUSH_SOURCE_SHUTDOWN) {
+ return false;
+ }
+
+ if (needs_refresh(dispatch_result, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool RefreshImageDispatch<I>::needs_refresh(
+ DispatchResult* dispatch_result, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+
+ if (m_image_ctx->state->is_refresh_required()) {
+ ldout(cct, 15) << "on_dispatched=" << on_dispatched << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+ m_image_ctx->state->refresh(on_dispatched);
+ return true;
+ }
+
+ return false;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::RefreshImageDispatch<librbd::ImageCtx>;
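Like every layer in the image dispatcher, the hooks above return true when the layer has taken ownership of the request (here, parking it behind a refresh) and false to let the next layer proceed. A minimal sketch of that chain-of-responsibility contract, reduced to plain callables:

```cpp
// Sketch of the dispatch-layer contract the refresh layer follows: each
// layer either handles the request (returns true and later re-queues it)
// or passes it down (returns false). Heavily simplified.
#include <functional>
#include <iostream>
#include <vector>

using Layer = std::function<bool(int tid)>;

void dispatch(const std::vector<Layer>& layers, int tid) {
  for (auto& layer : layers) {
    if (layer(tid)) {
      return;  // intercepted: the layer now owns the continuation
    }
  }
  std::cout << "tid " << tid << " reached the object layer\n";
}

int main() {
  bool refresh_required = true;
  std::vector<Layer> layers = {
      [&](int tid) {
        if (refresh_required) {
          std::cout << "tid " << tid << " parked for refresh\n";
          refresh_required = false;  // refresh completes, request re-queued
          return true;
        }
        return false;
      },
  };
  dispatch(layers, 1);  // intercepted by the refresh layer
  dispatch(layers, 1);  // passes through after the refresh
}
```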
diff --git a/src/librbd/io/RefreshImageDispatch.h b/src/librbd/io/RefreshImageDispatch.h
new file mode 100644
index 000000000..1bcb3c312
--- /dev/null
+++ b/src/librbd/io/RefreshImageDispatch.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+template <typename ImageCtxT>
+class RefreshImageDispatch : public ImageDispatchInterface {
+public:
+ RefreshImageDispatch(ImageCtxT* image_ctx);
+
+ ImageDispatchLayer get_dispatch_layer() const override {
+ return IMAGE_DISPATCH_LAYER_REFRESH;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+private:
+ ImageCtxT* m_image_ctx;
+
+ bool needs_refresh(DispatchResult* dispatch_result, Context* on_dispatched);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::RefreshImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/SimpleSchedulerObjectDispatch.cc b/src/librbd/io/SimpleSchedulerObjectDispatch.cc
new file mode 100644
index 000000000..6b50c46ad
--- /dev/null
+++ b/src/librbd/io/SimpleSchedulerObjectDispatch.cc
@@ -0,0 +1,564 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/SimpleSchedulerObjectDispatch.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/ceph_time.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/FlushTracker.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/io/Utils.h"
+
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/rolling_count.hpp>
+#include <boost/accumulators/statistics/rolling_sum.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::SimpleSchedulerObjectDispatch: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+using namespace boost::accumulators;
+using ceph::operator<<;
+using librbd::util::data_object_name;
+
+static const int LATENCY_STATS_WINDOW_SIZE = 10;
+
+class LatencyStats {
+private:
+ accumulator_set<uint64_t, stats<tag::rolling_count, tag::rolling_sum>> m_acc;
+
+public:
+ LatencyStats()
+ : m_acc(tag::rolling_window::window_size = LATENCY_STATS_WINDOW_SIZE) {
+ }
+
+ bool is_ready() const {
+ return rolling_count(m_acc) == LATENCY_STATS_WINDOW_SIZE;
+ }
+
+ void add(uint64_t latency) {
+ m_acc(latency);
+ }
+
+  uint64_t avg() const {
+    auto count = rolling_count(m_acc);
+
+    if (count > 0) {
+      return rolling_sum(m_acc) / count;
+    }
+    return 0;
+  }
+};
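LatencyStats wraps a Boost.Accumulators rolling window; once LATENCY_STATS_WINDOW_SIZE samples are present, avg() yields the windowed mean. A standalone usage sketch of the same accumulator set and window size:

```cpp
// Sketch: the rolling-window average that LatencyStats maintains, shown
// standalone. Requires the Boost headers; the window size matches
// LATENCY_STATS_WINDOW_SIZE above.
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/rolling_count.hpp>
#include <boost/accumulators/statistics/rolling_sum.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <cstdint>
#include <iostream>

using namespace boost::accumulators;

int main() {
  accumulator_set<uint64_t, stats<tag::rolling_count, tag::rolling_sum>>
      acc(tag::rolling_window::window_size = 10);
  for (uint64_t latency_ns : {100, 200, 300}) {
    acc(latency_ns);  // feed per-request latencies into the window
  }
  auto count = rolling_count(acc);
  std::cout << "samples=" << count
            << " avg=" << (count ? rolling_sum(acc) / count : 0) << "\n";
  // prints: samples=3 avg=200
}
```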
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::ObjectRequests::try_delay_request(
+ uint64_t object_off, ceph::bufferlist&& data, IOContext io_context,
+ int op_flags, int object_dispatch_flags, Context* on_dispatched) {
+ if (!m_delayed_requests.empty()) {
+ if (!m_io_context || *m_io_context != *io_context ||
+ op_flags != m_op_flags || data.length() == 0 ||
+ intersects(object_off, data.length())) {
+ return false;
+ }
+ } else {
+ m_io_context = io_context;
+ m_op_flags = op_flags;
+ }
+
+ if (data.length() == 0) {
+    // a zero-length write is usually a special case,
+ // and we don't want it to be merged with others
+ ceph_assert(m_delayed_requests.empty());
+ m_delayed_request_extents.insert(0, UINT64_MAX);
+ } else {
+ m_delayed_request_extents.insert(object_off, data.length());
+ }
+ m_object_dispatch_flags |= object_dispatch_flags;
+
+ if (!m_delayed_requests.empty()) {
+ // try to merge front to an existing request
+ auto iter = m_delayed_requests.find(object_off + data.length());
+ if (iter != m_delayed_requests.end()) {
+ auto new_iter = m_delayed_requests.insert({object_off, {}}).first;
+ new_iter->second.data = std::move(data);
+ new_iter->second.data.append(std::move(iter->second.data));
+ new_iter->second.requests = std::move(iter->second.requests);
+ new_iter->second.requests.push_back(on_dispatched);
+ m_delayed_requests.erase(iter);
+
+ if (new_iter != m_delayed_requests.begin()) {
+ auto prev = new_iter;
+ try_merge_delayed_requests(--prev, new_iter);
+ }
+ return true;
+ }
+
+ // try to merge back to an existing request
+ iter = m_delayed_requests.lower_bound(object_off);
+ if (iter == m_delayed_requests.end() || iter->first > object_off) {
+ iter--;
+ }
+ if (iter != m_delayed_requests.end() &&
+ iter->first + iter->second.data.length() == object_off) {
+ iter->second.data.append(std::move(data));
+ iter->second.requests.push_back(on_dispatched);
+
+ auto next = iter;
+ if (++next != m_delayed_requests.end()) {
+ try_merge_delayed_requests(iter, next);
+ }
+ return true;
+ }
+ }
+
+ // create a new request
+ auto iter = m_delayed_requests.insert({object_off, {}}).first;
+ iter->second.data = std::move(data);
+ iter->second.requests.push_back(on_dispatched);
+ return true;
+}
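+
+// Example (illustrative): on an empty batch, a write at 0~512 creates a new
+// entry; a following write at 512~512 takes the back-merge path and leaves a
+// single 0~1024 request carrying both completion contexts. An overlapping
+// write (e.g. 256~512) would instead fail the intersects() check above,
+// forcing the caller to dispatch the pending batch first.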
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::ObjectRequests::try_merge_delayed_requests(
+ typename std::map<uint64_t, MergedRequests>::iterator &iter1,
+ typename std::map<uint64_t, MergedRequests>::iterator &iter2) {
+ if (iter1->first + iter1->second.data.length() != iter2->first) {
+ return;
+ }
+
+ iter1->second.data.append(std::move(iter2->second.data));
+ iter1->second.requests.insert(iter1->second.requests.end(),
+ iter2->second.requests.begin(),
+ iter2->second.requests.end());
+ m_delayed_requests.erase(iter2);
+}
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::ObjectRequests::dispatch_delayed_requests(
+ I *image_ctx, LatencyStats *latency_stats, ceph::mutex *latency_stats_lock) {
+ for (auto &it : m_delayed_requests) {
+ auto offset = it.first;
+ auto &merged_requests = it.second;
+
+ auto ctx = new LambdaContext(
+ [requests=std::move(merged_requests.requests), latency_stats,
+ latency_stats_lock, start_time=ceph_clock_now()](int r) {
+ if (latency_stats) {
+ std::lock_guard locker{*latency_stats_lock};
+ auto latency = ceph_clock_now() - start_time;
+ latency_stats->add(latency.to_nsec());
+ }
+ for (auto on_dispatched : requests) {
+ on_dispatched->complete(r);
+ }
+ });
+
+ auto req = ObjectDispatchSpec::create_write(
+ image_ctx, OBJECT_DISPATCH_LAYER_SCHEDULER,
+ m_object_no, offset, std::move(merged_requests.data), m_io_context,
+ m_op_flags, 0, std::nullopt, 0, {}, ctx);
+
+ req->object_dispatch_flags = m_object_dispatch_flags;
+ req->send();
+ }
+
+ m_dispatch_time = {};
+}
+
+template <typename I>
+SimpleSchedulerObjectDispatch<I>::SimpleSchedulerObjectDispatch(
+ I* image_ctx)
+ : m_image_ctx(image_ctx),
+ m_flush_tracker(new FlushTracker<I>(image_ctx)),
+ m_lock(ceph::make_mutex(librbd::util::unique_lock_name(
+ "librbd::io::SimpleSchedulerObjectDispatch::lock", this))),
+ m_max_delay(image_ctx->config.template get_val<uint64_t>(
+ "rbd_io_scheduler_simple_max_delay")) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+
+ I::get_timer_instance(cct, &m_timer, &m_timer_lock);
+
+ if (m_max_delay == 0) {
+ m_latency_stats = std::make_unique<LatencyStats>();
+ }
+}
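+
+// Note: m_max_delay is sourced from the rbd_io_scheduler_simple_max_delay
+// config option (interpreted as milliseconds in try_delay_write()); a value
+// of 0 selects the adaptive mode, where the batching delay is derived from
+// the rolling average write latency instead of a fixed cap.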
+
+template <typename I>
+SimpleSchedulerObjectDispatch<I>::~SimpleSchedulerObjectDispatch() {
+ delete m_flush_tracker;
+}
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::init() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+  // add ourselves to the IO object dispatcher chain
+ m_image_ctx->io_object_dispatcher->register_dispatch(this);
+}
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::shut_down(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ m_flush_tracker->shut_down();
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::read(
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " << extents
+ << dendl;
+
+ std::lock_guard locker{m_lock};
+ for (auto& extent : *extents) {
+ if (intersects(object_no, extent.offset, extent.length)) {
+ dispatch_delayed_requests(object_no);
+ break;
+ }
+ }
+
+ return false;
+}
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ std::lock_guard locker{m_lock};
+ dispatch_delayed_requests(object_no);
+ register_in_flight_request(object_no, {}, on_finish);
+
+ return false;
+}
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << data.length() << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ // don't try to batch assert version writes
+ if (assert_version.has_value() ||
+ (write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) {
+ dispatch_delayed_requests(object_no);
+ return false;
+ }
+
+ if (try_delay_write(object_no, object_off, std::move(data), io_context,
+ op_flags, *object_dispatch_flags, on_dispatched)) {
+
+ auto dispatch_seq = ++m_dispatch_seq;
+ m_flush_tracker->start_io(dispatch_seq);
+ *on_finish = new LambdaContext(
+ [this, dispatch_seq, ctx=*on_finish](int r) {
+ ctx->complete(r);
+ m_flush_tracker->finish_io(dispatch_seq);
+ });
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ return true;
+ }
+
+ dispatch_delayed_requests(object_no);
+ register_in_flight_request(object_no, ceph_clock_now(), on_finish);
+
+ return false;
+}
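+
+// Summary (illustrative): a successfully delayed write is completed by this
+// layer immediately (DISPATCH_RESULT_COMPLETE) and only reaches the next
+// object dispatch layer when its batch is flushed; a non-batchable write
+// falls through (return false) but is registered in-flight so that
+// subsequent writes to the same object can be delayed behind it.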
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ std::lock_guard locker{m_lock};
+ dispatch_delayed_requests(object_no);
+ register_in_flight_request(object_no, {}, on_finish);
+
+ return false;
+}
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << cmp_data.length() << dendl;
+
+ std::lock_guard locker{m_lock};
+ dispatch_delayed_requests(object_no);
+ register_in_flight_request(object_no, {}, on_finish);
+
+ return false;
+}
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::flush(
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ dispatch_all_delayed_requests();
+ }
+
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+ m_flush_tracker->flush(on_dispatched);
+
+ return true;
+}
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::intersects(
+ uint64_t object_no, uint64_t object_off, uint64_t len) const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ auto cct = m_image_ctx->cct;
+
+ auto it = m_requests.find(object_no);
+ bool intersects = (it != m_requests.end()) &&
+ it->second->intersects(object_off, len);
+
+  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+                 << object_off << "~" << len << ": " << intersects << dendl;
+
+ return intersects;
+}
+
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::try_delay_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int object_dispatch_flags,
+ Context* on_dispatched) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ auto cct = m_image_ctx->cct;
+
+ if (m_latency_stats && !m_latency_stats->is_ready()) {
+ ldout(cct, 20) << "latency stats not collected yet" << dendl;
+ return false;
+ }
+
+ auto it = m_requests.find(object_no);
+ if (it == m_requests.end()) {
+ ldout(cct, 20) << "no pending requests" << dendl;
+ return false;
+ }
+
+ auto &object_requests = it->second;
+ bool delayed = object_requests->try_delay_request(
+ object_off, std::move(data), io_context, op_flags, object_dispatch_flags,
+ on_dispatched);
+
+ ldout(cct, 20) << "delayed: " << delayed << dendl;
+
+ // schedule dispatch on the first request added
+ if (delayed && !object_requests->is_scheduled_dispatch()) {
+ auto dispatch_time = ceph::real_clock::now();
+ if (m_latency_stats) {
+ dispatch_time += std::chrono::nanoseconds(m_latency_stats->avg() / 2);
+ } else {
+ dispatch_time += std::chrono::milliseconds(m_max_delay);
+ }
+ object_requests->set_scheduled_dispatch(dispatch_time);
+ m_dispatch_queue.push_back(object_requests);
+ if (m_dispatch_queue.front() == object_requests) {
+ schedule_dispatch_delayed_requests();
+ }
+ }
+
+ return delayed;
+}
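+
+// Worked example (illustrative): with latency stats ready and an average
+// write latency of 2 ms, the first delayed write to an object is scheduled
+// 1 ms out (avg() / 2 nanoseconds); with rbd_io_scheduler_simple_max_delay
+// set to 5 it would instead be scheduled 5 ms out. Writes merged into the
+// batch later keep the original dispatch time.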
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::dispatch_all_delayed_requests() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ while (!m_requests.empty()) {
+ auto it = m_requests.begin();
+ dispatch_delayed_requests(it->second);
+ m_requests.erase(it);
+ }
+}
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::register_in_flight_request(
+ uint64_t object_no, const utime_t &start_time, Context **on_finish) {
+ auto res = m_requests.insert(
+ {object_no, std::make_shared<ObjectRequests>(object_no)});
+ ceph_assert(res.second);
+ auto it = res.first;
+
+ auto dispatch_seq = ++m_dispatch_seq;
+ m_flush_tracker->start_io(dispatch_seq);
+
+ it->second->set_dispatch_seq(dispatch_seq);
+ *on_finish = new LambdaContext(
+ [this, object_no, dispatch_seq, start_time, ctx=*on_finish](int r) {
+ ctx->complete(r);
+
+ std::unique_lock locker{m_lock};
+ if (m_latency_stats && start_time != utime_t()) {
+ auto latency = ceph_clock_now() - start_time;
+ m_latency_stats->add(latency.to_nsec());
+ }
+
+ auto it = m_requests.find(object_no);
+ if (it == m_requests.end() ||
+ it->second->get_dispatch_seq() != dispatch_seq) {
+ ldout(m_image_ctx->cct, 20) << "already dispatched" << dendl;
+ } else {
+ dispatch_delayed_requests(it->second);
+ m_requests.erase(it);
+ }
+ locker.unlock();
+
+ m_flush_tracker->finish_io(dispatch_seq);
+ });
+}
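+
+// Note: the dispatch_seq comparison in the completion above prevents a stale
+// completion from flushing a newer generation of delayed requests: if this
+// object's entry was already dispatched and re-created, the sequence numbers
+// no longer match and the new batch is left untouched.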
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::dispatch_delayed_requests(
+ uint64_t object_no) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ auto cct = m_image_ctx->cct;
+
+ auto it = m_requests.find(object_no);
+ if (it == m_requests.end()) {
+ ldout(cct, 20) << "object_no=" << object_no << ": not found" << dendl;
+ return;
+ }
+
+ dispatch_delayed_requests(it->second);
+ m_requests.erase(it);
+}
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::dispatch_delayed_requests(
+ ObjectRequestsRef object_requests) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ auto cct = m_image_ctx->cct;
+
+ ldout(cct, 20) << "object_no=" << object_requests->get_object_no() << ", "
+ << object_requests->delayed_requests_size() << " requests, "
+ << "dispatch_time=" << object_requests->get_dispatch_time()
+ << dendl;
+
+ if (!object_requests->is_scheduled_dispatch()) {
+ return;
+ }
+
+ object_requests->dispatch_delayed_requests(m_image_ctx, m_latency_stats.get(),
+ &m_lock);
+
+ ceph_assert(!m_dispatch_queue.empty());
+ if (m_dispatch_queue.front() == object_requests) {
+ m_dispatch_queue.pop_front();
+ schedule_dispatch_delayed_requests();
+ }
+}
+
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::schedule_dispatch_delayed_requests() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ auto cct = m_image_ctx->cct;
+
+ std::lock_guard timer_locker{*m_timer_lock};
+
+ if (m_timer_task != nullptr) {
+ ldout(cct, 20) << "canceling task " << m_timer_task << dendl;
+
+ bool canceled = m_timer->cancel_event(m_timer_task);
+ ceph_assert(canceled);
+ m_timer_task = nullptr;
+ }
+
+ if (m_dispatch_queue.empty()) {
+ ldout(cct, 20) << "nothing to schedule" << dendl;
+ return;
+ }
+
+ auto object_requests = m_dispatch_queue.front().get();
+
+ while (!object_requests->is_scheduled_dispatch()) {
+ ldout(cct, 20) << "garbage collecting " << object_requests << dendl;
+ m_dispatch_queue.pop_front();
+
+ if (m_dispatch_queue.empty()) {
+ ldout(cct, 20) << "nothing to schedule" << dendl;
+ return;
+ }
+ object_requests = m_dispatch_queue.front().get();
+ }
+
+ m_timer_task = new LambdaContext(
+ [this, object_no=object_requests->get_object_no()](int r) {
+ ceph_assert(ceph_mutex_is_locked(*m_timer_lock));
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "running timer task " << m_timer_task << dendl;
+
+ m_timer_task = nullptr;
+ m_image_ctx->asio_engine->post(
+ [this, object_no]() {
+ std::lock_guard locker{m_lock};
+ dispatch_delayed_requests(object_no);
+ });
+ });
+
+ ldout(cct, 20) << "scheduling task " << m_timer_task << " at "
+ << object_requests->get_dispatch_time() << dendl;
+
+ m_timer->add_event_at(object_requests->get_dispatch_time(), m_timer_task);
+}
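+
+// Note: only the front entry of m_dispatch_queue ever has a timer task
+// outstanding; entries whose batch was already dispatched
+// (is_scheduled_dispatch() == false) are lazily garbage collected here
+// rather than being removed from the middle of the queue at dispatch time.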
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::SimpleSchedulerObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/SimpleSchedulerObjectDispatch.h b/src/librbd/io/SimpleSchedulerObjectDispatch.h
new file mode 100644
index 000000000..ca8a57f3a
--- /dev/null
+++ b/src/librbd/io/SimpleSchedulerObjectDispatch.h
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H
+
+#include "common/ceph_mutex.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "librbd/io/TypeTraits.h"
+
+#include <list>
+#include <map>
+#include <memory>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+template <typename> class FlushTracker;
+class LatencyStats;
+
+/**
+ * Simple scheduler plugin for the object dispatcher layer.
+ */
+template <typename ImageCtxT = ImageCtx>
+class SimpleSchedulerObjectDispatch : public ObjectDispatchInterface {
+private:
+ // mock unit testing support
+ typedef ::librbd::io::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::SafeTimer SafeTimer;
+public:
+ static SimpleSchedulerObjectDispatch* create(ImageCtxT* image_ctx) {
+ return new SimpleSchedulerObjectDispatch(image_ctx);
+ }
+
+ SimpleSchedulerObjectDispatch(ImageCtxT* image_ctx);
+ ~SimpleSchedulerObjectDispatch() override;
+
+ ObjectDispatchLayer get_dispatch_layer() const override {
+ return OBJECT_DISPATCH_LAYER_SCHEDULER;
+ }
+
+ void init();
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool flush(
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool list_snaps(
+ uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+ bool reset_existence_cache(Context* on_finish) override {
+ return false;
+ }
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) override {
+ }
+
+ int prepare_copyup(
+ uint64_t object_no,
+ SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override {
+ return 0;
+ }
+
+private:
+ struct MergedRequests {
+ ceph::bufferlist data;
+ std::list<Context *> requests;
+ };
+
+ class ObjectRequests {
+ public:
+ using clock_t = ceph::real_clock;
+
+ ObjectRequests(uint64_t object_no) : m_object_no(object_no) {
+ }
+
+ uint64_t get_object_no() const {
+ return m_object_no;
+ }
+
+ void set_dispatch_seq(uint64_t dispatch_seq) {
+ m_dispatch_seq = dispatch_seq;
+ }
+
+ uint64_t get_dispatch_seq() const {
+ return m_dispatch_seq;
+ }
+
+ clock_t::time_point get_dispatch_time() const {
+ return m_dispatch_time;
+ }
+
+ void set_scheduled_dispatch(const clock_t::time_point &dispatch_time) {
+ m_dispatch_time = dispatch_time;
+ }
+
+ bool is_scheduled_dispatch() const {
+ return !clock_t::is_zero(m_dispatch_time);
+ }
+
+ size_t delayed_requests_size() const {
+ return m_delayed_requests.size();
+ }
+
+ bool intersects(uint64_t object_off, uint64_t len) const {
+ return m_delayed_request_extents.intersects(object_off, len);
+ }
+
+ bool try_delay_request(uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ int object_dispatch_flags, Context* on_dispatched);
+
+ void dispatch_delayed_requests(ImageCtxT *image_ctx,
+ LatencyStats *latency_stats,
+ ceph::mutex *latency_stats_lock);
+
+ private:
+ uint64_t m_object_no;
+ uint64_t m_dispatch_seq = 0;
+ clock_t::time_point m_dispatch_time;
+ IOContext m_io_context;
+ int m_op_flags = 0;
+ int m_object_dispatch_flags = 0;
+ std::map<uint64_t, MergedRequests> m_delayed_requests;
+ interval_set<uint64_t> m_delayed_request_extents;
+
+ void try_merge_delayed_requests(
+ typename std::map<uint64_t, MergedRequests>::iterator &iter,
+ typename std::map<uint64_t, MergedRequests>::iterator &iter2);
+ };
+
+ typedef std::shared_ptr<ObjectRequests> ObjectRequestsRef;
+ typedef std::map<uint64_t, ObjectRequestsRef> Requests;
+
+ ImageCtxT *m_image_ctx;
+
+ FlushTracker<ImageCtxT>* m_flush_tracker;
+
+ ceph::mutex m_lock;
+ SafeTimer *m_timer;
+ ceph::mutex *m_timer_lock;
+ uint64_t m_max_delay;
+ uint64_t m_dispatch_seq = 0;
+
+ Requests m_requests;
+ std::list<ObjectRequestsRef> m_dispatch_queue;
+ Context *m_timer_task = nullptr;
+ std::unique_ptr<LatencyStats> m_latency_stats;
+
+ bool try_delay_write(uint64_t object_no, uint64_t object_off,
+ ceph::bufferlist&& data, IOContext io_context,
+ int op_flags, int object_dispatch_flags,
+ Context* on_dispatched);
+ bool intersects(uint64_t object_no, uint64_t object_off, uint64_t len) const;
+
+ void dispatch_all_delayed_requests();
+ void dispatch_delayed_requests(uint64_t object_no);
+ void dispatch_delayed_requests(ObjectRequestsRef object_requests);
+ void register_in_flight_request(uint64_t object_no, const utime_t &start_time,
+ Context** on_finish);
+
+ void schedule_dispatch_delayed_requests();
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::SimpleSchedulerObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H
diff --git a/src/librbd/io/TypeTraits.h b/src/librbd/io/TypeTraits.h
new file mode 100644
index 000000000..2f3a6b7ef
--- /dev/null
+++ b/src/librbd/io/TypeTraits.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_TYPE_TRAITS_H
+#define CEPH_LIBRBD_IO_TYPE_TRAITS_H
+
+#include "common/Timer.h"
+
+namespace librbd {
+namespace io {
+
+template <typename IoCtxT>
+struct TypeTraits {
+ typedef ::SafeTimer SafeTimer;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_TYPE_TRAITS_H
diff --git a/src/librbd/io/Types.cc b/src/librbd/io/Types.cc
new file mode 100644
index 000000000..223c77283
--- /dev/null
+++ b/src/librbd/io/Types.cc
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/Types.h"
+#include <iostream>
+
+namespace librbd {
+namespace io {
+
+const WriteReadSnapIds INITIAL_WRITE_READ_SNAP_IDS{0, 0};
+
+std::ostream& operator<<(std::ostream& os, SparseExtentState state) {
+ switch (state) {
+ case SPARSE_EXTENT_STATE_DNE:
+ os << "dne";
+ break;
+ case SPARSE_EXTENT_STATE_ZEROED:
+ os << "zeroed";
+ break;
+ case SPARSE_EXTENT_STATE_DATA:
+ os << "data";
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const SparseExtent& se) {
+ os << "["
+ << "state=" << se.state << ", "
+ << "length=" << se.length << "]";
+ return os;
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/Types.h b/src/librbd/io/Types.h
new file mode 100644
index 000000000..8d91c5515
--- /dev/null
+++ b/src/librbd/io/Types.h
@@ -0,0 +1,307 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_TYPES_H
+#define CEPH_LIBRBD_IO_TYPES_H
+
+#include "include/int_types.h"
+#include "include/rados/rados_types.hpp"
+#include "common/interval_map.h"
+#include "osdc/StriperTypes.h"
+#include <iosfwd>
+#include <map>
+#include <vector>
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+typedef enum {
+ AIO_TYPE_NONE = 0,
+ AIO_TYPE_GENERIC,
+ AIO_TYPE_OPEN,
+ AIO_TYPE_CLOSE,
+ AIO_TYPE_READ,
+ AIO_TYPE_WRITE,
+ AIO_TYPE_DISCARD,
+ AIO_TYPE_FLUSH,
+ AIO_TYPE_WRITESAME,
+ AIO_TYPE_COMPARE_AND_WRITE,
+} aio_type_t;
+
+enum FlushSource {
+ FLUSH_SOURCE_USER,
+ FLUSH_SOURCE_INTERNAL,
+ FLUSH_SOURCE_SHUTDOWN,
+ FLUSH_SOURCE_EXCLUSIVE_LOCK,
+ FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH,
+ FLUSH_SOURCE_REFRESH,
+ FLUSH_SOURCE_WRITEBACK,
+ FLUSH_SOURCE_WRITE_BLOCK,
+};
+
+enum Direction {
+ DIRECTION_READ,
+ DIRECTION_WRITE,
+ DIRECTION_BOTH
+};
+
+enum DispatchResult {
+ DISPATCH_RESULT_INVALID,
+ DISPATCH_RESULT_RESTART,
+ DISPATCH_RESULT_CONTINUE,
+ DISPATCH_RESULT_COMPLETE
+};
+
+enum ImageDispatchLayer {
+ IMAGE_DISPATCH_LAYER_NONE = 0,
+ IMAGE_DISPATCH_LAYER_API_START = IMAGE_DISPATCH_LAYER_NONE,
+ IMAGE_DISPATCH_LAYER_QUEUE,
+ IMAGE_DISPATCH_LAYER_QOS,
+ IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK,
+ IMAGE_DISPATCH_LAYER_REFRESH,
+ IMAGE_DISPATCH_LAYER_INTERNAL_START = IMAGE_DISPATCH_LAYER_REFRESH,
+ IMAGE_DISPATCH_LAYER_MIGRATION,
+ IMAGE_DISPATCH_LAYER_JOURNAL,
+ IMAGE_DISPATCH_LAYER_WRITE_BLOCK,
+ IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE,
+ IMAGE_DISPATCH_LAYER_CRYPTO,
+ IMAGE_DISPATCH_LAYER_CORE,
+ IMAGE_DISPATCH_LAYER_LAST
+};
+
+enum {
+ IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE = 1 << 0,
+ IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE = 1 << 1,
+ IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE = 1 << 2,
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE = 1 << 3,
+ IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE = 1 << 4,
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE = 1 << 5,
+ IMAGE_DISPATCH_FLAG_QOS_BPS_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE),
+ IMAGE_DISPATCH_FLAG_QOS_IOPS_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE),
+ IMAGE_DISPATCH_FLAG_QOS_READ_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE),
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE),
+ IMAGE_DISPATCH_FLAG_QOS_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_BPS_MASK |
+ IMAGE_DISPATCH_FLAG_QOS_IOPS_MASK),
+};
+
+enum ImageExtentsMapType {
+ IMAGE_EXTENTS_MAP_TYPE_LOGICAL_TO_PHYSICAL,
+ IMAGE_EXTENTS_MAP_TYPE_PHYSICAL_TO_LOGICAL,
+};
+
+enum ObjectDispatchLayer {
+ OBJECT_DISPATCH_LAYER_NONE = 0,
+ OBJECT_DISPATCH_LAYER_CACHE,
+ OBJECT_DISPATCH_LAYER_CRYPTO,
+ OBJECT_DISPATCH_LAYER_JOURNAL,
+ OBJECT_DISPATCH_LAYER_PARENT_CACHE,
+ OBJECT_DISPATCH_LAYER_SCHEDULER,
+ OBJECT_DISPATCH_LAYER_CORE,
+ OBJECT_DISPATCH_LAYER_LAST
+};
+
+enum {
+ READ_FLAG_DISABLE_READ_FROM_PARENT = 1UL << 0,
+ READ_FLAG_DISABLE_CLIPPING = 1UL << 1,
+};
+
+enum {
+ OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE = 1UL << 0
+};
+
+enum {
+ OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE = 1UL << 0,
+ OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE = 1UL << 1
+};
+
+enum {
+ OBJECT_DISPATCH_FLAG_FLUSH = 1UL << 0,
+ OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR = 1UL << 1
+};
+
+enum {
+ LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT = 1UL << 0,
+ LIST_SNAPS_FLAG_WHOLE_OBJECT = 1UL << 1,
+ LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS = 1UL << 2,
+};
+
+enum SparseExtentState {
+ SPARSE_EXTENT_STATE_DNE, /* does not exist */
+ SPARSE_EXTENT_STATE_ZEROED,
+ SPARSE_EXTENT_STATE_DATA
+};
+
+std::ostream& operator<<(std::ostream& os, SparseExtentState state);
+
+struct SparseExtent {
+ SparseExtentState state;
+ size_t length;
+
+ SparseExtent(SparseExtentState state, size_t length)
+ : state(state), length(length) {
+ }
+
+ operator SparseExtentState() const {
+ return state;
+ }
+
+ bool operator==(const SparseExtent& rhs) const {
+ return state == rhs.state && length == rhs.length;
+ }
+};
+
+std::ostream& operator<<(std::ostream& os, const SparseExtent& se);
+
+struct SparseExtentSplitMerge {
+ SparseExtent split(uint64_t offset, uint64_t length, SparseExtent &se) const {
+ return SparseExtent(se.state, se.length);
+ }
+
+ bool can_merge(const SparseExtent& left, const SparseExtent& right) const {
+ return left.state == right.state;
+ }
+
+ SparseExtent merge(SparseExtent&& left, SparseExtent&& right) const {
+ SparseExtent se(left);
+ se.length += right.length;
+ return se;
+ }
+
+ uint64_t length(const SparseExtent& se) const {
+ return se.length;
+ }
+};
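+
+// Usage sketch (illustrative): the SparseExtents interval_map defined below
+// coalesces adjacent extents in the same state via can_merge()/merge():
+//
+//   SparseExtents extents;
+//   extents.insert(0, 512, {SPARSE_EXTENT_STATE_DATA, 512});
+//   extents.insert(512, 512, {SPARSE_EXTENT_STATE_DATA, 512});
+//   // -> a single [0, 1024) extent in state "data"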
+
+typedef interval_map<uint64_t,
+ SparseExtent,
+ SparseExtentSplitMerge> SparseExtents;
+
+typedef std::vector<uint64_t> SnapIds;
+
+typedef std::pair<librados::snap_t, librados::snap_t> WriteReadSnapIds;
+extern const WriteReadSnapIds INITIAL_WRITE_READ_SNAP_IDS;
+
+typedef std::map<WriteReadSnapIds, SparseExtents> SnapshotDelta;
+
+struct SparseBufferlistExtent : public SparseExtent {
+ ceph::bufferlist bl;
+
+ SparseBufferlistExtent(SparseExtentState state, size_t length)
+ : SparseExtent(state, length) {
+ ceph_assert(state != SPARSE_EXTENT_STATE_DATA);
+ }
+ SparseBufferlistExtent(SparseExtentState state, size_t length,
+ ceph::bufferlist&& bl_)
+ : SparseExtent(state, length), bl(std::move(bl_)) {
+ ceph_assert(state != SPARSE_EXTENT_STATE_DATA || length == bl.length());
+ }
+
+ bool operator==(const SparseBufferlistExtent& rhs) const {
+ return (state == rhs.state &&
+ length == rhs.length &&
+ bl.contents_equal(rhs.bl));
+ }
+};
+
+struct SparseBufferlistExtentSplitMerge {
+ SparseBufferlistExtent split(uint64_t offset, uint64_t length,
+ SparseBufferlistExtent& sbe) const {
+ ceph::bufferlist bl;
+ if (sbe.state == SPARSE_EXTENT_STATE_DATA) {
+      bl.substr_of(sbe.bl, offset, length);
+ }
+ return SparseBufferlistExtent(sbe.state, length, std::move(bl));
+ }
+
+ bool can_merge(const SparseBufferlistExtent& left,
+ const SparseBufferlistExtent& right) const {
+ return left.state == right.state;
+ }
+
+ SparseBufferlistExtent merge(SparseBufferlistExtent&& left,
+ SparseBufferlistExtent&& right) const {
+ if (left.state == SPARSE_EXTENT_STATE_DATA) {
+ ceph::bufferlist bl{std::move(left.bl)};
+ bl.claim_append(std::move(right.bl));
+ return SparseBufferlistExtent(SPARSE_EXTENT_STATE_DATA,
+ bl.length(), std::move(bl));
+ } else {
+ return SparseBufferlistExtent(left.state, left.length + right.length, {});
+ }
+ }
+
+ uint64_t length(const SparseBufferlistExtent& sbe) const {
+ return sbe.length;
+ }
+};
+
+typedef interval_map<uint64_t,
+ SparseBufferlistExtent,
+ SparseBufferlistExtentSplitMerge> SparseBufferlist;
+typedef std::map<uint64_t, SparseBufferlist> SnapshotSparseBufferlist;
+
+using striper::LightweightBufferExtents;
+using striper::LightweightObjectExtent;
+using striper::LightweightObjectExtents;
+
+typedef std::pair<uint64_t,uint64_t> Extent;
+typedef std::vector<Extent> Extents;
+
+struct ReadExtent {
+ const uint64_t offset;
+ const uint64_t length;
+ const LightweightBufferExtents buffer_extents;
+ ceph::bufferlist bl;
+ Extents extent_map;
+
+  ReadExtent(uint64_t offset,
+             uint64_t length) : offset(offset), length(length) {}
+ ReadExtent(uint64_t offset,
+ uint64_t length,
+ const LightweightBufferExtents&& buffer_extents)
+ : offset(offset),
+ length(length),
+ buffer_extents(buffer_extents) {}
+  ReadExtent(uint64_t offset,
+             uint64_t length,
+             const LightweightBufferExtents&& buffer_extents,
+             ceph::bufferlist&& bl,
+             Extents&& extent_map) : offset(offset),
+                                     length(length),
+                                     buffer_extents(buffer_extents),
+                                     bl(std::move(bl)),
+                                     extent_map(std::move(extent_map)) {}
+
+ friend inline std::ostream& operator<<(
+ std::ostream& os,
+ const ReadExtent &extent) {
+ os << "offset=" << extent.offset << ", "
+ << "length=" << extent.length << ", "
+ << "buffer_extents=" << extent.buffer_extents << ", "
+ << "bl.length=" << extent.bl.length() << ", "
+ << "extent_map=" << extent.extent_map;
+ return os;
+ }
+};
+
+typedef std::vector<ReadExtent> ReadExtents;
+
+typedef std::map<uint64_t, uint64_t> ExtentMap;
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_TYPES_H
diff --git a/src/librbd/io/Utils.cc b/src/librbd/io/Utils.cc
new file mode 100644
index 000000000..2ce9dd11f
--- /dev/null
+++ b/src/librbd/io/Utils.cc
@@ -0,0 +1,239 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/Utils.h"
+#include "common/dout.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/neorados/RADOS.hpp"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "osd/osd_types.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::util: " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+namespace util {
+
+void apply_op_flags(uint32_t op_flags, uint32_t flags, neorados::Op* op) {
+ if (op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)
+ op->set_fadvise_random();
+ if (op_flags & LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL)
+ op->set_fadvise_sequential();
+ if (op_flags & LIBRADOS_OP_FLAG_FADVISE_WILLNEED)
+ op->set_fadvise_willneed();
+ if (op_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED)
+ op->set_fadvise_dontneed();
+ if (op_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE)
+ op->set_fadvise_nocache();
+
+ if (flags & librados::OPERATION_BALANCE_READS)
+ op->balance_reads();
+ if (flags & librados::OPERATION_LOCALIZE_READS)
+ op->localize_reads();
+}
+
+bool assemble_write_same_extent(
+ const LightweightObjectExtent &object_extent, const ceph::bufferlist& data,
+ ceph::bufferlist *ws_data, bool force_write) {
+ size_t data_len = data.length();
+
+ if (!force_write) {
+ bool may_writesame = true;
+ for (auto& q : object_extent.buffer_extents) {
+ if (!(q.first % data_len == 0 && q.second % data_len == 0)) {
+ may_writesame = false;
+ break;
+ }
+ }
+
+ if (may_writesame) {
+ ws_data->append(data);
+ return true;
+ }
+ }
+
+ for (auto& q : object_extent.buffer_extents) {
+ bufferlist sub_bl;
+ uint64_t sub_off = q.first % data_len;
+ uint64_t sub_len = data_len - sub_off;
+ uint64_t extent_left = q.second;
+ while (extent_left >= sub_len) {
+ sub_bl.substr_of(data, sub_off, sub_len);
+ ws_data->claim_append(sub_bl);
+ extent_left -= sub_len;
+ if (sub_off) {
+ sub_off = 0;
+ sub_len = data_len;
+ }
+ }
+ if (extent_left) {
+ sub_bl.substr_of(data, sub_off, extent_left);
+ ws_data->claim_append(sub_bl);
+ }
+ }
+ return false;
+}
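+
+// Worked example (illustrative): for a 512-byte pattern and a buffer extent
+// <1024, 2048>, both values are multiples of 512, so the pattern is emitted
+// once and the OSD replicates it across the range (returns true). For an
+// unaligned extent such as <256, 1024>, the fallback loop above unrolls it:
+// pattern bytes 256..511, then one full 512-byte copy, then bytes 0..255
+// (returns false, i.e. a regular write must be issued instead).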
+
+template <typename I>
+void read_parent(I *image_ctx, uint64_t object_no, ReadExtents* extents,
+ librados::snap_t snap_id, const ZTracer::Trace &trace,
+ Context* on_finish) {
+
+ auto cct = image_ctx->cct;
+
+ std::shared_lock image_locker{image_ctx->image_lock};
+
+ // calculate reverse mapping onto the image
+ Extents parent_extents;
+ for (auto& extent: *extents) {
+ extent_to_file(image_ctx, object_no, extent.offset, extent.length,
+ parent_extents);
+ }
+
+ uint64_t parent_overlap = 0;
+ uint64_t object_overlap = 0;
+ int r = image_ctx->get_parent_overlap(snap_id, &parent_overlap);
+ if (r == 0) {
+ object_overlap = image_ctx->prune_parent_extents(parent_extents,
+ parent_overlap);
+ }
+
+ if (object_overlap == 0) {
+ image_locker.unlock();
+
+ on_finish->complete(-ENOENT);
+ return;
+ }
+
+ ldout(cct, 20) << dendl;
+
+ ceph::bufferlist* parent_read_bl;
+ if (extents->size() > 1) {
+ auto parent_comp = new ReadResult::C_ObjectReadMergedExtents(
+ cct, extents, on_finish);
+ parent_read_bl = &parent_comp->bl;
+ on_finish = parent_comp;
+ } else {
+ parent_read_bl = &extents->front().bl;
+ }
+
+ auto comp = AioCompletion::create_and_start(on_finish, image_ctx->parent,
+ AIO_TYPE_READ);
+ ldout(cct, 20) << "completion " << comp << ", extents " << parent_extents
+ << dendl;
+ auto req = io::ImageDispatchSpec::create_read(
+ *image_ctx->parent, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, comp,
+ std::move(parent_extents), ReadResult{parent_read_bl},
+ image_ctx->parent->get_data_io_context(), 0, 0, trace);
+ req->send();
+}
+
+template <typename I>
+int clip_request(I *image_ctx, Extents *image_extents) {
+ std::shared_lock image_locker{image_ctx->image_lock};
+ for (auto &image_extent : *image_extents) {
+ auto clip_len = image_extent.second;
+ int r = clip_io(librbd::util::get_image_ctx(image_ctx),
+ image_extent.first, &clip_len);
+ if (r < 0) {
+ return r;
+ }
+
+ image_extent.second = clip_len;
+ }
+ return 0;
+}
+
+void unsparsify(CephContext* cct, ceph::bufferlist* bl,
+ const Extents& extent_map, uint64_t bl_off,
+ uint64_t out_bl_len) {
+ Striper::StripedReadResult destriper;
+ bufferlist out_bl;
+
+ destriper.add_partial_sparse_result(cct, std::move(*bl), extent_map, bl_off,
+ {{0, out_bl_len}});
+ destriper.assemble_result(cct, out_bl, true);
+ *bl = out_bl;
+}
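+
+// Example (illustrative): a sparse read of a 4096-byte object that returned
+// extent_map {{0, 512}, {3584, 512}} with a 1024-byte bl is expanded into a
+// dense 4096-byte bl (out_bl_len), zero-filled between the data ranges.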
+
+template <typename I>
+bool trigger_copyup(I* image_ctx, uint64_t object_no, IOContext io_context,
+ Context* on_finish) {
+ bufferlist bl;
+ auto req = new ObjectWriteRequest<I>(
+ image_ctx, object_no, 0, std::move(bl), io_context, 0, 0,
+ std::nullopt, {}, on_finish);
+ if (!req->has_parent()) {
+ delete req;
+ return false;
+ }
+
+ req->send();
+ return true;
+}
+
+template <typename I>
+void file_to_extents(I* image_ctx, uint64_t offset, uint64_t length,
+ uint64_t buffer_offset,
+ striper::LightweightObjectExtents* object_extents) {
+ Extents extents = {{offset, length}};
+ image_ctx->io_image_dispatcher->remap_extents(
+ extents, IMAGE_EXTENTS_MAP_TYPE_LOGICAL_TO_PHYSICAL);
+ for (auto [off, len] : extents) {
+ Striper::file_to_extents(image_ctx->cct, &image_ctx->layout, off, len, 0,
+ buffer_offset, object_extents);
+ }
+}
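+
+// Example (illustrative): with the default layout (stripe_unit == object
+// size, stripe_count == 1) and 4 MiB objects, a 1 MiB image extent at offset
+// 5 MiB maps to a single object extent <object_no=1, offset=1 MiB,
+// length=1 MiB>. The remap_extents() hook lets layers such as crypto shift
+// logical offsets before striping is applied.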
+
+template <typename I>
+void extent_to_file(I* image_ctx, uint64_t object_no, uint64_t offset,
+ uint64_t length,
+ std::vector<std::pair<uint64_t, uint64_t> >& extents) {
+ Striper::extent_to_file(image_ctx->cct, &image_ctx->layout, object_no,
+ offset, length, extents);
+ image_ctx->io_image_dispatcher->remap_extents(
+ extents, IMAGE_EXTENTS_MAP_TYPE_PHYSICAL_TO_LOGICAL);
+}
+
+template <typename I>
+uint64_t get_file_offset(I* image_ctx, uint64_t object_no, uint64_t offset) {
+ auto off = Striper::get_file_offset(image_ctx->cct, &image_ctx->layout,
+ object_no, offset);
+ Extents extents = {{off, 0}};
+ image_ctx->io_image_dispatcher->remap_extents(
+ extents, IMAGE_EXTENTS_MAP_TYPE_PHYSICAL_TO_LOGICAL);
+ return extents[0].first;
+}
+
+} // namespace util
+} // namespace io
+} // namespace librbd
+
+template void librbd::io::util::read_parent(
+ librbd::ImageCtx *image_ctx, uint64_t object_no, ReadExtents* extents,
+ librados::snap_t snap_id, const ZTracer::Trace &trace, Context* on_finish);
+template int librbd::io::util::clip_request(
+ librbd::ImageCtx *image_ctx, Extents *image_extents);
+template bool librbd::io::util::trigger_copyup(
+ librbd::ImageCtx *image_ctx, uint64_t object_no, IOContext io_context,
+ Context* on_finish);
+template void librbd::io::util::file_to_extents(
+ librbd::ImageCtx *image_ctx, uint64_t offset, uint64_t length,
+ uint64_t buffer_offset,
+ striper::LightweightObjectExtents* object_extents);
+template void librbd::io::util::extent_to_file(
+ librbd::ImageCtx *image_ctx, uint64_t object_no, uint64_t offset,
+ uint64_t length,
+ std::vector<std::pair<uint64_t, uint64_t> >& extents);
+template uint64_t librbd::io::util::get_file_offset(
+ librbd::ImageCtx *image_ctx, uint64_t object_no, uint64_t offset);
+
diff --git a/src/librbd/io/Utils.h b/src/librbd/io/Utils.h
new file mode 100644
index 000000000..9f7e0b946
--- /dev/null
+++ b/src/librbd/io/Utils.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_UTILS_H
+#define CEPH_LIBRBD_IO_UTILS_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/rados/rados_types.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+class ObjectExtent;
+
+namespace neorados { struct Op; }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+namespace util {
+
+void apply_op_flags(uint32_t op_flags, uint32_t flags, neorados::Op* op);
+
+bool assemble_write_same_extent(const LightweightObjectExtent &object_extent,
+ const ceph::bufferlist& data,
+ ceph::bufferlist *ws_data,
+ bool force_write);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+void read_parent(ImageCtxT *image_ctx, uint64_t object_no,
+ ReadExtents* extents, librados::snap_t snap_id,
+ const ZTracer::Trace &trace, Context* on_finish);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+int clip_request(ImageCtxT *image_ctx, Extents *image_extents);
+
+inline uint64_t get_extents_length(const Extents &extents) {
+ uint64_t total_bytes = 0;
+ for (auto [_, extent_length] : extents) {
+ total_bytes += extent_length;
+ }
+ return total_bytes;
+}
+
+void unsparsify(CephContext* cct, ceph::bufferlist* bl,
+ const Extents& extent_map, uint64_t bl_off,
+ uint64_t out_bl_len);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+bool trigger_copyup(ImageCtxT *image_ctx, uint64_t object_no,
+ IOContext io_context, Context* on_finish);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+void file_to_extents(ImageCtxT *image_ctx, uint64_t offset, uint64_t length,
+ uint64_t buffer_offset,
+ striper::LightweightObjectExtents* object_extents);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+void extent_to_file(ImageCtxT *image_ctx, uint64_t object_no, uint64_t offset,
+ uint64_t length,
+ std::vector<std::pair<uint64_t, uint64_t> >& extents);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+uint64_t get_file_offset(ImageCtxT *image_ctx, uint64_t object_no,
+ uint64_t offset);
+
+inline ObjectDispatchLayer get_previous_layer(ObjectDispatchLayer layer) {
+ return (ObjectDispatchLayer)(((int)layer) - 1);
+}
+
+} // namespace util
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_UTILS_H
diff --git a/src/librbd/io/WriteBlockImageDispatch.cc b/src/librbd/io/WriteBlockImageDispatch.cc
new file mode 100644
index 000000000..4439a15a7
--- /dev/null
+++ b/src/librbd/io/WriteBlockImageDispatch.cc
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/WriteBlockImageDispatch.h"
+#include "common/dout.h"
+#include "common/Cond.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::WriteBlockImageDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+struct WriteBlockImageDispatch<I>::C_BlockedWrites : public Context {
+ WriteBlockImageDispatch *dispatch;
+ explicit C_BlockedWrites(WriteBlockImageDispatch *dispatch)
+ : dispatch(dispatch) {
+ }
+
+ void finish(int r) override {
+ dispatch->handle_blocked_writes(r);
+ }
+};
+
+template <typename I>
+WriteBlockImageDispatch<I>::WriteBlockImageDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx),
+ m_lock(ceph::make_shared_mutex(
+ util::unique_lock_name("librbd::io::WriteBlockImageDispatch::m_lock",
+ this))) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+}
+
+template <typename I>
+void WriteBlockImageDispatch<I>::shut_down(Context* on_finish) {
+ on_finish->complete(0);
+}
+
+template <typename I>
+int WriteBlockImageDispatch<I>::block_writes() {
+ C_SaferCond cond_ctx;
+ block_writes(&cond_ctx);
+ return cond_ctx.wait();
+}
+
+template <typename I>
+void WriteBlockImageDispatch<I>::block_writes(Context *on_blocked) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock));
+ auto cct = m_image_ctx->cct;
+
+ // ensure owner lock is not held after block_writes completes
+ on_blocked = util::create_async_context_callback(
+ *m_image_ctx, on_blocked);
+
+ {
+ std::unique_lock locker{m_lock};
+ ++m_write_blockers;
+ ldout(cct, 5) << m_image_ctx << ", "
+ << "num=" << m_write_blockers << dendl;
+ if (!m_write_blocker_contexts.empty() || m_in_flight_writes > 0) {
+ ldout(cct, 5) << "waiting for in-flight writes to complete: "
+ << "in_flight_writes=" << m_in_flight_writes << dendl;
+ m_write_blocker_contexts.push_back(on_blocked);
+ return;
+ }
+ }
+
+ flush_io(on_blocked);
+}
+
+template <typename I>
+void WriteBlockImageDispatch<I>::unblock_writes() {
+ auto cct = m_image_ctx->cct;
+
+ Contexts waiter_contexts;
+ Contexts dispatch_contexts;
+ {
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_write_blockers > 0);
+ --m_write_blockers;
+
+ ldout(cct, 5) << m_image_ctx << ", "
+ << "num=" << m_write_blockers << dendl;
+ if (m_write_blockers == 0) {
+ std::swap(waiter_contexts, m_unblocked_write_waiter_contexts);
+ std::swap(dispatch_contexts, m_on_dispatches);
+ }
+ }
+
+ for (auto ctx : waiter_contexts) {
+ ctx->complete(0);
+ }
+
+ for (auto ctx : dispatch_contexts) {
+ ctx->complete(0);
+ }
+}
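+
+// Usage sketch (illustrative): operations that need writes quiesced pair the
+// synchronous block_writes() wrapper with unblock_writes(); the owner lock
+// must be held when blocking, as asserted above:
+//
+//   int r = dispatch->block_writes();  // waits for in-flight writes
+//   if (r == 0) {
+//     // ... writes are blocked here ...
+//     dispatch->unblock_writes();
+//   }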
+
+template <typename I>
+void WriteBlockImageDispatch<I>::wait_on_writes_unblocked(
+ Context *on_unblocked) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock));
+ auto cct = m_image_ctx->cct;
+
+ {
+ std::unique_lock locker{m_lock};
+ ldout(cct, 20) << m_image_ctx << ", "
+ << "write_blockers=" << m_write_blockers << dendl;
+ if (!m_unblocked_write_waiter_contexts.empty() || m_write_blockers > 0) {
+ m_unblocked_write_waiter_contexts.push_back(on_unblocked);
+ return;
+ }
+ }
+
+ on_unblocked->complete(0);
+}
+
+template <typename I>
+bool WriteBlockImageDispatch<I>::write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool WriteBlockImageDispatch<I>::discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool WriteBlockImageDispatch<I>::write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool WriteBlockImageDispatch<I>::compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool WriteBlockImageDispatch<I>::flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ if (flush_source != FLUSH_SOURCE_USER) {
+ return false;
+ }
+
+ return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+void WriteBlockImageDispatch<I>::handle_finished(int r, uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", tid=" << tid << dendl;
+
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_in_flight_writes > 0);
+ --m_in_flight_writes;
+
+ bool writes_blocked = false;
+ if (m_write_blockers > 0 && m_in_flight_writes == 0) {
+ ldout(cct, 10) << "flushing all in-flight IO for blocked writes" << dendl;
+ writes_blocked = true;
+ }
+ locker.unlock();
+
+ if (writes_blocked) {
+ flush_io(new C_BlockedWrites(this));
+ }
+}
+
+template <typename I>
+bool WriteBlockImageDispatch<I>::process_io(
+ uint64_t tid, DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ std::unique_lock locker{m_lock};
+ if (m_write_blockers > 0 || !m_on_dispatches.empty()) {
+ *dispatch_result = DISPATCH_RESULT_RESTART;
+ m_on_dispatches.push_back(on_dispatched);
+ return true;
+ }
+
+ ++m_in_flight_writes;
+ *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) {
+ handle_finished(r, tid);
+ on_finish->complete(r);
+ });
+ return false;
+}
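+
+// Note: DISPATCH_RESULT_RESTART parks the request; when unblock_writes()
+// completes the queued context, the request is re-sent through the image
+// dispatch chain rather than resumed mid-layer, so it is re-evaluated
+// against the current blocker state.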
+
+template <typename I>
+void WriteBlockImageDispatch<I>::flush_io(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ // ensure that all in-flight IO is flushed
+ auto aio_comp = AioCompletion::create_and_start(
+ on_finish, util::get_image_ctx(m_image_ctx), librbd::io::AIO_TYPE_FLUSH);
+ auto req = ImageDispatchSpec::create_flush(
+ *m_image_ctx, IMAGE_DISPATCH_LAYER_WRITE_BLOCK, aio_comp,
+ FLUSH_SOURCE_WRITE_BLOCK, {});
+ req->send();
+}
+
+template <typename I>
+void WriteBlockImageDispatch<I>::handle_blocked_writes(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ Contexts write_blocker_contexts;
+ {
+ std::unique_lock locker{m_lock};
+ std::swap(write_blocker_contexts, m_write_blocker_contexts);
+ }
+
+ for (auto ctx : write_blocker_contexts) {
+ ctx->complete(0);
+ }
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::WriteBlockImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/WriteBlockImageDispatch.h b/src/librbd/io/WriteBlockImageDispatch.h
new file mode 100644
index 000000000..9d200fb97
--- /dev/null
+++ b/src/librbd/io/WriteBlockImageDispatch.h
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/ceph_mutex.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include <list>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+template <typename ImageCtxT>
+class WriteBlockImageDispatch : public ImageDispatchInterface {
+public:
+ WriteBlockImageDispatch(ImageCtxT* image_ctx);
+
+ ImageDispatchLayer get_dispatch_layer() const override {
+ return IMAGE_DISPATCH_LAYER_WRITE_BLOCK;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ int block_writes();
+ void block_writes(Context *on_blocked);
+ void unblock_writes();
+
+ inline bool writes_blocked() const {
+ std::shared_lock locker{m_lock};
+ return (m_write_blockers > 0);
+ }
+
+ void wait_on_writes_unblocked(Context *on_unblocked);
+
+ bool read(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+ bool write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, IOContext io_context,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset, IOContext io_context,
+ int op_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+private:
+ struct C_BlockedWrites;
+
+ typedef std::list<Context*> Contexts;
+
+ ImageCtxT* m_image_ctx;
+
+ mutable ceph::shared_mutex m_lock;
+ Contexts m_on_dispatches;
+
+ uint32_t m_write_blockers = 0;
+ Contexts m_write_blocker_contexts;
+ Contexts m_unblocked_write_waiter_contexts;
+ uint64_t m_in_flight_writes = 0;
+
+ void handle_finished(int r, uint64_t tid);
+
+ bool process_io(uint64_t tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched);
+ void flush_io(Context* on_finish);
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+ void handle_blocked_writes(int r);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::WriteBlockImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H
diff --git a/src/librbd/journal/CreateRequest.cc b/src/librbd/journal/CreateRequest.cc
new file mode 100644
index 000000000..4f7a0f5be
--- /dev/null
+++ b/src/librbd/journal/CreateRequest.cc
@@ -0,0 +1,234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "librbd/Utils.h"
+#include "common/Timer.h"
+#include "journal/Settings.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Journal::CreateRequest: "
+
+namespace librbd {
+
+using util::create_context_callback;
+
+namespace journal {
+
+template<typename I>
+CreateRequest<I>::CreateRequest(IoCtx &ioctx, const std::string &imageid,
+ uint8_t order, uint8_t splay_width,
+ const std::string &object_pool,
+ uint64_t tag_class, TagData &tag_data,
+ const std::string &client_id,
+ ContextWQ *op_work_queue,
+ Context *on_finish)
+ : m_ioctx(ioctx), m_image_id(imageid), m_order(order),
+ m_splay_width(splay_width), m_object_pool(object_pool),
+ m_tag_class(tag_class), m_tag_data(tag_data), m_image_client_id(client_id),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+ m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+}
+
+template<typename I>
+void CreateRequest<I>::send() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
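+  // "order" is the power-of-two size of each journal object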
+ if (m_order > 64 || m_order < 12) {
+ lderr(m_cct) << "order must be in the range [12, 64]" << dendl;
+ complete(-EDOM);
+ return;
+ }
+ if (m_splay_width == 0) {
+ complete(-EINVAL);
+ return;
+ }
+
+ get_pool_id();
+}
+
+template<typename I>
+void CreateRequest<I>::get_pool_id() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
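+  // an empty object pool name means journal data lives in the image's pool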
+ if (m_object_pool.empty()) {
+ create_journal();
+ return;
+ }
+
+ librados::Rados rados(m_ioctx);
+ IoCtx data_ioctx;
+ int r = rados.ioctx_create(m_object_pool.c_str(), data_ioctx);
+ if (r != 0) {
+ lderr(m_cct) << "failed to create journal: "
+ << "error opening journal object pool '" << m_object_pool
+ << "': " << cpp_strerror(r) << dendl;
+ complete(r);
+ return;
+ }
+ data_ioctx.set_namespace(m_ioctx.get_namespace());
+
+ m_pool_id = data_ioctx.get_id();
+ create_journal();
+}
+
+template<typename I>
+void CreateRequest<I>::create_journal() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ ImageCtx::get_timer_instance(m_cct, &m_timer, &m_timer_lock);
+ m_journaler = new Journaler(m_op_work_queue, m_timer, m_timer_lock, m_ioctx,
+ m_image_id, m_image_client_id, {}, nullptr);
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_create_journal>(this);
+
+ m_journaler->create(m_order, m_splay_width, m_pool_id, ctx);
+}
+
+template<typename I>
+Context *CreateRequest<I>::handle_create_journal(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "failed to create journal: " << cpp_strerror(*result) << dendl;
+ shut_down_journaler(*result);
+ return nullptr;
+ }
+
+ allocate_journal_tag();
+ return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::allocate_journal_tag() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_journal_tag>(this);
+
+ encode(m_tag_data, m_bl);
+ m_journaler->allocate_tag(m_tag_class, m_bl, &m_tag, ctx);
+}
+
+template<typename I>
+Context *CreateRequest<I>::handle_journal_tag(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "failed to allocate tag: " << cpp_strerror(*result) << dendl;
+ shut_down_journaler(*result);
+ return nullptr;
+ }
+
+ register_client();
+ return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::register_client() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ m_bl.clear();
+ encode(ClientData{ImageClientMeta{m_tag.tag_class}}, m_bl);
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_register_client>(this);
+
+ m_journaler->register_client(m_bl, ctx);
+}
+
+template<typename I>
+Context *CreateRequest<I>::handle_register_client(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "failed to register client: " << cpp_strerror(*result) << dendl;
+ }
+
+ shut_down_journaler(*result);
+ return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::shut_down_journaler(int r) {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ m_r_saved = r;
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_journaler_shutdown>(this);
+
+ m_journaler->shut_down(ctx);
+}
+
+template<typename I>
+Context *CreateRequest<I>::handle_journaler_shutdown(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(*result) << dendl;
+ }
+
+ delete m_journaler;
+
+  if (m_r_saved == 0) {
+ complete(0);
+ return nullptr;
+ }
+
+  // there was an error during journal creation, so we roll back
+  // whatever was done. the easiest way to do this is to invoke the
+  // journal remove state machine -- not the cleanest approach in
+  // terms of redundancy, but acceptable on the failure path.
+ remove_journal();
+ return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::remove_journal() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ using klass = CreateRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_journal>(this);
+
+ RemoveRequest<I> *req = RemoveRequest<I>::create(
+ m_ioctx, m_image_id, m_image_client_id, m_op_work_queue, ctx);
+ req->send();
+}
+
+template<typename I>
+Context *CreateRequest<I>::handle_remove_journal(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "error cleaning up journal after creation failed: "
+ << cpp_strerror(*result) << dendl;
+ }
+
+ complete(m_r_saved);
+ return nullptr;
+}
+
+template<typename I>
+void CreateRequest<I>::complete(int r) {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ if (r == 0) {
+ ldout(m_cct, 20) << "done." << dendl;
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::CreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/CreateRequest.h b/src/librbd/journal/CreateRequest.h
new file mode 100644
index 000000000..6fab409c4
--- /dev/null
+++ b/src/librbd/journal/CreateRequest.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include "librbd/ImageCtx.h"
+#include "journal/Journaler.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include "cls/journal/cls_journal_types.h"
+
+using librados::IoCtx;
+using journal::Journaler;
+
+class Context;
+class ContextWQ;
+
+namespace journal {
+ class Journaler;
+}
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace journal {
+
+template<typename ImageCtxT = ImageCtx>
+class CreateRequest {
+public:
+ static CreateRequest *create(IoCtx &ioctx, const std::string &imageid,
+ uint8_t order, uint8_t splay_width,
+ const std::string &object_pool,
+ uint64_t tag_class, TagData &tag_data,
+ const std::string &client_id,
+ ContextWQ *op_work_queue, Context *on_finish) {
+ return new CreateRequest(ioctx, imageid, order, splay_width, object_pool,
+ tag_class, tag_data, client_id, op_work_queue,
+ on_finish);
+ }
+
+ void send();
+
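+  // Typical usage (a sketch -- the argument values below are illustrative;
+  // an order of 24 yields 16MiB journal objects and must lie in [12, 64]):
+  //
+  //   auto *req = CreateRequest<>::create(
+  //     io_ctx, image_id, 24, 4, "", tag_class, tag_data,
+  //     Journal<>::IMAGE_CLIENT_ID, op_work_queue, on_finish);
+  //   req->send();   // the request deletes itself upon completion
+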
+private:
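+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_POOL_ID
+   *    |
+   *    v
+   * CREATE_JOURNAL * * * * *
+   *    |                   *
+   *    v                   * (error)
+   * ALLOCATE_TAG * * * * * *
+   *    |                   *
+   *    v                   *
+   * REGISTER_CLIENT        *
+   *    |                   *
+   *    v                   v
+   * SHUT_DOWN_JOURNALER <* *
+   *    |          \
+   *    |           \ (error)
+   *    |            v
+   *    |       REMOVE_JOURNAL
+   *    |            |
+   *    v            |
+   * <finish> <------/
+   *
+   * @endverbatim
+   */
+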
+ typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+
+ CreateRequest(IoCtx &ioctx, const std::string &imageid, uint8_t order,
+ uint8_t splay_width, const std::string &object_pool,
+ uint64_t tag_class, TagData &tag_data,
+ const std::string &client_id, ContextWQ *op_work_queue,
+ Context *on_finish);
+
+ IoCtx &m_ioctx;
+ std::string m_image_id;
+ uint8_t m_order;
+ uint8_t m_splay_width;
+ std::string m_object_pool;
+ uint64_t m_tag_class;
+ TagData m_tag_data;
+ std::string m_image_client_id;
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ cls::journal::Tag m_tag;
+ bufferlist m_bl;
+ Journaler *m_journaler;
+ SafeTimer *m_timer;
+ ceph::mutex *m_timer_lock;
+  int m_r_saved = 0;
+
+ int64_t m_pool_id = -1;
+
+ void get_pool_id();
+
+ void create_journal();
+ Context *handle_create_journal(int *result);
+
+ void allocate_journal_tag();
+ Context *handle_journal_tag(int *result);
+
+ void register_client();
+ Context *handle_register_client(int *result);
+
+ void shut_down_journaler(int r);
+ Context *handle_journaler_shutdown(int *result);
+
+ void remove_journal();
+ Context *handle_remove_journal(int *result);
+
+ void complete(int r);
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::CreateRequest<librbd::ImageCtx>;
+
+#endif /* CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H */
diff --git a/src/librbd/journal/DemoteRequest.cc b/src/librbd/journal/DemoteRequest.cc
new file mode 100644
index 000000000..564391978
--- /dev/null
+++ b/src/librbd/journal/DemoteRequest.cc
@@ -0,0 +1,255 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/DemoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/OpenRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::DemoteRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+DemoteRequest<I>::DemoteRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish),
+ m_lock(ceph::make_mutex("DemoteRequest::m_lock")) {
+}
+
+template <typename I>
+DemoteRequest<I>::~DemoteRequest() {
+ ceph_assert(m_journaler == nullptr);
+}
+
+template <typename I>
+void DemoteRequest<I>::send() {
+ open_journaler();
+}
+
+template <typename I>
+void DemoteRequest<I>::open_journaler() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ m_journaler = new Journaler(m_image_ctx.md_ctx, m_image_ctx.id,
+ Journal<>::IMAGE_CLIENT_ID, {}, nullptr);
+ auto ctx = create_async_context_callback(
+ m_image_ctx, create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_open_journaler>(this));
+ auto req = OpenRequest<I>::create(&m_image_ctx, m_journaler, &m_lock,
+ &m_client_meta, &m_tag_tid, &m_tag_data,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_open_journaler(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl;
+ shut_down_journaler();
+ return;
+ } else if (m_tag_data.mirror_uuid != Journal<>::LOCAL_MIRROR_UUID) {
+ m_ret_val = -EINVAL;
+ lderr(cct) << "image is not currently the primary" << dendl;
+ shut_down_journaler();
+ return;
+ }
+
+ allocate_tag();
+}
+
+template <typename I>
+void DemoteRequest<I>::allocate_tag() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ cls::journal::Client client;
+ int r = m_journaler->get_cached_client(Journal<>::IMAGE_CLIENT_ID, &client);
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+ shut_down_journaler();
+ return;
+ }
+
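+  // record the client's most recently committed position as the new
+  // orphan tag's predecessor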
+ TagPredecessor predecessor;
+ predecessor.mirror_uuid = Journal<>::LOCAL_MIRROR_UUID;
+ if (!client.commit_position.object_positions.empty()) {
+ auto position = client.commit_position.object_positions.front();
+ predecessor.commit_valid = true;
+ predecessor.tag_tid = position.tag_tid;
+ predecessor.entry_tid = position.entry_tid;
+ }
+
+ TagData tag_data;
+ tag_data.mirror_uuid = Journal<>::ORPHAN_MIRROR_UUID;
+ tag_data.predecessor = std::move(predecessor);
+
+ bufferlist tag_bl;
+ encode(tag_data, tag_bl);
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_allocate_tag>(this);
+ m_journaler->allocate_tag(m_client_meta.tag_class, tag_bl, &m_tag, ctx);
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_allocate_tag(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl;
+ shut_down_journaler();
+ return;
+ }
+
+ m_tag_tid = m_tag.tid;
+ append_event();
+}
+
+template <typename I>
+void DemoteRequest<I>::append_event() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ EventEntry event_entry{DemotePromoteEvent{}, {}};
+ bufferlist event_entry_bl;
+ encode(event_entry, event_entry_bl);
+
+ m_journaler->start_append(0);
+ m_future = m_journaler->append(m_tag_tid, event_entry_bl);
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_append_event>(this);
+ m_future.flush(ctx);
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_append_event(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to append demotion journal event: " << cpp_strerror(r)
+ << dendl;
+ stop_append();
+ return;
+ }
+
+ commit_event();
+}
+
+template <typename I>
+void DemoteRequest<I>::commit_event() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ m_journaler->committed(m_future);
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_commit_event>(this);
+ m_journaler->flush_commit_position(ctx);
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_commit_event(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to flush demotion commit position: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ stop_append();
+}
+
+template <typename I>
+void DemoteRequest<I>::stop_append() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_stop_append>(this);
+ m_journaler->stop_append(ctx);
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_stop_append(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl;
+ }
+
+ shut_down_journaler();
+}
+
+template <typename I>
+void DemoteRequest<I>::shut_down_journaler() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ Context *ctx = create_async_context_callback(
+ m_image_ctx, create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_shut_down_journaler>(this));
+ m_journaler->shut_down(ctx);
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_shut_down_journaler(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to shut down journal: " << cpp_strerror(r) << dendl;
+ }
+
+ delete m_journaler;
+ m_journaler = nullptr;
+ finish(r);
+}
+
+template <typename I>
+void DemoteRequest<I>::finish(int r) {
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::DemoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/DemoteRequest.h b/src/librbd/journal/DemoteRequest.h
new file mode 100644
index 000000000..6aba6cc8f
--- /dev/null
+++ b/src/librbd/journal/DemoteRequest.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H
+
+#include "common/ceph_mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/Future.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class DemoteRequest {
+public:
+ static DemoteRequest *create(ImageCtxT &image_ctx, Context *on_finish) {
+ return new DemoteRequest(image_ctx, on_finish);
+ }
+
+ DemoteRequest(ImageCtxT &image_ctx, Context *on_finish);
+ ~DemoteRequest();
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_JOURNALER * * * * *
+ * | *
+ * v *
+ * ALLOCATE_TAG * * * * * *
+ * | *
+ * v *
+ * APPEND_EVENT * * * *
+ * | * *
+ * v * *
+ * COMMIT_EVENT * *
+ * | * *
+ * v * *
+ * STOP_APPEND <* * * *
+ * | *
+ * v *
+ * SHUT_DOWN_JOURNALER <* *
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+ typedef typename TypeTraits<ImageCtxT>::Future Future;
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ Journaler *m_journaler = nullptr;
+ int m_ret_val = 0;
+
+ ceph::mutex m_lock;
+ ImageClientMeta m_client_meta;
+ uint64_t m_tag_tid = 0;
+ TagData m_tag_data;
+
+ cls::journal::Tag m_tag;
+ Future m_future;
+
+ void open_journaler();
+ void handle_open_journaler(int r);
+
+ void allocate_tag();
+ void handle_allocate_tag(int r);
+
+ void append_event();
+ void handle_append_event(int r);
+
+ void commit_event();
+ void handle_commit_event(int r);
+
+ void stop_append();
+ void handle_stop_append(int r);
+
+ void shut_down_journaler();
+ void handle_shut_down_journaler(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::DemoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H
diff --git a/src/librbd/journal/DisabledPolicy.h b/src/librbd/journal/DisabledPolicy.h
new file mode 100644
index 000000000..27d69a50d
--- /dev/null
+++ b/src/librbd/journal/DisabledPolicy.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H
+#define CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H
+
+#include "librbd/journal/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+class DisabledPolicy : public Policy {
+public:
+ bool append_disabled() const override {
+ return true;
+ }
+ bool journal_disabled() const override {
+ return true;
+ }
+ void allocate_tag_on_lock(Context *on_finish) override {
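+    // never reached -- tags are only allocated when journaling is enabled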
+ ceph_abort();
+ }
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H
diff --git a/src/librbd/journal/ObjectDispatch.cc b/src/librbd/journal/ObjectDispatch.cc
new file mode 100644
index 000000000..5623c635d
--- /dev/null
+++ b/src/librbd/journal/ObjectDispatch.cc
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/ObjectDispatch.h"
+#include "common/dout.h"
+#include "osdc/Striper.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::ObjectDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using librbd::util::data_object_name;
+using util::create_context_callback;
+
+namespace {
+
+template <typename I>
+struct C_CommitIOEvent : public Context {
+ I* image_ctx;
+ Journal<I>* journal;
+ uint64_t object_no;
+ uint64_t object_off;
+ uint64_t object_len;
+ uint64_t journal_tid;
+ int object_dispatch_flags;
+ Context* on_finish;
+
+ C_CommitIOEvent(I* image_ctx, Journal<I>* journal, uint64_t object_no,
+ uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, int object_dispatch_flags,
+ Context* on_finish)
+ : image_ctx(image_ctx), journal(journal), object_no(object_no),
+ object_off(object_off), object_len(object_len), journal_tid(journal_tid),
+ object_dispatch_flags(object_dispatch_flags), on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ // don't commit the IO extent if a previous dispatch handler will just
+ // retry the failed IO
+ if (r >= 0 ||
+ (object_dispatch_flags &
+ io::OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR) == 0) {
+ io::Extents file_extents;
+ io::util::extent_to_file(image_ctx, object_no, object_off, object_len,
+ file_extents);
+ for (auto& extent : file_extents) {
+ journal->commit_io_event_extent(journal_tid, extent.first,
+ extent.second, r);
+ }
+ }
+
+ if (on_finish != nullptr) {
+ on_finish->complete(r);
+ }
+ }
+};
+
+} // anonymous namespace
+
+template <typename I>
+ObjectDispatch<I>::ObjectDispatch(I* image_ctx, Journal<I>* journal)
+ : m_image_ctx(image_ctx), m_journal(journal) {
+}
+
+template <typename I>
+void ObjectDispatch<I>::shut_down(Context* on_finish) {
+ m_image_ctx->op_work_queue->queue(on_finish, 0);
+}
+
+template <typename I>
+bool ObjectDispatch<I>::discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
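+  // wrap the completion so the journaled event extent is committed (or
+  // skipped on a retryable error) once the object IO completes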
+ *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+ object_off, object_len, *journal_tid,
+ *object_dispatch_flags, *on_finish);
+ *on_finish = create_context_callback<
+ Context, &Context::complete>(*on_finish, m_journal);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << data.length() << dendl;
+
+ *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+ object_off, data.length(), *journal_tid,
+ *object_dispatch_flags, *on_finish);
+ *on_finish = create_context_callback<
+ Context, &Context::complete>(*on_finish, m_journal);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+ object_off, object_len, *journal_tid,
+ *object_dispatch_flags, *on_finish);
+ *on_finish = create_context_callback<
+ Context, &Context::complete>(*on_finish, m_journal);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << write_data.length()
+ << dendl;
+
+ *on_finish = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+ object_off, write_data.length(),
+ *journal_tid, *object_dispatch_flags,
+ *on_finish);
+ *on_finish = create_context_callback<
+ Context, &Context::complete>(*on_finish, m_journal);
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched);
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (*journal_tid == 0) {
+ // non-journaled IO
+ return false;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = *on_finish;
+ *on_finish = new LambdaContext(
+ [image_ctx=m_image_ctx, ctx, journal_tid=*journal_tid](int r) {
+ image_ctx->journal->commit_io_event(journal_tid, r);
+ ctx->complete(r);
+ });
+
+ *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+ wait_or_flush_event(*journal_tid, io::OBJECT_DISPATCH_FLAG_FLUSH,
+ on_dispatched);
+ return true;
+}
+
+template <typename I>
+void ObjectDispatch<I>::extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << object_no << " " << object_off << "~" << object_len
+ << dendl;
+
+ Context *ctx = new C_CommitIOEvent<I>(m_image_ctx, m_journal, object_no,
+                                        object_off, object_len, journal_tid, 0,
+ nullptr);
+ if (new_journal_tid != 0) {
+ // ensure new journal event is safely committed to disk before
+ // committing old event
+ m_journal->flush_event(new_journal_tid, ctx);
+ } else {
+ ctx = create_context_callback<
+ Context, &Context::complete>(ctx, m_journal);
+ ctx->complete(0);
+ }
+}
+
+template <typename I>
+void ObjectDispatch<I>::wait_or_flush_event(
+ uint64_t journal_tid, int object_dispatch_flags, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "journal_tid=" << journal_tid << dendl;
+
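+  // write-ahead semantics: a flush forces the journal event to disk now,
+  // while other IO waits for its event to become safe before dispatching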
+ if ((object_dispatch_flags & io::OBJECT_DISPATCH_FLAG_FLUSH) != 0) {
+ m_journal->flush_event(journal_tid, on_dispatched);
+ } else {
+ m_journal->wait_event(journal_tid, on_dispatched);
+ }
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::ObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/journal/ObjectDispatch.h b/src/librbd/journal/ObjectDispatch.h
new file mode 100644
index 000000000..45e4773cc
--- /dev/null
+++ b/src/librbd/journal/ObjectDispatch.h
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+template <typename> class Journal;
+
+namespace journal {
+
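+// Object dispatch layer that ties modifying object IO to the image journal:
+// writes are held until their journal event is safe, and the event extents
+// are committed once the IO completes.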
+template <typename ImageCtxT = librbd::ImageCtx>
+class ObjectDispatch : public io::ObjectDispatchInterface {
+public:
+ static ObjectDispatch* create(ImageCtxT* image_ctx,
+ Journal<ImageCtxT>* journal) {
+ return new ObjectDispatch(image_ctx, journal);
+ }
+
+ ObjectDispatch(ImageCtxT* image_ctx, Journal<ImageCtxT>* journal);
+
+ io::ObjectDispatchLayer get_dispatch_layer() const override {
+ return io::OBJECT_DISPATCH_LAYER_JOURNAL;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+ return false;
+ }
+
+ bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool flush(
+ io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool list_snaps(
+ uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+ bool reset_existence_cache(Context* on_finish) override {
+ return false;
+ }
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) override;
+
+ int prepare_copyup(
+ uint64_t object_no,
+ io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override {
+ return 0;
+ }
+
+private:
+ ImageCtxT* m_image_ctx;
+ Journal<ImageCtxT>* m_journal;
+
+ void wait_or_flush_event(uint64_t journal_tid, int object_dispatch_flags,
+ Context* on_dispatched);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::ObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H
diff --git a/src/librbd/journal/OpenRequest.cc b/src/librbd/journal/OpenRequest.cc
new file mode 100644
index 000000000..eb01aa35a
--- /dev/null
+++ b/src/librbd/journal/OpenRequest.cc
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/OpenRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::OpenRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using util::C_DecodeTags;
+
+template <typename I>
+OpenRequest<I>::OpenRequest(I *image_ctx, Journaler *journaler, ceph::mutex *lock,
+ journal::ImageClientMeta *client_meta,
+ uint64_t *tag_tid, journal::TagData *tag_data,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_journaler(journaler), m_lock(lock),
+ m_client_meta(client_meta), m_tag_tid(tag_tid), m_tag_data(tag_data),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void OpenRequest<I>::send() {
+ send_init();
+}
+
+template <typename I>
+void OpenRequest<I>::send_init() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ m_journaler->init(create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ OpenRequest<I>, &OpenRequest<I>::handle_init>(this)));
+}
+
+template <typename I>
+void OpenRequest<I>::handle_init(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to initialize journal: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ // locate the master image client record
+ cls::journal::Client client;
+ r = m_journaler->get_cached_client(Journal<ImageCtx>::IMAGE_CLIENT_ID,
+ &client);
+ if (r < 0) {
+ lderr(cct) << "failed to locate master image client" << dendl;
+ finish(r);
+ return;
+ }
+
+ librbd::journal::ClientData client_data;
+ auto bl = client.data.cbegin();
+ try {
+ decode(client_data, bl);
+ } catch (const buffer::error &err) {
+ lderr(cct) << "failed to decode client meta data: " << err.what()
+ << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ journal::ImageClientMeta *image_client_meta =
+ boost::get<journal::ImageClientMeta>(&client_data.client_meta);
+ if (image_client_meta == nullptr) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to extract client meta data" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "client: " << client << ", "
+ << "image meta: " << *image_client_meta << dendl;
+
+ m_tag_class = image_client_meta->tag_class;
+ {
+ std::lock_guard locker{*m_lock};
+ *m_client_meta = *image_client_meta;
+ }
+
+ send_get_tags();
+}
+
+template <typename I>
+void OpenRequest<I>::send_get_tags() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ C_DecodeTags *tags_ctx = new C_DecodeTags(
+ cct, m_lock, m_tag_tid, m_tag_data, create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ OpenRequest<I>, &OpenRequest<I>::handle_get_tags>(this)));
+ m_journaler->get_tags(m_tag_class, &tags_ctx->tags, tags_ctx);
+}
+
+template <typename I>
+void OpenRequest<I>::handle_get_tags(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << this << " " << __func__ << ": "
+ << "failed to decode journal tags: " << cpp_strerror(r) << dendl;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void OpenRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::OpenRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/OpenRequest.h b/src/librbd/journal/OpenRequest.h
new file mode 100644
index 000000000..0f10bccf1
--- /dev/null
+++ b/src/librbd/journal/OpenRequest.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H
+
+#include "common/ceph_mutex.h"
+#include "include/int_types.h"
+#include "librbd/journal/TypeTraits.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+struct ImageClientMeta;
+struct TagData;
+
+template <typename ImageCtxT = ImageCtx>
+class OpenRequest {
+public:
+ typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+
+ static OpenRequest* create(ImageCtxT *image_ctx, Journaler *journaler,
+ ceph::mutex *lock, journal::ImageClientMeta *client_meta,
+ uint64_t *tag_tid, journal::TagData *tag_data,
+ Context *on_finish) {
+ return new OpenRequest(image_ctx, journaler, lock, client_meta, tag_tid,
+ tag_data, on_finish);
+ }
+
+ OpenRequest(ImageCtxT *image_ctx, Journaler *journaler, ceph::mutex *lock,
+ journal::ImageClientMeta *client_meta, uint64_t *tag_tid,
+ journal::TagData *tag_data, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * INIT
+ * |
+ * v
+ * GET_TAGS
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ Journaler *m_journaler;
+ ceph::mutex *m_lock;
+ journal::ImageClientMeta *m_client_meta;
+ uint64_t *m_tag_tid;
+ journal::TagData *m_tag_data;
+ Context *m_on_finish;
+
+ uint64_t m_tag_class = 0;
+
+ void send_init();
+ void handle_init(int r);
+
+ void send_get_tags();
+ void handle_get_tags(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::OpenRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H
diff --git a/src/librbd/journal/Policy.h b/src/librbd/journal/Policy.h
new file mode 100644
index 000000000..1ced3c53e
--- /dev/null
+++ b/src/librbd/journal/Policy.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_POLICY_H
+#define CEPH_LIBRBD_JOURNAL_POLICY_H
+
+class Context;
+
+namespace librbd {
+
+namespace journal {
+
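+// pluggable policy controlling whether journaling is enabled for an image
+// and whether events may currently be appended to its journal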
+struct Policy {
+ virtual ~Policy() {
+ }
+
+ virtual bool append_disabled() const = 0;
+ virtual bool journal_disabled() const = 0;
+ virtual void allocate_tag_on_lock(Context *on_finish) = 0;
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_POLICY_H
diff --git a/src/librbd/journal/PromoteRequest.cc b/src/librbd/journal/PromoteRequest.cc
new file mode 100644
index 000000000..f7ae45a92
--- /dev/null
+++ b/src/librbd/journal/PromoteRequest.cc
@@ -0,0 +1,237 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/PromoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/OpenRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::PromoteRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+PromoteRequest<I>::PromoteRequest(I *image_ctx, bool force, Context *on_finish)
+ : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish),
+ m_lock(ceph::make_mutex("PromoteRequest::m_lock")) {
+}
+
+template <typename I>
+void PromoteRequest<I>::send() {
+ send_open();
+}
+
+template <typename I>
+void PromoteRequest<I>::send_open() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ m_journaler = new Journaler(m_image_ctx->md_ctx, m_image_ctx->id,
+ Journal<>::IMAGE_CLIENT_ID, {}, nullptr);
+ Context *ctx = create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_open>(this));
+ auto open_req = OpenRequest<I>::create(m_image_ctx, m_journaler,
+ &m_lock, &m_client_meta,
+ &m_tag_tid, &m_tag_data, ctx);
+ open_req->send();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_open(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl;
+ shut_down();
+ return;
+ }
+
+ allocate_tag();
+}
+
+template <typename I>
+void PromoteRequest<I>::allocate_tag() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ journal::TagPredecessor predecessor;
+ if (!m_force && m_tag_data.mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID) {
+ // orderly promotion -- demotion epoch will have a single entry
+ // so link to our predecessor (demotion) epoch
+ predecessor = TagPredecessor{Journal<>::ORPHAN_MIRROR_UUID, true, m_tag_tid,
+ 1};
+ } else {
+ // forced promotion -- create an epoch no peers can link against
+ predecessor = TagPredecessor{Journal<>::LOCAL_MIRROR_UUID, true, m_tag_tid,
+ 0};
+ }
+
+ TagData tag_data;
+ tag_data.mirror_uuid = Journal<>::LOCAL_MIRROR_UUID;
+ tag_data.predecessor = predecessor;
+
+ bufferlist tag_bl;
+ encode(tag_data, tag_bl);
+
+ Context *ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_allocate_tag>(this);
+ m_journaler->allocate_tag(m_client_meta.tag_class, tag_bl, &m_tag, ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_allocate_tag(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl;
+ shut_down();
+ return;
+ }
+
+ m_tag_tid = m_tag.tid;
+ append_event();
+}
+
+template <typename I>
+void PromoteRequest<I>::append_event() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ EventEntry event_entry{DemotePromoteEvent{}, {}};
+ bufferlist event_entry_bl;
+ encode(event_entry, event_entry_bl);
+
+ m_journaler->start_append(0);
+ m_future = m_journaler->append(m_tag_tid, event_entry_bl);
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_append_event>(this);
+ m_future.flush(ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_append_event(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to append promotion journal event: "
+ << cpp_strerror(r) << dendl;
+ stop_append();
+ return;
+ }
+
+ commit_event();
+}
+
+template <typename I>
+void PromoteRequest<I>::commit_event() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ m_journaler->committed(m_future);
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_commit_event>(this);
+ m_journaler->flush_commit_position(ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_commit_event(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to flush promote commit position: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ stop_append();
+}
+
+template <typename I>
+void PromoteRequest<I>::stop_append() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_stop_append>(this);
+ m_journaler->stop_append(ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_stop_append(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl;
+ }
+
+ shut_down();
+}
+
+template <typename I>
+void PromoteRequest<I>::shut_down() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ Context *ctx = create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_shut_down>(this));
+ m_journaler->shut_down(ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_shut_down(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to shut down journal: " << cpp_strerror(r) << dendl;
+ }
+
+ delete m_journaler;
+ finish(r);
+}
+
+template <typename I>
+void PromoteRequest<I>::finish(int r) {
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::PromoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/PromoteRequest.h b/src/librbd/journal/PromoteRequest.h
new file mode 100644
index 000000000..f6258066e
--- /dev/null
+++ b/src/librbd/journal/PromoteRequest.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/Future.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+template <typename ImageCtxT = ImageCtx>
+class PromoteRequest {
+public:
+ static PromoteRequest* create(ImageCtxT *image_ctx, bool force,
+ Context *on_finish) {
+ return new PromoteRequest(image_ctx, force, on_finish);
+ }
+
+ PromoteRequest(ImageCtxT *image_ctx, bool force, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN * * * * * * * * * *
+ * | *
+ * v *
+ * ALLOCATE_TAG * * * * * *
+ * | *
+ * v *
+ * APPEND_EVENT * * * *
+ * | * *
+ * v * *
+ * COMMIT_EVENT * *
+ * | * *
+ * v * *
+ * STOP_APPEND <* * * *
+ * | *
+ * v *
+ * SHUT_DOWN <* * * * * * *
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+ typedef typename TypeTraits<ImageCtxT>::Future Future;
+
+ ImageCtxT *m_image_ctx;
+ bool m_force;
+ Context *m_on_finish;
+
+ Journaler *m_journaler = nullptr;
+ int m_ret_val = 0;
+
+ ceph::mutex m_lock;
+ ImageClientMeta m_client_meta;
+ uint64_t m_tag_tid = 0;
+ TagData m_tag_data;
+
+ cls::journal::Tag m_tag;
+ Future m_future;
+
+ void send_open();
+ void handle_open(int r);
+
+ void allocate_tag();
+ void handle_allocate_tag(int r);
+
+ void append_event();
+ void handle_append_event(int r);
+
+ void commit_event();
+ void handle_commit_event(int r);
+
+ void stop_append();
+ void handle_stop_append(int r);
+
+ void shut_down();
+ void handle_shut_down(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::PromoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H
diff --git a/src/librbd/journal/RemoveRequest.cc b/src/librbd/journal/RemoveRequest.cc
new file mode 100644
index 000000000..0f73a31ba
--- /dev/null
+++ b/src/librbd/journal/RemoveRequest.cc
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "journal/Settings.h"
+#include "include/ceph_assert.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Journal::RemoveRequest: "
+
+namespace librbd {
+
+using util::create_context_callback;
+
+namespace journal {
+
+template<typename I>
+RemoveRequest<I>::RemoveRequest(IoCtx &ioctx, const std::string &image_id,
+ const std::string &client_id,
+ ContextWQ *op_work_queue,
+ Context *on_finish)
+ : m_ioctx(ioctx), m_image_id(image_id), m_image_client_id(client_id),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+ m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+}
+
+template<typename I>
+void RemoveRequest<I>::send() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ stat_journal();
+}
+
+template<typename I>
+void RemoveRequest<I>::stat_journal() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ ImageCtx::get_timer_instance(m_cct, &m_timer, &m_timer_lock);
+ m_journaler = new Journaler(m_op_work_queue, m_timer, m_timer_lock, m_ioctx,
+ m_image_id, m_image_client_id, {}, nullptr);
+
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_stat_journal>(this);
+
+ m_journaler->exists(ctx);
+}
+
+template<typename I>
+Context *RemoveRequest<I>::handle_stat_journal(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if ((*result < 0) && (*result != -ENOENT)) {
+ lderr(m_cct) << "failed to stat journal header: " << cpp_strerror(*result) << dendl;
+ shut_down_journaler(*result);
+ return nullptr;
+ }
+
+ if (*result == -ENOENT) {
+ shut_down_journaler(0);
+ return nullptr;
+ }
+
+ init_journaler();
+ return nullptr;
+}
+
+template<typename I>
+void RemoveRequest<I>::init_journaler() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_init_journaler>(this);
+
+ m_journaler->init(ctx);
+}
+
+template<typename I>
+Context *RemoveRequest<I>::handle_init_journaler(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if ((*result < 0) && (*result != -ENOENT)) {
+ lderr(m_cct) << "failed to init journaler: " << cpp_strerror(*result) << dendl;
+ shut_down_journaler(*result);
+ return nullptr;
+ }
+
+ remove_journal();
+ return nullptr;
+}
+
+template<typename I>
+void RemoveRequest<I>::remove_journal() {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_journal>(this);
+
+ m_journaler->remove(true, ctx);
+}
+
+template<typename I>
+Context *RemoveRequest<I>::handle_remove_journal(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "failed to remove journal: " << cpp_strerror(*result) << dendl;
+ }
+
+ shut_down_journaler(*result);
+ return nullptr;
+}
+
+template<typename I>
+void RemoveRequest<I>::shut_down_journaler(int r) {
+ ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+ m_r_saved = r;
+
+ using klass = RemoveRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_journaler_shutdown>(this);
+
+ m_journaler->shut_down(ctx);
+}
+
+template<typename I>
+Context *RemoveRequest<I>::handle_journaler_shutdown(int *result) {
+ ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(*result) << dendl;
+ }
+
+ delete m_journaler;
+
+ if (m_r_saved == 0) {
+ ldout(m_cct, 20) << "done." << dendl;
+ }
+
+ m_on_finish->complete(m_r_saved);
+ delete this;
+
+ return nullptr;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::RemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/RemoveRequest.h b/src/librbd/journal/RemoveRequest.h
new file mode 100644
index 000000000..14b1c4dc5
--- /dev/null
+++ b/src/librbd/journal/RemoveRequest.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "librbd/ImageCtx.h"
+#include "journal/Journaler.h"
+#include "librbd/journal/TypeTraits.h"
+#include "common/Timer.h"
+
+using librados::IoCtx;
+using journal::Journaler;
+
+class Context;
+class ContextWQ;
+
+namespace journal {
+ class Journaler;
+}
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace journal {
+
+template<typename ImageCtxT = ImageCtx>
+class RemoveRequest {
+public:
+ static RemoveRequest *create(IoCtx &ioctx, const std::string &image_id,
+ const std::string &client_id,
+ ContextWQ *op_work_queue, Context *on_finish) {
+ return new RemoveRequest(ioctx, image_id, client_id,
+ op_work_queue, on_finish);
+ }
+
+ void send();
+
+private:
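+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * STAT_JOURNAL * * * * * * (error or -ENOENT)
+   *    |                   *
+   *    v                   * (error)
+   * INIT_JOURNALER * * * * *
+   *    |                   *
+   *    v                   *
+   * REMOVE_JOURNAL         *
+   *    |                   *
+   *    v                   v
+   * SHUT_DOWN_JOURNALER <* *
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+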
+ typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+
+ RemoveRequest(IoCtx &ioctx, const std::string &image_id,
+ const std::string &client_id,
+ ContextWQ *op_work_queue, Context *on_finish);
+
+ IoCtx &m_ioctx;
+ std::string m_image_id;
+ std::string m_image_client_id;
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ Journaler *m_journaler;
+ SafeTimer *m_timer;
+ ceph::mutex *m_timer_lock;
+  int m_r_saved = 0;
+
+ void stat_journal();
+ Context *handle_stat_journal(int *result);
+
+ void init_journaler();
+ Context *handle_init_journaler(int *result);
+
+ void remove_journal();
+ Context *handle_remove_journal(int *result);
+
+ void shut_down_journaler(int r);
+ Context *handle_journaler_shutdown(int *result);
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H
diff --git a/src/librbd/journal/Replay.cc b/src/librbd/journal/Replay.cc
new file mode 100644
index 000000000..db73edb61
--- /dev/null
+++ b/src/librbd/journal/Replay.cc
@@ -0,0 +1,1177 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/Replay.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " "
+
+namespace librbd {
+namespace journal {
+
+namespace {
+
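+// replay throttling watermarks: new events stop being accepted once the
+// in-flight IO count reaches the high watermark and resume below the low one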
+static const uint64_t IN_FLIGHT_IO_LOW_WATER_MARK(32);
+static const uint64_t IN_FLIGHT_IO_HIGH_WATER_MARK(64);
+
+static NoOpProgressContext no_op_progress_callback;
+
+template <typename I, typename E>
+struct ExecuteOp : public Context {
+ I &image_ctx;
+ E event;
+ Context *on_op_complete;
+
+ ExecuteOp(I &image_ctx, const E &event, Context *on_op_complete)
+ : image_ctx(image_ctx), event(event), on_op_complete(on_op_complete) {
+ }
+
+ void execute(const journal::SnapCreateEvent &_) {
+ image_ctx.operations->execute_snap_create(event.snap_namespace,
+ event.snap_name,
+ on_op_complete,
+ event.op_tid,
+ SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE,
+ no_op_progress_callback);
+ }
+
+ void execute(const journal::SnapRemoveEvent &_) {
+ image_ctx.operations->execute_snap_remove(event.snap_namespace,
+ event.snap_name,
+ on_op_complete);
+ }
+
+ void execute(const journal::SnapRenameEvent &_) {
+ image_ctx.operations->execute_snap_rename(event.snap_id,
+ event.dst_snap_name,
+ on_op_complete);
+ }
+
+ void execute(const journal::SnapProtectEvent &_) {
+ image_ctx.operations->execute_snap_protect(event.snap_namespace,
+ event.snap_name,
+ on_op_complete);
+ }
+
+ void execute(const journal::SnapUnprotectEvent &_) {
+ image_ctx.operations->execute_snap_unprotect(event.snap_namespace,
+ event.snap_name,
+ on_op_complete);
+ }
+
+ void execute(const journal::SnapRollbackEvent &_) {
+ image_ctx.operations->execute_snap_rollback(event.snap_namespace,
+ event.snap_name,
+ no_op_progress_callback,
+ on_op_complete);
+ }
+
+ void execute(const journal::RenameEvent &_) {
+ image_ctx.operations->execute_rename(event.image_name,
+ on_op_complete);
+ }
+
+ void execute(const journal::ResizeEvent &_) {
+ image_ctx.operations->execute_resize(event.size, true, no_op_progress_callback,
+ on_op_complete, event.op_tid);
+ }
+
+ void execute(const journal::FlattenEvent &_) {
+ image_ctx.operations->execute_flatten(no_op_progress_callback,
+ on_op_complete);
+ }
+
+ void execute(const journal::SnapLimitEvent &_) {
+ image_ctx.operations->execute_snap_set_limit(event.limit, on_op_complete);
+ }
+
+ void execute(const journal::UpdateFeaturesEvent &_) {
+ image_ctx.operations->execute_update_features(event.features, event.enabled,
+ on_op_complete, event.op_tid);
+ }
+
+ void execute(const journal::MetadataSetEvent &_) {
+ image_ctx.operations->execute_metadata_set(event.key, event.value,
+ on_op_complete);
+ }
+
+ void execute(const journal::MetadataRemoveEvent &_) {
+ image_ctx.operations->execute_metadata_remove(event.key, on_op_complete);
+ }
+
+ void finish(int r) override {
+ CephContext *cct = image_ctx.cct;
+ if (r < 0) {
+ lderr(cct) << ": ExecuteOp::" << __func__ << ": r=" << r << dendl;
+ on_op_complete->complete(r);
+ return;
+ }
+
+ ldout(cct, 20) << ": ExecuteOp::" << __func__ << dendl;
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+
+ if (image_ctx.exclusive_lock == nullptr ||
+ !image_ctx.exclusive_lock->accept_ops()) {
+ ldout(cct, 5) << ": lost exclusive lock -- skipping op" << dendl;
+ on_op_complete->complete(-ECANCELED);
+ return;
+ }
+
+ execute(event);
+ }
+};
+
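+// Helper context that refreshes the image (when a refresh is pending)
+// before queueing the wrapped on_finish context.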
+template <typename I>
+struct C_RefreshIfRequired : public Context {
+ I &image_ctx;
+ Context *on_finish;
+
+ C_RefreshIfRequired(I &image_ctx, Context *on_finish)
+ : image_ctx(image_ctx), on_finish(on_finish) {
+ }
+ ~C_RefreshIfRequired() override {
+ delete on_finish;
+ }
+
+ void finish(int r) override {
+ CephContext *cct = image_ctx.cct;
+ Context *ctx = on_finish;
+ on_finish = nullptr;
+
+ if (r < 0) {
+ lderr(cct) << ": C_RefreshIfRequired::" << __func__ << ": r=" << r << dendl;
+ image_ctx.op_work_queue->queue(ctx, r);
+ return;
+ }
+
+ if (image_ctx.state->is_refresh_required()) {
+ ldout(cct, 20) << ": C_RefreshIfRequired::" << __func__ << ": "
+ << "refresh required" << dendl;
+ image_ctx.state->refresh(ctx);
+ return;
+ }
+
+ image_ctx.op_work_queue->queue(ctx, 0);
+ }
+};
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " " \
+ << __func__
+
+template <typename I>
+Replay<I>::Replay(I &image_ctx)
+ : m_image_ctx(image_ctx) {
+}
+
+template <typename I>
+Replay<I>::~Replay() {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_in_flight_aio_flush == 0);
+ ceph_assert(m_in_flight_aio_modify == 0);
+ ceph_assert(m_aio_modify_unsafe_contexts.empty());
+ ceph_assert(m_aio_modify_safe_contexts.empty());
+ ceph_assert(m_op_events.empty());
+ ceph_assert(m_in_flight_op_events == 0);
+}
+
+template <typename I>
+int Replay<I>::decode(bufferlist::const_iterator *it, EventEntry *event_entry) {
+ try {
+ using ceph::decode;
+ decode(*event_entry, *it);
+ } catch (const buffer::error &err) {
+ return -EBADMSG;
+ }
+ return 0;
+}
+
+template <typename I>
+void Replay<I>::process(const EventEntry &event_entry,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": on_ready=" << on_ready << ", on_safe=" << on_safe
+ << dendl;
+
+ on_ready = util::create_async_context_callback(m_image_ctx, on_ready);
+
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ if (m_image_ctx.exclusive_lock == nullptr ||
+ !m_image_ctx.exclusive_lock->accept_ops()) {
+ ldout(cct, 5) << ": lost exclusive lock -- skipping event" << dendl;
+ m_image_ctx.op_work_queue->queue(on_safe, -ECANCELED);
+ on_ready->complete(0);
+ return;
+ }
+
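+ // dispatch to the type-specific handle_event() overload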
+ boost::apply_visitor(EventVisitor(this, on_ready, on_safe),
+ event_entry.event);
+}
+
+template <typename I>
+void Replay<I>::shut_down(bool cancel_ops, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ io::AioCompletion *flush_comp = nullptr;
+ on_finish = util::create_async_context_callback(
+ m_image_ctx, on_finish);
+
+ {
+ std::lock_guard locker{m_lock};
+
+ // safely commit any remaining AIO modify operations
+ if ((m_in_flight_aio_flush + m_in_flight_aio_modify) != 0) {
+ flush_comp = create_aio_flush_completion(nullptr);
+ ceph_assert(flush_comp != nullptr);
+ }
+
+ for (auto &op_event_pair : m_op_events) {
+ OpEvent &op_event = op_event_pair.second;
+ if (cancel_ops) {
+ // cancel ops that are waiting to start (waiting for
+ // OpFinishEvent or waiting for ready)
+ if (op_event.on_start_ready == nullptr &&
+ op_event.on_op_finish_event != nullptr) {
+ Context *on_op_finish_event = nullptr;
+ std::swap(on_op_finish_event, op_event.on_op_finish_event);
+ m_image_ctx.op_work_queue->queue(on_op_finish_event, -ERESTART);
+ }
+ } else if (op_event.on_op_finish_event != nullptr) {
+ // start ops waiting for OpFinishEvent
+ Context *on_op_finish_event = nullptr;
+ std::swap(on_op_finish_event, op_event.on_op_finish_event);
+ m_image_ctx.op_work_queue->queue(on_op_finish_event, 0);
+ } else if (op_event.on_start_ready != nullptr) {
+ // waiting for op ready
+ op_event_pair.second.finish_on_ready = true;
+ }
+ }
+
+ ceph_assert(!m_shut_down);
+ m_shut_down = true;
+
+ ceph_assert(m_flush_ctx == nullptr);
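+ // defer on_finish until all in-flight op events and IO have completed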
+ if (m_in_flight_op_events > 0 || flush_comp != nullptr) {
+ std::swap(m_flush_ctx, on_finish);
+ }
+ }
+
+ // execute the following outside of lock scope
+ if (flush_comp != nullptr) {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ }
+ if (on_finish != nullptr) {
+ on_finish->complete(0);
+ }
+}
+
+template <typename I>
+void Replay<I>::flush(Context *on_finish) {
+ io::AioCompletion *aio_comp;
+ {
+ std::lock_guard locker{m_lock};
+ aio_comp = create_aio_flush_completion(
+ util::create_async_context_callback(m_image_ctx, on_finish));
+ if (aio_comp == nullptr) {
+ return;
+ }
+ }
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+}
+
+template <typename I>
+void Replay<I>::replay_op_ready(uint64_t op_tid, Context *on_resume) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": op_tid=" << op_tid << dendl;
+
+ std::lock_guard locker{m_lock};
+ auto op_it = m_op_events.find(op_tid);
+ ceph_assert(op_it != m_op_events.end());
+
+ OpEvent &op_event = op_it->second;
+ ceph_assert(op_event.op_in_progress &&
+ op_event.on_op_finish_event == nullptr &&
+ op_event.on_finish_ready == nullptr &&
+ op_event.on_finish_safe == nullptr);
+
+ // resume processing replay events
+ Context *on_start_ready = nullptr;
+ std::swap(on_start_ready, op_event.on_start_ready);
+ on_start_ready->complete(0);
+
+ // cancel has been requested -- send error to paused state machine
+ if (!op_event.finish_on_ready && m_flush_ctx != nullptr) {
+ m_image_ctx.op_work_queue->queue(on_resume, -ERESTART);
+ return;
+ }
+
+ // resume the op state machine once the associated OpFinishEvent
+ // is processed
+ op_event.on_op_finish_event = new LambdaContext(
+ [on_resume](int r) {
+ on_resume->complete(r);
+ });
+
+ // shut down request -- don't expect OpFinishEvent
+ if (op_event.finish_on_ready) {
+ m_image_ctx.op_work_queue->queue(on_resume, 0);
+ }
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::AioDiscardEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": AIO discard event" << dendl;
+
+ bool flush_required;
+ auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
+ io::AIO_TYPE_DISCARD,
+ &flush_required,
+ {});
+ if (aio_comp == nullptr) {
+ return;
+ }
+
+ if (!clipped_io(event.offset, aio_comp)) {
+ io::ImageRequest<I>::aio_discard(&m_image_ctx, aio_comp,
+ {{event.offset, event.length}},
+ event.discard_granularity_bytes,
+ m_image_ctx.get_data_io_context(), {});
+ }
+
+ if (flush_required) {
+ m_lock.lock();
+ auto flush_comp = create_aio_flush_completion(nullptr);
+ m_lock.unlock();
+
+ if (flush_comp != nullptr) {
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ }
+ }
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::AioWriteEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": AIO write event" << dendl;
+
+ bufferlist data = event.data;
+ bool flush_required;
+ auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
+ io::AIO_TYPE_WRITE,
+ &flush_required,
+ {});
+ if (aio_comp == nullptr) {
+ return;
+ }
+
+ if (!clipped_io(event.offset, aio_comp)) {
+ io::ImageRequest<I>::aio_write(&m_image_ctx, aio_comp,
+ {{event.offset, event.length}},
+ std::move(data),
+ m_image_ctx.get_data_io_context(), 0, {});
+ }
+
+ if (flush_required) {
+ m_lock.lock();
+ auto flush_comp = create_aio_flush_completion(nullptr);
+ m_lock.unlock();
+
+ if (flush_comp != nullptr) {
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ }
+ }
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::AioFlushEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": AIO flush event" << dendl;
+
+ io::AioCompletion *aio_comp;
+ {
+ std::lock_guard locker{m_lock};
+ aio_comp = create_aio_flush_completion(on_safe);
+ }
+
+ if (aio_comp != nullptr) {
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ }
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::AioWriteSameEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": AIO writesame event" << dendl;
+
+ bufferlist data = event.data;
+ bool flush_required;
+ auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
+ io::AIO_TYPE_WRITESAME,
+ &flush_required,
+ {});
+ if (aio_comp == nullptr) {
+ return;
+ }
+
+ if (!clipped_io(event.offset, aio_comp)) {
+ io::ImageRequest<I>::aio_writesame(&m_image_ctx, aio_comp,
+ {{event.offset, event.length}},
+ std::move(data),
+ m_image_ctx.get_data_io_context(), 0,
+ {});
+ }
+
+ if (flush_required) {
+ m_lock.lock();
+ auto flush_comp = create_aio_flush_completion(nullptr);
+ m_lock.unlock();
+
+ if (flush_comp != nullptr) {
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ }
+ }
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::AioCompareAndWriteEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": AIO CompareAndWrite event" << dendl;
+
+ bufferlist cmp_data = event.cmp_data;
+ bufferlist write_data = event.write_data;
+ bool flush_required;
+ auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
+ io::AIO_TYPE_COMPARE_AND_WRITE,
+ &flush_required,
+ {-EILSEQ});
+
+ if (aio_comp == nullptr) {
+ return;
+ }
+
+ if (!clipped_io(event.offset, aio_comp)) {
+ io::ImageRequest<I>::aio_compare_and_write(&m_image_ctx, aio_comp,
+ {{event.offset, event.length}},
+ std::move(cmp_data),
+ std::move(write_data),
+ nullptr,
+ m_image_ctx.get_data_io_context(),
+ 0, {});
+ }
+
+ if (flush_required) {
+ m_lock.lock();
+ auto flush_comp = create_aio_flush_completion(nullptr);
+ m_lock.unlock();
+
+ if (flush_comp != nullptr) {
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ }
+ }
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::OpFinishEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Op finish event: "
+ << "op_tid=" << event.op_tid << dendl;
+
+ bool op_in_progress;
+ bool filter_ret_val;
+ Context *on_op_complete = nullptr;
+ Context *on_op_finish_event = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+ auto op_it = m_op_events.find(event.op_tid);
+ if (op_it == m_op_events.end()) {
+ ldout(cct, 10) << ": unable to locate associated op: assuming previously "
+ << "committed." << dendl;
+ on_ready->complete(0);
+ m_image_ctx.op_work_queue->queue(on_safe, 0);
+ return;
+ }
+
+ OpEvent &op_event = op_it->second;
+ ceph_assert(op_event.on_finish_safe == nullptr);
+ op_event.on_finish_ready = on_ready;
+ op_event.on_finish_safe = on_safe;
+ op_in_progress = op_event.op_in_progress;
+ std::swap(on_op_complete, op_event.on_op_complete);
+ std::swap(on_op_finish_event, op_event.on_op_finish_event);
+
+ // special errors which indicate the op never started but was
+ // recorded as failed in the journal
+ filter_ret_val = (op_event.op_finish_error_codes.count(event.r) != 0);
+ }
+
+ if (event.r < 0) {
+ if (op_in_progress) {
+ // bubble the error up to the in-progress op to cancel it
+ on_op_finish_event->complete(event.r);
+ } else {
+ // op hasn't been started -- bubble the error up since the
+ // image is now potentially in an inconsistent state: simple
+ // errors should have been caught before creating the op event
+ delete on_op_complete;
+ delete on_op_finish_event;
+ handle_op_complete(event.op_tid, filter_ret_val ? 0 : event.r);
+ }
+ return;
+ }
+
+ // journal recorded success -- apply the op now
+ on_op_finish_event->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapCreateEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Snap create event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ // ignore errors caused by replay
+ op_event->ignore_error_codes = {-EEXIST};
+
+ // avoid lock cycles
+ m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::SnapCreateEvent>(m_image_ctx, event,
+ on_op_complete)),
+ 0);
+
+ // do not process more events until the state machine is ready
+ // since it will affect IO
+ op_event->op_in_progress = true;
+ op_event->on_start_ready = on_ready;
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapRemoveEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Snap remove event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ op_event->on_op_finish_event = new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::SnapRemoveEvent>(m_image_ctx, event,
+ on_op_complete));
+
+ // ignore errors caused by replay
+ op_event->ignore_error_codes = {-ENOENT};
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapRenameEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Snap rename event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ op_event->on_op_finish_event = new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::SnapRenameEvent>(m_image_ctx, event,
+ on_op_complete));
+
+ // ignore errors caused by replay
+ op_event->ignore_error_codes = {-EEXIST};
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapProtectEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Snap protect event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ op_event->on_op_finish_event = new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::SnapProtectEvent>(m_image_ctx, event,
+ on_op_complete));
+
+ // ignore errors caused by replay
+ op_event->ignore_error_codes = {-EBUSY};
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapUnprotectEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Snap unprotect event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ op_event->on_op_finish_event = new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::SnapUnprotectEvent>(m_image_ctx,
+ event,
+ on_op_complete));
+
+ // ignore errors recorded in the journal
+ op_event->op_finish_error_codes = {-EBUSY};
+
+ // ignore errors caused by replay
+ op_event->ignore_error_codes = {-EINVAL};
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapRollbackEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Snap rollback start event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ op_event->on_op_finish_event = new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::SnapRollbackEvent>(m_image_ctx,
+ event,
+ on_op_complete));
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::RenameEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Rename event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ op_event->on_op_finish_event = new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::RenameEvent>(m_image_ctx, event,
+ on_op_complete));
+
+ // ignore errors caused by replay
+ op_event->ignore_error_codes = {-EEXIST};
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::ResizeEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Resize start event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ // avoid lock cycles
+ m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::ResizeEvent>(m_image_ctx, event,
+ on_op_complete)), 0);
+
+ // do not process more events until the state machine is ready
+ // since it will affect IO
+ op_event->op_in_progress = true;
+ op_event->on_start_ready = on_ready;
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::FlattenEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Flatten start event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ op_event->on_op_finish_event = new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::FlattenEvent>(m_image_ctx, event,
+ on_op_complete));
+
+ // ignore errors caused by replay
+ op_event->ignore_error_codes = {-EINVAL};
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::DemotePromoteEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Demote/Promote event" << dendl;
+ on_ready->complete(0);
+ on_safe->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::SnapLimitEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Snap limit event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ op_event->on_op_finish_event = new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::SnapLimitEvent>(m_image_ctx,
+ event,
+ on_op_complete));
+
+ op_event->ignore_error_codes = {-ERANGE};
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::UpdateFeaturesEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Update features event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ // avoid lock cycles
+ m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>(
+ m_image_ctx, new ExecuteOp<I, journal::UpdateFeaturesEvent>(
+ m_image_ctx, event, on_op_complete)), 0);
+
+ // do not process more events until the state machine is ready
+ // since it will affect IO
+ op_event->op_in_progress = true;
+ op_event->on_start_ready = on_ready;
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::MetadataSetEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Metadata set event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ on_op_complete = new C_RefreshIfRequired<I>(m_image_ctx, on_op_complete);
+ op_event->on_op_finish_event = util::create_async_context_callback(
+ m_image_ctx, new ExecuteOp<I, journal::MetadataSetEvent>(
+ m_image_ctx, event, on_op_complete));
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::MetadataRemoveEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": Metadata remove event" << dendl;
+
+ std::lock_guard locker{m_lock};
+ OpEvent *op_event;
+ Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready,
+ on_safe, &op_event);
+ if (on_op_complete == nullptr) {
+ return;
+ }
+
+ on_op_complete = new C_RefreshIfRequired<I>(m_image_ctx, on_op_complete);
+ op_event->on_op_finish_event = util::create_async_context_callback(
+ m_image_ctx, new ExecuteOp<I, journal::MetadataRemoveEvent>(
+ m_image_ctx, event, on_op_complete));
+
+ // ignore errors caused by replay
+ op_event->ignore_error_codes = {-ENOENT};
+
+ on_ready->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_event(const journal::UnknownEvent &event,
+ Context *on_ready, Context *on_safe) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": unknown event" << dendl;
+ on_ready->complete(0);
+ on_safe->complete(0);
+}
+
+template <typename I>
+void Replay<I>::handle_aio_modify_complete(Context *on_ready, Context *on_safe,
+ int r, std::set<int> &filters) {
+ std::lock_guard locker{m_lock};
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": on_ready=" << on_ready << ", "
+ << "on_safe=" << on_safe << ", r=" << r << dendl;
+
+ if (on_ready != nullptr) {
+ on_ready->complete(0);
+ }
+
+ if (filters.find(r) != filters.end())
+ r = 0;
+
+ if (r < 0) {
+ lderr(cct) << ": AIO modify op failed: " << cpp_strerror(r) << dendl;
+ m_image_ctx.op_work_queue->queue(on_safe, r);
+ return;
+ }
+
+ // will be completed after next flush operation completes
+ m_aio_modify_safe_contexts.insert(on_safe);
+}
+
+template <typename I>
+void Replay<I>::handle_aio_flush_complete(Context *on_flush_safe,
+ Contexts &on_safe_ctxs, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << ": AIO flush failed: " << cpp_strerror(r) << dendl;
+ }
+
+ Context *on_aio_ready = nullptr;
+ Context *on_flush = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_in_flight_aio_flush > 0);
+ ceph_assert(m_in_flight_aio_modify >= on_safe_ctxs.size());
+ --m_in_flight_aio_flush;
+ m_in_flight_aio_modify -= on_safe_ctxs.size();
+
+ std::swap(on_aio_ready, m_on_aio_ready);
+ if (m_in_flight_op_events == 0 &&
+ (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) {
+ on_flush = m_flush_ctx;
+ }
+
+ // strip out previously failed on_safe contexts
+ for (auto it = on_safe_ctxs.begin(); it != on_safe_ctxs.end(); ) {
+ if (m_aio_modify_safe_contexts.erase(*it)) {
+ ++it;
+ } else {
+ it = on_safe_ctxs.erase(it);
+ }
+ }
+ }
+
+ if (on_aio_ready != nullptr) {
+ ldout(cct, 10) << ": resuming paused AIO" << dendl;
+ on_aio_ready->complete(0);
+ }
+
+ if (on_flush_safe != nullptr) {
+ on_safe_ctxs.push_back(on_flush_safe);
+ }
+ for (auto ctx : on_safe_ctxs) {
+ ldout(cct, 20) << ": completing safe context: " << ctx << dendl;
+ ctx->complete(r);
+ }
+
+ if (on_flush != nullptr) {
+ ldout(cct, 20) << ": completing flush context: " << on_flush << dendl;
+ on_flush->complete(r);
+ }
+}
+
+template <typename I>
+Context *Replay<I>::create_op_context_callback(uint64_t op_tid,
+ Context *on_ready,
+ Context *on_safe,
+ OpEvent **op_event) {
+ CephContext *cct = m_image_ctx.cct;
+ if (m_shut_down) {
+ ldout(cct, 5) << ": ignoring event after shut down" << dendl;
+ on_ready->complete(0);
+ m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN);
+ return nullptr;
+ }
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ if (m_op_events.count(op_tid) != 0) {
+ lderr(cct) << ": duplicate op tid detected: " << op_tid << dendl;
+
+ // on_ready is already async but on failure invoke on_safe async
+ // as well
+ on_ready->complete(0);
+ m_image_ctx.op_work_queue->queue(on_safe, -EINVAL);
+ return nullptr;
+ }
+
+ ++m_in_flight_op_events;
+ *op_event = &m_op_events[op_tid];
+ (*op_event)->on_start_safe = on_safe;
+
+ Context *on_op_complete = new C_OpOnComplete(this, op_tid);
+ (*op_event)->on_op_complete = on_op_complete;
+ return on_op_complete;
+}
+
+template <typename I>
+void Replay<I>::handle_op_complete(uint64_t op_tid, int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << ": op_tid=" << op_tid << ", "
+ << "r=" << r << dendl;
+
+ OpEvent op_event;
+ bool shutting_down = false;
+ {
+ std::lock_guard locker{m_lock};
+ auto op_it = m_op_events.find(op_tid);
+ ceph_assert(op_it != m_op_events.end());
+
+ op_event = std::move(op_it->second);
+ m_op_events.erase(op_it);
+
+ if (m_shut_down) {
+ ceph_assert(m_flush_ctx != nullptr);
+ shutting_down = true;
+ }
+ }
+
+ ceph_assert(op_event.on_start_ready == nullptr || (r < 0 && r != -ERESTART));
+ if (op_event.on_start_ready != nullptr) {
+ // blocking op event failed before it became ready
+ ceph_assert(op_event.on_finish_ready == nullptr &&
+ op_event.on_finish_safe == nullptr);
+
+ op_event.on_start_ready->complete(0);
+ } else {
+ // event kicked off by OpFinishEvent
+ ceph_assert((op_event.on_finish_ready != nullptr &&
+ op_event.on_finish_safe != nullptr) || shutting_down);
+ }
+
+ if (op_event.on_op_finish_event != nullptr) {
+ op_event.on_op_finish_event->complete(r);
+ }
+
+ if (op_event.on_finish_ready != nullptr) {
+ op_event.on_finish_ready->complete(0);
+ }
+
+ // filter out errors caused by replay of the same op
+ if (r < 0 && op_event.ignore_error_codes.count(r) != 0) {
+ r = 0;
+ }
+
+ op_event.on_start_safe->complete(r);
+ if (op_event.on_finish_safe != nullptr) {
+ op_event.on_finish_safe->complete(r);
+ }
+
+ // shut down request might have occurred while lock was
+ // dropped -- handle if pending
+ Context *on_flush = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_in_flight_op_events > 0);
+ --m_in_flight_op_events;
+ if (m_in_flight_op_events == 0 &&
+ (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) {
+ on_flush = m_flush_ctx;
+ }
+ }
+ if (on_flush != nullptr) {
+ m_image_ctx.op_work_queue->queue(on_flush, 0);
+ }
+}
+
+template <typename I>
+io::AioCompletion *
+Replay<I>::create_aio_modify_completion(Context *on_ready,
+ Context *on_safe,
+ io::aio_type_t aio_type,
+ bool *flush_required,
+ std::set<int> &&filters) {
+ std::lock_guard locker{m_lock};
+ CephContext *cct = m_image_ctx.cct;
+ ceph_assert(m_on_aio_ready == nullptr);
+
+ if (m_shut_down) {
+ ldout(cct, 5) << ": ignoring event after shut down" << dendl;
+ on_ready->complete(0);
+ m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN);
+ return nullptr;
+ }
+
+ ++m_in_flight_aio_modify;
+ m_aio_modify_unsafe_contexts.push_back(on_safe);
+
+ // FLUSH if we hit the low-water mark -- on_safe contexts are only
+ // completed by flushes so that we don't move the journal commit
+ // position until the events are safely on disk
+
+ *flush_required = (m_aio_modify_unsafe_contexts.size() ==
+ IN_FLIGHT_IO_LOW_WATER_MARK);
+ if (*flush_required) {
+ ldout(cct, 10) << ": hit AIO replay low-water mark: scheduling flush"
+ << dendl;
+ }
+
+ // READY for more events if:
+ // * not at high-water mark for IO
+ // * in-flight ops are at a consistent point (snap create has IO flushed,
+ // shrink has adjusted clip boundary, etc) -- should have already been
+ // flagged not-ready
+ if (m_in_flight_aio_modify == IN_FLIGHT_IO_HIGH_WATER_MARK) {
+ ldout(cct, 10) << ": hit AIO replay high-water mark: pausing replay"
+ << dendl;
+ ceph_assert(m_on_aio_ready == nullptr);
+ std::swap(m_on_aio_ready, on_ready);
+ }
+
+ // once the modification is ACKed by librbd, we can process the next
+ // event; once flushed, the completion of the next flush will fire the
+ // on_safe callback
+ auto aio_comp = io::AioCompletion::create_and_start<Context>(
+ new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters)),
+ util::get_image_ctx(&m_image_ctx), aio_type);
+ return aio_comp;
+}
+
+template <typename I>
+io::AioCompletion *Replay<I>::create_aio_flush_completion(Context *on_safe) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ CephContext *cct = m_image_ctx.cct;
+ if (m_shut_down) {
+ ldout(cct, 5) << ": ignoring event after shut down" << dendl;
+ if (on_safe != nullptr) {
+ m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN);
+ }
+ return nullptr;
+ }
+
+ ++m_in_flight_aio_flush;
+
+ // associate all prior write/discard ops to this flush request
+ auto aio_comp = io::AioCompletion::create_and_start<Context>(
+ new C_AioFlushComplete(this, on_safe,
+ std::move(m_aio_modify_unsafe_contexts)),
+ util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH);
+ m_aio_modify_unsafe_contexts.clear();
+ return aio_comp;
+}
+
+template <typename I>
+bool Replay<I>::clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp) {
+ CephContext *cct = m_image_ctx.cct;
+
+ m_image_ctx.image_lock.lock_shared();
+ size_t image_size = m_image_ctx.size;
+ m_image_ctx.image_lock.unlock_shared();
+
+ if (image_offset >= image_size) {
+ // rbd-mirror image sync might race an IO event w/ an associated resize
+ // between the point the peer is registered and the sync point is created,
+ // so no-op IO events beyond the current image extents; under normal
+ // conditions such an event wouldn't have been recorded in the journal
+ ldout(cct, 5) << ": no-op IO event beyond image size" << dendl;
+ aio_comp->get();
+ aio_comp->set_request_count(0);
+ aio_comp->put();
+ return true;
+ }
+
+ return false;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::Replay<librbd::ImageCtx>;
diff --git a/src/librbd/journal/Replay.h b/src/librbd/journal/Replay.h
new file mode 100644
index 000000000..038601833
--- /dev/null
+++ b/src/librbd/journal/Replay.h
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_REPLAY_H
+#define CEPH_LIBRBD_JOURNAL_REPLAY_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "common/ceph_mutex.h"
+#include "librbd/io/Types.h"
+#include "librbd/journal/Types.h"
+#include <boost/variant.hpp>
+#include <list>
+#include <unordered_set>
+#include <unordered_map>
+
+namespace librbd {
+
+class ImageCtx;
+namespace io { struct AioCompletion; }
+
+namespace journal {
+
+template <typename ImageCtxT = ImageCtx>
+class Replay {
+public:
+ static Replay *create(ImageCtxT &image_ctx) {
+ return new Replay(image_ctx);
+ }
+
+ Replay(ImageCtxT &image_ctx);
+ ~Replay();
+
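+ // Typical usage (sketch only; variable names are illustrative):
+ //   EventEntry event_entry;
+ //   if (replay->decode(&it, &event_entry) == 0) {
+ //     replay->process(event_entry, on_ready, on_safe);
+ //   }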
+ int decode(bufferlist::const_iterator *it, EventEntry *event_entry);
+ void process(const EventEntry &event_entry,
+ Context *on_ready, Context *on_safe);
+
+ void shut_down(bool cancel_ops, Context *on_finish);
+ void flush(Context *on_finish);
+
+ void replay_op_ready(uint64_t op_tid, Context *on_resume);
+
+private:
+ typedef std::unordered_set<int> ReturnValues;
+
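+ // Tracks replay state for a single op: lifecycle callbacks for the op's
+ // start/finish journal events plus error codes to filter during replay.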
+ struct OpEvent {
+ bool op_in_progress = false;
+ bool finish_on_ready = false;
+ Context *on_op_finish_event = nullptr;
+ Context *on_start_ready = nullptr;
+ Context *on_start_safe = nullptr;
+ Context *on_finish_ready = nullptr;
+ Context *on_finish_safe = nullptr;
+ Context *on_op_complete = nullptr;
+ ReturnValues op_finish_error_codes;
+ ReturnValues ignore_error_codes;
+ };
+
+ typedef std::list<uint64_t> OpTids;
+ typedef std::list<Context *> Contexts;
+ typedef std::unordered_set<Context *> ContextSet;
+ typedef std::unordered_map<uint64_t, OpEvent> OpEvents;
+
+ struct C_OpOnComplete : public Context {
+ Replay *replay;
+ uint64_t op_tid;
+ C_OpOnComplete(Replay *replay, uint64_t op_tid)
+ : replay(replay), op_tid(op_tid) {
+ }
+ void finish(int r) override {
+ replay->handle_op_complete(op_tid, r);
+ }
+ };
+
+ struct C_AioModifyComplete : public Context {
+ Replay *replay;
+ Context *on_ready;
+ Context *on_safe;
+ std::set<int> filters;
+ C_AioModifyComplete(Replay *replay, Context *on_ready,
+ Context *on_safe, std::set<int> &&filters)
+ : replay(replay), on_ready(on_ready), on_safe(on_safe),
+ filters(std::move(filters)) {
+ }
+ void finish(int r) override {
+ replay->handle_aio_modify_complete(on_ready, on_safe, r, filters);
+ }
+ };
+
+ struct C_AioFlushComplete : public Context {
+ Replay *replay;
+ Context *on_flush_safe;
+ Contexts on_safe_ctxs;
+ C_AioFlushComplete(Replay *replay, Context *on_flush_safe,
+ Contexts &&on_safe_ctxs)
+ : replay(replay), on_flush_safe(on_flush_safe),
+ on_safe_ctxs(std::move(on_safe_ctxs)) {
+ }
+ void finish(int r) override {
+ replay->handle_aio_flush_complete(on_flush_safe, on_safe_ctxs, r);
+ }
+ };
+
+ struct EventVisitor : public boost::static_visitor<void> {
+ Replay *replay;
+ Context *on_ready;
+ Context *on_safe;
+
+ EventVisitor(Replay *_replay, Context *_on_ready, Context *_on_safe)
+ : replay(_replay), on_ready(_on_ready), on_safe(_on_safe) {
+ }
+
+ template <typename Event>
+ inline void operator()(const Event &event) const {
+ replay->handle_event(event, on_ready, on_safe);
+ }
+ };
+
+ ImageCtxT &m_image_ctx;
+
+ ceph::mutex m_lock = ceph::make_mutex("Replay<I>::m_lock");
+
+ uint64_t m_in_flight_aio_flush = 0;
+ uint64_t m_in_flight_aio_modify = 0;
+ Contexts m_aio_modify_unsafe_contexts;
+ ContextSet m_aio_modify_safe_contexts;
+
+ OpEvents m_op_events;
+ uint64_t m_in_flight_op_events = 0;
+
+ bool m_shut_down = false;
+ Context *m_flush_ctx = nullptr;
+ Context *m_on_aio_ready = nullptr;
+
+ void handle_event(const AioDiscardEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const AioWriteEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const AioWriteSameEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const AioCompareAndWriteEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const AioFlushEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const OpFinishEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapCreateEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapRemoveEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapRenameEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapProtectEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapUnprotectEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapRollbackEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const RenameEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const ResizeEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const FlattenEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const DemotePromoteEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const SnapLimitEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const UpdateFeaturesEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const MetadataSetEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const MetadataRemoveEvent &event, Context *on_ready,
+ Context *on_safe);
+ void handle_event(const UnknownEvent &event, Context *on_ready,
+ Context *on_safe);
+
+ void handle_aio_modify_complete(Context *on_ready, Context *on_safe,
+ int r, std::set<int> &filters);
+ void handle_aio_flush_complete(Context *on_flush_safe, Contexts &on_safe_ctxs,
+ int r);
+
+ Context *create_op_context_callback(uint64_t op_tid, Context *on_ready,
+ Context *on_safe, OpEvent **op_event);
+ void handle_op_complete(uint64_t op_tid, int r);
+
+ io::AioCompletion *create_aio_modify_completion(Context *on_ready,
+ Context *on_safe,
+ io::aio_type_t aio_type,
+ bool *flush_required,
+ std::set<int> &&filters);
+ io::AioCompletion *create_aio_flush_completion(Context *on_safe);
+
+ bool clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::Replay<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_REPLAY_H
diff --git a/src/librbd/journal/ResetRequest.cc b/src/librbd/journal/ResetRequest.cc
new file mode 100644
index 000000000..895d0046e
--- /dev/null
+++ b/src/librbd/journal/ResetRequest.cc
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/ResetRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "include/ceph_assert.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::ResetRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace journal {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+template<typename I>
+void ResetRequest<I>::send() {
+ init_journaler();
+}
+
+template<typename I>
+void ResetRequest<I>::init_journaler() {
+ ldout(m_cct, 10) << dendl;
+
+ m_journaler = new Journaler(m_io_ctx, m_image_id, m_client_id, {}, nullptr);
+ Context *ctx = create_context_callback<
+ ResetRequest<I>, &ResetRequest<I>::handle_init_journaler>(this);
+ m_journaler->init(ctx);
+}
+
+template<typename I>
+void ResetRequest<I>::handle_init_journaler(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ ldout(m_cct, 5) << "journal does not exist" << dendl;
+ m_ret_val = r;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to init journaler: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ } else {
+ int64_t pool_id;
+ m_journaler->get_metadata(&m_order, &m_splay_width, &pool_id);
+
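+ // look up the original data pool so the recreated journal matches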
+ if (pool_id != -1) {
+ librados::Rados rados(m_io_ctx);
+ r = rados.pool_reverse_lookup(pool_id, &m_object_pool_name);
+ if (r < 0) {
+ lderr(m_cct) << "failed to lookup data pool: " << cpp_strerror(r)
+ << dendl;
+ m_ret_val = r;
+ }
+ }
+ }
+
+ shut_down_journaler();
+}
+
+template<typename I>
+void ResetRequest<I>::shut_down_journaler() {
+ ldout(m_cct, 10) << dendl;
+
+ Context *ctx = create_async_context_callback(
+ m_op_work_queue, create_context_callback<
+ ResetRequest<I>, &ResetRequest<I>::handle_journaler_shutdown>(this));
+ m_journaler->shut_down(ctx);
+}
+
+template<typename I>
+void ResetRequest<I>::handle_journaler_shutdown(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ delete m_journaler;
+ if (r < 0) {
+ lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(r)
+ << dendl;
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ }
+
+ if (m_ret_val < 0) {
+ finish(m_ret_val);
+ return;
+ }
+
+ remove_journal();
+}
+
+template<typename I>
+void ResetRequest<I>::remove_journal() {
+ ldout(m_cct, 10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ResetRequest<I>, &ResetRequest<I>::handle_remove_journal>(this);
+ auto req = RemoveRequest<I>::create(m_io_ctx, m_image_id, m_client_id,
+ m_op_work_queue, ctx);
+ req->send();
+}
+
+template<typename I>
+void ResetRequest<I>::handle_remove_journal(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove journal: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ create_journal();
+}
+
+template<typename I>
+void ResetRequest<I>::create_journal() {
+ ldout(m_cct, 10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ResetRequest<I>, &ResetRequest<I>::handle_create_journal>(this);
+ journal::TagData tag_data(m_mirror_uuid);
+ auto req = CreateRequest<I>::create(m_io_ctx, m_image_id, m_order,
+ m_splay_width, m_object_pool_name,
+ cls::journal::Tag::TAG_CLASS_NEW,
+ tag_data, m_client_id, m_op_work_queue,
+ ctx);
+ req->send();
+}
+
+template<typename I>
+void ResetRequest<I>::handle_create_journal(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to create journal: " << cpp_strerror(r) << dendl;
+ }
+ finish(r);
+}
+
+template<typename I>
+void ResetRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::ResetRequest<librbd::ImageCtx>;
diff --git a/src/librbd/journal/ResetRequest.h b/src/librbd/journal/ResetRequest.h
new file mode 100644
index 000000000..f9331f644
--- /dev/null
+++ b/src/librbd/journal/ResetRequest.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H
+#define CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "librbd/journal/TypeTraits.h"
+#include "common/Timer.h"
+#include <string>
+
+class Context;
+class ContextWQ;
+
+namespace journal { class Journaler; }
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace journal {
+
+template<typename ImageCtxT = ImageCtx>
+class ResetRequest {
+public:
+ static ResetRequest *create(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ const std::string &client_id,
+ const std::string &mirror_uuid,
+ ContextWQ *op_work_queue, Context *on_finish) {
+ return new ResetRequest(io_ctx, image_id, client_id, mirror_uuid,
+ op_work_queue, on_finish);
+ }
+
+ ResetRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+ const std::string &client_id, const std::string &mirror_uuid,
+ ContextWQ *op_work_queue, Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id), m_client_id(client_id),
+ m_mirror_uuid(mirror_uuid), m_op_work_queue(op_work_queue),
+ m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(m_io_ctx.cct())) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * INIT_JOURNALER
+ * |
+ * v
+ * SHUT_DOWN_JOURNALER
+ * |
+ * v
+ * REMOVE_JOURNAL
+ * |
+ * v
+ * CREATE_JOURNAL
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_image_id;
+ std::string m_client_id;
+ std::string m_mirror_uuid;
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+ Journaler *m_journaler = nullptr;
+ int m_ret_val = 0;
+
+ uint8_t m_order = 0;
+ uint8_t m_splay_width = 0;
+ std::string m_object_pool_name;
+
+ void init_journaler();
+ void handle_init_journaler(int r);
+
+ void shut_down_journaler();
+ void handle_journaler_shutdown(int r);
+
+ void remove_journal();
+ void handle_remove_journal(int r);
+
+ void create_journal();
+ void handle_create_journal(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::ResetRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H
diff --git a/src/librbd/journal/StandardPolicy.cc b/src/librbd/journal/StandardPolicy.cc
new file mode 100644
index 000000000..7f124aeef
--- /dev/null
+++ b/src/librbd/journal/StandardPolicy.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/StandardPolicy.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/asio/ContextWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::StandardPolicy: "
+
+namespace librbd {
+namespace journal {
+
+template<typename I>
+void StandardPolicy<I>::allocate_tag_on_lock(Context *on_finish) {
+ ceph_assert(m_image_ctx->journal != nullptr);
+
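+ // only the tag owner (i.e. the promoted/primary image) may allocate
+ // a new tag when acquiring the lock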
+ if (!m_image_ctx->journal->is_tag_owner()) {
+ lderr(m_image_ctx->cct) << "local image not promoted" << dendl;
+ m_image_ctx->op_work_queue->queue(on_finish, -EPERM);
+ return;
+ }
+
+ m_image_ctx->journal->allocate_local_tag(on_finish);
+}
+
+} // namespace journal
+} // namespace librbd
+
+template class librbd::journal::StandardPolicy<librbd::ImageCtx>;
diff --git a/src/librbd/journal/StandardPolicy.h b/src/librbd/journal/StandardPolicy.h
new file mode 100644
index 000000000..ec8d0148f
--- /dev/null
+++ b/src/librbd/journal/StandardPolicy.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
+#define CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
+
+#include "librbd/journal/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+template<typename ImageCtxT = ImageCtx>
+class StandardPolicy : public Policy {
+public:
+ StandardPolicy(ImageCtxT *image_ctx) : m_image_ctx(image_ctx) {
+ }
+
+ bool append_disabled() const override {
+ return false;
+ }
+ bool journal_disabled() const override {
+ return false;
+ }
+ void allocate_tag_on_lock(Context *on_finish) override;
+
+private:
+ ImageCtxT *m_image_ctx;
+};
+
+} // namespace journal
+} // namespace librbd
+
+extern template class librbd::journal::StandardPolicy<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
diff --git a/src/librbd/journal/TypeTraits.h b/src/librbd/journal/TypeTraits.h
new file mode 100644
index 000000000..51b025f6d
--- /dev/null
+++ b/src/librbd/journal/TypeTraits.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
+#define CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
+
+struct ContextWQ;
+
+namespace journal {
+class Future;
+class Journaler;
+class ReplayEntry;
+}
+
+namespace librbd {
+namespace journal {
+
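+// Maps an image context type onto the concrete journal types; typically
+// specialized by unit tests to substitute mock implementations.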
+template <typename ImageCtxT>
+struct TypeTraits {
+ typedef ::journal::Journaler Journaler;
+ typedef ::journal::Future Future;
+ typedef ::journal::ReplayEntry ReplayEntry;
+ typedef ::ContextWQ ContextWQ;
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
diff --git a/src/librbd/journal/Types.cc b/src/librbd/journal/Types.cc
new file mode 100644
index 000000000..d76a15e55
--- /dev/null
+++ b/src/librbd/journal/Types.cc
@@ -0,0 +1,956 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/Types.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "include/types.h"
+#include "common/Formatter.h"
+
+namespace librbd {
+namespace journal {
+
+using ceph::encode;
+using ceph::decode;
+
+namespace {
+
+template <typename E>
+class GetTypeVisitor : public boost::static_visitor<E> {
+public:
+ template <typename T>
+ inline E operator()(const T&) const {
+ return T::TYPE;
+ }
+};
+
+class EncodeVisitor : public boost::static_visitor<void> {
+public:
+ explicit EncodeVisitor(bufferlist &bl) : m_bl(bl) {
+ }
+
+ template <typename T>
+ inline void operator()(const T& t) const {
+ encode(static_cast<uint32_t>(T::TYPE), m_bl);
+ t.encode(m_bl);
+ }
+private:
+ bufferlist &m_bl;
+};
+
+class DecodeVisitor : public boost::static_visitor<void> {
+public:
+ DecodeVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {
+ }
+
+ template <typename T>
+ inline void operator()(T& t) const {
+ t.decode(m_version, m_iter);
+ }
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+class DumpVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpVisitor(Formatter *formatter, const std::string &key)
+ : m_formatter(formatter), m_key(key) {}
+
+ template <typename T>
+ inline void operator()(const T& t) const {
+ auto type = T::TYPE;
+ m_formatter->dump_string(m_key.c_str(), stringify(type));
+ t.dump(m_formatter);
+ }
+private:
+ ceph::Formatter *m_formatter;
+ std::string m_key;
+};
+
+} // anonymous namespace
+
+void AioDiscardEvent::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(offset, bl);
+ encode(length, bl);
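+ // retain the legacy skip_partial_discard flag so that decoders of
+ // pre-version-5 event entries interpret this event correctly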
+ bool skip_partial_discard = (discard_granularity_bytes > 0);
+ encode(skip_partial_discard, bl);
+ encode(discard_granularity_bytes, bl);
+}
+
+void AioDiscardEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(offset, it);
+ decode(length, it);
+
+ bool skip_partial_discard = false;
+ if (version >= 4) {
+ decode(skip_partial_discard, it);
+ }
+
+ if (version >= 5) {
+ decode(discard_granularity_bytes, it);
+ } else {
+ if (skip_partial_discard) {
+ // use a size larger than the maximum object size, which will be
+ // truncated down to the object size during IO processing
+ discard_granularity_bytes = std::numeric_limits<uint32_t>::max();
+ } else {
+ discard_granularity_bytes = 0;
+ }
+ }
+}
+
+void AioDiscardEvent::dump(Formatter *f) const {
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+ f->dump_unsigned("discard_granularity_bytes", discard_granularity_bytes);
+}
+
+uint32_t AioWriteEvent::get_fixed_size() {
+ return EventEntry::get_fixed_size() + 16 /* offset, length */;
+}
+
+void AioWriteEvent::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(offset, bl);
+ encode(length, bl);
+ encode(data, bl);
+}
+
+void AioWriteEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(offset, it);
+ decode(length, it);
+ decode(data, it);
+}
+
+void AioWriteEvent::dump(Formatter *f) const {
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
+void AioWriteSameEvent::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(offset, bl);
+ encode(length, bl);
+ encode(data, bl);
+}
+
+void AioWriteSameEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(offset, it);
+ decode(length, it);
+ decode(data, it);
+}
+
+void AioWriteSameEvent::dump(Formatter *f) const {
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
+uint32_t AioCompareAndWriteEvent::get_fixed_size() {
+ return EventEntry::get_fixed_size() + 32 /* offset, length */;
+}
+
+void AioCompareAndWriteEvent::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(offset, bl);
+ encode(length, bl);
+ encode(cmp_data, bl);
+ encode(write_data, bl);
+}
+
+void AioCompareAndWriteEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(offset, it);
+ decode(length, it);
+ decode(cmp_data, it);
+ decode(write_data, it);
+}
+
+void AioCompareAndWriteEvent::dump(Formatter *f) const {
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
+void AioFlushEvent::encode(bufferlist& bl) const {
+}
+
+void AioFlushEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+}
+
+void AioFlushEvent::dump(Formatter *f) const {
+}
+
+void OpEventBase::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(op_tid, bl);
+}
+
+void OpEventBase::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(op_tid, it);
+}
+
+void OpEventBase::dump(Formatter *f) const {
+ f->dump_unsigned("op_tid", op_tid);
+}
+
+void OpFinishEvent::encode(bufferlist& bl) const {
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(op_tid, bl);
+ encode(r, bl);
+}
+
+void OpFinishEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(op_tid, it);
+ decode(r, it);
+}
+
+void OpFinishEvent::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_unsigned("op_tid", op_tid);
+ f->dump_int("result", r);
+}
+
+void SnapEventBase::encode(bufferlist& bl) const {
+ using ceph::encode;
+ OpEventBase::encode(bl);
+ encode(snap_name, bl);
+ encode(snap_namespace, bl);
+}
+
+void SnapEventBase::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ OpEventBase::decode(version, it);
+ decode(snap_name, it);
+ if (version >= 4) {
+ decode(snap_namespace, it);
+ }
+}
+
+void SnapEventBase::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_string("snap_name", snap_name);
+ snap_namespace.dump(f);
+}
+
+void SnapCreateEvent::encode(bufferlist &bl) const {
+ SnapEventBase::encode(bl);
+}
+
+void SnapCreateEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ SnapEventBase::decode(version, it);
+ if (version == 3) {
+ decode(snap_namespace, it);
+ }
+}
+
+void SnapCreateEvent::dump(Formatter *f) const {
+ SnapEventBase::dump(f);
+}
+
+void SnapLimitEvent::encode(bufferlist &bl) const {
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(limit, bl);
+}
+
+void SnapLimitEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(limit, it);
+}
+
+void SnapLimitEvent::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_unsigned("limit", limit);
+}
+
+void SnapRenameEvent::encode(bufferlist& bl) const {
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(dst_snap_name, bl);
+ encode(snap_id, bl);
+ encode(src_snap_name, bl);
+}
+
+void SnapRenameEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ OpEventBase::decode(version, it);
+ decode(dst_snap_name, it);
+ decode(snap_id, it);
+ if (version >= 2) {
+ decode(src_snap_name, it);
+ }
+}
+
+void SnapRenameEvent::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_unsigned("src_snap_id", snap_id);
+ f->dump_string("src_snap_name", src_snap_name);
+ f->dump_string("dest_snap_name", dst_snap_name);
+}
+
+void RenameEvent::encode(bufferlist& bl) const {
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(image_name, bl);
+}
+
+void RenameEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(image_name, it);
+}
+
+void RenameEvent::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_string("image_name", image_name);
+}
+
+void ResizeEvent::encode(bufferlist& bl) const {
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(size, bl);
+}
+
+void ResizeEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(size, it);
+}
+
+void ResizeEvent::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_unsigned("size", size);
+}
+
+void DemotePromoteEvent::encode(bufferlist& bl) const {
+}
+
+void DemotePromoteEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+}
+
+void DemotePromoteEvent::dump(Formatter *f) const {
+}
+
+void UpdateFeaturesEvent::encode(bufferlist& bl) const {
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(features, bl);
+ encode(enabled, bl);
+}
+
+void UpdateFeaturesEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(features, it);
+ decode(enabled, it);
+}
+
+void UpdateFeaturesEvent::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_unsigned("features", features);
+ f->dump_bool("enabled", enabled);
+}
+
+void MetadataSetEvent::encode(bufferlist& bl) const {
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(key, bl);
+ encode(value, bl);
+}
+
+void MetadataSetEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(key, it);
+ decode(value, it);
+}
+
+void MetadataSetEvent::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_string("key", key);
+ f->dump_string("value", value);
+}
+
+void MetadataRemoveEvent::encode(bufferlist& bl) const {
+ OpEventBase::encode(bl);
+ using ceph::encode;
+ encode(key, bl);
+}
+
+void MetadataRemoveEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+ OpEventBase::decode(version, it);
+ using ceph::decode;
+ decode(key, it);
+}
+
+void MetadataRemoveEvent::dump(Formatter *f) const {
+ OpEventBase::dump(f);
+ f->dump_string("key", key);
+}
+
+void UnknownEvent::encode(bufferlist& bl) const {
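+ // unknown events are decode-only placeholders; re-encoding one indicates a
+ // logic error, so fail loudly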
+ ceph_abort();
+}
+
+void UnknownEvent::decode(__u8 version, bufferlist::const_iterator& it) {
+}
+
+void UnknownEvent::dump(Formatter *f) const {
+}
+
+EventType EventEntry::get_event_type() const {
+ return boost::apply_visitor(GetTypeVisitor<EventType>(), event);
+}
+
+void EventEntry::encode(bufferlist& bl) const {
+ ENCODE_START(5, 1, bl);
+ boost::apply_visitor(EncodeVisitor(bl), event);
+ ENCODE_FINISH(bl);
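+ // the timestamp metadata is appended outside the versioned envelope;
+ // decode() below only consumes it when struct_v >= 4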
+ encode_metadata(bl);
+}
+
+void EventEntry::decode(bufferlist::const_iterator& it) {
+ DECODE_START(1, it);
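+ // DECODE_START captures the encoder's version in struct_v; it is passed to
+ // the payload visitor below so version-dependent fields can be gated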
+
+ uint32_t event_type;
+ decode(event_type, it);
+
+ // select the correct payload variant based upon the encoded op
+ switch (event_type) {
+ case EVENT_TYPE_AIO_DISCARD:
+ event = AioDiscardEvent();
+ break;
+ case EVENT_TYPE_AIO_WRITE:
+ event = AioWriteEvent();
+ break;
+ case EVENT_TYPE_AIO_FLUSH:
+ event = AioFlushEvent();
+ break;
+ case EVENT_TYPE_OP_FINISH:
+ event = OpFinishEvent();
+ break;
+ case EVENT_TYPE_SNAP_CREATE:
+ event = SnapCreateEvent();
+ break;
+ case EVENT_TYPE_SNAP_REMOVE:
+ event = SnapRemoveEvent();
+ break;
+ case EVENT_TYPE_SNAP_RENAME:
+ event = SnapRenameEvent();
+ break;
+ case EVENT_TYPE_SNAP_PROTECT:
+ event = SnapProtectEvent();
+ break;
+ case EVENT_TYPE_SNAP_UNPROTECT:
+ event = SnapUnprotectEvent();
+ break;
+ case EVENT_TYPE_SNAP_ROLLBACK:
+ event = SnapRollbackEvent();
+ break;
+ case EVENT_TYPE_RENAME:
+ event = RenameEvent();
+ break;
+ case EVENT_TYPE_RESIZE:
+ event = ResizeEvent();
+ break;
+ case EVENT_TYPE_FLATTEN:
+ event = FlattenEvent();
+ break;
+ case EVENT_TYPE_DEMOTE_PROMOTE:
+ event = DemotePromoteEvent();
+ break;
+ case EVENT_TYPE_SNAP_LIMIT:
+ event = SnapLimitEvent();
+ break;
+ case EVENT_TYPE_UPDATE_FEATURES:
+ event = UpdateFeaturesEvent();
+ break;
+ case EVENT_TYPE_METADATA_SET:
+ event = MetadataSetEvent();
+ break;
+ case EVENT_TYPE_METADATA_REMOVE:
+ event = MetadataRemoveEvent();
+ break;
+ case EVENT_TYPE_AIO_WRITESAME:
+ event = AioWriteSameEvent();
+ break;
+ case EVENT_TYPE_AIO_COMPARE_AND_WRITE:
+ event = AioCompareAndWriteEvent();
+ break;
+ default:
+ event = UnknownEvent();
+ break;
+ }
+
+ boost::apply_visitor(DecodeVisitor(struct_v, it), event);
+ DECODE_FINISH(it);
+ if (struct_v >= 4) {
+ decode_metadata(it);
+ }
+}
+
+void EventEntry::dump(Formatter *f) const {
+ boost::apply_visitor(DumpVisitor(f, "event_type"), event);
+ f->dump_stream("timestamp") << timestamp;
+}
+
+void EventEntry::encode_metadata(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(timestamp, bl);
+ ENCODE_FINISH(bl);
+}
+
+void EventEntry::decode_metadata(bufferlist::const_iterator& it) {
+ DECODE_START(1, it);
+ decode(timestamp, it);
+ DECODE_FINISH(it);
+}
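+
+// A round-trip sketch (illustrative only, not part of the build): EventEntry
+// gains free encode()/decode() helpers through WRITE_CLASS_ENCODER in
+// Types.h, and the decoded variant payload is selected by the leading
+// event-type code.  "entry", "decoded", "bl" and "it" are hypothetical names.
+//
+//   librbd::journal::EventEntry entry(ResizeEvent(1, 4096), utime_t(1, 1));
+//   bufferlist bl;
+//   encode(entry, bl);
+//
+//   librbd::journal::EventEntry decoded;
+//   auto it = bl.cbegin();
+//   decode(decoded, it);
+//   ceph_assert(decoded.get_event_type() == EVENT_TYPE_RESIZE);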
+
+void EventEntry::generate_test_instances(std::list<EventEntry *> &o) {
+ o.push_back(new EventEntry(AioDiscardEvent()));
+ o.push_back(new EventEntry(AioDiscardEvent(123, 345, 4096), utime_t(1, 1)));
+
+ bufferlist bl;
+ bl.append(std::string(32, '1'));
+ o.push_back(new EventEntry(AioWriteEvent()));
+ o.push_back(new EventEntry(AioWriteEvent(123, 456, bl), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(AioFlushEvent()));
+
+ o.push_back(new EventEntry(OpFinishEvent(123, -1), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapCreateEvent(), utime_t(1, 1)));
+ o.push_back(new EventEntry(SnapCreateEvent(234, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapRemoveEvent()));
+ o.push_back(new EventEntry(SnapRemoveEvent(345, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapRenameEvent()));
+ o.push_back(new EventEntry(SnapRenameEvent(456, 1, "src snap", "dest snap"),
+ utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapProtectEvent()));
+ o.push_back(new EventEntry(SnapProtectEvent(567, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapUnprotectEvent()));
+ o.push_back(new EventEntry(SnapUnprotectEvent(678, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(SnapRollbackEvent()));
+ o.push_back(new EventEntry(SnapRollbackEvent(789, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(RenameEvent()));
+ o.push_back(new EventEntry(RenameEvent(890, "image name"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(ResizeEvent()));
+ o.push_back(new EventEntry(ResizeEvent(901, 1234), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(FlattenEvent(123), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(DemotePromoteEvent()));
+
+ o.push_back(new EventEntry(UpdateFeaturesEvent()));
+ o.push_back(new EventEntry(UpdateFeaturesEvent(123, 127, true), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(MetadataSetEvent()));
+ o.push_back(new EventEntry(MetadataSetEvent(123, "key", "value"), utime_t(1, 1)));
+
+ o.push_back(new EventEntry(MetadataRemoveEvent()));
+ o.push_back(new EventEntry(MetadataRemoveEvent(123, "key"), utime_t(1, 1)));
+}
+
+// Journal Client
+
+void ImageClientMeta::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(tag_class, bl);
+ encode(resync_requested, bl);
+}
+
+void ImageClientMeta::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(tag_class, it);
+ decode(resync_requested, it);
+}
+
+void ImageClientMeta::dump(Formatter *f) const {
+ f->dump_unsigned("tag_class", tag_class);
+ f->dump_bool("resync_requested", resync_requested);
+}
+
+void MirrorPeerSyncPoint::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(snap_name, bl);
+ encode(from_snap_name, bl);
+ encode(object_number, bl);
+ encode(snap_namespace, bl);
+}
+
+void MirrorPeerSyncPoint::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(snap_name, it);
+ decode(from_snap_name, it);
+ decode(object_number, it);
+ if (version >= 2) {
+ decode(snap_namespace, it);
+ }
+}
+
+void MirrorPeerSyncPoint::dump(Formatter *f) const {
+ f->dump_string("snap_name", snap_name);
+ f->dump_string("from_snap_name", from_snap_name);
+ if (object_number) {
+ f->dump_unsigned("object_number", *object_number);
+ }
+ snap_namespace.dump(f);
+}
+
+void MirrorPeerClientMeta::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(image_id, bl);
+ encode(static_cast<uint32_t>(state), bl);
+ encode(sync_object_count, bl);
+ encode(static_cast<uint32_t>(sync_points.size()), bl);
+ for (auto &sync_point : sync_points) {
+ sync_point.encode(bl);
+ }
+ encode(snap_seqs, bl);
+}
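+
+// sync_points is serialized as an explicit element count followed by each
+// element, so decode() below can size the list up front before delegating to
+// MirrorPeerSyncPoint::decode for every entry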
+
+void MirrorPeerClientMeta::decode(__u8 version, bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(image_id, it);
+
+ uint32_t decode_state;
+ decode(decode_state, it);
+ state = static_cast<MirrorPeerState>(decode_state);
+
+ decode(sync_object_count, it);
+
+ uint32_t sync_point_count;
+ decode(sync_point_count, it);
+ sync_points.resize(sync_point_count);
+ for (auto &sync_point : sync_points) {
+ sync_point.decode(version, it);
+ }
+
+ decode(snap_seqs, it);
+}
+
+void MirrorPeerClientMeta::dump(Formatter *f) const {
+ f->dump_string("image_id", image_id);
+ f->dump_stream("state") << state;
+ f->dump_unsigned("sync_object_count", sync_object_count);
+ f->open_array_section("sync_points");
+ for (auto &sync_point : sync_points) {
+ f->open_object_section("sync_point");
+ sync_point.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("snap_seqs");
+ for (auto &pair : snap_seqs) {
+ f->open_object_section("snap_seq");
+ f->dump_unsigned("local_snap_seq", pair.first);
+ f->dump_unsigned("peer_snap_seq", pair.second);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void CliClientMeta::encode(bufferlist& bl) const {
+}
+
+void CliClientMeta::decode(__u8 version, bufferlist::const_iterator& it) {
+}
+
+void CliClientMeta::dump(Formatter *f) const {
+}
+
+void UnknownClientMeta::encode(bufferlist& bl) const {
+ ceph_abort();
+}
+
+void UnknownClientMeta::decode(__u8 version, bufferlist::const_iterator& it) {
+}
+
+void UnknownClientMeta::dump(Formatter *f) const {
+}
+
+ClientMetaType ClientData::get_client_meta_type() const {
+ return boost::apply_visitor(GetTypeVisitor<ClientMetaType>(), client_meta);
+}
+
+void ClientData::encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ boost::apply_visitor(EncodeVisitor(bl), client_meta);
+ ENCODE_FINISH(bl);
+}
+
+void ClientData::decode(bufferlist::const_iterator& it) {
+ DECODE_START(1, it);
+
+ uint32_t client_meta_type;
+ decode(client_meta_type, it);
+
+ // select the correct payload variant based upon the encoded op
+ switch (client_meta_type) {
+ case IMAGE_CLIENT_META_TYPE:
+ client_meta = ImageClientMeta();
+ break;
+ case MIRROR_PEER_CLIENT_META_TYPE:
+ client_meta = MirrorPeerClientMeta();
+ break;
+ case CLI_CLIENT_META_TYPE:
+ client_meta = CliClientMeta();
+ break;
+ default:
+ client_meta = UnknownClientMeta();
+ break;
+ }
+
+ boost::apply_visitor(DecodeVisitor(struct_v, it), client_meta);
+ DECODE_FINISH(it);
+}
+
+void ClientData::dump(Formatter *f) const {
+ boost::apply_visitor(DumpVisitor(f, "client_meta_type"), client_meta);
+}
+
+void ClientData::generate_test_instances(std::list<ClientData *> &o) {
+ o.push_back(new ClientData(ImageClientMeta()));
+ o.push_back(new ClientData(ImageClientMeta(123)));
+ o.push_back(new ClientData(MirrorPeerClientMeta()));
+ o.push_back(new ClientData(MirrorPeerClientMeta("image_id",
+ {{{}, "snap 2", "snap 1", 123}},
+ {{1, 2}, {3, 4}})));
+ o.push_back(new ClientData(CliClientMeta()));
+}
+
+// Journal Tag
+
+void TagPredecessor::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(mirror_uuid, bl);
+ encode(commit_valid, bl);
+ encode(tag_tid, bl);
+ encode(entry_tid, bl);
+}
+
+void TagPredecessor::decode(bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(mirror_uuid, it);
+ decode(commit_valid, it);
+ decode(tag_tid, it);
+ decode(entry_tid, it);
+}
+
+void TagPredecessor::dump(Formatter *f) const {
+ f->dump_string("mirror_uuid", mirror_uuid);
+ f->dump_string("commit_valid", commit_valid ? "true" : "false");
+ f->dump_unsigned("tag_tid", tag_tid);
+ f->dump_unsigned("entry_tid", entry_tid);
+}
+
+void TagData::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(mirror_uuid, bl);
+ predecessor.encode(bl);
+}
+
+void TagData::decode(bufferlist::const_iterator& it) {
+ using ceph::decode;
+ decode(mirror_uuid, it);
+ predecessor.decode(it);
+}
+
+void TagData::dump(Formatter *f) const {
+ f->dump_string("mirror_uuid", mirror_uuid);
+ f->open_object_section("predecessor");
+ predecessor.dump(f);
+ f->close_section();
+}
+
+void TagData::generate_test_instances(std::list<TagData *> &o) {
+ o.push_back(new TagData());
+ o.push_back(new TagData("mirror-uuid"));
+ o.push_back(new TagData("mirror-uuid", "remote-mirror-uuid", true, 123, 234));
+}
+
+std::ostream &operator<<(std::ostream &out, const EventType &type) {
+ using namespace librbd::journal;
+
+ switch (type) {
+ case EVENT_TYPE_AIO_DISCARD:
+ out << "AioDiscard";
+ break;
+ case EVENT_TYPE_AIO_WRITE:
+ out << "AioWrite";
+ break;
+ case EVENT_TYPE_AIO_FLUSH:
+ out << "AioFlush";
+ break;
+ case EVENT_TYPE_OP_FINISH:
+ out << "OpFinish";
+ break;
+ case EVENT_TYPE_SNAP_CREATE:
+ out << "SnapCreate";
+ break;
+ case EVENT_TYPE_SNAP_REMOVE:
+ out << "SnapRemove";
+ break;
+ case EVENT_TYPE_SNAP_RENAME:
+ out << "SnapRename";
+ break;
+ case EVENT_TYPE_SNAP_PROTECT:
+ out << "SnapProtect";
+ break;
+ case EVENT_TYPE_SNAP_UNPROTECT:
+ out << "SnapUnprotect";
+ break;
+ case EVENT_TYPE_SNAP_ROLLBACK:
+ out << "SnapRollback";
+ break;
+ case EVENT_TYPE_RENAME:
+ out << "Rename";
+ break;
+ case EVENT_TYPE_RESIZE:
+ out << "Resize";
+ break;
+ case EVENT_TYPE_FLATTEN:
+ out << "Flatten";
+ break;
+ case EVENT_TYPE_DEMOTE_PROMOTE:
+ out << "Demote/Promote";
+ break;
+ case EVENT_TYPE_SNAP_LIMIT:
+ out << "SnapLimit";
+ break;
+ case EVENT_TYPE_UPDATE_FEATURES:
+ out << "UpdateFeatures";
+ break;
+ case EVENT_TYPE_METADATA_SET:
+ out << "MetadataSet";
+ break;
+ case EVENT_TYPE_METADATA_REMOVE:
+ out << "MetadataRemove";
+ break;
+ case EVENT_TYPE_AIO_WRITESAME:
+ out << "AioWriteSame";
+ break;
+ case EVENT_TYPE_AIO_COMPARE_AND_WRITE:
+ out << "AioCompareAndWrite";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(type) << ")";
+ break;
+ }
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const ClientMetaType &type) {
+ using namespace librbd::journal;
+
+ switch (type) {
+ case IMAGE_CLIENT_META_TYPE:
+ out << "Master Image";
+ break;
+ case MIRROR_PEER_CLIENT_META_TYPE:
+ out << "Mirror Peer";
+ break;
+ case CLI_CLIENT_META_TYPE:
+ out << "CLI Tool";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(type) << ")";
+ break;
+ }
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta) {
+ out << "[tag_class=" << meta.tag_class << "]";
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync) {
+ out << "[snap_name=" << sync.snap_name << ", "
+ << "from_snap_name=" << sync.from_snap_name;
+ if (sync.object_number) {
+ out << ", " << *sync.object_number;
+ }
+ out << "]";
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const MirrorPeerState &state) {
+ switch (state) {
+ case MIRROR_PEER_STATE_SYNCING:
+ out << "Syncing";
+ break;
+ case MIRROR_PEER_STATE_REPLAYING:
+ out << "Replaying";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(state) << ")";
+ break;
+ }
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta) {
+ out << "[image_id=" << meta.image_id << ", "
+ << "state=" << meta.state << ", "
+ << "sync_object_count=" << meta.sync_object_count << ", "
+ << "sync_points=[";
+ std::string delimiter;
+ for (auto &sync_point : meta.sync_points) {
+ out << delimiter << "[" << sync_point << "]";
+ delimiter = ", ";
+ }
+ out << "], snap_seqs=[";
+ delimiter = "";
+ for (auto &pair : meta.snap_seqs) {
+ out << delimiter << "["
+ << "local_snap_seq=" << pair.first << ", "
+ << "peer_snap_seq" << pair.second << "]";
+ delimiter = ", ";
+ }
+ out << "]";
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor) {
+ out << "["
+ << "mirror_uuid=" << predecessor.mirror_uuid;
+ if (predecessor.commit_valid) {
+ out << ", "
+ << "tag_tid=" << predecessor.tag_tid << ", "
+ << "entry_tid=" << predecessor.entry_tid;
+ }
+ out << "]";
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const TagData &tag_data) {
+ out << "["
+ << "mirror_uuid=" << tag_data.mirror_uuid << ", "
+ << "predecessor=" << tag_data.predecessor
+ << "]";
+ return out;
+}
+
+} // namespace journal
+} // namespace librbd
+
diff --git a/src/librbd/journal/Types.h b/src/librbd/journal/Types.h
new file mode 100644
index 000000000..ae5681ade
--- /dev/null
+++ b/src/librbd/journal/Types.h
@@ -0,0 +1,685 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_TYPES_H
+#define CEPH_LIBRBD_JOURNAL_TYPES_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/types.h"
+#include "include/utime.h"
+#include "librbd/Types.h"
+#include <iosfwd>
+#include <list>
+#include <boost/none.hpp>
+#include <boost/optional.hpp>
+#include <boost/variant.hpp>
+#include <boost/mpl/vector.hpp>
+
+namespace ceph {
+class Formatter;
+}
+
+namespace librbd {
+namespace journal {
+
+enum EventType {
+ EVENT_TYPE_AIO_DISCARD = 0,
+ EVENT_TYPE_AIO_WRITE = 1,
+ EVENT_TYPE_AIO_FLUSH = 2,
+ EVENT_TYPE_OP_FINISH = 3,
+ EVENT_TYPE_SNAP_CREATE = 4,
+ EVENT_TYPE_SNAP_REMOVE = 5,
+ EVENT_TYPE_SNAP_RENAME = 6,
+ EVENT_TYPE_SNAP_PROTECT = 7,
+ EVENT_TYPE_SNAP_UNPROTECT = 8,
+ EVENT_TYPE_SNAP_ROLLBACK = 9,
+ EVENT_TYPE_RENAME = 10,
+ EVENT_TYPE_RESIZE = 11,
+ EVENT_TYPE_FLATTEN = 12,
+ EVENT_TYPE_DEMOTE_PROMOTE = 13,
+ EVENT_TYPE_SNAP_LIMIT = 14,
+ EVENT_TYPE_UPDATE_FEATURES = 15,
+ EVENT_TYPE_METADATA_SET = 16,
+ EVENT_TYPE_METADATA_REMOVE = 17,
+ EVENT_TYPE_AIO_WRITESAME = 18,
+ EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19,
+};
+
+struct AioDiscardEvent {
+ static const EventType TYPE = EVENT_TYPE_AIO_DISCARD;
+
+ uint64_t offset = 0;
+ uint64_t length = 0;
+ uint32_t discard_granularity_bytes = 0;
+
+ AioDiscardEvent() {
+ }
+ AioDiscardEvent(uint64_t _offset, uint64_t _length,
+ uint32_t discard_granularity_bytes)
+ : offset(_offset), length(_length),
+ discard_granularity_bytes(discard_granularity_bytes) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct AioWriteEvent {
+ static const EventType TYPE = EVENT_TYPE_AIO_WRITE;
+
+ uint64_t offset;
+ uint64_t length;
+ bufferlist data;
+
+ static uint32_t get_fixed_size();
+
+ AioWriteEvent() : offset(0), length(0) {
+ }
+ AioWriteEvent(uint64_t _offset, uint64_t _length, const bufferlist &_data)
+ : offset(_offset), length(_length), data(_data) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct AioWriteSameEvent {
+ static const EventType TYPE = EVENT_TYPE_AIO_WRITESAME;
+
+ uint64_t offset;
+ uint64_t length;
+ bufferlist data;
+
+ AioWriteSameEvent() : offset(0), length(0) {
+ }
+ AioWriteSameEvent(uint64_t _offset, uint64_t _length,
+ const bufferlist &_data)
+ : offset(_offset), length(_length), data(_data) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct AioCompareAndWriteEvent {
+ static const EventType TYPE = EVENT_TYPE_AIO_COMPARE_AND_WRITE;
+
+ uint64_t offset;
+ uint64_t length;
+ bufferlist cmp_data;
+ bufferlist write_data;
+
+ static uint32_t get_fixed_size();
+
+ AioCompareAndWriteEvent() : offset(0), length(0) {
+ }
+ AioCompareAndWriteEvent(uint64_t _offset, uint64_t _length,
+ const bufferlist &_cmp_data, const bufferlist &_write_data)
+ : offset(_offset), length(_length), cmp_data(_cmp_data), write_data(_write_data) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct AioFlushEvent {
+ static const EventType TYPE = EVENT_TYPE_AIO_FLUSH;
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct OpEventBase {
+ uint64_t op_tid;
+
+protected:
+ OpEventBase() : op_tid(0) {
+ }
+ OpEventBase(uint64_t op_tid) : op_tid(op_tid) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct OpFinishEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_OP_FINISH;
+
+ int r;
+
+ OpFinishEvent() : r(0) {
+ }
+ OpFinishEvent(uint64_t op_tid, int r) : OpEventBase(op_tid), r(r) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct SnapEventBase : public OpEventBase {
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string snap_name;
+
+protected:
+ SnapEventBase() {
+ }
+ SnapEventBase(uint64_t op_tid, const cls::rbd::SnapshotNamespace& _snap_namespace,
+ const std::string &_snap_name)
+ : OpEventBase(op_tid),
+ snap_namespace(_snap_namespace),
+ snap_name(_snap_name) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct SnapCreateEvent : public SnapEventBase {
+ static const EventType TYPE = EVENT_TYPE_SNAP_CREATE;
+
+ SnapCreateEvent() {
+ }
+ SnapCreateEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name)
+ : SnapEventBase(op_tid, snap_namespace, snap_name) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct SnapRemoveEvent : public SnapEventBase {
+ static const EventType TYPE = EVENT_TYPE_SNAP_REMOVE;
+
+ SnapRemoveEvent() {
+ }
+ SnapRemoveEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name)
+ : SnapEventBase(op_tid, snap_namespace, snap_name) {
+ }
+
+ using SnapEventBase::encode;
+ using SnapEventBase::decode;
+ using SnapEventBase::dump;
+};
+
+struct SnapRenameEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_SNAP_RENAME;
+
+ uint64_t snap_id;
+ std::string src_snap_name;
+ std::string dst_snap_name;
+
+ SnapRenameEvent() : snap_id(CEPH_NOSNAP) {
+ }
+ SnapRenameEvent(uint64_t op_tid, uint64_t src_snap_id,
+ const std::string &src_snap_name,
+ const std::string &dest_snap_name)
+ : OpEventBase(op_tid),
+ snap_id(src_snap_id),
+ src_snap_name(src_snap_name),
+ dst_snap_name(dest_snap_name) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct SnapProtectEvent : public SnapEventBase {
+ static const EventType TYPE = EVENT_TYPE_SNAP_PROTECT;
+
+ SnapProtectEvent() {
+ }
+ SnapProtectEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name)
+ : SnapEventBase(op_tid, snap_namespace, snap_name) {
+ }
+
+ using SnapEventBase::encode;
+ using SnapEventBase::decode;
+ using SnapEventBase::dump;
+};
+
+struct SnapUnprotectEvent : public SnapEventBase {
+ static const EventType TYPE = EVENT_TYPE_SNAP_UNPROTECT;
+
+ SnapUnprotectEvent() {
+ }
+ SnapUnprotectEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name)
+ : SnapEventBase(op_tid, snap_namespace, snap_name) {
+ }
+
+ using SnapEventBase::encode;
+ using SnapEventBase::decode;
+ using SnapEventBase::dump;
+};
+
+struct SnapLimitEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_SNAP_LIMIT;
+ uint64_t limit = 0;
+
+ SnapLimitEvent() {
+ }
+ SnapLimitEvent(uint64_t op_tid, const uint64_t _limit)
+ : OpEventBase(op_tid), limit(_limit) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct SnapRollbackEvent : public SnapEventBase {
+ static const EventType TYPE = EVENT_TYPE_SNAP_ROLLBACK;
+
+ SnapRollbackEvent() {
+ }
+ SnapRollbackEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name)
+ : SnapEventBase(op_tid, snap_namespace, snap_name) {
+ }
+
+ using SnapEventBase::encode;
+ using SnapEventBase::decode;
+ using SnapEventBase::dump;
+};
+
+struct RenameEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_RENAME;
+
+ std::string image_name;
+
+ RenameEvent() {
+ }
+ RenameEvent(uint64_t op_tid, const std::string &_image_name)
+ : OpEventBase(op_tid), image_name(_image_name) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct ResizeEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_RESIZE;
+
+ uint64_t size;
+
+ ResizeEvent() : size(0) {
+ }
+ ResizeEvent(uint64_t op_tid, uint64_t _size)
+ : OpEventBase(op_tid), size(_size) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct FlattenEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_FLATTEN;
+
+ FlattenEvent() {
+ }
+ FlattenEvent(uint64_t op_tid) : OpEventBase(op_tid) {
+ }
+
+ using OpEventBase::encode;
+ using OpEventBase::decode;
+ using OpEventBase::dump;
+};
+
+struct DemotePromoteEvent {
+ static const EventType TYPE = EVENT_TYPE_DEMOTE_PROMOTE;
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct UpdateFeaturesEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_UPDATE_FEATURES;
+
+ uint64_t features;
+ bool enabled;
+
+ UpdateFeaturesEvent() : features(0), enabled(false) {
+ }
+ UpdateFeaturesEvent(uint64_t op_tid, uint64_t _features, bool _enabled)
+ : OpEventBase(op_tid), features(_features), enabled(_enabled) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct MetadataSetEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_METADATA_SET;
+
+ std::string key;
+ std::string value;
+
+ MetadataSetEvent() {
+ }
+ MetadataSetEvent(uint64_t op_tid, const std::string &_key, const std::string &_value)
+ : OpEventBase(op_tid), key(_key), value(_value) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct MetadataRemoveEvent : public OpEventBase {
+ static const EventType TYPE = EVENT_TYPE_METADATA_REMOVE;
+
+ std::string key;
+
+ MetadataRemoveEvent() {
+ }
+ MetadataRemoveEvent(uint64_t op_tid, const std::string &_key)
+ : OpEventBase(op_tid), key(_key) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownEvent {
+ static const EventType TYPE = static_cast<EventType>(-1);
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::mpl::vector<AioDiscardEvent,
+ AioWriteEvent,
+ AioFlushEvent,
+ OpFinishEvent,
+ SnapCreateEvent,
+ SnapRemoveEvent,
+ SnapRenameEvent,
+ SnapProtectEvent,
+ SnapUnprotectEvent,
+ SnapRollbackEvent,
+ RenameEvent,
+ ResizeEvent,
+ FlattenEvent,
+ DemotePromoteEvent,
+ SnapLimitEvent,
+ UpdateFeaturesEvent,
+ MetadataSetEvent,
+ MetadataRemoveEvent,
+ AioWriteSameEvent,
+ AioCompareAndWriteEvent,
+ UnknownEvent> EventVector;
+typedef boost::make_variant_over<EventVector>::type Event;
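+
+// A minimal visitor sketch (illustrative only): the GetTypeVisitor,
+// EncodeVisitor, DecodeVisitor and DumpVisitor helpers used with this variant
+// follow the boost::static_visitor pattern; "TypeVisitor" below is a
+// hypothetical name, not part of librbd.
+//
+//   struct TypeVisitor : public boost::static_visitor<EventType> {
+//     template <typename E>
+//     EventType operator()(const E&) const {
+//       return E::TYPE;
+//     }
+//   };
+//   EventType type = boost::apply_visitor(TypeVisitor(), event);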
+
+struct EventEntry {
+ static uint32_t get_fixed_size() {
+ return EVENT_FIXED_SIZE + METADATA_FIXED_SIZE;
+ }
+
+ EventEntry() : event(UnknownEvent()) {
+ }
+ EventEntry(const Event &_event, const utime_t &_timestamp = utime_t())
+ : event(_event), timestamp(_timestamp) {
+ }
+
+ Event event;
+ utime_t timestamp;
+
+ EventType get_event_type() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<EventEntry *> &o);
+
+private:
+ static const uint32_t EVENT_FIXED_SIZE = 14; ///< version encoding, type
+ static const uint32_t METADATA_FIXED_SIZE = 14; ///< version encoding, timestamp
+
+ void encode_metadata(bufferlist& bl) const;
+ void decode_metadata(bufferlist::const_iterator& it);
+};
+
+// Journal Client data structures
+
+enum ClientMetaType {
+ IMAGE_CLIENT_META_TYPE = 0,
+ MIRROR_PEER_CLIENT_META_TYPE = 1,
+ CLI_CLIENT_META_TYPE = 2
+};
+
+struct ImageClientMeta {
+ static const ClientMetaType TYPE = IMAGE_CLIENT_META_TYPE;
+
+ uint64_t tag_class = 0;
+ bool resync_requested = false;
+
+ ImageClientMeta() {
+ }
+ ImageClientMeta(uint64_t tag_class) : tag_class(tag_class) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct MirrorPeerSyncPoint {
+ typedef boost::optional<uint64_t> ObjectNumber;
+
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string snap_name;
+ std::string from_snap_name;
+ ObjectNumber object_number;
+
+ MirrorPeerSyncPoint() : MirrorPeerSyncPoint({}, "", "", boost::none) {
+ }
+ MirrorPeerSyncPoint(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ const ObjectNumber &object_number)
+ : MirrorPeerSyncPoint(snap_namespace, snap_name, "", object_number) {
+ }
+ MirrorPeerSyncPoint(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string &snap_name,
+ const std::string &from_snap_name,
+ const ObjectNumber &object_number)
+ : snap_namespace(snap_namespace), snap_name(snap_name),
+ from_snap_name(from_snap_name), object_number(object_number) {
+ }
+
+ inline bool operator==(const MirrorPeerSyncPoint &sync) const {
+ return (snap_name == sync.snap_name &&
+ from_snap_name == sync.from_snap_name &&
+ object_number == sync.object_number &&
+ snap_namespace == sync.snap_namespace);
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+enum MirrorPeerState {
+ MIRROR_PEER_STATE_SYNCING,
+ MIRROR_PEER_STATE_REPLAYING
+};
+
+struct MirrorPeerClientMeta {
+ typedef std::list<MirrorPeerSyncPoint> SyncPoints;
+
+ static const ClientMetaType TYPE = MIRROR_PEER_CLIENT_META_TYPE;
+
+ std::string image_id;
+ MirrorPeerState state = MIRROR_PEER_STATE_SYNCING; ///< replay state
+ uint64_t sync_object_count = 0; ///< maximum number of objects ever sync'ed
+ SyncPoints sync_points; ///< max two in-use snapshots for sync
+ SnapSeqs snap_seqs; ///< local to peer snap seq mapping
+
+ MirrorPeerClientMeta() {
+ }
+ MirrorPeerClientMeta(const std::string &image_id,
+ const SyncPoints &sync_points = SyncPoints(),
+ const SnapSeqs &snap_seqs = SnapSeqs())
+ : image_id(image_id), sync_points(sync_points), snap_seqs(snap_seqs) {
+ }
+
+ inline bool operator==(const MirrorPeerClientMeta &meta) const {
+ return (image_id == meta.image_id &&
+ state == meta.state &&
+ sync_object_count == meta.sync_object_count &&
+ sync_points == meta.sync_points &&
+ snap_seqs == meta.snap_seqs);
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct CliClientMeta {
+ static const ClientMetaType TYPE = CLI_CLIENT_META_TYPE;
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownClientMeta {
+ static const ClientMetaType TYPE = static_cast<ClientMetaType>(-1);
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<ImageClientMeta,
+ MirrorPeerClientMeta,
+ CliClientMeta,
+ UnknownClientMeta> ClientMeta;
+
+struct ClientData {
+ ClientData() {
+ }
+ ClientData(const ClientMeta &client_meta) : client_meta(client_meta) {
+ }
+
+ ClientMeta client_meta;
+
+ ClientMetaType get_client_meta_type() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<ClientData *> &o);
+};
+
+// Journal Tag data structures
+
+struct TagPredecessor {
+ std::string mirror_uuid; // empty if local
+ bool commit_valid = false;
+ uint64_t tag_tid = 0;
+ uint64_t entry_tid = 0;
+
+ TagPredecessor() {
+ }
+ TagPredecessor(const std::string &mirror_uuid, bool commit_valid,
+ uint64_t tag_tid, uint64_t entry_tid)
+ : mirror_uuid(mirror_uuid), commit_valid(commit_valid), tag_tid(tag_tid),
+ entry_tid(entry_tid) {
+ }
+
+ inline bool operator==(const TagPredecessor &rhs) const {
+ return (mirror_uuid == rhs.mirror_uuid &&
+ commit_valid == rhs.commit_valid &&
+ tag_tid == rhs.tag_tid &&
+ entry_tid == rhs.entry_tid);
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+struct TagData {
+ // owner of the tag (exclusive lock epoch)
+ std::string mirror_uuid; // empty if local
+
+ // mapping to last committed record of previous tag
+ TagPredecessor predecessor;
+
+ TagData() {
+ }
+ TagData(const std::string &mirror_uuid) : mirror_uuid(mirror_uuid) {
+ }
+ TagData(const std::string &mirror_uuid,
+ const std::string &predecessor_mirror_uuid,
+ bool predecessor_commit_valid,
+ uint64_t predecessor_tag_tid, uint64_t predecessor_entry_tid)
+ : mirror_uuid(mirror_uuid),
+ predecessor(predecessor_mirror_uuid, predecessor_commit_valid,
+ predecessor_tag_tid, predecessor_entry_tid) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<TagData *> &o);
+};
+
+std::ostream &operator<<(std::ostream &out, const EventType &type);
+std::ostream &operator<<(std::ostream &out, const ClientMetaType &type);
+std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta);
+std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync);
+std::ostream &operator<<(std::ostream &out, const MirrorPeerState &state);
+std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta);
+std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor);
+std::ostream &operator<<(std::ostream &out, const TagData &tag_data);
+
+struct Listener {
+ virtual ~Listener() {
+ }
+
+ /// invoked when journal close is requested
+ virtual void handle_close() = 0;
+
+ /// invoked when journal is promoted to primary
+ virtual void handle_promoted() = 0;
+
+ /// invoked when journal resync is requested
+ virtual void handle_resync() = 0;
+};
+
+WRITE_CLASS_ENCODER(EventEntry);
+WRITE_CLASS_ENCODER(ClientData);
+WRITE_CLASS_ENCODER(TagData);
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_TYPES_H
diff --git a/src/librbd/journal/Utils.cc b/src/librbd/journal/Utils.cc
new file mode 100644
index 000000000..231bcae2d
--- /dev/null
+++ b/src/librbd/journal/Utils.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/journal/Types.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::"
+
+namespace librbd {
+namespace journal {
+namespace util {
+
+int C_DecodeTag::decode(bufferlist::const_iterator *it, TagData *tag_data) {
+ try {
+ using ceph::decode;
+ decode(*tag_data, *it);
+ } catch (const buffer::error &err) {
+ return -EBADMSG;
+ }
+ return 0;
+}
+
+int C_DecodeTag::process(int r) {
+ if (r < 0) {
+ lderr(cct) << "C_DecodeTag: " << this << " " << __func__ << ": "
+ << "failed to allocate tag: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::lock_guard locker{*lock};
+ *tag_tid = tag.tid;
+
+ auto data_it = tag.data.cbegin();
+ r = decode(&data_it, tag_data);
+ if (r < 0) {
+ lderr(cct) << "C_DecodeTag: " << this << " " << __func__ << ": "
+ << "failed to decode allocated tag" << dendl;
+ return r;
+ }
+
+ ldout(cct, 20) << "C_DecodeTag: " << this << " " << __func__ << ": "
+ << "allocated journal tag: "
+ << "tid=" << tag.tid << ", "
+ << "data=" << *tag_data << dendl;
+ return 0;
+}
+
+int C_DecodeTags::process(int r) {
+ if (r < 0) {
+ lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": "
+ << "failed to retrieve journal tags: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (tags.empty()) {
+ lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": "
+ << "no journal tags retrieved" << dendl;
+ return -ENOENT;
+ }
+
+ std::lock_guard locker{*lock};
+ *tag_tid = tags.back().tid;
+ auto data_it = tags.back().data.cbegin();
+ r = C_DecodeTag::decode(&data_it, tag_data);
+ if (r < 0) {
+ lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": "
+ << "failed to decode journal tag" << dendl;
+ return r;
+ }
+
+ ldout(cct, 20) << "C_DecodeTags: " << this << " " << __func__ << ": "
+ << "most recent journal tag: "
+ << "tid=" << *tag_tid << ", "
+ << "data=" << *tag_data << dendl;
+ return 0;
+}
+
+} // namespace util
+} // namespace journal
+} // namespace librbd
diff --git a/src/librbd/journal/Utils.h b/src/librbd/journal/Utils.h
new file mode 100644
index 000000000..93643f9f9
--- /dev/null
+++ b/src/librbd/journal/Utils.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_UTILS_H
+#define CEPH_LIBRBD_JOURNAL_UTILS_H
+
+#include "include/common_fwd.h"
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "cls/journal/cls_journal_types.h"
+#include <list>
+
+
+namespace librbd {
+namespace journal {
+
+struct TagData;
+
+namespace util {
+
+struct C_DecodeTag : public Context {
+ CephContext *cct;
+ ceph::mutex *lock;
+ uint64_t *tag_tid;
+ TagData *tag_data;
+ Context *on_finish;
+
+ cls::journal::Tag tag;
+
+ C_DecodeTag(CephContext *cct, ceph::mutex *lock, uint64_t *tag_tid,
+ TagData *tag_data, Context *on_finish)
+ : cct(cct), lock(lock), tag_tid(tag_tid), tag_data(tag_data),
+ on_finish(on_finish) {
+ }
+
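+ // complete() is overridden to decode the allocated tag and forward the
+ // result to on_finish; Context::complete(0) then invokes the empty finish()
+ // and deletes this object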
+ void complete(int r) override {
+ on_finish->complete(process(r));
+ Context::complete(0);
+ }
+ void finish(int r) override {
+ }
+
+ int process(int r);
+
+ static int decode(bufferlist::const_iterator *it, TagData *tag_data);
+
+};
+
+struct C_DecodeTags : public Context {
+ typedef std::list<cls::journal::Tag> Tags;
+
+ CephContext *cct;
+ ceph::mutex *lock;
+ uint64_t *tag_tid;
+ TagData *tag_data;
+ Context *on_finish;
+
+ Tags tags;
+
+ C_DecodeTags(CephContext *cct, ceph::mutex *lock, uint64_t *tag_tid,
+ TagData *tag_data, Context *on_finish)
+ : cct(cct), lock(lock), tag_tid(tag_tid), tag_data(tag_data),
+ on_finish(on_finish) {
+ }
+
+ void complete(int r) override {
+ on_finish->complete(process(r));
+ Context::complete(0);
+ }
+ void finish(int r) override {
+ }
+
+ int process(int r);
+};
+
+} // namespace util
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_UTILS_H
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
new file mode 100644
index 000000000..eb2df58e7
--- /dev/null
+++ b/src/librbd/librbd.cc
@@ -0,0 +1,7398 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/int_types.h"
+
+#include <errno.h>
+
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/TracepointProvider.h"
+#include "include/Context.h"
+
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Features.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/DiffIterate.h"
+#include "librbd/api/Group.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Io.h"
+#include "librbd/api/Migration.h"
+#include "librbd/api/Mirror.h"
+#include "librbd/api/Namespace.h"
+#include "librbd/api/Pool.h"
+#include "librbd/api/PoolMetadata.h"
+#include "librbd/api/Snapshot.h"
+#include "librbd/api/Trash.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ReadResult.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/librbd.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd: "
+
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+
+namespace {
+
+TracepointProvider::Traits tracepoint_traits("librbd_tp.so", "rbd_tracing");
+
+struct UserBufferDeleter : public deleter::impl {
+ CephContext* cct;
+ librbd::io::AioCompletion* aio_completion;
+
+ UserBufferDeleter(CephContext* cct, librbd::io::AioCompletion* aio_completion)
+ : deleter::impl(deleter()), cct(cct), aio_completion(aio_completion) {
+ aio_completion->block(cct);
+ }
+
+ ~UserBufferDeleter() override {
+ aio_completion->unblock(cct);
+ }
+};
+
+static auto create_write_raw(librbd::ImageCtx *ictx, const char *buf,
+ size_t len,
+ librbd::io::AioCompletion* aio_completion) {
+ if (ictx->disable_zero_copy || aio_completion == nullptr) {
+ // must copy the buffer if writeback/writearound cache is in-use (or using
+ // non-AIO)
+ return buffer::copy(buf, len);
+ }
+
+ // avoid copying memory for AIO operations, but possibly delay completions
+ // until the last reference to the user's memory has been released
+ return ceph::unique_leakable_ptr<ceph::buffer::raw>(
+ buffer::claim_buffer(
+ len, const_cast<char*>(buf),
+ deleter(new UserBufferDeleter(ictx->cct, aio_completion))));
+}
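+
+// buffer::claim_buffer wraps the caller's memory without copying it; the
+// attached UserBufferDeleter keeps the AioCompletion blocked until the last
+// bufferlist reference is released, so the caller's buffer must stay valid
+// for the life of the in-flight request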
+
+static int get_iovec_length(const struct iovec *iov, int iovcnt, size_t &len)
+{
+ len = 0;
+
+ if (iovcnt <= 0) {
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < iovcnt; ++i) {
+ const struct iovec &io = iov[i];
+ // check for overflow
+ if (len + io.iov_len < len) {
+ return -EINVAL;
+ }
+ len += io.iov_len;
+ }
+
+ return 0;
+}
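+
+// the (len + io.iov_len < len) test above relies on unsigned wraparound:
+// size_t addition is reduced mod 2^N, so the sum compares less than len
+// exactly when the addition overflowed (e.g. (SIZE_MAX - 1) + 2 wraps to 0)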
+
+static bufferlist iovec_to_bufferlist(librbd::ImageCtx *ictx,
+ const struct iovec *iov,
+ int iovcnt,
+ librbd::io::AioCompletion* aio_completion)
+{
+ bufferlist bl;
+ for (int i = 0; i < iovcnt; ++i) {
+ const struct iovec &io = iov[i];
+ bl.push_back(create_write_raw(ictx, static_cast<char*>(io.iov_base),
+ io.iov_len, aio_completion));
+ }
+ return bl;
+}
+
+CephContext* get_cct(IoCtx &io_ctx) {
+ return reinterpret_cast<CephContext*>(io_ctx.cct());
+}
+
+librbd::io::AioCompletion* get_aio_completion(librbd::RBD::AioCompletion *comp) {
+ return reinterpret_cast<librbd::io::AioCompletion *>(comp->pc);
+}
+
+struct C_AioCompletion : public Context {
+ CephContext *cct;
+ librbd::io::aio_type_t aio_type;
+ librbd::io::AioCompletion* aio_comp;
+
+ C_AioCompletion(librbd::ImageCtx *ictx, librbd::io::aio_type_t aio_type,
+ librbd::io::AioCompletion* aio_comp)
+ : cct(ictx->cct), aio_type(aio_type), aio_comp(aio_comp) {
+ aio_comp->init_time(ictx, aio_type);
+ aio_comp->get();
+ }
+ virtual ~C_AioCompletion() {
+ aio_comp->put();
+ }
+
+ void finish(int r) override {
+ ldout(cct, 20) << "C_AioCompletion::finish: r=" << r << dendl;
+ if (r < 0) {
+ aio_comp->fail(r);
+ } else {
+ aio_comp->complete();
+ }
+ }
+};
+
+struct C_OpenComplete : public C_AioCompletion {
+ librbd::ImageCtx *ictx;
+ void **ictxp;
+ C_OpenComplete(librbd::ImageCtx *ictx, librbd::io::AioCompletion* comp,
+ void **ictxp)
+ : C_AioCompletion(ictx, librbd::io::AIO_TYPE_OPEN, comp),
+ ictx(ictx), ictxp(ictxp) {
+ }
+ void finish(int r) override {
+ ldout(cct, 20) << "C_OpenComplete::finish: r=" << r << dendl;
+ if (r < 0) {
+ *ictxp = nullptr;
+ } else {
+ *ictxp = ictx;
+ }
+
+ C_AioCompletion::finish(r);
+ }
+};
+
+struct C_OpenAfterCloseComplete : public Context {
+ librbd::ImageCtx *ictx;
+ librbd::io::AioCompletion* comp;
+ void **ictxp;
+ C_OpenAfterCloseComplete(librbd::ImageCtx *ictx,
+ librbd::io::AioCompletion* comp,
+ void **ictxp)
+ : ictx(ictx), comp(comp), ictxp(ictxp) {
+ }
+ void finish(int r) override {
+ ldout(ictx->cct, 20) << "C_OpenAfterCloseComplete::finish: r=" << r
+ << dendl;
+ *ictxp = nullptr;
+
+ ictx->state->open(0, new C_OpenComplete(ictx, comp, ictxp));
+ }
+};
+
+struct C_UpdateWatchCB : public librbd::UpdateWatchCtx {
+ rbd_update_callback_t watch_cb;
+ void *arg;
+ uint64_t handle = 0;
+
+ C_UpdateWatchCB(rbd_update_callback_t watch_cb, void *arg) :
+ watch_cb(watch_cb), arg(arg) {
+ }
+ void handle_notify() override {
+ watch_cb(arg);
+ }
+};
+
+struct C_QuiesceWatchCB : public librbd::QuiesceWatchCtx {
+ rbd_update_callback_t quiesce_cb;
+ rbd_update_callback_t unquiesce_cb;
+ void *arg;
+ uint64_t handle = 0;
+
+ C_QuiesceWatchCB(rbd_update_callback_t quiesce_cb,
+ rbd_update_callback_t unquiesce_cb, void *arg) :
+ quiesce_cb(quiesce_cb), unquiesce_cb(unquiesce_cb), arg(arg) {
+ }
+ void handle_quiesce() override {
+ quiesce_cb(arg);
+ }
+ void handle_unquiesce() override {
+ unquiesce_cb(arg);
+ }
+};
+
+void group_image_status_cpp_to_c(const librbd::group_image_info_t &cpp_info,
+ rbd_group_image_info_t *c_info) {
+ c_info->name = strdup(cpp_info.name.c_str());
+ c_info->pool = cpp_info.pool;
+ c_info->state = cpp_info.state;
+}
+
+void group_info_cpp_to_c(const librbd::group_info_t &cpp_info,
+ rbd_group_info_t *c_info) {
+ c_info->name = strdup(cpp_info.name.c_str());
+ c_info->pool = cpp_info.pool;
+}
+
+void group_snap_info_cpp_to_c(const librbd::group_snap_info_t &cpp_info,
+ rbd_group_snap_info_t *c_info) {
+ c_info->name = strdup(cpp_info.name.c_str());
+ c_info->state = cpp_info.state;
+}
+
+void mirror_image_info_cpp_to_c(const librbd::mirror_image_info_t &cpp_info,
+ rbd_mirror_image_info_t *c_info) {
+ c_info->global_id = strdup(cpp_info.global_id.c_str());
+ c_info->state = cpp_info.state;
+ c_info->primary = cpp_info.primary;
+}
+
+int get_local_mirror_image_site_status(
+ const librbd::mirror_image_global_status_t& status,
+ librbd::mirror_image_site_status_t* local_status) {
+ auto it = std::find_if(status.site_statuses.begin(),
+ status.site_statuses.end(),
+ [](const librbd::mirror_image_site_status_t& s) {
+ return (s.mirror_uuid ==
+ cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID);
+ });
+ if (it == status.site_statuses.end()) {
+ return -ENOENT;
+ }
+
+ *local_status = *it;
+ return 0;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+int mirror_image_global_status_cpp_to_c(
+ const librbd::mirror_image_global_status_t &cpp_status,
+ rbd_mirror_image_status_t *c_status) {
+ c_status->name = strdup(cpp_status.name.c_str());
+ mirror_image_info_cpp_to_c(cpp_status.info, &c_status->info);
+
+ librbd::mirror_image_site_status_t local_status;
+ int r = get_local_mirror_image_site_status(cpp_status, &local_status);
+ if (r < 0) {
+ return r;
+ }
+
+ c_status->state = local_status.state;
+ c_status->description = strdup(local_status.description.c_str());
+ c_status->last_update = local_status.last_update;
+ c_status->up = local_status.up;
+ return 0;
+}
+
+#pragma GCC diagnostic pop
+
+void mirror_image_global_status_cpp_to_c(
+ const librbd::mirror_image_global_status_t &cpp_status,
+ rbd_mirror_image_global_status_t *c_status) {
+ c_status->name = strdup(cpp_status.name.c_str());
+ mirror_image_info_cpp_to_c(cpp_status.info, &c_status->info);
+
+ c_status->site_statuses_count = cpp_status.site_statuses.size();
+ c_status->site_statuses = (rbd_mirror_image_site_status_t*)calloc(
+ cpp_status.site_statuses.size(), sizeof(rbd_mirror_image_site_status_t));
+
+ auto idx = 0U;
+ for (auto it = cpp_status.site_statuses.begin();
+ it != cpp_status.site_statuses.end(); ++it) {
+ auto& s_status = c_status->site_statuses[idx++];
+ s_status.mirror_uuid = strdup(it->mirror_uuid.c_str());
+ s_status.state = it->state;
+ s_status.description = strdup(it->description.c_str());
+ s_status.last_update = it->last_update;
+ s_status.up = it->up;
+ }
+}
+
+void trash_image_info_cpp_to_c(const librbd::trash_image_info_t &cpp_info,
+ rbd_trash_image_info_t *c_info) {
+ c_info->id = strdup(cpp_info.id.c_str());
+ c_info->name = strdup(cpp_info.name.c_str());
+ c_info->source = cpp_info.source;
+ c_info->deletion_time = cpp_info.deletion_time;
+ c_info->deferment_end_time = cpp_info.deferment_end_time;
+}
+
+void config_option_cpp_to_c(const librbd::config_option_t &cpp_option,
+ rbd_config_option_t *c_option) {
+ c_option->name = strdup(cpp_option.name.c_str());
+ c_option->value = strdup(cpp_option.value.c_str());
+ c_option->source = cpp_option.source;
+}
+
+void config_option_cleanup(rbd_config_option_t &option) {
+ free(option.name);
+ free(option.value);
+}
+
+struct C_MirrorImageGetInfo : public Context {
+ rbd_mirror_image_info_t *mirror_image_info;
+ Context *on_finish;
+
+ librbd::mirror_image_info_t cpp_mirror_image_info;
+
+ C_MirrorImageGetInfo(rbd_mirror_image_info_t *mirror_image_info,
+ Context *on_finish)
+ : mirror_image_info(mirror_image_info), on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ if (r < 0) {
+ on_finish->complete(r);
+ return;
+ }
+
+ mirror_image_info_cpp_to_c(cpp_mirror_image_info, mirror_image_info);
+ on_finish->complete(0);
+ }
+};
+
+struct C_MirrorImageGetGlobalStatus : public Context {
+ rbd_mirror_image_global_status_t *mirror_image_global_status;
+ Context *on_finish;
+
+ librbd::mirror_image_global_status_t cpp_mirror_image_global_status;
+
+ C_MirrorImageGetGlobalStatus(
+ rbd_mirror_image_global_status_t *mirror_image_global_status,
+ Context *on_finish)
+ : mirror_image_global_status(mirror_image_global_status),
+ on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ if (r < 0) {
+ on_finish->complete(r);
+ return;
+ }
+
+ mirror_image_global_status_cpp_to_c(cpp_mirror_image_global_status,
+ mirror_image_global_status);
+ on_finish->complete(0);
+ }
+};
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+struct C_MirrorImageGetStatus : public Context {
+ librbd::mirror_image_status_t *mirror_image_status_cpp = nullptr;
+ rbd_mirror_image_status_t *mirror_image_status = nullptr;
+ Context *on_finish;
+
+ librbd::mirror_image_global_status_t cpp_mirror_image_global_status;
+
+ C_MirrorImageGetStatus(rbd_mirror_image_status_t *mirror_image_status,
+ Context *on_finish)
+ : mirror_image_status(mirror_image_status), on_finish(on_finish) {
+ }
+ C_MirrorImageGetStatus(librbd::mirror_image_status_t *mirror_image_status,
+ Context *on_finish)
+ : mirror_image_status_cpp(mirror_image_status), on_finish(on_finish) {
+ }
+
+
+ void finish(int r) override {
+ if (r < 0) {
+ on_finish->complete(r);
+ return;
+ }
+
+ if (mirror_image_status != nullptr) {
+ r = mirror_image_global_status_cpp_to_c(cpp_mirror_image_global_status,
+ mirror_image_status);
+ } else if (mirror_image_status_cpp != nullptr) {
+ librbd::mirror_image_site_status_t local_status;
+ r = get_local_mirror_image_site_status(cpp_mirror_image_global_status,
+ &local_status);
+ if (r >= 0) {
+ *mirror_image_status_cpp = {
+ cpp_mirror_image_global_status.name,
+ cpp_mirror_image_global_status.info,
+ local_status.state, local_status.description,
+ local_status.last_update, local_status.up};
+ }
+ }
+ on_finish->complete(r);
+ }
+};
+
+#pragma GCC diagnostic pop
+
+} // anonymous namespace
+
+namespace librbd {
+ ProgressContext::~ProgressContext()
+ {
+ }
+
+ class CProgressContext : public ProgressContext
+ {
+ public:
+ CProgressContext(librbd_progress_fn_t fn, void *data)
+ : m_fn(fn), m_data(data)
+ {
+ }
+ int update_progress(uint64_t offset, uint64_t src_size) override
+ {
+ return m_fn(offset, src_size, m_data);
+ }
+ private:
+ librbd_progress_fn_t m_fn;
+ void *m_data;
+ };
+
+ /*
+ * Pool stats
+ */
+ PoolStats::PoolStats() {
+ rbd_pool_stats_create(&pool_stats);
+ }
+
+ PoolStats::~PoolStats() {
+ rbd_pool_stats_destroy(pool_stats);
+ }
+
+ int PoolStats::add(rbd_pool_stat_option_t option, uint64_t* opt_val) {
+ return rbd_pool_stats_option_add_uint64(pool_stats, option, opt_val);
+ }
+
+ /*
+ * RBD
+ */
+ RBD::RBD()
+ {
+ }
+
+ RBD::~RBD()
+ {
+ }
+
+ void RBD::version(int *major, int *minor, int *extra)
+ {
+ rbd_version(major, minor, extra);
+ }
+
+ int RBD::open(IoCtx& io_ctx, Image& image, const char *name)
+ {
+ return open(io_ctx, image, name, NULL);
+ }
+
+ int RBD::open_by_id(IoCtx& io_ctx, Image& image, const char *id)
+ {
+ return open_by_id(io_ctx, image, id, nullptr);
+ }
+
+ int RBD::open(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snap_name)
+ {
+ ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+ if (image.ctx != NULL) {
+ reinterpret_cast<ImageCtx*>(image.ctx)->state->close();
+ image.ctx = NULL;
+ }
+
+ int r = ictx->state->open(0);
+ if (r < 0) {
+ tracepoint(librbd, open_image_exit, r);
+ return r;
+ }
+
+ image.ctx = (image_ctx_t) ictx;
+ tracepoint(librbd, open_image_exit, 0);
+ return 0;
+ }
+
+ int RBD::open_by_id(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snap_name)
+ {
+ ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, false);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, open_image_by_id_enter, ictx, ictx->id.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+
+ if (image.ctx != nullptr) {
+ reinterpret_cast<ImageCtx*>(image.ctx)->state->close();
+ image.ctx = nullptr;
+ }
+
+ int r = ictx->state->open(0);
+ if (r < 0) {
+ tracepoint(librbd, open_image_by_id_exit, r);
+ return r;
+ }
+
+ image.ctx = (image_ctx_t) ictx;
+ tracepoint(librbd, open_image_by_id_exit, 0);
+ return 0;
+ }
+
+ int RBD::aio_open(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snap_name, RBD::AioCompletion *c)
+ {
+ ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc);
+
+ if (image.ctx != NULL) {
+ reinterpret_cast<ImageCtx*>(image.ctx)->state->close(
+ new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx));
+ } else {
+ ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c),
+ &image.ctx));
+ }
+ tracepoint(librbd, aio_open_image_exit, 0);
+ return 0;
+ }
+
+ int RBD::aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snap_name, RBD::AioCompletion *c)
+ {
+ ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, false);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, aio_open_image_by_id_enter, ictx, ictx->id.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, c->pc);
+
+ if (image.ctx != nullptr) {
+ reinterpret_cast<ImageCtx*>(image.ctx)->state->close(
+ new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx));
+ } else {
+ ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c),
+ &image.ctx));
+ }
+ tracepoint(librbd, aio_open_image_by_id_exit, 0);
+ return 0;
+ }
+
+ int RBD::open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snap_name)
+ {
+ ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, true);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+ if (image.ctx != NULL) {
+ reinterpret_cast<ImageCtx*>(image.ctx)->state->close();
+ image.ctx = NULL;
+ }
+
+ int r = ictx->state->open(0);
+ if (r < 0) {
+ tracepoint(librbd, open_image_exit, r);
+ return r;
+ }
+
+ image.ctx = (image_ctx_t) ictx;
+ tracepoint(librbd, open_image_exit, 0);
+ return 0;
+ }
+
+ int RBD::open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snap_name)
+ {
+ ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, true);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, open_image_by_id_enter, ictx, ictx->id.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+
+ if (image.ctx != nullptr) {
+ reinterpret_cast<ImageCtx*>(image.ctx)->state->close();
+ image.ctx = nullptr;
+ }
+
+ int r = ictx->state->open(0);
+ if (r < 0) {
+ tracepoint(librbd, open_image_by_id_exit, r);
+ return r;
+ }
+
+ image.ctx = (image_ctx_t) ictx;
+ tracepoint(librbd, open_image_by_id_exit, 0);
+ return 0;
+ }
+
+ int RBD::aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snap_name, RBD::AioCompletion *c)
+ {
+ ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, true);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc);
+
+ if (image.ctx != NULL) {
+ reinterpret_cast<ImageCtx*>(image.ctx)->state->close(
+ new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx));
+ } else {
+ ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c),
+ &image.ctx));
+ }
+ tracepoint(librbd, aio_open_image_exit, 0);
+ return 0;
+ }
+
+ int RBD::aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snap_name, RBD::AioCompletion *c)
+ {
+ ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, true);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, aio_open_image_by_id_enter, ictx, ictx->id.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, c->pc);
+
+ if (image.ctx != nullptr) {
+ reinterpret_cast<ImageCtx*>(image.ctx)->state->close(
+ new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx));
+ } else {
+ ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c),
+ &image.ctx));
+ }
+ tracepoint(librbd, aio_open_image_by_id_exit, 0);
+ return 0;
+ }
+
+ int RBD::features_to_string(uint64_t features, std::string *str_features)
+ {
+ std::stringstream err;
+ *str_features = librbd::rbd_features_to_string(features, &err);
+ if (!err.str().empty()) {
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ int RBD::features_from_string(const std::string str_features, uint64_t *features)
+ {
+ std::stringstream err;
+ *features = librbd::rbd_features_from_string(str_features, &err);
+ if (!err.str().empty()) {
+ return -EINVAL;
+ }
+
+ return 0;
+ }
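+
+ // Both string helpers return -EINVAL when the underlying conversion
+ // reports a parse error. A round-trip sketch (feature names use librbd's
+ // comma-separated form, e.g. "layering,exclusive-lock"):
+ //
+ //   uint64_t features = 0;
+ //   if (rbd.features_from_string("layering,exclusive-lock", &features) == 0) {
+ //     std::string names;
+ //     rbd.features_to_string(features, &names);
+ //   }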
+
+ int RBD::create(IoCtx& io_ctx, const char *name, uint64_t size, int *order)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order);
+ int r = librbd::create(io_ctx, name, size, order);
+ tracepoint(librbd, create_exit, r, *order);
+ return r;
+ }
+
+ int RBD::create2(IoCtx& io_ctx, const char *name, uint64_t size,
+ uint64_t features, int *order)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order);
+ int r = librbd::create(io_ctx, name, size, false, features, order, 0, 0);
+ tracepoint(librbd, create2_exit, r, *order);
+ return r;
+ }
+
+ int RBD::create3(IoCtx& io_ctx, const char *name, uint64_t size,
+ uint64_t features, int *order, uint64_t stripe_unit,
+ uint64_t stripe_count)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, create3_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count);
+ int r = librbd::create(io_ctx, name, size, false, features, order,
+ stripe_unit, stripe_count);
+ tracepoint(librbd, create3_exit, r, *order);
+ return r;
+ }
+
+ int RBD::create4(IoCtx& io_ctx, const char *name, uint64_t size,
+ ImageOptions& opts)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts.opts);
+ int r = librbd::create(io_ctx, name, "", size, opts, "", "", false);
+ tracepoint(librbd, create4_exit, r);
+ return r;
+ }
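+
+ // create(), create2() and create3() are progressively richer fixed-argument
+ // variants; create4() supersedes them with an ImageOptions bag. A sketch
+ // using the RBD_IMAGE_OPTION_* keys from librbd.h:
+ //
+ //   librbd::ImageOptions opts;
+ //   opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(22));
+ //   opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+ //   int r = rbd.create4(io_ctx, "myimage", 1ULL << 30, opts);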
+
+ int RBD::clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+ IoCtx& c_ioctx, const char *c_name, uint64_t features,
+ int *c_order)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
+ tracepoint(librbd, clone_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features);
+ int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name,
+ features, c_order, 0, 0);
+ tracepoint(librbd, clone_exit, r, *c_order);
+ return r;
+ }
+
+ int RBD::clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+ IoCtx& c_ioctx, const char *c_name, uint64_t features,
+ int *c_order, uint64_t stripe_unit, int stripe_count)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
+ tracepoint(librbd, clone2_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features, stripe_unit, stripe_count);
+ int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name,
+ features, c_order, stripe_unit, stripe_count);
+ tracepoint(librbd, clone2_exit, r, *c_order);
+ return r;
+ }
+
+ int RBD::clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+ IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
+ tracepoint(librbd, clone3_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, c_opts.opts);
+ int r = librbd::clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx,
+ nullptr, c_name, c_opts, "", "");
+ tracepoint(librbd, clone3_exit, r);
+ return r;
+ }
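+
+ // All three clone entry points funnel into librbd::clone(); clone3() is
+ // the ImageOptions form. Note that with the default clone format the
+ // parent snapshot generally must be protected first (see
+ // Image::snap_protect() below).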
+
+ int RBD::remove(IoCtx& io_ctx, const char *name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Image<>::remove(io_ctx, name, prog_ctx);
+ tracepoint(librbd, remove_exit, r);
+ return r;
+ }
+
+ int RBD::remove_with_progress(IoCtx& io_ctx, const char *name,
+ ProgressContext& pctx)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
+ int r = librbd::api::Image<>::remove(io_ctx, name, pctx);
+ tracepoint(librbd, remove_exit, r);
+ return r;
+ }
+
+ int RBD::trash_move(IoCtx &io_ctx, const char *name, uint64_t delay) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_move_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), name);
+ int r = librbd::api::Trash<>::move(io_ctx, RBD_TRASH_IMAGE_SOURCE_USER,
+ name, delay);
+ tracepoint(librbd, trash_move_exit, r);
+ return r;
+ }
+
+ int RBD::trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info) {
+ return librbd::api::Trash<>::get(io_ctx, id, info);
+ }
+
+ int RBD::trash_list(IoCtx &io_ctx, vector<trash_image_info_t> &entries) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_list_enter,
+ io_ctx.get_pool_name().c_str(), io_ctx.get_id());
+ int r = librbd::api::Trash<>::list(io_ctx, entries, true);
+#ifdef WITH_LTTNG
+ if (r >= 0) {
+ for (const auto& entry : entries) {
+ tracepoint(librbd, trash_list_entry, entry.id.c_str());
+ }
+ }
+#endif
+ tracepoint(librbd, trash_list_exit, r, r);
+ return r;
+ }
+
+ int RBD::trash_remove(IoCtx &io_ctx, const char *image_id, bool force) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_id, force);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx);
+ tracepoint(librbd, trash_remove_exit, r);
+ return r;
+ }
+
+ int RBD::trash_remove_with_progress(IoCtx &io_ctx, const char *image_id,
+ bool force, ProgressContext &pctx) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_id, force);
+ int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, pctx);
+ tracepoint(librbd, trash_remove_exit, r);
+ return r;
+ }
+
+ int RBD::trash_restore(IoCtx &io_ctx, const char *id, const char *name) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_undelete_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), id, name);
+ int r = librbd::api::Trash<>::restore(
+ io_ctx, librbd::api::Trash<>::ALLOWED_RESTORE_SOURCES, id, name);
+ tracepoint(librbd, trash_undelete_exit, r);
+ return r;
+ }
+
+ int RBD::trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), expire_ts, threshold);
+ NoOpProgressContext nop_pctx;
+ int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, nop_pctx);
+ tracepoint(librbd, trash_purge_exit, r);
+ return r;
+ }
+
+ int RBD::trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts,
+ float threshold, ProgressContext &pctx) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), expire_ts, threshold);
+ int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, pctx);
+ tracepoint(librbd, trash_purge_exit, r);
+ return r;
+ }
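+
+ // The trash calls compose into a deferred-delete workflow, sketched here
+ // (image ids come from the trash_list() entries):
+ //
+ //   rbd.trash_move(io_ctx, "myimage", 3600);   // defer deletion >= 1 hour
+ //   std::vector<librbd::trash_image_info_t> entries;
+ //   rbd.trash_list(io_ctx, entries);
+ //   rbd.trash_restore(io_ctx, entries[0].id.c_str(), "myimage");
+ //   // ...or permanently reap entries past their deferment window:
+ //   rbd.trash_purge(io_ctx, time(nullptr), 0);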
+
+ int RBD::namespace_create(IoCtx& io_ctx, const char *namespace_name) {
+ return librbd::api::Namespace<>::create(io_ctx, namespace_name);
+ }
+
+ int RBD::namespace_remove(IoCtx& io_ctx, const char *namespace_name) {
+ return librbd::api::Namespace<>::remove(io_ctx, namespace_name);
+ }
+
+ int RBD::namespace_list(IoCtx& io_ctx,
+ std::vector<std::string>* namespace_names) {
+ return librbd::api::Namespace<>::list(io_ctx, namespace_names);
+ }
+
+ int RBD::namespace_exists(IoCtx& io_ctx, const char *namespace_name,
+ bool *exists) {
+ return librbd::api::Namespace<>::exists(io_ctx, namespace_name, exists);
+ }
+
+ int RBD::pool_init(IoCtx& io_ctx, bool force) {
+ return librbd::api::Pool<>::init(io_ctx, force);
+ }
+
+ int RBD::pool_stats_get(IoCtx& io_ctx, PoolStats* stats) {
+ auto pool_stat_options =
+ reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats->pool_stats);
+ return librbd::api::Pool<>::get_stats(io_ctx, pool_stat_options);
+ }
+
+ int RBD::list(IoCtx& io_ctx, vector<string>& names)
+ {
+ std::vector<image_spec_t> image_specs;
+ int r = list2(io_ctx, &image_specs);
+ if (r < 0) {
+ return r;
+ }
+
+ names.clear();
+ for (auto& it : image_specs) {
+ names.push_back(it.name);
+ }
+ return 0;
+ }
+
+ int RBD::list2(IoCtx& io_ctx, std::vector<image_spec_t> *images)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id());
+
+ int r = librbd::api::Image<>::list_images(io_ctx, images);
+#ifdef WITH_LTTNG
+ if (r >= 0) {
+ for (auto& it : *images) {
+ tracepoint(librbd, list_entry, it.name.c_str());
+ }
+ }
+#endif
+ tracepoint(librbd, list_exit, r, r);
+ return r;
+ }
+
+ int RBD::rename(IoCtx& src_io_ctx, const char *srcname, const char *destname)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(src_io_ctx));
+ tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname);
+ int r = librbd::rename(src_io_ctx, srcname, destname);
+ tracepoint(librbd, rename_exit, r);
+ return r;
+ }
+
+ int RBD::migration_prepare(IoCtx& io_ctx, const char *image_name,
+ IoCtx& dest_io_ctx, const char *dest_image_name,
+ ImageOptions& opts)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_prepare_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name, dest_io_ctx.get_pool_name().c_str(),
+ dest_io_ctx.get_id(), dest_image_name, opts.opts);
+ int r = librbd::api::Migration<>::prepare(io_ctx, image_name, dest_io_ctx,
+ dest_image_name, opts);
+ tracepoint(librbd, migration_prepare_exit, r);
+ return r;
+ }
+
+ int RBD::migration_prepare_import(const char *source_spec, IoCtx& dest_io_ctx,
+ const char *dest_image_name,
+ ImageOptions& opts) {
+ return librbd::api::Migration<>::prepare_import(source_spec, dest_io_ctx,
+ dest_image_name, opts);
+ }
+
+ int RBD::migration_execute(IoCtx& io_ctx, const char *image_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_execute_exit, r);
+ return r;
+ }
+
+ int RBD::migration_execute_with_progress(IoCtx& io_ctx,
+ const char *image_name,
+ librbd::ProgressContext &prog_ctx)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_execute_exit, r);
+ return r;
+ }
+
+ int RBD::migration_abort(IoCtx& io_ctx, const char *image_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_abort_exit, r);
+ return r;
+ }
+
+ int RBD::migration_abort_with_progress(IoCtx& io_ctx, const char *image_name,
+ librbd::ProgressContext &prog_ctx)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_abort_exit, r);
+ return r;
+ }
+
+ int RBD::migration_commit(IoCtx& io_ctx, const char *image_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_commit_exit, r);
+ return r;
+ }
+
+ int RBD::migration_commit_with_progress(IoCtx& io_ctx, const char *image_name,
+ librbd::ProgressContext &prog_ctx)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_commit_exit, r);
+ return r;
+ }
+
+ int RBD::migration_status(IoCtx& io_ctx, const char *image_name,
+ image_migration_status_t *status,
+ size_t status_size)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_status_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+
+ if (status_size != sizeof(image_migration_status_t)) {
+ tracepoint(librbd, migration_status_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ int r = librbd::api::Migration<>::status(io_ctx, image_name, status);
+ tracepoint(librbd, migration_status_exit, r);
+ return r;
+ }
+
+ int RBD::mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode) {
+ return librbd::api::Mirror<>::mode_get(io_ctx, mirror_mode);
+ }
+
+ int RBD::mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode) {
+ return librbd::api::Mirror<>::mode_set(io_ctx, mirror_mode);
+ }
+
+ int RBD::mirror_uuid_get(IoCtx& io_ctx, std::string* mirror_uuid) {
+ return librbd::api::Mirror<>::uuid_get(io_ctx, mirror_uuid);
+ }
+
+ int RBD::mirror_site_name_get(librados::Rados& rados,
+ std::string* site_name) {
+ return librbd::api::Mirror<>::site_name_get(rados, site_name);
+ }
+
+ int RBD::mirror_site_name_set(librados::Rados& rados,
+ const std::string& site_name) {
+ return librbd::api::Mirror<>::site_name_set(rados, site_name);
+ }
+
+ int RBD::mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token) {
+ return librbd::api::Mirror<>::peer_bootstrap_create(io_ctx, token);
+ }
+
+ int RBD::mirror_peer_bootstrap_import(IoCtx& io_ctx,
+ rbd_mirror_peer_direction_t direction,
+ const std::string& token) {
+ return librbd::api::Mirror<>::peer_bootstrap_import(io_ctx, direction,
+ token);
+ }
+
+ int RBD::mirror_peer_site_add(IoCtx& io_ctx, std::string *uuid,
+ mirror_peer_direction_t direction,
+ const std::string &site_name,
+ const std::string &client_name) {
+ return librbd::api::Mirror<>::peer_site_add(
+ io_ctx, uuid, direction, site_name, client_name);
+ }
+
+ int RBD::mirror_peer_site_remove(IoCtx& io_ctx, const std::string &uuid) {
+ return librbd::api::Mirror<>::peer_site_remove(io_ctx, uuid);
+ }
+
+ int RBD::mirror_peer_site_list(
+ IoCtx& io_ctx, std::vector<mirror_peer_site_t> *peer_sites) {
+ return librbd::api::Mirror<>::peer_site_list(io_ctx, peer_sites);
+ }
+
+ int RBD::mirror_peer_site_set_client_name(
+ IoCtx& io_ctx, const std::string &uuid, const std::string &client_name) {
+ return librbd::api::Mirror<>::peer_site_set_client(io_ctx, uuid,
+ client_name);
+ }
+
+ int RBD::mirror_peer_site_set_name(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &site_name) {
+ return librbd::api::Mirror<>::peer_site_set_name(io_ctx, uuid,
+ site_name);
+ }
+
+ int RBD::mirror_peer_site_set_direction(IoCtx& io_ctx,
+ const std::string& uuid,
+ mirror_peer_direction_t direction) {
+ return librbd::api::Mirror<>::peer_site_set_direction(io_ctx, uuid,
+ direction);
+ }
+
+ int RBD::mirror_peer_site_get_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ std::map<std::string, std::string> *key_vals) {
+ return librbd::api::Mirror<>::peer_site_get_attributes(io_ctx, uuid,
+ key_vals);
+ }
+
+ int RBD::mirror_peer_site_set_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ const std::map<std::string, std::string>& key_vals) {
+ return librbd::api::Mirror<>::peer_site_set_attributes(io_ctx, uuid,
+ key_vals);
+ }
+
+ int RBD::mirror_image_global_status_list(
+ IoCtx& io_ctx, const std::string &start_id, size_t max,
+ std::map<std::string, mirror_image_global_status_t> *global_statuses) {
+ return librbd::api::Mirror<>::image_global_status_list(
+ io_ctx, start_id, max, global_statuses);
+ }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
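+ // The mirror_peer_* methods below are the deprecated single-cluster
+ // flavour of the peer API; each simply forwards to its peer-site
+ // counterpart (mirror_peer_add implies an RX_TX direction), hence the
+ // suppressed -Wdeprecated-declarations warnings around this block.
+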
+ int RBD::mirror_peer_add(IoCtx& io_ctx, std::string *uuid,
+ const std::string &cluster_name,
+ const std::string &client_name) {
+ return librbd::api::Mirror<>::peer_site_add(
+ io_ctx, uuid, RBD_MIRROR_PEER_DIRECTION_RX_TX, cluster_name, client_name);
+ }
+
+ int RBD::mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid) {
+ return librbd::api::Mirror<>::peer_site_remove(io_ctx, uuid);
+ }
+
+ int RBD::mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers) {
+ std::vector<mirror_peer_site_t> peer_sites;
+ int r = librbd::api::Mirror<>::peer_site_list(io_ctx, &peer_sites);
+ if (r < 0) {
+ return r;
+ }
+
+ peers->clear();
+ peers->reserve(peer_sites.size());
+ for (auto& peer_site : peer_sites) {
+ peers->push_back({peer_site.uuid, peer_site.site_name,
+ peer_site.client_name});
+ }
+ return 0;
+ }
+
+ int RBD::mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &client_name) {
+ return librbd::api::Mirror<>::peer_site_set_client(io_ctx, uuid,
+ client_name);
+ }
+
+ int RBD::mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &cluster_name) {
+ return librbd::api::Mirror<>::peer_site_set_name(io_ctx, uuid,
+ cluster_name);
+ }
+
+ int RBD::mirror_peer_get_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ std::map<std::string, std::string> *key_vals) {
+ return librbd::api::Mirror<>::peer_site_get_attributes(io_ctx, uuid,
+ key_vals);
+ }
+
+ int RBD::mirror_peer_set_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ const std::map<std::string, std::string>& key_vals) {
+ return librbd::api::Mirror<>::peer_site_set_attributes(io_ctx, uuid,
+ key_vals);
+ }
+
+ int RBD::mirror_image_status_list(IoCtx& io_ctx, const std::string &start_id,
+ size_t max, std::map<std::string, mirror_image_status_t> *images) {
+ std::map<std::string, mirror_image_global_status_t> global_statuses;
+
+ int r = librbd::api::Mirror<>::image_global_status_list(
+ io_ctx, start_id, max, &global_statuses);
+ if (r < 0) {
+ return r;
+ }
+
+ images->clear();
+ for (auto &[id, global_status] : global_statuses) {
+ if (global_status.site_statuses.empty() ||
+ global_status.site_statuses[0].mirror_uuid !=
+ cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID) {
+ continue;
+ }
+
+ auto& site_status = global_status.site_statuses[0];
+ (*images)[id] = mirror_image_status_t{
+ global_status.name, global_status.info, site_status.state,
+ site_status.description, site_status.last_update, site_status.up};
+ }
+
+ return 0;
+ }
+
+#pragma GCC diagnostic pop
+
+ int RBD::mirror_image_status_summary(IoCtx& io_ctx,
+ std::map<mirror_image_status_state_t, int> *states) {
+ return librbd::api::Mirror<>::image_status_summary(io_ctx, states);
+ }
+
+ int RBD::mirror_image_instance_id_list(IoCtx& io_ctx,
+ const std::string &start_id, size_t max,
+ std::map<std::string, std::string> *instance_ids) {
+ return librbd::api::Mirror<>::image_instance_id_list(io_ctx, start_id, max,
+ instance_ids);
+ }
+
+ int RBD::mirror_image_info_list(
+ IoCtx& io_ctx, mirror_image_mode_t *mode_filter,
+ const std::string &start_id, size_t max,
+ std::map<std::string, std::pair<mirror_image_mode_t,
+ mirror_image_info_t>> *entries) {
+ return librbd::api::Mirror<>::image_info_list(io_ctx, mode_filter, start_id,
+ max, entries);
+ }
+
+ int RBD::group_create(IoCtx& io_ctx, const char *group_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_create_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), group_name);
+ int r = librbd::api::Group<>::create(io_ctx, group_name);
+ tracepoint(librbd, group_create_exit, r);
+ return r;
+ }
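+
+ // A group bundles images so they can be snapshotted together as a unit.
+ // A minimal lifecycle sketch using the calls in this section:
+ //
+ //   rbd.group_create(io_ctx, "mygroup");
+ //   rbd.group_image_add(io_ctx, "mygroup", io_ctx, "myimage");
+ //   rbd.group_snap_create(io_ctx, "mygroup", "snap1");
+ //   rbd.group_snap_remove(io_ctx, "mygroup", "snap1");
+ //   rbd.group_remove(io_ctx, "mygroup");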
+
+ int RBD::group_remove(IoCtx& io_ctx, const char *group_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_remove_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), group_name);
+ int r = librbd::api::Group<>::remove(io_ctx, group_name);
+ tracepoint(librbd, group_remove_exit, r);
+ return r;
+ }
+
+ int RBD::group_list(IoCtx& io_ctx, vector<string> *names)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_list_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id());
+
+ int r = librbd::api::Group<>::list(io_ctx, names);
+ if (r >= 0) {
+ for (const auto& itr : *names) {
+ tracepoint(librbd, group_list_entry, itr.c_str());
+ }
+ }
+ tracepoint(librbd, group_list_exit, r);
+ return r;
+ }
+
+ int RBD::group_rename(IoCtx& io_ctx, const char *src_name,
+ const char *dest_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_rename_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), src_name, dest_name);
+ int r = librbd::api::Group<>::rename(io_ctx, src_name, dest_name);
+ tracepoint(librbd, group_rename_exit, r);
+ return r;
+ }
+
+ int RBD::group_image_add(IoCtx& group_ioctx, const char *group_name,
+ IoCtx& image_ioctx, const char *image_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_image_add_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name,
+ image_ioctx.get_pool_name().c_str(),
+ image_ioctx.get_id(), image_name);
+ int r = librbd::api::Group<>::image_add(group_ioctx, group_name,
+ image_ioctx, image_name);
+ tracepoint(librbd, group_image_add_exit, r);
+ return r;
+ }
+
+ int RBD::group_image_remove(IoCtx& group_ioctx, const char *group_name,
+ IoCtx& image_ioctx, const char *image_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_image_remove_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name,
+ image_ioctx.get_pool_name().c_str(),
+ image_ioctx.get_id(), image_name);
+ int r = librbd::api::Group<>::image_remove(group_ioctx, group_name,
+ image_ioctx, image_name);
+ tracepoint(librbd, group_image_remove_exit, r);
+ return r;
+ }
+
+ int RBD::group_image_remove_by_id(IoCtx& group_ioctx, const char *group_name,
+ IoCtx& image_ioctx, const char *image_id)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_image_remove_by_id_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name,
+ image_ioctx.get_pool_name().c_str(),
+ image_ioctx.get_id(), image_id);
+ int r = librbd::api::Group<>::image_remove_by_id(group_ioctx, group_name,
+ image_ioctx, image_id);
+ tracepoint(librbd, group_image_remove_by_id_exit, r);
+ return r;
+ }
+
+ int RBD::group_image_list(IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_image_info_t> *images,
+ size_t group_image_info_size)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_image_list_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name);
+
+ if (group_image_info_size != sizeof(group_image_info_t)) {
+ tracepoint(librbd, group_image_list_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ int r = librbd::api::Group<>::image_list(group_ioctx, group_name, images);
+ tracepoint(librbd, group_image_list_exit, r);
+ return r;
+ }
+
+ int RBD::group_snap_create(IoCtx& group_ioctx, const char *group_name,
+ const char *snap_name) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_create_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+ int r = librbd::api::Group<>::snap_create(group_ioctx, group_name,
+ snap_name, 0);
+ tracepoint(librbd, group_snap_create_exit, r);
+ return r;
+ }
+
+ int RBD::group_snap_create2(IoCtx& group_ioctx, const char *group_name,
+ const char *snap_name, uint32_t flags) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_create_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+ int r = librbd::api::Group<>::snap_create(group_ioctx, group_name,
+ snap_name, flags);
+ tracepoint(librbd, group_snap_create_exit, r);
+ return r;
+ }
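+
+ // group_snap_create() is the flags == 0 convenience form of
+ // group_snap_create2(); the flags argument takes the RBD_SNAP_CREATE_*
+ // bits from librbd.h (e.g. to skip quiesce hooks), assuming the caller's
+ // librbd headers expose them.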
+
+ int RBD::group_snap_remove(IoCtx& group_ioctx, const char *group_name,
+ const char *snap_name) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_remove_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+ int r = librbd::api::Group<>::snap_remove(group_ioctx, group_name,
+ snap_name);
+ tracepoint(librbd, group_snap_remove_exit, r);
+ return r;
+ }
+
+ int RBD::group_snap_list(IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_snap_info_t> *snaps,
+ size_t group_snap_info_size)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_list_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name);
+
+ if (group_snap_info_size != sizeof(group_snap_info_t)) {
+ tracepoint(librbd, group_snap_list_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, snaps);
+ tracepoint(librbd, group_snap_list_exit, r);
+ return r;
+ }
+
+ int RBD::group_snap_rename(IoCtx& group_ioctx, const char *group_name,
+ const char *old_snap_name,
+ const char *new_snap_name)
+ {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_rename_enter,
+ group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+ group_name, old_snap_name, new_snap_name);
+ int r = librbd::api::Group<>::snap_rename(group_ioctx, group_name,
+ old_snap_name, new_snap_name);
+ tracepoint(librbd, group_snap_rename_exit, r);
+ return r;
+ }
+
+ int RBD::group_snap_rollback(IoCtx& group_ioctx, const char *group_name,
+ const char *snap_name) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_rollback_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name,
+ snap_name, prog_ctx);
+ tracepoint(librbd, group_snap_rollback_exit, r);
+ return r;
+ }
+
+ int RBD::group_snap_rollback_with_progress(IoCtx& group_ioctx,
+ const char *group_name,
+ const char *snap_name,
+ ProgressContext& prog_ctx) {
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_rollback_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+ int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name,
+ snap_name, prog_ctx);
+ tracepoint(librbd, group_snap_rollback_exit, r);
+ return r;
+ }
+
+ int RBD::pool_metadata_get(IoCtx& ioctx, const std::string &key,
+ std::string *value)
+ {
+ int r = librbd::api::PoolMetadata<>::get(ioctx, key, value);
+ return r;
+ }
+
+ int RBD::pool_metadata_set(IoCtx& ioctx, const std::string &key,
+ const std::string &value)
+ {
+ int r = librbd::api::PoolMetadata<>::set(ioctx, key, value);
+ return r;
+ }
+
+ int RBD::pool_metadata_remove(IoCtx& ioctx, const std::string &key)
+ {
+ int r = librbd::api::PoolMetadata<>::remove(ioctx, key);
+ return r;
+ }
+
+ int RBD::pool_metadata_list(IoCtx& ioctx, const std::string &start,
+ uint64_t max, map<string, bufferlist> *pairs)
+ {
+ int r = librbd::api::PoolMetadata<>::list(ioctx, start, max, pairs);
+ return r;
+ }
+
+ int RBD::config_list(IoCtx& io_ctx, std::vector<config_option_t> *options) {
+ return librbd::api::Config<>::list(io_ctx, options);
+ }
+
+ RBD::AioCompletion::AioCompletion(void *cb_arg, callback_t complete_cb)
+ {
+ auto aio_comp = librbd::io::AioCompletion::create(
+ cb_arg, complete_cb, this);
+ aio_comp->external_callback = true;
+ pc = reinterpret_cast<void*>(aio_comp);
+ }
+
+ bool RBD::AioCompletion::is_complete()
+ {
+ librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc;
+ return c->is_complete();
+ }
+
+ int RBD::AioCompletion::wait_for_complete()
+ {
+ librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc;
+ return c->wait_for_complete();
+ }
+
+ ssize_t RBD::AioCompletion::get_return_value()
+ {
+ librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc;
+ return c->get_return_value();
+ }
+
+ void *RBD::AioCompletion::get_arg()
+ {
+ librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc;
+ return c->get_arg();
+ }
+
+ void RBD::AioCompletion::release()
+ {
+ librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc;
+ c->release();
+ delete this;
+ }
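+
+ // Lifecycle note: release() drops the reference on the wrapped
+ // io::AioCompletion and deletes this wrapper, so a completion must not
+ // be touched after release(). Typical pattern:
+ //
+ //   auto *comp = new librbd::RBD::AioCompletion(nullptr, nullptr);
+ //   // ... issue an aio_* call with comp ...
+ //   comp->wait_for_complete();
+ //   ssize_t r = comp->get_return_value();
+ //   comp->release();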
+
+ /*
+ ImageOptions
+ */
+
+ ImageOptions::ImageOptions()
+ {
+ librbd::image_options_create(&opts);
+ }
+
+ ImageOptions::ImageOptions(rbd_image_options_t opts_)
+ {
+ librbd::image_options_create_ref(&opts, opts_);
+ }
+
+ ImageOptions::ImageOptions(const ImageOptions &imgopts)
+ {
+ librbd::image_options_copy(&opts, imgopts);
+ }
+
+ ImageOptions::~ImageOptions()
+ {
+ librbd::image_options_destroy(opts);
+ }
+
+ int ImageOptions::set(int optname, const std::string& optval)
+ {
+ return librbd::image_options_set(opts, optname, optval);
+ }
+
+ int ImageOptions::set(int optname, uint64_t optval)
+ {
+ return librbd::image_options_set(opts, optname, optval);
+ }
+
+ int ImageOptions::get(int optname, std::string* optval) const
+ {
+ return librbd::image_options_get(opts, optname, optval);
+ }
+
+ int ImageOptions::get(int optname, uint64_t* optval) const
+ {
+ return librbd::image_options_get(opts, optname, optval);
+ }
+
+ int ImageOptions::is_set(int optname, bool* is_set)
+ {
+ return librbd::image_options_is_set(opts, optname, is_set);
+ }
+
+ int ImageOptions::unset(int optname)
+ {
+ return librbd::image_options_unset(opts, optname);
+ }
+
+ void ImageOptions::clear()
+ {
+ librbd::image_options_clear(opts);
+ }
+
+ bool ImageOptions::empty() const
+ {
+ return librbd::image_options_is_empty(opts);
+ }
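+
+ // Ownership note: the rbd_image_options_t constructor takes a reference
+ // on an existing handle (image_options_create_ref) rather than copying
+ // it, so that ImageOptions shares state with its source; the copy
+ // constructor performs a real copy instead.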
+
+ /*
+ Image
+ */
+
+ Image::Image() : ctx(NULL)
+ {
+ }
+
+ Image::~Image()
+ {
+ close();
+ }
+
+ int Image::close()
+ {
+ int r = 0;
+ if (ctx) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+
+ r = ictx->state->close();
+ ctx = NULL;
+
+ tracepoint(librbd, close_image_exit, r);
+ }
+ return r;
+ }
+
+ int Image::aio_close(RBD::AioCompletion *c)
+ {
+ if (!ctx) {
+ return -EINVAL;
+ }
+
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), c->pc);
+
+ ictx->state->close(new C_AioCompletion(ictx, librbd::io::AIO_TYPE_CLOSE,
+ get_aio_completion(c)));
+ ctx = NULL;
+
+ tracepoint(librbd, aio_close_image_exit, 0);
+ return 0;
+ }
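+
+ // close() is also invoked implicitly by ~Image(), so calling it
+ // explicitly matters only when the return code is needed; aio_close()
+ // instead rejects a never-opened image with -EINVAL.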
+
+ int Image::resize(uint64_t size)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = ictx->operations->resize(size, true, prog_ctx);
+ tracepoint(librbd, resize_exit, r);
+ return r;
+ }
+
+ int Image::resize2(uint64_t size, bool allow_shrink, librbd::ProgressContext& pctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size);
+ int r = ictx->operations->resize(size, allow_shrink, pctx);
+ tracepoint(librbd, resize_exit, r);
+ return r;
+ }
+
+ int Image::resize_with_progress(uint64_t size, librbd::ProgressContext& pctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size);
+ int r = ictx->operations->resize(size, true, pctx);
+ tracepoint(librbd, resize_exit, r);
+ return r;
+ }
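+
+ // resize() and resize_with_progress() always permit shrinking (they pass
+ // allow_shrink = true); use resize2() with allow_shrink = false to make a
+ // shrink attempt return an error instead. For example:
+ //
+ //   image.resize2(new_size, false /*allow_shrink*/, prog_ctx);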
+
+ int Image::stat(image_info_t& info, size_t infosize)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, stat_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::info(ictx, info, infosize);
+ tracepoint(librbd, stat_exit, r, &info);
+ return r;
+ }
+
+ int Image::old_format(uint8_t *old)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_old_format_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::get_old_format(ictx, old);
+ tracepoint(librbd, get_old_format_exit, r, *old);
+ return r;
+ }
+
+ int Image::size(uint64_t *size)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_size_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::get_size(ictx, size);
+ tracepoint(librbd, get_size_exit, r, *size);
+ return r;
+ }
+
+ int Image::get_group(group_info_t *group_info, size_t group_info_size)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, image_get_group_enter, ictx->name.c_str());
+
+ if (group_info_size != sizeof(group_info_t)) {
+ tracepoint(librbd, image_get_group_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ int r = librbd::api::Group<>::image_get_group(ictx, group_info);
+ tracepoint(librbd, image_get_group_exit, r);
+ return r;
+ }
+
+ int Image::features(uint64_t *features)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_features_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::get_features(ictx, features);
+ tracepoint(librbd, get_features_exit, r, *features);
+ return r;
+ }
+
+ int Image::update_features(uint64_t features, bool enabled)
+ {
+ ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+ tracepoint(librbd, update_features_enter, ictx, features, enabled);
+ int r = ictx->operations->update_features(features, enabled);
+ tracepoint(librbd, update_features_exit, r);
+ return r;
+ }
+
+ int Image::get_op_features(uint64_t *op_features)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Image<>::get_op_features(ictx, op_features);
+ }
+
+ uint64_t Image::get_stripe_unit() const
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_stripe_unit_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ uint64_t stripe_unit = ictx->get_stripe_unit();
+ tracepoint(librbd, get_stripe_unit_exit, 0, stripe_unit);
+ return stripe_unit;
+ }
+
+ uint64_t Image::get_stripe_count() const
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_stripe_count_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ uint64_t stripe_count = ictx->get_stripe_count();
+ tracepoint(librbd, get_stripe_count_exit, 0, stripe_count);
+ return stripe_count;
+ }
+
+ int Image::get_create_timestamp(struct timespec *timestamp)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_create_timestamp_enter, ictx, ictx->name.c_str(),
+ ictx->read_only);
+ utime_t time = ictx->get_create_timestamp();
+ time.to_timespec(timestamp);
+ tracepoint(librbd, get_create_timestamp_exit, 0, timestamp);
+ return 0;
+ }
+
+ int Image::get_access_timestamp(struct timespec *timestamp)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_access_timestamp_enter, ictx, ictx->name.c_str(),
+ ictx->read_only);
+ {
+ std::shared_lock timestamp_locker{ictx->timestamp_lock};
+ utime_t time = ictx->get_access_timestamp();
+ time.to_timespec(timestamp);
+ }
+ tracepoint(librbd, get_access_timestamp_exit, 0, timestamp);
+ return 0;
+ }
+
+ int Image::get_modify_timestamp(struct timespec *timestamp)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_modify_timestamp_enter, ictx, ictx->name.c_str(),
+ ictx->read_only);
+ {
+ std::shared_lock timestamp_locker{ictx->timestamp_lock};
+ utime_t time = ictx->get_modify_timestamp();
+ time.to_timespec(timestamp);
+ }
+ tracepoint(librbd, get_modify_timestamp_exit, 0, timestamp);
+ return 0;
+ }
+
+ int Image::overlap(uint64_t *overlap)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_overlap_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::get_overlap(ictx, overlap);
+ tracepoint(librbd, get_overlap_exit, r, *overlap);
+ return r;
+ }
+
+ int Image::get_name(std::string *name)
+ {
+ ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+ *name = ictx->name;
+ return 0;
+ }
+
+ int Image::get_id(std::string *id)
+ {
+ ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+ if (ictx->old_format) {
+ return -EINVAL;
+ }
+ *id = ictx->id;
+ return 0;
+ }
+
+ std::string Image::get_block_name_prefix()
+ {
+ ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+ return ictx->object_prefix;
+ }
+
+ int64_t Image::get_data_pool_id()
+ {
+ ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+ return librbd::api::Image<>::get_data_pool_id(ictx);
+ }
+
+ int Image::parent_info(string *parent_pool_name, string *parent_name,
+ string *parent_snap_name)
+ {
+ librbd::linked_image_spec_t parent_image;
+ librbd::snap_spec_t parent_snap;
+ int r = get_parent(&parent_image, &parent_snap);
+ if (r >= 0) {
+ if (parent_pool_name != nullptr) {
+ *parent_pool_name = parent_image.pool_name;
+ }
+ if (parent_name != nullptr) {
+ *parent_name = parent_image.image_name;
+ }
+ if (parent_snap_name != nullptr) {
+ *parent_snap_name = parent_snap.name;
+ }
+ }
+ return r;
+ }
+
+ int Image::parent_info2(string *parent_pool_name, string *parent_name,
+ string *parent_id, string *parent_snap_name)
+ {
+ librbd::linked_image_spec_t parent_image;
+ librbd::snap_spec_t parent_snap;
+ int r = get_parent(&parent_image, &parent_snap);
+ if (r >= 0) {
+ if (parent_pool_name != nullptr) {
+ *parent_pool_name = parent_image.pool_name;
+ }
+ if (parent_name != nullptr) {
+ *parent_name = parent_image.image_name;
+ }
+ if (parent_id != nullptr) {
+ *parent_id = parent_image.image_id;
+ }
+ if (parent_snap_name != nullptr) {
+ *parent_snap_name = parent_snap.name;
+ }
+ }
+ return r;
+ }
+
+ int Image::get_parent(linked_image_spec_t *parent_image,
+ snap_spec_t *parent_snap)
+ {
+ auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+ tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+
+ int r = librbd::api::Image<>::get_parent(ictx, parent_image, parent_snap);
+
+ tracepoint(librbd, get_parent_info_exit, r,
+ parent_image->pool_name.c_str(),
+ parent_image->image_name.c_str(),
+ parent_image->image_id.c_str(),
+ parent_snap->name.c_str());
+ return r;
+ }
+
+ int Image::get_migration_source_spec(std::string* source_spec)
+ {
+ auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+ return librbd::api::Migration<>::get_source_spec(ictx, source_spec);
+ }
+
+ int Image::get_flags(uint64_t *flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, get_flags_enter, ictx);
+ int r = librbd::get_flags(ictx, flags);
+ tracepoint(librbd, get_flags_exit, ictx, r, *flags);
+ return r;
+ }
+
+ int Image::set_image_notification(int fd, int type)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, set_image_notification_enter, ictx, fd, type);
+ int r = librbd::set_image_notification(ictx, fd, type);
+ tracepoint(librbd, set_image_notification_exit, ictx, r);
+ return r;
+ }
+
+ int Image::is_exclusive_lock_owner(bool *is_owner)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, is_exclusive_lock_owner_enter, ictx);
+ int r = librbd::is_exclusive_lock_owner(ictx, is_owner);
+ tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner);
+ return r;
+ }
+
+ int Image::lock_acquire(rbd_lock_mode_t lock_mode)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, lock_acquire_enter, ictx, lock_mode);
+ int r = librbd::lock_acquire(ictx, lock_mode);
+ tracepoint(librbd, lock_acquire_exit, ictx, r);
+ return r;
+ }
+
+ int Image::lock_release()
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, lock_release_enter, ictx);
+ int r = librbd::lock_release(ictx);
+ tracepoint(librbd, lock_release_exit, ictx, r);
+ return r;
+ }
+
+ int Image::lock_get_owners(rbd_lock_mode_t *lock_mode,
+ std::list<std::string> *lock_owners)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, lock_get_owners_enter, ictx);
+ int r = librbd::lock_get_owners(ictx, lock_mode, lock_owners);
+ tracepoint(librbd, lock_get_owners_exit, ictx, r);
+ return r;
+ }
+
+ int Image::lock_break(rbd_lock_mode_t lock_mode,
+ const std::string &lock_owner)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner.c_str());
+ int r = librbd::lock_break(ictx, lock_mode, lock_owner);
+ tracepoint(librbd, lock_break_exit, ictx, r);
+ return r;
+ }
+
+ int Image::rebuild_object_map(ProgressContext &prog_ctx)
+ {
+ ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+ return ictx->operations->rebuild_object_map(prog_ctx);
+ }
+
+ int Image::check_object_map(ProgressContext &prog_ctx)
+ {
+ ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+ return ictx->operations->check_object_map(prog_ctx);
+ }
+
+ int Image::copy(IoCtx& dest_io_ctx, const char *destname)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+ ImageOptions opts;
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0);
+ tracepoint(librbd, copy_exit, r);
+ return r;
+ }
+
+ int Image::copy2(Image& dest)
+ {
+ ImageCtx *srcctx = (ImageCtx *)ctx;
+ ImageCtx *destctx = (ImageCtx *)dest.ctx;
+ tracepoint(librbd, copy2_enter, srcctx, srcctx->name.c_str(), srcctx->snap_name.c_str(), srcctx->read_only, destctx, destctx->name.c_str(), destctx->snap_name.c_str(), destctx->read_only);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::copy(srcctx, destctx, prog_ctx, 0);
+ tracepoint(librbd, copy2_exit, r);
+ return r;
+ }
+
+ int Image::copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0);
+ tracepoint(librbd, copy3_exit, r);
+ return r;
+ }
+
+ int Image::copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts, size_t sparse_size)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts, sparse_size);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, sparse_size);
+ tracepoint(librbd, copy4_exit, r);
+ return r;
+ }
+
+ int Image::copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+ librbd::ProgressContext &pctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+ ImageOptions opts;
+ int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, 0);
+ tracepoint(librbd, copy_exit, r);
+ return r;
+ }
+
+ int Image::copy_with_progress2(Image& dest, librbd::ProgressContext &pctx)
+ {
+ ImageCtx *srcctx = (ImageCtx *)ctx;
+ ImageCtx *destctx = (ImageCtx *)dest.ctx;
+ tracepoint(librbd, copy2_enter, srcctx, srcctx->name.c_str(), srcctx->snap_name.c_str(), srcctx->read_only, destctx, destctx->name.c_str(), destctx->snap_name.c_str(), destctx->read_only);
+ int r = librbd::copy(srcctx, destctx, pctx, 0);
+ tracepoint(librbd, copy2_exit, r);
+ return r;
+ }
+
+ int Image::copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts,
+ librbd::ProgressContext &pctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts);
+ int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, 0);
+ tracepoint(librbd, copy3_exit, r);
+ return r;
+ }
+
+ int Image::copy_with_progress4(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts,
+ librbd::ProgressContext &pctx,
+ size_t sparse_size)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts, sparse_size);
+ int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, sparse_size);
+ tracepoint(librbd, copy4_exit, r);
+ return r;
+ }
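+
+ // The numbered copy variants mirror the create/clone pattern: copy3()
+ // adds ImageOptions and copy4() additionally takes a sparse_size hint
+ // (the granularity at which zeroed extents are detected and skipped);
+ // the *_with_progress forms differ only in reporting through a
+ // caller-supplied ProgressContext.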
+
+ int Image::deep_copy(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only,
+ dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+ destname, opts.opts);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts,
+ prog_ctx);
+ tracepoint(librbd, deep_copy_exit, r);
+ return r;
+ }
+
+ int Image::deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts,
+ librbd::ProgressContext &prog_ctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only,
+ dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+ destname, opts.opts);
+ int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts,
+ prog_ctx);
+ tracepoint(librbd, deep_copy_exit, r);
+ return r;
+ }
+
+ int Image::encryption_format(encryption_format_t format,
+ encryption_options_t opts,
+ size_t opts_size)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Image<>::encryption_format(
+ ictx, format, opts, opts_size, false);
+ }
+
+ int Image::encryption_load(encryption_format_t format,
+ encryption_options_t opts,
+ size_t opts_size)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Image<>::encryption_load(
+ ictx, format, opts, opts_size, false);
+ }
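+
+ // Both encryption entry points forward to the templated Image API; the
+ // trailing false appears to mark this as the C++ (rather than C) calling
+ // convention, so opts/opts_size are interpreted as the C++ option types
+ // here.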
+
+ int Image::flatten()
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+ librbd::NoOpProgressContext prog_ctx;
+ int r = ictx->operations->flatten(prog_ctx);
+ tracepoint(librbd, flatten_exit, r);
+ return r;
+ }
+
+ int Image::flatten_with_progress(librbd::ProgressContext& prog_ctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+ int r = ictx->operations->flatten(prog_ctx);
+ tracepoint(librbd, flatten_exit, r);
+ return r;
+ }
+
+ int Image::sparsify(size_t sparse_size)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+ ictx->id.c_str());
+ librbd::NoOpProgressContext prog_ctx;
+ int r = ictx->operations->sparsify(sparse_size, prog_ctx);
+ tracepoint(librbd, sparsify_exit, r);
+ return r;
+ }
+
+ int Image::sparsify_with_progress(size_t sparse_size,
+ librbd::ProgressContext& prog_ctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+ ictx->id.c_str());
+ int r = ictx->operations->sparsify(sparse_size, prog_ctx);
+ tracepoint(librbd, sparsify_exit, r);
+ return r;
+ }
+
+ int Image::list_children(set<pair<string, string> > *children)
+ {
+ std::vector<linked_image_spec_t> images;
+ int r = list_children3(&images);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto& image : images) {
+ if (!image.trash) {
+ children->insert({image.pool_name, image.image_name});
+ }
+ }
+ return 0;
+ }
+
+ int Image::list_children2(vector<librbd::child_info_t> *children)
+ {
+ std::vector<linked_image_spec_t> images;
+ int r = list_children3(&images);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto& image : images) {
+ children->push_back({
+ .pool_name = image.pool_name,
+ .image_name = image.image_name,
+ .image_id = image.image_id,
+ .trash = image.trash});
+ }
+
+ return 0;
+ }
+
+ int Image::list_children3(std::vector<linked_image_spec_t> *images)
+ {
+ auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+ tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+
+ int r = librbd::api::Image<>::list_children(ictx, images);
+#ifdef WITH_LTTNG
+ if (r >= 0) {
+ for (auto& it : *images) {
+ tracepoint(librbd, list_children_entry, it.pool_name.c_str(),
+ it.image_name.c_str());
+ }
+ }
+#endif
+ tracepoint(librbd, list_children_exit, r);
+ return r;
+ }
+
+ int Image::list_descendants(std::vector<linked_image_spec_t> *images)
+ {
+ auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+
+ images->clear();
+ int r = librbd::api::Image<>::list_descendants(ictx, {}, images);
+ return r;
+ }
+
+ int Image::list_lockers(std::list<librbd::locker_t> *lockers,
+ bool *exclusive, string *tag)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, list_lockers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::list_lockers(ictx, lockers, exclusive, tag);
+ if (r >= 0) {
+ for (std::list<librbd::locker_t>::const_iterator it = lockers->begin();
+ it != lockers->end(); ++it) {
+ tracepoint(librbd, list_lockers_entry, it->client.c_str(), it->cookie.c_str(), it->address.c_str());
+ }
+ }
+ tracepoint(librbd, list_lockers_exit, r);
+ return r;
+ }
+
+ int Image::lock_exclusive(const string& cookie)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, lock_exclusive_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str());
+ int r = librbd::lock(ictx, true, cookie, "");
+ tracepoint(librbd, lock_exclusive_exit, r);
+ return r;
+ }
+
+ int Image::lock_shared(const string& cookie, const std::string& tag)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, lock_shared_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str(), tag.c_str());
+ int r = librbd::lock(ictx, false, cookie, tag);
+ tracepoint(librbd, lock_shared_exit, r);
+ return r;
+ }
+
+ int Image::unlock(const string& cookie)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, unlock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str());
+ int r = librbd::unlock(ictx, cookie);
+ tracepoint(librbd, unlock_exit, r);
+ return r;
+ }
+
+ int Image::break_lock(const string& client, const string& cookie)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, break_lock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, client.c_str(), cookie.c_str());
+ int r = librbd::break_lock(ictx, client, cookie);
+ tracepoint(librbd, break_lock_exit, r);
+ return r;
+ }
+
+ int Image::snap_create(const char *snap_name)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ auto flags = librbd::util::get_default_snap_create_flags(ictx);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx);
+ tracepoint(librbd, snap_create_exit, r);
+ return r;
+ }
+
+ int Image::snap_create2(const char *snap_name, uint32_t flags,
+ ProgressContext& prog_ctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx);
+ tracepoint(librbd, snap_create_exit, r);
+ return r;
+ }
+
+ int Image::snap_remove(const char *snap_name)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_remove_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Snapshot<>::remove(ictx, snap_name, 0, prog_ctx);
+ tracepoint(librbd, snap_remove_exit, r);
+ return r;
+ }
+
+ int Image::snap_remove2(const char *snap_name, uint32_t flags, ProgressContext& pctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_remove2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name, flags);
+ int r = librbd::api::Snapshot<>::remove(ictx, snap_name, flags, pctx);
+ tracepoint(librbd, snap_remove_exit, r);
+ return r;
+ }
+
+ int Image::snap_remove_by_id(uint64_t snap_id)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Snapshot<>::remove(ictx, snap_id);
+ }
+
+ int Image::snap_rollback(const char *snap_name)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+ tracepoint(librbd, snap_rollback_exit, r);
+ return r;
+ }
+
+ int Image::snap_rename(const char *srcname, const char *dstname)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, srcname, dstname);
+ int r = ictx->operations->snap_rename(srcname, dstname);
+ tracepoint(librbd, snap_rename_exit, r);
+ return r;
+ }
+
+ int Image::snap_rollback_with_progress(const char *snap_name,
+ ProgressContext& prog_ctx)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+ tracepoint(librbd, snap_rollback_exit, r);
+ return r;
+ }
+
+ int Image::snap_protect(const char *snap_name)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_protect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = ictx->operations->snap_protect(cls::rbd::UserSnapshotNamespace(), snap_name);
+ tracepoint(librbd, snap_protect_exit, r);
+ return r;
+ }
+
+ int Image::snap_unprotect(const char *snap_name)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_unprotect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name);
+ tracepoint(librbd, snap_unprotect_exit, r);
+ return r;
+ }
+
+ int Image::snap_is_protected(const char *snap_name, bool *is_protected)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_is_protected_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = librbd::api::Snapshot<>::is_protected(ictx, snap_name, is_protected);
+ tracepoint(librbd, snap_is_protected_exit, r, *is_protected ? 1 : 0);
+ return r;
+ }
+
+ int Image::snap_list(vector<librbd::snap_info_t>& snaps)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_list_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, &snaps);
+ int r = librbd::api::Snapshot<>::list(ictx, snaps);
+ if (r >= 0) {
+ for (int i = 0, n = snaps.size(); i < n; i++) {
+ tracepoint(librbd, snap_list_entry, snaps[i].id, snaps[i].size, snaps[i].name.c_str());
+ }
+ }
+ tracepoint(librbd, snap_list_exit, r, snaps.size());
+ if (r >= 0) {
+ // A little ugly, but the C++ API doesn't need an Image::snap_list_end,
+ // and we want the tracepoints to mirror the C API
+ tracepoint(librbd, snap_list_end_enter, &snaps);
+ tracepoint(librbd, snap_list_end_exit);
+ }
+ return r;
+ }
+
+ bool Image::snap_exists(const char *snap_name)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ bool exists;
+ int r = librbd::api::Snapshot<>::exists(ictx, cls::rbd::UserSnapshotNamespace(), snap_name, &exists);
+ tracepoint(librbd, snap_exists_exit, r, exists);
+ if (r < 0) {
+ // lie to caller since we don't know the real answer yet.
+ return false;
+ }
+ return exists;
+ }
+
+ // A safer version of snap_exists.
+ int Image::snap_exists2(const char *snap_name, bool *exists)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = librbd::api::Snapshot<>::exists(ictx, cls::rbd::UserSnapshotNamespace(), snap_name, exists);
+ tracepoint(librbd, snap_exists_exit, r, *exists);
+ return r;
+ }
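+
+ // Usage sketch (illustrative): unlike snap_exists(), which collapses
+ // errors into 'false', snap_exists2() keeps the error path separate:
+ //
+ //   bool exists = false;
+ //   int r = image.snap_exists2("nightly", &exists);
+ //   if (r < 0) { /* lookup failed; 'exists' is not meaningful */ }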
+
+ int Image::snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_get_timestamp_enter, ictx, ictx->name.c_str());
+ int r = librbd::api::Snapshot<>::get_timestamp(ictx, snap_id, timestamp);
+ tracepoint(librbd, snap_get_timestamp_exit, r);
+ return r;
+ }
+
+ int Image::snap_get_limit(uint64_t *limit)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_get_limit_enter, ictx, ictx->name.c_str());
+ int r = librbd::api::Snapshot<>::get_limit(ictx, limit);
+ tracepoint(librbd, snap_get_limit_exit, r, *limit);
+ return r;
+ }
+
+ int Image::snap_get_namespace_type(uint64_t snap_id,
+ snap_namespace_type_t *namespace_type) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_get_namespace_type_enter, ictx, ictx->name.c_str());
+ int r = librbd::api::Snapshot<>::get_namespace_type(ictx, snap_id, namespace_type);
+ tracepoint(librbd, snap_get_namespace_type_exit, r);
+ return r;
+ }
+
+ int Image::snap_get_group_namespace(uint64_t snap_id,
+ snap_group_namespace_t *group_snap,
+ size_t group_snap_size) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_get_group_namespace_enter, ictx,
+ ictx->name.c_str());
+
+ if (group_snap_size != sizeof(snap_group_namespace_t)) {
+ tracepoint(librbd, snap_get_group_namespace_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ int r = librbd::api::Snapshot<>::get_group_namespace(ictx, snap_id,
+ group_snap);
+ tracepoint(librbd, snap_get_group_namespace_exit, r);
+ return r;
+ }
+
+ int Image::snap_get_trash_namespace(uint64_t snap_id,
+ std::string* original_name) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Snapshot<>::get_trash_namespace(ictx, snap_id,
+ original_name);
+ }
+
+ int Image::snap_get_mirror_namespace(
+ uint64_t snap_id, snap_mirror_namespace_t *mirror_snap,
+ size_t mirror_snap_size) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ if (mirror_snap_size != sizeof(snap_mirror_namespace_t)) {
+ return -ERANGE;
+ }
+
+ int r = librbd::api::Snapshot<>::get_mirror_namespace(
+ ictx, snap_id, mirror_snap);
+ return r;
+ }
+
+ int Image::snap_set_limit(uint64_t limit)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ tracepoint(librbd, snap_set_limit_enter, ictx, ictx->name.c_str(), limit);
+ int r = ictx->operations->snap_set_limit(limit);
+ tracepoint(librbd, snap_set_limit_exit, r);
+ return r;
+ }
+
+ int Image::snap_set(const char *snap_name)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, snap_set_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = librbd::api::Image<>::snap_set(
+ ictx, cls::rbd::UserSnapshotNamespace(), snap_name);
+ tracepoint(librbd, snap_set_exit, r);
+ return r;
+ }
+
+ int Image::snap_set_by_id(uint64_t snap_id)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Image<>::snap_set(ictx, snap_id);
+ }
+
+ int Image::snap_get_name(uint64_t snap_id, std::string *snap_name)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Snapshot<>::get_name(ictx, snap_id, snap_name);
+ }
+
+ int Image::snap_get_id(const std::string snap_name, uint64_t *snap_id)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Snapshot<>::get_id(ictx, snap_name, snap_id);
+ }
+
+ ssize_t Image::read(uint64_t ofs, size_t len, bufferlist& bl)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+ bufferptr ptr(len);
+ bl.push_back(std::move(ptr));
+
+ int r = api::Io<>::read(*ictx, ofs, len, io::ReadResult{&bl}, 0);
+ tracepoint(librbd, read_exit, r);
+ return r;
+ }
+
+ ssize_t Image::read2(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, ofs, len, op_flags);
+ bufferptr ptr(len);
+ bl.push_back(std::move(ptr));
+
+ int r = api::Io<>::read(*ictx, ofs, len, io::ReadResult{&bl}, op_flags);
+ tracepoint(librbd, read_exit, r);
+ return r;
+ }
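+
+ // Usage sketch (illustrative; assumes an open librbd::Image 'image').
+ // The bufferlist is populated by the call; a negative return is an
+ // error, otherwise the number of bytes read:
+ //
+ //   ceph::bufferlist bl;
+ //   ssize_t n = image.read2(0 /* ofs */, 4096 /* len */, bl,
+ //                           LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL);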
+
+ int64_t Image::read_iterate(uint64_t ofs, size_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, read_iterate_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+
+ int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg);
+ tracepoint(librbd, read_iterate_exit, r);
+ return r;
+ }
+
+ int Image::read_iterate2(uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, read_iterate2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+
+ int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg);
+ if (r > 0)
+ r = 0;
+ tracepoint(librbd, read_iterate2_exit, r);
+ return (int)r;
+ }
+
+ int Image::diff_iterate(const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+ true, false);
+ int r = librbd::api::DiffIterate<>::diff_iterate(ictx,
+ cls::rbd::UserSnapshotNamespace(),
+ fromsnapname, ofs,
+ len, true, false, cb, arg);
+ tracepoint(librbd, diff_iterate_exit, r);
+ return r;
+ }
+
+ int Image::diff_iterate2(const char *fromsnapname, uint64_t ofs, uint64_t len,
+ bool include_parent, bool whole_object,
+ int (*cb)(uint64_t, size_t, int, void *), void *arg)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+ include_parent, whole_object);
+ int r = librbd::api::DiffIterate<>::diff_iterate(ictx,
+ cls::rbd::UserSnapshotNamespace(),
+ fromsnapname, ofs,
+ len, include_parent,
+ whole_object, cb, arg);
+ tracepoint(librbd, diff_iterate_exit, r);
+ return r;
+ }
+
+ ssize_t Image::write(uint64_t ofs, size_t len, bufferlist& bl)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len, bl.length() < len ? NULL : bl.c_str());
+ if (bl.length() < len) {
+ tracepoint(librbd, write_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ int r = api::Io<>::write(*ictx, ofs, len, bufferlist{bl}, 0);
+ tracepoint(librbd, write_exit, r);
+ return r;
+ }
+
+ ssize_t Image::write2(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only,
+ ofs, len, bl.length() < len ? NULL : bl.c_str(), op_flags);
+ if (bl.length() < len) {
+ tracepoint(librbd, write_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ int r = api::Io<>::write(*ictx, ofs, len, bufferlist{bl}, op_flags);
+ tracepoint(librbd, write_exit, r);
+ return r;
+ }
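+
+ // Usage sketch (illustrative): write2() rejects bufferlists shorter
+ // than 'len' with -EINVAL, as above; op_flags take the librados fadvise
+ // hints:
+ //
+ //   ceph::bufferlist bl;
+ //   bl.append(std::string(4096, 'x'));
+ //   ssize_t n = image.write2(0, bl.length(), bl,
+ //                            LIBRADOS_OP_FLAG_FADVISE_DONTNEED);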
+
+ int Image::discard(uint64_t ofs, uint64_t len)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+ if (len > static_cast<uint64_t>(std::numeric_limits<int32_t>::max())) {
+ tracepoint(librbd, discard_exit, -EINVAL);
+ return -EINVAL;
+ }
+ int r = api::Io<>::discard(
+ *ictx, ofs, len, ictx->discard_granularity_bytes);
+ tracepoint(librbd, discard_exit, r);
+ return r;
+ }
+
+ ssize_t Image::writesame(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, ofs, len, bl.length() == 0 ? NULL : bl.c_str(), bl.length(),
+ op_flags);
+ if (bl.length() == 0 || len % bl.length() ||
+ len > static_cast<size_t>(std::numeric_limits<int>::max())) {
+ tracepoint(librbd, writesame_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+ if (discard_zero && bl.is_zero()) {
+ int r = api::Io<>::write_zeroes(*ictx, ofs, len, 0U, op_flags);
+ tracepoint(librbd, writesame_exit, r);
+ return r;
+ }
+
+ int r = api::Io<>::write_same(*ictx, ofs, len, bufferlist{bl}, op_flags);
+ tracepoint(librbd, writesame_exit, r);
+ return r;
+ }
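+
+ // Usage sketch (illustrative): writesame() replicates the pattern in
+ // 'bl' across [ofs, ofs + len); per the check above, 'len' must be a
+ // non-zero multiple of the pattern length. A zeroed pattern may be
+ // turned into write_zeroes() when rbd_discard_on_zeroed_write_same is
+ // enabled:
+ //
+ //   ceph::bufferlist pattern;
+ //   pattern.append(std::string(512, '\0'));   // 512-byte zero pattern
+ //   ssize_t n = image.writesame(0, 1 << 20, pattern, 0 /* op_flags */);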
+
+ ssize_t Image::write_zeroes(uint64_t ofs, size_t len, int zero_flags,
+ int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return api::Io<>::write_zeroes(*ictx, ofs, len, zero_flags, op_flags);
+ }
+
+ ssize_t Image::compare_and_write(uint64_t ofs, size_t len,
+ ceph::bufferlist &cmp_bl, ceph::bufferlist& bl,
+ uint64_t *mismatch_off, int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(),
+ ictx->read_only, ofs, len, cmp_bl.length() < len ? NULL : cmp_bl.c_str(),
+ bl.length() < len ? NULL : bl.c_str(), op_flags);
+
+ if (bl.length() < len) {
+ tracepoint(librbd, write_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ int r = api::Io<>::compare_and_write(
+ *ictx, ofs, len, bufferlist{cmp_bl}, bufferlist{bl}, mismatch_off,
+ op_flags);
+
+ tracepoint(librbd, compare_and_write_exit, r);
+
+ return r;
+ }
+
+ int Image::aio_write(uint64_t off, size_t len, bufferlist& bl,
+ RBD::AioCompletion *c)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, bl.length() < len ? NULL : bl.c_str(), c->pc);
+ if (bl.length() < len) {
+ tracepoint(librbd, aio_write_exit, -EINVAL);
+ return -EINVAL;
+ }
+ api::Io<>::aio_write(*ictx, get_aio_completion(c), off, len, bufferlist{bl},
+ 0, true);
+
+ tracepoint(librbd, aio_write_exit, 0);
+ return 0;
+ }
+
+ int Image::aio_write2(uint64_t off, size_t len, bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, off, len, bl.length() < len ? NULL : bl.c_str(), c->pc, op_flags);
+ if (bl.length() < len) {
+ tracepoint(librbd, aio_write_exit, -EINVAL);
+ return -EINVAL;
+ }
+ api::Io<>::aio_write(*ictx, get_aio_completion(c), off, len, bufferlist{bl},
+ op_flags, true);
+
+ tracepoint(librbd, aio_write_exit, 0);
+ return 0;
+ }
+
+ int Image::aio_read(uint64_t off, size_t len, bufferlist& bl,
+ RBD::AioCompletion *c)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, bl.c_str(), c->pc);
+ ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~"
+ << (void *)(bl.c_str() + len - 1) << dendl;
+
+ api::Io<>::aio_read(*ictx, get_aio_completion(c), off, len,
+ io::ReadResult{&bl}, 0, true);
+ tracepoint(librbd, aio_read_exit, 0);
+ return 0;
+ }
+
+ int Image::aio_read2(uint64_t off, size_t len, bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, off, len, bl.c_str(), c->pc, op_flags);
+ ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~"
+ << (void *)(bl.c_str() + len - 1) << dendl;
+
+ api::Io<>::aio_read(*ictx, get_aio_completion(c), off, len,
+ io::ReadResult{&bl}, op_flags, true);
+ tracepoint(librbd, aio_read_exit, 0);
+ return 0;
+ }
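+
+ // Usage sketch (illustrative) of the asynchronous pattern shared by the
+ // aio_* methods, using the public RBD::AioCompletion API:
+ //
+ //   ceph::bufferlist bl;
+ //   auto *c = new librbd::RBD::AioCompletion(nullptr, nullptr);
+ //   int r = image.aio_read2(0, 4096, bl, c, 0);
+ //   if (r == 0) {
+ //     c->wait_for_complete();
+ //     ssize_t n = c->get_return_value();  // bytes read, or -errno
+ //   }
+ //   c->release();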
+
+ int Image::flush()
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = api::Io<>::flush(*ictx);
+ tracepoint(librbd, flush_exit, r);
+ return r;
+ }
+
+ int Image::aio_flush(RBD::AioCompletion *c)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc);
+ api::Io<>::aio_flush(*ictx, get_aio_completion(c), true);
+ tracepoint(librbd, aio_flush_exit, 0);
+ return 0;
+ }
+
+ int Image::aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, c->pc);
+ api::Io<>::aio_discard(
+ *ictx, get_aio_completion(c), off, len, ictx->discard_granularity_bytes,
+ true);
+ tracepoint(librbd, aio_discard_exit, 0);
+ return 0;
+ }
+
+ int Image::aio_writesame(uint64_t off, size_t len, bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, off, len, bl.length() == 0 ? NULL : bl.c_str(), bl.length(),
+ c->pc, op_flags);
+ if (bl.length() == 0 || len % bl.length()) {
+ tracepoint(librbd, aio_writesame_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+ if (discard_zero && bl.is_zero()) {
+ api::Io<>::aio_write_zeroes(*ictx, get_aio_completion(c), off, len, 0U,
+ op_flags, true);
+ tracepoint(librbd, aio_writesame_exit, 0);
+ return 0;
+ }
+
+ api::Io<>::aio_write_same(*ictx, get_aio_completion(c), off, len,
+ bufferlist{bl}, op_flags, true);
+ tracepoint(librbd, aio_writesame_exit, 0);
+ return 0;
+ }
+
+ int Image::aio_write_zeroes(uint64_t off, size_t len, RBD::AioCompletion *c,
+ int zero_flags, int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ api::Io<>::aio_write_zeroes(*ictx, get_aio_completion(c), off, len,
+ zero_flags, op_flags, true);
+ return 0;
+ }
+
+ int Image::aio_compare_and_write(uint64_t off, size_t len,
+ ceph::bufferlist& cmp_bl, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, uint64_t *mismatch_off,
+ int op_flags)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(),
+ ictx->read_only, off, len, cmp_bl.length() < len ? NULL : cmp_bl.c_str(),
+ bl.length() < len ? NULL : bl.c_str(), c->pc, op_flags);
+
+ if (bl.length() < len) {
+ tracepoint(librbd, compare_and_write_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ api::Io<>::aio_compare_and_write(*ictx, get_aio_completion(c), off, len,
+ bufferlist{cmp_bl}, bufferlist{bl},
+ mismatch_off, op_flags, false);
+
+ tracepoint(librbd, aio_compare_and_write_exit, 0);
+
+ return 0;
+ }
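+
+ // Note (hedged): on a comparison mismatch the request is expected to
+ // complete with -EILSEQ and, when 'mismatch_off' is non-null, to report
+ // the offset of the first differing byte within the request.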
+
+ int Image::invalidate_cache()
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, invalidate_cache_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::invalidate_cache(ictx);
+ tracepoint(librbd, invalidate_cache_exit, r);
+ return r;
+ }
+
+ int Image::poll_io_events(RBD::AioCompletion **comps, int numcomp)
+ {
+ io::AioCompletion *cs[numcomp];
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, poll_io_events_enter, ictx, numcomp);
+ int r = librbd::poll_io_events(ictx, cs, numcomp);
+ tracepoint(librbd, poll_io_events_exit, r);
+ if (r > 0) {
+ for (int i = 0; i < r; ++i)
+ comps[i] = (RBD::AioCompletion *)cs[i]->rbd_comp;
+ }
+ return r;
+ }
+
+ int Image::metadata_get(const std::string &key, std::string *value)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, metadata_get_enter, ictx, key.c_str());
+ int r = librbd::metadata_get(ictx, key, value);
+ if (r < 0) {
+ tracepoint(librbd, metadata_get_exit, r, key.c_str(), NULL);
+ } else {
+ tracepoint(librbd, metadata_get_exit, r, key.c_str(), value->c_str());
+ }
+ return r;
+ }
+
+ int Image::metadata_set(const std::string &key, const std::string &value)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, metadata_set_enter, ictx, key.c_str(), value.c_str());
+ int r = ictx->operations->metadata_set(key, value);
+ tracepoint(librbd, metadata_set_exit, r);
+ return r;
+ }
+
+ int Image::metadata_remove(const std::string &key)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, metadata_remove_enter, ictx, key.c_str());
+ int r = ictx->operations->metadata_remove(key);
+ tracepoint(librbd, metadata_remove_exit, r);
+ return r;
+ }
+
+ int Image::metadata_list(const std::string &start, uint64_t max, map<string, bufferlist> *pairs)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, metadata_list_enter, ictx);
+ int r = librbd::metadata_list(ictx, start, max, pairs);
+ if (r >= 0) {
+ for (map<string, bufferlist>::iterator it = pairs->begin();
+ it != pairs->end(); ++it) {
+ tracepoint(librbd, metadata_list_entry, it->first.c_str(), it->second.c_str());
+ }
+ }
+ tracepoint(librbd, metadata_list_exit, r);
+ return r;
+ }
+
+ int Image::mirror_image_enable() {
+ return mirror_image_enable2(RBD_MIRROR_IMAGE_MODE_JOURNAL);
+ }
+
+ int Image::mirror_image_enable2(mirror_image_mode_t mode) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Mirror<>::image_enable(ictx, mode, false);
+ }
+
+ int Image::mirror_image_disable(bool force) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Mirror<>::image_disable(ictx, force);
+ }
+
+ int Image::mirror_image_promote(bool force) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Mirror<>::image_promote(ictx, force);
+ }
+
+ int Image::mirror_image_demote() {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Mirror<>::image_demote(ictx);
+ }
+
+ int Image::mirror_image_resync()
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Mirror<>::image_resync(ictx);
+ }
+
+ int Image::mirror_image_create_snapshot(uint64_t *snap_id)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ auto flags = librbd::util::get_default_snap_create_flags(ictx);
+ return librbd::api::Mirror<>::image_snapshot_create(ictx, flags, snap_id);
+ }
+
+ int Image::mirror_image_create_snapshot2(uint32_t flags, uint64_t *snap_id)
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Mirror<>::image_snapshot_create(ictx, flags, snap_id);
+ }
+
+ int Image::mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+ size_t info_size) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ if (sizeof(mirror_image_info_t) != info_size) {
+ return -ERANGE;
+ }
+
+ return librbd::api::Mirror<>::image_get_info(ictx, mirror_image_info);
+ }
+
+ int Image::mirror_image_get_mode(mirror_image_mode_t *mode) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ return librbd::api::Mirror<>::image_get_mode(ictx, mode);
+ }
+
+ int Image::mirror_image_get_global_status(
+ mirror_image_global_status_t *mirror_image_global_status,
+ size_t status_size) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ if (sizeof(mirror_image_global_status_t) != status_size) {
+ return -ERANGE;
+ }
+
+ return librbd::api::Mirror<>::image_get_global_status(
+ ictx, mirror_image_global_status);
+ }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+ int Image::mirror_image_get_status(mirror_image_status_t *mirror_image_status,
+ size_t status_size) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ if (sizeof(mirror_image_status_t) != status_size) {
+ return -ERANGE;
+ }
+
+ mirror_image_global_status_t mirror_image_global_status;
+ int r = librbd::api::Mirror<>::image_get_global_status(
+ ictx, &mirror_image_global_status);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::mirror_image_site_status_t local_status;
+ r = get_local_mirror_image_site_status(mirror_image_global_status,
+ &local_status);
+ if (r < 0) {
+ return r;
+ }
+
+ *mirror_image_status = mirror_image_status_t{
+ mirror_image_global_status.name, mirror_image_global_status.info,
+ local_status.state, local_status.description, local_status.last_update,
+ local_status.up};
+ return 0;
+ }
+
+#pragma GCC diagnostic pop
+
+ int Image::mirror_image_get_instance_id(std::string *instance_id) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ return librbd::api::Mirror<>::image_get_instance_id(ictx, instance_id);
+ }
+
+ int Image::aio_mirror_image_promote(bool force, RBD::AioCompletion *c) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ librbd::api::Mirror<>::image_promote(
+ ictx, force, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(c)));
+ return 0;
+ }
+
+ int Image::aio_mirror_image_demote(RBD::AioCompletion *c) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ librbd::api::Mirror<>::image_demote(
+ ictx, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(c)));
+ return 0;
+ }
+
+ int Image::aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+ size_t info_size,
+ RBD::AioCompletion *c) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ if (sizeof(mirror_image_info_t) != info_size) {
+ return -ERANGE;
+ }
+
+ librbd::api::Mirror<>::image_get_info(
+ ictx, mirror_image_info,
+ new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(c)));
+ return 0;
+ }
+
+ int Image::aio_mirror_image_get_mode(mirror_image_mode_t *mode,
+ RBD::AioCompletion *c) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ librbd::api::Mirror<>::image_get_mode(
+ ictx, mode, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(c)));
+ return 0;
+ }
+
+ int Image::aio_mirror_image_get_global_status(
+ mirror_image_global_status_t *status, size_t status_size,
+ RBD::AioCompletion *c) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ if (sizeof(mirror_image_global_status_t) != status_size) {
+ return -ERANGE;
+ }
+
+ librbd::api::Mirror<>::image_get_global_status(
+ ictx, status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(c)));
+ return 0;
+ }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+ int Image::aio_mirror_image_get_status(mirror_image_status_t *status,
+ size_t status_size,
+ RBD::AioCompletion *c) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ if (sizeof(mirror_image_status_t) != status_size) {
+ return -ERANGE;
+ }
+
+ auto ctx = new C_MirrorImageGetStatus(
+ status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(c)));
+ librbd::api::Mirror<>::image_get_global_status(
+ ictx, &ctx->cpp_mirror_image_global_status, ctx);
+ return 0;
+ }
+
+#pragma GCC diagnostic pop
+
+ int Image::aio_mirror_image_create_snapshot(uint32_t flags, uint64_t *snap_id,
+ RBD::AioCompletion *c) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+
+ librbd::api::Mirror<>::image_snapshot_create(
+ ictx, flags, snap_id, new C_AioCompletion(ictx,
+ librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(c)));
+ return 0;
+ }
+
+ int Image::update_watch(UpdateWatchCtx *wctx, uint64_t *handle) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, update_watch_enter, ictx, wctx);
+ int r = ictx->state->register_update_watcher(wctx, handle);
+ tracepoint(librbd, update_watch_exit, r, *handle);
+ return r;
+ }
+
+ int Image::update_unwatch(uint64_t handle) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, update_unwatch_enter, ictx, handle);
+ int r = ictx->state->unregister_update_watcher(handle);
+ tracepoint(librbd, update_unwatch_exit, r);
+ return r;
+ }
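+
+ // Usage sketch (illustrative): update watchers pair a caller-supplied
+ // UpdateWatchCtx subclass with the handle that update_unwatch() later
+ // takes:
+ //
+ //   struct MyWatcher : librbd::UpdateWatchCtx {
+ //     void handle_notify() override { /* refresh cached image info */ }
+ //   } watcher;
+ //   uint64_t handle = 0;
+ //   int r = image.update_watch(&watcher, &handle);
+ //   // ... later ...
+ //   image.update_unwatch(handle);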
+
+ int Image::list_watchers(std::list<librbd::image_watcher_t> &watchers) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ tracepoint(librbd, list_watchers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::list_watchers(ictx, watchers);
+#ifdef WITH_LTTNG
+ if (r >= 0) {
+ for (auto &watcher : watchers) {
+ tracepoint(librbd, list_watchers_entry, watcher.addr.c_str(), watcher.id, watcher.cookie);
+ }
+ }
+#endif
+ tracepoint(librbd, list_watchers_exit, r, watchers.size());
+ return r;
+ }
+
+ int Image::config_list(std::vector<config_option_t> *options) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::api::Config<>::list(ictx, options);
+ }
+
+ int Image::quiesce_watch(QuiesceWatchCtx *wctx, uint64_t *handle) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ int r = ictx->state->register_quiesce_watcher(wctx, handle);
+ return r;
+ }
+
+ int Image::quiesce_unwatch(uint64_t handle) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ int r = ictx->state->unregister_quiesce_watcher(handle);
+ return r;
+ }
+
+ void Image::quiesce_complete(uint64_t handle, int r) {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ ictx->state->quiesce_complete(handle, r);
+ }
+
+} // namespace librbd
+
+extern "C" void rbd_version(int *major, int *minor, int *extra)
+{
+ if (major)
+ *major = LIBRBD_VER_MAJOR;
+ if (minor)
+ *minor = LIBRBD_VER_MINOR;
+ if (extra)
+ *extra = LIBRBD_VER_EXTRA;
+}
+
+extern "C" void rbd_image_options_create(rbd_image_options_t* opts)
+{
+ librbd::image_options_create(opts);
+}
+
+extern "C" void rbd_image_options_destroy(rbd_image_options_t opts)
+{
+ librbd::image_options_destroy(opts);
+}
+
+extern "C" int rbd_image_options_set_string(rbd_image_options_t opts, int optname,
+ const char* optval)
+{
+ return librbd::image_options_set(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_set_uint64(rbd_image_options_t opts, int optname,
+ uint64_t optval)
+{
+ return librbd::image_options_set(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_get_string(rbd_image_options_t opts, int optname,
+ char* optval, size_t maxlen)
+{
+ std::string optval_;
+
+ int r = librbd::image_options_get(opts, optname, &optval_);
+
+ if (r < 0) {
+ return r;
+ }
+
+ if (optval_.size() >= maxlen) {
+ return -E2BIG;
+ }
+
+ strncpy(optval, optval_.c_str(), maxlen);
+
+ return 0;
+}
+
+extern "C" int rbd_image_options_get_uint64(rbd_image_options_t opts, int optname,
+ uint64_t* optval)
+{
+ return librbd::image_options_get(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_is_set(rbd_image_options_t opts, int optname,
+ bool* is_set)
+{
+ return librbd::image_options_is_set(opts, optname, is_set);
+}
+
+extern "C" int rbd_image_options_unset(rbd_image_options_t opts, int optname)
+{
+ return librbd::image_options_unset(opts, optname);
+}
+
+extern "C" void rbd_image_options_clear(rbd_image_options_t opts)
+{
+ librbd::image_options_clear(opts);
+}
+
+extern "C" int rbd_image_options_is_empty(rbd_image_options_t opts)
+{
+ return librbd::image_options_is_empty(opts);
+}
+
+/* pool mirroring */
+extern "C" int rbd_mirror_site_name_get(rados_t cluster, char *name,
+ size_t *max_len) {
+ librados::Rados rados;
+ librados::Rados::from_rados_t(cluster, rados);
+
+ std::string site_name;
+ int r = librbd::api::Mirror<>::site_name_get(rados, &site_name);
+ if (r < 0) {
+ return r;
+ }
+
+ auto total_len = site_name.size() + 1;
+ if (*max_len < total_len) {
+ *max_len = total_len;
+ return -ERANGE;
+ }
+ *max_len = total_len;
+
+ strcpy(name, site_name.c_str());
+ return 0;
+}
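+
+/*
+ * Usage sketch (illustrative) of the sizing convention shared by the C
+ * string getters here: probe with a too-small buffer to learn the
+ * required length (-ERANGE, *max_len updated), then call again:
+ *
+ *   size_t len = 0;
+ *   int r = rbd_mirror_site_name_get(cluster, NULL, &len);
+ *   if (r == -ERANGE) {
+ *     char *name = (char *)malloc(len);
+ *     r = rbd_mirror_site_name_get(cluster, name, &len);
+ *   }
+ */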
+
+extern "C" int rbd_mirror_site_name_set(rados_t cluster, const char *name) {
+ librados::Rados rados;
+ librados::Rados::from_rados_t(cluster, rados);
+ return librbd::api::Mirror<>::site_name_set(rados, name);
+}
+
+extern "C" int rbd_mirror_mode_get(rados_ioctx_t p,
+ rbd_mirror_mode_t *mirror_mode) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ return librbd::api::Mirror<>::mode_get(io_ctx, mirror_mode);
+}
+
+extern "C" int rbd_mirror_mode_set(rados_ioctx_t p,
+ rbd_mirror_mode_t mirror_mode) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ return librbd::api::Mirror<>::mode_set(io_ctx, mirror_mode);
+}
+
+extern "C" int rbd_mirror_uuid_get(rados_ioctx_t p,
+ char *mirror_uuid, size_t *max_len) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ std::string mirror_uuid_str;
+ int r = librbd::api::Mirror<>::uuid_get(io_ctx, &mirror_uuid_str);
+ if (r < 0) {
+ return r;
+ }
+
+ auto total_len = mirror_uuid_str.size() + 1;
+ if (*max_len < total_len) {
+ *max_len = total_len;
+ return -ERANGE;
+ }
+ *max_len = total_len;
+
+ strcpy(mirror_uuid, mirror_uuid_str.c_str());
+ return 0;
+}
+
+extern "C" int rbd_mirror_peer_bootstrap_create(rados_ioctx_t p, char *token,
+ size_t *max_len) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ std::string token_str;
+ int r = librbd::api::Mirror<>::peer_bootstrap_create(io_ctx, &token_str);
+ if (r < 0) {
+ return r;
+ }
+
+ auto total_len = token_str.size() + 1;
+ if (*max_len < total_len) {
+ *max_len = total_len;
+ return -ERANGE;
+ }
+ *max_len = total_len;
+
+ strcpy(token, token_str.c_str());
+ return 0;
+}
+
+extern "C" int rbd_mirror_peer_bootstrap_import(
+ rados_ioctx_t p, rbd_mirror_peer_direction_t direction,
+ const char *token) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ return librbd::api::Mirror<>::peer_bootstrap_import(io_ctx, direction, token);
+}
+
+extern "C" int rbd_mirror_peer_site_add(rados_ioctx_t p, char *uuid,
+ size_t uuid_max_length,
+ rbd_mirror_peer_direction_t direction,
+ const char *site_name,
+ const char *client_name) {
+ static const std::size_t UUID_LENGTH = 36;
+
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ if (uuid_max_length < UUID_LENGTH + 1) {
+ return -E2BIG;
+ }
+
+ std::string uuid_str;
+ int r = librbd::api::Mirror<>::peer_site_add(io_ctx, &uuid_str, direction,
+ site_name, client_name);
+ if (r >= 0) {
+ strncpy(uuid, uuid_str.c_str(), uuid_max_length);
+ uuid[uuid_max_length - 1] = '\0';
+ }
+ return r;
+}
+
+extern "C" int rbd_mirror_peer_site_remove(rados_ioctx_t p, const char *uuid) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ int r = librbd::api::Mirror<>::peer_site_remove(io_ctx, uuid);
+ return r;
+}
+
+extern "C" int rbd_mirror_peer_site_list(
+ rados_ioctx_t p, rbd_mirror_peer_site_t *peers, int *max_peers) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ std::vector<librbd::mirror_peer_site_t> peer_vector;
+ int r = librbd::api::Mirror<>::peer_site_list(io_ctx, &peer_vector);
+ if (r < 0) {
+ return r;
+ }
+
+ if (*max_peers < static_cast<int>(peer_vector.size())) {
+ *max_peers = static_cast<int>(peer_vector.size());
+ return -ERANGE;
+ }
+
+ for (int i = 0; i < static_cast<int>(peer_vector.size()); ++i) {
+ peers[i].uuid = strdup(peer_vector[i].uuid.c_str());
+ peers[i].direction = peer_vector[i].direction;
+ peers[i].site_name = strdup(peer_vector[i].site_name.c_str());
+ peers[i].mirror_uuid = strdup(peer_vector[i].mirror_uuid.c_str());
+ peers[i].client_name = strdup(peer_vector[i].client_name.c_str());
+ }
+ *max_peers = static_cast<int>(peer_vector.size());
+ return 0;
+}
+
+extern "C" void rbd_mirror_peer_site_list_cleanup(rbd_mirror_peer_site_t *peers,
+ int max_peers) {
+ for (int i = 0; i < max_peers; ++i) {
+ free(peers[i].uuid);
+ free(peers[i].site_name);
+ free(peers[i].mirror_uuid);
+ free(peers[i].client_name);
+ }
+}
+
+extern "C" int rbd_mirror_peer_site_set_client_name(
+ rados_ioctx_t p, const char *uuid, const char *client_name) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ return librbd::api::Mirror<>::peer_site_set_client(io_ctx, uuid, client_name);
+}
+
+extern "C" int rbd_mirror_peer_site_set_name(
+ rados_ioctx_t p, const char *uuid, const char *site_name) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ return librbd::api::Mirror<>::peer_site_set_name(io_ctx, uuid, site_name);
+}
+
+extern "C" int rbd_mirror_peer_site_set_direction(
+ rados_ioctx_t p, const char *uuid, rbd_mirror_peer_direction_t direction) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ return librbd::api::Mirror<>::peer_site_set_direction(io_ctx, uuid,
+ direction);
+}
+
+extern "C" int rbd_mirror_peer_site_get_attributes(
+ rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+ char *values, size_t *max_val_len, size_t *key_value_count) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ std::map<std::string, std::string> attributes;
+ int r = librbd::api::Mirror<>::peer_site_get_attributes(
+ io_ctx, uuid, &attributes);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t key_total_len = 0, val_total_len = 0;
+ for (auto& it : attributes) {
+ key_total_len += it.first.size() + 1;
+ val_total_len += it.second.length() + 1;
+ }
+
+ bool too_short = ((*max_key_len < key_total_len) ||
+ (*max_val_len < val_total_len));
+
+ *max_key_len = key_total_len;
+ *max_val_len = val_total_len;
+ *key_value_count = attributes.size();
+ if (too_short) {
+ return -ERANGE;
+ }
+
+ char *keys_p = keys;
+ char *values_p = values;
+ for (auto& it : attributes) {
+ strncpy(keys_p, it.first.c_str(), it.first.size() + 1);
+ keys_p += it.first.size() + 1;
+
+ strncpy(values_p, it.second.c_str(), it.second.length() + 1);
+ values_p += it.second.length() + 1;
+ }
+
+ return 0;
+}
+
+extern "C" int rbd_mirror_peer_site_set_attributes(
+ rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+ size_t count) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ std::map<std::string, std::string> attributes;
+
+ for (size_t i = 0; i < count; ++i) {
+ const char* key = keys;
+ keys += strlen(key) + 1;
+ const char* value = values;
+ values += strlen(value) + 1;
+ attributes[key] = value;
+ }
+
+ return librbd::api::Mirror<>::peer_site_set_attributes(
+ io_ctx, uuid, attributes);
+}
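+
+/*
+ * Note on the packed layout the two attribute calls share: 'keys' and
+ * 'values' are each 'count' NUL-terminated strings laid end to end,
+ * e.g. for count == 2 (values are placeholders):
+ *
+ *   const char keys[]   = "mon_host\0key\0";
+ *   const char values[] = "10.0.0.1\0secret\0";
+ */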
+
+extern "C" int rbd_mirror_image_global_status_list(rados_ioctx_t p,
+ const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_global_status_t *images, size_t *len) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ std::map<std::string, librbd::mirror_image_global_status_t> cpp_images;
+
+ int r = librbd::api::Mirror<>::image_global_status_list(
+ io_ctx, start_id, max, &cpp_images);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t i = 0;
+ for (auto &it : cpp_images) {
+ ceph_assert(i < max);
+ const std::string &image_id = it.first;
+ image_ids[i] = strdup(image_id.c_str());
+ mirror_image_global_status_cpp_to_c(it.second, &images[i]);
+ i++;
+ }
+ *len = i;
+ return 0;
+}
+
+extern "C" void rbd_mirror_image_global_status_cleanup(
+ rbd_mirror_image_global_status_t *global_status) {
+ free(global_status->name);
+ rbd_mirror_image_get_info_cleanup(&global_status->info);
+ for (auto idx = 0U; idx < global_status->site_statuses_count; ++idx) {
+ free(global_status->site_statuses[idx].mirror_uuid);
+ free(global_status->site_statuses[idx].description);
+ }
+ free(global_status->site_statuses);
+}
+
+extern "C" void rbd_mirror_image_global_status_list_cleanup(
+ char **image_ids, rbd_mirror_image_global_status_t *images, size_t len) {
+ for (size_t i = 0; i < len; i++) {
+ free(image_ids[i]);
+ rbd_mirror_image_global_status_cleanup(&images[i]);
+ }
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+extern "C" int rbd_mirror_peer_add(rados_ioctx_t p, char *uuid,
+ size_t uuid_max_length,
+ const char *cluster_name,
+ const char *client_name) {
+ return rbd_mirror_peer_site_add(
+ p, uuid, uuid_max_length, RBD_MIRROR_PEER_DIRECTION_RX_TX, cluster_name,
+ client_name);
+}
+
+extern "C" int rbd_mirror_peer_remove(rados_ioctx_t p, const char *uuid) {
+ return rbd_mirror_peer_site_remove(p, uuid);
+}
+
+extern "C" int rbd_mirror_peer_list(rados_ioctx_t p,
+ rbd_mirror_peer_t *peers, int *max_peers) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ std::vector<librbd::mirror_peer_site_t> peer_vector;
+ int r = librbd::api::Mirror<>::peer_site_list(io_ctx, &peer_vector);
+ if (r < 0) {
+ return r;
+ }
+
+ if (*max_peers < static_cast<int>(peer_vector.size())) {
+ *max_peers = static_cast<int>(peer_vector.size());
+ return -ERANGE;
+ }
+
+ for (int i = 0; i < static_cast<int>(peer_vector.size()); ++i) {
+ peers[i].uuid = strdup(peer_vector[i].uuid.c_str());
+ peers[i].cluster_name = strdup(peer_vector[i].site_name.c_str());
+ peers[i].client_name = strdup(peer_vector[i].client_name.c_str());
+ }
+ *max_peers = static_cast<int>(peer_vector.size());
+ return 0;
+}
+
+extern "C" void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers,
+ int max_peers) {
+ for (int i = 0; i < max_peers; ++i) {
+ free(peers[i].uuid);
+ free(peers[i].cluster_name);
+ free(peers[i].client_name);
+ }
+}
+
+extern "C" int rbd_mirror_peer_set_client(rados_ioctx_t p, const char *uuid,
+ const char *client_name) {
+ return rbd_mirror_peer_site_set_client_name(p, uuid, client_name);
+}
+
+extern "C" int rbd_mirror_peer_set_cluster(rados_ioctx_t p, const char *uuid,
+ const char *cluster_name) {
+ return rbd_mirror_peer_site_set_name(p, uuid, cluster_name);
+}
+
+extern "C" int rbd_mirror_peer_get_attributes(
+ rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+ char *values, size_t *max_val_len, size_t *key_value_count) {
+ return rbd_mirror_peer_site_get_attributes(
+ p, uuid, keys, max_key_len, values, max_val_len, key_value_count);
+}
+
+extern "C" int rbd_mirror_peer_set_attributes(
+ rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+ size_t count) {
+ return rbd_mirror_peer_site_set_attributes(
+ p, uuid, keys, values, count);
+}
+
+extern "C" int rbd_mirror_image_status_list(rados_ioctx_t p,
+ const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_status_t *images, size_t *len) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ std::map<std::string, librbd::mirror_image_global_status_t> cpp_images;
+
+ int r = librbd::api::Mirror<>::image_global_status_list(
+ io_ctx, start_id, max, &cpp_images);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t i = 0;
+ for (auto &it : cpp_images) {
+ ceph_assert(i < max);
+ const std::string &image_id = it.first;
+ image_ids[i] = strdup(image_id.c_str());
+ mirror_image_global_status_cpp_to_c(it.second, &images[i]);
+ i++;
+ }
+ *len = i;
+ return 0;
+}
+
+extern "C" void rbd_mirror_image_status_list_cleanup(char **image_ids,
+ rbd_mirror_image_status_t *images, size_t len) {
+ for (size_t i = 0; i < len; i++) {
+ free(image_ids[i]);
+ free(images[i].name);
+ rbd_mirror_image_get_info_cleanup(&images[i].info);
+ free(images[i].description);
+ }
+}
+
+#pragma GCC diagnostic pop
+
+extern "C" int rbd_mirror_image_status_summary(rados_ioctx_t p,
+ rbd_mirror_image_status_state_t *states, int *counts, size_t *maxlen) {
+
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ std::map<librbd::mirror_image_status_state_t, int> states_;
+ int r = librbd::api::Mirror<>::image_status_summary(io_ctx, &states_);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t i = 0;
+ for (auto &it : states_) {
+ if (i == *maxlen) {
+ return -ERANGE;
+ }
+ states[i] = it.first;
+ counts[i] = it.second;
+ i++;
+ }
+ *maxlen = i;
+ return 0;
+}
+
+extern "C" int rbd_mirror_image_instance_id_list(
+ rados_ioctx_t p, const char *start_id, size_t max, char **image_ids,
+ char **instance_ids, size_t *len) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ std::map<std::string, std::string> cpp_instance_ids;
+
+ int r = librbd::api::Mirror<>::image_instance_id_list(io_ctx, start_id, max,
+ &cpp_instance_ids);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t i = 0;
+ for (auto &it : cpp_instance_ids) {
+ ceph_assert(i < max);
+ image_ids[i] = strdup(it.first.c_str());
+ instance_ids[i] = strdup(it.second.c_str());
+ i++;
+ }
+ *len = i;
+ return 0;
+}
+
+extern "C" void rbd_mirror_image_instance_id_list_cleanup(
+ char **image_ids, char **instance_ids, size_t len) {
+ for (size_t i = 0; i < len; i++) {
+ free(image_ids[i]);
+ free(instance_ids[i]);
+ }
+}
+
+extern "C" int rbd_mirror_image_info_list(
+ rados_ioctx_t p, rbd_mirror_image_mode_t *mode_filter,
+ const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_mode_t *mode_entries,
+ rbd_mirror_image_info_t *info_entries, size_t *num_entries) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ std::map<std::string, std::pair<librbd::mirror_image_mode_t,
+ librbd::mirror_image_info_t>> cpp_entries;
+
+ int r = librbd::api::Mirror<>::image_info_list(io_ctx, mode_filter, start_id,
+ max, &cpp_entries);
+ if (r < 0) {
+ return r;
+ }
+
+ ceph_assert(cpp_entries.size() <= max);
+
+ for (auto &it : cpp_entries) {
+ *(image_ids++) = strdup(it.first.c_str());
+ *(mode_entries++) = it.second.first;
+ mirror_image_info_cpp_to_c(it.second.second, info_entries++);
+ }
+ *num_entries = cpp_entries.size();
+
+ return 0;
+}
+
+extern "C" void rbd_mirror_image_info_list_cleanup(
+ char **image_ids, rbd_mirror_image_info_t *info_entries,
+ size_t num_entries) {
+ for (size_t i = 0; i < num_entries; i++) {
+ free(*(image_ids++));
+ rbd_mirror_image_get_info_cleanup(info_entries++);
+ }
+}
+
+/* helpers */
+
+extern "C" void rbd_image_spec_cleanup(rbd_image_spec_t *image)
+{
+ free(image->id);
+ free(image->name);
+}
+
+extern "C" void rbd_image_spec_list_cleanup(rbd_image_spec_t *images,
+ size_t num_images)
+{
+ for (size_t idx = 0; idx < num_images; ++idx) {
+ rbd_image_spec_cleanup(&images[idx]);
+ }
+}
+
+extern "C" void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image)
+{
+ free(image->pool_name);
+ free(image->pool_namespace);
+ free(image->image_id);
+ free(image->image_name);
+}
+
+extern "C" void rbd_linked_image_spec_list_cleanup(
+ rbd_linked_image_spec_t *images, size_t num_images)
+{
+ for (size_t idx = 0; idx < num_images; ++idx) {
+ rbd_linked_image_spec_cleanup(&images[idx]);
+ }
+}
+
+extern "C" void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap)
+{
+ free(snap->name);
+}
+
+/* images */
+extern "C" int rbd_list(rados_ioctx_t p, char *names, size_t *size)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id());
+ std::vector<librbd::image_spec_t> cpp_image_specs;
+ int r = librbd::api::Image<>::list_images(io_ctx, &cpp_image_specs);
+ if (r < 0) {
+ tracepoint(librbd, list_exit, r, *size);
+ return r;
+ }
+
+ size_t expected_size = 0;
+
+ for (auto& it : cpp_image_specs) {
+ expected_size += it.name.size() + 1;
+ }
+ if (*size < expected_size) {
+ *size = expected_size;
+ tracepoint(librbd, list_exit, -ERANGE, *size);
+ return -ERANGE;
+ }
+
+ if (names == NULL) {
+ tracepoint(librbd, list_exit, -EINVAL, *size);
+ return -EINVAL;
+ }
+
+ for (auto& it : cpp_image_specs) {
+ const char* name = it.name.c_str();
+ tracepoint(librbd, list_entry, name);
+ strcpy(names, name);
+ names += strlen(names) + 1;
+ }
+ tracepoint(librbd, list_exit, (int)expected_size, *size);
+ return (int)expected_size;
+}
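+
+/*
+ * Usage sketch (illustrative): rbd_list() packs NUL-separated image
+ * names into 'names' and follows the same -ERANGE sizing convention as
+ * the string getters above:
+ *
+ *   size_t size = 0;
+ *   int r = rbd_list(ioctx, NULL, &size);
+ *   if (r == -ERANGE) {
+ *     char *names = (char *)malloc(size);
+ *     r = rbd_list(ioctx, names, &size);
+ *     for (char *p = names; r > 0 && p < names + size; p += strlen(p) + 1)
+ *       printf("%s\n", p);
+ *   }
+ */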
+
+extern "C" int rbd_list2(rados_ioctx_t p, rbd_image_spec_t *images,
+ size_t *size)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id());
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(images, 0, sizeof(*images) * *size);
+ std::vector<librbd::image_spec_t> cpp_image_specs;
+ int r = librbd::api::Image<>::list_images(io_ctx, &cpp_image_specs);
+ if (r < 0) {
+ tracepoint(librbd, list_exit, r, *size);
+ return r;
+ }
+
+ size_t expected_size = cpp_image_specs.size();
+ if (*size < expected_size) {
+ *size = expected_size;
+ tracepoint(librbd, list_exit, -ERANGE, *size);
+ return -ERANGE;
+ }
+
+ *size = expected_size;
+ for (size_t idx = 0; idx < expected_size; ++idx) {
+ images[idx].id = strdup(cpp_image_specs[idx].id.c_str());
+ images[idx].name = strdup(cpp_image_specs[idx].name.c_str());
+ }
+ tracepoint(librbd, list_exit, 0, *size);
+ return 0;
+}
+
+extern "C" int rbd_create(rados_ioctx_t p, const char *name, uint64_t size, int *order)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order);
+ int r = librbd::create(io_ctx, name, size, order);
+ tracepoint(librbd, create_exit, r, *order);
+ return r;
+}
+
+extern "C" int rbd_create2(rados_ioctx_t p, const char *name,
+ uint64_t size, uint64_t features,
+ int *order)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order);
+ int r = librbd::create(io_ctx, name, size, false, features, order, 0, 0);
+ tracepoint(librbd, create2_exit, r, *order);
+ return r;
+}
+
+extern "C" int rbd_create3(rados_ioctx_t p, const char *name,
+ uint64_t size, uint64_t features,
+ int *order,
+ uint64_t stripe_unit, uint64_t stripe_count)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, create3_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count);
+ int r = librbd::create(io_ctx, name, size, false, features, order,
+ stripe_unit, stripe_count);
+ tracepoint(librbd, create3_exit, r, *order);
+ return r;
+}
+
+extern "C" int rbd_create4(rados_ioctx_t p, const char *name,
+ uint64_t size, rbd_image_options_t opts)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts);
+ librbd::ImageOptions opts_(opts);
+ int r = librbd::create(io_ctx, name, "", size, opts_, "", "", false);
+ tracepoint(librbd, create4_exit, r);
+ return r;
+}
+
+extern "C" int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snap_name, rados_ioctx_t c_ioctx,
+ const char *c_name, uint64_t features, int *c_order)
+{
+ librados::IoCtx p_ioc, c_ioc;
+ librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc);
+ librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc));
+ tracepoint(librbd, clone_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, features);
+ int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name,
+ features, c_order, 0, 0);
+ tracepoint(librbd, clone_exit, r, *c_order);
+ return r;
+}
+
+extern "C" int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snap_name, rados_ioctx_t c_ioctx,
+ const char *c_name, uint64_t features, int *c_order,
+ uint64_t stripe_unit, int stripe_count)
+{
+ librados::IoCtx p_ioc, c_ioc;
+ librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc);
+ librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc));
+ tracepoint(librbd, clone2_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, features, stripe_unit, stripe_count);
+ int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name,
+ features, c_order, stripe_unit, stripe_count);
+ tracepoint(librbd, clone2_exit, r, *c_order);
+ return r;
+}
+
+extern "C" int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snap_name, rados_ioctx_t c_ioctx,
+ const char *c_name, rbd_image_options_t c_opts)
+{
+ librados::IoCtx p_ioc, c_ioc;
+ librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc);
+ librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc));
+ tracepoint(librbd, clone3_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, c_opts);
+ librbd::ImageOptions c_opts_(c_opts);
+ int r = librbd::clone(p_ioc, nullptr, p_name, p_snap_name, c_ioc, nullptr,
+ c_name, c_opts_, "", "");
+ tracepoint(librbd, clone3_exit, r);
+ return r;
+}
+
+extern "C" int rbd_remove(rados_ioctx_t p, const char *name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Image<>::remove(io_ctx, name, prog_ctx);
+ tracepoint(librbd, remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_remove_with_progress(rados_ioctx_t p, const char *name,
+ librbd_progress_fn_t cb, void *cbdata)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = librbd::api::Image<>::remove(io_ctx, name, prog_ctx);
+ tracepoint(librbd, remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_trash_move(rados_ioctx_t p, const char *name,
+ uint64_t delay) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_move_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), name);
+ int r = librbd::api::Trash<>::move(io_ctx, RBD_TRASH_IMAGE_SOURCE_USER, name,
+ delay);
+ tracepoint(librbd, trash_move_exit, r);
+ return r;
+}
+
+extern "C" int rbd_trash_get(rados_ioctx_t io, const char *id,
+ rbd_trash_image_info_t *info) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+
+ librbd::trash_image_info_t cpp_info;
+ int r = librbd::api::Trash<>::get(io_ctx, id, &cpp_info);
+ if (r < 0) {
+ return r;
+ }
+
+ trash_image_info_cpp_to_c(cpp_info, info);
+ return 0;
+}
+
+extern "C" void rbd_trash_get_cleanup(rbd_trash_image_info_t *info) {
+ free(info->id);
+ free(info->name);
+}
+
+extern "C" int rbd_trash_list(rados_ioctx_t p, rbd_trash_image_info_t *entries,
+ size_t *num_entries) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_list_enter,
+ io_ctx.get_pool_name().c_str(), io_ctx.get_id());
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(entries, 0, sizeof(*entries) * *num_entries);
+
+ vector<librbd::trash_image_info_t> cpp_entries;
+ int r = librbd::api::Trash<>::list(io_ctx, cpp_entries, true);
+ if (r < 0) {
+ tracepoint(librbd, trash_list_exit, r, *num_entries);
+ return r;
+ }
+
+ if (*num_entries < cpp_entries.size()) {
+ *num_entries = cpp_entries.size();
+ tracepoint(librbd, trash_list_exit, -ERANGE, *num_entries);
+ return -ERANGE;
+ }
+
+ int i = 0;
+ for (const auto &entry : cpp_entries) {
+ trash_image_info_cpp_to_c(entry, &entries[i++]);
+ }
+ *num_entries = cpp_entries.size();
+
+ return *num_entries;
+}
+
+extern "C" void rbd_trash_list_cleanup(rbd_trash_image_info_t *entries,
+ size_t num_entries) {
+ for (size_t i = 0; i < num_entries; i++) {
+ rbd_trash_get_cleanup(&entries[i]);
+ }
+}
+
+extern "C" int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts,
+ float threshold) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), expire_ts, threshold);
+ librbd::NoOpProgressContext nop_pctx;
+ int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, nop_pctx);
+ tracepoint(librbd, trash_purge_exit, r);
+ return r;
+}
+
+extern "C" int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts,
+ float threshold, librbd_progress_fn_t cb, void* cbdata) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), expire_ts, threshold);
+ librbd::CProgressContext pctx(cb, cbdata);
+ int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, pctx);
+ tracepoint(librbd, trash_purge_exit, r);
+ return r;
+}
+
+extern "C" int rbd_trash_remove(rados_ioctx_t p, const char *image_id,
+ bool force) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_id, force);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx);
+ tracepoint(librbd, trash_remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_trash_remove_with_progress(rados_ioctx_t p,
+ const char *image_id,
+ bool force,
+ librbd_progress_fn_t cb,
+ void *cbdata) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_id, force);
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx);
+ tracepoint(librbd, trash_remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_trash_restore(rados_ioctx_t p, const char *id,
+ const char *name) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, trash_undelete_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), id, name);
+ int r = librbd::api::Trash<>::restore(
+ io_ctx, librbd::api::Trash<>::ALLOWED_RESTORE_SOURCES, id, name);
+ tracepoint(librbd, trash_undelete_exit, r);
+ return r;
+}
+
+extern "C" int rbd_namespace_create(rados_ioctx_t io,
+ const char *namespace_name) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+
+ return librbd::api::Namespace<>::create(io_ctx, namespace_name);
+}
+
+extern "C" int rbd_namespace_remove(rados_ioctx_t io,
+ const char *namespace_name) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+
+ return librbd::api::Namespace<>::remove(io_ctx, namespace_name);
+}
+
+extern "C" int rbd_namespace_list(rados_ioctx_t io, char *names, size_t *size) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+
+ if (names == nullptr || size == nullptr) {
+ return -EINVAL;
+ }
+
+ std::vector<std::string> cpp_names;
+ int r = librbd::api::Namespace<>::list(io_ctx, &cpp_names);
+ if (r < 0) {
+ return r;
+ }
+
+  size_t expected_size = 0;
+  for (const auto& name : cpp_names) {
+    expected_size += name.size() + 1;
+  }
+  if (*size < expected_size) {
+    *size = expected_size;
+    return -ERANGE;
+  }
+
+  *size = expected_size;
+  for (const auto& name : cpp_names) {
+    strcpy(names, name.c_str());
+    names += name.size() + 1;
+  }
+
+ return (int)expected_size;
+}
+
+extern "C" int rbd_namespace_exists(rados_ioctx_t io,
+ const char *namespace_name,
+ bool *exists) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+
+ return librbd::api::Namespace<>::exists(io_ctx, namespace_name, exists);
+}
+
+extern "C" int rbd_pool_init(rados_ioctx_t io, bool force) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+
+ return librbd::api::Pool<>::init(io_ctx, force);
+}
+
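+// rbd_pool_stats_t is an opaque handle around Pool<>::StatOptions; create it
+// with rbd_pool_stats_create() and release it with rbd_pool_stats_destroy().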
+extern "C" void rbd_pool_stats_create(rbd_pool_stats_t *stats) {
+ *stats = reinterpret_cast<rbd_pool_stats_t>(
+ new librbd::api::Pool<>::StatOptions{});
+}
+
+extern "C" void rbd_pool_stats_destroy(rbd_pool_stats_t stats) {
+ auto pool_stat_options =
+ reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats);
+ delete pool_stat_options;
+}
+
+extern "C" int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats,
+ int stat_option,
+ uint64_t* stat_val) {
+ auto pool_stat_options =
+ reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats);
+ return librbd::api::Pool<>::add_stat_option(
+ pool_stat_options, static_cast<rbd_pool_stat_option_t>(stat_option),
+ stat_val);
+}
+
+extern "C" int rbd_pool_stats_get(
+ rados_ioctx_t io, rbd_pool_stats_t pool_stats) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+
+ auto pool_stat_options =
+ reinterpret_cast<librbd::api::Pool<>::StatOptions*>(pool_stats);
+ return librbd::api::Pool<>::get_stats(io_ctx, pool_stat_options);
+}
+
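+/* image copy */
+// rbd_copy2/3/4 extend rbd_copy with, respectively, an already-open
+// destination image, image options, and a sparse-size hint; all of them
+// funnel into librbd::copy().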
+extern "C" int rbd_copy(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+ librbd::ImageOptions opts;
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0);
+ tracepoint(librbd, copy_exit, r);
+ return r;
+}
+
+extern "C" int rbd_copy2(rbd_image_t srcp, rbd_image_t destp)
+{
+ librbd::ImageCtx *src = (librbd::ImageCtx *)srcp;
+ librbd::ImageCtx *dest = (librbd::ImageCtx *)destp;
+ tracepoint(librbd, copy2_enter, src, src->name.c_str(), src->snap_name.c_str(), src->read_only, dest, dest->name.c_str(), dest->snap_name.c_str(), dest->read_only);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::copy(src, dest, prog_ctx, 0);
+ tracepoint(librbd, copy2_exit, r);
+ return r;
+}
+
+extern "C" int rbd_copy3(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname, rbd_image_options_t c_opts)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, c_opts);
+ librbd::ImageOptions c_opts_(c_opts);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::copy(ictx, dest_io_ctx, destname, c_opts_, prog_ctx, 0);
+ tracepoint(librbd, copy3_exit, r);
+ return r;
+}
+
+extern "C" int rbd_copy4(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname, rbd_image_options_t c_opts, size_t sparse_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, c_opts, sparse_size);
+ librbd::ImageOptions c_opts_(c_opts);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::copy(ictx, dest_io_ctx, destname, c_opts_, prog_ctx, sparse_size);
+ tracepoint(librbd, copy4_exit, r);
+ return r;
+}
+
+extern "C" int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname,
+ librbd_progress_fn_t fn, void *data)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+ librbd::ImageOptions opts;
+ librbd::CProgressContext prog_ctx(fn, data);
+ int ret = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0);
+ tracepoint(librbd, copy_exit, ret);
+ return ret;
+}
+
+extern "C" int rbd_copy_with_progress2(rbd_image_t srcp, rbd_image_t destp,
+ librbd_progress_fn_t fn, void *data)
+{
+ librbd::ImageCtx *src = (librbd::ImageCtx *)srcp;
+ librbd::ImageCtx *dest = (librbd::ImageCtx *)destp;
+ tracepoint(librbd, copy2_enter, src, src->name.c_str(), src->snap_name.c_str(), src->read_only, dest, dest->name.c_str(), dest->snap_name.c_str(), dest->read_only);
+ librbd::CProgressContext prog_ctx(fn, data);
+ int ret = librbd::copy(src, dest, prog_ctx, 0);
+ tracepoint(librbd, copy2_exit, ret);
+ return ret;
+}
+
+extern "C" int rbd_copy_with_progress3(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t fn, void *data)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, dest_opts);
+ librbd::ImageOptions dest_opts_(dest_opts);
+ librbd::CProgressContext prog_ctx(fn, data);
+ int ret = librbd::copy(ictx, dest_io_ctx, destname, dest_opts_, prog_ctx, 0);
+ tracepoint(librbd, copy3_exit, ret);
+ return ret;
+}
+
+extern "C" int rbd_copy_with_progress4(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t fn, void *data, size_t sparse_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, dest_opts, sparse_size);
+ librbd::ImageOptions dest_opts_(dest_opts);
+ librbd::CProgressContext prog_ctx(fn, data);
+ int ret = librbd::copy(ictx, dest_io_ctx, destname, dest_opts_, prog_ctx, sparse_size);
+ tracepoint(librbd, copy4_exit, ret);
+ return ret;
+}
+
+extern "C" int rbd_deep_copy(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname, rbd_image_options_t c_opts)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only,
+ dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+ destname, c_opts);
+ librbd::ImageOptions opts(c_opts);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts,
+ prog_ctx);
+ tracepoint(librbd, deep_copy_exit, r);
+ return r;
+}
+
+extern "C" int rbd_deep_copy_with_progress(rbd_image_t image,
+ rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t fn, void *data)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only,
+ dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+ destname, dest_opts);
+ librbd::ImageOptions opts(dest_opts);
+ librbd::CProgressContext prog_ctx(fn, data);
+ int ret = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts,
+ prog_ctx);
+ tracepoint(librbd, deep_copy_exit, ret);
+ return ret;
+}
+
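+// The trailing 'true' marks these calls as coming from the C API so the
+// encryption options are decoded as the C structs from rbd/librbd.h rather
+// than their C++ counterparts.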
+extern "C" int rbd_encryption_format(rbd_image_t image,
+ rbd_encryption_format_t format,
+ rbd_encryption_options_t opts,
+ size_t opts_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Image<>::encryption_format(
+ ictx, format, opts, opts_size, true);
+}
+
+extern "C" int rbd_encryption_load(rbd_image_t image,
+ rbd_encryption_format_t format,
+ rbd_encryption_options_t opts,
+ size_t opts_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Image<>::encryption_load(
+ ictx, format, opts, opts_size, true);
+}
+
+extern "C" int rbd_flatten(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+ librbd::NoOpProgressContext prog_ctx;
+ int r = ictx->operations->flatten(prog_ctx);
+ tracepoint(librbd, flatten_exit, r);
+ return r;
+}
+
+extern "C" int rbd_flatten_with_progress(rbd_image_t image,
+ librbd_progress_fn_t cb, void *cbdata)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = ictx->operations->flatten(prog_ctx);
+ tracepoint(librbd, flatten_exit, r);
+ return r;
+}
+
+extern "C" int rbd_sparsify(rbd_image_t image, size_t sparse_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+ ictx->id.c_str());
+ librbd::NoOpProgressContext prog_ctx;
+ int r = ictx->operations->sparsify(sparse_size, prog_ctx);
+ tracepoint(librbd, sparsify_exit, r);
+ return r;
+}
+
+extern "C" int rbd_sparsify_with_progress(rbd_image_t image, size_t sparse_size,
+ librbd_progress_fn_t cb, void *cbdata)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+ ictx->id.c_str());
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = ictx->operations->sparsify(sparse_size, prog_ctx);
+ tracepoint(librbd, sparsify_exit, r);
+ return r;
+}
+
+extern "C" int rbd_rename(rados_ioctx_t src_p, const char *srcname,
+ const char *destname)
+{
+ librados::IoCtx src_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(src_p, src_io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(src_io_ctx));
+ tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname);
+ int r = librbd::rename(src_io_ctx, srcname, destname);
+ tracepoint(librbd, rename_exit, r);
+ return r;
+}
+
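+/* image migration */
+// prepare, execute and commit (or abort) drive an image migration through
+// its successive states.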
+extern "C" int rbd_migration_prepare(rados_ioctx_t p, const char *image_name,
+ rados_ioctx_t dest_p,
+ const char *dest_image_name,
+ rbd_image_options_t opts_)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ tracepoint(librbd, migration_prepare_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name, dest_io_ctx.get_pool_name().c_str(),
+ dest_io_ctx.get_id(), dest_image_name, opts_);
+ librbd::ImageOptions opts(opts_);
+ int r = librbd::api::Migration<>::prepare(io_ctx, image_name, dest_io_ctx,
+ dest_image_name, opts);
+ tracepoint(librbd, migration_prepare_exit, r);
+ return r;
+}
+
+extern "C" int rbd_migration_prepare_import(
+ const char *source_spec, rados_ioctx_t dest_p,
+ const char *dest_image_name, rbd_image_options_t opts_) {
+ librados::IoCtx dest_io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+ librbd::ImageOptions opts(opts_);
+ return librbd::api::Migration<>::prepare_import(source_spec, dest_io_ctx,
+ dest_image_name, opts);
+}
+
+extern "C" int rbd_migration_execute(rados_ioctx_t p, const char *image_name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_execute_exit, r);
+ return r;
+}
+
+extern "C" int rbd_migration_execute_with_progress(rados_ioctx_t p,
+ const char *name,
+ librbd_progress_fn_t fn,
+ void *data)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), name);
+ librbd::CProgressContext prog_ctx(fn, data);
+ int r = librbd::api::Migration<>::execute(io_ctx, name, prog_ctx);
+ tracepoint(librbd, migration_execute_exit, r);
+ return r;
+}
+
+extern "C" int rbd_migration_abort(rados_ioctx_t p, const char *image_name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_abort_exit, r);
+ return r;
+}
+
+extern "C" int rbd_migration_abort_with_progress(rados_ioctx_t p,
+ const char *name,
+ librbd_progress_fn_t fn,
+ void *data)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), name);
+ librbd::CProgressContext prog_ctx(fn, data);
+ int r = librbd::api::Migration<>::abort(io_ctx, name, prog_ctx);
+ tracepoint(librbd, migration_abort_exit, r);
+ return r;
+}
+
+extern "C" int rbd_migration_commit(rados_ioctx_t p, const char *image_name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx);
+ tracepoint(librbd, migration_commit_exit, r);
+ return r;
+}
+
+extern "C" int rbd_migration_commit_with_progress(rados_ioctx_t p,
+ const char *name,
+ librbd_progress_fn_t fn,
+ void *data)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), name);
+ librbd::CProgressContext prog_ctx(fn, data);
+ int r = librbd::api::Migration<>::commit(io_ctx, name, prog_ctx);
+ tracepoint(librbd, migration_commit_exit, r);
+ return r;
+}
+
+extern "C" int rbd_migration_status(rados_ioctx_t p, const char *image_name,
+ rbd_image_migration_status_t *status,
+ size_t status_size)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, migration_status_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), image_name);
+
+ if (status_size != sizeof(rbd_image_migration_status_t)) {
+ tracepoint(librbd, migration_status_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ librbd::image_migration_status_t cpp_status;
+ int r = librbd::api::Migration<>::status(io_ctx, image_name, &cpp_status);
+ if (r >= 0) {
+ status->source_pool_id = cpp_status.source_pool_id;
+ status->source_pool_namespace =
+ strdup(cpp_status.source_pool_namespace.c_str());
+ status->source_image_name = strdup(cpp_status.source_image_name.c_str());
+ status->source_image_id = strdup(cpp_status.source_image_id.c_str());
+ status->dest_pool_id = cpp_status.dest_pool_id;
+ status->dest_pool_namespace =
+ strdup(cpp_status.dest_pool_namespace.c_str());
+ status->dest_image_name = strdup(cpp_status.dest_image_name.c_str());
+ status->dest_image_id = strdup(cpp_status.dest_image_id.c_str());
+ status->state = cpp_status.state;
+ status->state_description = strdup(cpp_status.state_description.c_str());
+ }
+
+ tracepoint(librbd, migration_status_exit, r);
+ return r;
+}
+
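+// Releases the strings strdup()ed into rbd_image_migration_status_t by
+// rbd_migration_status().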
+extern "C" void rbd_migration_status_cleanup(rbd_image_migration_status_t *s)
+{
+ free(s->source_pool_namespace);
+ free(s->source_image_name);
+ free(s->source_image_id);
+ free(s->dest_pool_namespace);
+ free(s->dest_image_name);
+ free(s->dest_image_id);
+ free(s->state_description);
+}
+
+extern "C" int rbd_pool_metadata_get(rados_ioctx_t p, const char *key,
+ char *value, size_t *vallen)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  string val_s;
+  int r = librbd::api::PoolMetadata<>::get(io_ctx, key, &val_s);
+  if (r < 0) {
+    return r;
+  }
+  if (*vallen < val_s.size() + 1) {
+    r = -ERANGE;
+    *vallen = val_s.size() + 1;
+  } else {
+    strncpy(value, val_s.c_str(), val_s.size() + 1);
+  }
+
+ return r;
+}
+
+extern "C" int rbd_pool_metadata_set(rados_ioctx_t p, const char *key,
+ const char *value)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ int r = librbd::api::PoolMetadata<>::set(io_ctx, key, value);
+ return r;
+}
+
+extern "C" int rbd_pool_metadata_remove(rados_ioctx_t p, const char *key)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ int r = librbd::api::PoolMetadata<>::remove(io_ctx, key);
+ return r;
+}
+
+extern "C" int rbd_pool_metadata_list(rados_ioctx_t p, const char *start,
+ uint64_t max, char *key, size_t *key_len,
+ char *value, size_t *val_len)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ map<string, bufferlist> pairs;
+ int r = librbd::api::PoolMetadata<>::list(io_ctx, start, max, &pairs);
+ if (r < 0) {
+ return r;
+ }
+ size_t key_total_len = 0, val_total_len = 0;
+ for (auto &it : pairs) {
+ key_total_len += it.first.size() + 1;
+ val_total_len += it.second.length() + 1;
+ }
+ if (*key_len < key_total_len || *val_len < val_total_len) {
+ *key_len = key_total_len;
+ *val_len = val_total_len;
+ return -ERANGE;
+ }
+ *key_len = key_total_len;
+ *val_len = val_total_len;
+
+ char *key_p = key, *value_p = value;
+ for (auto &it : pairs) {
+ strncpy(key_p, it.first.c_str(), it.first.size() + 1);
+ key_p += it.first.size() + 1;
+ strncpy(value_p, it.second.c_str(), it.second.length());
+ value_p += it.second.length();
+ *value_p = '\0';
+ value_p++;
+ }
+ return 0;
+}
+
+extern "C" int rbd_config_pool_list(rados_ioctx_t p,
+ rbd_config_option_t *options,
+ int *max_options) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+ std::vector<librbd::config_option_t> option_vector;
+ int r = librbd::api::Config<>::list(io_ctx, &option_vector);
+ if (r < 0) {
+ return r;
+ }
+
+ if (*max_options < static_cast<int>(option_vector.size())) {
+ *max_options = static_cast<int>(option_vector.size());
+ return -ERANGE;
+ }
+
+ for (int i = 0; i < static_cast<int>(option_vector.size()); ++i) {
+ config_option_cpp_to_c(option_vector[i], &options[i]);
+ }
+ *max_options = static_cast<int>(option_vector.size());
+ return 0;
+}
+
+extern "C" void rbd_config_pool_list_cleanup(rbd_config_option_t *options,
+ int max_options) {
+ for (int i = 0; i < max_options; ++i) {
+ config_option_cleanup(options[i]);
+ }
+}
+
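+// Image open variants: by name or by id, read/write or read-only, and
+// synchronous or asynchronous. The rbd_aio_* variants return 0 immediately
+// and report the actual open result through the supplied completion.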
+extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image,
+ const char *snap_name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
+ false);
+ tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+ int r = ictx->state->open(0);
+ if (r >= 0) {
+ *image = (rbd_image_t)ictx;
+ }
+ tracepoint(librbd, open_image_exit, r);
+ return r;
+}
+
+extern "C" int rbd_open_by_id(rados_ioctx_t p, const char *id,
+ rbd_image_t *image, const char *snap_name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx,
+ false);
+ tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(),
+ ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+ int r = ictx->state->open(0);
+ if (r >= 0) {
+ *image = (rbd_image_t)ictx;
+ }
+ tracepoint(librbd, open_image_exit, r);
+ return r;
+}
+
+extern "C" int rbd_aio_open(rados_ioctx_t p, const char *name,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
+ false);
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc);
+ ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp),
+ image));
+ tracepoint(librbd, aio_open_image_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_open_by_id(rados_ioctx_t p, const char *id,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx,
+ false);
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(),
+ ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only,
+ comp->pc);
+ ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp),
+ image));
+ tracepoint(librbd, aio_open_image_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_open_read_only(rados_ioctx_t p, const char *name,
+ rbd_image_t *image, const char *snap_name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
+ true);
+ tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+ int r = ictx->state->open(0);
+ if (r >= 0) {
+ *image = (rbd_image_t)ictx;
+ }
+ tracepoint(librbd, open_image_exit, r);
+ return r;
+}
+
+extern "C" int rbd_open_by_id_read_only(rados_ioctx_t p, const char *id,
+ rbd_image_t *image, const char *snap_name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx,
+ true);
+ tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(),
+ ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+
+ int r = ictx->state->open(0);
+ if (r >= 0) {
+ *image = (rbd_image_t)ictx;
+ }
+ tracepoint(librbd, open_image_exit, r);
+ return r;
+}
+
+extern "C" int rbd_aio_open_read_only(rados_ioctx_t p, const char *name,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
+ true);
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc);
+ ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp),
+ image));
+ tracepoint(librbd, aio_open_image_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_open_by_id_read_only(rados_ioctx_t p, const char *id,
+ rbd_image_t *image,
+ const char *snap_name,
+ rbd_completion_t c)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx,
+ true);
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(),
+ ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc);
+ ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp),
+ image));
+ tracepoint(librbd, aio_open_image_exit, 0);
+ return 0;
+}
+
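+// Conversions between the feature bitmask and its comma-separated string
+// form; if conversion fails in either direction the functions return
+// -EINVAL.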
+extern "C" int rbd_features_to_string(uint64_t features, char *str_features, size_t *size)
+{
+ std::stringstream err;
+ std::string get_str_features = librbd::rbd_features_to_string(features, &err);
+ if (!err.str().empty()) {
+ return -EINVAL;
+ }
+ uint64_t expected_size = get_str_features.size();
+ if (*size <= expected_size) {
+ *size = expected_size + 1;
+ return -ERANGE;
+ }
+ strncpy(str_features, get_str_features.c_str(), expected_size);
+ str_features[expected_size] = '\0';
+ *size = expected_size + 1;
+ return 0;
+}
+
+extern "C" int rbd_features_from_string(const char *str_features, uint64_t *features)
+{
+ std::stringstream err;
+ *features = librbd::rbd_features_from_string(str_features, &err);
+ if (!err.str().empty()) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+extern "C" int rbd_close(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+
+ int r = ictx->state->close();
+
+ tracepoint(librbd, close_image_exit, r);
+ return r;
+}
+
+extern "C" int rbd_aio_close(rbd_image_t image, rbd_completion_t c)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), comp->pc);
+ ictx->state->close(new C_AioCompletion(ictx, librbd::io::AIO_TYPE_CLOSE,
+ get_aio_completion(comp)));
+ tracepoint(librbd, aio_close_image_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_resize(rbd_image_t image, uint64_t size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = ictx->operations->resize(size, true, prog_ctx);
+ tracepoint(librbd, resize_exit, r);
+ return r;
+}
+
+extern "C" int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink,
+ librbd_progress_fn_t cb, void *cbdata)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size);
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = ictx->operations->resize(size, allow_shrink, prog_ctx);
+ tracepoint(librbd, resize_exit, r);
+ return r;
+}
+
+extern "C" int rbd_resize_with_progress(rbd_image_t image, uint64_t size,
+ librbd_progress_fn_t cb, void *cbdata)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size);
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = ictx->operations->resize(size, true, prog_ctx);
+ tracepoint(librbd, resize_exit, r);
+ return r;
+}
+
+extern "C" int rbd_stat(rbd_image_t image, rbd_image_info_t *info,
+ size_t infosize)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, stat_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::info(ictx, *info, infosize);
+ tracepoint(librbd, stat_exit, r, info);
+ return r;
+}
+
+extern "C" int rbd_get_old_format(rbd_image_t image, uint8_t *old)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_old_format_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::get_old_format(ictx, old);
+ tracepoint(librbd, get_old_format_exit, r, *old);
+ return r;
+}
+
+extern "C" int rbd_get_size(rbd_image_t image, uint64_t *size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_size_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::get_size(ictx, size);
+ tracepoint(librbd, get_size_exit, r, *size);
+ return r;
+}
+
+extern "C" int rbd_get_features(rbd_image_t image, uint64_t *features)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_features_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::get_features(ictx, features);
+ tracepoint(librbd, get_features_exit, r, *features);
+ return r;
+}
+
+extern "C" int rbd_update_features(rbd_image_t image, uint64_t features,
+ uint8_t enabled)
+{
+ librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+ bool features_enabled = enabled != 0;
+ tracepoint(librbd, update_features_enter, ictx, features, features_enabled);
+ int r = ictx->operations->update_features(features, features_enabled);
+ tracepoint(librbd, update_features_exit, r);
+ return r;
+}
+
+extern "C" int rbd_get_op_features(rbd_image_t image, uint64_t *op_features)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Image<>::get_op_features(ictx, op_features);
+}
+
+extern "C" int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_stripe_unit_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ *stripe_unit = ictx->get_stripe_unit();
+ tracepoint(librbd, get_stripe_unit_exit, 0, *stripe_unit);
+ return 0;
+}
+
+extern "C" int rbd_get_stripe_count(rbd_image_t image, uint64_t *stripe_count)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_stripe_count_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ *stripe_count = ictx->get_stripe_count();
+ tracepoint(librbd, get_stripe_count_exit, 0, *stripe_count);
+ return 0;
+}
+
+extern "C" int rbd_get_create_timestamp(rbd_image_t image,
+ struct timespec *timestamp)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_create_timestamp_enter, ictx, ictx->name.c_str(),
+ ictx->read_only);
+ utime_t time = ictx->get_create_timestamp();
+ time.to_timespec(timestamp);
+ tracepoint(librbd, get_create_timestamp_exit, 0, timestamp);
+ return 0;
+}
+
+extern "C" int rbd_get_access_timestamp(rbd_image_t image,
+ struct timespec *timestamp)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_access_timestamp_enter, ictx, ictx->name.c_str(),
+ ictx->read_only);
+ utime_t time = ictx->get_access_timestamp();
+ time.to_timespec(timestamp);
+ tracepoint(librbd, get_access_timestamp_exit, 0, timestamp);
+ return 0;
+}
+
+extern "C" int rbd_get_modify_timestamp(rbd_image_t image,
+ struct timespec *timestamp)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_modify_timestamp_enter, ictx, ictx->name.c_str(),
+ ictx->read_only);
+ utime_t time = ictx->get_modify_timestamp();
+ time.to_timespec(timestamp);
+ tracepoint(librbd, get_modify_timestamp_exit, 0, timestamp);
+ return 0;
+}
+
+extern "C" int rbd_get_overlap(rbd_image_t image, uint64_t *overlap)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_overlap_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::get_overlap(ictx, overlap);
+ tracepoint(librbd, get_overlap_exit, r, *overlap);
+ return r;
+}
+
+extern "C" int rbd_get_name(rbd_image_t image, char *name, size_t *name_len)
+{
+ librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+ if (*name_len <= ictx->name.size()) {
+ *name_len = ictx->name.size() + 1;
+ return -ERANGE;
+ }
+
+ strncpy(name, ictx->name.c_str(), ictx->name.size());
+ name[ictx->name.size()] = '\0';
+ *name_len = ictx->name.size() + 1;
+ return 0;
+}
+
+extern "C" int rbd_get_id(rbd_image_t image, char *id, size_t id_len)
+{
+ librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+ if (ictx->old_format) {
+ return -EINVAL;
+ }
+ if (ictx->id.size() >= id_len) {
+ return -ERANGE;
+ }
+
+ strncpy(id, ictx->id.c_str(), id_len - 1);
+ id[id_len - 1] = '\0';
+ return 0;
+}
+
+extern "C" int rbd_get_block_name_prefix(rbd_image_t image, char *prefix,
+ size_t prefix_len)
+{
+ librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+ if (ictx->object_prefix.size() >= prefix_len) {
+ return -ERANGE;
+ }
+
+ strncpy(prefix, ictx->object_prefix.c_str(), prefix_len - 1);
+ prefix[prefix_len - 1] = '\0';
+ return 0;
+}
+
+extern "C" int64_t rbd_get_data_pool_id(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+ return librbd::api::Image<>::get_data_pool_id(ictx);
+}
+
+extern "C" int rbd_get_parent_info(rbd_image_t image,
+ char *parent_pool_name, size_t ppool_namelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_snap_name, size_t psnap_namelen)
+{
+ auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+
+ librbd::linked_image_spec_t parent_image;
+ librbd::snap_spec_t parent_snap;
+ int r = librbd::api::Image<>::get_parent(ictx, &parent_image, &parent_snap);
+ if (r >= 0) {
+ if (parent_pool_name) {
+ if (parent_image.pool_name.length() + 1 > ppool_namelen) {
+ r = -ERANGE;
+ } else {
+ strcpy(parent_pool_name, parent_image.pool_name.c_str());
+ }
+ }
+ if (parent_name) {
+ if (parent_image.image_name.length() + 1 > pnamelen) {
+ r = -ERANGE;
+ } else {
+ strcpy(parent_name, parent_image.image_name.c_str());
+ }
+ }
+ if (parent_snap_name) {
+ if (parent_snap.name.length() + 1 > psnap_namelen) {
+ r = -ERANGE;
+ } else {
+ strcpy(parent_snap_name, parent_snap.name.c_str());
+ }
+ }
+ }
+
+ if (r < 0) {
+ tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL);
+ return r;
+ }
+
+ tracepoint(librbd, get_parent_info_exit, r,
+ parent_image.pool_name.c_str(),
+ parent_image.image_name.c_str(),
+ parent_image.image_id.c_str(),
+ parent_snap.name.c_str());
+ return 0;
+}
+
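+// Identical to rbd_get_parent_info() except that it also returns the parent
+// image id.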
+extern "C" int rbd_get_parent_info2(rbd_image_t image,
+ char *parent_pool_name,
+ size_t ppool_namelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_id, size_t pidlen,
+ char *parent_snap_name,
+ size_t psnap_namelen)
+{
+ auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+
+ librbd::linked_image_spec_t parent_image;
+ librbd::snap_spec_t parent_snap;
+ int r = librbd::api::Image<>::get_parent(ictx, &parent_image, &parent_snap);
+ if (r >= 0) {
+ if (parent_pool_name) {
+ if (parent_image.pool_name.length() + 1 > ppool_namelen) {
+ r = -ERANGE;
+ } else {
+ strcpy(parent_pool_name, parent_image.pool_name.c_str());
+ }
+ }
+ if (parent_name) {
+ if (parent_image.image_name.length() + 1 > pnamelen) {
+ r = -ERANGE;
+ } else {
+ strcpy(parent_name, parent_image.image_name.c_str());
+ }
+ }
+ if (parent_id) {
+ if (parent_image.image_id.length() + 1 > pidlen) {
+ r = -ERANGE;
+ } else {
+ strcpy(parent_id, parent_image.image_id.c_str());
+ }
+ }
+ if (parent_snap_name) {
+ if (parent_snap.name.length() + 1 > psnap_namelen) {
+ r = -ERANGE;
+ } else {
+ strcpy(parent_snap_name, parent_snap.name.c_str());
+ }
+ }
+ }
+
+ if (r < 0) {
+ tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL);
+ return r;
+ }
+
+ tracepoint(librbd, get_parent_info_exit, r,
+ parent_image.pool_name.c_str(),
+ parent_image.image_name.c_str(),
+ parent_image.image_id.c_str(),
+ parent_snap.name.c_str());
+ return 0;
+}
+
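+// On success the strings in *parent_image and *parent_snap are strdup()ed
+// and owned by the caller; on failure both structs are zeroed.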
+extern "C" int rbd_get_parent(rbd_image_t image,
+ rbd_linked_image_spec_t *parent_image,
+ rbd_snap_spec_t *parent_snap)
+{
+ auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+
+ librbd::linked_image_spec_t cpp_parent_image;
+ librbd::snap_spec_t cpp_parent_snap;
+ int r = librbd::api::Image<>::get_parent(ictx, &cpp_parent_image,
+ &cpp_parent_snap);
+ if (r < 0) {
+ // FIPS zeroization audit 20191117: these memsets are not security related.
+ memset(parent_image, 0, sizeof(rbd_linked_image_spec_t));
+ memset(parent_snap, 0, sizeof(rbd_snap_spec_t));
+ } else {
+ *parent_image = {
+ .pool_id = cpp_parent_image.pool_id,
+ .pool_name = strdup(cpp_parent_image.pool_name.c_str()),
+ .pool_namespace = strdup(cpp_parent_image.pool_namespace.c_str()),
+ .image_id = strdup(cpp_parent_image.image_id.c_str()),
+ .image_name = strdup(cpp_parent_image.image_name.c_str()),
+ .trash = cpp_parent_image.trash};
+ *parent_snap = {
+ .id = cpp_parent_snap.id,
+ .namespace_type = cpp_parent_snap.namespace_type,
+ .name = strdup(cpp_parent_snap.name.c_str())};
+ }
+
+ tracepoint(librbd, get_parent_info_exit, r,
+ parent_image->pool_name,
+ parent_image->image_name,
+ parent_image->image_id,
+ parent_snap->name);
+ return r;
+}
+
+extern "C" int rbd_get_migration_source_spec(rbd_image_t image,
+ char* source_spec,
+ size_t* max_len)
+{
+ auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+
+ std::string cpp_source_spec;
+ int r = librbd::api::Migration<>::get_source_spec(ictx, &cpp_source_spec);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t expected_size = cpp_source_spec.size();
+ if (expected_size >= *max_len) {
+ *max_len = expected_size + 1;
+ return -ERANGE;
+ }
+
+ strncpy(source_spec, cpp_source_spec.c_str(), expected_size);
+ source_spec[expected_size] = '\0';
+ *max_len = expected_size + 1;
+
+ return 0;
+}
+
+extern "C" int rbd_get_flags(rbd_image_t image, uint64_t *flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, get_flags_enter, ictx);
+ int r = librbd::get_flags(ictx, flags);
+ tracepoint(librbd, get_flags_exit, ictx, r, *flags);
+ return r;
+}
+
+extern "C" int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info,
+ size_t group_info_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, image_get_group_enter, ictx->name.c_str());
+
+ if (group_info_size != sizeof(rbd_group_info_t)) {
+ tracepoint(librbd, image_get_group_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ librbd::group_info_t cpp_group_info;
+ int r = librbd::api::Group<>::image_get_group(ictx, &cpp_group_info);
+ if (r >= 0) {
+ group_info_cpp_to_c(cpp_group_info, group_info);
+ } else {
+ group_info->name = NULL;
+ }
+
+ tracepoint(librbd, image_get_group_exit, r);
+ return r;
+}
+
+extern "C" int rbd_set_image_notification(rbd_image_t image, int fd, int type)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, set_image_notification_enter, ictx, fd, type);
+ int r = librbd::set_image_notification(ictx, fd, type);
+ tracepoint(librbd, set_image_notification_exit, ictx, r);
+ return r;
+}
+
+extern "C" int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, is_exclusive_lock_owner_enter, ictx);
+ bool owner;
+ int r = librbd::is_exclusive_lock_owner(ictx, &owner);
+ *is_owner = owner ? 1 : 0;
+ tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner);
+ return r;
+}
+
+extern "C" int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, lock_acquire_enter, ictx, lock_mode);
+ int r = librbd::lock_acquire(ictx, lock_mode);
+ tracepoint(librbd, lock_acquire_exit, ictx, r);
+ return r;
+}
+
+extern "C" int rbd_lock_release(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, lock_release_enter, ictx);
+ int r = librbd::lock_release(ictx);
+ tracepoint(librbd, lock_release_exit, ictx, r);
+ return r;
+}
+
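+// On success *max_lock_owners holds the number of strdup()ed entries written
+// to lock_owners; free them with rbd_lock_get_owners_cleanup(). If the array
+// is too small, the required count is returned in *max_lock_owners with
+// -ERANGE.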
+extern "C" int rbd_lock_get_owners(rbd_image_t image,
+ rbd_lock_mode_t *lock_mode,
+ char **lock_owners,
+ size_t *max_lock_owners)
+{
+ librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ tracepoint(librbd, lock_get_owners_enter, ictx);
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(lock_owners, 0, sizeof(*lock_owners) * *max_lock_owners);
+ std::list<std::string> lock_owner_list;
+ int r = librbd::lock_get_owners(ictx, lock_mode, &lock_owner_list);
+ if (r >= 0) {
+ if (*max_lock_owners >= lock_owner_list.size()) {
+ *max_lock_owners = 0;
+ for (auto &lock_owner : lock_owner_list) {
+ lock_owners[(*max_lock_owners)++] = strdup(lock_owner.c_str());
+ }
+ } else {
+ *max_lock_owners = lock_owner_list.size();
+ r = -ERANGE;
+ }
+ }
+ tracepoint(librbd, lock_get_owners_exit, ictx, r);
+ return r;
+}
+
+extern "C" void rbd_lock_get_owners_cleanup(char **lock_owners,
+ size_t lock_owner_count)
+{
+ for (size_t i = 0; i < lock_owner_count; ++i) {
+ free(lock_owners[i]);
+ }
+}
+
+extern "C" int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode,
+ const char *lock_owner)
+{
+ librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner);
+ int r = librbd::lock_break(ictx, lock_mode, lock_owner);
+ tracepoint(librbd, lock_break_exit, ictx, r);
+ return r;
+}
+
+extern "C" int rbd_rebuild_object_map(rbd_image_t image,
+ librbd_progress_fn_t cb, void *cbdata)
+{
+ librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ return ictx->operations->rebuild_object_map(prog_ctx);
+}
+
+/* snapshots */
+extern "C" int rbd_snap_create(rbd_image_t image, const char *snap_name)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ auto flags = librbd::util::get_default_snap_create_flags(ictx);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx);
+ tracepoint(librbd, snap_create_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_create2(rbd_image_t image, const char *snap_name,
+ uint32_t flags, librbd_progress_fn_t cb,
+ void *cbdata)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx);
+ tracepoint(librbd, snap_create_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_rename(rbd_image_t image, const char *srcname, const char *dstname)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, srcname, dstname);
+ int r = ictx->operations->snap_rename(srcname, dstname);
+ tracepoint(librbd, snap_rename_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_remove(rbd_image_t image, const char *snap_name)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_remove_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Snapshot<>::remove(ictx, snap_name, 0, prog_ctx);
+ tracepoint(librbd, snap_remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_remove2(rbd_image_t image, const char *snap_name, uint32_t flags,
+ librbd_progress_fn_t cb, void *cbdata)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_remove2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name, flags);
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = librbd::api::Snapshot<>::remove(ictx, snap_name, flags, prog_ctx);
+ tracepoint(librbd, snap_remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Snapshot<>::remove(ictx, snap_id);
+}
+
+extern "C" int rbd_snap_rollback(rbd_image_t image, const char *snap_name)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ librbd::NoOpProgressContext prog_ctx;
+ int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+ tracepoint(librbd, snap_rollback_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_rollback_with_progress(rbd_image_t image,
+ const char *snap_name,
+ librbd_progress_fn_t cb,
+ void *cbdata)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+ tracepoint(librbd, snap_rollback_exit, r);
+ return r;
+}
+
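+// Fills 'snaps' with up to *max_snaps entries followed by a zeroed
+// terminator entry; snapshot names are strdup()ed and must be released with
+// rbd_snap_list_end(). Returns the number of snapshots, or -ERANGE with the
+// required array size in *max_snaps.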
+extern "C" int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
+ int *max_snaps)
+{
+ vector<librbd::snap_info_t> cpp_snaps;
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_list_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snaps);
+
+ if (!max_snaps) {
+ tracepoint(librbd, snap_list_exit, -EINVAL, 0);
+ return -EINVAL;
+ }
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(snaps, 0, sizeof(*snaps) * *max_snaps);
+
+ int r = librbd::api::Snapshot<>::list(ictx, cpp_snaps);
+ if (r == -ENOENT) {
+ tracepoint(librbd, snap_list_exit, 0, *max_snaps);
+ return 0;
+ }
+ if (r < 0) {
+ tracepoint(librbd, snap_list_exit, r, *max_snaps);
+ return r;
+ }
+ if (*max_snaps < (int)cpp_snaps.size() + 1) {
+ *max_snaps = (int)cpp_snaps.size() + 1;
+ tracepoint(librbd, snap_list_exit, -ERANGE, *max_snaps);
+ return -ERANGE;
+ }
+
+ int i;
+
+ for (i = 0; i < (int)cpp_snaps.size(); i++) {
+ snaps[i].id = cpp_snaps[i].id;
+ snaps[i].size = cpp_snaps[i].size;
+ snaps[i].name = strdup(cpp_snaps[i].name.c_str());
+ if (!snaps[i].name) {
+ for (int j = 0; j < i; j++)
+ free((void *)snaps[j].name);
+ tracepoint(librbd, snap_list_exit, -ENOMEM, *max_snaps);
+ return -ENOMEM;
+ }
+ tracepoint(librbd, snap_list_entry, snaps[i].id, snaps[i].size, snaps[i].name);
+ }
+ snaps[i].id = 0;
+ snaps[i].size = 0;
+ snaps[i].name = NULL;
+
+ r = (int)cpp_snaps.size();
+ tracepoint(librbd, snap_list_exit, r, *max_snaps);
+ return r;
+}
+
+extern "C" void rbd_snap_list_end(rbd_snap_info_t *snaps)
+{
+ tracepoint(librbd, snap_list_end_enter, snaps);
+ while (snaps->name) {
+ free((void *)snaps->name);
+ snaps++;
+ }
+ tracepoint(librbd, snap_list_end_exit);
+}
+
+extern "C" int rbd_snap_protect(rbd_image_t image, const char *snap_name)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_protect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = ictx->operations->snap_protect(cls::rbd::UserSnapshotNamespace(), snap_name);
+ tracepoint(librbd, snap_protect_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_unprotect(rbd_image_t image, const char *snap_name)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_unprotect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name);
+ tracepoint(librbd, snap_unprotect_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_is_protected(rbd_image_t image, const char *snap_name,
+ int *is_protected)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_is_protected_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ bool protected_snap;
+ int r = librbd::api::Snapshot<>::is_protected(ictx, snap_name, &protected_snap);
+  if (r < 0) {
+    // don't report the caller's uninitialized *is_protected in the tracepoint
+    tracepoint(librbd, snap_is_protected_exit, r, 0);
+    return r;
+  }
+ *is_protected = protected_snap ? 1 : 0;
+ tracepoint(librbd, snap_is_protected_exit, 0, *is_protected ? 1 : 0);
+ return 0;
+}
+
+extern "C" int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_get_limit_enter, ictx, ictx->name.c_str());
+ int r = librbd::api::Snapshot<>::get_limit(ictx, limit);
+ tracepoint(librbd, snap_get_limit_exit, r, *limit);
+ return r;
+}
+
+extern "C" int rbd_snap_exists(rbd_image_t image, const char *snapname, bool *exists)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, snapname);
+ int r = librbd::api::Snapshot<>::exists(ictx, cls::rbd::UserSnapshotNamespace(), snapname, exists);
+ tracepoint(librbd, snap_exists_exit, r, *exists);
+ return r;
+}
+
+extern "C" int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_get_timestamp_enter, ictx, ictx->name.c_str());
+ int r = librbd::api::Snapshot<>::get_timestamp(ictx, snap_id, timestamp);
+ tracepoint(librbd, snap_get_timestamp_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_set_limit(rbd_image_t image, uint64_t limit)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_set_limit_enter, ictx, ictx->name.c_str(), limit);
+ int r = librbd::api::Snapshot<>::set_limit(ictx, limit);
+ tracepoint(librbd, snap_set_limit_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_set(rbd_image_t image, const char *snap_name)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_set_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+ int r = librbd::api::Image<>::snap_set(
+ ictx, cls::rbd::UserSnapshotNamespace(), snap_name);
+ tracepoint(librbd, snap_set_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Image<>::snap_set(ictx, snap_id);
+}
+
+extern "C" int rbd_snap_get_name(rbd_image_t image, uint64_t snap_id, char *snapname, size_t *name_len)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ std::string snap_name;
+  int r = librbd::api::Snapshot<>::get_name(ictx, snap_id, &snap_name);
+  if (r < 0) {
+    return r;
+  }
+  size_t expected_size = snap_name.size();
+ if (*name_len <= expected_size) {
+ *name_len = expected_size + 1;
+ return -ERANGE;
+ }
+ strncpy(snapname, snap_name.c_str(), expected_size);
+ snapname[expected_size] = '\0';
+ *name_len = expected_size + 1;
+ return r;
+}
+
+extern "C" int rbd_snap_get_id(rbd_image_t image, const char *snapname, uint64_t *snap_id)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Snapshot<>::get_id(ictx, snapname, snap_id);
+}
+
+extern "C" ssize_t rbd_list_children(rbd_image_t image, char *pools,
+ size_t *pools_len, char *images,
+ size_t *images_len)
+{
+ auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+
+ std::vector<librbd::linked_image_spec_t> cpp_images;
+ int r = librbd::api::Image<>::list_children(ictx, &cpp_images);
+ if (r < 0) {
+ tracepoint(librbd, list_children_exit, r);
+ return r;
+ }
+
+ std::set<std::pair<std::string, std::string>> image_set;
+ for (auto& image : cpp_images) {
+ if (!image.trash) {
+ image_set.insert({image.pool_name, image.image_name});
+ }
+ }
+
+ size_t pools_total = 0;
+ size_t images_total = 0;
+  for (const auto& it : image_set) {
+ pools_total += it.first.length() + 1;
+ images_total += it.second.length() + 1;
+ }
+
+ bool too_short = false;
+ if (pools_total > *pools_len)
+ too_short = true;
+ if (images_total > *images_len)
+ too_short = true;
+ *pools_len = pools_total;
+ *images_len = images_total;
+ if (too_short) {
+ tracepoint(librbd, list_children_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ char *pools_p = pools;
+ char *images_p = images;
+  for (const auto& it : image_set) {
+ const char* pool = it.first.c_str();
+ strcpy(pools_p, pool);
+ pools_p += it.first.length() + 1;
+ const char* image = it.second.c_str();
+ strcpy(images_p, image);
+ images_p += it.second.length() + 1;
+ tracepoint(librbd, list_children_entry, pool, image);
+ }
+
+ ssize_t ret = image_set.size();
+ tracepoint(librbd, list_children_exit, ret);
+ return ret;
+}
+
+extern "C" int rbd_list_children2(rbd_image_t image,
+ rbd_child_info_t *children,
+ int *max_children)
+{
+ auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+  if (!max_children) {
+    tracepoint(librbd, list_children_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(children, 0, sizeof(*children) * *max_children);
+
+ std::vector<librbd::linked_image_spec_t> cpp_children;
+ int r = librbd::api::Image<>::list_children(ictx, &cpp_children);
+ if (r < 0) {
+ tracepoint(librbd, list_children_exit, r);
+ return r;
+ }
+
+ if (*max_children < (int)cpp_children.size() + 1) {
+ *max_children = (int)cpp_children.size() + 1;
+ tracepoint(librbd, list_children_exit, *max_children);
+ return -ERANGE;
+ }
+
+ int i;
+ for (i = 0; i < (int)cpp_children.size(); i++) {
+ children[i].pool_name = strdup(cpp_children[i].pool_name.c_str());
+ children[i].image_name = strdup(cpp_children[i].image_name.c_str());
+ children[i].image_id = strdup(cpp_children[i].image_id.c_str());
+ children[i].trash = cpp_children[i].trash;
+ tracepoint(librbd, list_children_entry, children[i].pool_name,
+ children[i].image_name);
+ }
+ children[i].pool_name = NULL;
+ children[i].image_name = NULL;
+ children[i].image_id = NULL;
+
+ r = (int)cpp_children.size();
+ tracepoint(librbd, list_children_exit, *max_children);
+ return r;
+}
+
+extern "C" void rbd_list_child_cleanup(rbd_child_info_t *child)
+{
+ free((void *)child->pool_name);
+ free((void *)child->image_name);
+ free((void *)child->image_id);
+}
+
+extern "C" void rbd_list_children_cleanup(rbd_child_info_t *children,
+ size_t num_children)
+{
+ for (size_t i=0; i < num_children; i++) {
+ free((void *)children[i].pool_name);
+ free((void *)children[i].image_name);
+ free((void *)children[i].image_id);
+ }
+}
+
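+// rbd_list_children3 supersedes the older variants and also reports the pool
+// id, pool namespace, image id and trash state of each child via
+// rbd_linked_image_spec_t.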
+extern "C" int rbd_list_children3(rbd_image_t image,
+ rbd_linked_image_spec_t *images,
+ size_t *max_images)
+{
+ auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(images, 0, sizeof(*images) * *max_images);
+
+ std::vector<librbd::linked_image_spec_t> cpp_children;
+ int r = librbd::api::Image<>::list_children(ictx, &cpp_children);
+ if (r < 0) {
+ tracepoint(librbd, list_children_exit, r);
+ return r;
+ }
+
+ if (*max_images < cpp_children.size()) {
+ *max_images = cpp_children.size();
+ return -ERANGE;
+ }
+
+ *max_images = cpp_children.size();
+ for (size_t idx = 0; idx < cpp_children.size(); ++idx) {
+ images[idx] = {
+ .pool_id = cpp_children[idx].pool_id,
+ .pool_name = strdup(cpp_children[idx].pool_name.c_str()),
+ .pool_namespace = strdup(cpp_children[idx].pool_namespace.c_str()),
+ .image_id = strdup(cpp_children[idx].image_id.c_str()),
+ .image_name = strdup(cpp_children[idx].image_name.c_str()),
+ .trash = cpp_children[idx].trash};
+ tracepoint(librbd, list_children_entry, images[idx].pool_name,
+ images[idx].image_name);
+ }
+ return 0;
+}
+
+extern "C" int rbd_list_descendants(rbd_image_t image,
+ rbd_linked_image_spec_t *images,
+ size_t *max_images)
+{
+ auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(images, 0, sizeof(*images) * *max_images);
+
+ std::vector<librbd::linked_image_spec_t> cpp_children;
+ int r = librbd::api::Image<>::list_descendants(ictx, {}, &cpp_children);
+ if (r < 0) {
+ return r;
+ }
+
+ if (*max_images < cpp_children.size()) {
+ *max_images = cpp_children.size();
+ return -ERANGE;
+ }
+
+ *max_images = cpp_children.size();
+ for (size_t idx = 0; idx < cpp_children.size(); ++idx) {
+ images[idx] = {
+ .pool_id = cpp_children[idx].pool_id,
+ .pool_name = strdup(cpp_children[idx].pool_name.c_str()),
+ .pool_namespace = strdup(cpp_children[idx].pool_namespace.c_str()),
+ .image_id = strdup(cpp_children[idx].image_id.c_str()),
+ .image_name = strdup(cpp_children[idx].image_name.c_str()),
+ .trash = cpp_children[idx].trash};
+ }
+ return 0;
+}
+
+extern "C" ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive,
+ char *tag, size_t *tag_len,
+ char *clients, size_t *clients_len,
+ char *cookies, size_t *cookies_len,
+ char *addrs, size_t *addrs_len)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, list_lockers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ std::list<librbd::locker_t> lockers;
+ bool exclusive_bool;
+ string tag_str;
+
+ int r = list_lockers(ictx, &lockers, &exclusive_bool, &tag_str);
+ if (r < 0) {
+ tracepoint(librbd, list_lockers_exit, r);
+ return r;
+ }
+
+ ldout(ictx->cct, 20) << "list_lockers r = " << r << " lockers.size() = " << lockers.size() << dendl;
+
+ *exclusive = (int)exclusive_bool;
+ size_t clients_total = 0;
+ size_t cookies_total = 0;
+ size_t addrs_total = 0;
+ for (list<librbd::locker_t>::const_iterator it = lockers.begin();
+ it != lockers.end(); ++it) {
+ clients_total += it->client.length() + 1;
+ cookies_total += it->cookie.length() + 1;
+ addrs_total += it->address.length() + 1;
+ }
+
+ bool too_short = ((clients_total > *clients_len) ||
+ (cookies_total > *cookies_len) ||
+ (addrs_total > *addrs_len) ||
+ (tag_str.length() + 1 > *tag_len));
+ *clients_len = clients_total;
+ *cookies_len = cookies_total;
+ *addrs_len = addrs_total;
+ *tag_len = tag_str.length() + 1;
+ if (too_short) {
+ tracepoint(librbd, list_lockers_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ strcpy(tag, tag_str.c_str());
+ char *clients_p = clients;
+ char *cookies_p = cookies;
+ char *addrs_p = addrs;
+ for (list<librbd::locker_t>::const_iterator it = lockers.begin();
+ it != lockers.end(); ++it) {
+ const char* client = it->client.c_str();
+ strcpy(clients_p, client);
+ clients_p += it->client.length() + 1;
+ const char* cookie = it->cookie.c_str();
+ strcpy(cookies_p, cookie);
+ cookies_p += it->cookie.length() + 1;
+ const char* address = it->address.c_str();
+ strcpy(addrs_p, address);
+ addrs_p += it->address.length() + 1;
+ tracepoint(librbd, list_lockers_entry, client, cookie, address);
+ }
+
+ ssize_t ret = lockers.size();
+ tracepoint(librbd, list_lockers_exit, ret);
+ return ret;
+}
+
+extern "C" int rbd_lock_exclusive(rbd_image_t image, const char *cookie)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, lock_exclusive_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie);
+ int r = librbd::lock(ictx, true, cookie ? cookie : "", "");
+ tracepoint(librbd, lock_exclusive_exit, r);
+ return r;
+}
+
+extern "C" int rbd_lock_shared(rbd_image_t image, const char *cookie,
+ const char *tag)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, lock_shared_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie, tag);
+ int r = librbd::lock(ictx, false, cookie ? cookie : "", tag ? tag : "");
+ tracepoint(librbd, lock_shared_exit, r);
+ return r;
+}
+
+extern "C" int rbd_unlock(rbd_image_t image, const char *cookie)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, unlock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie);
+ int r = librbd::unlock(ictx, cookie ? cookie : "");
+ tracepoint(librbd, unlock_exit, r);
+ return r;
+}
+
+extern "C" int rbd_break_lock(rbd_image_t image, const char *client,
+ const char *cookie)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, break_lock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, client, cookie);
+ int r = librbd::break_lock(ictx, client, cookie ? cookie : "");
+ tracepoint(librbd, break_lock_exit, r);
+ return r;
+}
+
+/* I/O */
+extern "C" ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len,
+ char *buf)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+ int r = librbd::api::Io<>::read(
+ *ictx, ofs, len, librbd::io::ReadResult{buf, len}, 0);
+ tracepoint(librbd, read_exit, r);
+ return r;
+}
+
+extern "C" ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
+ char *buf, int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, read2_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, ofs, len, op_flags);
+ int r = librbd::api::Io<>::read(
+ *ictx, ofs, len, librbd::io::ReadResult{buf, len}, op_flags);
+ tracepoint(librbd, read_exit, r);
+ return r;
+}
+
+
+extern "C" int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, read_iterate_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+ int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg);
+ tracepoint(librbd, read_iterate_exit, r);
+ return r;
+}
+
+extern "C" int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, read_iterate2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+ int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg);
+ if (r > 0)
+ r = 0;
+ tracepoint(librbd, read_iterate2_exit, r);
+ return (int)r;
+}
+
+extern "C" int rbd_diff_iterate(rbd_image_t image,
+ const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+ true, false);
+ int r = librbd::api::DiffIterate<>::diff_iterate(ictx,
+ cls::rbd::UserSnapshotNamespace(),
+ fromsnapname, ofs, len,
+ true, false, cb, arg);
+ tracepoint(librbd, diff_iterate_exit, r);
+ return r;
+}
+
+extern "C" int rbd_diff_iterate2(rbd_image_t image, const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ uint8_t include_parent, uint8_t whole_object,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+ include_parent != 0, whole_object != 0);
+ int r = librbd::api::DiffIterate<>::diff_iterate(ictx,
+ cls::rbd::UserSnapshotNamespace(),
+ fromsnapname, ofs, len,
+ include_parent, whole_object,
+ cb, arg);
+ tracepoint(librbd, diff_iterate_exit, r);
+ return r;
+}
+
+extern "C" ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len, buf);
+
+ bufferlist bl;
+ bl.push_back(create_write_raw(ictx, buf, len, nullptr));
+ int r = librbd::api::Io<>::write(*ictx, ofs, len, std::move(bl), 0);
+ tracepoint(librbd, write_exit, r);
+ return r;
+}
+
+extern "C" ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf, int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, write2_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, ofs, len, buf, op_flags);
+
+ bufferlist bl;
+ bl.push_back(create_write_raw(ictx, buf, len, nullptr));
+ int r = librbd::api::Io<>::write(*ictx, ofs, len, std::move(bl), op_flags);
+ tracepoint(librbd, write_exit, r);
+ return r;
+}
+
+
+extern "C" int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, discard_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+ if (len > static_cast<uint64_t>(std::numeric_limits<int>::max())) {
+ tracepoint(librbd, discard_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ int r = librbd::api::Io<>::discard(
+ *ictx, ofs, len, ictx->discard_granularity_bytes);
+ tracepoint(librbd, discard_exit, r);
+ return r;
+}
+
+extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf, size_t data_len, int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, ofs, len, data_len == 0 ? NULL : buf, data_len, op_flags);
+
+ if (data_len == 0 || len % data_len ||
+ len > static_cast<uint64_t>(std::numeric_limits<int>::max())) {
+ tracepoint(librbd, writesame_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+ if (discard_zero && mem_is_zero(buf, data_len)) {
+ int r = librbd::api::Io<>::write_zeroes(*ictx, ofs, len, 0, op_flags);
+ tracepoint(librbd, writesame_exit, r);
+ return r;
+ }
+
+ bufferlist bl;
+ bl.push_back(create_write_raw(ictx, buf, data_len, nullptr));
+ int r = librbd::api::Io<>::write_same(
+ *ictx, ofs, len, std::move(bl), op_flags);
+ tracepoint(librbd, writesame_exit, r);
+ return r;
+}
+
+extern "C" ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, size_t len,
+ int zero_flags, int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Io<>::write_zeroes(*ictx, ofs, len, zero_flags, op_flags);
+}
+
+extern "C" ssize_t rbd_compare_and_write(rbd_image_t image,
+ uint64_t ofs, size_t len,
+ const char *cmp_buf,
+ const char *buf,
+ uint64_t *mismatch_off,
+ int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, ofs,
+ len, cmp_buf, buf, op_flags);
+
+ bufferlist cmp_bl;
+ cmp_bl.push_back(create_write_raw(ictx, cmp_buf, len, nullptr));
+ bufferlist bl;
+ bl.push_back(create_write_raw(ictx, buf, len, nullptr));
+
+ int r = librbd::api::Io<>::compare_and_write(
+ *ictx, ofs, len, std::move(cmp_bl), std::move(bl), mismatch_off, op_flags);
+ tracepoint(librbd, compare_and_write_exit, r);
+ return r;
+}
+
+extern "C" int rbd_aio_create_completion(void *cb_arg,
+ rbd_callback_t complete_cb,
+ rbd_completion_t *c)
+{
+ librbd::RBD::AioCompletion *rbd_comp =
+ new librbd::RBD::AioCompletion(cb_arg, complete_cb);
+ *c = (rbd_completion_t) rbd_comp;
+ return 0;
+}
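+
+/*
+ * Illustrative AIO lifecycle for the rbd_aio_* calls that follow (a
+ * sketch, not upstream documentation; every function shown is defined in
+ * this file):
+ *
+ *   rbd_completion_t c;
+ *   rbd_aio_create_completion(NULL, NULL, &c);
+ *   rbd_aio_write(image, off, len, buf, c);
+ *   rbd_aio_wait_for_complete(c);            // or poll, or a callback
+ *   ssize_t ret = rbd_aio_get_return_value(c);
+ *   rbd_aio_release(c);
+ */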
+
+extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, rbd_completion_t c)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, buf, comp->pc);
+
+ auto aio_completion = get_aio_completion(comp);
+ bufferlist bl;
+ bl.push_back(create_write_raw(ictx, buf, len, aio_completion));
+ librbd::api::Io<>::aio_write(
+ *ictx, aio_completion, off, len, std::move(bl), 0, true);
+ tracepoint(librbd, aio_write_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, rbd_completion_t c, int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, off, len, buf, comp->pc, op_flags);
+
+ auto aio_completion = get_aio_completion(comp);
+ bufferlist bl;
+ bl.push_back(create_write_raw(ictx, buf, len, aio_completion));
+ librbd::api::Io<>::aio_write(
+ *ictx, aio_completion, off, len, std::move(bl), op_flags, true);
+ tracepoint(librbd, aio_write_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_writev(rbd_image_t image, const struct iovec *iov,
+ int iovcnt, uint64_t off, rbd_completion_t c)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+ size_t len;
+ int r = get_iovec_length(iov, iovcnt, len);
+
+ tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, off, len, NULL,
+ comp->pc);
+
+ if (r == 0) {
+ auto aio_completion = get_aio_completion(comp);
+ auto bl = iovec_to_bufferlist(ictx, iov, iovcnt, aio_completion);
+ librbd::api::Io<>::aio_write(
+ *ictx, aio_completion, off, len, std::move(bl), 0, true);
+ }
+ tracepoint(librbd, aio_write_exit, r);
+ return r;
+}
+
+extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
+ char *buf, rbd_completion_t c)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, buf, comp->pc);
+ librbd::api::Io<>::aio_read(
+ *ictx, get_aio_completion(comp), off, len, librbd::io::ReadResult{buf, len},
+ 0, true);
+ tracepoint(librbd, aio_read_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
+ char *buf, rbd_completion_t c, int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, off, len, buf, comp->pc, op_flags);
+ librbd::api::Io<>::aio_read(
+ *ictx, get_aio_completion(comp), off, len, librbd::io::ReadResult{buf, len},
+ op_flags, true);
+ tracepoint(librbd, aio_read_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_readv(rbd_image_t image, const struct iovec *iov,
+ int iovcnt, uint64_t off, rbd_completion_t c)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+ size_t len;
+ int r = get_iovec_length(iov, iovcnt, len);
+
+ tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only, off, len, NULL,
+ comp->pc);
+ if (r == 0) {
+ librbd::io::ReadResult read_result;
+ if (iovcnt == 1) {
+ read_result = librbd::io::ReadResult(
+ static_cast<char *>(iov[0].iov_base), iov[0].iov_len);
+ } else {
+ read_result = librbd::io::ReadResult(iov, iovcnt);
+ }
+ librbd::api::Io<>::aio_read(
+ *ictx, get_aio_completion(comp), off, len, std::move(read_result), 0,
+ true);
+ }
+ tracepoint(librbd, aio_read_exit, r);
+ return r;
+}
+
+extern "C" int rbd_flush(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::api::Io<>::flush(*ictx);
+ tracepoint(librbd, flush_exit, r);
+ return r;
+}
+
+extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc);
+ librbd::api::Io<>::aio_flush(*ictx, get_aio_completion(comp), true);
+ tracepoint(librbd, aio_flush_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
+ rbd_completion_t c)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, comp->pc);
+ librbd::api::Io<>::aio_discard(
+ *ictx, get_aio_completion(comp), off, len,
+ ictx->discard_granularity_bytes, true);
+ tracepoint(librbd, aio_discard_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, size_t data_len, rbd_completion_t c,
+ int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, off, len, data_len == 0 ? NULL : buf, data_len, comp->pc,
+ op_flags);
+
+ if (data_len == 0 || len % data_len) {
+ tracepoint(librbd, aio_writesame_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+ if (discard_zero && mem_is_zero(buf, data_len)) {
+ librbd::api::Io<>::aio_write_zeroes(
+ *ictx, get_aio_completion(comp), off, len, 0, op_flags, true);
+ tracepoint(librbd, aio_writesame_exit, 0);
+ return 0;
+ }
+
+ auto aio_completion = get_aio_completion(comp);
+ bufferlist bl;
+ bl.push_back(create_write_raw(ictx, buf, data_len, aio_completion));
+ librbd::api::Io<>::aio_write_same(
+ *ictx, aio_completion, off, len, std::move(bl), op_flags, true);
+ tracepoint(librbd, aio_writesame_exit, 0);
+ return 0;
+}
+
+extern "C" int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, size_t len,
+ rbd_completion_t c, int zero_flags,
+ int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+ librbd::api::Io<>::aio_write_zeroes(*ictx, get_aio_completion(comp), off, len,
+ zero_flags, op_flags, true);
+ return 0;
+}
+
+extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off,
+ size_t len, const char *cmp_buf,
+ const char *buf, rbd_completion_t c,
+ uint64_t *mismatch_off,
+ int op_flags)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+ ictx->read_only, off, len, cmp_buf, buf, comp->pc, op_flags);
+
+ auto aio_completion = get_aio_completion(comp);
+ bufferlist cmp_bl;
+ cmp_bl.push_back(create_write_raw(ictx, cmp_buf, len, aio_completion));
+ bufferlist bl;
+ bl.push_back(create_write_raw(ictx, buf, len, aio_completion));
+ librbd::api::Io<>::aio_compare_and_write(
+ *ictx, aio_completion, off, len, std::move(cmp_bl), std::move(bl),
+ mismatch_off, op_flags, false);
+
+ tracepoint(librbd, aio_compare_and_write_exit, 0);
+ return 0;
+}
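+
+/* Sketch of consuming an async compare-and-write result (this assumes the
+ * documented librbd convention that a comparison mismatch fails the
+ * operation and reports the first differing byte through *mismatch_off,
+ * the caller-owned variable passed in above):
+ *
+ *   rbd_aio_wait_for_complete(c);
+ *   if (rbd_aio_get_return_value(c) < 0)
+ *     fprintf(stderr, "mismatch at offset +%" PRIu64 "\n", mismatch_off);
+ */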
+
+extern "C" int rbd_invalidate_cache(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, invalidate_cache_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ int r = librbd::invalidate_cache(ictx);
+ tracepoint(librbd, invalidate_cache_exit, r);
+ return r;
+}
+
+extern "C" int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::io::AioCompletion *cs[numcomp];
+ tracepoint(librbd, poll_io_events_enter, ictx, numcomp);
+ int r = librbd::poll_io_events(ictx, cs, numcomp);
+ tracepoint(librbd, poll_io_events_exit, r);
+ if (r > 0) {
+ for (int i = 0; i < r; ++i)
+ comps[i] = cs[i]->rbd_comp;
+ }
+ return r;
+}
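+
+/* Editorial note: rbd_poll_io_events() harvests up to numcomp completed
+ * AIO events; it is typically paired with rbd_set_image_notification()
+ * (declared in librbd.h, not part of this diff) so an eventfd can signal
+ * readiness before polling. */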
+
+extern "C" int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *vallen)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ string val_s;
+ tracepoint(librbd, metadata_get_enter, ictx, key);
+ int r = librbd::metadata_get(ictx, key, &val_s);
+ if (r < 0) {
+ tracepoint(librbd, metadata_get_exit, r, key, NULL);
+ return r;
+ }
+ if (*vallen < val_s.size() + 1) {
+ r = -ERANGE;
+ *vallen = val_s.size() + 1;
+ tracepoint(librbd, metadata_get_exit, r, key, NULL);
+ } else {
+ strncpy(value, val_s.c_str(), val_s.size() + 1);
+ tracepoint(librbd, metadata_get_exit, r, key, value);
+ }
+ return r;
+}
+
+extern "C" int rbd_metadata_set(rbd_image_t image, const char *key, const char *value)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, metadata_set_enter, ictx, key, value);
+ int r = ictx->operations->metadata_set(key, value);
+ tracepoint(librbd, metadata_set_exit, r);
+ return r;
+}
+
+extern "C" int rbd_metadata_remove(rbd_image_t image, const char *key)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, metadata_remove_enter, ictx, key);
+ int r = ictx->operations->metadata_remove(key);
+ tracepoint(librbd, metadata_remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+ char *key, size_t *key_len, char *value, size_t *val_len)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, metadata_list_enter, ictx);
+ map<string, bufferlist> pairs;
+ int r = librbd::metadata_list(ictx, start, max, &pairs);
+ size_t key_total_len = 0, val_total_len = 0;
+ bool too_short = false;
+ for (map<string, bufferlist>::iterator it = pairs.begin();
+ it != pairs.end(); ++it) {
+ key_total_len += it->first.size() + 1;
+ val_total_len += it->second.length() + 1;
+ }
+ if (*key_len < key_total_len || *val_len < val_total_len)
+ too_short = true;
+ *key_len = key_total_len;
+ *val_len = val_total_len;
+ if (too_short) {
+ tracepoint(librbd, metadata_list_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ char *key_p = key, *value_p = value;
+
+ for (map<string, bufferlist>::iterator it = pairs.begin();
+ it != pairs.end(); ++it) {
+ strncpy(key_p, it->first.c_str(), it->first.size() + 1);
+ key_p += it->first.size() + 1;
+ strncpy(value_p, it->second.c_str(), it->second.length());
+ value_p += it->second.length();
+ *value_p = '\0';
+ value_p++;
+ tracepoint(librbd, metadata_list_entry, it->first.c_str(), it->second.c_str());
+ }
+ tracepoint(librbd, metadata_list_exit, r);
+ return r;
+}
+
+extern "C" int rbd_mirror_image_enable(rbd_image_t image)
+{
+ return rbd_mirror_image_enable2(image, RBD_MIRROR_IMAGE_MODE_JOURNAL);
+}
+
+extern "C" int rbd_mirror_image_enable2(rbd_image_t image,
+ rbd_mirror_image_mode_t mode)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Mirror<>::image_enable(ictx, mode, false);
+}
+
+extern "C" int rbd_mirror_image_disable(rbd_image_t image, bool force)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Mirror<>::image_disable(ictx, force);
+}
+
+extern "C" int rbd_mirror_image_promote(rbd_image_t image, bool force)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Mirror<>::image_promote(ictx, force);
+}
+
+extern "C" int rbd_mirror_image_demote(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Mirror<>::image_demote(ictx);
+}
+
+extern "C" int rbd_mirror_image_resync(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Mirror<>::image_resync(ictx);
+}
+
+extern "C" int rbd_mirror_image_create_snapshot(rbd_image_t image,
+ uint64_t *snap_id)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ auto flags = librbd::util::get_default_snap_create_flags(ictx);
+ return librbd::api::Mirror<>::image_snapshot_create(ictx, flags, snap_id);
+}
+
+extern "C" int rbd_mirror_image_create_snapshot2(rbd_image_t image,
+ uint32_t flags,
+ uint64_t *snap_id)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::api::Mirror<>::image_snapshot_create(ictx, flags, snap_id);
+}
+
+extern "C" int rbd_mirror_image_get_info(rbd_image_t image,
+ rbd_mirror_image_info_t *mirror_image_info,
+ size_t info_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+ if (sizeof(rbd_mirror_image_info_t) != info_size) {
+ return -ERANGE;
+ }
+
+ librbd::mirror_image_info_t cpp_mirror_image;
+ int r = librbd::api::Mirror<>::image_get_info(ictx, &cpp_mirror_image);
+ if (r < 0) {
+ return r;
+ }
+
+ mirror_image_info_cpp_to_c(cpp_mirror_image, mirror_image_info);
+ return 0;
+}
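+
+/* The info_size/status_size arguments taken by the mirror getters above
+ * and below guard the C ABI: callers pass sizeof() of the struct they
+ * were compiled against, and any layout mismatch is rejected with -ERANGE
+ * rather than overrunning the caller's buffer. */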
+
+extern "C" void rbd_mirror_image_get_info_cleanup(
+ rbd_mirror_image_info_t *mirror_image_info)
+{
+ free(mirror_image_info->global_id);
+}
+
+extern "C" int rbd_mirror_image_get_mode(rbd_image_t image,
+ rbd_mirror_image_mode_t *mode)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+ return librbd::api::Mirror<>::image_get_mode(ictx, mode);
+}
+
+extern "C" int rbd_mirror_image_get_global_status(
+ rbd_image_t image, rbd_mirror_image_global_status_t *status,
+ size_t status_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+ if (sizeof(rbd_mirror_image_global_status_t) != status_size) {
+ return -ERANGE;
+ }
+
+ librbd::mirror_image_global_status_t cpp_status;
+ int r = librbd::api::Mirror<>::image_get_global_status(ictx, &cpp_status);
+ if (r < 0) {
+ return r;
+ }
+
+ mirror_image_global_status_cpp_to_c(cpp_status, status);
+ return 0;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+extern "C" int rbd_mirror_image_get_status(rbd_image_t image,
+ rbd_mirror_image_status_t *status,
+ size_t status_size)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+ if (sizeof(rbd_mirror_image_status_t) != status_size) {
+ return -ERANGE;
+ }
+
+ librbd::mirror_image_global_status_t cpp_status;
+ int r = librbd::api::Mirror<>::image_get_global_status(ictx, &cpp_status);
+ if (r < 0) {
+ return r;
+ }
+
+ mirror_image_global_status_cpp_to_c(cpp_status, status);
+ return 0;
+}
+
+#pragma GCC diagnostic pop
+
+extern "C" int rbd_mirror_image_get_instance_id(rbd_image_t image,
+ char *instance_id,
+ size_t *instance_id_max_length)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+ std::string cpp_instance_id;
+ int r = librbd::api::Mirror<>::image_get_instance_id(ictx, &cpp_instance_id);
+ if (r < 0) {
+ return r;
+ }
+
+ if (cpp_instance_id.size() >= *instance_id_max_length) {
+ *instance_id_max_length = cpp_instance_id.size() + 1;
+ return -ERANGE;
+ }
+
+ strcpy(instance_id, cpp_instance_id.c_str());
+ *instance_id_max_length = cpp_instance_id.size() + 1;
+ return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_promote(rbd_image_t image, bool force,
+ rbd_completion_t c) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ librbd::api::Mirror<>::image_promote(
+ ictx, force, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(comp)));
+ return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_demote(rbd_image_t image,
+ rbd_completion_t c) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ librbd::api::Mirror<>::image_demote(
+ ictx, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(comp)));
+ return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_get_info(rbd_image_t image,
+ rbd_mirror_image_info_t *info,
+ size_t info_size,
+ rbd_completion_t c) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+ if (sizeof(rbd_mirror_image_info_t) != info_size) {
+ return -ERANGE;
+ }
+
+ auto ctx = new C_MirrorImageGetInfo(
+ info, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(comp)));
+ librbd::api::Mirror<>::image_get_info(
+ ictx, &ctx->cpp_mirror_image_info, ctx);
+ return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_get_mode(rbd_image_t image,
+ rbd_mirror_image_mode_t *mode,
+ rbd_completion_t c) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+ librbd::api::Mirror<>::image_get_mode(
+ ictx, mode, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(comp)));
+ return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_get_global_status(
+ rbd_image_t image, rbd_mirror_image_global_status_t *status,
+ size_t status_size, rbd_completion_t c) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+ if (sizeof(rbd_mirror_image_global_status_t) != status_size) {
+ return -ERANGE;
+ }
+
+ auto ctx = new C_MirrorImageGetGlobalStatus(
+ status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(comp)));
+ librbd::api::Mirror<>::image_get_global_status(
+ ictx, &ctx->cpp_mirror_image_global_status, ctx);
+ return 0;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+extern "C" int rbd_aio_mirror_image_get_status(
+ rbd_image_t image, rbd_mirror_image_status_t *status, size_t status_size,
+ rbd_completion_t c) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+ if (sizeof(rbd_mirror_image_status_t) != status_size) {
+ return -ERANGE;
+ }
+
+ auto ctx = new C_MirrorImageGetStatus(
+ status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(comp)));
+ librbd::api::Mirror<>::image_get_global_status(
+ ictx, &ctx->cpp_mirror_image_global_status, ctx);
+ return 0;
+}
+
+#pragma GCC diagnostic pop
+
+extern "C" int rbd_aio_mirror_image_create_snapshot(rbd_image_t image,
+ uint32_t flags,
+ uint64_t *snap_id,
+ rbd_completion_t c) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+ librbd::api::Mirror<>::image_snapshot_create(
+ ictx, flags, snap_id, new C_AioCompletion(ictx,
+ librbd::io::AIO_TYPE_GENERIC,
+ get_aio_completion(comp)));
+ return 0;
+}
+
+extern "C" int rbd_update_watch(rbd_image_t image, uint64_t *handle,
+ rbd_update_callback_t watch_cb, void *arg)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ C_UpdateWatchCB *wctx = new C_UpdateWatchCB(watch_cb, arg);
+ tracepoint(librbd, update_watch_enter, ictx, wctx);
+  int r = ictx->state->register_update_watcher(wctx, &wctx->handle);
+  tracepoint(librbd, update_watch_exit, r, wctx->handle);
+  if (r < 0) {
+    delete wctx;
+    return r;
+  }
+  *handle = reinterpret_cast<uint64_t>(wctx);
+  return r;
+}
+
+extern "C" int rbd_update_unwatch(rbd_image_t image, uint64_t handle)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ C_UpdateWatchCB *wctx = reinterpret_cast<C_UpdateWatchCB *>(handle);
+ tracepoint(librbd, update_unwatch_enter, ictx, wctx->handle);
+ int r = ictx->state->unregister_update_watcher(wctx->handle);
+ delete wctx;
+ tracepoint(librbd, update_unwatch_exit, r);
+ return r;
+}
+
+extern "C" int rbd_aio_is_complete(rbd_completion_t c)
+{
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ return comp->is_complete();
+}
+
+extern "C" int rbd_aio_wait_for_complete(rbd_completion_t c)
+{
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ return comp->wait_for_complete();
+}
+
+extern "C" ssize_t rbd_aio_get_return_value(rbd_completion_t c)
+{
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ return comp->get_return_value();
+}
+
+extern "C" void *rbd_aio_get_arg(rbd_completion_t c)
+{
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ return comp->get_arg();
+}
+
+extern "C" void rbd_aio_release(rbd_completion_t c)
+{
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ comp->release();
+}
+
+extern "C" int rbd_group_create(rados_ioctx_t p, const char *name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_create_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), name);
+ int r = librbd::api::Group<>::create(io_ctx, name);
+ tracepoint(librbd, group_create_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_remove(rados_ioctx_t p, const char *name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_remove_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), name);
+ int r = librbd::api::Group<>::remove(io_ctx, name);
+ tracepoint(librbd, group_remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_list(rados_ioctx_t p, char *names, size_t *size)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_list_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id());
+
+ vector<string> cpp_names;
+ int r = librbd::api::Group<>::list(io_ctx, &cpp_names);
+
+ if (r < 0) {
+ tracepoint(librbd, group_list_exit, r);
+ return r;
+ }
+
+ size_t expected_size = 0;
+
+ for (size_t i = 0; i < cpp_names.size(); i++) {
+ expected_size += cpp_names[i].size() + 1;
+ }
+ if (*size < expected_size) {
+ *size = expected_size;
+ tracepoint(librbd, group_list_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ if (names == NULL) {
+ tracepoint(librbd, group_list_exit, -EINVAL);
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < (int)cpp_names.size(); i++) {
+ const char* name = cpp_names[i].c_str();
+ tracepoint(librbd, group_list_entry, name);
+ strcpy(names, name);
+ names += strlen(names) + 1;
+ }
+ tracepoint(librbd, group_list_exit, (int)expected_size);
+ return (int)expected_size;
+}
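+
+/*
+ * On success rbd_group_list() returns the number of bytes written to
+ * `names`, packed as consecutive NUL-terminated strings. Illustrative
+ * walk of the result:
+ *
+ *   for (char *p = names; p < names + r; p += strlen(p) + 1)
+ *     printf("group: %s\n", p);
+ */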
+
+extern "C" int rbd_group_rename(rados_ioctx_t p, const char *src_name,
+ const char *dest_name)
+{
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+ tracepoint(librbd, group_rename_enter, io_ctx.get_pool_name().c_str(),
+ io_ctx.get_id(), src_name, dest_name);
+ int r = librbd::api::Group<>::rename(io_ctx, src_name, dest_name);
+ tracepoint(librbd, group_rename_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_image_add(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_name)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx image_ioctx;
+
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+ librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_image_add_enter, group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, image_ioctx.get_pool_name().c_str(),
+ image_ioctx.get_id(), image_name);
+
+ int r = librbd::api::Group<>::image_add(group_ioctx, group_name, image_ioctx,
+ image_name);
+
+ tracepoint(librbd, group_image_add_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_image_remove(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_name)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx image_ioctx;
+
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+ librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_image_remove_enter, group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, image_ioctx.get_pool_name().c_str(),
+ image_ioctx.get_id(), image_name);
+
+ int r = librbd::api::Group<>::image_remove(group_ioctx, group_name,
+ image_ioctx, image_name);
+
+ tracepoint(librbd, group_image_remove_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_image_remove_by_id(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_id)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx image_ioctx;
+
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+ librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_image_remove_by_id_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name,
+ image_ioctx.get_pool_name().c_str(),
+ image_ioctx.get_id(), image_id);
+
+ int r = librbd::api::Group<>::image_remove_by_id(group_ioctx, group_name,
+ image_ioctx, image_id);
+
+ tracepoint(librbd, group_image_remove_by_id_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_image_list(rados_ioctx_t group_p,
+ const char *group_name,
+ rbd_group_image_info_t *images,
+ size_t group_image_info_size,
+ size_t *image_size)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_image_list_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name);
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(images, 0, sizeof(*images) * *image_size);
+
+ if (group_image_info_size != sizeof(rbd_group_image_info_t)) {
+ *image_size = 0;
+ tracepoint(librbd, group_image_list_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ std::vector<librbd::group_image_info_t> cpp_images;
+ int r = librbd::api::Group<>::image_list(group_ioctx, group_name,
+ &cpp_images);
+
+ if (r == -ENOENT) {
+ tracepoint(librbd, group_image_list_exit, 0);
+ *image_size = 0;
+ return 0;
+ }
+
+ if (r < 0) {
+ tracepoint(librbd, group_image_list_exit, r);
+ return r;
+ }
+
+ if (*image_size < cpp_images.size()) {
+ *image_size = cpp_images.size();
+ tracepoint(librbd, group_image_list_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ for (size_t i = 0; i < cpp_images.size(); ++i) {
+ group_image_status_cpp_to_c(cpp_images[i], &images[i]);
+ }
+
+ r = *image_size = cpp_images.size();
+ tracepoint(librbd, group_image_list_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_info_cleanup(rbd_group_info_t *group_info,
+ size_t group_info_size) {
+ if (group_info_size != sizeof(rbd_group_info_t)) {
+ return -ERANGE;
+ }
+
+ free(group_info->name);
+ return 0;
+}
+
+extern "C" int rbd_group_image_list_cleanup(rbd_group_image_info_t *images,
+ size_t group_image_info_size,
+ size_t len) {
+ if (group_image_info_size != sizeof(rbd_group_image_info_t)) {
+ return -ERANGE;
+ }
+
+ for (size_t i = 0; i < len; ++i) {
+ free(images[i].name);
+ }
+ return 0;
+}
+
+extern "C" int rbd_group_snap_create(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_create_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+
+ int r = librbd::api::Group<>::snap_create(group_ioctx, group_name,
+ snap_name, 0);
+ tracepoint(librbd, group_snap_create_exit, r);
+
+ return r;
+}
+
+extern "C" int rbd_group_snap_create2(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name,
+ uint32_t flags)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_create_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+
+ int r = librbd::api::Group<>::snap_create(group_ioctx, group_name, snap_name,
+ flags);
+ tracepoint(librbd, group_snap_create_exit, r);
+
+ return r;
+}
+
+extern "C" int rbd_group_snap_remove(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_remove_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+
+ int r = librbd::api::Group<>::snap_remove(group_ioctx, group_name, snap_name);
+
+ tracepoint(librbd, group_snap_remove_exit, r);
+
+ return r;
+}
+
+extern "C" int rbd_group_snap_rename(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *old_snap_name,
+ const char *new_snap_name)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_rename_enter,
+ group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(),
+ group_name, old_snap_name, new_snap_name);
+
+ int r = librbd::api::Group<>::snap_rename(group_ioctx, group_name,
+ old_snap_name, new_snap_name);
+
+  tracepoint(librbd, group_snap_rename_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_snap_list(rados_ioctx_t group_p,
+ const char *group_name,
+ rbd_group_snap_info_t *snaps,
+ size_t group_snap_info_size,
+ size_t *snaps_size)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_list_enter, group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name);
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(snaps, 0, sizeof(*snaps) * *snaps_size);
+
+ if (group_snap_info_size != sizeof(rbd_group_snap_info_t)) {
+ *snaps_size = 0;
+ tracepoint(librbd, group_snap_list_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ std::vector<librbd::group_snap_info_t> cpp_snaps;
+ int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &cpp_snaps);
+
+ if (r == -ENOENT) {
+ *snaps_size = 0;
+ tracepoint(librbd, group_snap_list_exit, 0);
+ return 0;
+ }
+
+ if (r < 0) {
+ tracepoint(librbd, group_snap_list_exit, r);
+ return r;
+ }
+
+ if (*snaps_size < cpp_snaps.size()) {
+ *snaps_size = cpp_snaps.size();
+ tracepoint(librbd, group_snap_list_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ for (size_t i = 0; i < cpp_snaps.size(); ++i) {
+ group_snap_info_cpp_to_c(cpp_snaps[i], &snaps[i]);
+ }
+
+ r = *snaps_size = cpp_snaps.size();
+ tracepoint(librbd, group_snap_list_exit, r);
+ return r;
+}
+
+extern "C" int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps,
+ size_t group_snap_info_size,
+ size_t len) {
+ if (group_snap_info_size != sizeof(rbd_group_snap_info_t)) {
+ return -ERANGE;
+ }
+
+ for (size_t i = 0; i < len; ++i) {
+ free(snaps[i].name);
+ }
+ return 0;
+}
+
+extern "C" int rbd_group_snap_rollback(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_rollback_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+
+ librbd::NoOpProgressContext prog_ctx;
+ int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name,
+ snap_name, prog_ctx);
+
+ tracepoint(librbd, group_snap_rollback_exit, r);
+
+ return r;
+}
+
+extern "C" int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name,
+ librbd_progress_fn_t cb,
+ void *cbdata)
+{
+ librados::IoCtx group_ioctx;
+ librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+
+ TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+ tracepoint(librbd, group_snap_rollback_enter,
+ group_ioctx.get_pool_name().c_str(),
+ group_ioctx.get_id(), group_name, snap_name);
+
+ librbd::CProgressContext prog_ctx(cb, cbdata);
+ int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name,
+ snap_name, prog_ctx);
+
+ tracepoint(librbd, group_snap_rollback_exit, r);
+
+ return r;
+}
+
+extern "C" int rbd_snap_get_namespace_type(rbd_image_t image,
+ uint64_t snap_id,
+ rbd_snap_namespace_type_t *namespace_type) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_get_namespace_type_enter, ictx, ictx->name.c_str());
+ int r = librbd::api::Snapshot<>::get_namespace_type(ictx, snap_id,
+ namespace_type);
+ tracepoint(librbd, snap_get_namespace_type_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_get_group_namespace(rbd_image_t image, uint64_t snap_id,
+ rbd_snap_group_namespace_t *group_snap,
+ size_t snap_group_namespace_size) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ tracepoint(librbd, snap_get_group_namespace_enter, ictx,
+ ictx->name.c_str());
+
+ if (snap_group_namespace_size != sizeof(rbd_snap_group_namespace_t)) {
+ tracepoint(librbd, snap_get_group_namespace_exit, -ERANGE);
+ return -ERANGE;
+ }
+
+ librbd::snap_group_namespace_t group_namespace;
+ int r = librbd::api::Snapshot<>::get_group_namespace(ictx, snap_id,
+ &group_namespace);
+ if (r >= 0) {
+ group_snap->group_pool = group_namespace.group_pool;
+ group_snap->group_name = strdup(group_namespace.group_name.c_str());
+ group_snap->group_snap_name =
+ strdup(group_namespace.group_snap_name.c_str());
+ }
+
+ tracepoint(librbd, snap_get_group_namespace_exit, r);
+ return r;
+}
+
+extern "C" int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap,
+ size_t snap_group_namespace_size) {
+ if (snap_group_namespace_size != sizeof(rbd_snap_group_namespace_t)) {
+ return -ERANGE;
+ }
+
+ free(group_snap->group_name);
+ free(group_snap->group_snap_name);
+ return 0;
+}
+
+extern "C" int rbd_snap_get_trash_namespace(rbd_image_t image, uint64_t snap_id,
+ char *original_name,
+ size_t max_length) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+ std::string cpp_original_name;
+ int r = librbd::api::Snapshot<>::get_trash_namespace(ictx, snap_id,
+ &cpp_original_name);
+ if (r < 0) {
+ return r;
+ }
+
+ if (cpp_original_name.length() >= max_length) {
+ return -ERANGE;
+ }
+
+ strcpy(original_name, cpp_original_name.c_str());
+ return 0;
+}
+
+extern "C" int rbd_snap_get_mirror_namespace(
+ rbd_image_t image, uint64_t snap_id,
+ rbd_snap_mirror_namespace_t *mirror_snap,
+ size_t mirror_snap_size) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+ if (mirror_snap_size != sizeof(rbd_snap_mirror_namespace_t)) {
+ return -ERANGE;
+ }
+
+ librbd::snap_mirror_namespace_t mirror_namespace;
+ int r = librbd::api::Snapshot<>::get_mirror_namespace(
+ ictx, snap_id, &mirror_namespace);
+ if (r < 0) {
+ return r;
+ }
+
+ mirror_snap->state = mirror_namespace.state;
+ mirror_snap->primary_mirror_uuid =
+ strdup(mirror_namespace.primary_mirror_uuid.c_str());
+ mirror_snap->primary_snap_id = mirror_namespace.primary_snap_id;
+ mirror_snap->mirror_peer_uuids_count =
+ mirror_namespace.mirror_peer_uuids.size();
+ size_t len = 0;
+ for (auto &peer : mirror_namespace.mirror_peer_uuids) {
+ len += peer.size() + 1;
+ }
+ mirror_snap->mirror_peer_uuids = (char *)malloc(len);
+ char *p = mirror_snap->mirror_peer_uuids;
+ for (auto &peer : mirror_namespace.mirror_peer_uuids) {
+ strncpy(p, peer.c_str(), peer.size() + 1);
+ p += peer.size() + 1;
+ }
+ mirror_snap->complete = mirror_namespace.complete;
+ mirror_snap->last_copied_object_number =
+ mirror_namespace.last_copied_object_number;
+
+ return 0;
+}
+
+extern "C" int rbd_snap_mirror_namespace_cleanup(
+ rbd_snap_mirror_namespace_t *mirror_snap,
+ size_t mirror_snap_size) {
+ if (mirror_snap_size != sizeof(rbd_snap_mirror_namespace_t)) {
+ return -ERANGE;
+ }
+
+ free(mirror_snap->primary_mirror_uuid);
+ free(mirror_snap->mirror_peer_uuids);
+ return 0;
+}
+
+extern "C" int rbd_watchers_list(rbd_image_t image,
+ rbd_image_watcher_t *watchers,
+ size_t *max_watchers) {
+ std::list<librbd::image_watcher_t> watcher_list;
+ librbd::ImageCtx *ictx = (librbd::ImageCtx*)image;
+
+ tracepoint(librbd, list_watchers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(watchers, 0, sizeof(*watchers) * *max_watchers);
+ int r = librbd::list_watchers(ictx, watcher_list);
+ if (r < 0) {
+ tracepoint(librbd, list_watchers_exit, r, 0);
+ return r;
+ }
+
+ if (watcher_list.size() > *max_watchers) {
+ *max_watchers = watcher_list.size();
+ tracepoint(librbd, list_watchers_exit, -ERANGE, watcher_list.size());
+ return -ERANGE;
+ }
+
+ *max_watchers = 0;
+ for (auto &watcher : watcher_list) {
+ tracepoint(librbd, list_watchers_entry, watcher.addr.c_str(), watcher.id, watcher.cookie);
+ watchers[*max_watchers].addr = strdup(watcher.addr.c_str());
+ watchers[*max_watchers].id = watcher.id;
+ watchers[*max_watchers].cookie = watcher.cookie;
+ *max_watchers += 1;
+ }
+
+ tracepoint(librbd, list_watchers_exit, r, watcher_list.size());
+ return 0;
+}
+
+extern "C" void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers,
+ size_t num_watchers) {
+ for (size_t i = 0; i < num_watchers; ++i) {
+ free(watchers[i].addr);
+ }
+}
+
+extern "C" int rbd_config_image_list(rbd_image_t image,
+ rbd_config_option_t *options,
+ int *max_options) {
+ librbd::ImageCtx *ictx = (librbd::ImageCtx*)image;
+
+ std::vector<librbd::config_option_t> option_vector;
+ int r = librbd::api::Config<>::list(ictx, &option_vector);
+ if (r < 0) {
+ return r;
+ }
+
+ if (*max_options < static_cast<int>(option_vector.size())) {
+ *max_options = static_cast<int>(option_vector.size());
+ return -ERANGE;
+ }
+
+ for (int i = 0; i < static_cast<int>(option_vector.size()); ++i) {
+ config_option_cpp_to_c(option_vector[i], &options[i]);
+ }
+ *max_options = static_cast<int>(option_vector.size());
+ return 0;
+}
+
+extern "C" void rbd_config_image_list_cleanup(rbd_config_option_t *options,
+ int max_options) {
+ for (int i = 0; i < max_options; ++i) {
+ config_option_cleanup(options[i]);
+ }
+}
+
+extern "C" int rbd_quiesce_watch(rbd_image_t image,
+ rbd_update_callback_t quiesce_cb,
+ rbd_update_callback_t unquiesce_cb,
+ void *arg, uint64_t *handle)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ auto wctx = new C_QuiesceWatchCB(quiesce_cb, unquiesce_cb, arg);
+ int r = ictx->state->register_quiesce_watcher(wctx, &wctx->handle);
+ if (r < 0) {
+ delete wctx;
+ return r;
+ }
+ *handle = reinterpret_cast<uint64_t>(wctx);
+ return 0;
+}
+
+extern "C" int rbd_quiesce_unwatch(rbd_image_t image, uint64_t handle)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ auto *wctx = reinterpret_cast<C_QuiesceWatchCB *>(handle);
+ int r = ictx->state->unregister_quiesce_watcher(wctx->handle);
+ delete wctx;
+ return r;
+}
+
+extern "C" void rbd_quiesce_complete(rbd_image_t image, uint64_t handle, int r)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ ictx->state->quiesce_complete(handle, r);
+}
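+
+/* Editorial summary of the quiesce hooks above: quiesce_cb fires when a
+ * snapshot request quiesces the image, and the watcher is expected to
+ * acknowledge by calling rbd_quiesce_complete() with the handle returned
+ * by rbd_quiesce_watch(); unquiesce_cb fires once the request finishes. */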
diff --git a/src/librbd/managed_lock/AcquireRequest.cc b/src/librbd/managed_lock/AcquireRequest.cc
new file mode 100644
index 000000000..79be0f25a
--- /dev/null
+++ b/src/librbd/managed_lock/AcquireRequest.cc
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/AcquireRequest.h"
+#include "librbd/Watcher.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/managed_lock/BreakRequest.h"
+#include "librbd/managed_lock/GetLockerRequest.h"
+#include "librbd/managed_lock/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::AcquireRequest: " << this \
+ << " " << __func__ << ": "
+
+using std::string;
+
+namespace librbd {
+
+using librbd::util::detail::C_AsyncCallback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace managed_lock {
+
+template <typename I>
+AcquireRequest<I>* AcquireRequest<I>::create(librados::IoCtx& ioctx,
+ Watcher *watcher,
+ AsioEngine& asio_engine,
+ const string& oid,
+ const string& cookie,
+ bool exclusive,
+ bool blocklist_on_break_lock,
+ uint32_t blocklist_expire_seconds,
+ Context *on_finish) {
+ return new AcquireRequest(ioctx, watcher, asio_engine, oid, cookie,
+ exclusive, blocklist_on_break_lock,
+ blocklist_expire_seconds, on_finish);
+}
+
+template <typename I>
+AcquireRequest<I>::AcquireRequest(librados::IoCtx& ioctx, Watcher *watcher,
+ AsioEngine& asio_engine,
+ const string& oid,
+ const string& cookie, bool exclusive,
+ bool blocklist_on_break_lock,
+ uint32_t blocklist_expire_seconds,
+ Context *on_finish)
+ : m_ioctx(ioctx), m_watcher(watcher),
+ m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())),
+ m_asio_engine(asio_engine), m_oid(oid), m_cookie(cookie),
+ m_exclusive(exclusive),
+ m_blocklist_on_break_lock(blocklist_on_break_lock),
+ m_blocklist_expire_seconds(blocklist_expire_seconds),
+ m_on_finish(new C_AsyncCallback<asio::ContextWQ>(
+ asio_engine.get_work_queue(), on_finish)) {
+}
+
+template <typename I>
+AcquireRequest<I>::~AcquireRequest() {
+}
+
+template <typename I>
+void AcquireRequest<I>::send() {
+ send_get_locker();
+}
+
+template <typename I>
+void AcquireRequest<I>::send_get_locker() {
+ ldout(m_cct, 10) << dendl;
+
+ Context *ctx = create_context_callback<
+ AcquireRequest<I>, &AcquireRequest<I>::handle_get_locker>(this);
+ auto req = GetLockerRequest<I>::create(m_ioctx, m_oid, m_exclusive,
+ &m_locker, ctx);
+ req->send();
+}
+
+template <typename I>
+void AcquireRequest<I>::handle_get_locker(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ ldout(m_cct, 20) << "no lockers detected" << dendl;
+ m_locker = {};
+ } else if (r == -EBUSY) {
+ ldout(m_cct, 5) << "incompatible lock detected" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_lock();
+}
+
+template <typename I>
+void AcquireRequest<I>::send_lock() {
+ ldout(m_cct, 10) << "entity=client." << m_ioctx.get_instance_id() << ", "
+ << "cookie=" << m_cookie << dendl;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::lock(&op, RBD_LOCK_NAME,
+ m_exclusive ? ClsLockType::EXCLUSIVE : ClsLockType::SHARED, m_cookie,
+ util::get_watcher_lock_tag(), "", utime_t(), 0);
+
+ using klass = AcquireRequest;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_lock>(this);
+ int r = m_ioctx.aio_operate(m_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void AcquireRequest<I>::handle_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ finish(0);
+ return;
+ } else if (r == -EBUSY && m_locker.cookie.empty()) {
+ ldout(m_cct, 5) << "already locked, refreshing locker" << dendl;
+ send_get_locker();
+ return;
+ } else if (r != -EBUSY) {
+ lderr(m_cct) << "failed to lock: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_break_lock();
+}
+
+template <typename I>
+void AcquireRequest<I>::send_break_lock() {
+ ldout(m_cct, 10) << dendl;
+
+ Context *ctx = create_context_callback<
+ AcquireRequest<I>, &AcquireRequest<I>::handle_break_lock>(this);
+ auto req = BreakRequest<I>::create(
+ m_ioctx, m_asio_engine, m_oid, m_locker, m_exclusive,
+ m_blocklist_on_break_lock, m_blocklist_expire_seconds, false, ctx);
+ req->send();
+}
+
+template <typename I>
+void AcquireRequest<I>::handle_break_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -EAGAIN) {
+ ldout(m_cct, 5) << "lock owner is still alive" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+    lderr(m_cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_locker = {};
+ send_lock();
+}
+
+template <typename I>
+void AcquireRequest<I>::finish(int r) {
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::AcquireRequest<librbd::ImageCtx>;
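+
+/* Editorial note: AcquireRequest follows librbd's one-shot state-machine
+ * idiom; callers allocate it via create(), invoke send(), and the request
+ * completes on_finish and deletes itself in finish(). */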
diff --git a/src/librbd/managed_lock/AcquireRequest.h b/src/librbd/managed_lock/AcquireRequest.h
new file mode 100644
index 000000000..19424a422
--- /dev/null
+++ b/src/librbd/managed_lock/AcquireRequest.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "msg/msg_types.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/watcher/Types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class AsioEngine;
+class Watcher;
+
+namespace managed_lock {
+
+template <typename ImageCtxT>
+class AcquireRequest {
+private:
+ typedef watcher::Traits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Watcher Watcher;
+
+public:
+ static AcquireRequest* create(librados::IoCtx& ioctx, Watcher *watcher,
+ AsioEngine& asio_engine,
+ const std::string& oid,
+ const std::string& cookie,
+ bool exclusive,
+ bool blocklist_on_break_lock,
+ uint32_t blocklist_expire_seconds,
+ Context *on_finish);
+
+ ~AcquireRequest();
+ void send();
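+
+  // Illustrative usage (a sketch; the argument values here are assumptions):
+  // the request is heap-allocated, completes on_finish through the engine's
+  // work queue, and deletes itself afterwards.
+  //
+  //   auto req = AcquireRequest<ImageCtx>::create(
+  //     ioctx, watcher, asio_engine, oid, cookie, true /* exclusive */,
+  //     true /* blocklist_on_break_lock */, 30 /* expire secs */, on_finish);
+  //   req->send();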
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_LOCKER
+ * | ^
+ * | . (EBUSY && no cached locker)
+ * | .
+ * | . (EBUSY && cached locker)
+ * \--> LOCK_IMAGE * * * * * * * * > BREAK_LOCK . . . . .
+ * | ^ | .
+ * | | | (success) .
+ * | \-------------------------/ .
+ * v .
+ * <finish> < . . . . . . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ AcquireRequest(librados::IoCtx& ioctx, Watcher *watcher,
+ AsioEngine& asio_engine, const std::string& oid,
+ const std::string& cookie, bool exclusive,
+ bool blocklist_on_break_lock,
+ uint32_t blocklist_expire_seconds, Context *on_finish);
+
+ librados::IoCtx& m_ioctx;
+ Watcher *m_watcher;
+ CephContext *m_cct;
+ AsioEngine& m_asio_engine;
+ std::string m_oid;
+ std::string m_cookie;
+ bool m_exclusive;
+ bool m_blocklist_on_break_lock;
+ uint32_t m_blocklist_expire_seconds;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ Locker m_locker;
+
+ void send_get_locker();
+ void handle_get_locker(int r);
+
+ void send_lock();
+ void handle_lock(int r);
+
+ void send_break_lock();
+ void handle_break_lock(int r);
+
+ void finish(int r);
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H
diff --git a/src/librbd/managed_lock/BreakRequest.cc b/src/librbd/managed_lock/BreakRequest.cc
new file mode 100644
index 000000000..e482d221e
--- /dev/null
+++ b/src/librbd/managed_lock/BreakRequest.cc
@@ -0,0 +1,249 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/BreakRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/neorados/RADOS.hpp"
+#include "include/stringify.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/managed_lock/GetLockerRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::BreakRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace managed_lock {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+BreakRequest<I>::BreakRequest(librados::IoCtx& ioctx,
+ AsioEngine& asio_engine,
+ const std::string& oid, const Locker &locker,
+ bool exclusive, bool blocklist_locker,
+ uint32_t blocklist_expire_seconds,
+ bool force_break_lock, Context *on_finish)
+ : m_ioctx(ioctx), m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())),
+ m_asio_engine(asio_engine), m_oid(oid), m_locker(locker),
+ m_exclusive(exclusive), m_blocklist_locker(blocklist_locker),
+ m_blocklist_expire_seconds(blocklist_expire_seconds),
+ m_force_break_lock(force_break_lock), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void BreakRequest<I>::send() {
+ send_get_watchers();
+}
+
+template <typename I>
+void BreakRequest<I>::send_get_watchers() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectReadOperation op;
+ op.list_watchers(&m_watchers, &m_watchers_ret_val);
+
+ using klass = BreakRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_get_watchers>(this);
+ m_out_bl.clear();
+ int r = m_ioctx.aio_operate(m_oid, rados_completion, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void BreakRequest<I>::handle_get_watchers(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ r = m_watchers_ret_val;
+ }
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve watchers: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ bool found_alive_locker = false;
+ for (auto &watcher : m_watchers) {
+ ldout(m_cct, 20) << "watcher=["
+ << "addr=" << watcher.addr << ", "
+ << "entity=client." << watcher.watcher_id << "]" << dendl;
+
+ if ((strncmp(m_locker.address.c_str(),
+ watcher.addr, sizeof(watcher.addr)) == 0) &&
+ (m_locker.handle == watcher.cookie)) {
+ ldout(m_cct, 10) << "lock owner is still alive" << dendl;
+ found_alive_locker = true;
+ }
+ }
+
+ if (!m_force_break_lock && found_alive_locker) {
+ finish(-EAGAIN);
+ return;
+ }
+
+ send_get_locker();
+}
+
+template <typename I>
+void BreakRequest<I>::send_get_locker() {
+ ldout(m_cct, 10) << dendl;
+
+ using klass = BreakRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_get_locker>(
+ this);
+ auto req = GetLockerRequest<I>::create(m_ioctx, m_oid, m_exclusive,
+ &m_refreshed_locker, ctx);
+ req->send();
+}
+
+template <typename I>
+void BreakRequest<I>::handle_get_locker(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ ldout(m_cct, 5) << "no lock owner" << dendl;
+ finish(0);
+ return;
+ } else if (r < 0 && r != -EBUSY) {
+ lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ m_refreshed_locker = {};
+ }
+
+ if (m_refreshed_locker != m_locker || m_refreshed_locker == Locker{}) {
+ ldout(m_cct, 5) << "no longer lock owner" << dendl;
+ finish(-EAGAIN);
+ return;
+ }
+
+ send_blocklist();
+}
+
+template <typename I>
+void BreakRequest<I>::send_blocklist() {
+ if (!m_blocklist_locker) {
+ send_break_lock();
+ return;
+ }
+
+ entity_name_t entity_name = entity_name_t::CLIENT(m_ioctx.get_instance_id());
+ ldout(m_cct, 10) << "local entity=" << entity_name << ", "
+ << "locker entity=" << m_locker.entity << dendl;
+
+ if (m_locker.entity == entity_name) {
+ lderr(m_cct) << "attempting to self-blocklist" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ entity_addr_t locker_addr;
+ if (!locker_addr.parse(m_locker.address.c_str(), 0)) {
+ lderr(m_cct) << "unable to parse locker address: " << m_locker.address
+ << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ std::optional<std::chrono::seconds> expire;
+ if (m_blocklist_expire_seconds != 0) {
+ expire = std::chrono::seconds(m_blocklist_expire_seconds);
+ }
+ m_asio_engine.get_rados_api().blocklist_add(
+ m_locker.address, expire,
+ librbd::asio::util::get_callback_adapter(
+ [this](int r) { handle_blocklist(r); }));
+}
+
+template <typename I>
+void BreakRequest<I>::handle_blocklist(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to blocklist lock owner: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ wait_for_osd_map();
+}
+
+template <typename I>
+void BreakRequest<I>::wait_for_osd_map() {
+ ldout(m_cct, 10) << dendl;
+
+ m_asio_engine.get_rados_api().wait_for_latest_osd_map(
+ librbd::asio::util::get_callback_adapter(
+ [this](int r) { handle_wait_for_osd_map(r); }));
+}
+
+template <typename I>
+void BreakRequest<I>::handle_wait_for_osd_map(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to wait for updated OSD map: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ send_break_lock();
+}
+
+template <typename I>
+void BreakRequest<I>::send_break_lock() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::break_lock(&op, RBD_LOCK_NAME, m_locker.cookie,
+ m_locker.entity);
+
+ using klass = BreakRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_break_lock>(this);
+ int r = m_ioctx.aio_operate(m_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void BreakRequest<I>::handle_break_lock(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void BreakRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::BreakRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/BreakRequest.h b/src/librbd/managed_lock/BreakRequest.h
new file mode 100644
index 000000000..dd46bbcc5
--- /dev/null
+++ b/src/librbd/managed_lock/BreakRequest.h
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "msg/msg_types.h"
+#include <list>
+#include <string>
+#include <boost/optional.hpp>
+#include "librbd/managed_lock/Types.h"
+
+class Context;
+class ContextWQ;
+class obj_watch_t;
+
+namespace librbd {
+
+class AsioEngine;
+class ImageCtx;
+template <typename> class Journal;
+namespace asio { struct ContextWQ; }
+
+namespace managed_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class BreakRequest {
+public:
+ static BreakRequest* create(librados::IoCtx& ioctx,
+ AsioEngine& asio_engine,
+ const std::string& oid, const Locker &locker,
+ bool exclusive, bool blocklist_locker,
+ uint32_t blocklist_expire_seconds,
+ bool force_break_lock, Context *on_finish) {
+ return new BreakRequest(ioctx, asio_engine, oid, locker, exclusive,
+ blocklist_locker, blocklist_expire_seconds,
+ force_break_lock, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_WATCHERS
+ * |
+ * v
+ * GET_LOCKER
+ * |
+ * v
+ * BLOCKLIST (skip if disabled)
+ * |
+ * v
+ * WAIT_FOR_OSD_MAP
+ * |
+ * v
+ * BREAK_LOCK
+ * |
+ * v
+ * <finish>
+ *
+   * @endverbatim
+ */
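+
+  /**
+   * Behavioural notes (see BreakRequest.cc): when force_break_lock is
+   * false and the object's watcher list still contains the locker's
+   * address and cookie, the request finishes with -EAGAIN instead of
+   * breaking the lock. -EAGAIN is also returned when the locker changes
+   * between GET_WATCHERS and GET_LOCKER.
+   */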
+
+ librados::IoCtx &m_ioctx;
+ CephContext *m_cct;
+ AsioEngine& m_asio_engine;
+ std::string m_oid;
+ Locker m_locker;
+ bool m_exclusive;
+ bool m_blocklist_locker;
+ uint32_t m_blocklist_expire_seconds;
+ bool m_force_break_lock;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ std::list<obj_watch_t> m_watchers;
+  int m_watchers_ret_val = 0;
+
+ Locker m_refreshed_locker;
+
+ BreakRequest(librados::IoCtx& ioctx, AsioEngine& asio_engine,
+ const std::string& oid, const Locker &locker,
+ bool exclusive, bool blocklist_locker,
+ uint32_t blocklist_expire_seconds, bool force_break_lock,
+ Context *on_finish);
+
+ void send_get_watchers();
+ void handle_get_watchers(int r);
+
+ void send_get_locker();
+ void handle_get_locker(int r);
+
+ void send_blocklist();
+ void handle_blocklist(int r);
+
+ void wait_for_osd_map();
+ void handle_wait_for_osd_map(int r);
+
+ void send_break_lock();
+ void handle_break_lock(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+extern template class librbd::managed_lock::BreakRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
diff --git a/src/librbd/managed_lock/GetLockerRequest.cc b/src/librbd/managed_lock/GetLockerRequest.cc
new file mode 100644
index 000000000..6b2c27342
--- /dev/null
+++ b/src/librbd/managed_lock/GetLockerRequest.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/GetLockerRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/managed_lock/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::GetLockerRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace managed_lock {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+GetLockerRequest<I>::GetLockerRequest(librados::IoCtx& ioctx,
+ const std::string& oid, bool exclusive,
+ Locker *locker, Context *on_finish)
+ : m_ioctx(ioctx), m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())),
+ m_oid(oid), m_exclusive(exclusive), m_locker(locker),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void GetLockerRequest<I>::send() {
+ send_get_lockers();
+}
+
+template <typename I>
+void GetLockerRequest<I>::send_get_lockers() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectReadOperation op;
+ rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME);
+
+ using klass = GetLockerRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_get_lockers>(this);
+ m_out_bl.clear();
+ int r = m_ioctx.aio_operate(m_oid, rados_completion, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void GetLockerRequest<I>::handle_get_lockers(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> lockers;
+ ClsLockType lock_type = ClsLockType::NONE;
+ std::string lock_tag;
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = rados::cls::lock::get_lock_info_finish(&it, &lockers, &lock_type,
+ &lock_tag);
+ }
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (lockers.empty()) {
+ ldout(m_cct, 20) << "no lockers detected" << dendl;
+ finish(-ENOENT);
+ return;
+ }
+
+ if (lock_tag != util::get_watcher_lock_tag()) {
+ ldout(m_cct, 5) <<"locked by external mechanism: tag=" << lock_tag << dendl;
+ finish(-EBUSY);
+ return;
+ }
+
+ if (m_exclusive && lock_type == ClsLockType::SHARED) {
+ ldout(m_cct, 5) << "incompatible shared lock type detected" << dendl;
+ finish(-EBUSY);
+ return;
+ } else if (!m_exclusive && lock_type == ClsLockType::EXCLUSIVE) {
+ ldout(m_cct, 5) << "incompatible exclusive lock type detected" << dendl;
+ finish(-EBUSY);
+ return;
+ }
+
+ std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t>::iterator iter = lockers.begin();
+ if (!util::decode_lock_cookie(iter->first.cookie, &m_locker->handle)) {
+ ldout(m_cct, 5) << "locked by external mechanism: "
+ << "cookie=" << iter->first.cookie << dendl;
+ finish(-EBUSY);
+ return;
+ }
+
+ m_locker->entity = iter->first.locker;
+ m_locker->cookie = iter->first.cookie;
+ m_locker->address = iter->second.addr.get_legacy_str();
+ if (m_locker->cookie.empty() || m_locker->address.empty()) {
+ ldout(m_cct, 20) << "no valid lockers detected" << dendl;
+ finish(-ENOENT);
+ return;
+ }
+
+ ldout(m_cct, 10) << "retrieved exclusive locker: "
+ << m_locker->entity << "@" << m_locker->address << dendl;
+ finish(0);
+}
+
+template <typename I>
+void GetLockerRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::GetLockerRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/GetLockerRequest.h b/src/librbd/managed_lock/GetLockerRequest.h
new file mode 100644
index 000000000..b8fd08f6e
--- /dev/null
+++ b/src/librbd/managed_lock/GetLockerRequest.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace managed_lock {
+
+struct Locker;
+
+template <typename ImageCtxT = ImageCtx>
+class GetLockerRequest {
+public:
+ static GetLockerRequest* create(librados::IoCtx& ioctx,
+ const std::string& oid, bool exclusive,
+ Locker *locker, Context *on_finish) {
+ return new GetLockerRequest(ioctx, oid, exclusive, locker, on_finish);
+ }
+
+ void send();
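+
+  // Illustrative usage (sketch only): on success *locker is populated;
+  // on_finish is completed with -ENOENT when no lock is held and -EBUSY
+  // when the lock is incompatible or externally owned.
+  //
+  //   Locker locker;
+  //   GetLockerRequest<ImageCtx>::create(ioctx, oid, true /* exclusive */,
+  //                                      &locker, on_finish)->send();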
+
+private:
+ librados::IoCtx &m_ioctx;
+ CephContext *m_cct;
+ std::string m_oid;
+ bool m_exclusive;
+ Locker *m_locker;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ GetLockerRequest(librados::IoCtx& ioctx, const std::string& oid,
+ bool exclusive, Locker *locker, Context *on_finish);
+
+ void send_get_lockers();
+ void handle_get_lockers(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+extern template class librbd::managed_lock::GetLockerRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H
diff --git a/src/librbd/managed_lock/ReacquireRequest.cc b/src/librbd/managed_lock/ReacquireRequest.cc
new file mode 100644
index 000000000..9eaa51569
--- /dev/null
+++ b/src/librbd/managed_lock/ReacquireRequest.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/ReacquireRequest.h"
+#include "librbd/Watcher.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/managed_lock/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::ReacquireRequest: " \
+ << this << ": " << __func__
+
+using std::string;
+
+namespace librbd {
+namespace managed_lock {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+ReacquireRequest<I>::ReacquireRequest(librados::IoCtx& ioctx,
+ const string& oid,
+ const string& old_cookie,
+ const string &new_cookie,
+ bool exclusive,
+ Context *on_finish)
+ : m_ioctx(ioctx), m_oid(oid), m_old_cookie(old_cookie),
+ m_new_cookie(new_cookie), m_exclusive(exclusive), m_on_finish(on_finish) {
+}
+
+
+template <typename I>
+void ReacquireRequest<I>::send() {
+ set_cookie();
+}
+
+template <typename I>
+void ReacquireRequest<I>::set_cookie() {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::set_cookie(&op, RBD_LOCK_NAME,
+ m_exclusive ? ClsLockType::EXCLUSIVE : ClsLockType::SHARED,
+ m_old_cookie, util::get_watcher_lock_tag(),
+ m_new_cookie);
+
+ librados::AioCompletion *rados_completion = create_rados_callback<
+ ReacquireRequest, &ReacquireRequest::handle_set_cookie>(this);
+ int r = m_ioctx.aio_operate(m_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void ReacquireRequest<I>::handle_set_cookie(int r) {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << ": r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ ldout(cct, 10) << ": OSD doesn't support updating lock" << dendl;
+ } else if (r < 0) {
+ lderr(cct) << ": failed to update lock: " << cpp_strerror(r) << dendl;
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::ReacquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/ReacquireRequest.h b/src/librbd/managed_lock/ReacquireRequest.h
new file mode 100644
index 000000000..3f2b7d7e2
--- /dev/null
+++ b/src/librbd/managed_lock/ReacquireRequest.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/int_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class Watcher;
+
+namespace managed_lock {
+
+template <typename ImageCtxT>
+class ReacquireRequest {
+public:
+
+ static ReacquireRequest *create(librados::IoCtx& ioctx,
+ const std::string& oid,
+ const std::string& old_cookie,
+ const std::string &new_cookie,
+ bool exclusive,
+ Context *on_finish) {
+ return new ReacquireRequest(ioctx, oid, old_cookie, new_cookie, exclusive,
+ on_finish);
+ }
+
+ ReacquireRequest(librados::IoCtx& ioctx, const std::string& oid,
+ const std::string& old_cookie,
+ const std::string &new_cookie, bool exclusive,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * SET_COOKIE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
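+
+  // Note (see ReacquireRequest.cc): SET_COOKIE issues a single cls_lock
+  // set_cookie op; -EOPNOTSUPP from an OSD that cannot rewrite lock
+  // cookies is logged and propagated unchanged to on_finish.
+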
+ librados::IoCtx& m_ioctx;
+ std::string m_oid;
+ std::string m_old_cookie;
+ std::string m_new_cookie;
+ bool m_exclusive;
+ Context *m_on_finish;
+
+ void set_cookie();
+ void handle_set_cookie(int r);
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H
diff --git a/src/librbd/managed_lock/ReleaseRequest.cc b/src/librbd/managed_lock/ReleaseRequest.cc
new file mode 100644
index 000000000..598ececab
--- /dev/null
+++ b/src/librbd/managed_lock/ReleaseRequest.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/ReleaseRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/Watcher.h"
+#include "librbd/asio/ContextWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::ReleaseRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace managed_lock {
+
+using util::detail::C_AsyncCallback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+ReleaseRequest<I>* ReleaseRequest<I>::create(librados::IoCtx& ioctx,
+ Watcher *watcher,
+ asio::ContextWQ *work_queue,
+ const string& oid,
+ const string& cookie,
+ Context *on_finish) {
+ return new ReleaseRequest(ioctx, watcher, work_queue, oid, cookie,
+ on_finish);
+}
+
+template <typename I>
+ReleaseRequest<I>::ReleaseRequest(librados::IoCtx& ioctx, Watcher *watcher,
+ asio::ContextWQ *work_queue,
+ const string& oid, const string& cookie,
+ Context *on_finish)
+ : m_ioctx(ioctx), m_watcher(watcher), m_oid(oid), m_cookie(cookie),
+ m_on_finish(new C_AsyncCallback<asio::ContextWQ>(work_queue, on_finish)) {
+}
+
+template <typename I>
+ReleaseRequest<I>::~ReleaseRequest() {
+}
+
+
+template <typename I>
+void ReleaseRequest<I>::send() {
+ send_unlock();
+}
+
+template <typename I>
+void ReleaseRequest<I>::send_unlock() {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << "entity=client." << m_ioctx.get_instance_id() << ", "
+ << "cookie=" << m_cookie << dendl;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::unlock(&op, RBD_LOCK_NAME, m_cookie);
+
+ using klass = ReleaseRequest;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_unlock>(this);
+ int r = m_ioctx.aio_operate(m_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void ReleaseRequest<I>::handle_unlock(int r) {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to unlock: " << cpp_strerror(r) << dendl;
+ }
+
+ finish();
+}
+
+template <typename I>
+void ReleaseRequest<I>::finish() {
+ m_on_finish->complete(0);
+ delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::ReleaseRequest<librbd::ImageCtx>;
+
diff --git a/src/librbd/managed_lock/ReleaseRequest.h b/src/librbd/managed_lock/ReleaseRequest.h
new file mode 100644
index 000000000..91d922282
--- /dev/null
+++ b/src/librbd/managed_lock/ReleaseRequest.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "librbd/watcher/Types.h"
+#include <string>
+
+class Context;
+class ContextWQ;
+
+namespace librbd {
+
+class Watcher;
+namespace asio { struct ContextWQ; }
+
+namespace managed_lock {
+
+template <typename ImageCtxT>
+class ReleaseRequest {
+private:
+ typedef watcher::Traits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Watcher Watcher;
+
+public:
+ static ReleaseRequest* create(librados::IoCtx& ioctx, Watcher *watcher,
+ asio::ContextWQ *work_queue,
+ const std::string& oid,
+ const std::string& cookie,
+ Context *on_finish);
+
+ ~ReleaseRequest();
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UNLOCK
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
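+
+  // Note (see ReleaseRequest.cc): an UNLOCK failure other than -ENOENT is
+  // logged, but on_finish is still completed with 0; releasing the lock is
+  // treated as best-effort.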
+
+ ReleaseRequest(librados::IoCtx& ioctx, Watcher *watcher,
+ asio::ContextWQ *work_queue, const std::string& oid,
+ const std::string& cookie, Context *on_finish);
+
+ librados::IoCtx& m_ioctx;
+ Watcher *m_watcher;
+ std::string m_oid;
+ std::string m_cookie;
+ Context *m_on_finish;
+
+ void send_unlock();
+ void handle_unlock(int r);
+
+ void finish();
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H
diff --git a/src/librbd/managed_lock/Types.h b/src/librbd/managed_lock/Types.h
new file mode 100644
index 000000000..319789c83
--- /dev/null
+++ b/src/librbd/managed_lock/Types.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_TYPES_H
+#define CEPH_LIBRBD_MANAGED_LOCK_TYPES_H
+
+#include "msg/msg_types.h"
+#include <string>
+
+namespace librbd {
+namespace managed_lock {
+
+struct Locker {
+ entity_name_t entity;
+ std::string cookie;
+ std::string address;
+ uint64_t handle = 0;
+
+ Locker() {
+ }
+ Locker(const entity_name_t& entity, const std::string &cookie,
+ const std::string &address, uint64_t handle)
+ : entity(entity), cookie(cookie), address(address), handle(handle) {
+ }
+
+ inline bool operator==(const Locker &rhs) const {
+ return (entity == rhs.entity &&
+ cookie == rhs.cookie &&
+ address == rhs.address &&
+ handle == rhs.handle);
+ }
+ inline bool operator!=(const Locker &rhs) const {
+ return !(*this == rhs);
+ }
+};
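+
+// A default-constructed Locker{} doubles as the "no locker" sentinel; for
+// example, BreakRequest treats a refreshed locker that compares equal to
+// Locker{} as "lock already gone":
+//
+//   Locker refreshed;
+//   bool gone = (refreshed == Locker{});  // true until populated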
+
+enum Mode {
+ EXCLUSIVE,
+ SHARED
+};
+
+
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_TYPES_H
diff --git a/src/librbd/managed_lock/Utils.cc b/src/librbd/managed_lock/Utils.cc
new file mode 100644
index 000000000..0b4f908dd
--- /dev/null
+++ b/src/librbd/managed_lock/Utils.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/ceph_assert.h"
+#include "librbd/managed_lock/Utils.h"
+#include <sstream>
+
+namespace librbd {
+namespace managed_lock {
+namespace util {
+
+namespace {
+
+const std::string WATCHER_LOCK_COOKIE_PREFIX = "auto";
+const std::string WATCHER_LOCK_TAG("internal");
+
+} // anonymous namespace
+
+const std::string &get_watcher_lock_tag() {
+ return WATCHER_LOCK_TAG;
+}
+
+bool decode_lock_cookie(const std::string &tag, uint64_t *handle) {
+ std::string prefix;
+ std::istringstream ss(tag);
+ if (!(ss >> prefix >> *handle) || prefix != WATCHER_LOCK_COOKIE_PREFIX) {
+ return false;
+ }
+ return true;
+}
+
+std::string encode_lock_cookie(uint64_t watch_handle) {
+ ceph_assert(watch_handle != 0);
+ std::ostringstream ss;
+ ss << WATCHER_LOCK_COOKIE_PREFIX << " " << watch_handle;
+ return ss.str();
+}
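+
+// Round-trip example (illustrative only): a cookie is the fixed prefix and
+// the watch handle separated by a single space.
+//
+//   uint64_t handle = 0;
+//   std::string cookie = encode_lock_cookie(123);      // "auto 123"
+//   bool ok = decode_lock_cookie(cookie, &handle);     // true, handle == 123
+//   ok = decode_lock_cookie("external 123", &handle);  // false: wrong prefix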
+
+} // namespace util
+} // namespace managed_lock
+} // namespace librbd
+
+
diff --git a/src/librbd/managed_lock/Utils.h b/src/librbd/managed_lock/Utils.h
new file mode 100644
index 000000000..679cbfe8e
--- /dev/null
+++ b/src/librbd/managed_lock/Utils.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_UTILS_H
+#define CEPH_LIBRBD_MANAGED_LOCK_UTILS_H
+
+#include "include/int_types.h"
+#include <string>
+
+namespace librbd {
+namespace managed_lock {
+namespace util {
+
+const std::string &get_watcher_lock_tag();
+
+bool decode_lock_cookie(const std::string &tag, uint64_t *handle);
+std::string encode_lock_cookie(uint64_t watch_handle);
+
+} // namespace util
+} // namespace managed_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_UTILS_H
diff --git a/src/librbd/migration/FileStream.cc b/src/librbd/migration/FileStream.cc
new file mode 100644
index 000000000..63cd722dd
--- /dev/null
+++ b/src/librbd/migration/FileStream.cc
@@ -0,0 +1,232 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _LARGEFILE64_SOURCE
+#define _LARGEFILE64_SOURCE
+#endif // _LARGEFILE64_SOURCE
+
+#include "librbd/migration/FileStream.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/Utils.h"
+#include <boost/asio/buffer.hpp>
+#include <boost/asio/post.hpp>
+#include <boost/asio/read.hpp>
+#include <fcntl.h>
+#include <unistd.h>
+
+namespace librbd {
+namespace migration {
+
+namespace {
+
+const std::string FILE_PATH {"file_path"};
+
+} // anonymous namespace
+
+#ifdef BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::FileStream::ReadRequest " \
+ << this << " " << __func__ << ": "
+
+template <typename I>
+struct FileStream<I>::ReadRequest {
+ FileStream* file_stream;
+ io::Extents byte_extents;
+ bufferlist* data;
+ Context* on_finish;
+ size_t index = 0;
+
+ ReadRequest(FileStream* file_stream, io::Extents&& byte_extents,
+ bufferlist* data, Context* on_finish)
+ : file_stream(file_stream), byte_extents(std::move(byte_extents)),
+ data(data), on_finish(on_finish) {
+ auto cct = file_stream->m_cct;
+ ldout(cct, 20) << dendl;
+ }
+
+ void send() {
+ data->clear();
+ read();
+ }
+
+ void read() {
+ auto cct = file_stream->m_cct;
+ if (index >= byte_extents.size()) {
+ finish(0);
+ return;
+ }
+
+ auto& byte_extent = byte_extents[index++];
+ ldout(cct, 20) << "byte_extent=" << byte_extent << dendl;
+
+ auto ptr = buffer::ptr_node::create(buffer::create_small_page_aligned(
+ byte_extent.second));
+ auto buffer = boost::asio::mutable_buffer(
+ ptr->c_str(), byte_extent.second);
+ data->push_back(std::move(ptr));
+
+ int r;
+ auto offset = lseek64(file_stream->m_file_no, byte_extent.first, SEEK_SET);
+ if (offset == -1) {
+ r = -errno;
+ lderr(cct) << "failed to seek file stream: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ boost::system::error_code ec;
+ size_t bytes_read = boost::asio::read(
+ *file_stream->m_stream_descriptor, std::move(buffer), ec);
+ r = -ec.value();
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to read from file stream: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ } else if (bytes_read < byte_extent.second) {
+ lderr(cct) << "failed to read " << byte_extent.second << " bytes from "
+ << "file stream" << dendl;
+ finish(-ERANGE);
+ return;
+ }
+
+ // re-queue the remainder of the read requests
+ boost::asio::post(file_stream->m_strand, [this]() { read(); });
+ }
+
+ void finish(int r) {
+ auto cct = file_stream->m_cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ data->clear();
+ }
+
+ on_finish->complete(r);
+ delete this;
+ }
+};
+
+#endif // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::FileStream: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+FileStream<I>::FileStream(I* image_ctx, const json_spirit::mObject& json_object)
+ : m_cct(image_ctx->cct), m_asio_engine(image_ctx->asio_engine),
+ m_json_object(json_object),
+ m_strand(boost::asio::make_strand(*m_asio_engine)) {
+}
+
+template <typename I>
+FileStream<I>::~FileStream() {
+ if (m_file_no != -1) {
+ ::close(m_file_no);
+ }
+}
+
+#ifdef BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR
+
+template <typename I>
+void FileStream<I>::open(Context* on_finish) {
+ auto& file_path_value = m_json_object[FILE_PATH];
+ if (file_path_value.type() != json_spirit::str_type) {
+ lderr(m_cct) << "failed to locate '" << FILE_PATH << "' key" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& file_path = file_path_value.get_str();
+ ldout(m_cct, 10) << "file_path=" << file_path << dendl;
+
+ m_file_no = ::open(file_path.c_str(), O_RDONLY);
+ if (m_file_no < 0) {
+ int r = -errno;
+ lderr(m_cct) << "failed to open file stream '" << file_path << "': "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ m_stream_descriptor = std::make_optional<
+ boost::asio::posix::stream_descriptor>(m_strand, m_file_no);
+ on_finish->complete(0);
+}
+
+template <typename I>
+void FileStream<I>::close(Context* on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ m_stream_descriptor.reset();
+ on_finish->complete(0);
+}
+
+template <typename I>
+void FileStream<I>::get_size(uint64_t* size, Context* on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ // execute IO operations in a single strand to prevent seek races
+ boost::asio::post(
+ m_strand, [this, size, on_finish]() {
+ auto offset = lseek64(m_file_no, 0, SEEK_END);
+ if (offset == -1) {
+ int r = -errno;
+ lderr(m_cct) << "failed to seek to file end: " << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ ldout(m_cct, 10) << "size=" << offset << dendl;
+ *size = offset;
+ on_finish->complete(0);
+ });
+}
+
+template <typename I>
+void FileStream<I>::read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) {
+ ldout(m_cct, 20) << byte_extents << dendl;
+
+ auto ctx = new ReadRequest(this, std::move(byte_extents), data, on_finish);
+
+ // execute IO operations in a single strand to prevent seek races
+ boost::asio::post(m_strand, [ctx]() { ctx->send(); });
+}
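+
+// Illustrative stream usage (a sketch; the JSON object, extents and
+// completion contexts are assumptions):
+//
+//   auto* stream = FileStream<ImageCtx>::create(image_ctx, json_object);
+//   stream->open(new LambdaContext([stream, bl, on_read](int r) {
+//     if (r == 0) {
+//       // read 4 KiB starting at offset 0; serialized via m_strand
+//       stream->read({{0, 4096}}, bl, on_read);
+//     }
+//   }));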
+
+#else // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR
+
+template <typename I>
+void FileStream<I>::open(Context* on_finish) {
+ on_finish->complete(-EIO);
+}
+
+template <typename I>
+void FileStream<I>::close(Context* on_finish) {
+ on_finish->complete(-EIO);
+}
+
+template <typename I>
+void FileStream<I>::get_size(uint64_t* size, Context* on_finish) {
+ on_finish->complete(-EIO);
+}
+
+template <typename I>
+void FileStream<I>::read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) {
+ on_finish->complete(-EIO);
+}
+
+#endif // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::FileStream<librbd::ImageCtx>;
diff --git a/src/librbd/migration/FileStream.h b/src/librbd/migration/FileStream.h
new file mode 100644
index 000000000..32face71e
--- /dev/null
+++ b/src/librbd/migration/FileStream.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_FILE_STREAM_H
+#define CEPH_LIBRBD_MIGRATION_FILE_STREAM_H
+
+#include "include/int_types.h"
+#include "librbd/migration/StreamInterface.h"
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/strand.hpp>
+#include <boost/asio/posix/basic_stream_descriptor.hpp>
+#include <json_spirit/json_spirit.h>
+#include <memory>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename ImageCtxT>
+class FileStream : public StreamInterface {
+public:
+ static FileStream* create(ImageCtxT* image_ctx,
+ const json_spirit::mObject& json_object) {
+ return new FileStream(image_ctx, json_object);
+ }
+
+ FileStream(ImageCtxT* image_ctx, const json_spirit::mObject& json_object);
+ ~FileStream() override;
+
+ FileStream(const FileStream&) = delete;
+ FileStream& operator=(const FileStream&) = delete;
+
+ void open(Context* on_finish) override;
+ void close(Context* on_finish) override;
+
+ void get_size(uint64_t* size, Context* on_finish) override;
+
+ void read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) override;
+
+private:
+ CephContext* m_cct;
+ std::shared_ptr<AsioEngine> m_asio_engine;
+ json_spirit::mObject m_json_object;
+
+ boost::asio::strand<boost::asio::io_context::executor_type> m_strand;
+#ifdef BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR
+ std::optional<boost::asio::posix::stream_descriptor> m_stream_descriptor;
+
+ struct ReadRequest;
+
+#endif // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR
+
+ int m_file_no = -1;
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::FileStream<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_FILE_STREAM_H
diff --git a/src/librbd/migration/FormatInterface.h b/src/librbd/migration/FormatInterface.h
new file mode 100644
index 000000000..d13521d11
--- /dev/null
+++ b/src/librbd/migration/FormatInterface.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_FORMAT_INTERFACE_H
+#define CEPH_LIBRBD_MIGRATION_FORMAT_INTERFACE_H
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+struct Context;
+
+namespace librbd {
+
+namespace io {
+struct AioCompletion;
+struct ReadResult;
+} // namespace io
+
+namespace migration {
+
+struct FormatInterface {
+ typedef std::map<uint64_t, SnapInfo> SnapInfos;
+
+ virtual ~FormatInterface() {
+ }
+
+ virtual void open(Context* on_finish) = 0;
+ virtual void close(Context* on_finish) = 0;
+
+ virtual void get_snapshots(SnapInfos* snap_infos, Context* on_finish) = 0;
+ virtual void get_image_size(uint64_t snap_id, uint64_t* size,
+ Context* on_finish) = 0;
+
+ virtual bool read(io::AioCompletion* aio_comp, uint64_t snap_id,
+ io::Extents&& image_extents, io::ReadResult&& read_result,
+ int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) = 0;
+
+ virtual void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids,
+ int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) = 0;
+};
+
+} // namespace migration
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIGRATION_FORMAT_INTERFACE_H
diff --git a/src/librbd/migration/HttpClient.cc b/src/librbd/migration/HttpClient.cc
new file mode 100644
index 000000000..679c2bb07
--- /dev/null
+++ b/src/librbd/migration/HttpClient.cc
@@ -0,0 +1,946 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/HttpClient.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/migration/Utils.h"
+#include <boost/asio/buffer.hpp>
+#include <boost/asio/post.hpp>
+#include <boost/asio/ip/tcp.hpp>
+#include <boost/asio/read.hpp>
+#include <boost/asio/ssl.hpp>
+#include <boost/beast/core.hpp>
+#include <boost/beast/http/read.hpp>
+#include <boost/lexical_cast.hpp>
+#include <deque>
+
+namespace librbd {
+namespace migration {
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::HttpClient::" \
+ << "HttpSession " << this << " " << __func__ \
+ << ": "
+
+/**
+ * boost::beast uses non-inheriting template classes for plain vs encrypted
+ * TCP streams, so a shared base class carries the majority of the logic for
+ * connecting, disconnecting, resetting, and sending requests.
+ */
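+
+/*
+ * Sketch of the resulting pattern (orientation only): HttpSession<D> is a
+ * CRTP base; PlainHttpSession and SslHttpSession each expose a stream()
+ * accessor, and the base reaches it via derived().stream() instead of a
+ * virtual call:
+ *
+ *   D& derived() { return static_cast<D&>(*this); }
+ *   boost::beast::get_lowest_layer(derived().stream()).socket().close(ec);
+ */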
+
+template <typename I>
+template <typename D>
+class HttpClient<I>::HttpSession : public HttpSessionInterface {
+public:
+ void init(Context* on_finish) override {
+ ceph_assert(m_http_client->m_strand.running_in_this_thread());
+
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ ceph_assert(m_state == STATE_UNINITIALIZED);
+ m_state = STATE_CONNECTING;
+
+ resolve_host(on_finish);
+ }
+
+ void shut_down(Context* on_finish) override {
+ ceph_assert(m_http_client->m_strand.running_in_this_thread());
+
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ ceph_assert(on_finish != nullptr);
+ ceph_assert(m_on_shutdown == nullptr);
+ m_on_shutdown = on_finish;
+
+ auto current_state = m_state;
+ if (current_state == STATE_UNINITIALIZED) {
+ // never initialized or resolve/connect failed
+ on_finish->complete(0);
+ return;
+ }
+
+ m_state = STATE_SHUTTING_DOWN;
+ if (current_state != STATE_READY) {
+ // delay shutdown until current state transition completes
+ return;
+ }
+
+ disconnect(new LambdaContext([this](int r) { handle_shut_down(r); }));
+ }
+
+ void issue(std::shared_ptr<Work>&& work) override {
+ ceph_assert(m_http_client->m_strand.running_in_this_thread());
+
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 20) << "work=" << work.get() << dendl;
+
+ if (is_shutdown()) {
+ lderr(cct) << "cannot issue HTTP request, client is shutdown"
+ << dendl;
+ work->complete(-ESHUTDOWN, {});
+ return;
+ }
+
+ bool first_issue = m_issue_queue.empty();
+ m_issue_queue.emplace_back(work);
+ if (m_state == STATE_READY && first_issue) {
+ ldout(cct, 20) << "sending http request: work=" << work.get() << dendl;
+ finalize_issue(std::move(work));
+ } else if (m_state == STATE_UNINITIALIZED) {
+ ldout(cct, 20) << "resetting HTTP session: work=" << work.get() << dendl;
+ m_state = STATE_RESET_CONNECTING;
+ resolve_host(nullptr);
+ } else {
+ ldout(cct, 20) << "queueing HTTP request: work=" << work.get() << dendl;
+ }
+ }
+
+ void finalize_issue(std::shared_ptr<Work>&& work) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 20) << "work=" << work.get() << dendl;
+
+ ++m_in_flight_requests;
+ (*work)(derived().stream());
+ }
+
+ void handle_issue(boost::system::error_code ec,
+ std::shared_ptr<Work>&& work) override {
+ ceph_assert(m_http_client->m_strand.running_in_this_thread());
+
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 20) << "work=" << work.get() << ", r=" << -ec.value() << dendl;
+
+ ceph_assert(m_in_flight_requests > 0);
+ --m_in_flight_requests;
+ if (maybe_finalize_reset()) {
+      // a previous request is attempting a reset; this request will be resent
+ return;
+ }
+
+ ceph_assert(!m_issue_queue.empty());
+ m_issue_queue.pop_front();
+
+ if (is_shutdown()) {
+ lderr(cct) << "client shutdown during in-flight request" << dendl;
+ work->complete(-ESHUTDOWN, {});
+
+ maybe_finalize_shutdown();
+ return;
+ }
+
+ if (ec) {
+ if (ec == boost::asio::error::bad_descriptor ||
+ ec == boost::asio::error::broken_pipe ||
+ ec == boost::asio::error::connection_reset ||
+ ec == boost::asio::error::operation_aborted ||
+ ec == boost::asio::ssl::error::stream_truncated ||
+ ec == boost::beast::http::error::end_of_stream ||
+ ec == boost::beast::http::error::partial_message) {
+ ldout(cct, 5) << "remote peer stream closed, retrying request" << dendl;
+ m_issue_queue.push_front(work);
+ } else if (ec == boost::beast::error::timeout) {
+ lderr(cct) << "timed-out while issuing request" << dendl;
+ work->complete(-ETIMEDOUT, {});
+ } else {
+ lderr(cct) << "failed to issue request: " << ec.message() << dendl;
+ work->complete(-ec.value(), {});
+ }
+
+ // attempt to recover the connection
+ reset();
+ return;
+ }
+
+ bool first_receive = m_receive_queue.empty();
+ m_receive_queue.push_back(work);
+ if (first_receive) {
+ receive(std::move(work));
+ }
+
+ // TODO disable pipelining for non-idempotent requests
+
+ // pipeline the next request into the stream
+ if (!m_issue_queue.empty()) {
+ work = m_issue_queue.front();
+ ldout(cct, 20) << "sending http request: work=" << work.get() << dendl;
+ finalize_issue(std::move(work));
+ }
+ }
+
+protected:
+ HttpClient* m_http_client;
+
+ HttpSession(HttpClient* http_client)
+ : m_http_client(http_client), m_resolver(http_client->m_strand) {
+ }
+
+ virtual void connect(boost::asio::ip::tcp::resolver::results_type results,
+ Context* on_finish) = 0;
+ virtual void disconnect(Context* on_finish) = 0;
+
+ void close_socket() {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ boost::system::error_code ec;
+ boost::beast::get_lowest_layer(derived().stream()).socket().close(ec);
+ }
+
+private:
+ enum State {
+ STATE_UNINITIALIZED,
+ STATE_CONNECTING,
+ STATE_READY,
+ STATE_RESET_PENDING,
+ STATE_RESET_DISCONNECTING,
+ STATE_RESET_CONNECTING,
+ STATE_SHUTTING_DOWN,
+ STATE_SHUTDOWN,
+ };
+
+ State m_state = STATE_UNINITIALIZED;
+ boost::asio::ip::tcp::resolver m_resolver;
+
+ Context* m_on_shutdown = nullptr;
+
+ uint64_t m_in_flight_requests = 0;
+ std::deque<std::shared_ptr<Work>> m_issue_queue;
+ std::deque<std::shared_ptr<Work>> m_receive_queue;
+
+ boost::beast::flat_buffer m_buffer;
+ std::optional<boost::beast::http::parser<false, EmptyBody>> m_header_parser;
+ std::optional<boost::beast::http::parser<false, StringBody>> m_parser;
+
+ D& derived() {
+ return static_cast<D&>(*this);
+ }
+
+ void resolve_host(Context* on_finish) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ shutdown_socket();
+ m_resolver.async_resolve(
+ m_http_client->m_url_spec.host, m_http_client->m_url_spec.port,
+ [this, on_finish](boost::system::error_code ec, auto results) {
+ handle_resolve_host(ec, results, on_finish); });
+ }
+
+ void handle_resolve_host(
+ boost::system::error_code ec,
+ boost::asio::ip::tcp::resolver::results_type results,
+ Context* on_finish) {
+ auto cct = m_http_client->m_cct;
+ int r = -ec.value();
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (ec) {
+ if (ec == boost::asio::error::host_not_found) {
+ r = -ENOENT;
+ } else if (ec == boost::asio::error::host_not_found_try_again) {
+ // TODO: add retry throttle
+ r = -EAGAIN;
+ }
+
+ lderr(cct) << "failed to resolve host '"
+ << m_http_client->m_url_spec.host << "': "
+ << cpp_strerror(r) << dendl;
+ advance_state(STATE_UNINITIALIZED, r, on_finish);
+ return;
+ }
+
+ connect(results, new LambdaContext([this, on_finish](int r) {
+ handle_connect(r, on_finish); }));
+ }
+
+ void handle_connect(int r, Context* on_finish) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to connect to host '"
+ << m_http_client->m_url_spec.host << "': "
+ << cpp_strerror(r) << dendl;
+ advance_state(STATE_UNINITIALIZED, r, on_finish);
+ return;
+ }
+
+ advance_state(STATE_READY, 0, on_finish);
+ }
+
+ void handle_shut_down(int r) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to disconnect stream: '" << cpp_strerror(r)
+ << dendl;
+ }
+
+ // cancel all in-flight send/receives (if any)
+ shutdown_socket();
+
+ maybe_finalize_shutdown();
+ }
+
+ void maybe_finalize_shutdown() {
+ if (m_in_flight_requests > 0) {
+ return;
+ }
+
+ // cancel any queued IOs
+ fail_queued_work(-ESHUTDOWN);
+
+ advance_state(STATE_SHUTDOWN, 0, nullptr);
+ }
+
+ bool is_shutdown() const {
+ ceph_assert(m_http_client->m_strand.running_in_this_thread());
+ return (m_state == STATE_SHUTTING_DOWN || m_state == STATE_SHUTDOWN);
+ }
+
+ void reset() {
+ ceph_assert(m_http_client->m_strand.running_in_this_thread());
+ ceph_assert(m_state == STATE_READY);
+
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ m_state = STATE_RESET_PENDING;
+ maybe_finalize_reset();
+ }
+
+ bool maybe_finalize_reset() {
+ if (m_state != STATE_RESET_PENDING) {
+ return false;
+ }
+
+ if (m_in_flight_requests > 0) {
+ return true;
+ }
+
+ ceph_assert(m_http_client->m_strand.running_in_this_thread());
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ m_buffer.clear();
+
+ // move in-flight request back to the front of the issue queue
+ m_issue_queue.insert(m_issue_queue.begin(),
+ m_receive_queue.begin(), m_receive_queue.end());
+ m_receive_queue.clear();
+
+ m_state = STATE_RESET_DISCONNECTING;
+ disconnect(new LambdaContext([this](int r) { handle_reset(r); }));
+ return true;
+ }
+
+ void handle_reset(int r) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to disconnect stream: '" << cpp_strerror(r)
+ << dendl;
+ }
+
+ advance_state(STATE_RESET_CONNECTING, r, nullptr);
+ }
+
+ int shutdown_socket() {
+ if (!boost::beast::get_lowest_layer(
+ derived().stream()).socket().is_open()) {
+ return 0;
+ }
+
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ boost::system::error_code ec;
+ boost::beast::get_lowest_layer(derived().stream()).socket().shutdown(
+ boost::asio::ip::tcp::socket::shutdown_both, ec);
+
+ if (ec && ec != boost::beast::errc::not_connected) {
+ lderr(cct) << "failed to shutdown socket: " << ec.message() << dendl;
+ return -ec.value();
+ }
+
+ close_socket();
+ return 0;
+ }
+
+ void receive(std::shared_ptr<Work>&& work) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << "work=" << work.get() << dendl;
+
+ ceph_assert(!m_receive_queue.empty());
+ ++m_in_flight_requests;
+
+ // receive the response for this request
+ m_parser.emplace();
+ if (work->header_only()) {
+      // HEAD requests don't transfer data, but the parser still cares about max
+ // content-length
+ m_header_parser.emplace();
+ m_header_parser->body_limit(std::numeric_limits<uint64_t>::max());
+
+ boost::beast::http::async_read_header(
+ derived().stream(), m_buffer, *m_header_parser,
+ [this, work=std::move(work)]
+ (boost::beast::error_code ec, std::size_t) mutable {
+ handle_receive(ec, std::move(work));
+ });
+ } else {
+      m_parser->body_limit(1 << 25); // 32MiB: the maximum RBD object size
+ boost::beast::http::async_read(
+ derived().stream(), m_buffer, *m_parser,
+ [this, work=std::move(work)]
+ (boost::beast::error_code ec, std::size_t) mutable {
+ handle_receive(ec, std::move(work));
+ });
+ }
+ }
+
+ void handle_receive(boost::system::error_code ec,
+ std::shared_ptr<Work>&& work) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 15) << "work=" << work.get() << ", r=" << -ec.value() << dendl;
+
+ ceph_assert(m_in_flight_requests > 0);
+ --m_in_flight_requests;
+ if (maybe_finalize_reset()) {
+      // a previous request is attempting a reset; this request will be resent
+ return;
+ }
+
+ ceph_assert(!m_receive_queue.empty());
+ m_receive_queue.pop_front();
+
+ if (is_shutdown()) {
+ lderr(cct) << "client shutdown with in-flight request" << dendl;
+ work->complete(-ESHUTDOWN, {});
+
+ maybe_finalize_shutdown();
+ return;
+ }
+
+ if (ec) {
+ if (ec == boost::asio::error::bad_descriptor ||
+ ec == boost::asio::error::broken_pipe ||
+ ec == boost::asio::error::connection_reset ||
+ ec == boost::asio::error::operation_aborted ||
+ ec == boost::asio::ssl::error::stream_truncated ||
+ ec == boost::beast::http::error::end_of_stream ||
+ ec == boost::beast::http::error::partial_message) {
+ ldout(cct, 5) << "remote peer stream closed, retrying request" << dendl;
+ m_receive_queue.push_front(work);
+ } else if (ec == boost::beast::error::timeout) {
+ lderr(cct) << "timed-out while issuing request" << dendl;
+ work->complete(-ETIMEDOUT, {});
+ } else {
+ lderr(cct) << "failed to issue request: " << ec.message() << dendl;
+ work->complete(-ec.value(), {});
+ }
+
+ reset();
+ return;
+ }
+
+ Response response;
+ if (work->header_only()) {
+ m_parser.emplace(std::move(*m_header_parser));
+ }
+ response = m_parser->release();
+
+ // basic response code handling in a common location
+ int r = 0;
+ auto result = response.result();
+ if (result == boost::beast::http::status::not_found) {
+ lderr(cct) << "requested resource does not exist" << dendl;
+ r = -ENOENT;
+ } else if (result == boost::beast::http::status::forbidden) {
+ lderr(cct) << "permission denied attempting to access resource" << dendl;
+ r = -EACCES;
+ } else if (boost::beast::http::to_status_class(result) !=
+ boost::beast::http::status_class::successful) {
+ lderr(cct) << "failed to retrieve size: HTTP " << result << dendl;
+ r = -EIO;
+ }
+
+ bool need_eof = response.need_eof();
+ if (r < 0) {
+ work->complete(r, {});
+ } else {
+ work->complete(0, std::move(response));
+ }
+
+ if (need_eof) {
+ ldout(cct, 20) << "reset required for non-pipelined response: "
+ << "work=" << work.get() << dendl;
+ reset();
+ } else if (!m_receive_queue.empty()) {
+ auto work = m_receive_queue.front();
+ receive(std::move(work));
+ }
+ }
+
+ void advance_state(State next_state, int r, Context* on_finish) {
+ auto cct = m_http_client->m_cct;
+ auto current_state = m_state;
+ ldout(cct, 15) << "current_state=" << current_state << ", "
+ << "next_state=" << next_state << ", "
+ << "r=" << r << dendl;
+
+ m_state = next_state;
+ if (current_state == STATE_CONNECTING) {
+ if (next_state == STATE_UNINITIALIZED) {
+ shutdown_socket();
+ on_finish->complete(r);
+ return;
+ } else if (next_state == STATE_READY) {
+ on_finish->complete(r);
+ return;
+ }
+ } else if (current_state == STATE_SHUTTING_DOWN) {
+ if (next_state == STATE_READY) {
+ // shut down requested while connecting/resetting
+ disconnect(new LambdaContext([this](int r) { handle_shut_down(r); }));
+ return;
+ } else if (next_state == STATE_UNINITIALIZED ||
+ next_state == STATE_SHUTDOWN ||
+ next_state == STATE_RESET_CONNECTING) {
+ ceph_assert(m_on_shutdown != nullptr);
+ m_on_shutdown->complete(r);
+ return;
+ }
+ } else if (current_state == STATE_RESET_DISCONNECTING) {
+ // disconnected from peer -- ignore errors and reconnect
+ ceph_assert(next_state == STATE_RESET_CONNECTING);
+ ceph_assert(on_finish == nullptr);
+ shutdown_socket();
+ resolve_host(nullptr);
+ return;
+ } else if (current_state == STATE_RESET_CONNECTING) {
+ ceph_assert(on_finish == nullptr);
+ if (next_state == STATE_READY) {
+ // restart queued IO
+ if (!m_issue_queue.empty()) {
+ auto& work = m_issue_queue.front();
+ finalize_issue(std::move(work));
+ }
+ return;
+ } else if (next_state == STATE_UNINITIALIZED) {
+ shutdown_socket();
+
+ // fail all queued IO
+ fail_queued_work(r);
+ return;
+ }
+ }
+
+ lderr(cct) << "unexpected state transition: "
+ << "current_state=" << current_state << ", "
+ << "next_state=" << next_state << dendl;
+ ceph_assert(false);
+ }
+
+ void complete_work(std::shared_ptr<Work> work, int r, Response&& response) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 20) << "work=" << work.get() << ", r=" << r << dendl;
+
+ work->complete(r, std::move(response));
+ }
+
+ void fail_queued_work(int r) {
+ auto cct = m_http_client->m_cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ for (auto& work : m_issue_queue) {
+ complete_work(work, r, {});
+ }
+ m_issue_queue.clear();
+ ceph_assert(m_receive_queue.empty());
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::HttpClient::" \
+ << "PlainHttpSession " << this << " " << __func__ \
+ << ": "
+
+template <typename I>
+class HttpClient<I>::PlainHttpSession : public HttpSession<PlainHttpSession> {
+public:
+ PlainHttpSession(HttpClient* http_client)
+ : HttpSession<PlainHttpSession>(http_client),
+ m_stream(http_client->m_strand) {
+ }
+ ~PlainHttpSession() override {
+ this->close_socket();
+ }
+
+ inline boost::beast::tcp_stream&
+ stream() {
+ return m_stream;
+ }
+
+protected:
+ void connect(boost::asio::ip::tcp::resolver::results_type results,
+ Context* on_finish) override {
+ auto http_client = this->m_http_client;
+ auto cct = http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ m_stream.async_connect(
+ results,
+ asio::util::get_callback_adapter(
+ [on_finish](int r, auto endpoint) { on_finish->complete(r); }));
+ }
+
+ void disconnect(Context* on_finish) override {
+ on_finish->complete(0);
+ }
+
+private:
+ boost::beast::tcp_stream m_stream;
+
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::HttpClient::" \
+ << "SslHttpSession " << this << " " << __func__ \
+ << ": "
+
+template <typename I>
+class HttpClient<I>::SslHttpSession : public HttpSession<SslHttpSession> {
+public:
+ SslHttpSession(HttpClient* http_client)
+ : HttpSession<SslHttpSession>(http_client),
+ m_stream(http_client->m_strand, http_client->m_ssl_context) {
+ }
+ ~SslHttpSession() override {
+ this->close_socket();
+ }
+
+ inline boost::beast::ssl_stream<boost::beast::tcp_stream>&
+ stream() {
+ return m_stream;
+ }
+
+protected:
+ void connect(boost::asio::ip::tcp::resolver::results_type results,
+ Context* on_finish) override {
+ auto http_client = this->m_http_client;
+ auto cct = http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ boost::beast::get_lowest_layer(m_stream).async_connect(
+ results,
+ asio::util::get_callback_adapter(
+ [this, on_finish](int r, auto endpoint) {
+ handle_connect(r, on_finish); }));
+ }
+
+ void disconnect(Context* on_finish) override {
+ auto http_client = this->m_http_client;
+ auto cct = http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ if (!m_ssl_enabled) {
+ on_finish->complete(0);
+ return;
+ }
+
+ m_stream.async_shutdown(
+ asio::util::get_callback_adapter([this, on_finish](int r) {
+ shutdown(r, on_finish); }));
+ }
+
+private:
+ boost::beast::ssl_stream<boost::beast::tcp_stream> m_stream;
+ bool m_ssl_enabled = false;
+
+ void handle_connect(int r, Context* on_finish) {
+ auto http_client = this->m_http_client;
+ auto cct = http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to connect to host '"
+ << http_client->m_url_spec.host << "': "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ handshake(on_finish);
+ }
+
+ void handshake(Context* on_finish) {
+ auto http_client = this->m_http_client;
+ auto cct = http_client->m_cct;
+ ldout(cct, 15) << dendl;
+
+ auto& host = http_client->m_url_spec.host;
+ m_stream.set_verify_mode(
+ boost::asio::ssl::verify_peer |
+ boost::asio::ssl::verify_fail_if_no_peer_cert);
+ m_stream.set_verify_callback(
+ [host, next=boost::asio::ssl::host_name_verification(host),
+ ignore_self_signed=http_client->m_ignore_self_signed_cert]
+ (bool preverified, boost::asio::ssl::verify_context& ctx) {
+ if (!preverified && ignore_self_signed) {
+ auto ec = X509_STORE_CTX_get_error(ctx.native_handle());
+ switch (ec) {
+ case X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT:
+ case X509_V_ERR_SELF_SIGNED_CERT_IN_CHAIN:
+ // ignore self-signed cert issues
+ preverified = true;
+ break;
+ default:
+ break;
+ }
+ }
+ return next(preverified, ctx);
+ });
+
+ // Set SNI Hostname (many hosts need this to handshake successfully)
+ if (!SSL_set_tlsext_host_name(m_stream.native_handle(),
+ http_client->m_url_spec.host.c_str())) {
+ int r = -::ERR_get_error();
+ lderr(cct) << "failed to initialize SNI hostname: " << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ // Perform the SSL/TLS handshake
+ m_stream.async_handshake(
+ boost::asio::ssl::stream_base::client,
+ asio::util::get_callback_adapter(
+ [this, on_finish](int r) { handle_handshake(r, on_finish); }));
+ }
+
+ void handle_handshake(int r, Context* on_finish) {
+ auto http_client = this->m_http_client;
+ auto cct = http_client->m_cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to complete handshake: " << cpp_strerror(r)
+ << dendl;
+ disconnect(new LambdaContext([r, on_finish](int) {
+ on_finish->complete(r); }));
+ return;
+ }
+
+ m_ssl_enabled = true;
+ on_finish->complete(0);
+ }
+
+ void shutdown(int r, Context* on_finish) {
+ auto http_client = this->m_http_client;
+ auto cct = http_client->m_cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ on_finish->complete(r);
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::HttpClient: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+HttpClient<I>::HttpClient(I* image_ctx, const std::string& url)
+ : m_cct(image_ctx->cct), m_image_ctx(image_ctx),
+ m_asio_engine(image_ctx->asio_engine), m_url(url),
+ m_strand(boost::asio::make_strand(*m_asio_engine)),
+ m_ssl_context(boost::asio::ssl::context::sslv23_client) {
+ m_ssl_context.set_default_verify_paths();
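+ // sslv23_client selects OpenSSL's generic negotiation method; the
+ // actual TLS protocol version is chosen at handshake time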
+}
+
+template <typename I>
+void HttpClient<I>::open(Context* on_finish) {
+ ldout(m_cct, 10) << "url=" << m_url << dendl;
+
+ int r = util::parse_url(m_cct, m_url, &m_url_spec);
+ if (r < 0) {
+ lderr(m_cct) << "failed to parse url '" << m_url << "': " << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ boost::asio::post(m_strand, [this, on_finish]() mutable {
+ create_http_session(on_finish); });
+}
+
+template <typename I>
+void HttpClient<I>::close(Context* on_finish) {
+ boost::asio::post(m_strand, [this, on_finish]() mutable {
+ shut_down_http_session(on_finish); });
+}
+
+template <typename I>
+void HttpClient<I>::get_size(uint64_t* size, Context* on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ Request req;
+ req.method(boost::beast::http::verb::head);
+
+ issue(
+ std::move(req), [this, size, on_finish](int r, Response&& response) {
+ handle_get_size(r, std::move(response), size, on_finish);
+ });
+}
+
+template <typename I>
+void HttpClient<I>::handle_get_size(int r, Response&& response, uint64_t* size,
+ Context* on_finish) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve size: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ } else if (!response.has_content_length()) {
+ lderr(m_cct) << "failed to retrieve size: missing content-length" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto content_length = response[boost::beast::http::field::content_length];
+ try {
+ *size = boost::lexical_cast<uint64_t>(content_length);
+ } catch (boost::bad_lexical_cast&) {
+ lderr(m_cct) << "invalid content-length in response" << dendl;
+ on_finish->complete(-EBADMSG);
+ return;
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void HttpClient<I>::read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) {
+ ldout(m_cct, 20) << dendl;
+
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_READ);
+ aio_comp->set_request_count(byte_extents.size());
+
+ // utilize ReadResult to assemble multiple byte extents into a single bl
+ // since boost::beast doesn't support multipart responses out-of-the-box
+ io::ReadResult read_result{data};
+ aio_comp->read_result = std::move(read_result);
+ aio_comp->read_result.set_image_extents(byte_extents);
+
+ // issue a range get request for each extent
+ uint64_t buffer_offset = 0;
+ for (auto [byte_offset, byte_length] : byte_extents) {
+ auto ctx = new io::ReadResult::C_ImageReadRequest(
+ aio_comp, buffer_offset, {{byte_offset, byte_length}});
+ buffer_offset += byte_length;
+
+ Request req;
+ req.method(boost::beast::http::verb::get);
+
+ std::stringstream range;
+ ceph_assert(byte_length > 0);
+ range << "bytes=" << byte_offset << "-" << (byte_offset + byte_length - 1);
+ req.set(boost::beast::http::field::range, range.str());
+
+ issue(
+ std::move(req),
+ [this, byte_offset=byte_offset, byte_length=byte_length, ctx]
+ (int r, Response&& response) {
+ handle_read(r, std::move(response), byte_offset, byte_length, &ctx->bl,
+ ctx);
+ });
+ }
+}
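+// Illustrative usage (not part of this change): read two byte extents into
+// a single bufferlist; the callback's argument is negative on error:
+//
+//   bufferlist bl;
+//   http_client->read({{0, 4096}, {1 << 20, 512}}, &bl,
+//                     new LambdaContext([](int r) { /* ... */ }));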
+
+template <typename I>
+void HttpClient<I>::handle_read(int r, Response&& response,
+ uint64_t byte_offset, uint64_t byte_length,
+ bufferlist* data, Context* on_finish) {
+ ldout(m_cct, 20) << "bytes=" << byte_offset << "~" << byte_length << ", "
+ << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to read requested byte range: "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ } else if (response.result() != boost::beast::http::status::partial_content) {
+ lderr(m_cct) << "failed to retrieve requested byte range: HTTP "
+ << response.result() << dendl;
+ on_finish->complete(-EIO);
+ return;
+ } else if (byte_length != response.body().size()) {
+ lderr(m_cct) << "unexpected short range read: "
+ << "wanted=" << byte_length << ", "
+ << "received=" << response.body().size() << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ data->clear();
+ data->append(response.body());
+ on_finish->complete(data->length());
+}
+
+template <typename I>
+void HttpClient<I>::issue(std::shared_ptr<Work>&& work) {
+ boost::asio::post(m_strand, [this, work=std::move(work)]() mutable {
+ m_http_session->issue(std::move(work)); });
+}
+
+template <typename I>
+void HttpClient<I>::create_http_session(Context* on_finish) {
+ ldout(m_cct, 15) << dendl;
+
+ ceph_assert(m_http_session == nullptr);
+ switch (m_url_spec.scheme) {
+ case URL_SCHEME_HTTP:
+ m_http_session = std::make_unique<PlainHttpSession>(this);
+ break;
+ case URL_SCHEME_HTTPS:
+ m_http_session = std::make_unique<SslHttpSession>(this);
+ break;
+ default:
+ ceph_assert(false);
+ break;
+ }
+
+ m_http_session->init(on_finish);
+}
+
+template <typename I>
+void HttpClient<I>::shut_down_http_session(Context* on_finish) {
+ ldout(m_cct, 15) << dendl;
+
+ if (m_http_session == nullptr) {
+ on_finish->complete(0);
+ return;
+ }
+
+ m_http_session->shut_down(on_finish);
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::HttpClient<librbd::ImageCtx>;
diff --git a/src/librbd/migration/HttpClient.h b/src/librbd/migration/HttpClient.h
new file mode 100644
index 000000000..3997e6159
--- /dev/null
+++ b/src/librbd/migration/HttpClient.h
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_HTTP_CLIENT_H
+#define CEPH_LIBRBD_MIGRATION_HTTP_CLIENT_H
+
+#include "include/common_fwd.h"
+#include "include/int_types.h"
+#include "librbd/io/Types.h"
+#include "librbd/migration/HttpProcessorInterface.h"
+#include "librbd/migration/Types.h"
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/strand.hpp>
+#include <boost/asio/ip/tcp.hpp>
+#include <boost/asio/ssl/context.hpp>
+#include <boost/beast/version.hpp>
+#include <boost/beast/core/tcp_stream.hpp>
+#include <boost/beast/http/empty_body.hpp>
+#include <boost/beast/http/message.hpp>
+#include <boost/beast/http/string_body.hpp>
+#include <boost/beast/http/write.hpp>
+#include <boost/beast/ssl/ssl_stream.hpp>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename ImageCtxT>
+class HttpClient {
+public:
+ using EmptyBody = boost::beast::http::empty_body;
+ using StringBody = boost::beast::http::string_body;
+ using Request = boost::beast::http::request<EmptyBody>;
+ using Response = boost::beast::http::response<StringBody>;
+
+ using RequestPreprocessor = std::function<void(Request&)>;
+
+ static HttpClient* create(ImageCtxT* image_ctx, const std::string& url) {
+ return new HttpClient(image_ctx, url);
+ }
+
+ HttpClient(ImageCtxT* image_ctx, const std::string& url);
+ HttpClient(const HttpClient&) = delete;
+ HttpClient& operator=(const HttpClient&) = delete;
+
+ void open(Context* on_finish);
+ void close(Context* on_finish);
+
+ void get_size(uint64_t* size, Context* on_finish);
+
+ void read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish);
+
+ void set_ignore_self_signed_cert(bool ignore) {
+ m_ignore_self_signed_cert = ignore;
+ }
+
+ void set_http_processor(HttpProcessorInterface* http_processor) {
+ m_http_processor = http_processor;
+ }
+
+ template <class Body, typename Completion>
+ void issue(boost::beast::http::request<Body>&& request,
+ Completion&& completion) {
+ struct WorkImpl : Work {
+ HttpClient* http_client;
+ boost::beast::http::request<Body> request;
+ Completion completion;
+
+ WorkImpl(HttpClient* http_client,
+ boost::beast::http::request<Body>&& request,
+ Completion&& completion)
+ : http_client(http_client), request(std::move(request)),
+ completion(std::move(completion)) {
+ }
+ WorkImpl(const WorkImpl&) = delete;
+ WorkImpl& operator=(const WorkImpl&) = delete;
+
+ bool need_eof() const override {
+ return request.need_eof();
+ }
+
+ bool header_only() const override {
+ return (request.method() == boost::beast::http::verb::head);
+ }
+
+ void complete(int r, Response&& response) override {
+ completion(r, std::move(response));
+ }
+
+ void operator()(boost::beast::tcp_stream& stream) override {
+ preprocess_request();
+
+ boost::beast::http::async_write(
+ stream, request,
+ [http_session=http_client->m_http_session.get(),
+ work=this->shared_from_this()]
+ (boost::beast::error_code ec, std::size_t) mutable {
+ http_session->handle_issue(ec, std::move(work));
+ });
+ }
+
+ void operator()(
+ boost::beast::ssl_stream<boost::beast::tcp_stream>& stream) override {
+ preprocess_request();
+
+ boost::beast::http::async_write(
+ stream, request,
+ [http_session=http_client->m_http_session.get(),
+ work=this->shared_from_this()]
+ (boost::beast::error_code ec, std::size_t) mutable {
+ http_session->handle_issue(ec, std::move(work));
+ });
+ }
+
+ void preprocess_request() {
+ if (http_client->m_http_processor) {
+ http_client->m_http_processor->process_request(request);
+ }
+ }
+ };
+
+ initialize_default_fields(request);
+ issue(std::make_shared<WorkImpl>(this, std::move(request),
+ std::move(completion)));
+ }
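+ // Illustrative usage (assumes an already open()ed client):
+ //
+ //   Request req;
+ //   req.method(boost::beast::http::verb::get);
+ //   http_client->issue(std::move(req), [](int r, Response&& rsp) {
+ //       if (r >= 0 && rsp.result() == boost::beast::http::status::ok) {
+ //         // rsp.body() holds the payload
+ //       }
+ //     });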
+
+private:
+ struct Work;
+ struct HttpSessionInterface {
+ virtual ~HttpSessionInterface() {}
+
+ virtual void init(Context* on_finish) = 0;
+ virtual void shut_down(Context* on_finish) = 0;
+
+ virtual void issue(std::shared_ptr<Work>&& work) = 0;
+ virtual void handle_issue(boost::system::error_code ec,
+ std::shared_ptr<Work>&& work) = 0;
+ };
+
+ struct Work : public std::enable_shared_from_this<Work> {
+ virtual ~Work() {}
+ virtual bool need_eof() const = 0;
+ virtual bool header_only() const = 0;
+ virtual void complete(int r, Response&&) = 0;
+ virtual void operator()(boost::beast::tcp_stream& stream) = 0;
+ virtual void operator()(
+ boost::beast::ssl_stream<boost::beast::tcp_stream>& stream) = 0;
+ };
+
+ template <typename D> struct HttpSession;
+ struct PlainHttpSession;
+ struct SslHttpSession;
+
+ CephContext* m_cct;
+ ImageCtxT* m_image_ctx;
+ std::shared_ptr<AsioEngine> m_asio_engine;
+ std::string m_url;
+
+ UrlSpec m_url_spec;
+
+ bool m_ignore_self_signed_cert = false;
+
+ HttpProcessorInterface* m_http_processor = nullptr;
+
+ boost::asio::strand<boost::asio::io_context::executor_type> m_strand;
+
+ boost::asio::ssl::context m_ssl_context;
+ std::unique_ptr<HttpSessionInterface> m_http_session;
+
+ template <typename Fields>
+ void initialize_default_fields(Fields& fields) const {
+ fields.target(m_url_spec.path);
+ fields.set(boost::beast::http::field::host, m_url_spec.host);
+ fields.set(boost::beast::http::field::user_agent,
+ BOOST_BEAST_VERSION_STRING);
+ }
+
+ void handle_get_size(int r, Response&& response, uint64_t* size,
+ Context* on_finish);
+
+ void handle_read(int r, Response&& response, uint64_t byte_offset,
+ uint64_t byte_length, bufferlist* data, Context* on_finish);
+
+ void issue(std::shared_ptr<Work>&& work);
+
+ void create_http_session(Context* on_finish);
+ void shut_down_http_session(Context* on_finish);
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::HttpClient<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_HTTP_CLIENT_H
diff --git a/src/librbd/migration/HttpProcessorInterface.h b/src/librbd/migration/HttpProcessorInterface.h
new file mode 100644
index 000000000..3d9af88bd
--- /dev/null
+++ b/src/librbd/migration/HttpProcessorInterface.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_HTTP_PROCESSOR_INTERFACE_H
+#define CEPH_LIBRBD_MIGRATION_HTTP_PROCESSOR_INTERFACE_H
+
+#include <boost/beast/http/empty_body.hpp>
+#include <boost/beast/http/message.hpp>
+
+namespace librbd {
+namespace migration {
+
+struct HttpProcessorInterface {
+ using EmptyBody = boost::beast::http::empty_body;
+ using EmptyRequest = boost::beast::http::request<EmptyBody>;
+
+ virtual ~HttpProcessorInterface() {
+ }
+
+ virtual void process_request(EmptyRequest& request) = 0;
+
+};
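+// Illustrative sketch of an implementation (the header value is
+// hypothetical): inject a static Authorization header into every
+// outbound request:
+//
+//   struct BearerTokenProcessor : public HttpProcessorInterface {
+//     std::string token;
+//     explicit BearerTokenProcessor(std::string t) : token(std::move(t)) {}
+//     void process_request(EmptyRequest& request) override {
+//       request.set(boost::beast::http::field::authorization,
+//                   "Bearer " + token);
+//     }
+//   };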
+
+} // namespace migration
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIGRATION_HTTP_PROCESSOR_INTERFACE_H
diff --git a/src/librbd/migration/HttpStream.cc b/src/librbd/migration/HttpStream.cc
new file mode 100644
index 000000000..fa3cc0032
--- /dev/null
+++ b/src/librbd/migration/HttpStream.cc
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/HttpStream.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/migration/HttpClient.h"
+#include <boost/beast/http.hpp>
+
+namespace librbd {
+namespace migration {
+
+namespace {
+
+const std::string URL_KEY {"url"};
+
+} // anonymous namespace
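+// A stream of this type is selected by the enclosing source-spec and is
+// configured via a JSON object; minimal illustrative example (URL value
+// hypothetical):
+//
+//   {"type": "http", "url": "https://example.com/image.raw"}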
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::HttpStream: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+HttpStream<I>::HttpStream(I* image_ctx, const json_spirit::mObject& json_object)
+ : m_image_ctx(image_ctx), m_cct(image_ctx->cct),
+ m_asio_engine(image_ctx->asio_engine), m_json_object(json_object) {
+}
+
+template <typename I>
+HttpStream<I>::~HttpStream() {
+}
+
+template <typename I>
+void HttpStream<I>::open(Context* on_finish) {
+ auto& url_value = m_json_object[URL_KEY];
+ if (url_value.type() != json_spirit::str_type) {
+ lderr(m_cct) << "failed to locate '" << URL_KEY << "' key" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_url = url_value.get_str();
+ ldout(m_cct, 10) << "url=" << m_url << dendl;
+
+ m_http_client.reset(HttpClient<I>::create(m_image_ctx, m_url));
+ m_http_client->open(on_finish);
+}
+
+template <typename I>
+void HttpStream<I>::close(Context* on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ if (!m_http_client) {
+ on_finish->complete(0);
+ return;
+ }
+
+ m_http_client->close(on_finish);
+}
+
+template <typename I>
+void HttpStream<I>::get_size(uint64_t* size, Context* on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ m_http_client->get_size(size, on_finish);
+}
+
+template <typename I>
+void HttpStream<I>::read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) {
+ ldout(m_cct, 20) << "byte_extents=" << byte_extents << dendl;
+
+ m_http_client->read(std::move(byte_extents), data, on_finish);
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::HttpStream<librbd::ImageCtx>;
diff --git a/src/librbd/migration/HttpStream.h b/src/librbd/migration/HttpStream.h
new file mode 100644
index 000000000..01a583714
--- /dev/null
+++ b/src/librbd/migration/HttpStream.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_HTTP_STREAM_H
+#define CEPH_LIBRBD_MIGRATION_HTTP_STREAM_H
+
+#include "include/int_types.h"
+#include "librbd/migration/StreamInterface.h"
+#include <boost/beast/http/message.hpp>
+#include <boost/beast/http/string_body.hpp>
+#include <json_spirit/json_spirit.h>
+#include <memory>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename> class HttpClient;
+
+template <typename ImageCtxT>
+class HttpStream : public StreamInterface {
+public:
+ static HttpStream* create(ImageCtxT* image_ctx,
+ const json_spirit::mObject& json_object) {
+ return new HttpStream(image_ctx, json_object);
+ }
+
+ HttpStream(ImageCtxT* image_ctx, const json_spirit::mObject& json_object);
+ ~HttpStream() override;
+
+ HttpStream(const HttpStream&) = delete;
+ HttpStream& operator=(const HttpStream&) = delete;
+
+ void open(Context* on_finish) override;
+ void close(Context* on_finish) override;
+
+ void get_size(uint64_t* size, Context* on_finish) override;
+
+ void read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) override;
+
+private:
+ using HttpResponse = boost::beast::http::response<
+ boost::beast::http::string_body>;
+
+ ImageCtxT* m_image_ctx;
+ CephContext* m_cct;
+ std::shared_ptr<AsioEngine> m_asio_engine;
+ json_spirit::mObject m_json_object;
+
+ std::string m_url;
+
+ std::unique_ptr<HttpClient<ImageCtxT>> m_http_client;
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::HttpStream<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_HTTP_STREAM_H
diff --git a/src/librbd/migration/ImageDispatch.cc b/src/librbd/migration/ImageDispatch.cc
new file mode 100644
index 000000000..3aa2eeb0b
--- /dev/null
+++ b/src/librbd/migration/ImageDispatch.cc
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/ImageDispatch.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/migration/FormatInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::ImageDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace migration {
+
+template <typename I>
+ImageDispatch<I>::ImageDispatch(I* image_ctx,
+ std::unique_ptr<FormatInterface> format)
+ : m_image_ctx(image_ctx), m_format(std::move(format)) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "ictx=" << image_ctx << dendl;
+}
+
+template <typename I>
+void ImageDispatch<I>::shut_down(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool ImageDispatch<I>::read(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ io::ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ return m_format->read(aio_comp, io_context->read_snap().value_or(CEPH_NOSNAP),
+ std::move(image_extents), std::move(read_result),
+ op_flags, read_flags, parent_trace);
+}
+
+template <typename I>
+bool ImageDispatch<I>::write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ lderr(cct) << dendl;
+
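+ // migration source images are read-only; fail all modifying ops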
+ fail_io(-EROFS, aio_comp, dispatch_result);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::discard(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes,
+ IOContext io_context, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ lderr(cct) << dendl;
+
+ fail_io(-EROFS, aio_comp, dispatch_result);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::write_same(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ lderr(cct) << dendl;
+
+ fail_io(-EROFS, aio_comp, dispatch_result);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::compare_and_write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ lderr(cct) << dendl;
+
+ fail_io(-EROFS, aio_comp, dispatch_result);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::flush(
+ io::AioCompletion* aio_comp, io::FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ aio_comp->set_request_count(0);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::list_snaps(
+ io::AioCompletion* aio_comp, io::Extents&& image_extents,
+ io::SnapIds&& snap_ids, int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+
+ aio_comp->set_request_count(1);
+ auto ctx = new io::C_AioRequest(aio_comp);
+
+ m_format->list_snaps(std::move(image_extents), std::move(snap_ids),
+ list_snaps_flags, snapshot_delta, parent_trace,
+ ctx);
+ return true;
+}
+
+template <typename I>
+void ImageDispatch<I>::fail_io(int r, io::AioCompletion* aio_comp,
+ io::DispatchResult* dispatch_result) {
+ *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+ aio_comp->fail(r);
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::ImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/migration/ImageDispatch.h b/src/librbd/migration/ImageDispatch.h
new file mode 100644
index 000000000..03bb3aa52
--- /dev/null
+++ b/src/librbd/migration/ImageDispatch.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_MIGRATION_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include <memory>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace migration {
+
+struct FormatInterface;
+
+template <typename ImageCtxT>
+class ImageDispatch : public io::ImageDispatchInterface {
+public:
+ static ImageDispatch* create(ImageCtxT* image_ctx,
+ std::unique_ptr<FormatInterface> source) {
+ return new ImageDispatch(image_ctx, std::move(source));
+ }
+
+ ImageDispatch(ImageCtxT* image_ctx, std::unique_ptr<FormatInterface> source);
+
+ void shut_down(Context* on_finish) override;
+
+ io::ImageDispatchLayer get_dispatch_layer() const override {
+ return io::IMAGE_DISPATCH_LAYER_MIGRATION;
+ }
+
+ bool read(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ io::ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes,
+ IOContext io_context, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ io::AioCompletion* aio_comp, io::Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ io::AioCompletion* aio_comp, io::FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool list_snaps(
+ io::AioCompletion* aio_comp, io::Extents&& image_extents,
+ io::SnapIds&& snap_ids, int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ io::DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+private:
+ ImageCtxT* m_image_ctx;
+ std::unique_ptr<FormatInterface> m_format;
+
+ void fail_io(int r, io::AioCompletion* aio_comp,
+ io::DispatchResult* dispatch_result);
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::ImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_IMAGE_DISPATCH_H
diff --git a/src/librbd/migration/NativeFormat.cc b/src/librbd/migration/NativeFormat.cc
new file mode 100644
index 000000000..a7682619c
--- /dev/null
+++ b/src/librbd/migration/NativeFormat.cc
@@ -0,0 +1,309 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/NativeFormat.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "json_spirit/json_spirit.h"
+#include "boost/lexical_cast.hpp"
+#include <sstream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::NativeFormat: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace migration {
+
+namespace {
+
+const std::string TYPE_KEY{"type"};
+const std::string POOL_ID_KEY{"pool_id"};
+const std::string POOL_NAME_KEY{"pool_name"};
+const std::string POOL_NAMESPACE_KEY{"pool_namespace"};
+const std::string IMAGE_NAME_KEY{"image_name"};
+const std::string IMAGE_ID_KEY{"image_id"};
+const std::string SNAP_NAME_KEY{"snap_name"};
+const std::string SNAP_ID_KEY{"snap_id"};
+
+} // anonymous namespace
+
+template <typename I>
+std::string NativeFormat<I>::build_source_spec(
+ int64_t pool_id, const std::string& pool_namespace,
+ const std::string& image_name, const std::string& image_id) {
+ json_spirit::mObject source_spec;
+ source_spec[TYPE_KEY] = "native";
+ source_spec[POOL_ID_KEY] = pool_id;
+ source_spec[POOL_NAMESPACE_KEY] = pool_namespace;
+ source_spec[IMAGE_NAME_KEY] = image_name;
+ if (!image_id.empty()) {
+ source_spec[IMAGE_ID_KEY] = image_id;
+ }
+ return json_spirit::write(source_spec);
+}
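+// For example (illustrative values), build_source_spec(2, "", "src", "")
+// produces a JSON object equivalent to:
+//   {"type": "native", "pool_id": 2, "pool_namespace": "",
+//    "image_name": "src"}
+// with "image_id" included only when non-empty.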
+
+template <typename I>
+NativeFormat<I>::NativeFormat(
+ I* image_ctx, const json_spirit::mObject& json_object, bool import_only)
+ : m_image_ctx(image_ctx), m_json_object(json_object),
+ m_import_only(import_only) {
+}
+
+template <typename I>
+void NativeFormat<I>::open(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto& pool_name_val = m_json_object[POOL_NAME_KEY];
+ if (pool_name_val.type() == json_spirit::str_type) {
+ librados::Rados rados(m_image_ctx->md_ctx);
+ librados::IoCtx io_ctx;
+ int r = rados.ioctx_create(pool_name_val.get_str().c_str(), io_ctx);
+ if (r < 0) {
+ lderr(cct) << "invalid pool name" << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ m_pool_id = io_ctx.get_id();
+ } else if (pool_name_val.type() != json_spirit::null_type) {
+ lderr(cct) << "invalid pool name" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& pool_id_val = m_json_object[POOL_ID_KEY];
+ if (m_pool_id != -1 && pool_id_val.type() != json_spirit::null_type) {
+ lderr(cct) << "cannot specify both pool name and pool id" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ } else if (pool_id_val.type() == json_spirit::int_type) {
+ m_pool_id = pool_id_val.get_int64();
+ } else if (pool_id_val.type() == json_spirit::str_type) {
+ try {
+ m_pool_id = boost::lexical_cast<int64_t>(pool_id_val.get_str());
+ } catch (boost::bad_lexical_cast &) {
+ }
+ }
+
+ if (m_pool_id == -1) {
+ lderr(cct) << "missing or invalid pool id" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& pool_namespace_val = m_json_object[POOL_NAMESPACE_KEY];
+ if (pool_namespace_val.type() == json_spirit::str_type) {
+ m_pool_namespace = pool_namespace_val.get_str();
+ } else if (pool_namespace_val.type() != json_spirit::null_type) {
+ lderr(cct) << "invalid pool namespace" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& image_name_val = m_json_object[IMAGE_NAME_KEY];
+ if (image_name_val.type() != json_spirit::str_type) {
+ lderr(cct) << "missing or invalid image name" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ m_image_name = image_name_val.get_str();
+
+ auto& image_id_val = m_json_object[IMAGE_ID_KEY];
+ if (image_id_val.type() == json_spirit::str_type) {
+ m_image_id = image_id_val.get_str();
+ } else if (image_id_val.type() != json_spirit::null_type) {
+ lderr(cct) << "invalid image id" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& snap_name_val = m_json_object[SNAP_NAME_KEY];
+ if (snap_name_val.type() == json_spirit::str_type) {
+ m_snap_name = snap_name_val.get_str();
+ } else if (snap_name_val.type() != json_spirit::null_type) {
+ lderr(cct) << "invalid snap name" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& snap_id_val = m_json_object[SNAP_ID_KEY];
+ if (!m_snap_name.empty() && snap_id_val.type() != json_spirit::null_type) {
+ lderr(cct) << "cannot specify both snap name and snap id" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ } else if (snap_id_val.type() == json_spirit::str_type) {
+ try {
+ m_snap_id = boost::lexical_cast<uint64_t>(snap_id_val.get_str());
+ } catch (boost::bad_lexical_cast &) {
+ }
+ } else if (snap_id_val.type() == json_spirit::int_type) {
+ m_snap_id = snap_id_val.get_uint64();
+ }
+
+ if (snap_id_val.type() != json_spirit::null_type &&
+ m_snap_id == CEPH_NOSNAP) {
+ lderr(cct) << "invalid snap id" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ // snapshot is required for import to keep source read-only
+ if (m_import_only && m_snap_name.empty() && m_snap_id == CEPH_NOSNAP) {
+ lderr(cct) << "snapshot required for import" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ // TODO add support for external clusters
+ librados::IoCtx io_ctx;
+ int r = util::create_ioctx(m_image_ctx->md_ctx, "source image",
+ m_pool_id, m_pool_namespace, &io_ctx);
+ if (r < 0) {
+ on_finish->complete(r);
+ return;
+ }
+
+ m_image_ctx->md_ctx.dup(io_ctx);
+ m_image_ctx->data_ctx.dup(io_ctx);
+ m_image_ctx->name = m_image_name;
+
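+ // an empty image id outside of import-only mode implies a v1
+ // (old-format) image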
+ uint64_t flags = 0;
+ if (m_image_id.empty() && !m_import_only) {
+ flags |= OPEN_FLAG_OLD_FORMAT;
+ } else {
+ m_image_ctx->id = m_image_id;
+ }
+
+ if (m_image_ctx->child != nullptr) {
+ // set rados flags for reading the parent image
+ if (m_image_ctx->child->config.template get_val<bool>("rbd_balance_parent_reads")) {
+ m_image_ctx->set_read_flag(librados::OPERATION_BALANCE_READS);
+ } else if (m_image_ctx->child->config.template get_val<bool>("rbd_localize_parent_reads")) {
+ m_image_ctx->set_read_flag(librados::OPERATION_LOCALIZE_READS);
+ }
+ }
+
+ // open the source RBD image
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ handle_open(r, on_finish); });
+ m_image_ctx->state->open(flags, on_finish);
+}
+
+template <typename I>
+void NativeFormat<I>::handle_open(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ if (m_snap_id == CEPH_NOSNAP && m_snap_name.empty()) {
+ on_finish->complete(0);
+ return;
+ }
+
+ if (!m_snap_name.empty()) {
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ m_snap_id = m_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace{},
+ m_snap_name);
+ }
+
+ if (m_snap_id == CEPH_NOSNAP) {
+ lderr(cct) << "failed to locate snapshot " << m_snap_name << dendl;
+ on_finish = new LambdaContext([on_finish](int) {
+ on_finish->complete(-ENOENT); });
+ m_image_ctx->state->close(on_finish);
+ return;
+ }
+
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ handle_snap_set(r, on_finish); });
+ m_image_ctx->state->snap_set(m_snap_id, on_finish);
+}
+
+template <typename I>
+void NativeFormat<I>::handle_snap_set(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to set snapshot " << m_snap_id << ": "
+ << cpp_strerror(r) << dendl;
+ on_finish = new LambdaContext([r, on_finish](int) {
+ on_finish->complete(r); });
+ m_image_ctx->state->close(on_finish);
+ return;
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NativeFormat<I>::close(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ // the native librbd::image::CloseRequest handles all cleanup
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NativeFormat<I>::get_snapshots(SnapInfos* snap_infos, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ m_image_ctx->image_lock.lock_shared();
+ *snap_infos = m_image_ctx->snap_info;
+ m_image_ctx->image_lock.unlock_shared();
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NativeFormat<I>::get_image_size(uint64_t snap_id, uint64_t* size,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ m_image_ctx->image_lock.lock_shared();
+ *size = m_image_ctx->get_image_size(snap_id);
+ m_image_ctx->image_lock.unlock_shared();
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NativeFormat<I>::list_snaps(io::Extents&& image_extents,
+ io::SnapIds&& snap_ids, int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ auto aio_comp = io::AioCompletion::create_and_start(
+ on_finish, util::get_image_ctx(m_image_ctx), io::AIO_TYPE_GENERIC);
+ auto req = io::ImageDispatchSpec::create_list_snaps(
+ *m_image_ctx, io::IMAGE_DISPATCH_LAYER_MIGRATION, aio_comp,
+ std::move(image_extents), std::move(snap_ids), list_snaps_flags,
+ snapshot_delta, {});
+ req->send();
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::NativeFormat<librbd::ImageCtx>;
diff --git a/src/librbd/migration/NativeFormat.h b/src/librbd/migration/NativeFormat.h
new file mode 100644
index 000000000..e58c04121
--- /dev/null
+++ b/src/librbd/migration/NativeFormat.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_NATIVE_FORMAT_H
+#define CEPH_LIBRBD_MIGRATION_NATIVE_FORMAT_H
+
+#include "include/int_types.h"
+#include "librbd/Types.h"
+#include "librbd/migration/FormatInterface.h"
+#include "json_spirit/json_spirit.h"
+#include <memory>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename ImageCtxT>
+class NativeFormat : public FormatInterface {
+public:
+ static std::string build_source_spec(int64_t pool_id,
+ const std::string& pool_namespace,
+ const std::string& image_name,
+ const std::string& image_id);
+
+ static NativeFormat* create(ImageCtxT* image_ctx,
+ const json_spirit::mObject& json_object,
+ bool import_only) {
+ return new NativeFormat(image_ctx, json_object, import_only);
+ }
+
+ NativeFormat(ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+ bool import_only);
+ NativeFormat(const NativeFormat&) = delete;
+ NativeFormat& operator=(const NativeFormat&) = delete;
+
+ void open(Context* on_finish) override;
+ void close(Context* on_finish) override;
+
+ void get_snapshots(SnapInfos* snap_infos, Context* on_finish) override;
+ void get_image_size(uint64_t snap_id, uint64_t* size,
+ Context* on_finish) override;
+
+ bool read(io::AioCompletion* aio_comp, uint64_t snap_id,
+ io::Extents&& image_extents, io::ReadResult&& read_result,
+ int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) override {
+ return false;
+ }
+
+ void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids,
+ int list_snaps_flags, io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) override;
+
+private:
+ ImageCtxT* m_image_ctx;
+ json_spirit::mObject m_json_object;
+ bool m_import_only;
+
+ int64_t m_pool_id = -1;
+ std::string m_pool_namespace;
+ std::string m_image_name;
+ std::string m_image_id;
+ std::string m_snap_name;
+ uint64_t m_snap_id = CEPH_NOSNAP;
+
+ void handle_open(int r, Context* on_finish);
+ void handle_snap_set(int r, Context* on_finish);
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::NativeFormat<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_NATIVE_FORMAT_H
diff --git a/src/librbd/migration/OpenSourceImageRequest.cc b/src/librbd/migration/OpenSourceImageRequest.cc
new file mode 100644
index 000000000..8abdedf33
--- /dev/null
+++ b/src/librbd/migration/OpenSourceImageRequest.cc
@@ -0,0 +1,249 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/OpenSourceImageRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ImageDispatcher.h"
+#include "librbd/migration/ImageDispatch.h"
+#include "librbd/migration/NativeFormat.h"
+#include "librbd/migration/SourceSpecBuilder.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::OpenSourceImageRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace migration {
+
+template <typename I>
+OpenSourceImageRequest<I>::OpenSourceImageRequest(
+ librados::IoCtx& io_ctx, I* dst_image_ctx, uint64_t src_snap_id,
+ const MigrationInfo &migration_info, I** src_image_ctx, Context* on_finish)
+ : m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())), m_io_ctx(io_ctx),
+ m_dst_image_ctx(dst_image_ctx), m_src_snap_id(src_snap_id),
+ m_migration_info(migration_info), m_src_image_ctx(src_image_ctx),
+ m_on_finish(on_finish) {
+ ldout(m_cct, 10) << dendl;
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::send() {
+ open_source();
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::open_source() {
+ ldout(m_cct, 10) << dendl;
+
+ // note that all source image ctx properties are placeholders
+ *m_src_image_ctx = I::create("", "", CEPH_NOSNAP, m_io_ctx, true);
+ auto src_image_ctx = *m_src_image_ctx;
+ src_image_ctx->child = m_dst_image_ctx;
+
+ // use default layout values (can be overridden by source layers later)
+ src_image_ctx->order = 22;
+ src_image_ctx->layout = file_layout_t();
+ src_image_ctx->layout.stripe_count = 1;
+ src_image_ctx->layout.stripe_unit = 1ULL << src_image_ctx->order;
+ src_image_ctx->layout.object_size = 1ULL << src_image_ctx->order;
+ src_image_ctx->layout.pool_id = -1;
+
+ bool import_only = true;
+ auto source_spec = m_migration_info.source_spec;
+ if (source_spec.empty()) {
+ // implies legacy migration from RBD image in same cluster
+ source_spec = NativeFormat<I>::build_source_spec(
+ m_migration_info.pool_id, m_migration_info.pool_namespace,
+ m_migration_info.image_name, m_migration_info.image_id);
+ import_only = false;
+ }
+
+ ldout(m_cct, 15) << "source_spec=" << source_spec << ", "
+ << "source_snap_id=" << m_src_snap_id << ", "
+ << "import_only=" << import_only << dendl;
+
+ SourceSpecBuilder<I> source_spec_builder{src_image_ctx};
+ json_spirit::mObject source_spec_object;
+ int r = source_spec_builder.parse_source_spec(source_spec,
+ &source_spec_object);
+ if (r < 0) {
+ lderr(m_cct) << "failed to parse migration source-spec:" << cpp_strerror(r)
+ << dendl;
+ (*m_src_image_ctx)->state->close();
+ finish(r);
+ return;
+ }
+
+ r = source_spec_builder.build_format(source_spec_object, import_only,
+ &m_format);
+ if (r < 0) {
+ lderr(m_cct) << "failed to build migration format handler: "
+ << cpp_strerror(r) << dendl;
+ (*m_src_image_ctx)->state->close();
+ finish(r);
+ return;
+ }
+
+ auto ctx = util::create_context_callback<
+ OpenSourceImageRequest<I>,
+ &OpenSourceImageRequest<I>::handle_open_source>(this);
+ m_format->open(ctx);
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::handle_open_source(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to open migration source: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ get_image_size();
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::get_image_size() {
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ OpenSourceImageRequest<I>,
+ &OpenSourceImageRequest<I>::handle_get_image_size>(this);
+ m_format->get_image_size(CEPH_NOSNAP, &m_image_size, ctx);
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::handle_get_image_size(int r) {
+ ldout(m_cct, 10) << "r=" << r << ", "
+ << "image_size=" << m_image_size << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve image size: " << cpp_strerror(r)
+ << dendl;
+ close_image(r);
+ return;
+ }
+
+ auto src_image_ctx = *m_src_image_ctx;
+ src_image_ctx->image_lock.lock();
+ src_image_ctx->size = m_image_size;
+ src_image_ctx->image_lock.unlock();
+
+ get_snapshots();
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::get_snapshots() {
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ OpenSourceImageRequest<I>,
+ &OpenSourceImageRequest<I>::handle_get_snapshots>(this);
+ m_format->get_snapshots(&m_snap_infos, ctx);
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::handle_get_snapshots(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve snapshots: " << cpp_strerror(r)
+ << dendl;
+ close_image(r);
+ return;
+ }
+
+ // copy snapshot metadata to image ctx
+ auto src_image_ctx = *m_src_image_ctx;
+ src_image_ctx->image_lock.lock();
+
+ src_image_ctx->snaps.clear();
+ src_image_ctx->snap_info.clear();
+ src_image_ctx->snap_ids.clear();
+
+ ::SnapContext snapc;
+ for (auto it = m_snap_infos.rbegin(); it != m_snap_infos.rend(); ++it) {
+ auto& [snap_id, snap_info] = *it;
+ snapc.snaps.push_back(snap_id);
+
+ ldout(m_cct, 10) << "adding snap: ns=" << snap_info.snap_namespace << ", "
+ << "name=" << snap_info.name << ", "
+ << "id=" << snap_id << dendl;
+ src_image_ctx->add_snap(
+ snap_info.snap_namespace, snap_info.name, snap_id,
+ snap_info.size, snap_info.parent, snap_info.protection_status,
+ snap_info.flags, snap_info.timestamp);
+ }
+ if (!snapc.snaps.empty()) {
+ snapc.seq = snapc.snaps[0];
+ }
+ src_image_ctx->snapc = snapc;
+
+ ldout(m_cct, 15) << "read snap id: " << m_src_snap_id << ", "
+ << "write snapc={"
+ << "seq=" << snapc.seq << ", "
+ << "snaps=" << snapc.snaps << "}" << dendl;
+
+ // ensure data_ctx and data_io_context are pointing to correct snapshot
+ if (m_src_snap_id != CEPH_NOSNAP) {
+ int r = src_image_ctx->snap_set(m_src_snap_id);
+ if (r < 0) {
+ src_image_ctx->image_lock.unlock();
+
+ lderr(m_cct) << "error setting source image snap id: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ }
+
+ src_image_ctx->image_lock.unlock();
+
+ finish(0);
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::close_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ auto ctx = new LambdaContext([this, r](int) {
+ finish(r);
+ });
+ (*m_src_image_ctx)->state->close(ctx);
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::register_image_dispatch() {
+ ldout(m_cct, 10) << dendl;
+
+ // intercept any IO requests to the source image
+ auto io_image_dispatch = ImageDispatch<I>::create(
+ *m_src_image_ctx, std::move(m_format));
+ (*m_src_image_ctx)->io_image_dispatcher->register_dispatch(io_image_dispatch);
+}
+
+template <typename I>
+void OpenSourceImageRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ *m_src_image_ctx = nullptr;
+ } else {
+ register_image_dispatch();
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::OpenSourceImageRequest<librbd::ImageCtx>;
diff --git a/src/librbd/migration/OpenSourceImageRequest.h b/src/librbd/migration/OpenSourceImageRequest.h
new file mode 100644
index 000000000..f0dab3ad9
--- /dev/null
+++ b/src/librbd/migration/OpenSourceImageRequest.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_OPEN_SOURCE_IMAGE_REQUEST_H
+#define CEPH_LIBRBD_MIGRATION_OPEN_SOURCE_IMAGE_REQUEST_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "librbd/Types.h"
+#include <map>
+#include <memory>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace migration {
+
+struct FormatInterface;
+
+template <typename ImageCtxT>
+class OpenSourceImageRequest {
+public:
+ static OpenSourceImageRequest* create(librados::IoCtx& io_ctx,
+ ImageCtxT* destination_image_ctx,
+ uint64_t src_snap_id,
+ const MigrationInfo &migration_info,
+ ImageCtxT** source_image_ctx,
+ Context* on_finish) {
+ return new OpenSourceImageRequest(io_ctx, destination_image_ctx,
+ src_snap_id, migration_info,
+ source_image_ctx, on_finish);
+ }
+
+ OpenSourceImageRequest(librados::IoCtx& io_ctx,
+ ImageCtxT* destination_image_ctx,
+ uint64_t src_snap_id,
+ const MigrationInfo &migration_info,
+ ImageCtxT** source_image_ctx,
+ Context* on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_SOURCE
+ * |
+ * v
+ * GET_IMAGE_SIZE * * * * * * *
+ * | *
+ * v v
+ * GET_SNAPSHOTS * * * * > CLOSE_IMAGE
+ * | |
+ * v |
+ * <finish> <------------------/
+ *
+ * @endverbatim
+ */
+
+ typedef std::map<uint64_t, SnapInfo> SnapInfos;
+
+ CephContext* m_cct;
+ librados::IoCtx& m_io_ctx;
+ ImageCtxT* m_dst_image_ctx;
+ uint64_t m_src_snap_id;
+ MigrationInfo m_migration_info;
+ ImageCtxT** m_src_image_ctx;
+ Context* m_on_finish;
+
+ std::unique_ptr<FormatInterface> m_format;
+
+ uint64_t m_image_size = 0;
+ SnapInfos m_snap_infos;
+
+ void open_source();
+ void handle_open_source(int r);
+
+ void get_image_size();
+ void handle_get_image_size(int r);
+
+ void get_snapshots();
+ void handle_get_snapshots(int r);
+
+ void close_image(int r);
+
+ void register_image_dispatch();
+
+ void finish(int r);
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::OpenSourceImageRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_OPEN_SOURCE_IMAGE_REQUEST_H
diff --git a/src/librbd/migration/QCOW.h b/src/librbd/migration/QCOW.h
new file mode 100644
index 000000000..23401e515
--- /dev/null
+++ b/src/librbd/migration/QCOW.h
@@ -0,0 +1,466 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/* Based on QEMU block/qcow.cc and block/qcow2.h, which has this license: */
+
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef CEPH_LIBRBD_MIGRATION_QCOW2_H
+#define CEPH_LIBRBD_MIGRATION_QCOW2_H
+
+#include "include/ceph_assert.h"
+#include "include/int_types.h"
+#include "librbd/migration/QCOW.h"
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
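+// the on-disk magic spells "QFI\xfb" when read as big-endian bytes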
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES 1
+#define QCOW_CRYPT_LUKS 2
+
+#define QCOW_MAX_CRYPT_CLUSTERS 32
+#define QCOW_MAX_SNAPSHOTS 65536
+
+/* Field widths in qcow2 mean normal cluster offsets cannot reach
+ * 64PB; depending on cluster size, compressed clusters can have a
+ * smaller limit (64PB for up to 16k clusters, then ramps down to
+ * 512TB for 2M clusters). */
+#define QCOW_MAX_CLUSTER_OFFSET ((1ULL << 56) - 1)
+
+/* 8 MB refcount table is enough for 2 PB images at 64k cluster size
+ * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
+#define QCOW_MAX_REFTABLE_SIZE (1ULL << 23)
+
+/* 32 MB L1 table is enough for 2 PB images at 64k cluster size
+ * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
+#define QCOW_MAX_L1_SIZE (1ULL << 25)
+
+/* Allow for an average of 1k per snapshot table entry, should be plenty of
+ * space for snapshot names and IDs */
+#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS)
+
+/* Maximum amount of extra data per snapshot table entry to accept */
+#define QCOW_MAX_SNAPSHOT_EXTRA_DATA 1024
+
+/* Bitmap header extension constraints */
+#define QCOW2_MAX_BITMAPS 65535
+#define QCOW2_MAX_BITMAP_DIRECTORY_SIZE (1024 * QCOW2_MAX_BITMAPS)
+
+/* Maximum number of parallel sub-requests per guest request */
+#define QCOW2_MAX_WORKERS 8
+
+/* indicate that the refcount of the referenced cluster is exactly one. */
+#define QCOW_OFLAG_COPIED (1ULL << 63)
+/* indicate that the cluster is compressed (they never have the copied flag) */
+#define QCOW_OFLAG_COMPRESSED (1ULL << 62)
+/* The cluster reads as all zeros */
+#define QCOW_OFLAG_ZERO (1ULL << 0)
+
+#define QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER 32
+
+/* The subcluster X [0..31] is allocated */
+#define QCOW_OFLAG_SUB_ALLOC(X) (1ULL << (X))
+/* The subcluster X [0..31] reads as zeroes */
+#define QCOW_OFLAG_SUB_ZERO(X) (QCOW_OFLAG_SUB_ALLOC(X) << 32)
+/* Subclusters [X, Y) (0 <= X <= Y <= 32) are allocated */
+#define QCOW_OFLAG_SUB_ALLOC_RANGE(X, Y) \
+ (QCOW_OFLAG_SUB_ALLOC(Y) - QCOW_OFLAG_SUB_ALLOC(X))
+/* Subclusters [X, Y) (0 <= X <= Y <= 32) read as zeroes */
+#define QCOW_OFLAG_SUB_ZERO_RANGE(X, Y) \
+ (QCOW_OFLAG_SUB_ALLOC_RANGE(X, Y) << 32)
+/* L2 entry bitmap with all allocation bits set */
+#define QCOW_L2_BITMAP_ALL_ALLOC (QCOW_OFLAG_SUB_ALLOC_RANGE(0, 32))
+/* L2 entry bitmap with all "read as zeroes" bits set */
+#define QCOW_L2_BITMAP_ALL_ZEROES (QCOW_OFLAG_SUB_ZERO_RANGE(0, 32))
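+/* Worked example: QCOW_OFLAG_SUB_ALLOC_RANGE(2, 5) ==
+ * (1ULL << 5) - (1ULL << 2) == 0b11100, i.e. subclusters 2-4 allocated */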
+
+/* Size of normal and extended L2 entries */
+#define QCOW_L2E_SIZE_NORMAL (sizeof(uint64_t))
+#define QCOW_L2E_SIZE_EXTENDED (sizeof(uint64_t) * 2)
+
+/* Size of L1 table entries */
+#define QCOW_L1E_SIZE (sizeof(uint64_t))
+
+/* Size of reftable entries */
+#define QCOW_REFTABLE_ENTRY_SIZE (sizeof(uint64_t))
+
+#define QCOW_MIN_CLUSTER_BITS 9
+#define QCOW_MAX_CLUSTER_BITS 21
+
+/* Defined in the qcow2 spec (compressed cluster descriptor) */
+#define QCOW2_COMPRESSED_SECTOR_SIZE 512U
+#define QCOW2_COMPRESSED_SECTOR_MASK (~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL))
+
+#define QCOW_L2_CACHE_SIZE 16
+
+/* Must be at least 2 to cover COW */
+#define QCOW_MIN_L2_CACHE_SIZE 2 /* cache entries */
+
+/* Must be at least 4 to cover all cases of refcount table growth */
+#define QCOW_MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
+
+#define QCOW_DEFAULT_L2_CACHE_MAX_SIZE (1ULL << 25)
+#define QCOW_DEFAULT_CACHE_CLEAN_INTERVAL 600 /* seconds */
+
+#define QCOW_DEFAULT_CLUSTER_SIZE 65536
+
+#define QCOW2_OPT_DATA_FILE "data-file"
+#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts"
+#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
+#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
+#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
+#define QCOW2_OPT_OVERLAP "overlap-check"
+#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
+#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
+#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1"
+#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2"
+#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table"
+#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block"
+#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table"
+#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1"
+#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2"
+#define QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY "overlap-check.bitmap-directory"
+#define QCOW2_OPT_CACHE_SIZE "cache-size"
+#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
+#define QCOW2_OPT_L2_CACHE_ENTRY_SIZE "l2-cache-entry-size"
+#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"
+#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval"
+
+typedef struct QCowHeaderProbe {
+ uint32_t magic;
+ uint32_t version;
+} __attribute__((__packed__)) QCowHeaderProbe;
+
+typedef struct QCowHeaderV1
+{
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t mtime;
+ uint64_t size; /* in bytes */
+ uint8_t cluster_bits;
+ uint8_t l2_bits;
+ uint16_t padding;
+ uint32_t crypt_method;
+ uint64_t l1_table_offset;
+} __attribute__((__packed__)) QCowHeaderV1;
+
+typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t cluster_bits;
+ uint64_t size; /* in bytes */
+ uint32_t crypt_method;
+ uint32_t l1_size; /* XXX: save number of clusters instead ? */
+ uint64_t l1_table_offset;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_clusters;
+ uint32_t nb_snapshots;
+ uint64_t snapshots_offset;
+
+ /* The following fields are only valid for version >= 3 */
+ uint64_t incompatible_features;
+ uint64_t compatible_features;
+ uint64_t autoclear_features;
+
+ uint32_t refcount_order;
+ uint32_t header_length;
+
+ /* Additional fields */
+ uint8_t compression_type;
+
+  /* header length must be a multiple of 8 */
+ uint8_t padding[7];
+} __attribute__((__packed__)) QCowHeader;
+
+typedef struct QCowSnapshotHeader {
+ /* header is 8 byte aligned */
+ uint64_t l1_table_offset;
+
+ uint32_t l1_size;
+ uint16_t id_str_size;
+ uint16_t name_size;
+
+ uint32_t date_sec;
+ uint32_t date_nsec;
+
+ uint64_t vm_clock_nsec;
+
+ uint32_t vm_state_size;
+ uint32_t extra_data_size; /* for extension */
+ /* extra data follows */
+ /* id_str follows */
+ /* name follows */
+} __attribute__((__packed__)) QCowSnapshotHeader;
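+
+/* Illustrative layout note (not upstream documentation): on disk, each
+ * snapshot table entry is the fixed header above followed by
+ * extra_data_size bytes of extra data, id_str_size bytes of ID and
+ * name_size bytes of name, with every entry aligned to the next 8-byte
+ * boundary (see QCOWFormat<I>::read_snapshot()). */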
+
+typedef struct QCowSnapshotExtraData {
+ uint64_t vm_state_size_large;
+ uint64_t disk_size;
+ uint64_t icount;
+} __attribute__((__packed__)) QCowSnapshotExtraData;
+
+
+typedef struct QCowSnapshot {
+ uint64_t l1_table_offset;
+ uint32_t l1_size;
+ char *id_str;
+ char *name;
+ uint64_t disk_size;
+ uint64_t vm_state_size;
+ uint32_t date_sec;
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec;
+ /* icount value for the moment when snapshot was taken */
+ uint64_t icount;
+ /* Size of all extra data, including QCowSnapshotExtraData if available */
+ uint32_t extra_data_size;
+ /* Data beyond QCowSnapshotExtraData, if any */
+ void *unknown_extra_data;
+} QCowSnapshot;
+
+typedef struct Qcow2CryptoHeaderExtension {
+ uint64_t offset;
+ uint64_t length;
+} __attribute__((__packed__)) Qcow2CryptoHeaderExtension;
+
+typedef struct Qcow2UnknownHeaderExtension {
+ uint32_t magic;
+ uint32_t len;
+ uint8_t data[];
+} Qcow2UnknownHeaderExtension;
+
+enum {
+ QCOW2_FEAT_TYPE_INCOMPATIBLE = 0,
+ QCOW2_FEAT_TYPE_COMPATIBLE = 1,
+ QCOW2_FEAT_TYPE_AUTOCLEAR = 2,
+};
+
+/* Incompatible feature bits */
+enum {
+ QCOW2_INCOMPAT_DIRTY_BITNR = 0,
+ QCOW2_INCOMPAT_CORRUPT_BITNR = 1,
+ QCOW2_INCOMPAT_DATA_FILE_BITNR = 2,
+ QCOW2_INCOMPAT_COMPRESSION_BITNR = 3,
+ QCOW2_INCOMPAT_EXTL2_BITNR = 4,
+ QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+ QCOW2_INCOMPAT_CORRUPT = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR,
+ QCOW2_INCOMPAT_DATA_FILE = 1 << QCOW2_INCOMPAT_DATA_FILE_BITNR,
+ QCOW2_INCOMPAT_COMPRESSION = 1 << QCOW2_INCOMPAT_COMPRESSION_BITNR,
+ QCOW2_INCOMPAT_EXTL2 = 1 << QCOW2_INCOMPAT_EXTL2_BITNR,
+
+ QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY
+ | QCOW2_INCOMPAT_CORRUPT
+ | QCOW2_INCOMPAT_DATA_FILE
+ | QCOW2_INCOMPAT_COMPRESSION
+ | QCOW2_INCOMPAT_EXTL2,
+};
+
+/* Compatible feature bits */
+enum {
+ QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0,
+ QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+
+ QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS,
+};
+
+/* Autoclear feature bits */
+enum {
+ QCOW2_AUTOCLEAR_BITMAPS_BITNR = 0,
+ QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR = 1,
+ QCOW2_AUTOCLEAR_BITMAPS = 1 << QCOW2_AUTOCLEAR_BITMAPS_BITNR,
+ QCOW2_AUTOCLEAR_DATA_FILE_RAW = 1 << QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR,
+
+ QCOW2_AUTOCLEAR_MASK = QCOW2_AUTOCLEAR_BITMAPS
+ | QCOW2_AUTOCLEAR_DATA_FILE_RAW,
+};
+
+enum qcow2_discard_type {
+ QCOW2_DISCARD_NEVER = 0,
+ QCOW2_DISCARD_ALWAYS,
+ QCOW2_DISCARD_REQUEST,
+ QCOW2_DISCARD_SNAPSHOT,
+ QCOW2_DISCARD_OTHER,
+ QCOW2_DISCARD_MAX
+};
+
+typedef struct Qcow2Feature {
+ uint8_t type;
+ uint8_t bit;
+ char name[46];
+} __attribute__((__packed__)) Qcow2Feature;
+
+typedef struct Qcow2DiscardRegion {
+ uint64_t offset;
+ uint64_t bytes;
+} Qcow2DiscardRegion;
+
+typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array,
+ uint64_t index);
+typedef void Qcow2SetRefcountFunc(void *refcount_array,
+ uint64_t index, uint64_t value);
+
+typedef struct Qcow2BitmapHeaderExt {
+ uint32_t nb_bitmaps;
+ uint32_t reserved32;
+ uint64_t bitmap_directory_size;
+ uint64_t bitmap_directory_offset;
+} __attribute__((__packed__)) Qcow2BitmapHeaderExt;
+
+#define QCOW_RC_CACHE_SIZE QCOW_L2_CACHE_SIZE
+
+typedef struct Qcow2COWRegion {
+ /**
+ * Offset of the COW region in bytes from the start of the first cluster
+ * touched by the request.
+ */
+ unsigned offset;
+
+ /** Number of bytes to copy */
+ unsigned nb_bytes;
+} Qcow2COWRegion;
+
+/**
+ * Describes an in-flight (part of a) write request that writes to clusters
+ * that are not referenced in their L2 table yet.
+ */
+typedef struct QCowL2Meta
+{
+ /** Guest offset of the first newly allocated cluster */
+ uint64_t offset;
+
+ /** Host offset of the first newly allocated cluster */
+ uint64_t alloc_offset;
+
+ /** Number of newly allocated clusters */
+ int nb_clusters;
+
+ /** Do not free the old clusters */
+ bool keep_old_clusters;
+
+ /**
+ * The COW Region between the start of the first allocated cluster and the
+ * area the guest actually writes to.
+ */
+ Qcow2COWRegion cow_start;
+
+ /**
+ * The COW Region between the area the guest actually writes to and the
+ * end of the last allocated cluster.
+ */
+ Qcow2COWRegion cow_end;
+
+ /*
+ * Indicates that COW regions are already handled and do not require
+ * any more processing.
+ */
+ bool skip_cow;
+
+ /**
+ * Indicates that this is not a normal write request but a preallocation.
+ * If the image has extended L2 entries this means that no new individual
+ * subclusters will be marked as allocated in the L2 bitmap (but any
+ * existing contents of that bitmap will be kept).
+ */
+ bool prealloc;
+
+ /** Pointer to next L2Meta of the same write request */
+ struct QCowL2Meta *next;
+} QCowL2Meta;
+
+typedef enum QCow2ClusterType {
+ QCOW2_CLUSTER_UNALLOCATED,
+ QCOW2_CLUSTER_ZERO_PLAIN,
+ QCOW2_CLUSTER_ZERO_ALLOC,
+ QCOW2_CLUSTER_NORMAL,
+ QCOW2_CLUSTER_COMPRESSED,
+} QCow2ClusterType;
+
+typedef enum QCow2MetadataOverlap {
+ QCOW2_OL_MAIN_HEADER_BITNR = 0,
+ QCOW2_OL_ACTIVE_L1_BITNR = 1,
+ QCOW2_OL_ACTIVE_L2_BITNR = 2,
+ QCOW2_OL_REFCOUNT_TABLE_BITNR = 3,
+ QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4,
+ QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5,
+ QCOW2_OL_INACTIVE_L1_BITNR = 6,
+ QCOW2_OL_INACTIVE_L2_BITNR = 7,
+ QCOW2_OL_BITMAP_DIRECTORY_BITNR = 8,
+
+ QCOW2_OL_MAX_BITNR = 9,
+
+ QCOW2_OL_NONE = 0,
+ QCOW2_OL_MAIN_HEADER = (1 << QCOW2_OL_MAIN_HEADER_BITNR),
+ QCOW2_OL_ACTIVE_L1 = (1 << QCOW2_OL_ACTIVE_L1_BITNR),
+ QCOW2_OL_ACTIVE_L2 = (1 << QCOW2_OL_ACTIVE_L2_BITNR),
+ QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR),
+ QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR),
+ QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR),
+ QCOW2_OL_INACTIVE_L1 = (1 << QCOW2_OL_INACTIVE_L1_BITNR),
+ /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv
+ * reads. */
+ QCOW2_OL_INACTIVE_L2 = (1 << QCOW2_OL_INACTIVE_L2_BITNR),
+ QCOW2_OL_BITMAP_DIRECTORY = (1 << QCOW2_OL_BITMAP_DIRECTORY_BITNR),
+} QCow2MetadataOverlap;
+
+/* Perform all overlap checks which can be done in constant time */
+#define QCOW2_OL_CONSTANT \
+ (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \
+ QCOW2_OL_SNAPSHOT_TABLE | QCOW2_OL_BITMAP_DIRECTORY)
+
+/* Perform all overlap checks which don't require disk access */
+#define QCOW2_OL_CACHED \
+ (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \
+ QCOW2_OL_INACTIVE_L1)
+
+/* Perform all overlap checks */
+#define QCOW2_OL_ALL \
+ (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2)
+
+#define QCOW_L1E_OFFSET_MASK 0x00fffffffffffe00ULL
+#define QCOW_L2E_OFFSET_MASK 0x00fffffffffffe00ULL
+#define QCOW_L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
+
+#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL
+
+#define INV_OFFSET (-1ULL)
+
+static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
+{
+ return m->offset + m->cow_start.offset;
+}
+
+static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
+{
+ return m->offset + m->cow_end.offset + m->cow_end.nb_bytes;
+}
+
+static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
+{
+ return r1 > r2 ? r1 - r2 : r2 - r1;
+}
+
+#endif // CEPH_LIBRBD_MIGRATION_QCOW2_H
diff --git a/src/librbd/migration/QCOWFormat.cc b/src/librbd/migration/QCOWFormat.cc
new file mode 100644
index 000000000..7bd4a5ef7
--- /dev/null
+++ b/src/librbd/migration/QCOWFormat.cc
@@ -0,0 +1,1542 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/QCOWFormat.h"
+#include "common/Clock.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/intarith.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/migration/SnapshotInterface.h"
+#include "librbd/migration/SourceSpecBuilder.h"
+#include "librbd/migration/StreamInterface.h"
+#include "librbd/migration/Utils.h"
+#include <boost/asio/dispatch.hpp>
+#include <boost/asio/post.hpp>
+#include <deque>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+namespace migration {
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat: " \
+ << __func__ << ": "
+
+namespace qcow_format {
+
+struct ClusterExtent {
+ uint64_t cluster_offset;
+ uint64_t cluster_length;
+ uint64_t intra_cluster_offset;
+ uint64_t image_offset;
+ uint64_t buffer_offset;
+
+ ClusterExtent(uint64_t cluster_offset, uint64_t cluster_length,
+ uint64_t intra_cluster_offset, uint64_t image_offset,
+ uint64_t buffer_offset)
+ : cluster_offset(cluster_offset), cluster_length(cluster_length),
+ intra_cluster_offset(intra_cluster_offset), image_offset(image_offset),
+ buffer_offset(buffer_offset) {
+ }
+};
+
+typedef std::vector<ClusterExtent> ClusterExtents;
+
+void LookupTable::init() {
+ if (cluster_offsets == nullptr) {
+ cluster_offsets = reinterpret_cast<uint64_t*>(bl.c_str());
+ }
+}
+
+void LookupTable::decode() {
+ init();
+
+  // L2 tables are byte-swapped lazily: when only a single cluster offset is
+  // requested, just that one entry is swapped on demand
+ if (decoded) {
+ return;
+ }
+
+  // translate the lookup table (big-endian -> CPU endianness)
+ for (auto idx = 0UL; idx < size; ++idx) {
+ cluster_offsets[idx] = be64toh(cluster_offsets[idx]);
+ }
+
+ decoded = true;
+}
+
+void populate_cluster_extents(CephContext* cct, uint64_t cluster_size,
+ const io::Extents& image_extents,
+ ClusterExtents* cluster_extents) {
+ uint64_t buffer_offset = 0;
+ for (auto [image_offset, image_length] : image_extents) {
+ while (image_length > 0) {
+ auto intra_cluster_offset = image_offset & (cluster_size - 1);
+ auto intra_cluster_length = cluster_size - intra_cluster_offset;
+ auto cluster_length = std::min(image_length, intra_cluster_length);
+
+ ldout(cct, 20) << "image_offset=" << image_offset << ", "
+ << "image_length=" << image_length << ", "
+ << "cluster_length=" << cluster_length << dendl;
+
+ cluster_extents->emplace_back(0, cluster_length, intra_cluster_offset,
+ image_offset, buffer_offset);
+
+ image_offset += cluster_length;
+ image_length -= cluster_length;
+ buffer_offset += cluster_length;
+ }
+ }
+}
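+
+/* Worked example (illustrative only): with 64 KiB clusters, a single image
+ * extent {image_offset=0x1f000, length=0x3000} is split by the loop above
+ * into two ClusterExtents:
+ *   {cluster_offset=0, cluster_length=0x1000, intra_cluster_offset=0xf000,
+ *    image_offset=0x1f000, buffer_offset=0}
+ *   {cluster_offset=0, cluster_length=0x2000, intra_cluster_offset=0,
+ *    image_offset=0x20000, buffer_offset=0x1000}
+ * cluster_offset is left zeroed here and resolved later via the L2 table
+ * cache. */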
+
+} // namespace qcow_format
+
+using namespace qcow_format;
+
+template <typename I>
+struct QCOWFormat<I>::Cluster {
+ const uint64_t cluster_offset;
+ bufferlist cluster_data_bl;
+
+ Cluster(uint64_t cluster_offset) : cluster_offset(cluster_offset) {
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat::ClusterCache: " \
+ << this << " " << __func__ << ": "
+
+template <typename I>
+class QCOWFormat<I>::ClusterCache {
+public:
+ ClusterCache(QCOWFormat* qcow_format)
+ : qcow_format(qcow_format),
+ m_strand(*qcow_format->m_image_ctx->asio_engine) {
+ }
+
+ void get_cluster(uint64_t cluster_offset, uint64_t cluster_length,
+ uint64_t intra_cluster_offset, bufferlist* bl,
+ Context* on_finish) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl;
+
+ // cache state machine runs in a single strand thread
+ boost::asio::dispatch(
+ m_strand,
+ [this, cluster_offset, cluster_length, intra_cluster_offset, bl,
+ on_finish]() {
+ execute_get_cluster(cluster_offset, cluster_length,
+ intra_cluster_offset, bl, on_finish);
+ });
+ }
+
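+  /* Note (illustrative): handlers dispatched through the same
+   * boost::asio::io_context::strand are never run concurrently, so the
+   * members below are accessed without any explicit locking. */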
+private:
+ typedef std::tuple<uint64_t, uint64_t, bufferlist*, Context*> Completion;
+ typedef std::list<Completion> Completions;
+
+ QCOWFormat* qcow_format;
+ boost::asio::io_context::strand m_strand;
+
+ std::shared_ptr<Cluster> cluster;
+ std::unordered_map<uint64_t, Completions> cluster_completions;
+
+ void execute_get_cluster(uint64_t cluster_offset, uint64_t cluster_length,
+ uint64_t intra_cluster_offset, bufferlist* bl,
+ Context* on_finish) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl;
+
+ if (cluster && cluster->cluster_offset == cluster_offset) {
+ // most-recent cluster matches
+ bl->substr_of(cluster->cluster_data_bl, intra_cluster_offset,
+ cluster_length);
+ boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
+ [on_finish]() { on_finish->complete(0); });
+ return;
+ }
+
+ // record callback for cluster
+ bool new_request = (cluster_completions.count(cluster_offset) == 0);
+ cluster_completions[cluster_offset].emplace_back(
+ intra_cluster_offset, cluster_length, bl, on_finish);
+ if (new_request) {
+ // start the new read request
+ read_cluster(std::make_shared<Cluster>(cluster_offset));
+ }
+ }
+
+ void read_cluster(std::shared_ptr<Cluster> cluster) {
+ auto cct = qcow_format->m_image_ctx->cct;
+
+ uint64_t stream_offset = cluster->cluster_offset;
+ uint64_t stream_length = qcow_format->m_cluster_size;
+ if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) {
+      // compressed clusters encode the compressed length in the high bits
+      // of the descriptor, above the host offset
+ stream_offset = cluster->cluster_offset &
+ qcow_format->m_cluster_offset_mask;
+ stream_length = (cluster->cluster_offset >>
+ (63 - qcow_format->m_cluster_bits)) &
+ (qcow_format->m_cluster_size - 1);
+ }
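+    /* Worked example (illustrative only): for a 64 KiB-cluster image
+     * (cluster_bits == 16), m_cluster_offset_mask is (1ULL << 47) - 1, so
+     * the host offset lives in the low 47 bits of the descriptor and
+     * (cluster_offset >> 47) & 0xffff yields the compressed length computed
+     * above. */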
+
+ ldout(cct, 20) << "cluster_offset=" << cluster->cluster_offset << ", "
+ << "stream_offset=" << stream_offset << ", "
+ << "stream_length=" << stream_length << dendl;
+
+ // read the cluster into the cache entry
+ auto ctx = new LambdaContext([this, cluster](int r) {
+ boost::asio::post(m_strand, [this, cluster, r]() {
+ handle_read_cluster(r, cluster); }); });
+ qcow_format->m_stream->read({{stream_offset, stream_length}},
+ &cluster->cluster_data_bl, ctx);
+ }
+
+ void handle_read_cluster(int r, std::shared_ptr<Cluster> cluster) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", "
+ << "cluster_offset=" << cluster->cluster_offset << dendl;
+
+ auto completions = std::move(cluster_completions[cluster->cluster_offset]);
+ cluster_completions.erase(cluster->cluster_offset);
+
+ if (r < 0) {
+ lderr(cct) << "failed to read cluster offset " << cluster->cluster_offset
+ << ": " << cpp_strerror(r) << dendl;
+ } else {
+ if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) {
+ bufferlist compressed_bl{std::move(cluster->cluster_data_bl)};
+ cluster->cluster_data_bl.clear();
+
+ // TODO
+ lderr(cct) << "support for compressed clusters is not available"
+ << dendl;
+ r = -EINVAL;
+ } else {
+ // cache the MRU cluster in case of sequential IO
+ this->cluster = cluster;
+ }
+ }
+
+ // complete the IO back to caller
+ boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
+ [r, cluster, completions=std::move(completions)]() {
+ for (auto completion : completions) {
+ if (r >= 0) {
+ std::get<2>(completion)->substr_of(
+ cluster->cluster_data_bl,
+ std::get<0>(completion),
+ std::get<1>(completion));
+ }
+ std::get<3>(completion)->complete(r);
+ }
+ });
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat::L2TableCache: " \
+ << this << " " << __func__ << ": "
+
+template <typename I>
+class QCOWFormat<I>::L2TableCache {
+public:
+ L2TableCache(QCOWFormat* qcow_format)
+ : qcow_format(qcow_format),
+ m_strand(*qcow_format->m_image_ctx->asio_engine),
+ l2_cache_entries(QCOW_L2_CACHE_SIZE) {
+ }
+
+ void get_l2_table(const LookupTable* l1_table, uint64_t l2_table_offset,
+ std::shared_ptr<const LookupTable>* l2_table,
+ Context* on_finish) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "l2_table_offset=" << l2_table_offset << dendl;
+
+ // cache state machine runs in a single strand thread
+ Request request{l1_table, l2_table_offset, l2_table, on_finish};
+ boost::asio::dispatch(
+ m_strand, [this, request=std::move(request)]() {
+ requests.push_back(std::move(request));
+ });
+ dispatch_request();
+ }
+
+ void get_cluster_offset(const LookupTable* l1_table,
+ uint64_t image_offset, uint64_t* cluster_offset,
+ Context* on_finish) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ uint32_t l1_table_index = image_offset >> qcow_format->m_l1_shift;
+ uint64_t l2_table_offset = l1_table->cluster_offsets[std::min<uint32_t>(
+ l1_table_index, l1_table->size - 1)] &
+ qcow_format->m_cluster_mask;
+ uint32_t l2_table_index = (image_offset >> qcow_format->m_cluster_bits) &
+ (qcow_format->m_l2_size - 1);
+ ldout(cct, 20) << "image_offset=" << image_offset << ", "
+ << "l1_table_index=" << l1_table_index << ", "
+ << "l2_table_offset=" << l2_table_offset << ", "
+ << "l2_table_index=" << l2_table_index << dendl;
+
+ if (l1_table_index >= l1_table->size) {
+ lderr(cct) << "L1 index " << l1_table_index << " out-of-bounds" << dendl;
+ on_finish->complete(-ERANGE);
+ return;
+ } else if (l2_table_offset == 0) {
+ // L2 table has not been allocated for specified offset
+ ldout(cct, 20) << "image_offset=" << image_offset << ", "
+ << "cluster_offset=DNE" << dendl;
+ *cluster_offset = 0;
+ on_finish->complete(-ENOENT);
+ return;
+ }
+
+ // cache state machine runs in a single strand thread
+ Request request{l1_table, l2_table_offset, l2_table_index, cluster_offset,
+ on_finish};
+ boost::asio::dispatch(
+ m_strand, [this, request=std::move(request)]() {
+ requests.push_back(std::move(request));
+ });
+ dispatch_request();
+ }
+
+private:
+ QCOWFormat* qcow_format;
+
+ boost::asio::io_context::strand m_strand;
+
+ struct Request {
+ const LookupTable* l1_table;
+
+ uint64_t l2_table_offset;
+
+ // get_cluster_offset request
+ uint32_t l2_table_index;
+ uint64_t* cluster_offset = nullptr;
+
+ // get_l2_table request
+ std::shared_ptr<const LookupTable>* l2_table;
+
+ Context* on_finish;
+
+ Request(const LookupTable* l1_table, uint64_t l2_table_offset,
+ uint32_t l2_table_index, uint64_t* cluster_offset,
+ Context* on_finish)
+ : l1_table(l1_table), l2_table_offset(l2_table_offset),
+ l2_table_index(l2_table_index), cluster_offset(cluster_offset),
+ on_finish(on_finish) {
+ }
+ Request(const LookupTable* l1_table, uint64_t l2_table_offset,
+ std::shared_ptr<const LookupTable>* l2_table, Context* on_finish)
+ : l1_table(l1_table), l2_table_offset(l2_table_offset),
+ l2_table(l2_table), on_finish(on_finish) {
+ }
+ };
+
+ typedef std::deque<Request> Requests;
+
+ struct L2Cache {
+ uint64_t l2_offset = 0;
+ std::shared_ptr<LookupTable> l2_table;
+
+ utime_t timestamp;
+ uint32_t count = 0;
+ bool in_flight = false;
+
+ int ret_val = 0;
+ };
+ std::vector<L2Cache> l2_cache_entries;
+
+ Requests requests;
+
+ void dispatch_request() {
+ boost::asio::dispatch(m_strand, [this]() { execute_request(); });
+ }
+
+ void execute_request() {
+ auto cct = qcow_format->m_image_ctx->cct;
+ if (requests.empty()) {
+ return;
+ }
+
+ auto request = requests.front();
+ ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << dendl;
+
+ std::shared_ptr<LookupTable> l2_table;
+ int r = l2_table_lookup(request.l2_table_offset, &l2_table);
+ if (r < 0) {
+ lderr(cct) << "failed to load L2 table: l2_table_offset="
+ << request.l2_table_offset << ": "
+ << cpp_strerror(r) << dendl;
+ } else if (l2_table == nullptr) {
+      // table not in cache -- the request will restart once it has loaded
+ return;
+ } else if (request.cluster_offset != nullptr) {
+ auto cluster_offset = l2_table->cluster_offsets[request.l2_table_index];
+ if (!l2_table->decoded) {
+ // table hasn't been byte-swapped
+ cluster_offset = be64toh(cluster_offset);
+ }
+
+ *request.cluster_offset = cluster_offset & qcow_format->m_cluster_mask;
+ if (*request.cluster_offset == QCOW_OFLAG_ZERO) {
+ ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << ", "
+ << "l2_table_index=" << request.l2_table_index << ", "
+ << "cluster_offset=zeroed" << dendl;
+ } else {
+ ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << ", "
+ << "l2_table_index=" << request.l2_table_index << ", "
+ << "cluster_offset=" << *request.cluster_offset
+ << dendl;
+ }
+ } else if (request.l2_table != nullptr) {
+ // ensure it's in the correct byte-order
+ l2_table->decode();
+ *request.l2_table = l2_table;
+ } else {
+ ceph_assert(false);
+ }
+
+ // complete the L2 cache request
+ boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
+ [r, ctx=request.on_finish]() { ctx->complete(r); });
+ requests.pop_front();
+
+ // process next request (if any)
+ dispatch_request();
+ }
+
+ int l2_table_lookup(uint64_t l2_offset,
+ std::shared_ptr<LookupTable>* l2_table) {
+ auto cct = qcow_format->m_image_ctx->cct;
+
+ l2_table->reset();
+
+ // find a match in the existing cache
+ for (auto idx = 0U; idx < l2_cache_entries.size(); ++idx) {
+ auto& l2_cache = l2_cache_entries[idx];
+ if (l2_cache.l2_offset == l2_offset) {
+ if (l2_cache.in_flight) {
+ ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
+ << "index=" << idx << " (in-flight)" << dendl;
+ return 0;
+ }
+
+ if (l2_cache.ret_val < 0) {
+ ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
+ << "index=" << idx << " (error): "
+ << cpp_strerror(l2_cache.ret_val) << dendl;
+ int r = l2_cache.ret_val;
+ l2_cache = L2Cache{};
+
+ return r;
+ }
+
+ ++l2_cache.count;
+ if (l2_cache.count == std::numeric_limits<uint32_t>::max()) {
+ for (auto& entry : l2_cache_entries) {
+ entry.count >>= 1;
+ }
+ }
+
+ ldout(cct, 20) << "l2_offset=" << l2_offset << ", " << "index=" << idx
+ << dendl;
+ *l2_table = l2_cache.l2_table;
+ return 0;
+ }
+ }
+
+ // find the least used entry
+ int32_t min_idx = -1;
+ uint32_t min_count = std::numeric_limits<uint32_t>::max();
+ utime_t min_timestamp;
+ for (uint32_t idx = 0U; idx < l2_cache_entries.size(); ++idx) {
+ auto& l2_cache = l2_cache_entries[idx];
+ if (l2_cache.in_flight) {
+ continue;
+ }
+
+ if (l2_cache.count > 0) {
+ --l2_cache.count;
+ }
+
+ if (l2_cache.count <= min_count) {
+ if (min_idx == -1 || l2_cache.timestamp < min_timestamp) {
+ min_timestamp = l2_cache.timestamp;
+ min_count = l2_cache.count;
+ min_idx = idx;
+ }
+ }
+ }
+
+ if (min_idx == -1) {
+ // no space in the cache due to in-flight requests
+ ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
+ << "index=DNE (cache busy)" << dendl;
+ return 0;
+ }
+
+ ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
+ << "index=" << min_idx << " (loading)" << dendl;
+ auto& l2_cache = l2_cache_entries[min_idx];
+ l2_cache.l2_table = std::make_shared<LookupTable>(qcow_format->m_l2_size);
+ l2_cache.l2_offset = l2_offset;
+ l2_cache.timestamp = ceph_clock_now();
+ l2_cache.count = 1;
+ l2_cache.in_flight = true;
+
+ // read the L2 table into the L2 cache entry
+ auto ctx = new LambdaContext([this, index=min_idx, l2_offset](int r) {
+ boost::asio::post(m_strand, [this, index, l2_offset, r]() {
+ handle_l2_table_lookup(r, index, l2_offset); }); });
+ qcow_format->m_stream->read(
+ {{l2_offset, qcow_format->m_l2_size * sizeof(uint64_t)}},
+ &l2_cache.l2_table->bl, ctx);
+ return 0;
+ }
+
+ void handle_l2_table_lookup(int r, uint32_t index, uint64_t l2_offset) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", "
+ << "l2_offset=" << l2_offset << ", "
+ << "index=" << index << dendl;
+
+ auto& l2_cache = l2_cache_entries[index];
+ ceph_assert(l2_cache.in_flight);
+ l2_cache.in_flight = false;
+
+ if (r < 0) {
+ lderr(cct) << "failed to load L2 table: "
+ << "l2_offset=" << l2_cache.l2_offset << ": "
+ << cpp_strerror(r) << dendl;
+ l2_cache.ret_val = r;
+ } else {
+ // keep the L2 table in big-endian byte-order until the full table
+ // is requested
+ l2_cache.l2_table->init();
+ }
+
+ // restart the state machine
+ dispatch_request();
+ }
+
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat::ReadRequest: " \
+ << this << " " << __func__ << ": "
+
+template <typename I>
+class QCOWFormat<I>::ReadRequest {
+public:
+ ReadRequest(QCOWFormat* qcow_format, io::AioCompletion* aio_comp,
+ const LookupTable* l1_table, io::Extents&& image_extents)
+ : qcow_format(qcow_format), aio_comp(aio_comp), l1_table(l1_table),
+ image_extents(std::move(image_extents)) {
+ }
+
+ void send() {
+ get_cluster_offsets();
+ }
+
+private:
+ QCOWFormat* qcow_format;
+ io::AioCompletion* aio_comp;
+
+ const LookupTable* l1_table;
+ io::Extents image_extents;
+
+ size_t image_extents_idx = 0;
+ uint32_t image_extent_offset = 0;
+
+ ClusterExtents cluster_extents;
+
+ void get_cluster_offsets() {
+ auto cct = qcow_format->m_image_ctx->cct;
+ populate_cluster_extents(cct, qcow_format->m_cluster_size, image_extents,
+ &cluster_extents);
+
+ ldout(cct, 20) << dendl;
+ auto ctx = new LambdaContext([this](int r) {
+ handle_get_cluster_offsets(r); });
+ auto gather_ctx = new C_Gather(cct, ctx);
+
+ for (auto& cluster_extent : cluster_extents) {
+ auto sub_ctx = new LambdaContext(
+ [this, &cluster_extent, on_finish=gather_ctx->new_sub()](int r) {
+ handle_get_cluster_offset(r, cluster_extent, on_finish); });
+ qcow_format->m_l2_table_cache->get_cluster_offset(
+ l1_table, cluster_extent.image_offset,
+ &cluster_extent.cluster_offset, sub_ctx);
+ }
+
+ gather_ctx->activate();
+ }
+
+ void handle_get_cluster_offset(int r, const ClusterExtent& cluster_extent,
+ Context* on_finish) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", "
+ << "image_offset=" << cluster_extent.image_offset << ", "
+ << "cluster_offset=" << cluster_extent.cluster_offset
+ << dendl;
+
+ if (r == -ENOENT) {
+ ldout(cct, 20) << "image offset DNE in QCOW image" << dendl;
+ r = 0;
+ } else if (r < 0) {
+ lderr(cct) << "failed to map image offset " << cluster_extent.image_offset
+ << ": " << cpp_strerror(r) << dendl;
+ }
+
+ on_finish->complete(r);
+ }
+
+ void handle_get_cluster_offsets(int r) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve cluster extents: " << cpp_strerror(r)
+ << dendl;
+ aio_comp->fail(r);
+ delete this;
+ return;
+ }
+
+ read_clusters();
+ }
+
+ void read_clusters() {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ aio_comp->set_request_count(cluster_extents.size());
+ for (auto& cluster_extent : cluster_extents) {
+ auto read_ctx = new io::ReadResult::C_ImageReadRequest(
+ aio_comp, cluster_extent.buffer_offset,
+ {{cluster_extent.image_offset, cluster_extent.cluster_length}});
+ read_ctx->ignore_enoent = true;
+
+ auto log_ctx = new LambdaContext(
+ [this, cct=qcow_format->m_image_ctx->cct,
+ image_offset=cluster_extent.image_offset,
+ image_length=cluster_extent.cluster_length, ctx=read_ctx](int r) {
+ handle_read_cluster(cct, r, image_offset, image_length, ctx);
+ });
+
+ if (cluster_extent.cluster_offset == 0) {
+        // offset 0 is the QCOW header itself, so the cluster DNE
+ log_ctx->complete(-ENOENT);
+ } else if (cluster_extent.cluster_offset == QCOW_OFLAG_ZERO) {
+ // explicitly zeroed section
+ read_ctx->bl.append_zero(cluster_extent.cluster_length);
+ log_ctx->complete(0);
+ } else {
+ // request the (sub)cluster from the cluster cache
+ qcow_format->m_cluster_cache->get_cluster(
+ cluster_extent.cluster_offset, cluster_extent.cluster_length,
+ cluster_extent.intra_cluster_offset, &read_ctx->bl, log_ctx);
+ }
+ }
+
+ delete this;
+ }
+
+ void handle_read_cluster(CephContext* cct, int r, uint64_t image_offset,
+ uint64_t image_length, Context* on_finish) const {
+    // NOTE: treat as a static function; the object may already have been
+    // deleted
+
+ ldout(cct, 20) << "r=" << r << ", "
+ << "image_offset=" << image_offset << ", "
+ << "image_length=" << image_length << dendl;
+
+ if (r != -ENOENT && r < 0) {
+ lderr(cct) << "failed to read image extent " << image_offset << "~"
+ << image_length << ": " << cpp_strerror(r) << dendl;
+ }
+
+ on_finish->complete(r);
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat::" \
+ << "ListSnapsRequest: " << this << " " \
+ << __func__ << ": "
+
+template <typename I>
+class QCOWFormat<I>::ListSnapsRequest {
+public:
+ ListSnapsRequest(
+ QCOWFormat* qcow_format, uint32_t l1_table_index,
+ ClusterExtents&& cluster_extents,
+ const std::map<uint64_t, const LookupTable*>& snap_id_to_l1_table,
+ io::SnapshotDelta* snapshot_delta, Context* on_finish)
+ : qcow_format(qcow_format), l1_table_index(l1_table_index),
+ cluster_extents(std::move(cluster_extents)),
+ snap_id_to_l1_table(snap_id_to_l1_table), snapshot_delta(snapshot_delta),
+ on_finish(on_finish) {
+ }
+
+ void send() {
+ get_l2_table();
+ }
+
+private:
+ QCOWFormat* qcow_format;
+ uint32_t l1_table_index;
+ ClusterExtents cluster_extents;
+ std::map<uint64_t, const LookupTable*> snap_id_to_l1_table;
+ io::SnapshotDelta* snapshot_delta;
+ Context* on_finish;
+
+ std::shared_ptr<const LookupTable> previous_l2_table;
+ std::shared_ptr<const LookupTable> l2_table;
+
+ void get_l2_table() {
+ auto cct = qcow_format->m_image_ctx->cct;
+ if (snap_id_to_l1_table.empty()) {
+ finish(0);
+ return;
+ }
+
+ auto it = snap_id_to_l1_table.begin();
+ auto [snap_id, l1_table] = *it;
+ snap_id_to_l1_table.erase(it);
+
+ previous_l2_table = l2_table;
+ l2_table.reset();
+
+ auto ctx = new LambdaContext([this, snap_id = snap_id](int r) {
+ boost::asio::post(qcow_format->m_strand, [this, snap_id, r]() {
+ handle_get_l2_table(r, snap_id);
+ });
+ });
+
+ if (l1_table_index >= l1_table->size ||
+ l1_table->cluster_offsets[l1_table_index] == 0) {
+ ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", "
+ << "snap_id=" << snap_id << ": DNE" << dendl;
+ ctx->complete(-ENOENT);
+ return;
+ }
+
+ uint64_t l2_table_offset = l1_table->cluster_offsets[l1_table_index] &
+ qcow_format->m_cluster_mask;
+
+ ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", "
+ << "snap_id=" << snap_id << ", "
+ << "l2_table_offset=" << l2_table_offset << dendl;
+ qcow_format->m_l2_table_cache->get_l2_table(l1_table, l2_table_offset,
+ &l2_table, ctx);
+ }
+
+ void handle_get_l2_table(int r, uint64_t snap_id) {
+ ceph_assert(qcow_format->m_strand.running_in_this_thread());
+
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", "
+ << "snap_id=" << snap_id << dendl;
+
+ if (r == -ENOENT) {
+ l2_table.reset();
+ } else if (r < 0) {
+ lderr(cct) << "failed to retrieve L2 table for snapshot " << snap_id
+ << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ // compare the cluster offsets at each requested L2 offset between
+ // the previous snapshot's L2 table and the current L2 table.
+ auto& sparse_extents = (*snapshot_delta)[{snap_id, snap_id}];
+ for (auto& cluster_extent : cluster_extents) {
+ uint32_t l2_table_index =
+ (cluster_extent.image_offset >> qcow_format->m_cluster_bits) &
+ (qcow_format->m_l2_size - 1);
+
+ std::optional<uint64_t> cluster_offset;
+ if (l2_table && l2_table_index < l2_table->size) {
+ cluster_offset = l2_table->cluster_offsets[l2_table_index] &
+ qcow_format->m_cluster_offset_mask;
+ }
+
+ std::optional<uint64_t> prev_cluster_offset;
+ if (previous_l2_table && l2_table_index < previous_l2_table->size) {
+ prev_cluster_offset =
+ previous_l2_table->cluster_offsets[l2_table_index] &
+ qcow_format->m_cluster_offset_mask;
+ }
+
+ ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", "
+ << "snap_id=" << snap_id << ", "
+ << "image_offset=" << cluster_extent.image_offset << ", "
+ << "l2_table_index=" << l2_table_index << ", "
+ << "cluster_offset=" << cluster_offset << ", "
+ << "prev_cluster_offset=" << prev_cluster_offset << dendl;
+
+ auto state = io::SPARSE_EXTENT_STATE_DATA;
+ if (cluster_offset == prev_cluster_offset) {
+ continue;
+ } else if ((prev_cluster_offset && !cluster_offset) ||
+ *cluster_offset == QCOW_OFLAG_ZERO) {
+ // explicitly zeroed or deallocated
+ state = io::SPARSE_EXTENT_STATE_ZEROED;
+ }
+
+ sparse_extents.insert(
+ cluster_extent.image_offset, cluster_extent.cluster_length,
+ {state, cluster_extent.cluster_length});
+ }
+
+ ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", "
+ << "snap_id=" << snap_id << ", "
+ << "sparse_extents=" << sparse_extents << dendl;
+
+ // continue processing the L2 table at this index for all snapshots
+ boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
+ [this]() { get_l2_table(); });
+ }
+
+ void finish(int r) {
+ auto cct = qcow_format->m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ on_finish->complete(r);
+ delete this;
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+QCOWFormat<I>::QCOWFormat(
+ I* image_ctx, const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<I>* source_spec_builder)
+ : m_image_ctx(image_ctx), m_json_object(json_object),
+ m_source_spec_builder(source_spec_builder),
+ m_strand(*image_ctx->asio_engine) {
+}
+
+template <typename I>
+void QCOWFormat<I>::open(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ int r = m_source_spec_builder->build_stream(m_json_object, &m_stream);
+ if (r < 0) {
+ lderr(cct) << "failed to build migration stream handler" << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_open(r, on_finish); });
+ m_stream->open(ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_open(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to open QCOW image: " << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ probe(on_finish);
+}
+
+template <typename I>
+void QCOWFormat<I>::probe(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_probe(r, on_finish); });
+ m_bl.clear();
+ m_stream->read({{0, 8}}, &m_bl, ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_probe(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to probe QCOW image: " << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ auto header_probe = *reinterpret_cast<QCowHeaderProbe*>(
+ m_bl.c_str());
+ header_probe.magic = be32toh(header_probe.magic);
+ header_probe.version = be32toh(header_probe.version);
+
+ if (header_probe.magic != QCOW_MAGIC) {
+ lderr(cct) << "invalid QCOW header magic" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_bl.clear();
+ if (header_probe.version == 1) {
+#ifdef WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+ read_v1_header(on_finish);
+#else // WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+ lderr(cct) << "QCOW is not supported" << dendl;
+ on_finish->complete(-ENOTSUP);
+#endif // WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+ return;
+ } else if (header_probe.version >= 2 && header_probe.version <= 3) {
+ read_v2_header(on_finish);
+ return;
+ } else {
+ lderr(cct) << "invalid QCOW header version " << header_probe.version
+ << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+}
+
+#ifdef WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+
+template <typename I>
+void QCOWFormat<I>::read_v1_header(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_read_v1_header(r, on_finish); });
+ m_bl.clear();
+ m_stream->read({{0, sizeof(QCowHeaderV1)}}, &m_bl, ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_v1_header(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to read QCOW header: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ auto header = *reinterpret_cast<QCowHeaderV1*>(m_bl.c_str());
+
+ // byte-swap important fields
+ header.magic = be32toh(header.magic);
+ header.version = be32toh(header.version);
+ header.backing_file_offset = be64toh(header.backing_file_offset);
+ header.backing_file_size = be32toh(header.backing_file_size);
+ header.size = be64toh(header.size);
+ header.crypt_method = be32toh(header.crypt_method);
+ header.l1_table_offset = be64toh(header.l1_table_offset);
+
+ if (header.magic != QCOW_MAGIC || header.version != 1) {
+ // honestly shouldn't happen since we've already validated it
+ lderr(cct) << "header is not QCOW" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ if (header.cluster_bits < QCOW_MIN_CLUSTER_BITS ||
+ header.cluster_bits > QCOW_MAX_CLUSTER_BITS) {
+ lderr(cct) << "invalid cluster bits: " << header.cluster_bits << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ if (header.l2_bits < (QCOW_MIN_CLUSTER_BITS - 3) ||
+ header.l2_bits > (QCOW_MAX_CLUSTER_BITS - 3)) {
+ lderr(cct) << "invalid L2 bits: " << header.l2_bits << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ if (header.crypt_method != QCOW_CRYPT_NONE) {
+ lderr(cct) << "invalid or unsupported encryption method" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_size = header.size;
+ if (p2roundup(m_size, static_cast<uint64_t>(512)) != m_size) {
+ lderr(cct) << "image size is not a multiple of block size" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_backing_file_offset = header.backing_file_offset;
+ m_backing_file_size = header.backing_file_size;
+
+ m_cluster_bits = header.cluster_bits;
+ m_cluster_size = 1UL << header.cluster_bits;
+ m_cluster_offset_mask = (1ULL << (63 - header.cluster_bits)) - 1;
+ m_cluster_mask = ~QCOW_OFLAG_COMPRESSED;
+
+ m_l2_bits = header.l2_bits;
+ m_l2_size = (1UL << m_l2_bits);
+
+ m_l1_shift = m_cluster_bits + m_l2_bits;
+ m_l1_table.size = (m_size + (1LL << m_l1_shift) - 1) >> m_l1_shift;
+ m_l1_table_offset = header.l1_table_offset;
+ if (m_size > (std::numeric_limits<uint64_t>::max() - (1ULL << m_l1_shift)) ||
+ m_l1_table.size >
+ (std::numeric_limits<int32_t>::max() / sizeof(uint64_t))) {
+ lderr(cct) << "image size too big: " << m_size << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ ldout(cct, 15) << "size=" << m_size << ", "
+ << "cluster_bits=" << m_cluster_bits << ", "
+ << "l2_bits=" << m_l2_bits << dendl;
+
+ // allocate memory for L1 table and L2 + cluster caches
+ m_l2_table_cache = std::make_unique<L2TableCache>(this);
+ m_cluster_cache = std::make_unique<ClusterCache>(this);
+
+ read_l1_table(on_finish);
+}
+
+#endif // WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+
+template <typename I>
+void QCOWFormat<I>::read_v2_header(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_read_v2_header(r, on_finish); });
+ m_bl.clear();
+ m_stream->read({{0, sizeof(QCowHeader)}}, &m_bl, ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_v2_header(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to read QCOW2 header: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ auto header = *reinterpret_cast<QCowHeader*>(m_bl.c_str());
+
+ // byte-swap important fields
+ header.magic = be32toh(header.magic);
+ header.version = be32toh(header.version);
+ header.backing_file_offset = be64toh(header.backing_file_offset);
+ header.backing_file_size = be32toh(header.backing_file_size);
+ header.cluster_bits = be32toh(header.cluster_bits);
+ header.size = be64toh(header.size);
+ header.crypt_method = be32toh(header.crypt_method);
+ header.l1_size = be32toh(header.l1_size);
+ header.l1_table_offset = be64toh(header.l1_table_offset);
+ header.nb_snapshots = be32toh(header.nb_snapshots);
+ header.snapshots_offset = be64toh(header.snapshots_offset);
+
+ if (header.version == 2) {
+ // valid only for version >= 3
+ header.incompatible_features = 0;
+ header.compatible_features = 0;
+ header.autoclear_features = 0;
+ header.header_length = 72;
+ header.compression_type = 0;
+ } else {
+ header.incompatible_features = be64toh(header.incompatible_features);
+ header.compatible_features = be64toh(header.compatible_features);
+ header.autoclear_features = be64toh(header.autoclear_features);
+ header.header_length = be32toh(header.header_length);
+ }
+
+ if (header.magic != QCOW_MAGIC || header.version < 2 || header.version > 3) {
+ // honestly shouldn't happen since we've already validated it
+ lderr(cct) << "header is not QCOW2" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ if (header.cluster_bits < QCOW_MIN_CLUSTER_BITS ||
+ header.cluster_bits > QCOW_MAX_CLUSTER_BITS) {
+ lderr(cct) << "invalid cluster bits: " << header.cluster_bits << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ if (header.crypt_method != QCOW_CRYPT_NONE) {
+ lderr(cct) << "invalid or unsupported encryption method" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_size = header.size;
+ if (p2roundup(m_size, static_cast<uint64_t>(512)) != m_size) {
+ lderr(cct) << "image size is not a multiple of block size" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ if (header.header_length <= offsetof(QCowHeader, compression_type)) {
+ header.compression_type = 0;
+ }
+
+ if ((header.compression_type != 0) ||
+ ((header.incompatible_features & QCOW2_INCOMPAT_COMPRESSION) != 0)) {
+ lderr(cct) << "invalid or unsupported compression type" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+  if ((header.incompatible_features & QCOW2_INCOMPAT_DATA_FILE) != 0) {
+    lderr(cct) << "external data file feature not supported" << dendl;
+    on_finish->complete(-ENOTSUP);
+    return;
+  }
+
+ if ((header.incompatible_features & QCOW2_INCOMPAT_EXTL2) != 0) {
+ lderr(cct) << "extended L2 table feature not supported" << dendl;
+ on_finish->complete(-ENOTSUP);
+ return;
+ }
+
+ header.incompatible_features &= ~QCOW2_INCOMPAT_MASK;
+ if (header.incompatible_features != 0) {
+ lderr(cct) << "unknown incompatible feature enabled" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_backing_file_offset = header.backing_file_offset;
+ m_backing_file_size = header.backing_file_size;
+
+ m_cluster_bits = header.cluster_bits;
+ m_cluster_size = 1UL << header.cluster_bits;
+ m_cluster_offset_mask = (1ULL << (63 - header.cluster_bits)) - 1;
+ m_cluster_mask = ~(QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_COPIED);
+
+  // each L2 table occupies exactly one cluster and holds 8-byte (1 << 3)
+  // entries
+ m_l2_bits = m_cluster_bits - 3;
+ m_l2_size = (1UL << m_l2_bits);
+
+ m_l1_shift = m_cluster_bits + m_l2_bits;
+ m_l1_table.size = (m_size + (1LL << m_l1_shift) - 1) >> m_l1_shift;
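+  /* Worked example (illustrative): with the default 64 KiB cluster
+   * (cluster_bits == 16), l2_bits == 13, so each single-cluster L2 table
+   * holds 8192 entries and one L1 entry maps 1ULL << 29 == 512 MiB; a 1 TiB
+   * image therefore needs a 2048-entry L1 table. */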
+ m_l1_table_offset = header.l1_table_offset;
+ if (m_size > (std::numeric_limits<uint64_t>::max() - (1ULL << m_l1_shift)) ||
+ m_l1_table.size >
+ (std::numeric_limits<int32_t>::max() / sizeof(uint64_t))) {
+ lderr(cct) << "image size too big: " << m_size << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ } else if (m_l1_table.size > header.l1_size) {
+ lderr(cct) << "invalid L1 table size in header (" << header.l1_size
+ << " < " << m_l1_table.size << ")" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_snapshot_count = header.nb_snapshots;
+ m_snapshots_offset = header.snapshots_offset;
+
+ ldout(cct, 15) << "size=" << m_size << ", "
+ << "cluster_bits=" << m_cluster_bits << ", "
+ << "l1_table_offset=" << m_l1_table_offset << ", "
+ << "snapshot_count=" << m_snapshot_count << ", "
+ << "snapshots_offset=" << m_snapshots_offset << dendl;
+
+ // allocate memory for L1 table and L2 + cluster caches
+ m_l2_table_cache = std::make_unique<L2TableCache>(this);
+ m_cluster_cache = std::make_unique<ClusterCache>(this);
+
+ read_snapshot(on_finish);
+}
+
+template <typename I>
+void QCOWFormat<I>::read_snapshot(Context* on_finish) {
+ if (m_snapshots_offset == 0 || m_snapshots.size() == m_snapshot_count) {
+ read_l1_table(on_finish);
+ return;
+ }
+
+  // snapshot headers are always aligned on an 8-byte boundary
+ m_snapshots_offset = p2roundup(m_snapshots_offset, static_cast<uint64_t>(8));
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "snap_id=" << (m_snapshots.size() + 1) << ", "
+ << "offset=" << m_snapshots_offset << dendl;
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_read_snapshot(r, on_finish); });
+ m_bl.clear();
+ m_stream->read({{m_snapshots_offset, sizeof(QCowSnapshotHeader)}}, &m_bl,
+ ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_snapshot(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << ", "
+ << "index=" << m_snapshots.size() << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to read QCOW2 snapshot header: " << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ m_snapshots_offset += m_bl.length();
+ auto header = *reinterpret_cast<QCowSnapshotHeader*>(m_bl.c_str());
+
+ auto& snapshot = m_snapshots[m_snapshots.size() + 1];
+ snapshot.id.resize(be16toh(header.id_str_size));
+ snapshot.name.resize(be16toh(header.name_size));
+ snapshot.l1_table_offset = be64toh(header.l1_table_offset);
+ snapshot.l1_table.size = be32toh(header.l1_size);
+ snapshot.timestamp.sec_ref() = be32toh(header.date_sec);
+ snapshot.timestamp.nsec_ref() = be32toh(header.date_nsec);
+ snapshot.extra_data_size = be32toh(header.extra_data_size);
+
+ ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", "
+ << "id_str_len=" << snapshot.id.size() << ", "
+ << "name_str_len=" << snapshot.name.size() << ", "
+ << "l1_table_offset=" << snapshot.l1_table_offset << ", "
+ << "l1_size=" << snapshot.l1_table.size << ", "
+ << "extra_data_size=" << snapshot.extra_data_size << dendl;
+
+ read_snapshot_extra(on_finish);
+}
+
+template <typename I>
+void QCOWFormat<I>::read_snapshot_extra(Context* on_finish) {
+ ceph_assert(!m_snapshots.empty());
+ auto& snapshot = m_snapshots.rbegin()->second;
+
+ uint32_t length = snapshot.extra_data_size +
+ snapshot.id.size() +
+ snapshot.name.size();
+ if (length == 0) {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+ snapshot.name = uuid_gen.to_string();
+
+ read_snapshot(on_finish);
+ return;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", "
+ << "offset=" << m_snapshots_offset << ", "
+ << "length=" << length << dendl;
+
+ auto offset = m_snapshots_offset;
+ m_snapshots_offset += length;
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_read_snapshot_extra(r, on_finish); });
+ m_bl.clear();
+ m_stream->read({{offset, length}}, &m_bl, ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_snapshot_extra(int r, Context* on_finish) {
+ ceph_assert(!m_snapshots.empty());
+ auto& snapshot = m_snapshots.rbegin()->second;
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << ", "
+ << "snap_id=" << m_snapshots.size() << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to read QCOW2 snapshot header extra: "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ if (snapshot.extra_data_size >=
+ offsetof(QCowSnapshotExtraData, disk_size) + sizeof(uint64_t)) {
+ auto extra = reinterpret_cast<const QCowSnapshotExtraData*>(m_bl.c_str());
+ snapshot.size = be64toh(extra->disk_size);
+ } else {
+ snapshot.size = m_size;
+ }
+
+ auto data = reinterpret_cast<const char*>(m_bl.c_str());
+ data += snapshot.extra_data_size;
+
+ if (!snapshot.id.empty()) {
+ snapshot.id = std::string(data, snapshot.id.size());
+ data += snapshot.id.size();
+ }
+
+ if (!snapshot.name.empty()) {
+ snapshot.name = std::string(data, snapshot.name.size());
+ data += snapshot.name.size();
+ } else {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+ snapshot.name = uuid_gen.to_string();
+ }
+
+ ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", "
+ << "name=" << snapshot.name << ", "
+ << "size=" << snapshot.size << dendl;
+ read_snapshot_l1_table(on_finish);
+}
+
+template <typename I>
+void QCOWFormat<I>::read_snapshot_l1_table(Context* on_finish) {
+ ceph_assert(!m_snapshots.empty());
+ auto& snapshot = m_snapshots.rbegin()->second;
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", "
+ << "l1_table_offset=" << snapshot.l1_table_offset
+ << dendl;
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_read_snapshot_l1_table(r, on_finish); });
+ m_stream->read({{snapshot.l1_table_offset,
+ snapshot.l1_table.size * sizeof(uint64_t)}},
+ &snapshot.l1_table.bl, ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_snapshot_l1_table(int r, Context* on_finish) {
+ ceph_assert(!m_snapshots.empty());
+ auto& snapshot = m_snapshots.rbegin()->second;
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << ", "
+ << "snap_id=" << m_snapshots.size() << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to read snapshot L1 table: " << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ snapshot.l1_table.decode();
+ read_snapshot(on_finish);
+}
+
+template <typename I>
+void QCOWFormat<I>::read_l1_table(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_read_l1_table(r, on_finish); });
+ m_stream->read({{m_l1_table_offset,
+ m_l1_table.size * sizeof(uint64_t)}},
+ &m_l1_table.bl, ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_l1_table(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to read L1 table: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ m_l1_table.decode();
+ read_backing_file(on_finish);
+}
+
+template <typename I>
+void QCOWFormat<I>::read_backing_file(Context* on_finish) {
+ if (m_backing_file_offset == 0 || m_backing_file_size == 0) {
+ // all data is within the specified file
+ on_finish->complete(0);
+ return;
+ }
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ // TODO add support for backing files
+ on_finish->complete(-ENOTSUP);
+}
+
+template <typename I>
+void QCOWFormat<I>::close(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ m_stream->close(on_finish);
+}
+
+template <typename I>
+void QCOWFormat<I>::get_snapshots(SnapInfos* snap_infos, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ snap_infos->clear();
+ for (auto& [snap_id, snapshot] : m_snapshots) {
+ SnapInfo snap_info(snapshot.name, cls::rbd::UserSnapshotNamespace{},
+ snapshot.size, {}, 0, 0, snapshot.timestamp);
+ snap_infos->emplace(snap_id, snap_info);
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void QCOWFormat<I>::get_image_size(uint64_t snap_id, uint64_t* size,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "snap_id=" << snap_id << dendl;
+
+ if (snap_id == CEPH_NOSNAP) {
+ *size = m_size;
+ } else {
+ auto snapshot_it = m_snapshots.find(snap_id);
+ if (snapshot_it == m_snapshots.end()) {
+ on_finish->complete(-ENOENT);
+ return;
+ }
+
+ auto& snapshot = snapshot_it->second;
+ *size = snapshot.size;
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool QCOWFormat<I>::read(
+ io::AioCompletion* aio_comp, uint64_t snap_id, io::Extents&& image_extents,
+ io::ReadResult&& read_result, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "snap_id=" << snap_id << ", "
+ << "image_extents=" << image_extents << dendl;
+
+ const LookupTable* l1_table = nullptr;
+ if (snap_id == CEPH_NOSNAP) {
+ l1_table = &m_l1_table;
+ } else {
+ auto snapshot_it = m_snapshots.find(snap_id);
+ if (snapshot_it == m_snapshots.end()) {
+ aio_comp->fail(-ENOENT);
+ return true;
+ }
+
+ auto& snapshot = snapshot_it->second;
+ l1_table = &snapshot.l1_table;
+ }
+
+ aio_comp->read_result = std::move(read_result);
+ aio_comp->read_result.set_image_extents(image_extents);
+
+ auto read_request = new ReadRequest(this, aio_comp, l1_table,
+ std::move(image_extents));
+ read_request->send();
+
+ return true;
+}
+
+template <typename I>
+void QCOWFormat<I>::list_snaps(io::Extents&& image_extents,
+ io::SnapIds&& snap_ids, int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ ClusterExtents cluster_extents;
+ populate_cluster_extents(cct, m_cluster_size, image_extents,
+ &cluster_extents);
+
+ // map L1 table indexes to cluster extents
+ std::map<uint64_t, ClusterExtents> l1_cluster_extents;
+ for (auto& cluster_extent : cluster_extents) {
+ uint32_t l1_table_index = cluster_extent.image_offset >> m_l1_shift;
+ auto& l1_cluster_extent = l1_cluster_extents[l1_table_index];
+ l1_cluster_extent.reserve(cluster_extents.size());
+ l1_cluster_extent.push_back(cluster_extent);
+ }
+
+ std::map<uint64_t, const LookupTable*> snap_id_to_l1_table;
+ for (auto& [snap_id, snapshot] : m_snapshots) {
+ snap_id_to_l1_table[snap_id] = &snapshot.l1_table;
+ }
+ snap_id_to_l1_table[CEPH_NOSNAP] = &m_l1_table;
+
+ on_finish = new LambdaContext([this, image_extents,
+ snap_ids=std::move(snap_ids),
+ snapshot_delta, on_finish](int r) mutable {
+ handle_list_snaps(r, std::move(image_extents), std::move(snap_ids),
+ snapshot_delta, on_finish);
+ });
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+
+ for (auto& [l1_table_index, cluster_extents] : l1_cluster_extents) {
+ auto list_snaps_request = new ListSnapsRequest(
+ this, l1_table_index, std::move(cluster_extents), snap_id_to_l1_table,
+ snapshot_delta, gather_ctx->new_sub());
+ list_snaps_request->send();
+ }
+
+ gather_ctx->activate();
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_list_snaps(int r, io::Extents&& image_extents,
+ io::SnapIds&& snap_ids,
+ io::SnapshotDelta* snapshot_delta,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", "
+ << "snapshot_delta=" << *snapshot_delta << dendl;
+
+ std::optional<uint64_t> previous_size = std::nullopt;
+ for (auto& [snap_id, snapshot] : m_snapshots) {
+ auto sparse_extents = &(*snapshot_delta)[{snap_id, snap_id}];
+ util::zero_shrunk_snapshot(cct, image_extents, snap_id, snapshot.size,
+ &previous_size, sparse_extents);
+ }
+
+ auto sparse_extents = &(*snapshot_delta)[{CEPH_NOSNAP, CEPH_NOSNAP}];
+ util::zero_shrunk_snapshot(cct, image_extents, CEPH_NOSNAP, m_size,
+ &previous_size, sparse_extents);
+
+ util::merge_snapshot_delta(snap_ids, snapshot_delta);
+ on_finish->complete(r);
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::QCOWFormat<librbd::ImageCtx>;
diff --git a/src/librbd/migration/QCOWFormat.h b/src/librbd/migration/QCOWFormat.h
new file mode 100644
index 000000000..b36506716
--- /dev/null
+++ b/src/librbd/migration/QCOWFormat.h
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H
+#define CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H
+
+#include "include/int_types.h"
+#include "librbd/Types.h"
+#include "librbd/migration/FormatInterface.h"
+#include "librbd/migration/QCOW.h"
+#include "acconfig.h"
+#include "json_spirit/json_spirit.h"
+#include <boost/asio/io_context_strand.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
+#include <deque>
+#include <vector>
+#include <memory>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename> struct SourceSpecBuilder;
+struct StreamInterface;
+
+namespace qcow_format {
+
+struct LookupTable {
+ LookupTable() {}
+ LookupTable(uint32_t size) : size(size) {}
+
+ bufferlist bl;
+ uint64_t* cluster_offsets = nullptr;
+ uint32_t size = 0;
+ bool decoded = false;
+
+ void init();
+ void decode();
+};
+
+} // namespace qcow_format
+
+template <typename ImageCtxT>
+class QCOWFormat : public FormatInterface {
+public:
+ static QCOWFormat* create(
+ ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<ImageCtxT>* source_spec_builder) {
+ return new QCOWFormat(image_ctx, json_object, source_spec_builder);
+ }
+
+ QCOWFormat(ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<ImageCtxT>* source_spec_builder);
+ QCOWFormat(const QCOWFormat&) = delete;
+ QCOWFormat& operator=(const QCOWFormat&) = delete;
+
+ void open(Context* on_finish) override;
+ void close(Context* on_finish) override;
+
+ void get_snapshots(SnapInfos* snap_infos, Context* on_finish) override;
+ void get_image_size(uint64_t snap_id, uint64_t* size,
+ Context* on_finish) override;
+
+ bool read(io::AioCompletion* aio_comp, uint64_t snap_id,
+ io::Extents&& image_extents, io::ReadResult&& read_result,
+ int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) override;
+
+ void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids,
+ int list_snaps_flags, io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) override;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN
+ * |
+ * v
+ * PROBE
+ * |
+ * |\---> READ V1 HEADER ----------\
+ * | |
+ * \----> READ V2 HEADER |
+ * | |
+ * | /----------\ |
+ * | | | |
+ * v v | |
+ * READ SNAPSHOT | |
+ * | | |
+ * v | |
+ * READ SNAPSHOT EXTRA | |
+ * | | |
+ * v | |
+ * READ SNAPSHOT L1 TABLE |
+ * | |
+ * \--------------------\|
+ * |
+ * v
+ * READ L1 TABLE
+ * |
+ * v
+ * READ BACKING FILE
+ * |
+ * /-------------------------------/
+ * |
+ * v
+ * <opened>
+ *
+ * @endverbatim
+ */
+
+ struct Cluster;
+ struct ClusterCache;
+ struct L2TableCache;
+ struct ReadRequest;
+ struct ListSnapsRequest;
+
+ struct Snapshot {
+ std::string id;
+ std::string name;
+
+ utime_t timestamp;
+ uint64_t size = 0;
+
+ uint64_t l1_table_offset = 0;
+ qcow_format::LookupTable l1_table;
+
+ uint32_t extra_data_size = 0;
+ };
+
+ ImageCtxT* m_image_ctx;
+ json_spirit::mObject m_json_object;
+ const SourceSpecBuilder<ImageCtxT>* m_source_spec_builder;
+
+ boost::asio::io_context::strand m_strand;
+ std::shared_ptr<StreamInterface> m_stream;
+
+ bufferlist m_bl;
+
+ uint64_t m_size = 0;
+
+ uint64_t m_backing_file_offset = 0;
+ uint32_t m_backing_file_size = 0;
+
+ uint32_t m_cluster_bits = 0;
+ uint32_t m_cluster_size = 0;
+ uint64_t m_cluster_offset_mask = 0;
+ uint64_t m_cluster_mask = 0;
+
+ uint32_t m_l1_shift = 0;
+ uint64_t m_l1_table_offset = 0;
+ qcow_format::LookupTable m_l1_table;
+
+ uint32_t m_l2_bits = 0;
+ uint32_t m_l2_size = 0;
+
+ uint32_t m_snapshot_count = 0;
+ uint64_t m_snapshots_offset = 0;
+ std::map<uint64_t, Snapshot> m_snapshots;
+
+ std::unique_ptr<L2TableCache> m_l2_table_cache;
+ std::unique_ptr<ClusterCache> m_cluster_cache;
+
+ void handle_open(int r, Context* on_finish);
+
+ void probe(Context* on_finish);
+ void handle_probe(int r, Context* on_finish);
+
+#ifdef WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+ void read_v1_header(Context* on_finish);
+ void handle_read_v1_header(int r, Context* on_finish);
+#endif // WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+
+ void read_v2_header(Context* on_finish);
+ void handle_read_v2_header(int r, Context* on_finish);
+
+ void read_snapshot(Context* on_finish);
+ void handle_read_snapshot(int r, Context* on_finish);
+
+ void read_snapshot_extra(Context* on_finish);
+ void handle_read_snapshot_extra(int r, Context* on_finish);
+
+ void read_snapshot_l1_table(Context* on_finish);
+ void handle_read_snapshot_l1_table(int r, Context* on_finish);
+
+ void read_l1_table(Context* on_finish);
+ void handle_read_l1_table(int r, Context* on_finish);
+
+ void read_backing_file(Context* on_finish);
+
+ void handle_list_snaps(int r, io::Extents&& image_extents,
+ io::SnapIds&& snap_ids,
+ io::SnapshotDelta* snapshot_delta, Context* on_finish);
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::QCOWFormat<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H
diff --git a/src/librbd/migration/RawFormat.cc b/src/librbd/migration/RawFormat.cc
new file mode 100644
index 000000000..0b655d368
--- /dev/null
+++ b/src/librbd/migration/RawFormat.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/RawFormat.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/migration/SnapshotInterface.h"
+#include "librbd/migration/SourceSpecBuilder.h"
+#include "librbd/migration/Utils.h"
+
+namespace librbd {
+namespace migration {
+
+namespace {
+
+static const std::string SNAPSHOTS_KEY {"snapshots"};
+
+} // anonymous namespace
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::RawFormat: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+RawFormat<I>::RawFormat(
+ I* image_ctx, const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<I>* source_spec_builder)
+ : m_image_ctx(image_ctx), m_json_object(json_object),
+ m_source_spec_builder(source_spec_builder) {
+}
+
+template <typename I>
+void RawFormat<I>::open(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ handle_open(r, on_finish); });
+
+ // treat the base image as a HEAD-revision snapshot
+ Snapshots snapshots;
+ int r = m_source_spec_builder->build_snapshot(m_json_object, CEPH_NOSNAP,
+ &snapshots[CEPH_NOSNAP]);
+ if (r < 0) {
+ lderr(cct) << "failed to build HEAD revision handler: " << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ auto& snapshots_val = m_json_object[SNAPSHOTS_KEY];
+ if (snapshots_val.type() == json_spirit::array_type) {
+ auto& snapshots_arr = snapshots_val.get_array();
+ for (auto& snapshot_val : snapshots_arr) {
+ uint64_t index = snapshots.size();
+ if (snapshot_val.type() != json_spirit::obj_type) {
+ lderr(cct) << "invalid snapshot " << index << " JSON: "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& snapshot_obj = snapshot_val.get_obj();
+ r = m_source_spec_builder->build_snapshot(snapshot_obj, index,
+ &snapshots[index]);
+ if (r < 0) {
+ lderr(cct) << "failed to build snapshot " << index << " handler: "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+ }
+ } else if (snapshots_val.type() != json_spirit::null_type) {
+ lderr(cct) << "invalid snapshots array" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_snapshots = std::move(snapshots);
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+ SnapshotInterface* previous_snapshot = nullptr;
+ for (auto& [_, snapshot] : m_snapshots) {
+ snapshot->open(previous_snapshot, gather_ctx->new_sub());
+ previous_snapshot = snapshot.get();
+ }
+ gather_ctx->activate();
+}
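+
+// For reference, a raw source-spec that open() can parse might look like
+// this (illustrative values; the "file" stream and its "file_path" key
+// follow the rbd image migration documentation):
+//   {
+//     "type": "raw",
+//     "stream": {"type": "file", "file_path": "/mnt/image.raw"},
+//     "snapshots": [
+//       {"type": "raw", "name": "snap1",
+//        "stream": {"type": "file", "file_path": "/mnt/image-snap1.raw"}}
+//     ]
+//   }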
+
+template <typename I>
+void RawFormat<I>::handle_open(int r, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to open raw image: " << cpp_strerror(r)
+ << dendl;
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+ for (auto& [_, snapshot] : m_snapshots) {
+ snapshot->close(gather_ctx->new_sub());
+ }
+
+ m_image_ctx->state->close(new LambdaContext(
+ [r, on_finish=gather_ctx->new_sub()](int _) { on_finish->complete(r); }));
+
+ gather_ctx->activate();
+ return;
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void RawFormat<I>::close(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+ for (auto& [snap_id, snapshot] : m_snapshots) {
+ snapshot->close(gather_ctx->new_sub());
+ }
+
+ gather_ctx->activate();
+}
+
+template <typename I>
+void RawFormat<I>::get_snapshots(SnapInfos* snap_infos, Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ snap_infos->clear();
+ for (auto& [snap_id, snapshot] : m_snapshots) {
+ if (snap_id == CEPH_NOSNAP) {
+ continue;
+ }
+ snap_infos->emplace(snap_id, snapshot->get_snap_info());
+ }
+ on_finish->complete(0);
+}
+
+template <typename I>
+void RawFormat<I>::get_image_size(uint64_t snap_id, uint64_t* size,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << "snap_id=" << snap_id << dendl;
+
+ auto snapshot_it = m_snapshots.find(snap_id);
+ if (snapshot_it == m_snapshots.end()) {
+ on_finish->complete(-ENOENT);
+ return;
+ }
+
+ *size = snapshot_it->second->get_snap_info().size;
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool RawFormat<I>::read(
+ io::AioCompletion* aio_comp, uint64_t snap_id, io::Extents&& image_extents,
+ io::ReadResult&& read_result, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "snap_id=" << snap_id << ", "
+ << "image_extents=" << image_extents << dendl;
+
+ auto snapshot_it = m_snapshots.find(snap_id);
+ if (snapshot_it == m_snapshots.end()) {
+ aio_comp->fail(-ENOENT);
+ return true;
+ }
+
+ snapshot_it->second->read(aio_comp, std::move(image_extents),
+ std::move(read_result), op_flags, read_flags,
+ parent_trace);
+ return true;
+}
+
+template <typename I>
+void RawFormat<I>::list_snaps(io::Extents&& image_extents,
+ io::SnapIds&& snap_ids, int list_snaps_flags,
+ io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ on_finish = new LambdaContext([this, snap_ids=std::move(snap_ids),
+ snapshot_delta, on_finish](int r) mutable {
+ handle_list_snaps(r, std::move(snap_ids), snapshot_delta, on_finish);
+ });
+
+ auto gather_ctx = new C_Gather(cct, on_finish);
+
+ std::optional<uint64_t> previous_size = std::nullopt;
+ for (auto& [snap_id, snapshot] : m_snapshots) {
+ auto& sparse_extents = (*snapshot_delta)[{snap_id, snap_id}];
+
+ // zero out any space between the previous snapshot end and this
+ // snapshot's end
+ auto& snap_info = snapshot->get_snap_info();
+ util::zero_shrunk_snapshot(cct, image_extents, snap_id, snap_info.size,
+ &previous_size, &sparse_extents);
+
+ // build set of data/zeroed extents for the current snapshot
+ snapshot->list_snap(io::Extents{image_extents}, list_snaps_flags,
+ &sparse_extents, parent_trace, gather_ctx->new_sub());
+ }
+
+ gather_ctx->activate();
+}
+
+template <typename I>
+void RawFormat<I>::handle_list_snaps(int r, io::SnapIds&& snap_ids,
+ io::SnapshotDelta* snapshot_delta,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << ", "
+ << "snapshot_delta=" << snapshot_delta << dendl;
+
+ util::merge_snapshot_delta(snap_ids, snapshot_delta);
+ on_finish->complete(r);
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::RawFormat<librbd::ImageCtx>;
diff --git a/src/librbd/migration/RawFormat.h b/src/librbd/migration/RawFormat.h
new file mode 100644
index 000000000..a20c0814f
--- /dev/null
+++ b/src/librbd/migration/RawFormat.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_RAW_FORMAT_H
+#define CEPH_LIBRBD_MIGRATION_RAW_FORMAT_H
+
+#include "include/int_types.h"
+#include "librbd/Types.h"
+#include "librbd/migration/FormatInterface.h"
+#include "json_spirit/json_spirit.h"
+#include <map>
+#include <memory>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename> struct SourceSpecBuilder;
+struct SnapshotInterface;
+
+template <typename ImageCtxT>
+class RawFormat : public FormatInterface {
+public:
+ static RawFormat* create(
+ ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<ImageCtxT>* source_spec_builder) {
+ return new RawFormat(image_ctx, json_object, source_spec_builder);
+ }
+
+ RawFormat(ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<ImageCtxT>* source_spec_builder);
+ RawFormat(const RawFormat&) = delete;
+ RawFormat& operator=(const RawFormat&) = delete;
+
+ void open(Context* on_finish) override;
+ void close(Context* on_finish) override;
+
+ void get_snapshots(SnapInfos* snap_infos, Context* on_finish) override;
+ void get_image_size(uint64_t snap_id, uint64_t* size,
+ Context* on_finish) override;
+
+ bool read(io::AioCompletion* aio_comp, uint64_t snap_id,
+ io::Extents&& image_extents, io::ReadResult&& read_result,
+ int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) override;
+
+ void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids,
+ int list_snaps_flags, io::SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) override;
+
+private:
+ typedef std::shared_ptr<SnapshotInterface> Snapshot;
+ typedef std::map<uint64_t, Snapshot> Snapshots;
+
+ ImageCtxT* m_image_ctx;
+ json_spirit::mObject m_json_object;
+ const SourceSpecBuilder<ImageCtxT>* m_source_spec_builder;
+
+ Snapshots m_snapshots;
+
+ void handle_open(int r, Context* on_finish);
+
+ void handle_list_snaps(int r, io::SnapIds&& snap_ids,
+ io::SnapshotDelta* snapshot_delta, Context* on_finish);
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::RawFormat<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_RAW_FORMAT_H
diff --git a/src/librbd/migration/RawSnapshot.cc b/src/librbd/migration/RawSnapshot.cc
new file mode 100644
index 000000000..4a83fd8ad
--- /dev/null
+++ b/src/librbd/migration/RawSnapshot.cc
@@ -0,0 +1,220 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/RawSnapshot.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/migration/SourceSpecBuilder.h"
+#include "librbd/migration/StreamInterface.h"
+
+namespace librbd {
+namespace migration {
+
+namespace {
+
+const std::string NAME_KEY{"name"};
+
+} // anonymous namespace
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::RawSnapshot::OpenRequest " \
+ << this << " " << __func__ << ": "
+
+template <typename I>
+struct RawSnapshot<I>::OpenRequest {
+ RawSnapshot* raw_snapshot;
+ Context* on_finish;
+
+ OpenRequest(RawSnapshot* raw_snapshot, Context* on_finish)
+ : raw_snapshot(raw_snapshot), on_finish(on_finish) {
+ }
+
+ void send() {
+ open_stream();
+ }
+
+ void open_stream() {
+ auto cct = raw_snapshot->m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ OpenRequest, &OpenRequest::handle_open_stream>(this);
+ raw_snapshot->m_stream->open(ctx);
+ }
+
+ void handle_open_stream(int r) {
+ auto cct = raw_snapshot->m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to open stream: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ get_image_size();
+ }
+
+ void get_image_size() {
+ auto cct = raw_snapshot->m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ OpenRequest, &OpenRequest::handle_get_image_size>(this);
+ raw_snapshot->m_stream->get_size(&raw_snapshot->m_snap_info.size, ctx);
+ }
+
+ void handle_get_image_size(int r) {
+ auto cct = raw_snapshot->m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << ", "
+ << "image_size=" << raw_snapshot->m_snap_info.size << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to open stream: " << cpp_strerror(r) << dendl;
+ close_stream(r);
+ return;
+ }
+
+ finish(0);
+ }
+
+ void close_stream(int r) {
+ auto cct = raw_snapshot->m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = new LambdaContext([this, r](int) {
+ handle_close_stream(r);
+ });
+ raw_snapshot->m_stream->close(ctx);
+ }
+
+ void handle_close_stream(int r) {
+ auto cct = raw_snapshot->m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ raw_snapshot->m_stream.reset();
+
+ finish(r);
+ }
+
+ void finish(int r) {
+ auto cct = raw_snapshot->m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ on_finish->complete(r);
+ delete this;
+ }
+};
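+
+// OpenRequest follows librbd's self-deleting request pattern: allocate with
+// new, call send(), and finish() both completes on_finish and deletes the
+// request, so a call site is simply (sketch):
+//   auto req = new OpenRequest(this, on_finish);
+//   req->send();  // req must not be touched afterwards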
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::RawSnapshot: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+RawSnapshot<I>::RawSnapshot(I* image_ctx,
+ const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<I>* source_spec_builder,
+ uint64_t index)
+ : m_image_ctx(image_ctx), m_json_object(json_object),
+ m_source_spec_builder(source_spec_builder), m_index(index),
+ m_snap_info({}, {}, 0, {}, 0, 0, {}) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+}
+
+template <typename I>
+void RawSnapshot<I>::open(SnapshotInterface* previous_snapshot,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+
+ // special-case for treating the HEAD revision as a snapshot
+ if (m_index != CEPH_NOSNAP) {
+ auto& name_val = m_json_object[NAME_KEY];
+ if (name_val.type() == json_spirit::str_type) {
+ m_snap_info.name = name_val.get_str();
+ } else if (name_val.type() == json_spirit::null_type) {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+
+ m_snap_info.name = uuid_gen.to_string();
+ } else {
+ lderr(cct) << "invalid snapshot name" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ }
+
+ ldout(cct, 10) << "name=" << m_snap_info.name << dendl;
+
+ int r = m_source_spec_builder->build_stream(m_json_object, &m_stream);
+ if (r < 0) {
+ lderr(cct) << "failed to build migration stream handler" << cpp_strerror(r)
+ << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ auto req = new OpenRequest(this, on_finish);
+ req->send();
+}
+
+template <typename I>
+void RawSnapshot<I>::close(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ if (!m_stream) {
+ on_finish->complete(0);
+ return;
+ }
+
+ m_stream->close(on_finish);
+}
+
+template <typename I>
+void RawSnapshot<I>::read(io::AioCompletion* aio_comp,
+ io::Extents&& image_extents,
+ io::ReadResult&& read_result, int op_flags,
+ int read_flags,
+ const ZTracer::Trace &parent_trace) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+ aio_comp->read_result = std::move(read_result);
+ aio_comp->read_result.set_image_extents(image_extents);
+
+ aio_comp->set_request_count(1);
+ auto ctx = new io::ReadResult::C_ImageReadRequest(aio_comp,
+ 0, image_extents);
+
+ // raw directly maps the image-extent IO down to a byte IO extent
+ m_stream->read(std::move(image_extents), &ctx->bl, ctx);
+}
+
+template <typename I>
+void RawSnapshot<I>::list_snap(io::Extents&& image_extents,
+ int list_snaps_flags,
+ io::SparseExtents* sparse_extents,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+  // raw does not support sparse extents, so list the full IO extent as data
+ for (auto& [image_offset, image_length] : image_extents) {
+ sparse_extents->insert(image_offset, image_length,
+ {io::SPARSE_EXTENT_STATE_DATA, image_length});
+ }
+
+ on_finish->complete(0);
+}
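+
+// Example of the resulting delta (hedged): for image_extents
+// {{0, 4096}, {1048576, 8192}} this reports two DATA sparse extents with the
+// same offsets and lengths, since a raw stream cannot distinguish written
+// data from unallocated zeroes.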
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::RawSnapshot<librbd::ImageCtx>;
diff --git a/src/librbd/migration/RawSnapshot.h b/src/librbd/migration/RawSnapshot.h
new file mode 100644
index 000000000..9f76d6878
--- /dev/null
+++ b/src/librbd/migration/RawSnapshot.h
@@ -0,0 +1,75 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_RAW_SNAPSHOT_H
+#define CEPH_LIBRBD_MIGRATION_RAW_SNAPSHOT_H
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include "librbd/migration/SnapshotInterface.h"
+#include "json_spirit/json_spirit.h"
+#include <memory>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace migration {
+
+template <typename> struct SourceSpecBuilder;
+struct StreamInterface;
+
+template <typename ImageCtxT>
+class RawSnapshot : public SnapshotInterface {
+public:
+ static RawSnapshot* create(
+    ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<ImageCtxT>* source_spec_builder, uint64_t index) {
+ return new RawSnapshot(image_ctx, json_object, source_spec_builder, index);
+ }
+
+ RawSnapshot(ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+ const SourceSpecBuilder<ImageCtxT>* source_spec_builder,
+ uint64_t index);
+ RawSnapshot(const RawSnapshot&) = delete;
+ RawSnapshot& operator=(const RawSnapshot&) = delete;
+
+ void open(SnapshotInterface* previous_snapshot, Context* on_finish) override;
+ void close(Context* on_finish) override;
+
+ const SnapInfo& get_snap_info() const override {
+ return m_snap_info;
+ }
+
+ void read(io::AioCompletion* aio_comp, io::Extents&& image_extents,
+ io::ReadResult&& read_result, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) override;
+
+ void list_snap(io::Extents&& image_extents, int list_snaps_flags,
+ io::SparseExtents* sparse_extents,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) override;
+
+private:
+ struct OpenRequest;
+
+ ImageCtxT* m_image_ctx;
+ json_spirit::mObject m_json_object;
+ const SourceSpecBuilder<ImageCtxT>* m_source_spec_builder;
+ uint64_t m_index = 0;
+
+ SnapInfo m_snap_info;
+
+ std::shared_ptr<StreamInterface> m_stream;
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::RawSnapshot<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_RAW_SNAPSHOT_H
diff --git a/src/librbd/migration/S3Stream.cc b/src/librbd/migration/S3Stream.cc
new file mode 100644
index 000000000..3b4db0cef
--- /dev/null
+++ b/src/librbd/migration/S3Stream.cc
@@ -0,0 +1,202 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/S3Stream.h"
+#include "common/armor.h"
+#include "common/ceph_crypto.h"
+#include "common/ceph_time.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/migration/HttpClient.h"
+#include "librbd/migration/HttpProcessorInterface.h"
+#include <boost/beast/http.hpp>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/chrono.h>
+#include <fmt/format.h>
+
+#include <time.h>
+
+namespace librbd {
+namespace migration {
+
+using HttpRequest = boost::beast::http::request<boost::beast::http::empty_body>;
+
+namespace {
+
+const std::string URL_KEY {"url"};
+const std::string ACCESS_KEY {"access_key"};
+const std::string SECRET_KEY {"secret_key"};
+
+} // anonymous namespace
+
+template <typename I>
+struct S3Stream<I>::HttpProcessor : public HttpProcessorInterface {
+ S3Stream* s3stream;
+
+ HttpProcessor(S3Stream* s3stream) : s3stream(s3stream) {
+ }
+
+ void process_request(EmptyRequest& request) override {
+ s3stream->process_request(request);
+ }
+};
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::S3Stream: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+S3Stream<I>::S3Stream(I* image_ctx, const json_spirit::mObject& json_object)
+ : m_image_ctx(image_ctx), m_cct(image_ctx->cct),
+ m_asio_engine(image_ctx->asio_engine), m_json_object(json_object),
+ m_http_processor(std::make_unique<HttpProcessor>(this)) {
+}
+
+template <typename I>
+S3Stream<I>::~S3Stream() {
+}
+
+template <typename I>
+void S3Stream<I>::open(Context* on_finish) {
+ auto& url_value = m_json_object[URL_KEY];
+ if (url_value.type() != json_spirit::str_type) {
+ lderr(m_cct) << "failed to locate '" << URL_KEY << "' key" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& access_key = m_json_object[ACCESS_KEY];
+ if (access_key.type() != json_spirit::str_type) {
+ lderr(m_cct) << "failed to locate '" << ACCESS_KEY << "' key" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ auto& secret_key = m_json_object[SECRET_KEY];
+ if (secret_key.type() != json_spirit::str_type) {
+ lderr(m_cct) << "failed to locate '" << SECRET_KEY << "' key" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ m_url = url_value.get_str();
+
+ librados::Rados rados(m_image_ctx->md_ctx);
+ int r = 0;
+ m_access_key = access_key.get_str();
+ if (util::is_config_key_uri(m_access_key)) {
+ r = util::get_config_key(rados, m_access_key, &m_access_key);
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve access key from config: "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+ }
+
+ m_secret_key = secret_key.get_str();
+ if (util::is_config_key_uri(m_secret_key)) {
+ r = util::get_config_key(rados, m_secret_key, &m_secret_key);
+ if (r < 0) {
+ lderr(m_cct) << "failed to retrieve secret key from config: "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+ }
+
+ ldout(m_cct, 10) << "url=" << m_url << ", "
+ << "access_key=" << m_access_key << dendl;
+
+ m_http_client.reset(HttpClient<I>::create(m_image_ctx, m_url));
+ m_http_client->set_http_processor(m_http_processor.get());
+ m_http_client->open(on_finish);
+}
+
+template <typename I>
+void S3Stream<I>::close(Context* on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ if (!m_http_client) {
+ on_finish->complete(0);
+ return;
+ }
+
+ m_http_client->close(on_finish);
+}
+
+template <typename I>
+void S3Stream<I>::get_size(uint64_t* size, Context* on_finish) {
+ ldout(m_cct, 10) << dendl;
+
+ m_http_client->get_size(size, on_finish);
+}
+
+template <typename I>
+void S3Stream<I>::read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) {
+ ldout(m_cct, 20) << "byte_extents=" << byte_extents << dendl;
+
+ m_http_client->read(std::move(byte_extents), data, on_finish);
+}
+
+template <typename I>
+void S3Stream<I>::process_request(HttpRequest& http_request) {
+ ldout(m_cct, 20) << dendl;
+
+ // format RFC 1123 date/time
+ auto time = ceph::real_clock::to_time_t(ceph::real_clock::now());
+ struct tm timeInfo;
+ gmtime_r(&time, &timeInfo);
+
+ std::string date = fmt::format("{:%a, %d %b %Y %H:%M:%S %z}", timeInfo);
+ http_request.set(boost::beast::http::field::date, date);
+
+ // note: we don't support S3 subresources
+ std::string canonicalized_resource = std::string(http_request.target());
+
+ std::string string_to_sign = fmt::format(
+ "{}\n\n\n{}\n{}",
+ std::string(boost::beast::http::to_string(http_request.method())),
+ date, canonicalized_resource);
+
+ // create HMAC-SHA1 signature from secret key + string-to-sign
+ sha1_digest_t digest;
+ ceph::crypto::HMACSHA1 hmac(
+ reinterpret_cast<const unsigned char*>(m_secret_key.data()),
+ m_secret_key.size());
+ hmac.Update(reinterpret_cast<const unsigned char*>(string_to_sign.data()),
+ string_to_sign.size());
+ hmac.Final(reinterpret_cast<unsigned char*>(digest.v));
+
+ // base64 encode the result
+ char buf[64];
+ int r = ceph_armor(std::begin(buf), std::begin(buf) + sizeof(buf),
+ reinterpret_cast<const char *>(digest.v),
+ reinterpret_cast<const char *>(digest.v + digest.SIZE));
+ if (r < 0) {
+ ceph_abort("ceph_armor failed");
+ }
+
+ // store the access-key + signature in the HTTP authorization header
+ std::string signature = std::string(std::begin(buf), std::begin(buf) + r);
+ std::string authorization = fmt::format("AWS {}:{}", m_access_key, signature);
+ http_request.set(boost::beast::http::field::authorization, authorization);
+
+ ldout(m_cct, 20) << "string_to_sign=" << string_to_sign << ", "
+ << "authorization=" << authorization << dendl;
+}
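+
+// Worked example of the legacy AWS signature v2 scheme implemented above
+// (illustrative values): a GET for "/bucket/disk.raw" dated
+// "Tue, 27 Oct 2020 00:00:00 +0000" yields the string-to-sign
+//   "GET\n\n\nTue, 27 Oct 2020 00:00:00 +0000\n/bucket/disk.raw"
+// and the request header
+//   Authorization: AWS <access_key>:<base64(HMAC-SHA1(secret_key, string_to_sign))>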
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::S3Stream<librbd::ImageCtx>;
diff --git a/src/librbd/migration/S3Stream.h b/src/librbd/migration/S3Stream.h
new file mode 100644
index 000000000..586b21787
--- /dev/null
+++ b/src/librbd/migration/S3Stream.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_S3_STREAM_H
+#define CEPH_LIBRBD_MIGRATION_S3_STREAM_H
+
+#include "include/int_types.h"
+#include "librbd/migration/StreamInterface.h"
+#include <boost/beast/http/empty_body.hpp>
+#include <boost/beast/http/message.hpp>
+#include <boost/beast/http/string_body.hpp>
+#include <json_spirit/json_spirit.h>
+#include <memory>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename> class HttpClient;
+
+template <typename ImageCtxT>
+class S3Stream : public StreamInterface {
+public:
+ static S3Stream* create(ImageCtxT* image_ctx,
+ const json_spirit::mObject& json_object) {
+ return new S3Stream(image_ctx, json_object);
+ }
+
+ S3Stream(ImageCtxT* image_ctx, const json_spirit::mObject& json_object);
+ ~S3Stream() override;
+
+ S3Stream(const S3Stream&) = delete;
+ S3Stream& operator=(const S3Stream&) = delete;
+
+ void open(Context* on_finish) override;
+ void close(Context* on_finish) override;
+
+ void get_size(uint64_t* size, Context* on_finish) override;
+
+ void read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) override;
+
+private:
+ using HttpRequest = boost::beast::http::request<
+ boost::beast::http::empty_body>;
+ using HttpResponse = boost::beast::http::response<
+ boost::beast::http::string_body>;
+
+ struct HttpProcessor;
+
+ ImageCtxT* m_image_ctx;
+ CephContext* m_cct;
+ std::shared_ptr<AsioEngine> m_asio_engine;
+ json_spirit::mObject m_json_object;
+
+ std::string m_url;
+ std::string m_access_key;
+ std::string m_secret_key;
+
+ std::unique_ptr<HttpProcessor> m_http_processor;
+ std::unique_ptr<HttpClient<ImageCtxT>> m_http_client;
+
+ void process_request(HttpRequest& http_request);
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::S3Stream<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_S3_STREAM_H
diff --git a/src/librbd/migration/SnapshotInterface.h b/src/librbd/migration/SnapshotInterface.h
new file mode 100644
index 000000000..9990802c5
--- /dev/null
+++ b/src/librbd/migration/SnapshotInterface.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_SNAPSHOT_INTERFACE_H
+#define CEPH_LIBRBD_MIGRATION_SNAPSHOT_INTERFACE_H
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+namespace io {
+struct AioCompletion;
+struct ReadResult;
+} // namespace io
+
+namespace migration {
+
+struct SnapshotInterface {
+ virtual ~SnapshotInterface() {
+ }
+
+ virtual void open(SnapshotInterface* previous_snapshot,
+ Context* on_finish) = 0;
+ virtual void close(Context* on_finish) = 0;
+
+ virtual const SnapInfo& get_snap_info() const = 0;
+
+ virtual void read(io::AioCompletion* aio_comp, io::Extents&& image_extents,
+ io::ReadResult&& read_result, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace) = 0;
+
+ virtual void list_snap(io::Extents&& image_extents, int list_snaps_flags,
+ io::SparseExtents* sparse_extents,
+ const ZTracer::Trace &parent_trace,
+ Context* on_finish) = 0;
+};
+
+} // namespace migration
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIGRATION_SNAPSHOT_INTERFACE_H
diff --git a/src/librbd/migration/SourceSpecBuilder.cc b/src/librbd/migration/SourceSpecBuilder.cc
new file mode 100644
index 000000000..214d7ce0e
--- /dev/null
+++ b/src/librbd/migration/SourceSpecBuilder.cc
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/SourceSpecBuilder.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/migration/FileStream.h"
+#include "librbd/migration/HttpStream.h"
+#include "librbd/migration/S3Stream.h"
+#include "librbd/migration/NativeFormat.h"
+#include "librbd/migration/QCOWFormat.h"
+#include "librbd/migration/RawFormat.h"
+#include "librbd/migration/RawSnapshot.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::SourceSpecBuilder: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace migration {
+
+namespace {
+
+const std::string STREAM_KEY{"stream"};
+const std::string TYPE_KEY{"type"};
+
+} // anonymous namespace
+
+template <typename I>
+int SourceSpecBuilder<I>::parse_source_spec(
+ const std::string& source_spec,
+ json_spirit::mObject* source_spec_object) const {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ json_spirit::mValue json_root;
+  if (json_spirit::read(source_spec, json_root)) {
+ try {
+ *source_spec_object = json_root.get_obj();
+ return 0;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ lderr(cct) << "invalid source-spec JSON" << dendl;
+ return -EBADMSG;
+}
+
+template <typename I>
+int SourceSpecBuilder<I>::build_format(
+ const json_spirit::mObject& source_spec_object, bool import_only,
+ std::unique_ptr<FormatInterface>* format) const {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto type_value_it = source_spec_object.find(TYPE_KEY);
+ if (type_value_it == source_spec_object.end() ||
+ type_value_it->second.type() != json_spirit::str_type) {
+ lderr(cct) << "failed to locate format type value" << dendl;
+ return -EINVAL;
+ }
+
+ auto& type = type_value_it->second.get_str();
+ if (type == "native") {
+ format->reset(NativeFormat<I>::create(m_image_ctx, source_spec_object,
+ import_only));
+ } else if (type == "qcow") {
+ format->reset(QCOWFormat<I>::create(m_image_ctx, source_spec_object, this));
+ } else if (type == "raw") {
+ format->reset(RawFormat<I>::create(m_image_ctx, source_spec_object, this));
+ } else {
+ lderr(cct) << "unknown or unsupported format type '" << type << "'"
+ << dendl;
+ return -ENOSYS;
+ }
+ return 0;
+}
+
+template <typename I>
+int SourceSpecBuilder<I>::build_snapshot(
+ const json_spirit::mObject& source_spec_object, uint64_t index,
+ std::shared_ptr<SnapshotInterface>* snapshot) const {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto type_value_it = source_spec_object.find(TYPE_KEY);
+ if (type_value_it == source_spec_object.end() ||
+ type_value_it->second.type() != json_spirit::str_type) {
+ lderr(cct) << "failed to locate snapshot type value" << dendl;
+ return -EINVAL;
+ }
+
+ auto& type = type_value_it->second.get_str();
+ if (type == "raw") {
+ snapshot->reset(RawSnapshot<I>::create(m_image_ctx, source_spec_object,
+ this, index));
+ } else {
+ lderr(cct) << "unknown or unsupported format type '" << type << "'"
+ << dendl;
+ return -ENOSYS;
+ }
+ return 0;
+}
+
+template <typename I>
+int SourceSpecBuilder<I>::build_stream(
+ const json_spirit::mObject& source_spec_object,
+ std::shared_ptr<StreamInterface>* stream) const {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto stream_value_it = source_spec_object.find(STREAM_KEY);
+ if (stream_value_it == source_spec_object.end() ||
+ stream_value_it->second.type() != json_spirit::obj_type) {
+ lderr(cct) << "failed to locate stream object" << dendl;
+ return -EINVAL;
+ }
+
+ auto& stream_obj = stream_value_it->second.get_obj();
+ auto type_value_it = stream_obj.find(TYPE_KEY);
+ if (type_value_it == stream_obj.end() ||
+ type_value_it->second.type() != json_spirit::str_type) {
+ lderr(cct) << "failed to locate stream type value" << dendl;
+ return -EINVAL;
+ }
+
+ auto& type = type_value_it->second.get_str();
+ if (type == "file") {
+ stream->reset(FileStream<I>::create(m_image_ctx, stream_obj));
+ } else if (type == "http") {
+ stream->reset(HttpStream<I>::create(m_image_ctx, stream_obj));
+ } else if (type == "s3") {
+ stream->reset(S3Stream<I>::create(m_image_ctx, stream_obj));
+ } else {
+ lderr(cct) << "unknown or unsupported stream type '" << type << "'"
+ << dendl;
+ return -ENOSYS;
+ }
+
+ return 0;
+}
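+
+// Stream objects accepted above (illustrative): S3Stream reads "url",
+// "access_key" and "secret_key" (see S3Stream.cc); the "file_path" key for
+// the file stream follows the rbd image migration documentation, and
+// HttpStream is assumed to take a "url" key as well:
+//   {"type": "file", "file_path": "/mnt/image.raw"}
+//   {"type": "http", "url": "http://server/image.raw"}
+//   {"type": "s3", "url": "http://s3-server/bucket/image.raw",
+//    "access_key": "...", "secret_key": "..."}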
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::SourceSpecBuilder<librbd::ImageCtx>;
diff --git a/src/librbd/migration/SourceSpecBuilder.h b/src/librbd/migration/SourceSpecBuilder.h
new file mode 100644
index 000000000..191cb1cbd
--- /dev/null
+++ b/src/librbd/migration/SourceSpecBuilder.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_SOURCE_SPEC_BUILDER_H
+#define CEPH_LIBRBD_MIGRATION_SOURCE_SPEC_BUILDER_H
+
+#include "include/int_types.h"
+#include <json_spirit/json_spirit.h>
+#include <memory>
+#include <optional>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace migration {
+
+struct FormatInterface;
+struct SnapshotInterface;
+struct StreamInterface;
+
+template <typename ImageCtxT>
+class SourceSpecBuilder {
+public:
+ SourceSpecBuilder(ImageCtxT* image_ctx) : m_image_ctx(image_ctx) {
+ }
+
+ int parse_source_spec(const std::string& source_spec,
+ json_spirit::mObject* source_spec_object) const;
+
+ int build_format(const json_spirit::mObject& format_object, bool import_only,
+ std::unique_ptr<FormatInterface>* format) const;
+
+ int build_snapshot(const json_spirit::mObject& source_spec_object,
+ uint64_t index,
+ std::shared_ptr<SnapshotInterface>* snapshot) const;
+
+ int build_stream(const json_spirit::mObject& source_spec_object,
+ std::shared_ptr<StreamInterface>* stream) const;
+
+private:
+ ImageCtxT* m_image_ctx;
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::SourceSpecBuilder<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_SOURCE_SPEC_BUILDER_H
diff --git a/src/librbd/migration/StreamInterface.h b/src/librbd/migration/StreamInterface.h
new file mode 100644
index 000000000..782a9a5f8
--- /dev/null
+++ b/src/librbd/migration/StreamInterface.h
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_STREAM_INTERFACE_H
+#define CEPH_LIBRBD_MIGRATION_STREAM_INTERFACE_H
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+
+namespace librbd {
+namespace migration {
+
+struct StreamInterface {
+ virtual ~StreamInterface() {
+ }
+
+ virtual void open(Context* on_finish) = 0;
+ virtual void close(Context* on_finish) = 0;
+
+ virtual void get_size(uint64_t* size, Context* on_finish) = 0;
+
+ virtual void read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) = 0;
+};
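+
+// In this patch set StreamInterface is implemented by FileStream, HttpStream
+// and S3Stream; SourceSpecBuilder::build_stream() picks the implementation
+// from the "type" key of the stream JSON object.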
+
+} // namespace migration
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIGRATION_STREAM_INTERFACE_H
diff --git a/src/librbd/migration/Types.h b/src/librbd/migration/Types.h
new file mode 100644
index 000000000..244dc28b7
--- /dev/null
+++ b/src/librbd/migration/Types.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_TYPES_H
+#define CEPH_LIBRBD_MIGRATION_TYPES_H
+
+#include <string>
+#include <utility>
+
+namespace librbd {
+namespace migration {
+
+enum UrlScheme {
+ URL_SCHEME_HTTP,
+ URL_SCHEME_HTTPS,
+};
+
+struct UrlSpec {
+ UrlSpec() {}
+ UrlSpec(UrlScheme scheme, const std::string& host, const std::string& port,
+ const std::string& path)
+ : scheme(scheme), host(host), port(port), path(path) {
+ }
+
+ UrlScheme scheme = URL_SCHEME_HTTP;
+ std::string host;
+ std::string port = "80";
+ std::string path = "/";
+
+};
+
+inline bool operator==(const UrlSpec& lhs, const UrlSpec& rhs) {
+ return (lhs.scheme == rhs.scheme &&
+ lhs.host == rhs.host &&
+ lhs.port == rhs.port &&
+ lhs.path == rhs.path);
+}
+
+} // namespace migration
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIGRATION_TYPES_H
diff --git a/src/librbd/migration/Utils.cc b/src/librbd/migration/Utils.cc
new file mode 100644
index 000000000..c5c1279d8
--- /dev/null
+++ b/src/librbd/migration/Utils.cc
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include <boost/lexical_cast.hpp>
+#include <regex>
+
+namespace librbd {
+namespace migration {
+namespace util {
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::util::" << __func__ << ": "
+
+int parse_url(CephContext* cct, const std::string& url, UrlSpec* url_spec) {
+ ldout(cct, 10) << "url=" << url << dendl;
+ *url_spec = UrlSpec{};
+
+ // parse the provided URL (scheme, user, password, host, port, path,
+ // parameters, query, and fragment)
+ std::regex url_regex(
+ R"(^(?:([^:/]*)://)?(?:(\w+)(?::(\w+))?@)?([^/;\?:#]+)(?::([^/;\?#]+))?)"
+ R"((?:/([^;\?#]*))?(?:;([^\?#]+))?(?:\?([^#]+))?(?:#(\w+))?$)");
+ std::smatch match;
+  if (!std::regex_match(url, match, url_regex)) {
+ lderr(cct) << "invalid url: '" << url << "'" << dendl;
+ return -EINVAL;
+ }
+
+ auto& scheme = match[1];
+ if (scheme == "http" || scheme == "") {
+ url_spec->scheme = URL_SCHEME_HTTP;
+ } else if (scheme == "https") {
+ url_spec->scheme = URL_SCHEME_HTTPS;
+ url_spec->port = "443";
+ } else {
+ lderr(cct) << "invalid url scheme: '" << url << "'" << dendl;
+ return -EINVAL;
+ }
+
+ url_spec->host = match[4];
+ auto& port = match[5];
+ if (port.matched) {
+ try {
+ boost::lexical_cast<uint16_t>(port);
+ } catch (boost::bad_lexical_cast&) {
+ lderr(cct) << "invalid url port: '" << url << "'" << dendl;
+ return -EINVAL;
+ }
+ url_spec->port = port;
+ }
+
+ auto& path = match[6];
+ if (path.matched) {
+ url_spec->path += path;
+ }
+ return 0;
+}
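+
+// Expected parses, derived from the regex and defaults above (sketch):
+//   "http://ceph.io/image.raw"   -> {HTTP,  "ceph.io", "80",   "/image.raw"}
+//   "https://ceph.io:8443/image" -> {HTTPS, "ceph.io", "8443", "/image"}
+//   "ceph.io/image"              -> {HTTP,  "ceph.io", "80",   "/image"}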
+
+void zero_shrunk_snapshot(CephContext* cct, const io::Extents& image_extents,
+ uint64_t snap_id, uint64_t new_size,
+ std::optional<uint64_t> *previous_size,
+ io::SparseExtents* sparse_extents) {
+ if (*previous_size && **previous_size > new_size) {
+ ldout(cct, 20) << "snapshot resize " << **previous_size << " -> "
+ << new_size << dendl;
+ interval_set<uint64_t> zero_interval;
+ zero_interval.insert(new_size, **previous_size - new_size);
+
+ for (auto& image_extent : image_extents) {
+ interval_set<uint64_t> image_interval;
+ image_interval.insert(image_extent.first, image_extent.second);
+
+ image_interval.intersection_of(zero_interval);
+ for (auto [image_offset, image_length] : image_interval) {
+ ldout(cct, 20) << "zeroing extent " << image_offset << "~"
+ << image_length << " at snapshot " << snap_id << dendl;
+ sparse_extents->insert(image_offset, image_length,
+ {io::SPARSE_EXTENT_STATE_ZEROED, image_length});
+ }
+ }
+ }
+ *previous_size = new_size;
+}
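+
+// Worked example (hedged): if the previous snapshot was 8 MiB and this one
+// is 4 MiB, the zero interval is [4 MiB, 8 MiB); a requested image extent
+// {6 MiB, 1 MiB} then gains a SPARSE_EXTENT_STATE_ZEROED record for
+// 6 MiB~1 MiB, while extents entirely below 4 MiB are left untouched.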
+
+void merge_snapshot_delta(const io::SnapIds& snap_ids,
+ io::SnapshotDelta* snapshot_delta) {
+ io::SnapshotDelta orig_snapshot_delta = std::move(*snapshot_delta);
+ snapshot_delta->clear();
+
+ auto snap_id_it = snap_ids.begin();
+ ceph_assert(snap_id_it != snap_ids.end());
+
+ // merge any snapshot intervals that were not requested
+ std::list<io::SparseExtents*> pending_sparse_extents;
+ for (auto& [snap_key, sparse_extents] : orig_snapshot_delta) {
+ // advance to next valid requested snap id
+ while (snap_id_it != snap_ids.end() && *snap_id_it < snap_key.first) {
+ ++snap_id_it;
+ }
+ if (snap_id_it == snap_ids.end()) {
+ break;
+ }
+
+ // loop through older write/read snapshot sparse extents to remove any
+ // overlaps with the current sparse extent
+ for (auto prev_sparse_extents : pending_sparse_extents) {
+ for (auto& sparse_extent : sparse_extents) {
+ prev_sparse_extents->erase(sparse_extent.get_off(),
+ sparse_extent.get_len());
+ }
+ }
+
+ auto write_read_snap_ids = std::make_pair(*snap_id_it, snap_key.second);
+ (*snapshot_delta)[write_read_snap_ids] = std::move(sparse_extents);
+
+ if (write_read_snap_ids.first > snap_key.first) {
+ // the current snapshot wasn't requested so it might need to get
+ // merged with a later snapshot
+ pending_sparse_extents.push_back(&(*snapshot_delta)[write_read_snap_ids]);
+ } else {
+      // we don't merge results past a valid requested snapshot
+ pending_sparse_extents.clear();
+ }
+ }
+}
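+
+// Sketch of the merge (hedged): with requested snap_ids = {2, 5} and deltas
+// recorded under write/read keys {1,1} and {4,4}, both keys are re-mapped to
+// the next requested id ({2,1} and {5,4} respectively), and byte ranges of
+// the earlier re-mapped delta that the later delta covers are erased, so a
+// given byte is reported at most once per requested snapshot.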
+
+} // namespace util
+} // namespace migration
+} // namespace librbd
diff --git a/src/librbd/migration/Utils.h b/src/librbd/migration/Utils.h
new file mode 100644
index 000000000..afbadde7d
--- /dev/null
+++ b/src/librbd/migration/Utils.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_UTILS_H
+#define CEPH_LIBRBD_MIGRATION_UTILS_H
+
+#include "include/common_fwd.h"
+#include "librbd/io/Types.h"
+#include "librbd/migration/Types.h"
+#include <optional>
+#include <string>
+
+namespace librbd {
+namespace migration {
+namespace util {
+
+int parse_url(CephContext* cct, const std::string& url, UrlSpec* url_spec);
+
+void zero_shrunk_snapshot(CephContext* cct, const io::Extents& image_extents,
+ uint64_t snap_id, uint64_t new_size,
+ std::optional<uint64_t> *previous_size,
+ io::SparseExtents* sparse_extents);
+void merge_snapshot_delta(const io::SnapIds& snap_ids,
+ io::SnapshotDelta* snapshot_delta);
+
+} // namespace util
+} // namespace migration
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIGRATION_UTILS_H
diff --git a/src/librbd/mirror/DemoteRequest.cc b/src/librbd/mirror/DemoteRequest.cc
new file mode 100644
index 000000000..350a76d83
--- /dev/null
+++ b/src/librbd/mirror/DemoteRequest.cc
@@ -0,0 +1,216 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/DemoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "librbd/mirror/snapshot/DemoteRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::DemoteRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+void DemoteRequest<I>::send() {
+ get_info();
+}
+
+template <typename I>
+void DemoteRequest<I>::get_info() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_get_info>(this);
+ auto req = GetInfoRequest<I>::create(m_image_ctx, &m_mirror_image,
+ &m_promotion_state,
+ &m_primary_mirror_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_get_info(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ } else if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ lderr(cct) << "mirroring is not currently enabled" << dendl;
+ finish(-EINVAL);
+ return;
+ } else if (m_promotion_state != PROMOTION_STATE_PRIMARY) {
+ lderr(cct) << "image is not primary" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ acquire_lock();
+}
+
+template <typename I>
+void DemoteRequest<I>::acquire_lock() {
+ CephContext *cct = m_image_ctx.cct;
+
+ m_image_ctx.owner_lock.lock_shared();
+ if (m_image_ctx.exclusive_lock == nullptr) {
+ m_image_ctx.owner_lock.unlock_shared();
+ if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) {
+ lderr(cct) << "exclusive lock is not active" << dendl;
+ finish(-EINVAL);
+ } else {
+ demote();
+ }
+ return;
+ }
+
+ // avoid accepting new requests from peers while we demote
+ // the image
+ m_image_ctx.exclusive_lock->block_requests(0);
+ m_blocked_requests = true;
+
+ if (m_image_ctx.exclusive_lock->is_lock_owner()) {
+ m_image_ctx.owner_lock.unlock_shared();
+ demote();
+ return;
+ }
+
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>,
+ &DemoteRequest<I>::handle_acquire_lock>(this, m_image_ctx.exclusive_lock);
+ m_image_ctx.exclusive_lock->acquire_lock(ctx);
+ m_image_ctx.owner_lock.unlock_shared();
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_acquire_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to lock image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_image_ctx.owner_lock.lock_shared();
+ if (m_image_ctx.exclusive_lock != nullptr &&
+ !m_image_ctx.exclusive_lock->is_lock_owner()) {
+ r = m_image_ctx.exclusive_lock->get_unlocked_op_error();
+ m_image_ctx.owner_lock.unlock_shared();
+ lderr(cct) << "failed to acquire exclusive lock" << dendl;
+ finish(r);
+ return;
+ }
+ m_image_ctx.owner_lock.unlock_shared();
+
+ demote();
+}
+
+template <typename I>
+void DemoteRequest<I>::demote() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_demote>(this);
+ if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) {
+ Journal<I>::demote(&m_image_ctx, ctx);
+ } else if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ auto req = mirror::snapshot::DemoteRequest<I>::create(
+ &m_image_ctx, m_mirror_image.global_image_id, ctx);
+ req->send();
+ } else {
+ lderr(cct) << "unknown image mirror mode: " << m_mirror_image.mode << dendl;
+ m_ret_val = -EOPNOTSUPP;
+ release_lock();
+ }
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_demote(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to demote image: " << cpp_strerror(r) << dendl;
+ }
+
+ release_lock();
+}
+
+template <typename I>
+void DemoteRequest<I>::release_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ m_image_ctx.owner_lock.lock_shared();
+ if (m_image_ctx.exclusive_lock == nullptr) {
+ m_image_ctx.owner_lock.unlock_shared();
+ finish(0);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>,
+ &DemoteRequest<I>::handle_release_lock>(this, m_image_ctx.exclusive_lock);
+ m_image_ctx.exclusive_lock->release_lock(ctx);
+ m_image_ctx.owner_lock.unlock_shared();
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_release_lock(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void DemoteRequest<I>::finish(int r) {
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ }
+
+ {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ if (m_blocked_requests && m_image_ctx.exclusive_lock != nullptr) {
+ m_image_ctx.exclusive_lock->unblock_requests();
+ }
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
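+
+// Typical call site (sketch; ictx and on_finish are hypothetical):
+//   auto req = librbd::mirror::DemoteRequest<>::create(*ictx, on_finish);
+//   req->send();  // on_finish->complete(r) fires once the demotion settles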
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::DemoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/DemoteRequest.h b/src/librbd/mirror/DemoteRequest.h
new file mode 100644
index 000000000..ab9239068
--- /dev/null
+++ b/src/librbd/mirror/DemoteRequest.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class DemoteRequest {
+public:
+ static DemoteRequest *create(ImageCtxT &image_ctx, Context *on_finish) {
+ return new DemoteRequest(image_ctx, on_finish);
+ }
+
+ DemoteRequest(ImageCtxT &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_INFO
+ * |
+ * v
+ * ACQUIRE_LOCK * * * *
+ * | *
+ * v *
+ * DEMOTE *
+ * | *
+ * v *
+ * RELEASE_LOCK *
+ * | *
+ * v *
+ * <finish> < * * * * *
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ int m_ret_val = 0;
+ bool m_blocked_requests = false;
+
+ cls::rbd::MirrorImage m_mirror_image;
+ PromotionState m_promotion_state = PROMOTION_STATE_PRIMARY;
+ std::string m_primary_mirror_uuid;
+
+ void get_info();
+ void handle_get_info(int r);
+
+ void acquire_lock();
+ void handle_acquire_lock(int r);
+
+ void demote();
+ void handle_demote(int r);
+
+ void release_lock();
+ void handle_release_lock(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::DemoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H
diff --git a/src/librbd/mirror/DisableRequest.cc b/src/librbd/mirror/DisableRequest.cc
new file mode 100644
index 000000000..09378ce58
--- /dev/null
+++ b/src/librbd/mirror/DisableRequest.cc
@@ -0,0 +1,479 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/DisableRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/journal/cls_journal_client.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/PromoteRequest.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "librbd/mirror/ImageRemoveRequest.h"
+#include "librbd/mirror/ImageStateUpdateRequest.h"
+#include "librbd/mirror/snapshot/PromoteRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::DisableRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using util::create_rados_callback;
+
+template <typename I>
+DisableRequest<I>::DisableRequest(I *image_ctx, bool force, bool remove,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_force(force), m_remove(remove),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void DisableRequest<I>::send() {
+ send_get_mirror_info();
+}
+
+template <typename I>
+void DisableRequest<I>::send_get_mirror_info() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = DisableRequest<I>;
+ Context *ctx = util::create_context_callback<
+ klass, &klass::handle_get_mirror_info>(this);
+
+ auto req = GetInfoRequest<I>::create(*m_image_ctx, &m_mirror_image,
+ &m_promotion_state,
+ &m_primary_mirror_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_get_mirror_info(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ if (*result == -ENOENT) {
+ ldout(cct, 20) << "mirroring is not enabled for this image" << dendl;
+ *result = 0;
+ } else {
+ lderr(cct) << "failed to get mirroring info: " << cpp_strerror(*result)
+ << dendl;
+ }
+ return m_on_finish;
+ }
+
+ m_is_primary = (m_promotion_state == PROMOTION_STATE_PRIMARY ||
+ m_promotion_state == PROMOTION_STATE_UNKNOWN);
+
+ if (!m_is_primary && !m_force) {
+ lderr(cct) << "mirrored image is not primary, "
+ << "add force option to disable mirroring" << dendl;
+ *result = -EINVAL;
+ return m_on_finish;
+ }
+
+ send_image_state_update();
+ return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::send_image_state_update() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ DisableRequest<I>,
+ &DisableRequest<I>::handle_image_state_update>(this);
+ auto req = ImageStateUpdateRequest<I>::create(
+ m_image_ctx->md_ctx, m_image_ctx->id,
+ cls::rbd::MIRROR_IMAGE_STATE_DISABLING, m_mirror_image, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_image_state_update(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to disable mirroring: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ send_promote_image();
+ return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::send_promote_image() {
+ if (m_is_primary) {
+ clean_mirror_state();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ DisableRequest<I>, &DisableRequest<I>::handle_promote_image>(this);
+ if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) {
+ // Not primary -- shouldn't have the journal open
+ ceph_assert(m_image_ctx->journal == nullptr);
+
+ auto req = journal::PromoteRequest<I>::create(m_image_ctx, true, ctx);
+ req->send();
+ } else if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ auto req = mirror::snapshot::PromoteRequest<I>::create(
+ m_image_ctx, m_mirror_image.global_image_id, ctx);
+ req->send();
+ } else {
+ lderr(cct) << "unknown image mirror mode: " << m_mirror_image.mode << dendl;
+ ctx->complete(-EOPNOTSUPP);
+ }
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_promote_image(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to promote image: " << cpp_strerror(*result) << dendl;
+ return m_on_finish;
+ }
+
+ send_refresh_image();
+ return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::send_refresh_image() {
+ if (!m_image_ctx->state->is_refresh_required()) {
+ clean_mirror_state();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ DisableRequest<I>,
+ &DisableRequest<I>::handle_refresh_image>(this);
+ m_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_refresh_image(int* result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(*result) << dendl;
+ return m_on_finish;
+ }
+
+ clean_mirror_state();
+ return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::clean_mirror_state() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ remove_mirror_snapshots();
+ } else {
+ send_get_clients();
+ }
+}
+
+template <typename I>
+void DisableRequest<I>::send_get_clients() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = DisableRequest<I>;
+ Context *ctx = util::create_context_callback<
+ klass, &klass::handle_get_clients>(this);
+
+ std::string header_oid = ::journal::Journaler::header_oid(m_image_ctx->id);
+ m_clients.clear();
+ cls::journal::client::client_list(m_image_ctx->md_ctx, header_oid, &m_clients,
+ ctx);
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_get_clients(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << *result << dendl;
+
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_current_ops.empty());
+
+ if (*result < 0) {
+ lderr(cct) << "failed to get registered clients: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ for (auto client : m_clients) {
+ journal::ClientData client_data;
+ auto bl_it = client.data.cbegin();
+ try {
+ using ceph::decode;
+ decode(client_data, bl_it);
+ } catch (const buffer::error &err) {
+ lderr(cct) << "failed to decode client data" << dendl;
+ m_error_result = -EBADMSG;
+ continue;
+ }
+
+ journal::ClientMetaType type = client_data.get_client_meta_type();
+ if (type != journal::ClientMetaType::MIRROR_PEER_CLIENT_META_TYPE) {
+ continue;
+ }
+
+ if (m_current_ops.find(client.id) != m_current_ops.end()) {
+ // Should not happen.
+ lderr(cct) << "clients with the same id "
+ << client.id << dendl;
+ continue;
+ }
+
+ m_current_ops[client.id] = 0;
+ m_ret[client.id] = 0;
+
+ journal::MirrorPeerClientMeta client_meta =
+ boost::get<journal::MirrorPeerClientMeta>(client_data.client_meta);
+
+ for (const auto& sync : client_meta.sync_points) {
+ send_remove_snap(client.id, sync.snap_namespace, sync.snap_name);
+ }
+
+ if (m_current_ops[client.id] == 0) {
+ // no snaps to remove
+ send_unregister_client(client.id);
+ }
+ }
+
+ if (m_current_ops.empty()) {
+ if (m_error_result < 0) {
+ *result = m_error_result;
+ return m_on_finish;
+ } else if (!m_remove) {
+ return m_on_finish;
+ }
+ locker.unlock();
+
+ // no mirror clients to unregister
+ send_remove_mirror_image();
+ }
+
+ return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::remove_mirror_snapshots() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ // remove snapshot-based mirroring snapshots
+ bool removing_snapshots = false;
+ {
+ std::lock_guard locker{m_lock};
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+
+ for (auto &it : m_image_ctx->snap_info) {
+ auto &snap_info = it.second;
+ auto type = cls::rbd::get_snap_namespace_type(
+ snap_info.snap_namespace);
+ if (type == cls::rbd::SNAPSHOT_NAMESPACE_TYPE_MIRROR) {
+ send_remove_snap("", snap_info.snap_namespace, snap_info.name);
+ removing_snapshots = true;
+ }
+ }
+ }
+
+ if (!removing_snapshots) {
+ send_remove_mirror_image();
+ }
+}
+
+template <typename I>
+void DisableRequest<I>::send_remove_snap(
+ const std::string &client_id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "client_id=" << client_id
+ << ", snap_name=" << snap_name << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ m_current_ops[client_id]++;
+
+ Context *ctx = create_context_callback(
+ &DisableRequest<I>::handle_remove_snap, client_id);
+
+ ctx = new LambdaContext([this, snap_namespace, snap_name, ctx](int r) {
+ m_image_ctx->operations->snap_remove(snap_namespace,
+ snap_name.c_str(),
+ ctx);
+ });
+
+ m_image_ctx->op_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_remove_snap(int *result,
+ const std::string &client_id) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << *result << dendl;
+
+ std::unique_lock locker{m_lock};
+
+ ceph_assert(m_current_ops[client_id] > 0);
+ m_current_ops[client_id]--;
+
+ if (*result < 0 && *result != -ENOENT) {
+ lderr(cct) << "failed to remove mirroring snapshot: "
+ << cpp_strerror(*result) << dendl;
+ m_ret[client_id] = *result;
+ }
+
+ if (m_current_ops[client_id] == 0) {
+ if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ ceph_assert(client_id.empty());
+ m_current_ops.erase(client_id);
+ if (m_ret[client_id] < 0) {
+ return m_on_finish;
+ }
+ locker.unlock();
+
+ send_remove_mirror_image();
+ return nullptr;
+ }
+
+ send_unregister_client(client_id);
+ }
+
+ return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::send_unregister_client(
+ const std::string &client_id) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_current_ops[client_id] == 0);
+
+ Context *ctx = create_context_callback(
+ &DisableRequest<I>::handle_unregister_client, client_id);
+
+ if (m_ret[client_id] < 0) {
+ m_image_ctx->op_work_queue->queue(ctx, m_ret[client_id]);
+ return;
+ }
+
+ librados::ObjectWriteOperation op;
+ cls::journal::client::client_unregister(&op, client_id);
+ std::string header_oid = ::journal::Journaler::header_oid(m_image_ctx->id);
+ librados::AioCompletion *comp = create_rados_callback(ctx);
+
+ int r = m_image_ctx->md_ctx.aio_operate(header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_unregister_client(
+ int *result, const std::string &client_id) {
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << *result << dendl;
+
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_current_ops[client_id] == 0);
+ m_current_ops.erase(client_id);
+
+ if (*result < 0 && *result != -ENOENT) {
+ lderr(cct) << "failed to unregister remote journal client: "
+ << cpp_strerror(*result) << dendl;
+ m_error_result = *result;
+ }
+
+ if (!m_current_ops.empty()) {
+ return nullptr;
+ }
+
+ if (m_error_result < 0) {
+ *result = m_error_result;
+ return m_on_finish;
+ }
+ locker.unlock();
+
+ send_get_clients();
+ return nullptr;
+}
+
+template <typename I>
+void DisableRequest<I>::send_remove_mirror_image() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ DisableRequest<I>,
+ &DisableRequest<I>::handle_remove_mirror_image>(this);
+ auto req = ImageRemoveRequest<I>::create(
+ m_image_ctx->md_ctx, m_mirror_image.global_image_id, m_image_ctx->id,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+Context *DisableRequest<I>::handle_remove_mirror_image(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to remove mirror image: " << cpp_strerror(*result)
+ << dendl;
+ return m_on_finish;
+ }
+
+ ldout(cct, 20) << "removed image state from rbd_mirroring object" << dendl;
+ return m_on_finish;
+}
+
+template <typename I>
+Context *DisableRequest<I>::create_context_callback(
+ Context*(DisableRequest<I>::*handle)(int*, const std::string &client_id),
+ const std::string &client_id) {
+
+ return new LambdaContext([this, handle, client_id](int r) {
+ Context *on_finish = (this->*handle)(&r, client_id);
+ if (on_finish != nullptr) {
+ on_finish->complete(r);
+ delete this;
+ }
+ });
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::DisableRequest<librbd::ImageCtx>;
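
Unlike DemoteRequest, whose handlers call finish() directly, DisableRequest handlers return a Context*: nullptr keeps the state machine running, while returning m_on_finish tells the LambdaContext built by create_context_callback() to complete the caller and delete the request. A handler in this convention looks roughly like this (handle_step and next_step are illustrative names, not new handlers in this patch):

    Context *DisableRequest<I>::handle_step(int *result) {
      if (*result < 0) {
        return m_on_finish;  // done: caller completed, request self-deletes
      }
      next_step();
      return nullptr;        // keep the state machine running
    }
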
diff --git a/src/librbd/mirror/DisableRequest.h b/src/librbd/mirror/DisableRequest.h
new file mode 100644
index 000000000..f45d1a14c
--- /dev/null
+++ b/src/librbd/mirror/DisableRequest.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
+
+#include "include/buffer.h"
+#include "common/ceph_mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = ImageCtx>
+class DisableRequest {
+public:
+ static DisableRequest *create(ImageCtxT *image_ctx, bool force,
+ bool remove, Context *on_finish) {
+ return new DisableRequest(image_ctx, force, remove, on_finish);
+ }
+
+ DisableRequest(ImageCtxT *image_ctx, bool force, bool remove,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_INFO * * * * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * v *
+ * IMAGE_STATE_UPDATE * * * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * v *
+ * PROMOTE_IMAGE (skip if primary) *
+ * | *
+ * v *
+ * REFRESH_IMAGE (skip if not needed) *
+ * | *
+ * v *
+ * GET_CLIENTS <----------------------------------------\ * * * *
+ * | | (unregister clients) | * (on error)
+ * | |/----------------------------\ | *
+ * | | | | *
+ * | | /-----------\ (repeat | (repeat | (repeat
+ * | | | | as needed) | as needed) | as needed)
+ * | v v | | | *
+ * | REMOVE_SYNC_SNAP --/ * * * * * * | * * * * * * | * * * *
+ * | | | | *
+ * | v | | *
+ * | UNREGISTER_CLIENT ---------------/-------------/ * * * *
+ * | *
+ * | (no more clients *
+ * | to unregister) *
+ * v *
+ * REMOVE_MIRROR_IMAGE * * * * * * * * * * * * * * * * * * * * *
+ * | (skip if no remove) *
+ * v *
+ * <finish> < * * * * * * * * * * * * * * * * * * * * * * * * * *
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ bool m_force;
+ bool m_remove;
+ Context *m_on_finish;
+
+ bool m_is_primary = false;
+ cls::rbd::MirrorImage m_mirror_image;
+ PromotionState m_promotion_state = PROMOTION_STATE_NON_PRIMARY;
+ std::string m_primary_mirror_uuid;
+ std::set<cls::journal::Client> m_clients;
+ std::map<std::string, int> m_ret;
+ std::map<std::string, int> m_current_ops;
+ int m_error_result = 0;
+ mutable ceph::mutex m_lock =
+ ceph::make_mutex("mirror::DisableRequest::m_lock");
+
+ void send_get_mirror_info();
+ Context *handle_get_mirror_info(int *result);
+
+ void send_image_state_update();
+ Context *handle_image_state_update(int *result);
+
+ void send_promote_image();
+ Context *handle_promote_image(int *result);
+
+ void send_refresh_image();
+ Context* handle_refresh_image(int* result);
+
+ void clean_mirror_state();
+
+ void send_get_clients();
+ Context *handle_get_clients(int *result);
+
+ void remove_mirror_snapshots();
+
+ void send_remove_snap(const std::string &client_id,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name);
+ Context *handle_remove_snap(int *result, const std::string &client_id);
+
+ void send_unregister_client(const std::string &client_id);
+ Context *handle_unregister_client(int *result, const std::string &client_id);
+
+ void send_remove_mirror_image();
+ Context *handle_remove_mirror_image(int *result);
+
+ Context *create_context_callback(
+ Context*(DisableRequest<ImageCtxT>::*handle)(
+ int*, const std::string &client_id),
+ const std::string &client_id);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::DisableRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
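
handle_get_clients() and handle_remove_snap() coordinate through m_current_ops, a per-client counter of in-flight snapshot removals guarded by m_lock; a client is unregistered only once its counter drops back to zero. Stripped of the surrounding state machine, the accounting reduces to this sketch (illustrative names):

    std::map<std::string, int> current_ops;  // client id -> in-flight ops

    void op_started(const std::string &id)  { ++current_ops[id]; }
    void op_finished(const std::string &id) {
      if (--current_ops[id] == 0) {
        // last outstanding op for this client: advance the state machine,
        // e.g. unregister the journal client
      }
    }
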
diff --git a/src/librbd/mirror/EnableRequest.cc b/src/librbd/mirror/EnableRequest.cc
new file mode 100644
index 000000000..fd74a25ba
--- /dev/null
+++ b/src/librbd/mirror/EnableRequest.cc
@@ -0,0 +1,329 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/EnableRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/ImageStateUpdateRequest.h"
+#include "librbd/mirror/snapshot/CreatePrimaryRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::EnableRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+EnableRequest<I>::EnableRequest(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ I* image_ctx,
+ cls::rbd::MirrorImageMode mode,
+ const std::string &non_primary_global_image_id,
+ bool image_clean,
+ asio::ContextWQ *op_work_queue,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id), m_image_ctx(image_ctx),
+ m_mode(mode), m_non_primary_global_image_id(non_primary_global_image_id),
+ m_image_clean(image_clean), m_op_work_queue(op_work_queue),
+ m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())) {
+}
+
+template <typename I>
+void EnableRequest<I>::send() {
+ get_mirror_image();
+}
+
+template <typename I>
+void EnableRequest<I>::get_mirror_image() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_image_get_start(&op, m_image_id);
+
+ using klass = EnableRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_get_mirror_image>(this);
+ m_out_bl.clear();
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void EnableRequest<I>::handle_get_mirror_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::mirror_image_get_finish(&iter, &m_mirror_image);
+ }
+
+ if (r == 0 && m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_CREATING &&
+ !m_non_primary_global_image_id.empty()) {
+ // special case where rbd-mirror injects a disabled record to record the
+ // local image id prior to creating the image
+ ldout(m_cct, 10) << "enabling mirroring on in-progress image replication"
+ << dendl;
+ } else if (r == 0) {
+ if (m_mirror_image.mode != m_mode) {
+ lderr(m_cct) << "invalid current image mirror mode" << dendl;
+ r = -EINVAL;
+ } else if (m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ ldout(m_cct, 10) << "mirroring is already enabled" << dendl;
+ } else {
+ lderr(m_cct) << "currently disabling" << dendl;
+ r = -EINVAL;
+ }
+ finish(r);
+ return;
+ } else if (r != -ENOENT) {
+ lderr(m_cct) << "failed to retrieve mirror image: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ r = 0;
+ m_mirror_image.mode = m_mode;
+ if (m_non_primary_global_image_id.empty()) {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+ m_mirror_image.global_image_id = uuid_gen.to_string();
+ } else {
+ m_mirror_image.global_image_id = m_non_primary_global_image_id;
+ }
+
+ get_tag_owner();
+}
+
+template <typename I>
+void EnableRequest<I>::get_tag_owner() {
+ if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ open_image();
+ return;
+ } else if (!m_non_primary_global_image_id.empty()) {
+ image_state_update();
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ using klass = EnableRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_get_tag_owner>(this);
+ librbd::Journal<>::is_tag_owner(m_io_ctx, m_image_id, &m_is_primary,
+ m_op_work_queue, ctx);
+}
+
+template <typename I>
+void EnableRequest<I>::handle_get_tag_owner(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to check tag ownership: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (!m_is_primary) {
+ lderr(m_cct) << "last journal tag not owned by local cluster" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ image_state_update();
+}
+
+template <typename I>
+void EnableRequest<I>::open_image() {
+ if (!m_non_primary_global_image_id.empty()) {
+ // special case for rbd-mirror creating a non-primary image
+ enable_non_primary_feature();
+ return;
+ } else if (m_image_ctx != nullptr) {
+ create_primary_snapshot();
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ m_close_image = true;
+ m_image_ctx = I::create("", m_image_id, CEPH_NOSNAP, m_io_ctx, false);
+
+ auto ctx = create_context_callback<
+ EnableRequest<I>, &EnableRequest<I>::handle_open_image>(this);
+ m_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT |
+ OPEN_FLAG_IGNORE_MIGRATING, ctx);
+}
+
+template <typename I>
+void EnableRequest<I>::handle_open_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+ m_image_ctx = nullptr;
+ finish(r);
+ return;
+ }
+
+ create_primary_snapshot();
+}
+
+template <typename I>
+void EnableRequest<I>::create_primary_snapshot() {
+ ldout(m_cct, 10) << dendl;
+
+ ceph_assert(m_image_ctx != nullptr);
+ uint64_t snap_create_flags;
+ int r = util::snap_create_flags_api_to_internal(
+ m_cct, util::get_default_snap_create_flags(m_image_ctx),
+ &snap_create_flags);
+ ceph_assert(r == 0);
+ auto ctx = create_context_callback<
+ EnableRequest<I>,
+ &EnableRequest<I>::handle_create_primary_snapshot>(this);
+ auto req = snapshot::CreatePrimaryRequest<I>::create(
+ m_image_ctx, m_mirror_image.global_image_id,
+ (m_image_clean ? 0 : CEPH_NOSNAP), snap_create_flags,
+ snapshot::CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS, &m_snap_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void EnableRequest<I>::handle_create_primary_snapshot(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to create initial primary snapshot: "
+ << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ }
+
+ close_image();
+}
+
+template <typename I>
+void EnableRequest<I>::close_image() {
+ if (!m_close_image) {
+ if (m_ret_val < 0) {
+ finish(m_ret_val);
+ } else {
+ image_state_update();
+ }
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = create_context_callback<
+ EnableRequest<I>, &EnableRequest<I>::handle_close_image>(this);
+ m_image_ctx->state->close(ctx);
+}
+
+template <typename I>
+void EnableRequest<I>::handle_close_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_image_ctx = nullptr;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to close image: " << cpp_strerror(r) << dendl;
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ }
+
+ if (m_ret_val < 0) {
+ finish(m_ret_val);
+ return;
+ }
+
+ image_state_update();
+}
+
+template <typename I>
+void EnableRequest<I>::enable_non_primary_feature() {
+ if (m_mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ image_state_update();
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ // ensure image is flagged with non-primary feature so that
+ // standard RBD clients cannot write to it.
+ librados::ObjectWriteOperation op;
+ cls_client::set_features(&op, RBD_FEATURE_NON_PRIMARY,
+ RBD_FEATURE_NON_PRIMARY);
+
+ auto aio_comp = create_rados_callback<
+ EnableRequest<I>,
+ &EnableRequest<I>::handle_enable_non_primary_feature>(this);
+ int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void EnableRequest<I>::handle_enable_non_primary_feature(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to enable non-primary feature: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void EnableRequest<I>::image_state_update() {
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = create_context_callback<
+ EnableRequest<I>, &EnableRequest<I>::handle_image_state_update>(this);
+ auto req = ImageStateUpdateRequest<I>::create(
+ m_io_ctx, m_image_id, cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
+ m_mirror_image, ctx);
+ req->send();
+}
+
+template <typename I>
+void EnableRequest<I>::handle_image_state_update(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void EnableRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::EnableRequest<librbd::ImageCtx>;
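
get_mirror_image() above shows the paired start/finish convention these requests use for reading cls-managed state: a *_get_start() call queues the read into an ObjectReadOperation, aio_operate() fills an output bufferlist, and the completion handler decodes it with the matching *_get_finish(). The skeleton, using the same calls as above:

    librados::ObjectReadOperation op;
    cls_client::mirror_image_get_start(&op, image_id);  // queue the read

    // comp routes the completion back into the request's handler
    bufferlist out_bl;
    int r = io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &out_bl);
    ceph_assert(r == 0);  // aio_operate only queues; errors arrive via comp
    comp->release();

    // later, inside the completion handler:
    auto iter = out_bl.cbegin();
    r = cls_client::mirror_image_get_finish(&iter, &mirror_image);
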
diff --git a/src/librbd/mirror/EnableRequest.h b/src/librbd/mirror/EnableRequest.h
new file mode 100644
index 000000000..391028e6e
--- /dev/null
+++ b/src/librbd/mirror/EnableRequest.h
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "include/rbd/librbd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/mirror/Types.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace mirror {
+
+template <typename ImageCtxT = ImageCtx>
+class EnableRequest {
+public:
+ static EnableRequest *create(ImageCtxT *image_ctx,
+ cls::rbd::MirrorImageMode mode,
+ const std::string &non_primary_global_image_id,
+ bool image_clean, Context *on_finish) {
+ return new EnableRequest(image_ctx->md_ctx, image_ctx->id, image_ctx, mode,
+ non_primary_global_image_id, image_clean,
+ image_ctx->op_work_queue, on_finish);
+ }
+ static EnableRequest *create(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ cls::rbd::MirrorImageMode mode,
+ const std::string &non_primary_global_image_id,
+ bool image_clean, asio::ContextWQ *op_work_queue,
+ Context *on_finish) {
+ return new EnableRequest(io_ctx, image_id, nullptr, mode,
+ non_primary_global_image_id, image_clean,
+ op_work_queue, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_IMAGE * * * * * * *
+ * | * (on error)
+ * v (skip if not needed) *
+ * GET_TAG_OWNER * * * * * * * *
+ * | *
+ * v (skip if not needed) *
+ * OPEN_IMAGE *
+ * | *
+ * v (skip if not needed) *
+ * CREATE_PRIMARY_SNAPSHOT * * *
+ * | *
+ * v (skip if not opened) *
+ * CLOSE_IMAGE *
+ * | *
+ * v (skip if not needed) *
+ * ENABLE_NON_PRIMARY_FEATURE *
+ * | *
+ * v (skip if not needed) *
+ * IMAGE_STATE_UPDATE * * * * * *
+ * | *
+ * v *
+ * <finish> < * * * * * * * * *
+ *
+ * @endverbatim
+ */
+
+ EnableRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+ ImageCtxT* image_ctx, cls::rbd::MirrorImageMode mode,
+ const std::string &non_primary_global_image_id,
+ bool image_clean, asio::ContextWQ *op_work_queue,
+ Context *on_finish);
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_image_id;
+ ImageCtxT* m_image_ctx;
+ cls::rbd::MirrorImageMode m_mode;
+ std::string m_non_primary_global_image_id;
+ bool m_image_clean;
+ asio::ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ CephContext *m_cct = nullptr;
+ bufferlist m_out_bl;
+ cls::rbd::MirrorImage m_mirror_image;
+
+ int m_ret_val = 0;
+ bool m_close_image = false;
+
+ bool m_is_primary = false;
+ uint64_t m_snap_id = CEPH_NOSNAP;
+
+ void get_mirror_image();
+ void handle_get_mirror_image(int r);
+
+ void get_tag_owner();
+ void handle_get_tag_owner(int r);
+
+ void open_image();
+ void handle_open_image(int r);
+
+ void create_primary_snapshot();
+ void handle_create_primary_snapshot(int r);
+
+ void close_image();
+ void handle_close_image(int r);
+
+ void enable_non_primary_feature();
+ void handle_enable_non_primary_feature(int r);
+
+ void image_state_update();
+ void handle_image_state_update(int r);
+
+ void finish(int r);
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::EnableRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H
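
The two create() overloads match the two call sites: enabling mirroring on an already-open image, or enabling by pool and image id without an ImageCtx (in snapshot mode the request then opens and closes the image itself). A hedged usage sketch (io_ctx, image_id, op_work_queue, and on_finish are assumed caller state):

    // with an open image (image_ctx is an ImageCtx pointer):
    librbd::mirror::EnableRequest<>::create(
        image_ctx, cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT, "", true,
        on_finish)->send();

    // by pool and image id, supplying a work queue:
    librbd::mirror::EnableRequest<>::create(
        io_ctx, image_id, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", true,
        op_work_queue, on_finish)->send();
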
diff --git a/src/librbd/mirror/GetInfoRequest.cc b/src/librbd/mirror/GetInfoRequest.cc
new file mode 100644
index 000000000..a7ee64567
--- /dev/null
+++ b/src/librbd/mirror/GetInfoRequest.cc
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/GetInfoRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::GetInfoRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+GetInfoRequest<I>::GetInfoRequest(librados::IoCtx& io_ctx,
+ asio::ContextWQ *op_work_queue,
+ const std::string &image_id,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ std::string* primary_mirror_uuid,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_op_work_queue(op_work_queue), m_image_id(image_id),
+ m_mirror_image(mirror_image), m_promotion_state(promotion_state),
+ m_primary_mirror_uuid(primary_mirror_uuid), m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) {
+}
+
+template <typename I>
+GetInfoRequest<I>::GetInfoRequest(I &image_ctx,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ std::string* primary_mirror_uuid,
+ Context *on_finish)
+ : m_image_ctx(&image_ctx), m_io_ctx(image_ctx.md_ctx),
+ m_op_work_queue(image_ctx.op_work_queue), m_image_id(image_ctx.id),
+ m_mirror_image(mirror_image), m_promotion_state(promotion_state),
+ m_primary_mirror_uuid(primary_mirror_uuid), m_on_finish(on_finish),
+ m_cct(image_ctx.cct) {
+}
+
+template <typename I>
+void GetInfoRequest<I>::send() {
+ get_mirror_image();
+}
+
+template <typename I>
+void GetInfoRequest<I>::get_mirror_image() {
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_image_get_start(&op, m_image_id);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ GetInfoRequest<I>, &GetInfoRequest<I>::handle_get_mirror_image>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void GetInfoRequest<I>::handle_get_mirror_image(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_mirror_image->state = cls::rbd::MIRROR_IMAGE_STATE_DISABLED;
+ *m_promotion_state = PROMOTION_STATE_NON_PRIMARY;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::mirror_image_get_finish(&iter, m_mirror_image);
+ }
+
+ if (r == -ENOENT) {
+ ldout(m_cct, 20) << "mirroring is disabled" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_mirror_image->mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) {
+ get_journal_tag_owner();
+ } else if (m_mirror_image->mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ get_snapcontext();
+ } else {
+ ldout(m_cct, 20) << "unknown mirror image mode: " << m_mirror_image->mode
+ << dendl;
+ finish(-EOPNOTSUPP);
+ }
+}
+
+template <typename I>
+void GetInfoRequest<I>::get_journal_tag_owner() {
+ ldout(m_cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ GetInfoRequest<I>, &GetInfoRequest<I>::handle_get_journal_tag_owner>(this);
+ Journal<I>::get_tag_owner(m_io_ctx, m_image_id, &m_mirror_uuid,
+ m_op_work_queue, ctx);
+}
+
+template <typename I>
+void GetInfoRequest<I>::handle_get_journal_tag_owner(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to determine tag ownership: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_mirror_uuid == Journal<>::LOCAL_MIRROR_UUID) {
+ *m_promotion_state = PROMOTION_STATE_PRIMARY;
+ *m_primary_mirror_uuid = "";
+ } else if (m_mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID) {
+ *m_promotion_state = PROMOTION_STATE_ORPHAN;
+ *m_primary_mirror_uuid = "";
+ } else {
+ *m_primary_mirror_uuid = m_mirror_uuid;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void GetInfoRequest<I>::get_snapcontext() {
+ if (m_image_ctx != nullptr) {
+ {
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ calc_promotion_state(m_image_ctx->snap_info);
+ }
+ finish(0);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::get_snapcontext_start(&op);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ GetInfoRequest<I>, &GetInfoRequest<I>::handle_get_snapcontext>(this);
+ m_out_bl.clear();
+ int r = m_io_ctx.aio_operate(util::header_name(m_image_id), comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void GetInfoRequest<I>::handle_get_snapcontext(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r >= 0) {
+ auto it = m_out_bl.cbegin();
+ r = cls_client::get_snapcontext_finish(&it, &m_snapc);
+ }
+
+ if (r == -ENOENT &&
+ m_mirror_image->state == cls::rbd::MIRROR_IMAGE_STATE_CREATING) {
+ // image doesn't exist but we have a mirror image record for it
+ ldout(m_cct, 10) << "image does not exist for mirror image id "
+ << m_image_id << dendl;
+ *m_promotion_state = PROMOTION_STATE_UNKNOWN;
+ *m_primary_mirror_uuid = "";
+ finish(0);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to get snapcontext: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ get_snapshots();
+}
+
+template <typename I>
+void GetInfoRequest<I>::get_snapshots() {
+ ldout(m_cct, 20) << dendl;
+
+ if (m_snapc.snaps.empty()) {
+ handle_get_snapshots(0);
+ return;
+ }
+
+ librados::ObjectReadOperation op;
+ for (auto snap_id : m_snapc.snaps) {
+ cls_client::snapshot_get_start(&op, snap_id);
+ }
+
+ librados::AioCompletion *comp = create_rados_callback<
+ GetInfoRequest<I>, &GetInfoRequest<I>::handle_get_snapshots>(this);
+ m_out_bl.clear();
+ int r = m_io_ctx.aio_operate(util::header_name(m_image_id), comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void GetInfoRequest<I>::handle_get_snapshots(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ std::map<librados::snap_t, SnapInfo> snap_info;
+
+ auto it = m_out_bl.cbegin();
+ for (auto snap_id : m_snapc.snaps) {
+ cls::rbd::SnapshotInfo snap;
+ if (r >= 0) {
+ r = cls_client::snapshot_get_finish(&it, &snap);
+ }
+ snap_info.emplace(
+ snap_id, SnapInfo(snap.name, snap.snapshot_namespace, 0, {}, 0, 0, {}));
+ }
+
+ if (r == -ENOENT) {
+ // restart
+ get_snapcontext();
+ return;
+ }
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to get snapshots: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ calc_promotion_state(snap_info);
+ finish(0);
+}
+
+template <typename I>
+void GetInfoRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+template <typename I>
+void GetInfoRequest<I>::calc_promotion_state(
+ const std::map<librados::snap_t, SnapInfo> &snap_info) {
+ *m_promotion_state = PROMOTION_STATE_UNKNOWN;
+ *m_primary_mirror_uuid = "";
+
+ for (auto it = snap_info.rbegin(); it != snap_info.rend(); it++) {
+ auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &it->second.snap_namespace);
+
+ if (mirror_ns != nullptr) {
+ switch (mirror_ns->state) {
+ case cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY:
+ *m_promotion_state = PROMOTION_STATE_PRIMARY;
+ break;
+ case cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY:
+ *m_promotion_state = PROMOTION_STATE_NON_PRIMARY;
+ *m_primary_mirror_uuid = mirror_ns->primary_mirror_uuid;
+ break;
+ case cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY_DEMOTED:
+ case cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY_DEMOTED:
+ *m_promotion_state = PROMOTION_STATE_ORPHAN;
+ break;
+ }
+ break;
+ }
+ }
+
+ ldout(m_cct, 10) << "promotion_state=" << *m_promotion_state << ", "
+ << "primary_mirror_uuid=" << *m_primary_mirror_uuid << dendl;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::GetInfoRequest<librbd::ImageCtx>;
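
get_snapshots() batches one snapshot_get_start() per snap id into a single ObjectReadOperation, so every snapshot comes back in one round trip and is decoded sequentially from the same bufferlist; -ENOENT during decode means the cached snap context raced with a snapshot removal, so the request restarts from get_snapcontext(). The batching idea in isolation (a sketch of the calls used above):

    librados::ObjectReadOperation op;
    for (auto snap_id : snapc.snaps) {
      cls_client::snapshot_get_start(&op, snap_id);  // one sub-op per snap
    }
    // ... aio_operate(header_oid, comp, &op, &out_bl) ...

    // replies decode in the order the sub-ops were queued:
    auto it = out_bl.cbegin();
    for (auto snap_id : snapc.snaps) {
      cls::rbd::SnapshotInfo snap;
      r = cls_client::snapshot_get_finish(&it, &snap);
      if (r < 0) break;  // -ENOENT: raced with snap removal; restart
    }
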
diff --git a/src/librbd/mirror/GetInfoRequest.h b/src/librbd/mirror/GetInfoRequest.h
new file mode 100644
index 000000000..dcc6da7da
--- /dev/null
+++ b/src/librbd/mirror/GetInfoRequest.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H
+
+#include "common/snap_types.h"
+#include "include/buffer.h"
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+#include "librbd/mirror/Types.h"
+#include <string>
+
+struct Context;
+
+namespace cls { namespace rbd { struct MirrorImage; } }
+
+namespace librbd {
+
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class GetInfoRequest {
+public:
+ static GetInfoRequest *create(librados::IoCtx &io_ctx,
+ asio::ContextWQ *op_work_queue,
+ const std::string &image_id,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ std::string* primary_mirror_uuid,
+ Context *on_finish) {
+ return new GetInfoRequest(io_ctx, op_work_queue, image_id, mirror_image,
+ promotion_state, primary_mirror_uuid, on_finish);
+ }
+ static GetInfoRequest *create(ImageCtxT &image_ctx,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ std::string* primary_mirror_uuid,
+ Context *on_finish) {
+ return new GetInfoRequest(image_ctx, mirror_image, promotion_state,
+ primary_mirror_uuid, on_finish);
+ }
+
+ GetInfoRequest(librados::IoCtx& io_ctx, asio::ContextWQ *op_work_queue,
+ const std::string &image_id,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ std::string* primary_mirror_uuid, Context *on_finish);
+ GetInfoRequest(ImageCtxT &image_ctx, cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ std::string* primary_mirror_uuid, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_IMAGE
+ * |
+ * (journal /--------/ \--------\ (snapshot
+ * mode) | | mode)
+ * v v
+ * GET_JOURNAL_TAG_OWNER GET_SNAPCONTEXT (skip if
+ * | | cached)
+ * | v
+ * | GET_SNAPSHOTS (skip if
+ * | | cached)
+ * \--------\ /--------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx = nullptr;
+ librados::IoCtx &m_io_ctx;
+ asio::ContextWQ *m_op_work_queue;
+ std::string m_image_id;
+ cls::rbd::MirrorImage *m_mirror_image;
+ PromotionState *m_promotion_state;
+ std::string* m_primary_mirror_uuid;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+
+ bufferlist m_out_bl;
+ std::string m_mirror_uuid;
+ ::SnapContext m_snapc;
+
+ void get_mirror_image();
+ void handle_get_mirror_image(int r);
+
+ void get_journal_tag_owner();
+ void handle_get_journal_tag_owner(int r);
+
+ void get_snapcontext();
+ void handle_get_snapcontext(int r);
+
+ void get_snapshots();
+ void handle_get_snapshots(int r);
+
+ void finish(int r);
+
+ void calc_promotion_state(
+ const std::map<librados::snap_t, SnapInfo> &snap_info);
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::GetInfoRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H
+
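GetInfoRequest returns its results through caller-owned out-parameters rather than a result struct, so the destinations must stay alive until on_finish runs (in real callers they are members of the enclosing state machine; the locals below are only for brevity). Usage sketch against the open-image overload (image_ctx and on_finish are assumed caller state):

    cls::rbd::MirrorImage mirror_image;
    librbd::mirror::PromotionState promotion_state;
    std::string primary_mirror_uuid;

    auto req = librbd::mirror::GetInfoRequest<>::create(
        *image_ctx, &mirror_image, &promotion_state, &primary_mirror_uuid,
        on_finish);
    req->send();
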
diff --git a/src/librbd/mirror/GetStatusRequest.cc b/src/librbd/mirror/GetStatusRequest.cc
new file mode 100644
index 000000000..40d4a664b
--- /dev/null
+++ b/src/librbd/mirror/GetStatusRequest.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/GetStatusRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/GetInfoRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::GetStatusRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void GetStatusRequest<I>::send() {
+ *m_mirror_image_status = cls::rbd::MirrorImageStatus(
+ {{cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID,
+ cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN, "status not found"}});
+
+ get_info();
+}
+
+template <typename I>
+void GetStatusRequest<I>::get_info() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ GetStatusRequest<I>, &GetStatusRequest<I>::handle_get_info>(this);
+ auto req = GetInfoRequest<I>::create(m_image_ctx, m_mirror_image,
+ m_promotion_state,
+ &m_primary_mirror_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+void GetStatusRequest<I>::handle_get_info(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+ << dendl;
+ }
+ finish(r);
+ return;
+ } else if (m_mirror_image->state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ finish(0);
+ return;
+ }
+
+ get_status();
+}
+
+template <typename I>
+void GetStatusRequest<I>::get_status() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_image_status_get_start(
+ &op, m_mirror_image->global_image_id);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ GetStatusRequest<I>, &GetStatusRequest<I>::handle_get_status>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void GetStatusRequest<I>::handle_get_status(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::mirror_image_status_get_finish(&iter,
+ m_mirror_image_status);
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirror image status: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void GetStatusRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::GetStatusRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/GetStatusRequest.h b/src/librbd/mirror/GetStatusRequest.h
new file mode 100644
index 000000000..581a0d667
--- /dev/null
+++ b/src/librbd/mirror/GetStatusRequest.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H
+
+#include "include/buffer.h"
+#include "librbd/mirror/Types.h"
+#include <string>
+
+struct Context;
+namespace cls { namespace rbd { struct MirrorImage; } }
+namespace cls { namespace rbd { struct MirrorImageStatus; } }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class GetStatusRequest {
+public:
+ static GetStatusRequest *create(ImageCtxT &image_ctx,
+ cls::rbd::MirrorImageStatus *status,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state,
+ Context *on_finish) {
+ return new GetStatusRequest(image_ctx, status, mirror_image,
+ promotion_state, on_finish);
+ }
+
+ GetStatusRequest(ImageCtxT &image_ctx, cls::rbd::MirrorImageStatus *status,
+ cls::rbd::MirrorImage *mirror_image,
+ PromotionState *promotion_state, Context *on_finish)
+ : m_image_ctx(image_ctx), m_mirror_image_status(status),
+ m_mirror_image(mirror_image), m_promotion_state(promotion_state),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_INFO
+ * |
+ * v
+ * GET_STATUS
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ cls::rbd::MirrorImageStatus *m_mirror_image_status;
+ cls::rbd::MirrorImage *m_mirror_image;
+ PromotionState *m_promotion_state;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+ std::string m_primary_mirror_uuid;
+
+ void get_info();
+ void handle_get_info(int r);
+
+ void get_status();
+ void handle_get_status(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::GetStatusRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H
+
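send() pre-populates *m_mirror_image_status with a local-site UNKNOWN/"status not found" entry, so a missing status object (-ENOENT on the status read) still hands the caller a well-formed answer rather than uninitialized output, for example for an image that no rbd-mirror daemon has ever reported on. Caller-side sketch (image_ctx and on_finish are assumed caller state):

    cls::rbd::MirrorImageStatus status;  // pre-filled to "status not found"
    cls::rbd::MirrorImage mirror_image;
    librbd::mirror::PromotionState promotion_state;

    librbd::mirror::GetStatusRequest<>::create(
        *image_ctx, &status, &mirror_image, &promotion_state,
        on_finish)->send();
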
diff --git a/src/librbd/mirror/GetUuidRequest.cc b/src/librbd/mirror/GetUuidRequest.cc
new file mode 100644
index 000000000..f8209f905
--- /dev/null
+++ b/src/librbd/mirror/GetUuidRequest.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/GetUuidRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::GetUuidRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+GetUuidRequest<I>::GetUuidRequest(
+ librados::IoCtx& io_ctx, std::string* mirror_uuid, Context* on_finish)
+ : m_mirror_uuid(mirror_uuid), m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())) {
+ m_io_ctx.dup(io_ctx);
+ m_io_ctx.set_namespace("");
+}
+
+template <typename I>
+void GetUuidRequest<I>::send() {
+ get_mirror_uuid();
+}
+
+template <typename I>
+void GetUuidRequest<I>::get_mirror_uuid() {
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_uuid_get_start(&op);
+
+ auto aio_comp = create_rados_callback<
+ GetUuidRequest<I>, &GetUuidRequest<I>::handle_get_mirror_uuid>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void GetUuidRequest<I>::handle_get_mirror_uuid(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r >= 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_uuid_get_finish(&it, m_mirror_uuid);
+ if (r >= 0 && m_mirror_uuid->empty()) {
+ r = -ENOENT;
+ }
+ }
+
+ if (r < 0) {
+ if (r == -ENOENT) {
+ ldout(m_cct, 5) << "mirror uuid missing" << dendl;
+ } else {
+ lderr(m_cct) << "failed to retrieve mirror uuid: " << cpp_strerror(r)
+ << dendl;
+ }
+ *m_mirror_uuid = "";
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void GetUuidRequest<I>::finish(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::GetUuidRequest<librbd::ImageCtx>;
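
The constructor duplicates the caller's IoCtx and clears its namespace because the pool-level mirror uuid (read from the RBD_MIRRORING object) always lives in the pool's default namespace, even when the image itself sits in a named one. The two librados calls that matter, as used above (callers_io_ctx is an assumed name):

    librados::IoCtx io_ctx;
    io_ctx.dup(callers_io_ctx);  // independent handle onto the same pool
    io_ctx.set_namespace("");    // escape any caller-selected namespace
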
diff --git a/src/librbd/mirror/GetUuidRequest.h b/src/librbd/mirror/GetUuidRequest.h
new file mode 100644
index 000000000..73cc2d5b2
--- /dev/null
+++ b/src/librbd/mirror/GetUuidRequest.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_GET_UUID_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_GET_UUID_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+
+#include <string>
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class GetUuidRequest {
+public:
+ static GetUuidRequest *create(librados::IoCtx& io_ctx,
+ std::string* mirror_uuid, Context* on_finish) {
+ return new GetUuidRequest(io_ctx, mirror_uuid, on_finish);
+ }
+
+ GetUuidRequest(librados::IoCtx& io_ctx, std::string* mirror_uuid,
+ Context* on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_UUID
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx m_io_ctx;
+ std::string* m_mirror_uuid;
+ Context* m_on_finish;
+
+ CephContext* m_cct;
+
+ bufferlist m_out_bl;
+
+ void get_mirror_uuid();
+ void handle_get_mirror_uuid(int r);
+
+ void finish(int r);
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::GetUuidRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_GET_UUID_REQUEST_H
diff --git a/src/librbd/mirror/ImageRemoveRequest.cc b/src/librbd/mirror/ImageRemoveRequest.cc
new file mode 100644
index 000000000..1aa265dae
--- /dev/null
+++ b/src/librbd/mirror/ImageRemoveRequest.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/ImageRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::ImageRemoveRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using util::create_rados_callback;
+
+template <typename I>
+ImageRemoveRequest<I>::ImageRemoveRequest(
+ librados::IoCtx& io_ctx, const std::string& global_image_id,
+ const std::string& image_id, Context* on_finish)
+ : m_io_ctx(io_ctx), m_global_image_id(global_image_id), m_image_id(image_id),
+ m_on_finish(on_finish), m_cct(static_cast<CephContext*>(m_io_ctx.cct())) {
+}
+
+template <typename I>
+void ImageRemoveRequest<I>::send() {
+ remove_mirror_image();
+}
+
+template <typename I>
+void ImageRemoveRequest<I>::remove_mirror_image() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::mirror_image_remove(&op, m_image_id);
+
+ auto comp = create_rados_callback<
+ ImageRemoveRequest<I>,
+ &ImageRemoveRequest<I>::handle_remove_mirror_image>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ImageRemoveRequest<I>::handle_remove_mirror_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to remove mirroring image: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ notify_mirroring_watcher();
+}
+
+template <typename I>
+void ImageRemoveRequest<I>::notify_mirroring_watcher() {
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ ImageRemoveRequest<I>,
+ &ImageRemoveRequest<I>::handle_notify_mirroring_watcher>(this);
+ MirroringWatcher<I>::notify_image_updated(
+ m_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLED,
+ m_image_id, m_global_image_id, ctx);
+}
+
+template <typename I>
+void ImageRemoveRequest<I>::handle_notify_mirroring_watcher(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to notify mirror image update: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void ImageRemoveRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::ImageRemoveRequest<librbd::ImageCtx>;
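
Note that handle_notify_mirroring_watcher() logs a notify failure but still finishes with 0: removing the record from the rbd_mirroring object is the authoritative action, and the watcher notification is best-effort. Caller-side sketch (the ids and on_finish are assumed caller state):

    librbd::mirror::ImageRemoveRequest<>::create(
        io_ctx, global_image_id, image_id, on_finish)->send();
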
diff --git a/src/librbd/mirror/ImageRemoveRequest.h b/src/librbd/mirror/ImageRemoveRequest.h
new file mode 100644
index 000000000..c04f9fadc
--- /dev/null
+++ b/src/librbd/mirror/ImageRemoveRequest.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_IMAGE_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_IMAGE_REMOVE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "cls/rbd/cls_rbd_types.h"
+
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = ImageCtx>
+class ImageRemoveRequest {
+public:
+ static ImageRemoveRequest *create(librados::IoCtx& io_ctx,
+ const std::string& global_image_id,
+ const std::string& image_id,
+ Context* on_finish) {
+ return new ImageRemoveRequest(io_ctx, global_image_id, image_id, on_finish);
+ }
+
+ ImageRemoveRequest(librados::IoCtx& io_ctx,
+ const std::string& global_image_id,
+ const std::string& image_id,
+ Context* on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * REMOVE_MIRROR_IMAGE
+ * |
+ * v
+ * NOTIFY_MIRRORING_WATCHER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx& m_io_ctx;
+ std::string m_global_image_id;
+ std::string m_image_id;
+ Context* m_on_finish;
+
+ CephContext* m_cct;
+
+ void remove_mirror_image();
+ void handle_remove_mirror_image(int r);
+
+ void notify_mirroring_watcher();
+ void handle_notify_mirroring_watcher(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::ImageRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_IMAGE_REMOVE_REQUEST_H
diff --git a/src/librbd/mirror/ImageStateUpdateRequest.cc b/src/librbd/mirror/ImageStateUpdateRequest.cc
new file mode 100644
index 000000000..98e987190
--- /dev/null
+++ b/src/librbd/mirror/ImageStateUpdateRequest.cc
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/ImageStateUpdateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::ImageStateUpdateRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using util::create_rados_callback;
+
+template <typename I>
+ImageStateUpdateRequest<I>::ImageStateUpdateRequest(
+ librados::IoCtx& io_ctx,
+ const std::string& image_id,
+ cls::rbd::MirrorImageState mirror_image_state,
+ const cls::rbd::MirrorImage& mirror_image,
+ Context* on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id),
+ m_mirror_image_state(mirror_image_state), m_mirror_image(mirror_image),
+ m_on_finish(on_finish), m_cct(static_cast<CephContext*>(m_io_ctx.cct())) {
+ ceph_assert(m_mirror_image_state != cls::rbd::MIRROR_IMAGE_STATE_DISABLED);
+}
+
+template <typename I>
+void ImageStateUpdateRequest<I>::send() {
+ get_mirror_image();
+}
+
+template <typename I>
+void ImageStateUpdateRequest<I>::get_mirror_image() {
+ if (!m_mirror_image.global_image_id.empty()) {
+ set_mirror_image();
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+ librados::ObjectReadOperation op;
+ cls_client::mirror_image_get_start(&op, m_image_id);
+
+ auto comp = create_rados_callback<
+ ImageStateUpdateRequest<I>,
+ &ImageStateUpdateRequest<I>::handle_get_mirror_image>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ImageStateUpdateRequest<I>::handle_get_mirror_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::mirror_image_get_finish(&iter, &m_mirror_image);
+ }
+
+ if (r == -ENOENT) {
+ ldout(m_cct, 20) << "mirroring is disabled" << dendl;
+ finish(0);
+ return;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ set_mirror_image();
+}
+
+template <typename I>
+void ImageStateUpdateRequest<I>::set_mirror_image() {
+ if (m_mirror_image.state == m_mirror_image_state) {
+ finish(0);
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+ m_mirror_image.state = m_mirror_image_state;
+
+ librados::ObjectWriteOperation op;
+ cls_client::mirror_image_set(&op, m_image_id, m_mirror_image);
+
+ auto comp = create_rados_callback<
+ ImageStateUpdateRequest<I>,
+ &ImageStateUpdateRequest<I>::handle_set_mirror_image>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ImageStateUpdateRequest<I>::handle_set_mirror_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to disable mirroring image: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ notify_mirroring_watcher();
+}
+
+template <typename I>
+void ImageStateUpdateRequest<I>::notify_mirroring_watcher() {
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = util::create_context_callback<
+ ImageStateUpdateRequest<I>,
+ &ImageStateUpdateRequest<I>::handle_notify_mirroring_watcher>(this);
+ MirroringWatcher<I>::notify_image_updated(
+ m_io_ctx, m_mirror_image_state, m_image_id, m_mirror_image.global_image_id,
+ ctx);
+}
+
+template <typename I>
+void ImageStateUpdateRequest<I>::handle_notify_mirroring_watcher(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to notify mirror image update: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void ImageStateUpdateRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::ImageStateUpdateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/ImageStateUpdateRequest.h b/src/librbd/mirror/ImageStateUpdateRequest.h
new file mode 100644
index 000000000..9e0affe6a
--- /dev/null
+++ b/src/librbd/mirror/ImageStateUpdateRequest.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_IMAGE_STATE_UPDATE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_IMAGE_STATE_UPDATE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = ImageCtx>
+class ImageStateUpdateRequest {
+public:
+ static ImageStateUpdateRequest *create(
+ librados::IoCtx& io_ctx,
+ const std::string& image_id,
+ cls::rbd::MirrorImageState mirror_image_state,
+ const cls::rbd::MirrorImage& mirror_image,
+ Context* on_finish) {
+ return new ImageStateUpdateRequest(
+ io_ctx, image_id, mirror_image_state, mirror_image, on_finish);
+ }
+
+ ImageStateUpdateRequest(
+ librados::IoCtx& io_ctx,
+ const std::string& image_id,
+ cls::rbd::MirrorImageState mirror_image_state,
+ const cls::rbd::MirrorImage& mirror_image,
+ Context* on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (skip if provided)
+ * GET_MIRROR_IMAGE
+ * |
+ * v
+ * SET_MIRROR_IMAGE
+ * |
+ * v
+ * NOTIFY_MIRRORING_WATCHER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx& m_io_ctx;
+ std::string m_image_id;
+ cls::rbd::MirrorImageState m_mirror_image_state;
+ cls::rbd::MirrorImage m_mirror_image;
+ Context* m_on_finish;
+
+ CephContext* m_cct;
+ bufferlist m_out_bl;
+
+ void get_mirror_image();
+ void handle_get_mirror_image(int r);
+
+ void set_mirror_image();
+ void handle_set_mirror_image(int r);
+
+ void notify_mirroring_watcher();
+ void handle_notify_mirroring_watcher(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::ImageStateUpdateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_IMAGE_STATE_UPDATE_REQUEST_H
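If the caller already holds the cls::rbd::MirrorImage record (its
global_image_id is non-empty), get_mirror_image() skips the read from
RBD_MIRRORING, saving one round trip; note the constructor asserts that
the target state is not MIRROR_IMAGE_STATE_DISABLED. A minimal sketch,
with io_ctx and image_id as placeholders:

  C_SaferCond ctx;
  auto req = librbd::mirror::ImageStateUpdateRequest<librbd::ImageCtx>::create(
      io_ctx, image_id, cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
      {},  // empty MirrorImage: the record is fetched from RBD_MIRRORING
      &ctx);
  req->send();
  int r = ctx.wait();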
diff --git a/src/librbd/mirror/PromoteRequest.cc b/src/librbd/mirror/PromoteRequest.cc
new file mode 100644
index 000000000..b119e4edc
--- /dev/null
+++ b/src/librbd/mirror/PromoteRequest.cc
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/PromoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "librbd/mirror/snapshot/PromoteRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::PromoteRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+void PromoteRequest<I>::send() {
+ get_info();
+}
+
+template <typename I>
+void PromoteRequest<I>::get_info() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_get_info>(this);
+ auto req = GetInfoRequest<I>::create(m_image_ctx, &m_mirror_image,
+ &m_promotion_state,
+ &m_primary_mirror_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_get_info(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ } else if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ lderr(cct) << "mirroring is not currently enabled" << dendl;
+ finish(-EINVAL);
+ return;
+ } else if (m_promotion_state == PROMOTION_STATE_PRIMARY) {
+ lderr(cct) << "image is already primary" << dendl;
+ finish(-EINVAL);
+ return;
+ } else if (m_promotion_state == PROMOTION_STATE_NON_PRIMARY && !m_force) {
+ lderr(cct) << "image is primary within a remote cluster or demotion is not propagated yet"
+ << dendl;
+ finish(-EBUSY);
+ return;
+ }
+
+ promote();
+}
+
+template <typename I>
+void PromoteRequest<I>::promote() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_promote>(this);
+ if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) {
+ Journal<I>::promote(&m_image_ctx, ctx);
+ } else if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ auto req = mirror::snapshot::PromoteRequest<I>::create(
+ &m_image_ctx, m_mirror_image.global_image_id, ctx);
+ req->send();
+ } else {
+ lderr(cct) << "unknown image mirror mode: " << m_mirror_image.mode << dendl;
+ finish(-EOPNOTSUPP);
+ }
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_promote(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to promote image: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void PromoteRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::PromoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/PromoteRequest.h b/src/librbd/mirror/PromoteRequest.h
new file mode 100644
index 000000000..c54f3bb76
--- /dev/null
+++ b/src/librbd/mirror/PromoteRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PromoteRequest {
+public:
+ static PromoteRequest *create(ImageCtxT &image_ctx, bool force,
+ Context *on_finish) {
+ return new PromoteRequest(image_ctx, force, on_finish);
+ }
+
+ PromoteRequest(ImageCtxT &image_ctx, bool force, Context *on_finish)
+ : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_INFO
+ * |
+ * v
+ * PROMOTE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ bool m_force;
+ Context *m_on_finish;
+
+ cls::rbd::MirrorImage m_mirror_image;
+ PromotionState m_promotion_state = PROMOTION_STATE_PRIMARY;
+ std::string m_primary_mirror_uuid;
+
+ void get_info();
+ void handle_get_info(int r);
+
+ void promote();
+ void handle_promote(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::PromoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H
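Per handle_get_info() above, promotion fails with -EINVAL when mirroring
is disabled or the image is already primary, and with -EBUSY when the
image is non-primary and force was not requested; promote() then
dispatches to the journal-based or snapshot-based implementation. A
minimal blocking sketch, where ictx stands in for an open image context:

  C_SaferCond ctx;
  auto req = librbd::mirror::PromoteRequest<librbd::ImageCtx>::create(
      *ictx, true /*force*/, &ctx);
  req->send();
  int r = ctx.wait();  // e.g. -EOPNOTSUPP for an unknown mirror image mode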
diff --git a/src/librbd/mirror/Types.h b/src/librbd/mirror/Types.h
new file mode 100644
index 000000000..2388b74ef
--- /dev/null
+++ b/src/librbd/mirror/Types.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_TYPES_H
+#define CEPH_LIBRBD_MIRROR_TYPES_H
+
+namespace librbd {
+namespace mirror {
+
+enum PromotionState {
+ PROMOTION_STATE_UNKNOWN,
+ PROMOTION_STATE_PRIMARY,
+ PROMOTION_STATE_NON_PRIMARY,
+ PROMOTION_STATE_ORPHAN
+};
+
+} // namespace mirror
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIRROR_TYPES_H
+
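Loosely, these values describe the local image's role: PRIMARY is a
promoted, writable copy, NON_PRIMARY is a replication target, and ORPHAN
marks a demoted copy without a current primary counterpart. A purely
hypothetical helper (not part of this diff) that maps the enum to debug
labels:

  // hypothetical helper, for illustration only
  inline const char* promotion_state_name(librbd::mirror::PromotionState s) {
    switch (s) {
    case librbd::mirror::PROMOTION_STATE_UNKNOWN:     return "unknown";
    case librbd::mirror::PROMOTION_STATE_PRIMARY:     return "primary";
    case librbd::mirror::PROMOTION_STATE_NON_PRIMARY: return "non-primary";
    case librbd::mirror::PROMOTION_STATE_ORPHAN:      return "orphan";
    }
    return "invalid";
  }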
diff --git a/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.cc b/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.cc
new file mode 100644
index 000000000..eed0aa506
--- /dev/null
+++ b/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.cc
@@ -0,0 +1,273 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/CreateNonPrimaryRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/snapshot/Utils.h"
+#include "librbd/mirror/snapshot/WriteImageStateRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::CreateNonPrimaryRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+CreateNonPrimaryRequest<I>::CreateNonPrimaryRequest(
+ I* image_ctx, bool demoted, const std::string &primary_mirror_uuid,
+ uint64_t primary_snap_id, const SnapSeqs& snap_seqs,
+ const ImageState &image_state, uint64_t *snap_id, Context *on_finish)
+ : m_image_ctx(image_ctx), m_demoted(demoted),
+ m_primary_mirror_uuid(primary_mirror_uuid),
+ m_primary_snap_id(primary_snap_id), m_snap_seqs(snap_seqs),
+ m_image_state(image_state), m_snap_id(snap_id), m_on_finish(on_finish) {
+ m_default_ns_ctx.dup(m_image_ctx->md_ctx);
+ m_default_ns_ctx.set_namespace("");
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::send() {
+ refresh_image();
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::refresh_image() {
+ if (!m_image_ctx->state->is_refresh_required()) {
+ get_mirror_image();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ CreateNonPrimaryRequest<I>,
+ &CreateNonPrimaryRequest<I>::handle_refresh_image>(this);
+ m_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::handle_refresh_image(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ get_mirror_image();
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::get_mirror_image() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_image_get_start(&op, m_image_ctx->id);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ CreateNonPrimaryRequest<I>,
+ &CreateNonPrimaryRequest<I>::handle_get_mirror_image>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::handle_get_mirror_image(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ cls::rbd::MirrorImage mirror_image;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::mirror_image_get_finish(&iter, &mirror_image);
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ lderr(cct) << "snapshot based mirroring is not enabled" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ if (!is_orphan() && !util::can_create_non_primary_snapshot(m_image_ctx)) {
+ finish(-EINVAL);
+ return;
+ }
+
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+ m_snap_name = ".mirror.non_primary." + mirror_image.global_image_id + "." +
+ uuid_gen.to_string();
+
+ get_mirror_peers();
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::get_mirror_peers() {
+ if (!m_demoted) {
+ create_snapshot();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_peer_list_start(&op);
+
+ auto aio_comp = create_rados_callback<
+ CreateNonPrimaryRequest<I>,
+ &CreateNonPrimaryRequest<I>::handle_get_mirror_peers>(this);
+ m_out_bl.clear();
+ int r = m_default_ns_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::handle_get_mirror_peers(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ std::vector<cls::rbd::MirrorPeer> peers;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::mirror_peer_list_finish(&iter, &peers);
+ }
+
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve mirror peers: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ for (auto &peer : peers) {
+ if (peer.mirror_peer_direction == cls::rbd::MIRROR_PEER_DIRECTION_RX) {
+ continue;
+ }
+ m_mirror_peer_uuids.insert(peer.uuid);
+ }
+
+ create_snapshot();
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::create_snapshot() {
+ CephContext *cct = m_image_ctx->cct;
+
+ cls::rbd::MirrorSnapshotNamespace ns{
+ (m_demoted ? cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY_DEMOTED :
+ cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY), {},
+ m_primary_mirror_uuid, m_primary_snap_id};
+ if (m_demoted) {
+ ns.mirror_peer_uuids = m_mirror_peer_uuids;
+ }
+ ns.snap_seqs = m_snap_seqs;
+ ns.complete = is_orphan();
+ ldout(cct, 15) << "ns=" << ns << dendl;
+
+ auto ctx = create_context_callback<
+ CreateNonPrimaryRequest<I>,
+ &CreateNonPrimaryRequest<I>::handle_create_snapshot>(this);
+ m_image_ctx->operations->snap_create(ns, m_snap_name, 0, m_prog_ctx, ctx);
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::handle_create_snapshot(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to create mirror snapshot: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ write_image_state();
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::write_image_state() {
+ uint64_t snap_id;
+ {
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ snap_id = m_image_ctx->get_snap_id(
+ cls::rbd::MirrorSnapshotNamespace{}, m_snap_name);
+ }
+
+ if (m_snap_id != nullptr) {
+ *m_snap_id = snap_id;
+ }
+
+ if (is_orphan()) {
+ finish(0);
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ CreateNonPrimaryRequest<I>,
+ &CreateNonPrimaryRequest<I>::handle_write_image_state>(this);
+
+ auto req = WriteImageStateRequest<I>::create(m_image_ctx, snap_id,
+ m_image_state, ctx);
+ req->send();
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::handle_write_image_state(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to write image state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void CreateNonPrimaryRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::CreateNonPrimaryRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.h b/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.h
new file mode 100644
index 000000000..36f155413
--- /dev/null
+++ b/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_NON_PRIMARY_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_NON_PRIMARY_REQUEST_H
+
+#include "include/buffer.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Types.h"
+#include "librbd/internal.h"
+#include "librbd/mirror/snapshot/Types.h"
+
+#include <string>
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class CreateNonPrimaryRequest {
+public:
+ static CreateNonPrimaryRequest *create(ImageCtxT *image_ctx,
+ bool demoted,
+ const std::string &primary_mirror_uuid,
+ uint64_t primary_snap_id,
+ const SnapSeqs& snap_seqs,
+ const ImageState &image_state,
+ uint64_t *snap_id,
+ Context *on_finish) {
+ return new CreateNonPrimaryRequest(image_ctx, demoted, primary_mirror_uuid,
+ primary_snap_id, snap_seqs, image_state,
+ snap_id, on_finish);
+ }
+
+ CreateNonPrimaryRequest(ImageCtxT *image_ctx,
+ bool demoted,
+ const std::string &primary_mirror_uuid,
+ uint64_t primary_snap_id,
+ const SnapSeqs& snap_seqs,
+ const ImageState &image_state, uint64_t *snap_id,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * REFRESH_IMAGE
+ * |
+ * v
+ * GET_MIRROR_IMAGE
+ * |
+ * v (skip if not needed)
+ * GET_MIRROR_PEERS
+ * |
+ * v
+ * CREATE_SNAPSHOT
+ * |
+ * v
+ * WRITE_IMAGE_STATE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ bool m_demoted;
+ std::string m_primary_mirror_uuid;
+ uint64_t m_primary_snap_id;
+ SnapSeqs m_snap_seqs;
+ ImageState m_image_state;
+ uint64_t *m_snap_id;
+ Context *m_on_finish;
+
+ librados::IoCtx m_default_ns_ctx;
+ std::set<std::string> m_mirror_peer_uuids;
+
+ std::string m_snap_name;
+
+ bufferlist m_out_bl;
+ NoOpProgressContext m_prog_ctx;
+
+ bool is_orphan() const {
+ return m_primary_mirror_uuid.empty();
+ }
+
+ void refresh_image();
+ void handle_refresh_image(int r);
+
+ void get_mirror_image();
+ void handle_get_mirror_image(int r);
+
+ void get_mirror_peers();
+ void handle_get_mirror_peers(int r);
+
+ void create_snapshot();
+ void handle_create_snapshot(int r);
+
+ void write_image_state();
+ void handle_write_image_state(int r);
+
+ void finish(int r);
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::CreateNonPrimaryRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_NON_PRIMARY_REQUEST_H
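As created above, non-primary mirror snapshots are named
".mirror.non_primary.<global_image_id>.<random uuid>", and an empty
primary_mirror_uuid marks the snapshot as an orphan (is_orphan()), in
which case the image-state write is skipped. A minimal sketch, with
ictx, primary_mirror_uuid, primary_snap_id, snap_seqs and image_state as
placeholders:

  C_SaferCond ctx;
  uint64_t snap_id = CEPH_NOSNAP;
  auto req =
      librbd::mirror::snapshot::CreateNonPrimaryRequest<librbd::ImageCtx>::create(
          ictx, false /*demoted*/, primary_mirror_uuid, primary_snap_id,
          snap_seqs, image_state, &snap_id, &ctx);
  req->send();
  int r = ctx.wait();  // on success snap_id holds the new snapshot's id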
diff --git a/src/librbd/mirror/snapshot/CreatePrimaryRequest.cc b/src/librbd/mirror/snapshot/CreatePrimaryRequest.cc
new file mode 100644
index 000000000..54da9ad61
--- /dev/null
+++ b/src/librbd/mirror/snapshot/CreatePrimaryRequest.cc
@@ -0,0 +1,277 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/CreatePrimaryRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/snapshot/UnlinkPeerRequest.h"
+#include "librbd/mirror/snapshot/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::CreatePrimaryRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+CreatePrimaryRequest<I>::CreatePrimaryRequest(
+ I *image_ctx, const std::string& global_image_id,
+ uint64_t clean_since_snap_id, uint64_t snap_create_flags, uint32_t flags,
+ uint64_t *snap_id, Context *on_finish)
+ : m_image_ctx(image_ctx), m_global_image_id(global_image_id),
+ m_clean_since_snap_id(clean_since_snap_id),
+ m_snap_create_flags(snap_create_flags), m_flags(flags), m_snap_id(snap_id),
+ m_on_finish(on_finish) {
+ m_default_ns_ctx.dup(m_image_ctx->md_ctx);
+ m_default_ns_ctx.set_namespace("");
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::send() {
+ if (!util::can_create_primary_snapshot(
+ m_image_ctx,
+ ((m_flags & CREATE_PRIMARY_FLAG_DEMOTED) != 0),
+ ((m_flags & CREATE_PRIMARY_FLAG_FORCE) != 0), nullptr, nullptr)) {
+ finish(-EINVAL);
+ return;
+ }
+
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+ m_snap_name = ".mirror.primary." + m_global_image_id + "." +
+ uuid_gen.to_string();
+
+ get_mirror_peers();
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::get_mirror_peers() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_peer_list_start(&op);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ CreatePrimaryRequest<I>,
+ &CreatePrimaryRequest<I>::handle_get_mirror_peers>(this);
+ m_out_bl.clear();
+ int r = m_default_ns_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::handle_get_mirror_peers(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ std::vector<cls::rbd::MirrorPeer> peers;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::mirror_peer_list_finish(&iter, &peers);
+ }
+
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve mirror peers: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ for (auto &peer : peers) {
+ if (peer.mirror_peer_direction == cls::rbd::MIRROR_PEER_DIRECTION_RX) {
+ continue;
+ }
+ m_mirror_peer_uuids.insert(peer.uuid);
+ }
+
+ if (m_mirror_peer_uuids.empty() &&
+ ((m_flags & CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS) == 0)) {
+ lderr(cct) << "no mirror tx peers configured for the pool" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ create_snapshot();
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::create_snapshot() {
+ cls::rbd::MirrorSnapshotNamespace ns{
+ ((m_flags & CREATE_PRIMARY_FLAG_DEMOTED) != 0 ?
+ cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY_DEMOTED :
+ cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY),
+ m_mirror_peer_uuids, "", m_clean_since_snap_id};
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "name=" << m_snap_name << ", "
+ << "ns=" << ns << dendl;
+ auto ctx = create_context_callback<
+ CreatePrimaryRequest<I>,
+ &CreatePrimaryRequest<I>::handle_create_snapshot>(this);
+ m_image_ctx->operations->snap_create(ns, m_snap_name, m_snap_create_flags,
+ m_prog_ctx, ctx);
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::handle_create_snapshot(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to create mirror snapshot: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ refresh_image();
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::refresh_image() {
+ // if snapshot created via remote RPC, refresh is required to retrieve
+ // the snapshot id
+ if (m_snap_id == nullptr) {
+ unlink_peer();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ CreatePrimaryRequest<I>,
+ &CreatePrimaryRequest<I>::handle_refresh_image>(this);
+ m_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::handle_refresh_image(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ *m_snap_id = m_image_ctx->get_snap_id(
+ cls::rbd::MirrorSnapshotNamespace{}, m_snap_name);
+ ldout(cct, 15) << "snap_id=" << *m_snap_id << dendl;
+ }
+
+ unlink_peer();
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::unlink_peer() {
+ uint64_t max_snapshots = m_image_ctx->config.template get_val<uint64_t>(
+ "rbd_mirroring_max_mirroring_snapshots");
+ ceph_assert(max_snapshots >= 3);
+
+ std::string peer_uuid;
+ uint64_t snap_id = CEPH_NOSNAP;
+
+ for (auto &peer : m_mirror_peer_uuids) {
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ size_t count = 0;
+ uint64_t unlink_snap_id = 0;
+ for (auto &snap_it : m_image_ctx->snap_info) {
+ auto info = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &snap_it.second.snap_namespace);
+ if (info == nullptr) {
+ continue;
+ }
+ if (info->state != cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY) {
+ // reset counters -- we count primary snapshots after the last promotion
+ count = 0;
+ unlink_snap_id = 0;
+ continue;
+ }
+ // call UnlinkPeerRequest only if the snapshot is linked with this peer
+ // or if it's not linked with any peer (happens if mirroring is enabled
+ // on a pool with no peers configured or if UnlinkPeerRequest gets
+ // interrupted)
+ if (info->mirror_peer_uuids.size() == 0) {
+ peer_uuid = peer;
+ snap_id = snap_it.first;
+ break;
+ }
+ if (info->mirror_peer_uuids.count(peer) == 0) {
+ continue;
+ }
+ count++;
+ if (count == max_snapshots) {
+ unlink_snap_id = snap_it.first;
+ }
+ if (count > max_snapshots) {
+ peer_uuid = peer;
+ snap_id = unlink_snap_id;
+ break;
+ }
+ }
+ if (snap_id != CEPH_NOSNAP) {
+ break;
+ }
+ }
+
+ if (snap_id == CEPH_NOSNAP) {
+ finish(0);
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "peer=" << peer_uuid << ", snap_id=" << snap_id << dendl;
+
+ auto ctx = create_context_callback<
+ CreatePrimaryRequest<I>,
+ &CreatePrimaryRequest<I>::handle_unlink_peer>(this);
+ auto req = UnlinkPeerRequest<I>::create(m_image_ctx, snap_id, peer_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::handle_unlink_peer(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to unlink peer: " << cpp_strerror(r) << dendl;
+ finish(0); // not fatal
+ return;
+ }
+
+ unlink_peer();
+}
+
+template <typename I>
+void CreatePrimaryRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::CreatePrimaryRequest<librbd::ImageCtx>;
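The pruning pass in unlink_peer() walks the image's snapshots oldest
first, counting consecutive primary snapshots linked to a given peer
since the last promotion (a non-primary snapshot resets the count); once
the count exceeds rbd_mirroring_max_mirroring_snapshots, the snapshot at
the maximum position is unlinked, and handle_unlink_peer() repeats the
pass until every peer is within bounds. Reduced to plain data, and
assuming the input is already filtered down to those consecutive linked
snapshot ids, the selection rule is roughly:

  // illustrative restatement, not upstream code; assumes <vector>,
  // <cstdint> and Ceph's CEPH_NOSNAP sentinel
  uint64_t select_unlink_snap(const std::vector<uint64_t>& linked_snap_ids,
                              size_t max_snapshots) {
    size_t count = 0;
    uint64_t unlink_snap_id = 0;
    for (uint64_t id : linked_snap_ids) {  // oldest -> newest
      if (++count == max_snapshots) {
        unlink_snap_id = id;               // remember the max-th oldest
      }
      if (count > max_snapshots) {
        return unlink_snap_id;             // prune once the cap is exceeded
      }
    }
    return CEPH_NOSNAP;                    // nothing to prune
  }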
diff --git a/src/librbd/mirror/snapshot/CreatePrimaryRequest.h b/src/librbd/mirror/snapshot/CreatePrimaryRequest.h
new file mode 100644
index 000000000..b8e84cf2b
--- /dev/null
+++ b/src/librbd/mirror/snapshot/CreatePrimaryRequest.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_PRIMARY_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_PRIMARY_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/internal.h"
+#include "librbd/mirror/snapshot/Types.h"
+
+#include <string>
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class CreatePrimaryRequest {
+public:
+ static CreatePrimaryRequest *create(ImageCtxT *image_ctx,
+ const std::string& global_image_id,
+ uint64_t clean_since_snap_id,
+ uint64_t snap_create_flags,
+ uint32_t flags, uint64_t *snap_id,
+ Context *on_finish) {
+ return new CreatePrimaryRequest(image_ctx, global_image_id,
+ clean_since_snap_id, snap_create_flags, flags,
+ snap_id, on_finish);
+ }
+
+ CreatePrimaryRequest(ImageCtxT *image_ctx,
+ const std::string& global_image_id,
+ uint64_t clean_since_snap_id, uint64_t snap_create_flags,
+ uint32_t flags, uint64_t *snap_id, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_PEERS
+ * |
+ * v
+ * CREATE_SNAPSHOT
+ * |
+ * v
+ * REFRESH_IMAGE
+ * |
+ * v
+ * UNLINK_PEER (skip if not needed,
+ * | repeat if needed)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ std::string m_global_image_id;
+ uint64_t m_clean_since_snap_id;
+ const uint64_t m_snap_create_flags;
+ const uint32_t m_flags;
+ uint64_t *m_snap_id;
+ Context *m_on_finish;
+
+ librados::IoCtx m_default_ns_ctx;
+ std::set<std::string> m_mirror_peer_uuids;
+ std::string m_snap_name;
+
+ bufferlist m_out_bl;
+ NoOpProgressContext m_prog_ctx;
+
+ void get_mirror_peers();
+ void handle_get_mirror_peers(int r);
+
+ void create_snapshot();
+ void handle_create_snapshot(int r);
+
+ void refresh_image();
+ void handle_refresh_image(int r);
+
+ void unlink_peer();
+ void handle_unlink_peer(int r);
+
+ void finish(int r);
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::CreatePrimaryRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_PRIMARY_REQUEST_H
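Two call-site details visible above: passing a null snap_id pointer
skips the post-create refresh (the refresh exists only to look up the
new snapshot's id), and CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS
suppresses the -EINVAL otherwise raised when no TX peers are configured.
A minimal sketch, with ictx and global_image_id as placeholders:

  C_SaferCond ctx;
  uint64_t snap_id = CEPH_NOSNAP;
  auto req =
      librbd::mirror::snapshot::CreatePrimaryRequest<librbd::ImageCtx>::create(
          ictx, global_image_id, CEPH_NOSNAP /*clean_since_snap_id*/,
          0U /*snap_create_flags*/,
          librbd::mirror::snapshot::CREATE_PRIMARY_FLAG_FORCE,
          &snap_id, &ctx);
  req->send();
  int r = ctx.wait();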
diff --git a/src/librbd/mirror/snapshot/DemoteRequest.cc b/src/librbd/mirror/snapshot/DemoteRequest.cc
new file mode 100644
index 000000000..ccaa33c83
--- /dev/null
+++ b/src/librbd/mirror/snapshot/DemoteRequest.cc
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/DemoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/snapshot/CreatePrimaryRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::DemoteRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void DemoteRequest<I>::send() {
+ enable_non_primary_feature();
+}
+
+template <typename I>
+void DemoteRequest<I>::enable_non_primary_feature() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ // ensure image is flagged with non-primary feature so that
+ // standard RBD clients cannot write to it.
+ librados::ObjectWriteOperation op;
+ cls_client::set_features(&op, RBD_FEATURE_NON_PRIMARY,
+ RBD_FEATURE_NON_PRIMARY);
+
+ auto aio_comp = create_rados_callback<
+ DemoteRequest<I>,
+ &DemoteRequest<I>::handle_enable_non_primary_feature>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp,
+ &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_enable_non_primary_feature(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to enable non-primary feature: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ create_snapshot();
+}
+
+template <typename I>
+void DemoteRequest<I>::create_snapshot() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ DemoteRequest<I>, &DemoteRequest<I>::handle_create_snapshot>(this);
+
+ auto req = CreatePrimaryRequest<I>::create(
+ m_image_ctx, m_global_image_id, CEPH_NOSNAP,
+ SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE,
+ (snapshot::CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS |
+ snapshot::CREATE_PRIMARY_FLAG_DEMOTED), nullptr, ctx);
+ req->send();
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_create_snapshot(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to create mirror snapshot: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void DemoteRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::DemoteRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/snapshot/DemoteRequest.h b/src/librbd/mirror/snapshot/DemoteRequest.h
new file mode 100644
index 000000000..63c935645
--- /dev/null
+++ b/src/librbd/mirror/snapshot/DemoteRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_DEMOTE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_DEMOTE_REQUEST_H
+
+#include "include/buffer.h"
+
+#include <string>
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class DemoteRequest {
+public:
+ static DemoteRequest *create(ImageCtxT *image_ctx,
+ const std::string& global_image_id,
+ Context *on_finish) {
+ return new DemoteRequest(image_ctx, global_image_id, on_finish);
+ }
+
+ DemoteRequest(ImageCtxT *image_ctx, const std::string& global_image_id,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_global_image_id(global_image_id),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * ENABLE_NON_PRIMARY_FEATURE
+ * |
+ * v
+ * CREATE_SNAPSHOT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ std::string m_global_image_id;
+ Context *m_on_finish;
+
+ void enable_non_primary_feature();
+ void handle_enable_non_primary_feature(int r);
+
+ void create_snapshot();
+ void handle_create_snapshot(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::DemoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_DEMOTE_REQUEST_H
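Demotion is thus two steps: set the RBD_FEATURE_NON_PRIMARY feature bit
so that standard RBD clients can no longer write to the image, then
create a demoted primary snapshot via CreatePrimaryRequest with the
DEMOTED and IGNORE_EMPTY_PEERS flags. A minimal sketch, with ictx and
global_image_id as placeholders:

  C_SaferCond ctx;
  auto req = librbd::mirror::snapshot::DemoteRequest<librbd::ImageCtx>::create(
      ictx, global_image_id, &ctx);
  req->send();
  int r = ctx.wait();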
diff --git a/src/librbd/mirror/snapshot/GetImageStateRequest.cc b/src/librbd/mirror/snapshot/GetImageStateRequest.cc
new file mode 100644
index 000000000..4692f88cb
--- /dev/null
+++ b/src/librbd/mirror/snapshot/GetImageStateRequest.cc
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/GetImageStateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/snapshot/Types.h"
+#include "librbd/mirror/snapshot/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::GetImageStateRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void GetImageStateRequest<I>::send() {
+ read_object();
+}
+
+
+template <typename I>
+void GetImageStateRequest<I>::read_object() {
+ CephContext *cct = m_image_ctx->cct;
+
+ auto oid = util::image_state_object_name(m_image_ctx, m_snap_id,
+ m_object_index);
+ ldout(cct, 15) << oid << dendl;
+
+ librados::ObjectReadOperation op;
+ m_bl.clear();
+ op.read(0, 0, &m_bl, nullptr);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ GetImageStateRequest<I>,
+ &GetImageStateRequest<I>::handle_read_object>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op, nullptr);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void GetImageStateRequest<I>::handle_read_object(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to read image state object: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ auto iter = m_bl.cbegin();
+
+ if (m_object_index == 0) {
+ ImageStateHeader header;
+ try {
+ using ceph::decode;
+ decode(header, iter);
+ } catch (const buffer::error &err) {
+ lderr(cct) << "failed to decode image state object header" << dendl;
+ finish(-EBADMSG);
+ return;
+ }
+ m_object_count = header.object_count;
+ }
+
+ bufferlist bl;
+ bl.substr_of(m_bl, iter.get_off(), m_bl.length() - iter.get_off());
+ m_state_bl.claim_append(bl);
+
+ m_object_index++;
+
+ if (m_object_index >= m_object_count) {
+ finish(0);
+ return;
+ }
+
+ read_object();
+}
+
+template <typename I>
+void GetImageStateRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r == 0) {
+ try {
+ using ceph::decode;
+ decode(*m_image_state, m_state_bl);
+ } catch (const buffer::error &err) {
+ lderr(cct) << "failed to decode image state" << dendl;
+ r = -EBADMSG;
+ }
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::GetImageStateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/snapshot/GetImageStateRequest.h b/src/librbd/mirror/snapshot/GetImageStateRequest.h
new file mode 100644
index 000000000..483e3a228
--- /dev/null
+++ b/src/librbd/mirror/snapshot/GetImageStateRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_GET_IMAGE_STATE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_GET_IMAGE_STATE_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+struct ImageState;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class GetImageStateRequest {
+public:
+ static GetImageStateRequest *create(ImageCtxT *image_ctx, uint64_t snap_id,
+ ImageState *image_state,
+ Context *on_finish) {
+ return new GetImageStateRequest(image_ctx, snap_id, image_state, on_finish);
+ }
+
+ GetImageStateRequest(ImageCtxT *image_ctx, uint64_t snap_id,
+ ImageState *image_state, Context *on_finish)
+ : m_image_ctx(image_ctx), m_snap_id(snap_id), m_image_state(image_state),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * READ_OBJECT (repeat for
+ * | every object)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_snap_id;
+ ImageState *m_image_state;
+ Context *m_on_finish;
+
+ bufferlist m_bl;
+ bufferlist m_state_bl;
+
+ size_t m_object_count = 0;
+ size_t m_object_index = 0;
+
+ void read_object();
+ void handle_read_object(int r);
+
+ void finish(int r);
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::GetImageStateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_GET_IMAGE_STATE_REQUEST_H
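Per handle_read_object() above, the persisted image state is chunked
across numbered objects: object 0 begins with an encoded
ImageStateHeader whose object_count bounds the read loop, the remaining
bytes of each object are concatenated, and the result is decoded into an
ImageState (failing with -EBADMSG on a decode error). A minimal sketch,
with ictx and snap_id as placeholders:

  C_SaferCond ctx;
  librbd::mirror::snapshot::ImageState image_state;
  auto req =
      librbd::mirror::snapshot::GetImageStateRequest<librbd::ImageCtx>::create(
          ictx, snap_id, &image_state, &ctx);
  req->send();
  int r = ctx.wait();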
diff --git a/src/librbd/mirror/snapshot/ImageMeta.cc b/src/librbd/mirror/snapshot/ImageMeta.cc
new file mode 100644
index 000000000..826899775
--- /dev/null
+++ b/src/librbd/mirror/snapshot/ImageMeta.cc
@@ -0,0 +1,175 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/ImageMeta.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "json_spirit/json_spirit.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/WatchNotifyTypes.h"
+#include "librbd/mirror/snapshot/Utils.h"
+#include "librbd/watcher/Notifier.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::ImageMeta: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_rados_callback;
+using librbd::mirror::snapshot::util::get_image_meta_key;
+
+template <typename I>
+ImageMeta<I>::ImageMeta(I* image_ctx, const std::string& mirror_uuid)
+ : m_image_ctx(image_ctx), m_mirror_uuid(mirror_uuid) {
+}
+
+template <typename I>
+void ImageMeta<I>::load(Context* on_finish) {
+ ldout(m_image_ctx->cct, 15) << "oid=" << m_image_ctx->header_oid << ", "
+ << "key=" << get_image_meta_key(m_mirror_uuid)
+ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::metadata_get_start(&op, get_image_meta_key(m_mirror_uuid));
+
+ m_out_bl.clear();
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_load(on_finish, r);
+ });
+ auto aio_comp = create_rados_callback(ctx);
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp,
+ &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void ImageMeta<I>::handle_load(Context* on_finish, int r) {
+ ldout(m_image_ctx->cct, 15) << "r=" << r << dendl;
+
+ std::string data;
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = cls_client::metadata_get_finish(&it, &data);
+ }
+
+ if (r == -ENOENT) {
+ ldout(m_image_ctx->cct, 15) << "no snapshot-based mirroring image-meta: "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ } else if (r < 0) {
+ lderr(m_image_ctx->cct) << "failed to load snapshot-based mirroring "
+ << "image-meta: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ bool json_valid = false;
+ json_spirit::mValue json_root;
+ if (json_spirit::read(data, json_root)) {
+ try {
+ auto& json_obj = json_root.get_obj();
+ resync_requested = json_obj["resync_requested"].get_bool();
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ if (!json_valid) {
+ lderr(m_image_ctx->cct) << "invalid image-meta JSON received" << dendl;
+ on_finish->complete(-EBADMSG);
+ return;
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void ImageMeta<I>::save(Context* on_finish) {
+ ldout(m_image_ctx->cct, 15) << "oid=" << m_image_ctx->header_oid << ", "
+ << "key=" << get_image_meta_key(m_mirror_uuid)
+ << dendl;
+
+ // simple implementation for now
+ std::string json = "{\"resync_requested\": " +
+ std::string(resync_requested ? "true" : "false") + "}";
+
+ bufferlist bl;
+ bl.append(json);
+
+ // avoid using built-in metadata_set operation since that would require
+ // opening the non-primary image in read/write mode which isn't supported
+ librados::ObjectWriteOperation op;
+ cls_client::metadata_set(&op, {{get_image_meta_key(m_mirror_uuid), bl}});
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_save(on_finish, r);
+ });
+ auto aio_comp = create_rados_callback(ctx);
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp,
+ &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void ImageMeta<I>::handle_save(Context* on_finish, int r) {
+ ldout(m_image_ctx->cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_image_ctx->cct) << "failed to save snapshot-based mirroring "
+ << "image-meta: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ notify_update(on_finish);
+}
+
+template <typename I>
+void ImageMeta<I>::notify_update(Context* on_finish) {
+ ldout(m_image_ctx->cct, 15) << dendl;
+
+ // directly send header notification on image since you cannot
+ // open a non-primary image read/write and therefore cannot re-use
+ // the ImageWatcher to send the notification
+ bufferlist bl;
+ encode(watch_notify::NotifyMessage(new watch_notify::HeaderUpdatePayload()),
+ bl);
+
+ m_out_bl.clear();
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_notify_update(on_finish, r);
+ });
+ auto aio_comp = create_rados_callback(ctx);
+ int r = m_image_ctx->md_ctx.aio_notify(
+ m_image_ctx->header_oid, aio_comp, bl, watcher::Notifier::NOTIFY_TIMEOUT,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void ImageMeta<I>::handle_notify_update(Context* on_finish, int r) {
+ ldout(m_image_ctx->cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_image_ctx->cct) << "failed to notify image update: "
+ << cpp_strerror(r) << dendl;
+ }
+ on_finish->complete(r);
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::ImageMeta<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/snapshot/ImageMeta.h b/src/librbd/mirror/snapshot/ImageMeta.h
new file mode 100644
index 000000000..5d05f1927
--- /dev/null
+++ b/src/librbd/mirror/snapshot/ImageMeta.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_IMAGE_META_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_IMAGE_META_H
+
+#include "include/rados/librados.hpp"
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT>
+class ImageMeta {
+public:
+ static ImageMeta* create(ImageCtxT* image_ctx,
+ const std::string& mirror_uuid) {
+ return new ImageMeta(image_ctx, mirror_uuid);
+ }
+
+ ImageMeta(ImageCtxT* image_ctx, const std::string& mirror_uuid);
+
+ void load(Context* on_finish);
+ void save(Context* on_finish);
+
+ bool resync_requested = false;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * METADATA_GET
+ * |
+ * v
+ * <idle>
+ * |
+ * v
+ * METADATA_SET
+ * |
+ * v
+ * NOTIFY_UPDATE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT* m_image_ctx;
+ std::string m_mirror_uuid;
+
+ bufferlist m_out_bl;
+
+ void handle_load(Context* on_finish, int r);
+
+ void handle_save(Context* on_finish, int r);
+
+ void notify_update(Context* on_finish);
+ void handle_notify_update(Context* on_finish, int r);
+
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::ImageMeta<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_IMAGE_META_H
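Unlike the one-shot request classes, ImageMeta never deletes itself, so
the caller owns the object and can reuse it across load/save cycles; the
public resync_requested flag round-trips through the JSON document shown
in save(). A minimal sketch, with ictx and mirror_uuid as placeholders:

  auto meta = librbd::mirror::snapshot::ImageMeta<librbd::ImageCtx>::create(
      ictx, mirror_uuid);
  C_SaferCond load_ctx;
  meta->load(&load_ctx);
  int r = load_ctx.wait();            // -ENOENT if no image-meta exists yet
  if (r == 0 && !meta->resync_requested) {
    meta->resync_requested = true;    // saved as {"resync_requested": true}
    C_SaferCond save_ctx;
    meta->save(&save_ctx);            // METADATA_SET then NOTIFY_UPDATE
    r = save_ctx.wait();
  }
  delete meta;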
diff --git a/src/librbd/mirror/snapshot/PromoteRequest.cc b/src/librbd/mirror/snapshot/PromoteRequest.cc
new file mode 100644
index 000000000..9718c299e
--- /dev/null
+++ b/src/librbd/mirror/snapshot/PromoteRequest.cc
@@ -0,0 +1,405 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/PromoteRequest.h"
+#include "common/Timer.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/image/ListWatchersRequest.h"
+#include "librbd/mirror/snapshot/CreateNonPrimaryRequest.h"
+#include "librbd/mirror/snapshot/CreatePrimaryRequest.h"
+#include "librbd/mirror/snapshot/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::PromoteRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void PromoteRequest<I>::send() {
+ CephContext *cct = m_image_ctx->cct;
+ bool requires_orphan = false;
+ if (!util::can_create_primary_snapshot(m_image_ctx, false, true,
+ &requires_orphan,
+ &m_rollback_snap_id)) {
+ lderr(cct) << "cannot promote" << dendl;
+ finish(-EINVAL);
+ return;
+ } else if (m_rollback_snap_id == CEPH_NOSNAP && !requires_orphan) {
+ create_promote_snapshot();
+ return;
+ }
+
+ ldout(cct, 15) << "requires_orphan=" << requires_orphan << ", "
+ << "rollback_snap_id=" << m_rollback_snap_id << dendl;
+ create_orphan_snapshot();
+}
+
+template <typename I>
+void PromoteRequest<I>::create_orphan_snapshot() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>,
+ &PromoteRequest<I>::handle_create_orphan_snapshot>(this);
+
+ auto req = CreateNonPrimaryRequest<I>::create(
+ m_image_ctx, false, "", CEPH_NOSNAP, {}, {}, nullptr, ctx);
+ req->send();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_create_orphan_snapshot(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to create orphan snapshot: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ list_watchers();
+}
+
+template <typename I>
+void PromoteRequest<I>::list_watchers() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>,
+ &PromoteRequest<I>::handle_list_watchers>(this);
+
+ m_watchers.clear();
+ auto flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE |
+ librbd::image::LIST_WATCHERS_MIRROR_INSTANCES_ONLY;
+ auto req = librbd::image::ListWatchersRequest<I>::create(
+ *m_image_ctx, flags, &m_watchers, ctx);
+ req->send();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_list_watchers(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to list watchers: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_watchers.empty()) {
+ acquire_exclusive_lock();
+ return;
+ }
+
+ wait_update_notify();
+}
+
+template <typename I>
+void PromoteRequest<I>::wait_update_notify() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
+
+ std::lock_guard timer_lock{*m_timer_lock};
+
+ m_scheduler_ticks = 5;
+
+ int r = m_image_ctx->state->register_update_watcher(&m_update_watch_ctx,
+ &m_update_watcher_handle);
+ if (r < 0) {
+ lderr(cct) << "failed to register update watcher: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ scheduler_unregister_update_watcher();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_update_notify() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ std::lock_guard timer_lock{*m_timer_lock};
+ m_scheduler_ticks = 0;
+}
+
+template <typename I>
+void PromoteRequest<I>::scheduler_unregister_update_watcher() {
+ ceph_assert(ceph_mutex_is_locked(*m_timer_lock));
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "scheduler_ticks=" << m_scheduler_ticks << dendl;
+
+ if (m_scheduler_ticks > 0) {
+ m_scheduler_ticks--;
+ m_timer->add_event_after(1, new LambdaContext([this](int) {
+ scheduler_unregister_update_watcher();
+ }));
+ return;
+ }
+
+ m_image_ctx->op_work_queue->queue(new LambdaContext([this](int) {
+ unregister_update_watcher();
+ }), 0);
+}
+
+template <typename I>
+void PromoteRequest<I>::unregister_update_watcher() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>,
+ &PromoteRequest<I>::handle_unregister_update_watcher>(this);
+
+ m_image_ctx->state->unregister_update_watcher(m_update_watcher_handle, ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_unregister_update_watcher(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to unregister update watcher: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ list_watchers();
+}
+
+template <typename I>
+void PromoteRequest<I>::acquire_exclusive_lock() {
+ {
+ std::unique_lock locker{m_image_ctx->owner_lock};
+ if (m_image_ctx->exclusive_lock != nullptr &&
+ !m_image_ctx->exclusive_lock->is_lock_owner()) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ m_lock_acquired = true;
+ m_image_ctx->exclusive_lock->block_requests(0);
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>,
+ &PromoteRequest<I>::handle_acquire_exclusive_lock>(this);
+
+ m_image_ctx->exclusive_lock->acquire_lock(ctx);
+ return;
+ }
+ }
+
+ rollback();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_acquire_exclusive_lock(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to acquire exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ } else {
+ std::unique_lock locker{m_image_ctx->owner_lock};
+ if (m_image_ctx->exclusive_lock != nullptr &&
+ !m_image_ctx->exclusive_lock->is_lock_owner()) {
+ lderr(cct) << "failed to acquire exclusive lock" << dendl;
+ r = m_image_ctx->exclusive_lock->get_unlocked_op_error();
+ locker.unlock();
+ finish(r);
+ return;
+ }
+ }
+
+ rollback();
+}
+
+template <typename I>
+void PromoteRequest<I>::rollback() {
+ if (m_rollback_snap_id == CEPH_NOSNAP) {
+ create_promote_snapshot();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ std::shared_lock owner_locker{m_image_ctx->owner_lock};
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+
+ auto info = m_image_ctx->get_snap_info(m_rollback_snap_id);
+ ceph_assert(info != nullptr);
+ auto snap_namespace = info->snap_namespace;
+ auto snap_name = info->name;
+
+ image_locker.unlock();
+
+ auto ctx = create_async_context_callback(
+ *m_image_ctx, create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_rollback>(this));
+
+ m_image_ctx->operations->execute_snap_rollback(snap_namespace, snap_name,
+ m_progress_ctx, ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_rollback(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to rollback: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ create_promote_snapshot();
+}
+
+template <typename I>
+void PromoteRequest<I>::create_promote_snapshot() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>,
+ &PromoteRequest<I>::handle_create_promote_snapshot>(this);
+
+ auto req = CreatePrimaryRequest<I>::create(
+ m_image_ctx, m_global_image_id, CEPH_NOSNAP,
+ SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE,
+ (snapshot::CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS |
+ snapshot::CREATE_PRIMARY_FLAG_FORCE), nullptr, ctx);
+ req->send();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_create_promote_snapshot(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to create promote snapshot: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ disable_non_primary_feature();
+}
+
+template <typename I>
+void PromoteRequest<I>::disable_non_primary_feature() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << dendl;
+
+ // remove the non-primary feature flag so that the image can be
+ // R/W by standard RBD clients
+ librados::ObjectWriteOperation op;
+ cls_client::set_features(&op, 0U, RBD_FEATURE_NON_PRIMARY);
+
+ auto aio_comp = create_rados_callback<
+ PromoteRequest<I>,
+ &PromoteRequest<I>::handle_disable_non_primary_feature>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp,
+ &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_disable_non_primary_feature(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to disable non-primary feature: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ release_exclusive_lock();
+}
+
+template <typename I>
+void PromoteRequest<I>::release_exclusive_lock() {
+ if (m_lock_acquired) {
+ std::unique_lock locker{m_image_ctx->owner_lock};
+ if (m_image_ctx->exclusive_lock != nullptr) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ m_image_ctx->exclusive_lock->unblock_requests();
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>,
+ &PromoteRequest<I>::handle_release_exclusive_lock>(this);
+
+ m_image_ctx->exclusive_lock->release_lock(ctx);
+ return;
+ }
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_release_exclusive_lock(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void PromoteRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::PromoteRequest<librbd::ImageCtx>;
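
The disable_non_primary_feature() step above clears RBD_FEATURE_NON_PRIMARY by writing a zero value under a mask containing only that bit, so the rest of the feature word is preserved. A minimal standalone sketch of that mask semantics (the bit position below is hypothetical, not librbd's real flag value):

    #include <cassert>
    #include <cstdint>

    // hypothetical bit position, for illustration only
    constexpr uint64_t FEATURE_NON_PRIMARY = 1ULL << 9;

    // only bits selected by 'mask' are replaced with 'value'
    uint64_t apply_features(uint64_t current, uint64_t value, uint64_t mask) {
      return (current & ~mask) | (value & mask);
    }

    int main() {
      uint64_t features = ~0ULL;
      features = apply_features(features, 0U, FEATURE_NON_PRIMARY);
      assert((features & FEATURE_NON_PRIMARY) == 0);     // bit cleared
      assert((features | FEATURE_NON_PRIMARY) == ~0ULL); // others intact
      return 0;
    }
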
diff --git a/src/librbd/mirror/snapshot/PromoteRequest.h b/src/librbd/mirror/snapshot/PromoteRequest.h
new file mode 100644
index 000000000..1d9a862a0
--- /dev/null
+++ b/src/librbd/mirror/snapshot/PromoteRequest.h
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_PROMOTE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_PROMOTE_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rbd/librbd.hpp"
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include "librbd/internal.h"
+
+#include <string>
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PromoteRequest {
+public:
+ static PromoteRequest *create(ImageCtxT *image_ctx,
+ const std::string& global_image_id,
+ Context *on_finish) {
+ return new PromoteRequest(image_ctx, global_image_id, on_finish);
+ }
+
+ PromoteRequest(ImageCtxT *image_ctx, const std::string& global_image_id,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_global_image_id(global_image_id),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | (can promote)
+ * |\----------------------------------------\
+ * | |
+ * | |
+ * v (skip if not needed) |
+ * CREATE_ORPHAN_SNAPSHOT |
+ * | |
+ * | /-- UNREGISTER_UPDATE_WATCHER <-\ |
+ * v v | |
+ * LIST_WATCHERS ----> WAIT_UPDATE_NOTIFY --/ |
+ * | |
+ * | (no watchers) |
+ * v |
+ * ACQUIRE_EXCLUSIVE_LOCK |
+ * | (skip if not needed) |
+ * v |
+ * ROLLBACK |
+ * | |
+ * v |
+ * CREATE_PROMOTE_SNAPSHOT <--------------------/
+ * |
+ * v
+ * DISABLE_NON_PRIMARY_FEATURE
+ * |
+ * v
+ * RELEASE_EXCLUSIVE_LOCK (skip if not needed)
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ std::string m_global_image_id;
+ Context *m_on_finish;
+
+ uint64_t m_rollback_snap_id = CEPH_NOSNAP;
+ bool m_lock_acquired = false;
+ NoOpProgressContext m_progress_ctx;
+
+ class UpdateWatchCtx : public librbd::UpdateWatchCtx {
+ public:
+ UpdateWatchCtx(PromoteRequest *promote_request)
+ : promote_request(promote_request) {
+ }
+
+ void handle_notify() {
+ promote_request->handle_update_notify();
+ }
+
+ private:
+ PromoteRequest *promote_request;
+
+ } m_update_watch_ctx = {this};
+
+ std::list<obj_watch_t> m_watchers;
+ uint64_t m_update_watcher_handle = 0;
+ uint64_t m_scheduler_ticks = 0;
+ SafeTimer *m_timer = nullptr;
+ ceph::mutex *m_timer_lock = nullptr;
+
+ void refresh_image();
+ void handle_refresh_image(int r);
+
+ void create_orphan_snapshot();
+ void handle_create_orphan_snapshot(int r);
+
+ void list_watchers();
+ void handle_list_watchers(int r);
+
+ void wait_update_notify();
+ void handle_update_notify();
+ void scheduler_unregister_update_watcher();
+
+ void unregister_update_watcher();
+ void handle_unregister_update_watcher(int r);
+
+ void acquire_exclusive_lock();
+ void handle_acquire_exclusive_lock(int r);
+
+ void rollback();
+ void handle_rollback(int r);
+
+ void create_promote_snapshot();
+ void handle_create_promote_snapshot(int r);
+
+ void disable_non_primary_feature();
+ void handle_disable_non_primary_feature(int r);
+
+ void release_exclusive_lock();
+ void handle_release_exclusive_lock(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::PromoteRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_PROMOTE_REQUEST_H
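
Like the other request classes in this directory, PromoteRequest is a heap-allocated state machine: send() kicks off the first async step, each handle_*() callback inspects the return code and either advances the chain or short-circuits to finish(), and finish() completes the caller's Context and deletes the request. A stripped-down sketch of that lifecycle (hypothetical names, standard C++ only):

    #include <functional>
    #include <iostream>

    // stand-in for librbd's Context
    struct Context {
      std::function<void(int)> fn;
      void complete(int r) { fn(r); delete this; }
    };

    class MiniRequest {
    public:
      static MiniRequest* create(Context* on_finish) {
        return new MiniRequest(on_finish);
      }
      void send() { step(); }

    private:
      explicit MiniRequest(Context* on_finish) : m_on_finish(on_finish) {}
      Context* m_on_finish;

      void step() { handle_step(0); }       // pretend the async op completed
      void handle_step(int r) {
        if (r < 0) { finish(r); return; }   // abort the chain on error
        finish(0);                          // last step reached
      }
      void finish(int r) {
        m_on_finish->complete(r);
        delete this;                        // requests own their lifetime
      }
    };

    int main() {
      auto* ctx = new Context{[](int r) { std::cout << "r=" << r << "\n"; }};
      MiniRequest::create(ctx)->send();
      return 0;
    }
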
diff --git a/src/librbd/mirror/snapshot/RemoveImageStateRequest.cc b/src/librbd/mirror/snapshot/RemoveImageStateRequest.cc
new file mode 100644
index 000000000..204e0489a
--- /dev/null
+++ b/src/librbd/mirror/snapshot/RemoveImageStateRequest.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/RemoveImageStateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/snapshot/Types.h"
+#include "librbd/mirror/snapshot/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::RemoveImageStateRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void RemoveImageStateRequest<I>::send() {
+ get_object_count();
+}
+
+
+template <typename I>
+void RemoveImageStateRequest<I>::get_object_count() {
+ CephContext *cct = m_image_ctx->cct;
+
+ auto oid = util::image_state_object_name(m_image_ctx, m_snap_id, 0);
+ ldout(cct, 15) << oid << dendl;
+
+ librados::ObjectReadOperation op;
+ op.read(0, 0, &m_bl, nullptr);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ RemoveImageStateRequest<I>,
+ &RemoveImageStateRequest<I>::handle_get_object_count>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op, nullptr);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void RemoveImageStateRequest<I>::handle_get_object_count(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to read image state object: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ ImageStateHeader header(1);
+ auto iter = m_bl.cbegin();
+ try {
+ using ceph::decode;
+
+ decode(header, iter);
+ } catch (const buffer::error &err) {
+ lderr(cct) << "failed to decode image state object header" << dendl;
+ // still try to remove it
+ }
+
+ m_object_count = header.object_count > 0 ? header.object_count : 1;
+
+ remove_object();
+}
+
+template <typename I>
+void RemoveImageStateRequest<I>::remove_object() {
+ CephContext *cct = m_image_ctx->cct;
+
+ ceph_assert(m_object_count > 0);
+ m_object_count--;
+
+ auto oid = util::image_state_object_name(m_image_ctx, m_snap_id,
+ m_object_count);
+ ldout(cct, 15) << oid << dendl;
+
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ librados::AioCompletion *comp = create_rados_callback<
+ RemoveImageStateRequest<I>,
+ &RemoveImageStateRequest<I>::handle_remove_object>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void RemoveImageStateRequest<I>::handle_remove_object(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to remove image state object: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_object_count == 0) {
+ finish(0);
+ return;
+ }
+
+ remove_object();
+}
+
+template <typename I>
+void RemoveImageStateRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::RemoveImageStateRequest<librbd::ImageCtx>;
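
The removal walk above reads the header of object index 0 to learn how many state objects exist, falls back to a count of 1 if the header cannot be decoded, and then deletes "rbd_mirror_snapshot.<image_id>.<snap_id>.<index>" from the highest index down, tolerating -ENOENT for objects that are already gone. A sketch of the name generation under those rules (the prefix matches IMAGE_STATE_OBJECT_PREFIX defined in Utils.cc later in this commit):

    #include <cstdint>
    #include <string>
    #include <vector>

    std::vector<std::string> objects_to_remove(const std::string& image_id,
                                               uint64_t snap_id,
                                               uint32_t decoded_count) {
      // fall back to one object if the header was unreadable, as above
      uint32_t count = decoded_count > 0 ? decoded_count : 1;
      std::vector<std::string> oids;
      for (uint32_t i = count; i-- > 0; ) {   // highest index first
        oids.push_back("rbd_mirror_snapshot." + image_id + "." +
                       std::to_string(snap_id) + "." + std::to_string(i));
      }
      return oids;
    }
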
diff --git a/src/librbd/mirror/snapshot/RemoveImageStateRequest.h b/src/librbd/mirror/snapshot/RemoveImageStateRequest.h
new file mode 100644
index 000000000..be7dad8e0
--- /dev/null
+++ b/src/librbd/mirror/snapshot/RemoveImageStateRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_REMOVE_IMAGE_STATE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_REMOVE_IMAGE_STATE_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class RemoveImageStateRequest {
+public:
+ static RemoveImageStateRequest *create(ImageCtxT *image_ctx, uint64_t snap_id,
+ Context *on_finish) {
+ return new RemoveImageStateRequest(image_ctx, snap_id, on_finish);
+ }
+
+ RemoveImageStateRequest(ImageCtxT *image_ctx, uint64_t snap_id,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_snap_id(snap_id), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_OBJECT_COUNT
+ * |
+ * v
+ * REMOVE_OBJECT (repeat for
+ * | every object)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_snap_id;
+ Context *m_on_finish;
+
+ bufferlist m_bl;
+
+ size_t m_object_count = 0;
+
+ void get_object_count();
+ void handle_get_object_count(int r);
+
+ void remove_object();
+ void handle_remove_object(int r);
+
+ void finish(int r);
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::RemoveImageStateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_REMOVE_IMAGE_STATE_REQUEST_H
diff --git a/src/librbd/mirror/snapshot/SetImageStateRequest.cc b/src/librbd/mirror/snapshot/SetImageStateRequest.cc
new file mode 100644
index 000000000..9fcee0322
--- /dev/null
+++ b/src/librbd/mirror/snapshot/SetImageStateRequest.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/SetImageStateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/image/GetMetadataRequest.h"
+#include "librbd/mirror/snapshot/WriteImageStateRequest.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::SetImageStateRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void SetImageStateRequest<I>::send() {
+ get_name();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::get_name() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::dir_get_name_start(&op, m_image_ctx->id);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ SetImageStateRequest<I>,
+ &SetImageStateRequest<I>::handle_get_name>(this);
+ m_bl.clear();
+ int r = m_image_ctx->md_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::handle_get_name(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto it = m_bl.cbegin();
+ r = cls_client::dir_get_name_finish(&it, &m_image_state.name);
+ }
+
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve image name: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ ldout(cct, 15) << "name=" << m_image_state.name << dendl;
+
+ get_snap_limit();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::get_snap_limit() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::snapshot_get_limit_start(&op);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ SetImageStateRequest<I>,
+ &SetImageStateRequest<I>::handle_get_snap_limit>(this);
+ m_bl.clear();
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+ &m_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::handle_get_snap_limit(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto it = m_bl.cbegin();
+ r = cls_client::snapshot_get_limit_finish(&it, &m_image_state.snap_limit);
+ }
+
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve snapshot limit: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ ldout(cct, 15) << "snap_limit=" << m_image_state.snap_limit << dendl;
+
+ get_metadata();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::get_metadata() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ SetImageStateRequest<I>,
+ &SetImageStateRequest<I>::handle_get_metadata>(this);
+ auto req = image::GetMetadataRequest<I>::create(
+ m_image_ctx->md_ctx, m_image_ctx->header_oid, true, "", "", 0,
+ &m_image_state.metadata, ctx);
+ req->send();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::handle_get_metadata(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve metadata: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+
+ m_image_state.features =
+ m_image_ctx->features & ~RBD_FEATURES_IMPLICIT_ENABLE;
+
+ for (auto &[snap_id, snap_info] : m_image_ctx->snap_info) {
+ auto type = cls::rbd::get_snap_namespace_type(snap_info.snap_namespace);
+ if (type != cls::rbd::SNAPSHOT_NAMESPACE_TYPE_USER) {
+ // only replicate user snapshots -- trash snapshots will be
+ // replicated by an implicit delete if required
+ continue;
+ }
+ m_image_state.snapshots[snap_id] = {snap_info.snap_namespace,
+ snap_info.name,
+ snap_info.protection_status};
+ }
+ }
+
+ write_image_state();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::write_image_state() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ SetImageStateRequest<I>,
+ &SetImageStateRequest<I>::handle_write_image_state>(this);
+
+ auto req = WriteImageStateRequest<I>::create(m_image_ctx, m_snap_id,
+ m_image_state, ctx);
+ req->send();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::handle_write_image_state(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to write image state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ update_primary_snapshot();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::update_primary_snapshot() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_image_snapshot_set_copy_progress(
+ &op, m_snap_id, true, 0);
+
+ auto aio_comp = create_rados_callback<
+ SetImageStateRequest<I>,
+ &SetImageStateRequest<I>::handle_update_primary_snapshot>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp,
+ &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void SetImageStateRequest<I>::handle_update_primary_snapshot(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to update primary snapshot: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void SetImageStateRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::SetImageStateRequest<librbd::ImageCtx>;
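
When handle_get_metadata() assembles the image state above, it walks snap_info and keeps only user-namespace snapshots; per the in-code comment, non-user (mirror/trash) snapshots are reconstructed on the secondary by other means. A sketch of that filter with std::variant standing in for cls::rbd::SnapshotNamespace:

    #include <cstdint>
    #include <map>
    #include <string>
    #include <variant>

    struct UserNs {}; struct MirrorNs {}; struct TrashNs {};
    using SnapNs = std::variant<UserNs, MirrorNs, TrashNs>;

    struct SnapInfo {
      SnapNs ns;
      std::string name;
    };

    std::map<uint64_t, std::string> user_snapshots(
        const std::map<uint64_t, SnapInfo>& snaps) {
      std::map<uint64_t, std::string> out;
      for (const auto& [id, info] : snaps) {
        if (std::holds_alternative<UserNs>(info.ns)) {  // skip mirror/trash
          out[id] = info.name;
        }
      }
      return out;
    }
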
diff --git a/src/librbd/mirror/snapshot/SetImageStateRequest.h b/src/librbd/mirror/snapshot/SetImageStateRequest.h
new file mode 100644
index 000000000..fd2815494
--- /dev/null
+++ b/src/librbd/mirror/snapshot/SetImageStateRequest.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_SET_IMAGE_STATE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_SET_IMAGE_STATE_REQUEST_H
+
+#include "librbd/mirror/snapshot/Types.h"
+
+#include <map>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SetImageStateRequest {
+public:
+ static SetImageStateRequest *create(ImageCtxT *image_ctx, uint64_t snap_id,
+ Context *on_finish) {
+ return new SetImageStateRequest(image_ctx, snap_id, on_finish);
+ }
+
+ SetImageStateRequest(ImageCtxT *image_ctx, uint64_t snap_id,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_snap_id(snap_id), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_NAME
+ * |
+ * v
+ * GET_SNAP_LIMIT
+ * |
+ * v
+ * GET_METADATA
+ * |
+ * v
+ * WRITE_IMAGE_STATE
+ * |
+ * v
+ * UPDATE_PRIMARY_SNAPSHOT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_snap_id;
+ Context *m_on_finish;
+
+ ImageState m_image_state;
+
+ bufferlist m_bl;
+ bufferlist m_state_bl;
+
+ void get_name();
+ void handle_get_name(int r);
+
+ void get_snap_limit();
+ void handle_get_snap_limit(int r);
+
+ void get_metadata();
+ void handle_get_metadata(int r);
+
+ void write_image_state();
+ void handle_write_image_state(int r);
+
+ void update_primary_snapshot();
+ void handle_update_primary_snapshot(int r);
+
+ void finish(int r);
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::SetImageStateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_SET_IMAGE_STATE_REQUEST_H
diff --git a/src/librbd/mirror/snapshot/Types.cc b/src/librbd/mirror/snapshot/Types.cc
new file mode 100644
index 000000000..866b4c3e2
--- /dev/null
+++ b/src/librbd/mirror/snapshot/Types.cc
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "include/encoding.h"
+#include "include/stringify.h"
+#include "librbd/mirror/snapshot/Types.h"
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+void ImageStateHeader::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(object_count, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ImageStateHeader::decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(object_count, bl);
+ DECODE_FINISH(bl);
+}
+
+void SnapState::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(snap_namespace, bl);
+ encode(name, bl);
+ encode(protection_status, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SnapState::decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(snap_namespace, bl);
+ decode(name, bl);
+ decode(protection_status, bl);
+ DECODE_FINISH(bl);
+}
+
+void SnapState::dump(Formatter *f) const {
+ f->open_object_section("namespace");
+ snap_namespace.dump(f);
+ f->close_section();
+ f->dump_string("name", name);
+ f->dump_unsigned("protection_status", protection_status);
+}
+
+std::ostream& operator<<(std::ostream& os, const SnapState& snap_state) {
+ os << "["
+ << "namespace=" << snap_state.snap_namespace << ", "
+ << "name=" << snap_state.name << ", "
+ << "protection=" << static_cast<int>(snap_state.protection_status)
+ << "]";
+ return os;
+}
+
+void ImageState::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(name, bl);
+ encode(features, bl);
+ encode(snap_limit, bl);
+ encode(snapshots, bl);
+ encode(metadata, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ImageState::decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(name, bl);
+ decode(features, bl);
+ decode(snap_limit, bl);
+ decode(snapshots, bl);
+ decode(metadata, bl);
+ DECODE_FINISH(bl);
+}
+
+void ImageState::dump(Formatter *f) const {
+ f->dump_string("name", name);
+ f->dump_unsigned("features", features);
+ f->dump_unsigned("snap_limit", snap_limit);
+ f->open_array_section("snapshots");
+ for (auto &[id, snap_state] : snapshots) {
+ f->open_object_section(stringify(id).c_str());
+ snap_state.dump(f);
+ f->close_section(); // snap_state
+ }
+ f->close_section(); // snapshots
+ f->open_object_section("metadata");
+ for (auto &it : metadata) {
+ f->dump_stream(it.first.c_str()) << it.second;
+ }
+ f->close_section(); // metadata
+}
+
+std::ostream& operator<<(std::ostream& os, const ImageState& image_state) {
+ os << "["
+ << "name=" << image_state.name << ", "
+ << "features=" << image_state.features << ", "
+ << "snap_limit=" << image_state.snap_limit << ", "
+ << "snaps=" << image_state.snapshots << ", "
+ << "metadata_count=" << image_state.metadata.size()
+ << "]";
+ return os;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
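
The encode()/decode() methods above rely on the ENCODE_START/DECODE_START macros, which prepend a struct version and a compat version so that a decoder can reject data encoded by a newer, incompatible writer. A simplified sketch of that convention (the real macros also record a length so unknown trailing fields can be skipped):

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // write "struct version" and "oldest compatible version" before the payload
    void encode_header(std::vector<uint8_t>& bl, uint8_t version, uint8_t compat) {
      bl.push_back(version);
      bl.push_back(compat);
    }

    // returns the payload offset, or throws if we cannot parse this encoding
    size_t decode_header(const std::vector<uint8_t>& bl, uint8_t understood) {
      if (bl.size() < 2)
        throw std::runtime_error("buffer too short");
      if (bl[1] > understood)   // compat version newer than this decoder
        throw std::runtime_error("incompatible encoding");
      return 2;
    }
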
diff --git a/src/librbd/mirror/snapshot/Types.h b/src/librbd/mirror/snapshot/Types.h
new file mode 100644
index 000000000..79947a5f8
--- /dev/null
+++ b/src/librbd/mirror/snapshot/Types.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_TYPES_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_TYPES_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/buffer.h"
+#include "include/types.h"
+
+#include <map>
+#include <string>
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+enum CreatePrimaryFlags {
+ CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS = (1 << 0),
+ CREATE_PRIMARY_FLAG_DEMOTED = (1 << 1),
+ CREATE_PRIMARY_FLAG_FORCE = (1 << 2)
+};
+
+struct ImageStateHeader {
+ uint32_t object_count = 0;
+
+ ImageStateHeader() {
+ }
+ ImageStateHeader(uint32_t object_count) : object_count(object_count) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &it);
+};
+
+WRITE_CLASS_ENCODER(ImageStateHeader);
+
+struct SnapState {
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string name;
+ uint8_t protection_status = 0;
+
+ SnapState() {
+ }
+ SnapState(const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &name, uint8_t protection_status)
+ : snap_namespace(snap_namespace), name(name),
+ protection_status(protection_status) {
+ }
+
+ bool operator==(const SnapState& rhs) const {
+ return snap_namespace == rhs.snap_namespace &&
+ name == rhs.name && protection_status == rhs.protection_status;
+ }
+
+ bool operator<(const SnapState& rhs) const {
+ if (snap_namespace != rhs.snap_namespace) {
+ return snap_namespace < rhs.snap_namespace;
+ }
+ if (name != rhs.name) {
+ return name < rhs.name;
+ }
+ return protection_status < rhs.protection_status;
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &it);
+ void dump(Formatter *f) const;
+};
+
+std::ostream& operator<<(std::ostream& os, const SnapState& snap_state);
+
+WRITE_CLASS_ENCODER(SnapState);
+
+struct ImageState {
+ std::string name;
+ uint64_t features = 0;
+ uint64_t snap_limit = 0;
+ std::map<uint64_t, SnapState> snapshots;
+ std::map<std::string, bufferlist> metadata;
+
+ ImageState() {
+ }
+ ImageState(const std::string &name, uint64_t features, uint64_t snap_limit,
+ const std::map<uint64_t, SnapState> &snapshots,
+ const std::map<std::string, bufferlist> &metadata)
+ : name(name), features(features), snap_limit(snap_limit),
+ snapshots(snapshots), metadata(metadata) {
+ }
+
+ bool operator==(const ImageState& rhs) const {
+ return name == rhs.name && features == rhs.features &&
+ snap_limit == rhs.snap_limit && snapshots == rhs.snapshots;
+ }
+
+ bool operator<(const ImageState& rhs) const {
+ if (name != rhs.name) {
+ return name < rhs.name;
+ }
+ if (features != rhs.features) {
+ return features < rhs.features;
+ }
+ if (snap_limit != rhs.snap_limit) {
+ return snap_limit < rhs.snap_limit;
+ }
+ return snapshots < rhs.snapshots;
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &it);
+ void dump(Formatter *f) const;
+};
+
+std::ostream& operator<<(std::ostream& os, const ImageState& image_state);
+
+WRITE_CLASS_ENCODER(ImageState);
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_TYPES_H
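
The hand-rolled operator< implementations above compare members lexicographically; the same ordering can be expressed with std::tie, which keeps such comparators obviously correct as fields are added. A sketch on a reduced SnapState (the namespace variant replaced with an int stand-in):

    #include <cstdint>
    #include <string>
    #include <tuple>

    struct SnapStateKey {
      int snap_namespace;        // stand-in for the namespace variant
      std::string name;
      uint8_t protection_status;

      bool operator<(const SnapStateKey& rhs) const {
        return std::tie(snap_namespace, name, protection_status) <
               std::tie(rhs.snap_namespace, rhs.name, rhs.protection_status);
      }
    };
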
diff --git a/src/librbd/mirror/snapshot/UnlinkPeerRequest.cc b/src/librbd/mirror/snapshot/UnlinkPeerRequest.cc
new file mode 100644
index 000000000..6e1884249
--- /dev/null
+++ b/src/librbd/mirror/snapshot/UnlinkPeerRequest.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/UnlinkPeerRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::UnlinkPeerRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void UnlinkPeerRequest<I>::send() {
+ if (!m_image_ctx->state->is_refresh_required()) {
+ unlink_peer();
+ return;
+ }
+
+ refresh_image();
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::refresh_image() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ UnlinkPeerRequest<I>, &UnlinkPeerRequest<I>::handle_refresh_image>(this);
+ m_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::handle_refresh_image(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ unlink_peer();
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::unlink_peer() {
+ CephContext *cct = m_image_ctx->cct;
+
+ m_image_ctx->image_lock.lock_shared();
+ int r = -ENOENT;
+ cls::rbd::MirrorSnapshotNamespace* mirror_ns = nullptr;
+ m_newer_mirror_snapshots = false;
+ for (auto snap_it = m_image_ctx->snap_info.find(m_snap_id);
+ snap_it != m_image_ctx->snap_info.end(); ++snap_it) {
+ if (snap_it->first == m_snap_id) {
+ r = 0;
+ mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &snap_it->second.snap_namespace);
+ } else if (boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &snap_it->second.snap_namespace) != nullptr) {
+ ldout(cct, 15) << "located newer mirror snapshot" << dendl;
+ m_newer_mirror_snapshots = true;
+ break;
+ }
+ }
+
+ if (r == -ENOENT) {
+ ldout(cct, 15) << "missing snapshot: snap_id=" << m_snap_id << dendl;
+ m_image_ctx->image_lock.unlock_shared();
+ finish(r);
+ return;
+ }
+
+ if (mirror_ns == nullptr) {
+ lderr(cct) << "not mirror snapshot (snap_id=" << m_snap_id << ")" << dendl;
+ m_image_ctx->image_lock.unlock_shared();
+ finish(-EINVAL);
+ return;
+ }
+
+ // if there are (or soon will be) no peers left in the mirror snapshot and
+ // a more recent mirror snapshot exists, remove the older one
+ if ((mirror_ns->mirror_peer_uuids.count(m_mirror_peer_uuid) == 0) ||
+ (mirror_ns->mirror_peer_uuids.size() <= 1U && m_newer_mirror_snapshots)) {
+ m_image_ctx->image_lock.unlock_shared();
+ remove_snapshot();
+ return;
+ }
+ m_image_ctx->image_lock.unlock_shared();
+
+ ldout(cct, 15) << "snap_id=" << m_snap_id << ", "
+ << "mirror_peer_uuid=" << m_mirror_peer_uuid << dendl;
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_image_snapshot_unlink_peer(&op, m_snap_id,
+ m_mirror_peer_uuid);
+ auto aio_comp = create_rados_callback<
+ UnlinkPeerRequest<I>, &UnlinkPeerRequest<I>::handle_unlink_peer>(this);
+ r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::handle_unlink_peer(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r == -ERESTART || r == -ENOENT) {
+ refresh_image();
+ return;
+ }
+
+ if (r < 0) {
+ lderr(cct) << "failed to unlink peer: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ notify_update();
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::notify_update() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ auto ctx = create_context_callback<
+ UnlinkPeerRequest<I>, &UnlinkPeerRequest<I>::handle_notify_update>(this);
+ m_image_ctx->notify_update(ctx);
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::handle_notify_update(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r == -ENOENT || r == -ETIMEDOUT) {
+ // non-fatal errors
+ lderr(cct) << "failed to notify update: " << cpp_strerror(r) << dendl;
+ } else if (r < 0) {
+ lderr(cct) << "failed to notify update: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ refresh_image();
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::remove_snapshot() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << dendl;
+
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string snap_name;
+ int r = 0;
+ {
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+
+ auto snap_info = m_image_ctx->get_snap_info(m_snap_id);
+ if (!snap_info) {
+ r = -ENOENT;
+ } else {
+ snap_namespace = snap_info->snap_namespace;
+ snap_name = snap_info->name;
+ }
+ }
+
+ if (r == -ENOENT) {
+ ldout(cct, 15) << "failed to locate snapshot " << m_snap_id << dendl;
+ finish(0);
+ return;
+ }
+
+ auto info = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ snap_namespace);
+
+ info.mirror_peer_uuids.erase(m_mirror_peer_uuid);
+ if (!info.mirror_peer_uuids.empty() || !m_newer_mirror_snapshots) {
+ ldout(cct, 15) << "skipping removal of snapshot: "
+ << "snap_id=" << m_snap_id << ": "
+ << "mirror_peer_uuid=" << m_mirror_peer_uuid << ", "
+ << "mirror_peer_uuids=" << info.mirror_peer_uuids << dendl;
+ finish(0);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ UnlinkPeerRequest<I>, &UnlinkPeerRequest<I>::handle_remove_snapshot>(this);
+ m_image_ctx->operations->snap_remove(snap_namespace, snap_name, ctx);
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::handle_remove_snapshot(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to remove snapshot: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void UnlinkPeerRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ auto on_finish = m_on_finish;
+ delete this;
+ on_finish->complete(r);
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::UnlinkPeerRequest<librbd::ImageCtx>;
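
The branching in unlink_peer() and remove_snapshot() above reduces to a small predicate: a linked peer is normally unlinked in place, but the whole snapshot is removed once the last peer reference disappears and a newer mirror snapshot exists to take its place. A sketch of that decision (hypothetical enum, standard containers):

    #include <set>
    #include <string>

    enum class Action { UNLINK_PEER, REMOVE_SNAPSHOT, NOTHING };

    Action choose(std::set<std::string> peers,      // peers linked to the snap
                  const std::string& peer,          // peer being unlinked
                  bool newer_mirror_snap_exists) {
      bool linked = peers.erase(peer) > 0;
      if (linked && !(peers.empty() && newer_mirror_snap_exists)) {
        return Action::UNLINK_PEER;                 // snapshot still needed
      }
      if (peers.empty() && newer_mirror_snap_exists) {
        return Action::REMOVE_SNAPSHOT;             // newer snap supersedes it
      }
      return Action::NOTHING;                       // peer absent; keep snapshot
    }
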
diff --git a/src/librbd/mirror/snapshot/UnlinkPeerRequest.h b/src/librbd/mirror/snapshot/UnlinkPeerRequest.h
new file mode 100644
index 000000000..9ef47269d
--- /dev/null
+++ b/src/librbd/mirror/snapshot/UnlinkPeerRequest.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_UNLINK_PEER_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_UNLINK_PEER_REQUEST_H
+
+#include "include/buffer.h"
+
+#include <string>
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class UnlinkPeerRequest {
+public:
+ static UnlinkPeerRequest *create(ImageCtxT *image_ctx, uint64_t snap_id,
+ const std::string &mirror_peer_uuid,
+ Context *on_finish) {
+ return new UnlinkPeerRequest(image_ctx, snap_id, mirror_peer_uuid,
+ on_finish);
+ }
+
+ UnlinkPeerRequest(ImageCtxT *image_ctx, uint64_t snap_id,
+ const std::string &mirror_peer_uuid, Context *on_finish)
+ : m_image_ctx(image_ctx), m_snap_id(snap_id),
+ m_mirror_peer_uuid(mirror_peer_uuid), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /*
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * REFRESH_IMAGE <--------------------------\
+ * | ^ (not found |
+ * | * or last) |
+ * | * |
+ * |\---------------> UNLINK_PEER --> NOTIFY_UPDATE
+ * | (not last peer or
+ * | no newer mirror
+ * | snap exists)
+ * |
+ * |\---------------> REMOVE_SNAPSHOT
+ * | (last peer and |
+ * | newer mirror |
+ * | snap exists) |
+ * | |
+ * |(peer not found) |
+ * v |
+ * <finish> <---------------/
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_snap_id;
+ std::string m_mirror_peer_uuid;
+ Context *m_on_finish;
+
+ bool m_newer_mirror_snapshots = false;
+
+ void refresh_image();
+ void handle_refresh_image(int r);
+
+ void unlink_peer();
+ void handle_unlink_peer(int r);
+
+ void notify_update();
+ void handle_notify_update(int r);
+
+ void remove_snapshot();
+ void handle_remove_snapshot(int r);
+
+ void finish(int r);
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::UnlinkPeerRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_UNLINK_PEER_REQUEST_H
diff --git a/src/librbd/mirror/snapshot/Utils.cc b/src/librbd/mirror/snapshot/Utils.cc
new file mode 100644
index 000000000..ecf884b54
--- /dev/null
+++ b/src/librbd/mirror/snapshot/Utils.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/mirror/snapshot/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::util: " \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+namespace util {
+
+namespace {
+
+const std::string IMAGE_STATE_OBJECT_PREFIX = "rbd_mirror_snapshot.";
+
+bool get_rollback_snap_id(
+ std::map<librados::snap_t, SnapInfo>::reverse_iterator it,
+ std::map<librados::snap_t, SnapInfo>::reverse_iterator end,
+ uint64_t *rollback_snap_id) {
+
+ for (; it != end; it++) {
+ auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &it->second.snap_namespace);
+ if (mirror_ns == nullptr) {
+ // only mirror snapshots are usable as rollback targets
+ return false;
+ }
+ if (mirror_ns->state != cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY) {
+ break;
+ }
+ if (mirror_ns->complete) {
+ break;
+ }
+ }
+
+ if (it != end) {
+ *rollback_snap_id = it->first;
+ return true;
+ }
+
+ return false;
+}
+
+} // anonymous namespace
+
+std::string get_image_meta_key(const std::string& mirror_uuid) {
+ return ".rbd_mirror." + mirror_uuid;
+}
+
+template <typename I>
+bool can_create_primary_snapshot(I *image_ctx, bool demoted, bool force,
+ bool* requires_orphan,
+ uint64_t *rollback_snap_id) {
+ CephContext *cct = image_ctx->cct;
+
+ if (requires_orphan != nullptr) {
+ *requires_orphan = false;
+ }
+ if (rollback_snap_id) {
+ *rollback_snap_id = CEPH_NOSNAP;
+ }
+
+ std::shared_lock image_locker{image_ctx->image_lock};
+
+ for (auto it = image_ctx->snap_info.rbegin();
+ it != image_ctx->snap_info.rend(); it++) {
+ auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &it->second.snap_namespace);
+ if (mirror_ns == nullptr) {
+ continue;
+ }
+ ldout(cct, 20) << "previous snapshot snap_id=" << it->first << " "
+ << *mirror_ns << dendl;
+ if (mirror_ns->is_demoted() && !force) {
+ lderr(cct) << "trying to create primary snapshot without force "
+ << "when previous primary snapshot is demoted"
+ << dendl;
+ return false;
+ }
+
+ if (mirror_ns->state == cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY) {
+ if (!force) {
+ lderr(cct) << "trying to create primary snapshot without force "
+ << "when previous snapshot is non-primary"
+ << dendl;
+ return false;
+ }
+ if (demoted) {
+ lderr(cct) << "trying to create primary demoted snapshot "
+ << "when previous snapshot is non-primary"
+ << dendl;
+ return false;
+ }
+
+ if (requires_orphan != nullptr) {
+ *requires_orphan = !mirror_ns->is_demoted();
+ }
+ if (!mirror_ns->complete) {
+ ldout(cct, 20) << "needs rollback" << dendl;
+ if (!rollback_snap_id) {
+ lderr(cct) << "trying to create primary snapshot "
+ << "when previous non-primary snapshot is not copied yet"
+ << dendl;
+ return false;
+ }
+ if (!get_rollback_snap_id(++it, image_ctx->snap_info.rend(),
+ rollback_snap_id)) {
+ lderr(cct) << "cannot rollback" << dendl;
+ return false;
+ }
+ ldout(cct, 20) << "rollback_snap_id=" << *rollback_snap_id << dendl;
+ }
+ return true;
+ }
+
+ return true;
+ }
+
+ ldout(cct, 20) << "no previous mirror snapshots found" << dendl;
+ return true;
+}
+
+template <typename I>
+bool can_create_non_primary_snapshot(I *image_ctx) {
+ CephContext *cct = image_ctx->cct;
+
+ std::shared_lock image_locker{image_ctx->image_lock};
+
+ for (auto it = image_ctx->snap_info.rbegin();
+ it != image_ctx->snap_info.rend(); it++) {
+ auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &it->second.snap_namespace);
+ if (mirror_ns != nullptr) {
+ ldout(cct, 20) << "previous mirror snapshot snap_id=" << it->first << " "
+ << *mirror_ns << dendl;
+
+ if (mirror_ns->state == cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY) {
+ if (!mirror_ns->complete) {
+ lderr(cct) << "trying to create non-primary snapshot "
+ << "when previous non-primary snapshot is not copied yet"
+ << dendl;
+ return false;
+ }
+ return true;
+ }
+
+ if (mirror_ns->state == cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY) {
+ lderr(cct) << "trying to create non-primary snapshot "
+ << "when previous primary snapshot is not in demoted state"
+ << dendl;
+ return false;
+ }
+ return true;
+ }
+ }
+
+ ldout(cct, 20) << "no previous mirror snapshots found" << dendl;
+ return true;
+}
+
+template <typename I>
+std::string image_state_object_name(I *image_ctx, uint64_t snap_id,
+ uint64_t index) {
+ return IMAGE_STATE_OBJECT_PREFIX + image_ctx->id + "." +
+ stringify(snap_id) + "." + stringify(index);
+}
+
+} // namespace util
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template bool librbd::mirror::snapshot::util::can_create_primary_snapshot(
+ librbd::ImageCtx *image_ctx, bool demoted, bool force,
+ bool* requires_orphan, uint64_t *rollback_snap_id);
+
+template bool librbd::mirror::snapshot::util::can_create_non_primary_snapshot(
+ librbd::ImageCtx *image_ctx);
+
+template std::string librbd::mirror::snapshot::util::image_state_object_name(
+ librbd::ImageCtx *image_ctx, uint64_t snap_id, uint64_t index);
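
get_rollback_snap_id() above scans snapshots from newest to oldest, skipping incomplete non-primary mirror snapshots; the first snapshot that is either complete or no longer non-primary becomes the rollback target, and exhausting the range means rollback is impossible. A standalone sketch of that scan:

    #include <cstdint>
    #include <map>
    #include <optional>

    struct MirrorSnap {
      bool non_primary;
      bool complete;
    };

    std::optional<uint64_t> rollback_target(
        const std::map<uint64_t, MirrorSnap>& snaps) {
      for (auto it = snaps.rbegin(); it != snaps.rend(); ++it) {
        if (!it->second.non_primary || it->second.complete) {
          return it->first;      // usable rollback point
        }
        // incomplete non-primary snapshot: keep scanning backwards
      }
      return std::nullopt;       // caller reports "cannot rollback"
    }
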
diff --git a/src/librbd/mirror/snapshot/Utils.h b/src/librbd/mirror/snapshot/Utils.h
new file mode 100644
index 000000000..127ec5865
--- /dev/null
+++ b/src/librbd/mirror/snapshot/Utils.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_UTILS_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_UTILS_H
+
+#include "include/int_types.h"
+#include "include/stringify.h"
+#include <string>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+namespace util {
+
+std::string get_image_meta_key(const std::string& mirror_uuid);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+bool can_create_primary_snapshot(ImageCtxT *image_ctx, bool demoted, bool force,
+ bool* requires_orphan,
+ uint64_t *rollback_snap_id);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+bool can_create_non_primary_snapshot(ImageCtxT *image_ctx);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+std::string image_state_object_name(ImageCtxT *image_ctx, uint64_t snap_id,
+ uint64_t index);
+
+} // namespace util
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_UTILS_H
diff --git a/src/librbd/mirror/snapshot/WriteImageStateRequest.cc b/src/librbd/mirror/snapshot/WriteImageStateRequest.cc
new file mode 100644
index 000000000..c79dd7e2c
--- /dev/null
+++ b/src/librbd/mirror/snapshot/WriteImageStateRequest.cc
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/snapshot/WriteImageStateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/snapshot/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::snapshot::WriteImageStateRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+namespace snapshot {
+
+namespace {
+
+static size_t header_length() {
+ bufferlist bl;
+ ImageStateHeader header;
+
+ using ceph::encode;
+ encode(header, bl);
+
+ return bl.length();
+}
+
+} // anonymous namespace
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+WriteImageStateRequest<I>::WriteImageStateRequest(I *image_ctx,
+ uint64_t snap_id,
+ const ImageState &image_state,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_snap_id(snap_id), m_image_state(image_state),
+ m_on_finish(on_finish), m_object_size(
+ 1 << image_ctx->config.template get_val<uint64_t>("rbd_default_order")) {
+ bufferlist bl;
+ encode(m_image_state, bl);
+
+ m_object_count = 1 + (header_length() + bl.length()) / m_object_size;
+ ImageStateHeader header(m_object_count);
+
+ encode(header, m_bl);
+ m_bl.claim_append(bl);
+}
+
+template <typename I>
+void WriteImageStateRequest<I>::send() {
+ write_object();
+}
+
+template <typename I>
+void WriteImageStateRequest<I>::write_object() {
+ CephContext *cct = m_image_ctx->cct;
+ ceph_assert(m_object_count > 0);
+
+ m_object_count--;
+
+ auto oid = util::image_state_object_name(m_image_ctx, m_snap_id,
+ m_object_count);
+ ldout(cct, 15) << oid << dendl;
+
+ size_t off = m_object_count * m_object_size;
+ size_t len = std::min(m_bl.length() - off, m_object_size);
+ bufferlist bl;
+ bl.substr_of(m_bl, off, len);
+
+ librados::ObjectWriteOperation op;
+ op.write_full(bl);
+
+ librados::AioCompletion *comp = create_rados_callback<
+ WriteImageStateRequest<I>,
+ &WriteImageStateRequest<I>::handle_write_object>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void WriteImageStateRequest<I>::handle_write_object(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to write object: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_object_count == 0) {
+ finish(0);
+ return;
+ }
+
+ write_object();
+}
+
+template <typename I>
+void WriteImageStateRequest<I>::finish(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 15) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::snapshot::WriteImageStateRequest<librbd::ImageCtx>;
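
The constructor above sizes the write fan-out as 1 + (header + payload) / object_size, so there is always at least one object and the last chunk may be shorter (or empty); write_object() then emits chunks from the highest index down. A sketch of the same slicing arithmetic on a plain string buffer:

    #include <algorithm>
    #include <cstddef>
    #include <string>
    #include <vector>

    std::vector<std::string> split_state(const std::string& header,
                                         const std::string& payload,
                                         size_t object_size) {
      std::string bl = header + payload;
      // same formula as the constructor above
      size_t count = 1 + bl.size() / object_size;
      std::vector<std::string> chunks(count);
      for (size_t i = count; i-- > 0; ) {         // written last-to-first above
        size_t off = i * object_size;
        size_t len = std::min(bl.size() - off, object_size);
        chunks[i] = bl.substr(off, len);
      }
      return chunks;
    }
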
diff --git a/src/librbd/mirror/snapshot/WriteImageStateRequest.h b/src/librbd/mirror/snapshot/WriteImageStateRequest.h
new file mode 100644
index 000000000..d2c4a7f80
--- /dev/null
+++ b/src/librbd/mirror/snapshot/WriteImageStateRequest.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_WRITE_IMAGE_STATE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_SNAPSHOT_WRITE_IMAGE_STATE_REQUEST_H
+
+#include "librbd/mirror/snapshot/Types.h"
+
+#include <map>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class WriteImageStateRequest {
+public:
+ static WriteImageStateRequest *create(ImageCtxT *image_ctx, uint64_t snap_id,
+ const ImageState &image_state,
+ Context *on_finish) {
+ return new WriteImageStateRequest(image_ctx, snap_id, image_state,
+ on_finish);
+ }
+
+ WriteImageStateRequest(ImageCtxT *image_ctx, uint64_t snap_id,
+ const ImageState &image_state, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * WRITE_OBJECT (repeat for
+ * | every object)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_snap_id;
+ ImageState m_image_state;
+ Context *m_on_finish;
+
+ bufferlist m_bl;
+
+ const size_t m_object_size;
+ size_t m_object_count = 0;
+
+ void write_object();
+ void handle_write_object(int r);
+
+ void finish(int r);
+};
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::snapshot::WriteImageStateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_WRITE_IMAGE_STATE_REQUEST_H
diff --git a/src/librbd/mirroring_watcher/Types.cc b/src/librbd/mirroring_watcher/Types.cc
new file mode 100644
index 000000000..3226b6352
--- /dev/null
+++ b/src/librbd/mirroring_watcher/Types.cc
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "librbd/mirroring_watcher/Types.h"
+#include "librbd/watcher/Utils.h"
+
+namespace librbd {
+namespace mirroring_watcher {
+
+namespace {
+
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ NotifyOp notify_op = Payload::NOTIFY_OP;
+ m_formatter->dump_string("notify_op", stringify(notify_op));
+ payload.dump(m_formatter);
+ }
+
+private:
+ ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void ModeUpdatedPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(mirror_mode), bl);
+}
+
+void ModeUpdatedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ uint32_t mirror_mode_decode;
+ decode(mirror_mode_decode, iter);
+ mirror_mode = static_cast<cls::rbd::MirrorMode>(mirror_mode_decode);
+}
+
+void ModeUpdatedPayload::dump(Formatter *f) const {
+ f->dump_stream("mirror_mode") << mirror_mode;
+}
+
+void ImageUpdatedPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(mirror_image_state), bl);
+ encode(image_id, bl);
+ encode(global_image_id, bl);
+}
+
+void ImageUpdatedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ uint32_t mirror_image_state_decode;
+ decode(mirror_image_state_decode, iter);
+ mirror_image_state = static_cast<cls::rbd::MirrorImageState>(
+ mirror_image_state_decode);
+ decode(image_id, iter);
+ decode(global_image_id, iter);
+}
+
+void ImageUpdatedPayload::dump(Formatter *f) const {
+ f->dump_stream("mirror_image_state") << mirror_image_state;
+ f->dump_string("image_id", image_id);
+ f->dump_string("global_image_id", global_image_id);
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload);
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(1, iter);
+
+ uint32_t notify_op;
+ decode(notify_op, iter);
+
+ // select the correct payload variant based upon the encoded op
+ switch (notify_op) {
+ case NOTIFY_OP_MODE_UPDATED:
+ payload = ModeUpdatedPayload();
+ break;
+ case NOTIFY_OP_IMAGE_UPDATED:
+ payload = ImageUpdatedPayload();
+ break;
+ default:
+ payload = UnknownPayload();
+ break;
+ }
+
+ apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload);
+ DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+ apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage(ModeUpdatedPayload(cls::rbd::MIRROR_MODE_DISABLED)));
+ o.push_back(new NotifyMessage(ImageUpdatedPayload(cls::rbd::MIRROR_IMAGE_STATE_DISABLING,
+ "image id", "global image id")));
+}
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+ switch (op) {
+ case NOTIFY_OP_MODE_UPDATED:
+ out << "ModeUpdated";
+ break;
+ case NOTIFY_OP_IMAGE_UPDATED:
+ out << "ImageUpdated";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+ break;
+ }
+ return out;
+}
+
+} // namespace mirroring_watcher
+} // namespace librbd
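
NotifyMessage::decode() above default-constructs the payload alternative selected by the encoded op code and then visits it to decode the body; unknown ops deliberately fall back to UnknownPayload so newer notifications do not break older clients. A sketch of the same dispatch with std::variant:

    #include <cstdint>
    #include <iostream>
    #include <variant>

    struct ModeUpdated  { static constexpr uint32_t OP = 0; };
    struct ImageUpdated { static constexpr uint32_t OP = 1; };
    struct Unknown      {};
    using Payload = std::variant<ModeUpdated, ImageUpdated, Unknown>;

    Payload make_payload(uint32_t notify_op) {
      switch (notify_op) {
      case ModeUpdated::OP:  return ModeUpdated{};
      case ImageUpdated::OP: return ImageUpdated{};
      default:               return Unknown{};   // forward-compatible fallback
      }
    }

    int main() {
      Payload p = make_payload(1);
      std::cout << "alternative index=" << p.index() << "\n";  // 1 -> ImageUpdated
      return 0;
    }
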
diff --git a/src/librbd/mirroring_watcher/Types.h b/src/librbd/mirroring_watcher/Types.h
new file mode 100644
index 000000000..1e096a9d3
--- /dev/null
+++ b/src/librbd/mirroring_watcher/Types.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
+#define CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+namespace mirroring_watcher {
+
+enum NotifyOp {
+ NOTIFY_OP_MODE_UPDATED = 0,
+ NOTIFY_OP_IMAGE_UPDATED = 1
+};
+
+struct ModeUpdatedPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_MODE_UPDATED;
+
+ cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+
+ ModeUpdatedPayload() {
+ }
+ ModeUpdatedPayload(cls::rbd::MirrorMode mirror_mode)
+ : mirror_mode(mirror_mode) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct ImageUpdatedPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_UPDATED;
+
+ cls::rbd::MirrorImageState mirror_image_state =
+ cls::rbd::MIRROR_IMAGE_STATE_ENABLED;
+ std::string image_id;
+ std::string global_image_id;
+
+ ImageUpdatedPayload() {
+ }
+ ImageUpdatedPayload(cls::rbd::MirrorImageState mirror_image_state,
+ const std::string &image_id,
+ const std::string &global_image_id)
+ : mirror_image_state(mirror_image_state), image_id(image_id),
+ global_image_id(global_image_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownPayload {
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
+ UnknownPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<ModeUpdatedPayload,
+ ImageUpdatedPayload,
+ UnknownPayload> Payload;
+
+struct NotifyMessage {
+ NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+ }
+
+ Payload payload;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+} // namespace mirroring_watcher
+} // namespace librbd
+
+using librbd::mirroring_watcher::encode;
+using librbd::mirroring_watcher::decode;
+
+#endif // CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
diff --git a/src/librbd/object_map/CreateRequest.cc b/src/librbd/object_map/CreateRequest.cc
new file mode 100644
index 000000000..d26f929fa
--- /dev/null
+++ b/src/librbd/object_map/CreateRequest.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/CreateRequest.h"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "osdc/Striper.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::CreateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+CreateRequest<I>::CreateRequest(I *image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void CreateRequest<I>::send() {
+ CephContext *cct = m_image_ctx->cct;
+
+ uint64_t max_size = m_image_ctx->size;
+
+ {
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ m_snap_ids.push_back(CEPH_NOSNAP);
+ for (auto it : m_image_ctx->snap_info) {
+ max_size = std::max(max_size, it.second.size);
+ m_snap_ids.push_back(it.first);
+ }
+
+ if (ObjectMap<>::is_compatible(m_image_ctx->layout, max_size)) {
+ send_object_map_resize();
+ return;
+ }
+ }
+
+ lderr(cct) << "image size not compatible with object map" << dendl;
+ m_on_finish->complete(-EINVAL);
+}
+
+template <typename I>
+void CreateRequest<I>::send_object_map_resize() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ CreateRequest<I>, &CreateRequest<I>::handle_object_map_resize>(this);
+ C_Gather *gather_ctx = new C_Gather(cct, ctx);
+
+ for (auto snap_id : m_snap_ids) {
+ librados::ObjectWriteOperation op;
+ uint64_t snap_size = m_image_ctx->get_image_size(snap_id);
+
+ cls_client::object_map_resize(&op, Striper::get_num_objects(
+ m_image_ctx->layout, snap_size),
+ OBJECT_NONEXISTENT);
+
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, snap_id));
+ librados::AioCompletion *comp = create_rados_callback(gather_ctx->new_sub());
+ int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+ gather_ctx->activate();
+}
+
+template <typename I>
+Context *CreateRequest<I>::handle_object_map_resize(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "object map resize failed: " << cpp_strerror(*result)
+ << dendl;
+ }
+ return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::CreateRequest<librbd::ImageCtx>;
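
send_object_map_resize() above fans one resize out per snapshot through a C_Gather: each new_sub() bumps a reference count, activate() drops the initial reference, and the parent callback fires once when the last sub-completion lands. A standalone sketch of that counting pattern (not Ceph's C_Gather implementation; here the first error wins):

    #include <atomic>
    #include <functional>
    #include <iostream>

    class Gather {     // must be heap-allocated; deletes itself when done
    public:
      explicit Gather(std::function<void(int)> on_finish)
        : m_on_finish(std::move(on_finish)) {}

      std::function<void(int)> new_sub() {
        ++m_pending;
        return [this](int r) { complete_sub(r); };
      }

      void activate() { complete_sub(0); }   // drop the activation reference

    private:
      void complete_sub(int r) {
        int expected = 0;
        if (r < 0) {
          m_result.compare_exchange_strong(expected, r);  // keep first error
        }
        if (--m_pending == 0) {
          m_on_finish(m_result.load());
          delete this;
        }
      }

      std::atomic<int> m_pending{1};   // +1 until activate(), like C_Gather
      std::atomic<int> m_result{0};
      std::function<void(int)> m_on_finish;
    };

    int main() {
      auto* gather = new Gather([](int r) { std::cout << "r=" << r << "\n"; });
      auto sub1 = gather->new_sub();
      auto sub2 = gather->new_sub();
      gather->activate();
      sub1(0);
      sub2(-5);    // prints r=-5 after the final sub completes
      return 0;
    }
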
diff --git a/src/librbd/object_map/CreateRequest.h b/src/librbd/object_map/CreateRequest.h
new file mode 100644
index 000000000..33984cda1
--- /dev/null
+++ b/src/librbd/object_map/CreateRequest.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H
+
+#include "include/buffer.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class CreateRequest {
+public:
+ static CreateRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+ return new CreateRequest(image_ctx, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . .
+ * v v .
+ * OBJECT_MAP_RESIZE . (for every snapshot)
+ * | . .
+ * v . . .
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ CreateRequest(ImageCtxT *image_ctx, Context *on_finish);
+
+ ImageCtxT *m_image_ctx;
+ Context *m_on_finish;
+
+ std::vector<uint64_t> m_snap_ids;
+
+ void send_object_map_resize();
+ Context *handle_object_map_resize(int *result);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::CreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H
diff --git a/src/librbd/object_map/DiffRequest.cc b/src/librbd/object_map/DiffRequest.cc
new file mode 100644
index 000000000..566e98ac0
--- /dev/null
+++ b/src/librbd/object_map/DiffRequest.cc
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/DiffRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "osdc/Striper.h"
+#include <string>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::DiffRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_callback;
+
+template <typename I>
+void DiffRequest<I>::send() {
+ auto cct = m_image_ctx->cct;
+
+ if (m_snap_id_start == CEPH_NOSNAP || m_snap_id_start > m_snap_id_end) {
+ lderr(cct) << "invalid start/end snap ids: "
+ << "snap_id_start=" << m_snap_id_start << ", "
+ << "snap_id_end=" << m_snap_id_end << dendl;
+ finish(-EINVAL);
+ return;
+ } else if (m_snap_id_start == m_snap_id_end) {
+ // no delta when start and end snapshots are identical
+ finish(0);
+ return;
+ }
+
+ m_object_diff_state->clear();
+
+ // collect all the snap ids in the provided range (inclusive)
+ if (m_snap_id_start != 0) {
+ m_snap_ids.insert(m_snap_id_start);
+ }
+
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ auto snap_info_it = m_image_ctx->snap_info.upper_bound(m_snap_id_start);
+ auto snap_info_it_end = m_image_ctx->snap_info.lower_bound(m_snap_id_end);
+ for (; snap_info_it != snap_info_it_end; ++snap_info_it) {
+ m_snap_ids.insert(snap_info_it->first);
+ }
+ m_snap_ids.insert(m_snap_id_end);
+
+ load_object_map(&image_locker);
+}
+
+template <typename I>
+void DiffRequest<I>::load_object_map(
+ std::shared_lock<ceph::shared_mutex>* image_locker) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+
+ if (m_snap_ids.empty()) {
+ image_locker->unlock();
+
+ finish(0);
+ return;
+ }
+
+ m_current_snap_id = *m_snap_ids.begin();
+ m_snap_ids.erase(m_current_snap_id);
+
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "snap_id=" << m_current_snap_id << dendl;
+
+ if ((m_image_ctx->features & RBD_FEATURE_FAST_DIFF) == 0) {
+ image_locker->unlock();
+
+ ldout(cct, 10) << "fast-diff feature not enabled" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ // ignore ENOENT with intermediate snapshots since deleted
+ // snaps will get merged with later snapshots
+ m_ignore_enoent = (m_current_snap_id != m_snap_id_start &&
+ m_current_snap_id != m_snap_id_end);
+
+ if (m_current_snap_id == CEPH_NOSNAP) {
+ m_current_size = m_image_ctx->size;
+ } else {
+ auto snap_it = m_image_ctx->snap_info.find(m_current_snap_id);
+ if (snap_it == m_image_ctx->snap_info.end()) {
+ ldout(cct, 10) << "snapshot " << m_current_snap_id << " does not exist"
+ << dendl;
+ if (!m_ignore_enoent) {
+ image_locker->unlock();
+
+ finish(-ENOENT);
+ return;
+ }
+
+ load_object_map(image_locker);
+ return;
+ }
+
+ m_current_size = snap_it->second.size;
+ }
+
+ uint64_t flags = 0;
+ int r = m_image_ctx->get_flags(m_current_snap_id, &flags);
+ if (r < 0) {
+ image_locker->unlock();
+
+ lderr(cct) << "failed to retrieve image flags: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+ image_locker->unlock();
+
+ if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
+ ldout(cct, 1) << "cannot perform fast diff on invalid object map"
+ << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id,
+ m_current_snap_id));
+
+ librados::ObjectReadOperation op;
+ cls_client::object_map_load_start(&op);
+
+ m_out_bl.clear();
+ auto aio_comp = create_rados_callback<
+ DiffRequest<I>, &DiffRequest<I>::handle_load_object_map>(this);
+ r = m_image_ctx->md_ctx.aio_operate(oid, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void DiffRequest<I>::handle_load_object_map(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto bl_it = m_out_bl.cbegin();
+ r = cls_client::object_map_load_finish(&bl_it, &m_object_map);
+ }
+
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id,
+ m_current_snap_id));
+ if (r == -ENOENT && m_ignore_enoent) {
+ ldout(cct, 10) << "object map " << oid << " does not exist" << dendl;
+
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ load_object_map(&image_locker);
+ return;
+ } else if (r < 0) {
+ lderr(cct) << "failed to load object map: " << oid << dendl;
+ finish(r);
+ return;
+ }
+ ldout(cct, 20) << "loaded object map " << oid << dendl;
+
+ uint64_t num_objs = Striper::get_num_objects(m_image_ctx->layout,
+ m_current_size);
+ if (m_object_map.size() < num_objs) {
+ ldout(cct, 1) << "object map too small: "
+ << m_object_map.size() << " < " << num_objs << dendl;
+ finish(-EINVAL);
+ return;
+ } else {
+ m_object_map.resize(num_objs);
+ }
+
+ size_t prev_object_diff_state_size = m_object_diff_state->size();
+ if (prev_object_diff_state_size < num_objs) {
+ // the diff state should be the largest of all snapshots in the set
+ m_object_diff_state->resize(num_objs);
+ }
+ if (m_object_map.size() < m_object_diff_state->size()) {
+ // the image was shrunk so expanding the object map will flag end objects
+ // as non-existent and they will be compared against the previous object
+ // diff state
+ m_object_map.resize(m_object_diff_state->size());
+ }
+
+ uint64_t overlap = std::min(m_object_map.size(), prev_object_diff_state_size);
+ auto it = m_object_map.begin();
+ auto overlap_end_it = it + overlap;
+ auto diff_it = m_object_diff_state->begin();
+ uint64_t i = 0;
+ for (; it != overlap_end_it; ++it, ++diff_it, ++i) {
+ uint8_t object_map_state = *it;
+ uint8_t prev_object_diff_state = *diff_it;
+ if (object_map_state == OBJECT_EXISTS ||
+ object_map_state == OBJECT_PENDING ||
+ (object_map_state == OBJECT_EXISTS_CLEAN &&
+ prev_object_diff_state != DIFF_STATE_DATA &&
+ prev_object_diff_state != DIFF_STATE_DATA_UPDATED)) {
+ *diff_it = DIFF_STATE_DATA_UPDATED;
+ } else if (object_map_state == OBJECT_NONEXISTENT &&
+ prev_object_diff_state != DIFF_STATE_HOLE &&
+ prev_object_diff_state != DIFF_STATE_HOLE_UPDATED) {
+ *diff_it = DIFF_STATE_HOLE_UPDATED;
+ }
+
+ ldout(cct, 20) << "object state: " << i << " "
+ << static_cast<uint32_t>(prev_object_diff_state)
+ << "->" << static_cast<uint32_t>(*diff_it) << " ("
+ << static_cast<uint32_t>(object_map_state) << ")"
+ << dendl;
+ }
+ ldout(cct, 20) << "computed overlap diffs" << dendl;
+
+ bool diff_from_start = (m_snap_id_start == 0);
+ auto end_it = m_object_map.end();
+ if (m_object_map.size() > prev_object_diff_state_size) {
+ for (; it != end_it; ++it, ++diff_it, ++i) {
+ uint8_t object_map_state = *it;
+ if (object_map_state == OBJECT_NONEXISTENT) {
+ *diff_it = DIFF_STATE_HOLE;
+ } else if (diff_from_start ||
+ (m_object_diff_state_valid &&
+ object_map_state != OBJECT_EXISTS_CLEAN)) {
+ *diff_it = DIFF_STATE_DATA_UPDATED;
+ } else {
+ *diff_it = DIFF_STATE_DATA;
+ }
+
+ ldout(cct, 20) << "object state: " << i << " "
+ << "->" << static_cast<uint32_t>(*diff_it) << " ("
+ << static_cast<uint32_t>(*it) << ")" << dendl;
+ }
+ }
+ ldout(cct, 20) << "computed resize diffs" << dendl;
+
+ m_object_diff_state_valid = true;
+
+ std::shared_lock image_locker{m_image_ctx->image_lock};
+ load_object_map(&image_locker);
+}
+
+template <typename I>
+void DiffRequest<I>::finish(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::DiffRequest<librbd::ImageCtx>;
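
The overlap loop in handle_load_object_map reduces to a per-object merge rule: existing or pending objects mark the diff as updated data, a clean object only upgrades the state when no data was recorded for it yet, and a nonexistent object punches an updated hole. A distilled sketch of that rule, with illustrative enum values standing in for the OBJECT_* and DIFF_STATE_* constants used above:

    #include <cstdint>
    #include <iostream>

    // Illustrative stand-ins for the object-map and diff-state constants.
    enum ObjState : uint8_t { OBJ_NONEXISTENT, OBJ_EXISTS, OBJ_PENDING,
                              OBJ_EXISTS_CLEAN };
    enum DiffState : uint8_t { DIFF_HOLE, DIFF_DATA, DIFF_HOLE_UPDATED,
                               DIFF_DATA_UPDATED };

    // Per-object merge of the current snapshot's state into the running
    // diff, mirroring the overlap loop above.
    DiffState merge(ObjState cur, DiffState prev) {
      if (cur == OBJ_EXISTS || cur == OBJ_PENDING ||
          (cur == OBJ_EXISTS_CLEAN &&
           prev != DIFF_DATA && prev != DIFF_DATA_UPDATED)) {
        return DIFF_DATA_UPDATED;
      }
      if (cur == OBJ_NONEXISTENT &&
          prev != DIFF_HOLE && prev != DIFF_HOLE_UPDATED) {
        return DIFF_HOLE_UPDATED;
      }
      return prev;  // clean data over recorded data, hole over hole: unchanged
    }

    int main() {
      std::cout << merge(OBJ_EXISTS_CLEAN, DIFF_DATA) << "\n";        // 1 (DATA)
      std::cout << merge(OBJ_EXISTS, DIFF_HOLE) << "\n";              // 3 (DATA_UPDATED)
      std::cout << merge(OBJ_NONEXISTENT, DIFF_DATA_UPDATED) << "\n"; // 2 (HOLE_UPDATED)
    }
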
diff --git a/src/librbd/object_map/DiffRequest.h b/src/librbd/object_map/DiffRequest.h
new file mode 100644
index 000000000..e83a1629e
--- /dev/null
+++ b/src/librbd/object_map/DiffRequest.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_DIFF_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_DIFF_REQUEST_H
+
+#include "include/int_types.h"
+#include "common/bit_vector.hpp"
+#include "common/ceph_mutex.h"
+#include "librbd/object_map/Types.h"
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT>
+class DiffRequest {
+public:
+ static DiffRequest* create(ImageCtxT* image_ctx, uint64_t snap_id_start,
+ uint64_t snap_id_end,
+ BitVector<2>* object_diff_state,
+ Context* on_finish) {
+ return new DiffRequest(image_ctx, snap_id_start, snap_id_end,
+ object_diff_state, on_finish);
+ }
+
+ DiffRequest(ImageCtxT* image_ctx, uint64_t snap_id_start,
+ uint64_t snap_id_end, BitVector<2>* object_diff_state,
+ Context* on_finish)
+ : m_image_ctx(image_ctx), m_snap_id_start(snap_id_start),
+ m_snap_id_end(snap_id_end), m_object_diff_state(object_diff_state),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | /---------\
+ * | | |
+ * v v |
+ * LOAD_OBJECT_MAP ---/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ ImageCtxT* m_image_ctx;
+ uint64_t m_snap_id_start;
+ uint64_t m_snap_id_end;
+ BitVector<2>* m_object_diff_state;
+ Context* m_on_finish;
+
+ std::set<uint64_t> m_snap_ids;
+ uint64_t m_current_snap_id = 0;
+ bool m_ignore_enoent = false;
+
+ uint64_t m_current_size = 0;
+
+ BitVector<2> m_object_map;
+ bool m_object_diff_state_valid = false;
+
+ bufferlist m_out_bl;
+
+ void load_object_map(std::shared_lock<ceph::shared_mutex>* image_locker);
+ void handle_load_object_map(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::DiffRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_DIFF_REQUEST_H
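
A hypothetical caller sketch for this API, assuming an already-open librbd::ImageCtx and the usual librbd build environment; C_SaferCond is used to wait synchronously. This is how such requests are typically driven, not code from this change:

    // Hypothetical usage sketch (assumes an open librbd::ImageCtx *ictx).
    #include "common/Cond.h"
    #include "common/bit_vector.hpp"
    #include "librbd/ImageCtx.h"
    #include "librbd/object_map/DiffRequest.h"

    int compute_diff(librbd::ImageCtx* ictx, uint64_t from_snap_id) {
      ceph::BitVector<2> diff_state;
      C_SaferCond ctx;
      auto* req = librbd::object_map::DiffRequest<librbd::ImageCtx>::create(
          ictx, from_snap_id, CEPH_NOSNAP, &diff_state, &ctx);
      req->send();        // the request deletes itself on completion
      return ctx.wait();  // 0 on success, -errno on failure
    }
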
diff --git a/src/librbd/object_map/InvalidateRequest.cc b/src/librbd/object_map/InvalidateRequest.cc
new file mode 100644
index 000000000..bf2db9660
--- /dev/null
+++ b/src/librbd/object_map/InvalidateRequest.cc
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/InvalidateRequest.h"
+#include "common/dout.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::InvalidateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+template <typename I>
+InvalidateRequest<I>* InvalidateRequest<I>::create(I &image_ctx,
+ uint64_t snap_id, bool force,
+ Context *on_finish) {
+ return new InvalidateRequest<I>(image_ctx, snap_id, force, on_finish);
+}
+
+template <typename I>
+void InvalidateRequest<I>::send() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ ceph_assert(ceph_mutex_is_wlocked(image_ctx.image_lock));
+
+ uint64_t snap_flags;
+ int r = image_ctx.get_flags(m_snap_id, &snap_flags);
+ if (r < 0 || ((snap_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0)) {
+ this->async_complete(r);
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ lderr(cct) << this << " invalidating object map in-memory" << dendl;
+
+ // update in-memory flags
+ uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID;
+ if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+ flags |= RBD_FLAG_FAST_DIFF_INVALID;
+ }
+
+ r = image_ctx.update_flags(m_snap_id, flags, true);
+ if (r < 0) {
+ this->async_complete(r);
+ return;
+ }
+
+ // do not update on-disk flags if not image owner
+ if (image_ctx.image_watcher == nullptr ||
+ (!m_force && m_snap_id == CEPH_NOSNAP &&
+ image_ctx.exclusive_lock != nullptr &&
+ !image_ctx.exclusive_lock->is_lock_owner())) {
+ this->async_complete(-EROFS);
+ return;
+ }
+
+ lderr(cct) << this << " invalidating object map on-disk" << dendl;
+ librados::ObjectWriteOperation op;
+ cls_client::set_flags(&op, m_snap_id, flags, flags);
+
+ librados::AioCompletion *rados_completion =
+ this->create_callback_completion();
+ r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion,
+ &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+bool InvalidateRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ lderr(cct) << this << " " << __func__ << ": r=" << r << dendl;
+ return true;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::InvalidateRequest<librbd::ImageCtx>;
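
The in-memory and on-disk updates above set the same flag mask: the object map is always marked invalid, and the fast-diff flag is added only when the feature is active. A small sketch of that composition; the flag and feature values below are illustrative, not the canonical RBD_FLAG_*/RBD_FEATURE_* definitions:

    #include <cstdint>
    #include <iostream>

    // Illustrative values; the real constants are defined elsewhere in librbd.
    constexpr uint64_t FLAG_OBJECT_MAP_INVALID = 1ULL << 0;
    constexpr uint64_t FLAG_FAST_DIFF_INVALID  = 1ULL << 1;
    constexpr uint64_t FEATURE_FAST_DIFF       = 1ULL << 5;

    // Mirrors the flag composition in InvalidateRequest<I>::send().
    uint64_t invalid_flags(uint64_t features) {
      uint64_t flags = FLAG_OBJECT_MAP_INVALID;
      if ((features & FEATURE_FAST_DIFF) != 0) {
        flags |= FLAG_FAST_DIFF_INVALID;  // fast diff depends on the map
      }
      return flags;
    }

    int main() {
      std::cout << invalid_flags(0) << "\n";                 // 1
      std::cout << invalid_flags(FEATURE_FAST_DIFF) << "\n"; // 3
    }
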
diff --git a/src/librbd/object_map/InvalidateRequest.h b/src/librbd/object_map/InvalidateRequest.h
new file mode 100644
index 000000000..ce15bb2d3
--- /dev/null
+++ b/src/librbd/object_map/InvalidateRequest.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class InvalidateRequest : public AsyncRequest<ImageCtxT> {
+public:
+ static InvalidateRequest* create(ImageCtxT &image_ctx, uint64_t snap_id,
+ bool force, Context *on_finish);
+
+ InvalidateRequest(ImageCtxT &image_ctx, uint64_t snap_id, bool force,
+ Context *on_finish)
+ : AsyncRequest<ImageCtxT>(image_ctx, on_finish),
+ m_snap_id(snap_id), m_force(force) {
+ }
+
+ void send() override;
+
+protected:
+ bool should_complete(int r) override;
+
+private:
+ uint64_t m_snap_id;
+ bool m_force;
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::InvalidateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
diff --git a/src/librbd/object_map/LockRequest.cc b/src/librbd/object_map/LockRequest.cc
new file mode 100644
index 000000000..b9dc3c42e
--- /dev/null
+++ b/src/librbd/object_map/LockRequest.cc
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/LockRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::LockRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_callback;
+
+template <typename I>
+LockRequest<I>::LockRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish), m_broke_lock(false) {
+}
+
+template <typename I>
+void LockRequest<I>::send() {
+ send_lock();
+}
+
+template <typename I>
+void LockRequest<I>::send_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+ ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::lock(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "", "",
+ utime_t(), 0);
+
+ using klass = LockRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_lock>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_lock(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ if (*ret_val == 0) {
+ return m_on_finish;
+ } else if (*ret_val == -EEXIST) {
+ // already locked by myself
+ *ret_val = 0;
+ return m_on_finish;
+ } else if (m_broke_lock || *ret_val != -EBUSY) {
+ lderr(cct) << "failed to lock object map: " << cpp_strerror(*ret_val)
+ << dendl;
+ *ret_val = 0;
+ return m_on_finish;
+ }
+
+ send_get_lock_info();
+ return nullptr;
+}
+
+template <typename I>
+void LockRequest<I>::send_get_lock_info() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+ ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+ librados::ObjectReadOperation op;
+ rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME);
+
+ using klass = LockRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_get_lock_info>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_get_lock_info(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ if (*ret_val == -ENOENT) {
+ send_lock();
+ return nullptr;
+ }
+
+ ClsLockType lock_type;
+ std::string lock_tag;
+ if (*ret_val == 0) {
+ auto it = m_out_bl.cbegin();
+ *ret_val = rados::cls::lock::get_lock_info_finish(&it, &m_lockers,
+ &lock_type, &lock_tag);
+ }
+ if (*ret_val < 0) {
+ lderr(cct) << "failed to list object map locks: " << cpp_strerror(*ret_val)
+ << dendl;
+ *ret_val = 0;
+ return m_on_finish;
+ }
+
+ send_break_locks();
+ return nullptr;
+}
+
+template <typename I>
+void LockRequest<I>::send_break_locks() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+ ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << ", "
+ << "num_lockers=" << m_lockers.size() << dendl;
+
+ librados::ObjectWriteOperation op;
+ for (auto &locker : m_lockers) {
+ rados::cls::lock::break_lock(&op, RBD_LOCK_NAME, locker.first.cookie,
+ locker.first.locker);
+ }
+
+ using klass = LockRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_break_locks>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_break_locks(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ m_broke_lock = true;
+ if (*ret_val == 0 || *ret_val == -ENOENT) {
+ send_lock();
+ return nullptr;
+ }
+
+ lderr(cct) << "failed to break object map lock: " << cpp_strerror(*ret_val)
+ << dendl;
+ *ret_val = 0;
+ return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::LockRequest<librbd::ImageCtx>;
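
The state machine above boils down to: try to lock; on EBUSY inspect the current lockers, break them once, and retry; every other outcome, including failure, completes with success so a stuck object-map lock can never wedge opening the image. A self-contained sketch of that control flow, where ToyLock is an illustrative stand-in for the cls_lock calls:

    #include <cerrno>
    #include <iostream>

    // Illustrative stand-in for the lock/get_lock_info/break_lock cls calls.
    struct ToyLock {
      int attempts = 0;
      int lock() { return ++attempts == 1 ? -EBUSY : 0; }
      int num_lockers() { return 1; }  // pretend one stale locker is found
      void break_locks() {}
    };

    // Distilled LockRequest control flow: lock; on EBUSY list and break the
    // stale locks once, then retry; all other errors are swallowed (returns 0).
    int acquire(ToyLock& l) {
      bool broke_lock = false;
      for (;;) {
        int r = l.lock();
        if (r == 0 || r == -EEXIST) {
          return 0;                    // locked, or we already own it
        }
        if (broke_lock || r != -EBUSY) {
          return 0;                    // error: log-and-continue in the original
        }
        if (l.num_lockers() == 0) {
          continue;                    // lock disappeared (ENOENT): retry lock
        }
        l.break_locks();
        broke_lock = true;             // only ever break once
      }
    }

    int main() {
      ToyLock l;
      std::cout << "acquire -> " << acquire(l) << "\n";  // acquire -> 0
    }
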
diff --git a/src/librbd/object_map/LockRequest.h b/src/librbd/object_map/LockRequest.h
new file mode 100644
index 000000000..0333548e6
--- /dev/null
+++ b/src/librbd/object_map/LockRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
+
+#include "include/buffer.h"
+#include "cls/lock/cls_lock_types.h"
+#include <map>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class LockRequest {
+public:
+ static LockRequest* create(ImageCtxT &image_ctx, Context *on_finish) {
+ return new LockRequest(image_ctx, on_finish);
+ }
+ LockRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> /------------------------------------- BREAK_LOCKS * * *
+ * | | ^ *
+ * | | | *
+ * | | | *
+ * | v (EBUSY && !broke_lock) | *
+ * \---------> LOCK_OBJECT_MAP * * * * * * * * * * * > GET_LOCK_INFO * *
+ * | * ^ * *
+ * | * * * *
+ * | * * (ENOENT) * *
+ * | * * * * * * * * * * * * * * * * * *
+ * | * *
+ * | * (other errors) *
+ * | * *
+ * v v (other errors) *
+ * <finish> < * * * * * * * * * * * * * * * * * * * * * * * *
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ bool m_broke_lock;
+ std::map<rados::cls::lock::locker_id_t,
+ rados::cls::lock::locker_info_t> m_lockers;
+ bufferlist m_out_bl;
+
+ void send_lock();
+ Context *handle_lock(int *ret_val);
+
+ void send_get_lock_info();
+ Context *handle_get_lock_info(int *ret_val);
+
+ void send_break_locks();
+ Context *handle_break_locks(int *ret_val);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::LockRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
diff --git a/src/librbd/object_map/RefreshRequest.cc b/src/librbd/object_map/RefreshRequest.cc
new file mode 100644
index 000000000..0f6b81923
--- /dev/null
+++ b/src/librbd/object_map/RefreshRequest.cc
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/RefreshRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "librbd/object_map/LockRequest.h"
+#include "librbd/object_map/ResizeRequest.h"
+#include "librbd/Utils.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::RefreshRequest: "
+
+namespace librbd {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+namespace object_map {
+
+template <typename I>
+RefreshRequest<I>::RefreshRequest(I &image_ctx, ceph::shared_mutex* object_map_lock,
+ ceph::BitVector<2> *object_map,
+ uint64_t snap_id, Context *on_finish)
+ : m_image_ctx(image_ctx), m_object_map_lock(object_map_lock),
+ m_object_map(object_map), m_snap_id(snap_id), m_on_finish(on_finish),
+ m_object_count(0), m_truncate_on_disk_object_map(false) {
+}
+
+template <typename I>
+void RefreshRequest<I>::send() {
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ m_object_count = Striper::get_num_objects(
+ m_image_ctx.layout, m_image_ctx.get_image_size(m_snap_id));
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": "
+ << "object_count=" << m_object_count << dendl;
+ send_lock();
+}
+
+template <typename I>
+void RefreshRequest<I>::apply() {
+ uint64_t num_objs;
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ num_objs = Striper::get_num_objects(
+ m_image_ctx.layout, m_image_ctx.get_image_size(m_snap_id));
+ }
+ ceph_assert(m_on_disk_object_map.size() >= num_objs);
+
+ std::unique_lock object_map_locker{*m_object_map_lock};
+ *m_object_map = m_on_disk_object_map;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_lock() {
+ CephContext *cct = m_image_ctx.cct;
+ if (m_object_count > cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT) {
+ send_invalidate_and_close();
+ return;
+ } else if (m_snap_id != CEPH_NOSNAP) {
+ send_load();
+ return;
+ }
+
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_lock>(this);
+
+ LockRequest<I> *req = LockRequest<I>::create(m_image_ctx, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_lock(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ ceph_assert(*ret_val == 0);
+ send_load();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_load() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::object_map_load_start(&op);
+
+ using klass = RefreshRequest<I>;
+ m_out_bl.clear();
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_load>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_load(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ if (*ret_val == 0) {
+ auto bl_it = m_out_bl.cbegin();
+ *ret_val = cls_client::object_map_load_finish(&bl_it,
+ &m_on_disk_object_map);
+ }
+
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ if (*ret_val == -EINVAL) {
+ // object map is corrupt on-disk -- clear it and properly size it
+ // so future IO can keep the object map in sync
+ lderr(cct) << "object map corrupt on-disk: " << oid << dendl;
+ m_truncate_on_disk_object_map = true;
+ send_resize_invalidate();
+ return nullptr;
+ } else if (*ret_val < 0) {
+ lderr(cct) << "failed to load object map: " << oid << dendl;
+ if (*ret_val == -ETIMEDOUT &&
+ !cct->_conf.get_val<bool>("rbd_invalidate_object_map_on_timeout")) {
+ return m_on_finish;
+ }
+
+ send_invalidate();
+ return nullptr;
+ }
+
+ if (m_on_disk_object_map.size() < m_object_count) {
+ lderr(cct) << "object map smaller than current object count: "
+ << m_on_disk_object_map.size() << " < "
+ << m_object_count << dendl;
+ send_resize_invalidate();
+ return nullptr;
+ }
+
+ ldout(cct, 20) << "refreshed object map: num_objs="
+ << m_on_disk_object_map.size() << dendl;
+ if (m_on_disk_object_map.size() > m_object_count) {
+ // resize op might have been interrupted
+ ldout(cct, 1) << "object map larger than current object count: "
+ << m_on_disk_object_map.size() << " > "
+ << m_object_count << dendl;
+ }
+
+ apply();
+ return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_invalidate() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_on_disk_object_map.clear();
+ object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count,
+ OBJECT_EXISTS);
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_invalidate>(this);
+ InvalidateRequest<I> *req = InvalidateRequest<I>::create(
+ m_image_ctx, m_snap_id, true, ctx);
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_invalidate(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ if (*ret_val < 0) {
+ lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val)
+ << dendl;
+ }
+
+ apply();
+ return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_resize_invalidate() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_on_disk_object_map.clear();
+ object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count,
+ OBJECT_EXISTS);
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_resize_invalidate>(this);
+ InvalidateRequest<I> *req = InvalidateRequest<I>::create(
+ m_image_ctx, m_snap_id, true, ctx);
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_resize_invalidate(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ if (*ret_val < 0) {
+ lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val)
+ << dendl;
+ apply();
+ return m_on_finish;
+ }
+
+ send_resize();
+ return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_resize() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+ librados::ObjectWriteOperation op;
+ if (m_snap_id == CEPH_NOSNAP) {
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "");
+ }
+ if (m_truncate_on_disk_object_map) {
+ op.truncate(0);
+ }
+ cls_client::object_map_resize(&op, m_object_count, OBJECT_NONEXISTENT);
+
+ using klass = RefreshRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_resize>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_resize(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ if (*ret_val < 0) {
+ lderr(cct) << "failed to adjust object map size: " << cpp_strerror(*ret_val)
+ << dendl;
+ *ret_val = 0;
+ }
+
+ apply();
+ return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_invalidate_and_close() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ using klass = RefreshRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_invalidate_and_close>(this);
+ InvalidateRequest<I> *req = InvalidateRequest<I>::create(
+ m_image_ctx, m_snap_id, false, ctx);
+
+ lderr(cct) << "object map too large: " << m_object_count << dendl;
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_invalidate_and_close(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ if (*ret_val < 0) {
+ lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val)
+ << dendl;
+ } else {
+ *ret_val = -EFBIG;
+ }
+
+ std::unique_lock object_map_locker{*m_object_map_lock};
+ m_object_map->clear();
+ return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::RefreshRequest<librbd::ImageCtx>;
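
Setting the -ETIMEDOUT carve-out aside, handle_load() reconciles the on-disk map against the expected object count in three ways: corruption (-EINVAL) truncates and rebuilds, any other load error invalidates in place, and a too-small map is also rebuilt, while a too-large map is tolerated because an interrupted resize may legitimately leave extra entries. A distilled sketch of that decision; the names are illustrative:

    #include <cerrno>
    #include <cstdint>
    #include <iostream>

    // Illustrative labels for the paths in RefreshRequest<I>::handle_load().
    enum class LoadAction {
      USE_AS_IS,              // map is usable (possibly larger than needed)
      RESIZE_AND_INVALIDATE,  // corrupt or too small: truncate, invalidate, resize
      INVALIDATE,             // load failed: invalidate, fall back to all-EXISTS
    };

    LoadAction classify_load(int r, uint64_t on_disk_objs,
                             uint64_t expected_objs) {
      if (r == -EINVAL) {
        return LoadAction::RESIZE_AND_INVALIDATE;  // corrupt on-disk map
      }
      if (r < 0) {
        return LoadAction::INVALIDATE;
      }
      if (on_disk_objs < expected_objs) {
        return LoadAction::RESIZE_AND_INVALIDATE;  // smaller than object count
      }
      return LoadAction::USE_AS_IS;                // equal or larger is accepted
    }

    int main() {
      std::cout << (classify_load(0, 8, 10) ==
                    LoadAction::RESIZE_AND_INVALIDATE) << "\n";  // 1
      std::cout << (classify_load(0, 12, 10) ==
                    LoadAction::USE_AS_IS) << "\n";              // 1
    }
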
diff --git a/src/librbd/object_map/RefreshRequest.h b/src/librbd/object_map/RefreshRequest.h
new file mode 100644
index 000000000..0bca85079
--- /dev/null
+++ b/src/librbd/object_map/RefreshRequest.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/bit_vector.hpp"
+#include "common/ceph_mutex.h"
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class RefreshRequest {
+public:
+ static RefreshRequest *create(ImageCtxT &image_ctx,
+ ceph::shared_mutex* object_map_lock,
+ ceph::BitVector<2> *object_map,
+ uint64_t snap_id, Context *on_finish) {
+ return new RefreshRequest(image_ctx, object_map_lock, object_map, snap_id,
+ on_finish);
+ }
+
+ RefreshRequest(ImageCtxT &image_ctx, ceph::shared_mutex* object_map_lock,
+ ceph::BitVector<2> *object_map, uint64_t snap_id,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> -----> LOCK (skip if snapshot)
+ * * |
+ * * v (other errors)
+ * * LOAD * * * * * * * > INVALIDATE ------------\
+ * * | * |
+ * * | * (-EINVAL or too small) |
+ * * | * * * * * * > INVALIDATE_AND_RESIZE |
+ * * | | * |
+ * * | | * |
+ * * | v * |
+ * * | RESIZE * |
+ * * | | * |
+ * * | | * * * * * * * |
+ * * | | * |
+ * * | v v |
+ * * \--------------------> LOCK <-------------/
+ * * |
+ * v v
+ * INVALIDATE_AND_CLOSE ---------------> <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ ceph::shared_mutex* m_object_map_lock;
+ ceph::BitVector<2> *m_object_map;
+ uint64_t m_snap_id;
+ Context *m_on_finish;
+
+ uint64_t m_object_count;
+ ceph::BitVector<2> m_on_disk_object_map;
+ bool m_truncate_on_disk_object_map;
+ bufferlist m_out_bl;
+
+ void send_lock();
+ Context *handle_lock(int *ret_val);
+
+ void send_load();
+ Context *handle_load(int *ret_val);
+
+ void send_invalidate();
+ Context *handle_invalidate(int *ret_val);
+
+ void send_resize_invalidate();
+ Context *handle_resize_invalidate(int *ret_val);
+
+ void send_resize();
+ Context *handle_resize(int *ret_val);
+
+ void send_invalidate_and_close();
+ Context *handle_invalidate_and_close(int *ret_val);
+
+ void apply();
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::RefreshRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
diff --git a/src/librbd/object_map/RemoveRequest.cc b/src/librbd/object_map/RemoveRequest.cc
new file mode 100644
index 000000000..a718d81fc
--- /dev/null
+++ b/src/librbd/object_map/RemoveRequest.cc
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/RemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::RemoveRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_callback;
+
+template <typename I>
+RemoveRequest<I>::RemoveRequest(I *image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void RemoveRequest<I>::send() {
+ send_remove_object_map();
+}
+
+template <typename I>
+void RemoveRequest<I>::send_remove_object_map() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << dendl;
+
+ std::unique_lock image_locker{m_image_ctx->image_lock};
+ std::vector<uint64_t> snap_ids;
+ snap_ids.push_back(CEPH_NOSNAP);
+ for (const auto& it : m_image_ctx->snap_info) {
+ snap_ids.push_back(it.first);
+ }
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_ref_counter == 0);
+
+ for (auto snap_id : snap_ids) {
+ m_ref_counter++;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, snap_id));
+ using klass = RemoveRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_remove_object_map>(this);
+
+ int r = m_image_ctx->md_ctx.aio_remove(oid, comp);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+}
+
+template <typename I>
+Context *RemoveRequest<I>::handle_remove_object_map(int *result) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << __func__ << ": r=" << *result << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_ref_counter > 0);
+ m_ref_counter--;
+
+ if (*result < 0 && *result != -ENOENT) {
+ lderr(cct) << "failed to remove object map: " << cpp_strerror(*result)
+ << dendl;
+ m_error_result = *result;
+ }
+ if (m_ref_counter > 0) {
+ return nullptr;
+ }
+ }
+ if (m_error_result < 0) {
+ *result = m_error_result;
+ }
+ return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::RemoveRequest<librbd::ImageCtx>;
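
Unlike CreateRequest's C_Gather, RemoveRequest tracks its parallel aio_remove calls with an explicit mutex-guarded reference counter, remembering the first real error and ignoring -ENOENT for maps that are already gone. The counting logic, distilled into a self-contained sketch:

    #include <cerrno>
    #include <iostream>
    #include <mutex>

    // Distilled completion accounting from RemoveRequest (illustration only).
    struct RemovalFanOut {
      std::mutex lock;
      int ref_counter = 0;
      int error_result = 0;

      void issue(int count) { ref_counter = count; }

      // Returns true when the last removal replies; *final_result then
      // holds the first recorded error (or 0 on success).
      bool on_removed(int r, int* final_result) {
        std::lock_guard<std::mutex> locker(lock);
        if (r < 0 && r != -ENOENT) {
          error_result = r;  // -ENOENT means the map was already gone
        }
        if (--ref_counter > 0) {
          return false;      // more completions still outstanding
        }
        *final_result = error_result;
        return true;
      }
    };

    int main() {
      RemovalFanOut fan_out;
      fan_out.issue(3);
      int result = 0;
      fan_out.on_removed(0, &result);
      fan_out.on_removed(-ENOENT, &result);             // tolerated
      if (fan_out.on_removed(-EIO, &result)) {          // last reply closes out
        std::cout << "finished, r=" << result << "\n";  // finished, r=-5
      }
    }
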
diff --git a/src/librbd/object_map/RemoveRequest.h b/src/librbd/object_map/RemoveRequest.h
new file mode 100644
index 000000000..ce82e603c
--- /dev/null
+++ b/src/librbd/object_map/RemoveRequest.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H
+
+#include "include/buffer.h"
+#include "common/ceph_mutex.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class RemoveRequest {
+public:
+ static RemoveRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+ return new RemoveRequest(image_ctx, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . .
+ * v v .
+ * REMOVE_OBJECT_MAP . (for every snapshot)
+ * | . .
+ * v . . .
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ RemoveRequest(ImageCtxT *image_ctx, Context *on_finish);
+
+ ImageCtxT *m_image_ctx;
+ Context *m_on_finish;
+
+ int m_error_result = 0;
+ int m_ref_counter = 0;
+ mutable ceph::mutex m_lock =
+ ceph::make_mutex("object_map::RemoveRequest::m_lock");
+
+ void send_remove_object_map();
+ Context *handle_remove_object_map(int *result);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H
diff --git a/src/librbd/object_map/Request.cc b/src/librbd/object_map/Request.cc
new file mode 100644
index 000000000..1e1aab2ae
--- /dev/null
+++ b/src/librbd/object_map/Request.cc
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/Request.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/RWLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/object_map/InvalidateRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::Request: "
+
+namespace librbd {
+namespace object_map {
+
+bool Request::should_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << this << " should_complete: r=" << r << dendl;
+
+ switch (m_state)
+ {
+ case STATE_REQUEST:
+ if (r == -ETIMEDOUT &&
+ !cct->_conf.get_val<bool>("rbd_invalidate_object_map_on_timeout")) {
+ m_state = STATE_TIMEOUT;
+ return true;
+ } else if (r < 0) {
+ lderr(cct) << "failed to update object map: " << cpp_strerror(r)
+ << dendl;
+ return invalidate();
+ }
+
+ finish_request();
+ return true;
+
+ case STATE_INVALIDATE:
+ ldout(cct, 20) << "INVALIDATE" << dendl;
+ if (r < 0) {
+ lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r)
+ << dendl;
+ }
+ return true;
+
+ default:
+ lderr(cct) << "invalid state: " << m_state << dendl;
+ ceph_abort();
+ break;
+ }
+ return false;
+}
+
+bool Request::invalidate() {
+ bool flags_set;
+ int r = m_image_ctx.test_flags(m_snap_id, RBD_FLAG_OBJECT_MAP_INVALID,
+ &flags_set);
+ if (r < 0 || flags_set) {
+ return true;
+ }
+
+ m_state = STATE_INVALIDATE;
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id,
+ true,
+ create_callback_context());
+ req->send();
+ return false;
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/Request.h b/src/librbd/object_map/Request.h
new file mode 100644
index 000000000..7e9bfb88d
--- /dev/null
+++ b/src/librbd/object_map/Request.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class Request : public AsyncRequest<> {
+public:
+ Request(ImageCtx &image_ctx, uint64_t snap_id, Context *on_finish)
+ : AsyncRequest(image_ctx, on_finish), m_snap_id(snap_id),
+ m_state(STATE_REQUEST)
+ {
+ }
+
+ void send() override = 0;
+
+protected:
+ const uint64_t m_snap_id;
+
+ bool should_complete(int r) override;
+ int filter_return_code(int r) const override {
+ if (m_state == STATE_REQUEST) {
+ // never propagate an error back to the caller
+ return 0;
+ }
+ return r;
+ }
+ virtual void finish_request() {
+ }
+
+private:
+ /**
+ * STATE_TIMEOUT --------\
+ * ^ |
+ * | v
+ * <start> ---> STATE_REQUEST ---> <finish>
+ * | ^
+ * v |
+ * STATE_INVALIDATE -------/
+ */
+ enum State {
+ STATE_REQUEST,
+ STATE_TIMEOUT,
+ STATE_INVALIDATE
+ };
+
+ State m_state;
+
+ bool invalidate();
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
diff --git a/src/librbd/object_map/ResizeRequest.cc b/src/librbd/object_map/ResizeRequest.cc
new file mode 100644
index 000000000..91a3140ed
--- /dev/null
+++ b/src/librbd/object_map/ResizeRequest.cc
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/ResizeRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "cls/lock/cls_lock_client.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::ResizeRequest: "
+
+namespace librbd {
+namespace object_map {
+
+void ResizeRequest::resize(ceph::BitVector<2> *object_map, uint64_t num_objs,
+ uint8_t default_state) {
+ size_t orig_object_map_size = object_map->size();
+ object_map->resize(num_objs);
+ if (num_objs > orig_object_map_size) {
+ auto it = object_map->begin() + orig_object_map_size;
+ auto end_it = object_map->begin() + num_objs;
+ for (;it != end_it; ++it) {
+ *it = default_state;
+ }
+ }
+}
+
+void ResizeRequest::send() {
+ CephContext *cct = m_image_ctx.cct;
+
+ std::unique_lock l{*m_object_map_lock};
+ m_num_objs = Striper::get_num_objects(m_image_ctx.layout, m_new_size);
+
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 5) << this << " resizing on-disk object map: "
+ << "ictx=" << &m_image_ctx << ", "
+ << "oid=" << oid << ", num_objs=" << m_num_objs << dendl;
+
+ librados::ObjectWriteOperation op;
+ if (m_snap_id == CEPH_NOSNAP) {
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "");
+ }
+ cls_client::object_map_resize(&op, m_num_objs, m_default_object_state);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+void ResizeRequest::finish_request() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " resizing in-memory object map: "
+ << m_num_objs << dendl;
+
+ std::unique_lock object_map_locker{*m_object_map_lock};
+ resize(m_object_map, m_num_objs, m_default_object_state);
+}
+
+} // namespace object_map
+} // namespace librbd
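
ResizeRequest::resize() grows or shrinks the in-memory map, filling any newly added tail entries with the supplied default state. The same operation with std::vector<uint8_t> standing in for ceph::BitVector<2>:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Mirrors ResizeRequest::resize(): shrinking truncates, growing fills
    // the new tail with default_state.
    void resize_map(std::vector<uint8_t>* object_map, uint64_t num_objs,
                    uint8_t default_state) {
      size_t orig_size = object_map->size();
      object_map->resize(num_objs);
      for (size_t i = orig_size; i < object_map->size(); ++i) {
        (*object_map)[i] = default_state;
      }
    }

    int main() {
      std::vector<uint8_t> map(2, 1);  // two objects, both "exists"
      resize_map(&map, 5, 0);          // grow: new objects are "nonexistent"
      for (auto state : map) {
        std::cout << int(state) << ' ';
      }
      std::cout << '\n';               // 1 1 0 0 0
    }
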
diff --git a/src/librbd/object_map/ResizeRequest.h b/src/librbd/object_map/ResizeRequest.h
new file mode 100644
index 000000000..dccdef133
--- /dev/null
+++ b/src/librbd/object_map/ResizeRequest.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+#include "common/bit_vector.hpp"
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class ResizeRequest : public Request {
+public:
+ ResizeRequest(ImageCtx &image_ctx, ceph::shared_mutex *object_map_lock,
+ ceph::BitVector<2> *object_map, uint64_t snap_id,
+ uint64_t new_size, uint8_t default_object_state,
+ Context *on_finish)
+ : Request(image_ctx, snap_id, on_finish),
+ m_object_map_lock(object_map_lock), m_object_map(object_map),
+ m_num_objs(0), m_new_size(new_size),
+ m_default_object_state(default_object_state)
+ {
+ }
+
+ static void resize(ceph::BitVector<2> *object_map, uint64_t num_objs,
+ uint8_t default_state);
+
+ void send() override;
+
+protected:
+ void finish_request() override;
+
+private:
+ ceph::shared_mutex* m_object_map_lock;
+ ceph::BitVector<2> *m_object_map;
+ uint64_t m_num_objs;
+ uint64_t m_new_size;
+ uint8_t m_default_object_state;
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
diff --git a/src/librbd/object_map/SnapshotCreateRequest.cc b/src/librbd/object_map/SnapshotCreateRequest.cc
new file mode 100644
index 000000000..3b2e7ee82
--- /dev/null
+++ b/src/librbd/object_map/SnapshotCreateRequest.cc
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotCreateRequest.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "cls/lock/cls_lock_client.h"
+#include <iostream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotCreateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+std::ostream& operator<<(std::ostream& os,
+ const SnapshotCreateRequest::State& state) {
+ switch(state) {
+ case SnapshotCreateRequest::STATE_READ_MAP:
+ os << "READ_MAP";
+ break;
+ case SnapshotCreateRequest::STATE_WRITE_MAP:
+ os << "WRITE_MAP";
+ break;
+ case SnapshotCreateRequest::STATE_ADD_SNAPSHOT:
+ os << "ADD_SNAPSHOT";
+ break;
+ default:
+ os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+ break;
+ }
+ return os;
+}
+
+} // anonymous namespace
+
+void SnapshotCreateRequest::send() {
+ send_read_map();
+}
+
+bool SnapshotCreateRequest::should_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+ << "r=" << r << dendl;
+ if (r < 0 && m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ if (m_ret_val < 0) {
+ // pass errors down to base class to invalidate the object map
+ return Request::should_complete(r);
+ }
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ bool finished = false;
+ switch (m_state) {
+ case STATE_READ_MAP:
+ send_write_map();
+ break;
+ case STATE_WRITE_MAP:
+ finished = send_add_snapshot();
+ break;
+ case STATE_ADD_SNAPSHOT:
+ update_object_map();
+ finished = true;
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ return finished;
+}
+
+void SnapshotCreateRequest::send_read_map() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+ ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl;
+ m_state = STATE_READ_MAP;
+
+ // IO is blocked during snapshot creation, so reading the map from disk is consistent
+ librados::ObjectReadOperation op;
+ op.read(0, 0, NULL, NULL);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op,
+ &m_read_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+void SnapshotCreateRequest::send_write_map() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+ << dendl;
+ m_state = STATE_WRITE_MAP;
+
+ librados::ObjectWriteOperation op;
+ op.write_full(m_read_bl);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+bool SnapshotCreateRequest::send_add_snapshot() {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) == 0) {
+ return true;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+ ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl;
+ m_state = STATE_ADD_SNAPSHOT;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "");
+ cls_client::object_map_snap_add(&op);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+ return false;
+}
+
+void SnapshotCreateRequest::update_object_map() {
+ std::unique_lock object_map_locker{*m_object_map_lock};
+
+ auto it = m_object_map.begin();
+ auto end_it = m_object_map.end();
+ for (; it != end_it; ++it) {
+ if (*it == OBJECT_EXISTS) {
+ *it = OBJECT_EXISTS_CLEAN;
+ }
+ }
+}
+
+} // namespace object_map
+} // namespace librbd
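
update_object_map() demotes every OBJECT_EXISTS entry to OBJECT_EXISTS_CLEAN once the snapshot exists, so subsequent writes can flag exactly which objects diverged from the snapshot. The transform in isolation; the state values below are illustrative stand-ins for the OBJECT_* constants:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Illustrative state values standing in for the OBJECT_* constants.
    constexpr uint8_t EXISTS = 1;
    constexpr uint8_t EXISTS_CLEAN = 3;

    // After a snapshot, every written object becomes "clean": it exists but
    // matches the snapshot until the next write dirties it again.
    void mark_clean(std::vector<uint8_t>& object_map) {
      for (auto& state : object_map) {
        if (state == EXISTS) {
          state = EXISTS_CLEAN;
        }
      }
    }

    int main() {
      std::vector<uint8_t> map{1, 0, 1, 3};
      mark_clean(map);
      for (auto s : map) std::cout << int(s) << ' ';
      std::cout << '\n';  // 3 0 3 3
    }
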
diff --git a/src/librbd/object_map/SnapshotCreateRequest.h b/src/librbd/object_map/SnapshotCreateRequest.h
new file mode 100644
index 000000000..3074d059d
--- /dev/null
+++ b/src/librbd/object_map/SnapshotCreateRequest.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "common/bit_vector.hpp"
+#include "librbd/object_map/Request.h"
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class SnapshotCreateRequest : public Request {
+public:
+ /**
+ * Snapshot create goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_READ_MAP
+ * |
+ * v (skip)
+ * STATE_WRITE_MAP . . . . . . .
+ * | .
+ * v v
+ * STATE_ADD_SNAPSHOT ---> <finish>
+ *
+ * @endverbatim
+ *
+ * The _ADD_SNAPSHOT state is skipped if the FAST_DIFF feature isn't enabled.
+ */
+ enum State {
+ STATE_READ_MAP,
+ STATE_WRITE_MAP,
+ STATE_ADD_SNAPSHOT
+ };
+
+ SnapshotCreateRequest(ImageCtx &image_ctx, ceph::shared_mutex* object_map_lock,
+ ceph::BitVector<2> *object_map, uint64_t snap_id,
+ Context *on_finish)
+ : Request(image_ctx, snap_id, on_finish),
+ m_object_map_lock(object_map_lock), m_object_map(*object_map),
+ m_ret_val(0) {
+ }
+
+ void send() override;
+
+protected:
+ bool should_complete(int r) override;
+
+private:
+ ceph::shared_mutex* m_object_map_lock;
+ ceph::BitVector<2> &m_object_map;
+
+ State m_state = STATE_READ_MAP;
+ bufferlist m_read_bl;
+ int m_ret_val;
+
+ void send_read_map();
+ void send_write_map();
+ bool send_add_snapshot();
+
+ void update_object_map();
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
diff --git a/src/librbd/object_map/SnapshotRemoveRequest.cc b/src/librbd/object_map/SnapshotRemoveRequest.cc
new file mode 100644
index 000000000..1c2ffc753
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRemoveRequest.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "cls/lock/cls_lock_client.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotRemoveRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace object_map {
+
+void SnapshotRemoveRequest::send() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(ceph_mutex_is_wlocked(m_image_ctx.image_lock));
+
+ if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+ int r = m_image_ctx.get_flags(m_snap_id, &m_flags);
+ ceph_assert(r == 0);
+
+ compute_next_snap_id();
+ load_map();
+ } else {
+ remove_map();
+ }
+}
+
+void SnapshotRemoveRequest::load_map() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 5) << "snap_oid=" << snap_oid << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::object_map_load_start(&op);
+
+ auto rados_completion = librbd::util::create_rados_callback<
+ SnapshotRemoveRequest, &SnapshotRemoveRequest::handle_load_map>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+void SnapshotRemoveRequest::handle_load_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = cls_client::object_map_load_finish(&it, &m_snap_object_map);
+ }
+ if (r == -ENOENT) {
+ // implies we have already deleted this snapshot and handled the
+ // necessary fast-diff cleanup
+ complete(0);
+ return;
+ } else if (r < 0) {
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ lderr(cct) << "failed to load object map " << oid << ": "
+ << cpp_strerror(r) << dendl;
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ invalidate_next_map();
+ return;
+ }
+
+ remove_snapshot();
+}
+
+void SnapshotRemoveRequest::remove_snapshot() {
+ if ((m_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
+ // the snapshot's object map exists on disk but is flagged invalid, so the
+ // fast-diff state of the next snapshot cannot be cleaned up either
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ invalidate_next_map();
+ return;
+ }
+
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_next_snap_id));
+ ldout(cct, 5) << "oid=" << oid << dendl;
+
+ librados::ObjectWriteOperation op;
+ if (m_next_snap_id == CEPH_NOSNAP) {
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "");
+ }
+ cls_client::object_map_snap_remove(&op, m_snap_object_map);
+
+ auto rados_completion = librbd::util::create_rados_callback<
+ SnapshotRemoveRequest,
+ &SnapshotRemoveRequest::handle_remove_snapshot>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+void SnapshotRemoveRequest::handle_remove_snapshot(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+ if (r < 0 && r != -ENOENT) {
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id,
+ m_next_snap_id));
+ lderr(cct) << "failed to remove object map snapshot " << oid << ": "
+ << cpp_strerror(r) << dendl;
+
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ invalidate_next_map();
+ return;
+ }
+
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ update_object_map();
+ remove_map();
+}
+
+void SnapshotRemoveRequest::invalidate_next_map() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ ceph_assert(ceph_mutex_is_wlocked(m_image_ctx.image_lock));
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = librbd::util::create_context_callback<
+ SnapshotRemoveRequest,
+ &SnapshotRemoveRequest::handle_invalidate_next_map>(this);
+ InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx,
+ m_next_snap_id, true, ctx);
+ req->send();
+}
+
+void SnapshotRemoveRequest::handle_invalidate_next_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id,
+ m_next_snap_id));
+ lderr(cct) << "failed to invalidate object map " << oid << ": "
+ << cpp_strerror(r) << dendl;
+ complete(r);
+ return;
+ }
+
+ remove_map();
+}
+
+void SnapshotRemoveRequest::remove_map() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 5) << "oid=" << oid << dendl;
+
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ auto rados_completion = librbd::util::create_rados_callback<
+ SnapshotRemoveRequest, &SnapshotRemoveRequest::handle_remove_map>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+void SnapshotRemoveRequest::handle_remove_map(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ lderr(cct) << "failed to remove object map " << oid << ": "
+ << cpp_strerror(r) << dendl;
+ complete(r);
+ return;
+ }
+
+ complete(0);
+}
+
+void SnapshotRemoveRequest::compute_next_snap_id() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+
+ m_next_snap_id = CEPH_NOSNAP;
+ std::map<librados::snap_t, SnapInfo>::const_iterator it =
+ m_image_ctx.snap_info.find(m_snap_id);
+ ceph_assert(it != m_image_ctx.snap_info.end());
+
+ ++it;
+ if (it != m_image_ctx.snap_info.end()) {
+ m_next_snap_id = it->first;
+ }
+}
+
+void SnapshotRemoveRequest::update_object_map() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ std::unique_lock object_map_locker{*m_object_map_lock};
+ if (m_next_snap_id == m_image_ctx.snap_id && m_next_snap_id == CEPH_NOSNAP) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ auto it = m_object_map.begin();
+ auto end_it = m_object_map.end();
+ auto snap_it = m_snap_object_map.begin();
+ uint64_t i = 0;
+ for (; it != end_it; ++it) {
+ if (*it == OBJECT_EXISTS_CLEAN &&
+ (i >= m_snap_object_map.size() ||
+ *snap_it == OBJECT_EXISTS)) {
+ *it = OBJECT_EXISTS;
+ }
+ if (i < m_snap_object_map.size()) {
+ ++snap_it;
+ }
+ ++i;
+ }
+ }
+}
+
+} // namespace object_map
+} // namespace librbd
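
The EXISTS_CLEAN promotion above is the subtle part of removing the snapshot directly beneath HEAD: a HEAD entry may stay clean only if the removed snapshot itself recorded the object as clean. A minimal standalone sketch of that merge rule, using plain std::vector in place of ceph::BitVector<2> and placeholder state values:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Placeholder state values; the real constants live in
    // include/rbd/object_map_types.h.
    constexpr uint8_t OBJECT_EXISTS       = 1;
    constexpr uint8_t OBJECT_EXISTS_CLEAN = 3;

    // Mirrors SnapshotRemoveRequest::update_object_map(): a HEAD entry may
    // remain EXISTS_CLEAN only if the snapshot being removed also recorded
    // the object as clean; otherwise HEAD loses its "clean" reference point
    // and the entry is promoted back to EXISTS.
    void promote_clean_entries(std::vector<uint8_t>& head_map,
                               const std::vector<uint8_t>& snap_map) {
      for (std::size_t i = 0; i < head_map.size(); ++i) {
        if (head_map[i] == OBJECT_EXISTS_CLEAN &&
            (i >= snap_map.size() || snap_map[i] == OBJECT_EXISTS)) {
          head_map[i] = OBJECT_EXISTS;
        }
      }
    }
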
diff --git a/src/librbd/object_map/SnapshotRemoveRequest.h b/src/librbd/object_map/SnapshotRemoveRequest.h
new file mode 100644
index 000000000..1e9c75d81
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRemoveRequest.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/bit_vector.hpp"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+namespace object_map {
+
+class SnapshotRemoveRequest : public AsyncRequest<> {
+public:
+ /**
+ * Snapshot removal goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start> -----------> STATE_LOAD_MAP ----\
+ * . * |
+ * . * (error) |
+ * . (invalid object map) v |
+ * . . . > STATE_INVALIDATE_NEXT_MAP |
+ * . | |
+ * . | |
+ * . (fast diff disabled) v v
+ * . . . . . . . . . . > STATE_REMOVE_MAP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ * The _LOAD_MAP state is skipped if the fast diff feature is disabled.
+ * If the fast diff feature is enabled and the snapshot is flagged as
+ * invalid, the next snapshot / HEAD object map is flagged as invalid;
+ * otherwise, the state machine proceeds to remove the object map.
+ */
+
+ SnapshotRemoveRequest(ImageCtx &image_ctx, ceph::shared_mutex* object_map_lock,
+ ceph::BitVector<2> *object_map, uint64_t snap_id,
+ Context *on_finish)
+ : AsyncRequest(image_ctx, on_finish),
+ m_object_map_lock(object_map_lock), m_object_map(*object_map),
+ m_snap_id(snap_id), m_next_snap_id(CEPH_NOSNAP) {
+ }
+
+ void send() override;
+
+protected:
+ bool should_complete(int r) override {
+ return true;
+ }
+
+private:
+ ceph::shared_mutex* m_object_map_lock;
+ ceph::BitVector<2> &m_object_map;
+ uint64_t m_snap_id;
+ uint64_t m_next_snap_id;
+
+ uint64_t m_flags = 0;
+
+ ceph::BitVector<2> m_snap_object_map;
+ bufferlist m_out_bl;
+
+ void load_map();
+ void handle_load_map(int r);
+
+ void remove_snapshot();
+ void handle_remove_snapshot(int r);
+
+ void invalidate_next_map();
+ void handle_invalidate_next_map(int r);
+
+ void remove_map();
+ void handle_remove_map(int r);
+
+ void compute_next_snap_id();
+ void update_object_map();
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
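
compute_next_snap_id() relies on ImageCtx::snap_info being an ordered map, so the snapshot that inherits the removed snapshot's fast-diff state is simply the next key, falling back to the HEAD map (CEPH_NOSNAP) when the removed snapshot was the newest. A standalone sketch of the lookup, with a placeholder NOSNAP sentinel:

    #include <cstdint>
    #include <map>
    #include <string>

    constexpr uint64_t NOSNAP = UINT64_MAX;  // stand-in for CEPH_NOSNAP (HEAD)

    // Mirrors compute_next_snap_id(): snapshots are kept in a std::map keyed
    // by snap id, so the "next" snapshot is the following key; if the removed
    // snapshot was the newest, the next map is the HEAD object map.
    uint64_t next_snap_id(const std::map<uint64_t, std::string>& snaps,
                          uint64_t snap_id) {
      auto it = snaps.find(snap_id);
      if (it == snaps.end()) {
        return NOSNAP;  // defensive; the real request asserts it exists
      }
      ++it;
      return it == snaps.end() ? NOSNAP : it->first;
    }
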
diff --git a/src/librbd/object_map/SnapshotRollbackRequest.cc b/src/librbd/object_map/SnapshotRollbackRequest.cc
new file mode 100644
index 000000000..7c2f441cc
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRollbackRequest.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotRollbackRequest.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include <iostream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotRollbackRequest: "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+std::ostream& operator<<(std::ostream& os,
+ const SnapshotRollbackRequest::State& state) {
+ switch(state) {
+ case SnapshotRollbackRequest::STATE_READ_MAP:
+ os << "READ_MAP";
+ break;
+ case SnapshotRollbackRequest::STATE_INVALIDATE_MAP:
+ os << "INVALIDATE_MAP";
+ break;
+ case SnapshotRollbackRequest::STATE_WRITE_MAP:
+ os << "WRITE_MAP";
+ break;
+ default:
+ os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+ break;
+ }
+ return os;
+}
+
+} // anonymous namespace
+
+void SnapshotRollbackRequest::send() {
+ send_read_map();
+}
+
+bool SnapshotRollbackRequest::should_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+ << "r=" << r << dendl;
+ if (r < 0 && m_ret_val == 0) {
+ m_ret_val = r;
+ }
+
+ bool finished = false;
+ switch (m_state) {
+ case STATE_READ_MAP:
+ if (r < 0) {
+ // invalidate the snapshot object map
+ send_invalidate_map();
+ } else {
+ send_write_map();
+ }
+ break;
+ case STATE_INVALIDATE_MAP:
+ // invalidate the HEAD object map as well
+ finished = Request::should_complete(m_ret_val);
+ break;
+ case STATE_WRITE_MAP:
+ finished = Request::should_complete(r);
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ return finished;
+}
+
+void SnapshotRollbackRequest::send_read_map() {
+ std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+ << dendl;
+ m_state = STATE_READ_MAP;
+
+ librados::ObjectReadOperation op;
+ op.read(0, 0, NULL, NULL);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op,
+ &m_read_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+void SnapshotRollbackRequest::send_write_map() {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+
+ CephContext *cct = m_image_ctx.cct;
+ std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id,
+ CEPH_NOSNAP));
+ ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+ << dendl;
+ m_state = STATE_WRITE_MAP;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "");
+ op.write_full(m_read_bl);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+void SnapshotRollbackRequest::send_invalidate_map() {
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ m_state = STATE_INVALIDATE_MAP;
+
+ InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id,
+ false,
+ create_callback_context());
+ req->send();
+}
+
+} // namespace object_map
+} // namespace librbd
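
Unlike the newer requests in this directory, SnapshotRollbackRequest still uses the legacy Request pattern in which every async completion re-enters a single should_complete(r) that dispatches on an explicit state enum. An illustrative miniature of that pattern (the names are not librbd API):

    // Illustrative three-step machine in the should_complete() style: every
    // async completion funnels into one dispatcher, which either starts the
    // next step (returning false: not finished) or reports completion.
    class MiniRollback {
    public:
      enum State { STATE_READ, STATE_INVALIDATE, STATE_WRITE };

      bool should_complete(int r) {
        switch (m_state) {
        case STATE_READ:
          if (r < 0) {
            m_state = STATE_INVALIDATE;
            start_invalidate();  // queue async op; completion re-enters here
          } else {
            m_state = STATE_WRITE;
            start_write();
          }
          return false;
        case STATE_INVALIDATE:
        case STATE_WRITE:
          return true;           // terminal states
        }
        return true;
      }

    private:
      State m_state = STATE_READ;
      void start_invalidate() {}
      void start_write() {}
    };
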
diff --git a/src/librbd/object_map/SnapshotRollbackRequest.h b/src/librbd/object_map/SnapshotRollbackRequest.h
new file mode 100644
index 000000000..e26b1e0a3
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRollbackRequest.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class SnapshotRollbackRequest : public Request {
+public:
+ /**
+ * Snapshot rollback goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (error)
+ * STATE_READ_MAP * * * * > STATE_INVALIDATE_MAP
+ * | |
+ * v v
+ * STATE_WRITE_MAP -------> <finish>
+ *
+ * @endverbatim
+ *
+ * If an error occurs within the READ_MAP state, the associated snapshot's
+ * object map will be flagged as invalid. Otherwise, an error from any state
+ * will result in the HEAD object map being flagged as invalid via the base
+ * class.
+ */
+ enum State {
+ STATE_READ_MAP,
+ STATE_INVALIDATE_MAP,
+ STATE_WRITE_MAP
+ };
+
+ SnapshotRollbackRequest(ImageCtx &image_ctx, uint64_t snap_id,
+ Context *on_finish)
+ : Request(image_ctx, CEPH_NOSNAP, on_finish),
+ m_snap_id(snap_id), m_ret_val(0) {
+ ceph_assert(snap_id != CEPH_NOSNAP);
+ }
+
+ void send() override;
+
+protected:
+ bool should_complete(int r) override;
+
+private:
+ State m_state = STATE_READ_MAP;
+ uint64_t m_snap_id;
+ int m_ret_val;
+
+ bufferlist m_read_bl;
+
+ void send_read_map();
+ void send_invalidate_map();
+ void send_write_map();
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
diff --git a/src/librbd/object_map/Types.h b/src/librbd/object_map/Types.h
new file mode 100644
index 000000000..0ce91bd96
--- /dev/null
+++ b/src/librbd/object_map/Types.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_TYPES_H
+#define CEPH_LIBRBD_OBJECT_MAP_TYPES_H
+
+namespace librbd {
+namespace object_map {
+
+enum DiffState {
+ DIFF_STATE_HOLE = 0, /* unchanged hole */
+ DIFF_STATE_DATA = 1, /* unchanged data */
+ DIFF_STATE_HOLE_UPDATED = 2, /* new hole */
+ DIFF_STATE_DATA_UPDATED = 3 /* new data */
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_TYPES_H
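
These four values pack into two bits per object, which is what lets fast-diff answer "what changed between two snapshots" from object maps alone. An illustrative helper, not the librbd implementation, mapping an object's observed state to a DiffState:

    #include <cstdint>

    enum DiffState : uint8_t {
      DIFF_STATE_HOLE = 0, DIFF_STATE_DATA = 1,
      DIFF_STATE_HOLE_UPDATED = 2, DIFF_STATE_DATA_UPDATED = 3
    };

    // Given whether an object holds data at the end of the interval and
    // whether its state changed within the interval, pick the DiffState.
    DiffState classify(bool exists_now, bool updated) {
      if (updated) {
        return exists_now ? DIFF_STATE_DATA_UPDATED : DIFF_STATE_HOLE_UPDATED;
      }
      return exists_now ? DIFF_STATE_DATA : DIFF_STATE_HOLE;
    }
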
diff --git a/src/librbd/object_map/UnlockRequest.cc b/src/librbd/object_map/UnlockRequest.cc
new file mode 100644
index 000000000..0220ec900
--- /dev/null
+++ b/src/librbd/object_map/UnlockRequest.cc
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/UnlockRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::UnlockRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_callback;
+
+template <typename I>
+UnlockRequest<I>::UnlockRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void UnlockRequest<I>::send() {
+ send_unlock();
+}
+
+template <typename I>
+void UnlockRequest<I>::send_unlock() {
+ CephContext *cct = m_image_ctx.cct;
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+ ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+ librados::ObjectWriteOperation op;
+ rados::cls::lock::unlock(&op, RBD_LOCK_NAME, "");
+
+ using klass = UnlockRequest<I>;
+ librados::AioCompletion *rados_completion =
+ create_rados_callback<klass, &klass::handle_unlock>(this);
+ int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *UnlockRequest<I>::handle_unlock(int *ret_val) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+ if (*ret_val < 0 && *ret_val != -ENOENT) {
+ lderr(cct) << "failed to release object map lock: "
+ << cpp_strerror(*ret_val) << dendl;
+ }
+
+ *ret_val = 0;
+ return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::UnlockRequest<librbd::ImageCtx>;
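
The create_rados_callback<klass, &klass::handle_unlock> helper used above binds a member function as the completion callback of a librados AioCompletion. A stripped-down sketch of the underlying trampoline idea, using a bare function pointer instead of the real librados types:

    // The async layer stores an opaque pointer plus a plain function; the
    // template re-dispatches the completion into the bound member function.
    template <typename T, void (T::*MF)(int)>
    void completion_trampoline(void* arg, int rval) {
      (static_cast<T*>(arg)->*MF)(rval);  // re-enter the request with r
    }

    struct ExampleRequest {
      void handle_unlock(int r) { /* next step or finish */ }
    };

    // Usage sketch: register the instantiated trampoline with the async
    // layer, passing the request object as the opaque argument:
    //   register_callback(&completion_trampoline<ExampleRequest,
    //                                            &ExampleRequest::handle_unlock>,
    //                     request_ptr);
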
diff --git a/src/librbd/object_map/UnlockRequest.h b/src/librbd/object_map/UnlockRequest.h
new file mode 100644
index 000000000..ae1d9e934
--- /dev/null
+++ b/src/librbd/object_map/UnlockRequest.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class UnlockRequest {
+public:
+ static UnlockRequest *create(ImageCtxT &image_ctx, Context *on_finish) {
+ return new UnlockRequest(image_ctx, on_finish);
+ }
+
+ UnlockRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> ----> UNLOCK ----> <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ void send_unlock();
+ Context* handle_unlock(int *ret_val);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::UnlockRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
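
As with the other templated requests here, the header declares `extern template class ... <librbd::ImageCtx>` and the .cc file supplies the one explicit instantiation; this keeps the heavyweight ImageCtx out of most translation units while still letting unit tests instantiate the template with a mock context. A self-contained sketch of the pattern:

    // Header-style declaration: including translation units never
    // instantiate the template themselves.
    template <typename CtxT>
    class Widget {
    public:
      explicit Widget(CtxT& ctx) : m_ctx(ctx) {}
      void send();
    private:
      CtxT& m_ctx;
    };

    struct RealCtx {};
    extern template class Widget<RealCtx>;  // suppress implicit instantiation

    // .cc-style definitions: the single explicit instantiation lives here; a
    // test can separately instantiate Widget<MockCtx> against the same code.
    template <typename CtxT>
    void Widget<CtxT>::send() { /* kick off the request */ }
    template class Widget<RealCtx>;
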
diff --git a/src/librbd/object_map/UpdateRequest.cc b/src/librbd/object_map/UpdateRequest.cc
new file mode 100644
index 000000000..30a1f2121
--- /dev/null
+++ b/src/librbd/object_map/UpdateRequest.cc
@@ -0,0 +1,129 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/UpdateRequest.h"
+#include "include/rbd/object_map_types.h"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "cls/lock/cls_lock_client.h"
+#include <string>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+// keep aligned to bit_vector 4K block sizes
+const uint64_t MAX_OBJECTS_PER_UPDATE = 256 * (1 << 10);
+
+}
+
+template <typename I>
+void UpdateRequest<I>::send() {
+ update_object_map();
+}
+
+template <typename I>
+void UpdateRequest<I>::update_object_map() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ ceph_assert(ceph_mutex_is_locked(*m_object_map_lock));
+ CephContext *cct = m_image_ctx.cct;
+
+ // break very large requests into manageable batches
+ m_update_end_object_no = std::min(
+ m_end_object_no, m_update_start_object_no + MAX_OBJECTS_PER_UPDATE);
+
+ std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", oid=" << oid << ", "
+ << "[" << m_update_start_object_no << ","
+ << m_update_end_object_no << ") = "
+ << (m_current_state ?
+ stringify(static_cast<uint32_t>(*m_current_state)) : "")
+ << "->" << static_cast<uint32_t>(m_new_state)
+ << dendl;
+
+ librados::ObjectWriteOperation op;
+ if (m_snap_id == CEPH_NOSNAP) {
+ rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "");
+ }
+ cls_client::object_map_update(&op, m_update_start_object_no,
+ m_update_end_object_no, m_new_state,
+ m_current_state);
+
+ auto rados_completion = librbd::util::create_rados_callback<
+ UpdateRequest<I>, &UpdateRequest<I>::handle_update_object_map>(this);
+ std::vector<librados::snap_t> snaps;
+ int r = m_image_ctx.md_ctx.aio_operate(
+ oid, rados_completion, &op, 0, snaps,
+ (m_trace.valid() ? m_trace.get_info() : nullptr));
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void UpdateRequest<I>::handle_update_object_map(int r) {
+ ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+
+ if (r == -ENOENT && m_ignore_enoent) {
+ r = 0;
+ }
+ if (r < 0 && m_ret_val == 0) {
+ m_ret_val = r;
+ }
+
+ {
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ std::unique_lock object_map_locker{*m_object_map_lock};
+ update_in_memory_object_map();
+
+ if (m_update_end_object_no < m_end_object_no) {
+ m_update_start_object_no = m_update_end_object_no;
+ update_object_map();
+ return;
+ }
+ }
+
+ // no more batch updates to send
+ complete(m_ret_val);
+}
+
+template <typename I>
+void UpdateRequest<I>::update_in_memory_object_map() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ ceph_assert(ceph_mutex_is_locked(*m_object_map_lock));
+
+ // a rebuild may target a snapshot map other than the one loaded in
+ // memory, in which case only the on-disk state is updated
+ if (m_snap_id == m_image_ctx.snap_id) {
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ auto it = m_object_map.begin() +
+ std::min(m_update_start_object_no, m_object_map.size());
+ auto end_it = m_object_map.begin() +
+ std::min(m_update_end_object_no, m_object_map.size());
+ for (; it != end_it; ++it) {
+ auto state_ref = *it;
+ uint8_t state = state_ref;
+ if (!m_current_state || state == *m_current_state ||
+ (*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) {
+ state_ref = m_new_state;
+ }
+ }
+ }
+}
+
+template <typename I>
+void UpdateRequest<I>::finish_request() {
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::UpdateRequest<librbd::ImageCtx>;
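
The request above caps each on-disk update at MAX_OBJECTS_PER_UPDATE entries and re-arms itself from handle_update_object_map() until the whole [start, end) range is covered; at two bits per object, 256K entries per batch is a whole multiple of the bit_vector's 4K blocks. The same control flow, flattened into a synchronous loop for clarity:

    #include <algorithm>
    #include <cstdint>

    constexpr uint64_t MAX_PER_UPDATE = 256 * (1 << 10);  // 256K objects

    // Each pass covers at most MAX_PER_UPDATE objects and the next pass
    // resumes where the previous one ended, as in update_object_map().
    template <typename SendBatch>
    void update_in_batches(uint64_t start, uint64_t end, SendBatch send_batch) {
      uint64_t batch_start = start;
      while (batch_start < end) {
        uint64_t batch_end = std::min(end, batch_start + MAX_PER_UPDATE);
        send_batch(batch_start, batch_end);  // one object_map_update call
        batch_start = batch_end;
      }
    }
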
diff --git a/src/librbd/object_map/UpdateRequest.h b/src/librbd/object_map/UpdateRequest.h
new file mode 100644
index 000000000..b5a72d591
--- /dev/null
+++ b/src/librbd/object_map/UpdateRequest.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+#include "common/bit_vector.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Utils.h"
+#include <boost/optional.hpp>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class UpdateRequest : public Request {
+public:
+ static UpdateRequest *create(ImageCtx &image_ctx,
+ ceph::shared_mutex* object_map_lock,
+ ceph::BitVector<2> *object_map,
+ uint64_t snap_id, uint64_t start_object_no,
+ uint64_t end_object_no, uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace,
+ bool ignore_enoent, Context *on_finish) {
+ return new UpdateRequest(image_ctx, object_map_lock, object_map, snap_id,
+ start_object_no, end_object_no, new_state,
+ current_state, parent_trace, ignore_enoent,
+ on_finish);
+ }
+
+ UpdateRequest(ImageCtx &image_ctx, ceph::shared_mutex* object_map_lock,
+ ceph::BitVector<2> *object_map, uint64_t snap_id,
+ uint64_t start_object_no, uint64_t end_object_no,
+ uint8_t new_state,
+ const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace, bool ignore_enoent,
+ Context *on_finish)
+ : Request(image_ctx, snap_id, on_finish),
+ m_object_map_lock(object_map_lock), m_object_map(*object_map),
+ m_start_object_no(start_object_no), m_end_object_no(end_object_no),
+ m_update_start_object_no(start_object_no), m_new_state(new_state),
+ m_current_state(current_state),
+ m_trace(util::create_trace(image_ctx, "update object map", parent_trace)),
+ m_ignore_enoent(ignore_enoent)
+ {
+ m_trace.event("start");
+ }
+ virtual ~UpdateRequest() {
+ m_trace.event("finish");
+ }
+
+ void send() override;
+
+protected:
+ void finish_request() override;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |/------------------\
+ * v | (repeat in batches)
+ * UPDATE_OBJECT_MAP -----/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ceph::shared_mutex* m_object_map_lock;
+ ceph::BitVector<2> &m_object_map;
+ uint64_t m_start_object_no;
+ uint64_t m_end_object_no;
+ uint64_t m_update_start_object_no;
+ uint64_t m_update_end_object_no = 0;
+ uint8_t m_new_state;
+ boost::optional<uint8_t> m_current_state;
+ ZTracer::Trace m_trace;
+ bool m_ignore_enoent;
+
+ int m_ret_val = 0;
+
+ void update_object_map();
+ void handle_update_object_map(int r);
+
+ void update_in_memory_object_map();
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::UpdateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
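
The boost::optional current_state argument acts as a compare-and-set filter: update_in_memory_object_map() rewrites an entry only when no expected state was supplied, the entry matches it, or the caller expects OBJECT_EXISTS and the entry holds the equivalent OBJECT_EXISTS_CLEAN. A sketch of that predicate with std::optional and placeholder state values:

    #include <cstdint>
    #include <optional>

    constexpr uint8_t OBJECT_EXISTS       = 1;  // placeholder values
    constexpr uint8_t OBJECT_EXISTS_CLEAN = 3;

    // Mirrors the filter in update_in_memory_object_map().
    bool should_rewrite(std::optional<uint8_t> current_state, uint8_t state) {
      return !current_state || state == *current_state ||
             (*current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN);
    }
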
diff --git a/src/librbd/operation/DisableFeaturesRequest.cc b/src/librbd/operation/DisableFeaturesRequest.cc
new file mode 100644
index 000000000..32db4b518
--- /dev/null
+++ b/src/librbd/operation/DisableFeaturesRequest.cc
@@ -0,0 +1,655 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/DisableFeaturesRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/image/SetFlagsRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/journal/RemoveRequest.h"
+#include "librbd/journal/TypeTraits.h"
+#include "librbd/mirror/DisableRequest.h"
+#include "librbd/object_map/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DisableFeaturesRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+DisableFeaturesRequest<I>::DisableFeaturesRequest(I &image_ctx,
+ Context *on_finish,
+ uint64_t journal_op_tid,
+ uint64_t features,
+ bool force)
+ : Request<I>(image_ctx, on_finish, journal_op_tid), m_features(features),
+ m_force(force) {
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_op() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ ldout(cct, 20) << this << " " << __func__ << ": features=" << m_features
+ << dendl;
+
+ send_prepare_lock();
+}
+
+template <typename I>
+bool DisableFeaturesRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_prepare_lock() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ image_ctx.state->prepare_lock(create_async_context_callback(
+ image_ctx, create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_prepare_lock>(this)));
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_prepare_lock(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_block_writes();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_block_writes() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ std::unique_lock locker{image_ctx.owner_lock};
+ image_ctx.io_image_dispatcher->block_writes(create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_block_writes>(this));
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_block_writes(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+ return handle_finish(*result);
+ }
+ m_writes_blocked = true;
+
+ {
+ std::unique_lock locker{image_ctx.owner_lock};
+ // avoid accepting new requests from peers while we manipulate
+ // the image features
+ if (image_ctx.exclusive_lock != nullptr &&
+ (image_ctx.journal == nullptr ||
+ !image_ctx.journal->is_journal_replaying())) {
+ image_ctx.exclusive_lock->block_requests(0);
+ m_requests_blocked = true;
+ }
+ }
+
+ return send_acquire_exclusive_lock(result);
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::send_acquire_exclusive_lock(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ {
+ std::unique_lock locker{image_ctx.owner_lock};
+ // if the exclusive-lock feature is supported, acquire the lock so that
+ // IO against the image is temporarily blocked while features change
+ if (image_ctx.exclusive_lock != nullptr &&
+ !image_ctx.exclusive_lock->is_lock_owner()) {
+ m_acquired_lock = true;
+
+ Context *ctx = create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_acquire_exclusive_lock>(
+ this, image_ctx.exclusive_lock);
+ image_ctx.exclusive_lock->acquire_lock(ctx);
+ return nullptr;
+ }
+ }
+
+ return handle_acquire_exclusive_lock(result);
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_acquire_exclusive_lock(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ image_ctx.owner_lock.lock_shared();
+ if (*result < 0) {
+ lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl;
+ image_ctx.owner_lock.unlock_shared();
+ return handle_finish(*result);
+ } else if (image_ctx.exclusive_lock != nullptr &&
+ !image_ctx.exclusive_lock->is_lock_owner()) {
+ lderr(cct) << "failed to acquire exclusive lock" << dendl;
+ *result = image_ctx.exclusive_lock->get_unlocked_op_error();
+ image_ctx.owner_lock.unlock_shared();
+ return handle_finish(*result);
+ }
+
+ do {
+ m_features &= image_ctx.features;
+
+ // interlock object-map and fast-diff together
+ if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) ||
+ ((m_features & RBD_FEATURE_FAST_DIFF) != 0)) {
+ m_features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF);
+ }
+
+ m_new_features = image_ctx.features & ~m_features;
+ m_features_mask = m_features;
+
+ if ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) {
+ if ((m_new_features & RBD_FEATURE_OBJECT_MAP) != 0 ||
+ (m_new_features & RBD_FEATURE_JOURNALING) != 0) {
+ lderr(cct) << "cannot disable exclusive-lock. object-map "
+ "or journaling must be disabled before "
+ "disabling exclusive-lock." << dendl;
+ *result = -EINVAL;
+ break;
+ }
+ m_features_mask |= (RBD_FEATURE_OBJECT_MAP |
+ RBD_FEATURE_FAST_DIFF |
+ RBD_FEATURE_JOURNALING);
+ }
+ if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) {
+ m_disable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
+ }
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) != 0) {
+ m_disable_flags |= RBD_FLAG_OBJECT_MAP_INVALID;
+ }
+ } while (false);
+ image_ctx.owner_lock.unlock_shared();
+
+ if (*result < 0) {
+ return handle_finish(*result);
+ }
+
+ send_get_mirror_mode();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_get_mirror_mode() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if ((m_features & RBD_FEATURE_JOURNALING) == 0) {
+ send_append_op_event();
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_mode_get_start(&op);
+
+ using klass = DisableFeaturesRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_get_mirror_mode>(this);
+ m_out_bl.clear();
+ int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_get_mirror_mode(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode);
+ }
+
+ if (*result < 0 && *result != -ENOENT) {
+ lderr(cct) << "failed to retrieve pool mirror mode: "
+ << cpp_strerror(*result) << dendl;
+ return handle_finish(*result);
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": m_mirror_mode="
+ << m_mirror_mode << dendl;
+
+ send_get_mirror_image();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_get_mirror_image() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (m_mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+ send_disable_mirror_image();
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_image_get_start(&op, image_ctx.id);
+
+ using klass = DisableFeaturesRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_get_mirror_image>(this);
+ m_out_bl.clear();
+ int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_get_mirror_image(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ cls::rbd::MirrorImage mirror_image;
+
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::mirror_image_get_finish(&it, &mirror_image);
+ }
+
+ if (*result < 0 && *result != -ENOENT) {
+ lderr(cct) << "failed to retrieve pool mirror image: "
+ << cpp_strerror(*result) << dendl;
+ return handle_finish(*result);
+ }
+
+ if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED &&
+ mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL && !m_force) {
+ lderr(cct) << "cannot disable journaling: journal-based mirroring "
+ << "enabled and mirror pool mode set to image"
+ << dendl;
+ *result = -EINVAL;
+ return handle_finish(*result);
+ }
+
+ if (mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) {
+ send_close_journal();
+ } else {
+ send_disable_mirror_image();
+ }
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_disable_mirror_image() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_disable_mirror_image>(this);
+
+ mirror::DisableRequest<I> *req =
+ mirror::DisableRequest<I>::create(&image_ctx, m_force, true, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_disable_mirror_image(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to disable image mirroring: " << cpp_strerror(*result)
+ << dendl;
+ // not fatal
+ }
+
+ send_close_journal();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_close_journal() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ {
+ std::unique_lock locker{image_ctx.owner_lock};
+ if (image_ctx.journal != nullptr) {
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ std::swap(m_journal, image_ctx.journal);
+ Context *ctx = create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_close_journal>(this);
+
+ m_journal->close(ctx);
+ return;
+ }
+ }
+
+ send_remove_journal();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_close_journal(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to close image journal: " << cpp_strerror(*result)
+ << dendl;
+ }
+
+ ceph_assert(m_journal != nullptr);
+ m_journal->put();
+ m_journal = nullptr;
+
+ send_remove_journal();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_remove_journal() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_remove_journal>(this);
+
+ typename journal::TypeTraits<I>::ContextWQ* context_wq;
+ Journal<I>::get_work_queue(cct, &context_wq);
+
+ journal::RemoveRequest<I> *req = journal::RemoveRequest<I>::create(
+ image_ctx.md_ctx, image_ctx.id, librbd::Journal<>::IMAGE_CLIENT_ID,
+ context_wq, ctx);
+
+ req->send();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_remove_journal(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to remove image journal: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_append_op_event();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_append_op_event() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (!this->template append_op_event<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_append_op_event>(this)) {
+ send_remove_object_map();
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_append_op_event(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_remove_object_map();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_remove_object_map() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) {
+ send_set_features();
+ return;
+ }
+
+ Context *ctx = create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_remove_object_map>(this);
+
+ object_map::RemoveRequest<I> *req =
+ object_map::RemoveRequest<I>::create(&image_ctx, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_remove_object_map(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0 && *result != -ENOENT) {
+ lderr(cct) << "failed to remove object map: " << cpp_strerror(*result) << dendl;
+ return handle_finish(*result);
+ }
+
+ send_set_features();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_set_features() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": new_features="
+ << m_new_features << ", features_mask=" << m_features_mask
+ << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::set_features(&op, m_new_features, m_features_mask);
+
+ using klass = DisableFeaturesRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_set_features>(this);
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_set_features(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == -EINVAL && (m_features_mask & RBD_FEATURE_JOURNALING) != 0) {
+ // NOTE: infernalis OSDs will not accept a mask with new features, so
+ // re-attempt with a reduced mask.
+ ldout(cct, 5) << this << " " << __func__
+ << ": re-attempt with a reduced mask" << dendl;
+ m_features_mask &= ~RBD_FEATURE_JOURNALING;
+ send_set_features();
+ // wait for the retried operation; falling through would complete the
+ // request while the new op is still in flight
+ return nullptr;
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to update features: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_update_flags();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_update_flags() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (m_disable_flags == 0) {
+ send_notify_update();
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": disable_flags="
+ << m_disable_flags << dendl;
+
+ Context *ctx = create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_update_flags>(this);
+
+ image::SetFlagsRequest<I> *req =
+ image::SetFlagsRequest<I>::create(&image_ctx, 0, m_disable_flags, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_update_flags(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to update image flags: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_notify_update();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_notify_update() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_notify_update>(this);
+
+ image_ctx.notify_update(ctx);
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_notify_update(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (image_ctx.exclusive_lock == nullptr || !m_acquired_lock) {
+ return handle_finish(*result);
+ }
+
+ send_release_exclusive_lock();
+ return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_release_exclusive_lock() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ DisableFeaturesRequest<I>,
+ &DisableFeaturesRequest<I>::handle_release_exclusive_lock>(
+ this, image_ctx.exclusive_lock);
+
+ image_ctx.exclusive_lock->release_lock(ctx);
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_release_exclusive_lock(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ return handle_finish(*result);
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_finish(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ {
+ std::unique_lock locker{image_ctx.owner_lock};
+ if (image_ctx.exclusive_lock != nullptr && m_requests_blocked) {
+ image_ctx.exclusive_lock->unblock_requests();
+ }
+
+ image_ctx.io_image_dispatcher->unblock_writes();
+ }
+ image_ctx.state->handle_prepare_lock_complete();
+
+ return this->create_context_finisher(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::DisableFeaturesRequest<librbd::ImageCtx>;
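
The feature bookkeeping in handle_acquire_exclusive_lock() is easy to lose in the callback chain: object-map and fast-diff are always disabled as a pair, and exclusive-lock refuses to go away while object-map or journaling would survive it. A condensed, synchronous sketch of that planning step, with placeholder feature bits and the flag bookkeeping omitted:

    #include <cstdint>

    // Placeholder feature bits; the real values come from the librbd headers.
    constexpr uint64_t F_EXCLUSIVE_LOCK = 1 << 2;
    constexpr uint64_t F_OBJECT_MAP     = 1 << 3;
    constexpr uint64_t F_FAST_DIFF      = 1 << 4;
    constexpr uint64_t F_JOURNALING     = 1 << 6;

    struct DisablePlan {
      uint64_t new_features;
      uint64_t features_mask;
      bool ok;
    };

    DisablePlan plan_disable(uint64_t current, uint64_t to_disable) {
      to_disable &= current;
      // interlock object-map and fast-diff together
      if (to_disable & (F_OBJECT_MAP | F_FAST_DIFF)) {
        to_disable |= (F_OBJECT_MAP | F_FAST_DIFF);
      }
      uint64_t new_features = current & ~to_disable;
      uint64_t mask = to_disable;
      if (to_disable & F_EXCLUSIVE_LOCK) {
        if (new_features & (F_OBJECT_MAP | F_JOURNALING)) {
          return {current, 0, false};  // maps to -EINVAL in the request
        }
        mask |= (F_OBJECT_MAP | F_FAST_DIFF | F_JOURNALING);
      }
      return {new_features, mask, true};
    }
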
diff --git a/src/librbd/operation/DisableFeaturesRequest.h b/src/librbd/operation/DisableFeaturesRequest.h
new file mode 100644
index 000000000..719a03399
--- /dev/null
+++ b/src/librbd/operation/DisableFeaturesRequest.h
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/operation/Request.h"
+#include "cls/rbd/cls_rbd_client.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class DisableFeaturesRequest : public Request<ImageCtxT> {
+public:
+ static DisableFeaturesRequest *create(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid,
+ uint64_t features, bool force) {
+ return new DisableFeaturesRequest(image_ctx, on_finish, journal_op_tid,
+ features, force);
+ }
+
+ DisableFeaturesRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid, uint64_t features, bool force);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ bool can_affect_io() const override {
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::UpdateFeaturesEvent(op_tid, m_features, false);
+ }
+
+private:
+ /**
+ * DisableFeatures goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_PREPARE_LOCK
+ * |
+ * v
+ * STATE_BLOCK_WRITES
+ * |
+ * v
+ * STATE_ACQUIRE_EXCLUSIVE_LOCK (skip if not
+ * | required)
+ *        | (disabling journaling)
+ * \-------------------\
+ * | |
+ * | V
+ * | STATE_GET_MIRROR_MODE
+ * |(not |
+ * | disabling v
+ * | journaling) STATE_GET_MIRROR_IMAGE
+ * | |
+ * | v
+ * | STATE_DISABLE_MIRROR_IMAGE (skip if not
+ * | | required)
+ * | v
+ * | STATE_CLOSE_JOURNAL
+ * | |
+ * | v
+ * | STATE_REMOVE_JOURNAL
+ * | |
+ * |/-------------------/
+ * |
+ * v
+ * STATE_APPEND_OP_EVENT (skip if journaling
+ * | disabled)
+ * v
+ * STATE_REMOVE_OBJECT_MAP (skip if not
+ * | disabling object map)
+ * v
+ * STATE_SET_FEATURES
+ * |
+ * v
+ * STATE_UPDATE_FLAGS
+ * |
+ * v
+ * STATE_NOTIFY_UPDATE
+ * |
+ * v
+ *    STATE_RELEASE_EXCLUSIVE_LOCK (skip if not
+ * | required)
+ * | (unblock writes)
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+
+ uint64_t m_features;
+ bool m_force;
+
+ bool m_acquired_lock = false;
+ bool m_writes_blocked = false;
+ bool m_image_lock_acquired = false;
+ bool m_requests_blocked = false;
+
+ uint64_t m_new_features = 0;
+ uint64_t m_disable_flags = 0;
+ uint64_t m_features_mask = 0;
+
+ decltype(ImageCtxT::journal) m_journal = nullptr;
+ cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ bufferlist m_out_bl;
+
+ void send_prepare_lock();
+ Context *handle_prepare_lock(int *result);
+
+ void send_block_writes();
+ Context *handle_block_writes(int *result);
+
+ Context *send_acquire_exclusive_lock(int *result);
+ Context *handle_acquire_exclusive_lock(int *result);
+
+ void send_get_mirror_mode();
+ Context *handle_get_mirror_mode(int *result);
+
+ void send_get_mirror_image();
+ Context *handle_get_mirror_image(int *result);
+
+ void send_disable_mirror_image();
+ Context *handle_disable_mirror_image(int *result);
+
+ void send_close_journal();
+ Context *handle_close_journal(int *result);
+
+ void send_remove_journal();
+ Context *handle_remove_journal(int *result);
+
+ void send_append_op_event();
+ Context *handle_append_op_event(int *result);
+
+ void send_remove_object_map();
+ Context *handle_remove_object_map(int *result);
+
+ void send_set_features();
+ Context *handle_set_features(int *result);
+
+ void send_update_flags();
+ Context *handle_update_flags(int *result);
+
+ void send_notify_update();
+ Context *handle_notify_update(int *result);
+
+ void send_release_exclusive_lock();
+ Context *handle_release_exclusive_lock(int *result);
+
+ Context *handle_finish(int r);
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::DisableFeaturesRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H
diff --git a/src/librbd/operation/EnableFeaturesRequest.cc b/src/librbd/operation/EnableFeaturesRequest.cc
new file mode 100644
index 000000000..8e3dad94b
--- /dev/null
+++ b/src/librbd/operation/EnableFeaturesRequest.cc
@@ -0,0 +1,494 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/EnableFeaturesRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/image/SetFlagsRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/journal/CreateRequest.h"
+#include "librbd/journal/TypeTraits.h"
+#include "librbd/mirror/EnableRequest.h"
+#include "librbd/object_map/CreateRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::EnableFeaturesRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+EnableFeaturesRequest<I>::EnableFeaturesRequest(I &image_ctx,
+ Context *on_finish,
+ uint64_t journal_op_tid,
+ uint64_t features)
+ : Request<I>(image_ctx, on_finish, journal_op_tid), m_features(features) {
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_op() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ ldout(cct, 20) << this << " " << __func__ << ": features=" << m_features
+ << dendl;
+ send_prepare_lock();
+}
+
+template <typename I>
+bool EnableFeaturesRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_prepare_lock() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ image_ctx.state->prepare_lock(create_async_context_callback(
+ image_ctx, create_context_callback<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_prepare_lock>(this)));
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_prepare_lock(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_block_writes();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_block_writes() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ std::unique_lock locker{image_ctx.owner_lock};
+ image_ctx.io_image_dispatcher->block_writes(create_context_callback<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_block_writes>(this));
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_block_writes(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+ return handle_finish(*result);
+ }
+ m_writes_blocked = true;
+
+ send_get_mirror_mode();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_get_mirror_mode() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if ((m_features & RBD_FEATURE_JOURNALING) == 0) {
+ Context *ctx = create_context_callback<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_get_mirror_mode>(this);
+ ctx->complete(-ENOENT);
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::mirror_mode_get_start(&op);
+
+ using klass = EnableFeaturesRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_get_mirror_mode>(this);
+ m_out_bl.clear();
+ int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_get_mirror_mode(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+ if (*result == 0) {
+ auto it = m_out_bl.cbegin();
+ *result = cls_client::mirror_mode_get_finish(&it, &mirror_mode);
+ } else if (*result == -ENOENT) {
+ *result = 0;
+ }
+
+ if (*result < 0) {
+ lderr(cct) << "failed to retrieve pool mirror mode: "
+ << cpp_strerror(*result) << dendl;
+ return handle_finish(*result);
+ }
+
+ m_enable_mirroring = (mirror_mode == cls::rbd::MIRROR_MODE_POOL);
+
+ bool create_journal = false;
+ do {
+ std::unique_lock locker{image_ctx.owner_lock};
+
+ // avoid accepting new requests from peers while we manipulate
+ // the image features
+ if (image_ctx.exclusive_lock != nullptr &&
+ (image_ctx.journal == nullptr ||
+ !image_ctx.journal->is_journal_replaying())) {
+ image_ctx.exclusive_lock->block_requests(0);
+ m_requests_blocked = true;
+ }
+
+ m_features &= ~image_ctx.features;
+
+ // interlock object-map and fast-diff together
+ if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) ||
+ ((m_features & RBD_FEATURE_FAST_DIFF) != 0)) {
+ m_features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF);
+ }
+
+ m_new_features = image_ctx.features | m_features;
+ m_features_mask = m_features;
+
+ if ((m_features & RBD_FEATURE_OBJECT_MAP) != 0) {
+ if ((m_new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+ lderr(cct) << "cannot enable object-map. exclusive-lock must be "
+ "enabled before enabling object-map." << dendl;
+ *result = -EINVAL;
+ break;
+ }
+ m_enable_flags |= RBD_FLAG_OBJECT_MAP_INVALID;
+ m_features_mask |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_FAST_DIFF);
+ }
+ if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) {
+ m_enable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
+ m_features_mask |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_OBJECT_MAP);
+ }
+
+ if ((m_features & RBD_FEATURE_JOURNALING) != 0) {
+ if ((m_new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+ lderr(cct) << "cannot enable journaling. exclusive-lock must be "
+ "enabled before enabling journaling." << dendl;
+ *result = -EINVAL;
+ break;
+ }
+ m_features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
+ create_journal = true;
+ }
+ } while (false);
+
+ if (*result < 0) {
+ return handle_finish(*result);
+ }
+ if (create_journal) {
+ send_create_journal();
+ return nullptr;
+ }
+ send_append_op_event();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_create_journal() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ journal::TagData tag_data(librbd::Journal<>::LOCAL_MIRROR_UUID);
+ Context *ctx = create_context_callback<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_create_journal>(this);
+
+ typename journal::TypeTraits<I>::ContextWQ* context_wq;
+ Journal<I>::get_work_queue(cct, &context_wq);
+
+ journal::CreateRequest<I> *req = journal::CreateRequest<I>::create(
+ image_ctx.md_ctx, image_ctx.id,
+ image_ctx.config.template get_val<uint64_t>("rbd_journal_order"),
+ image_ctx.config.template get_val<uint64_t>("rbd_journal_splay_width"),
+ image_ctx.config.template get_val<std::string>("rbd_journal_pool"),
+ cls::journal::Tag::TAG_CLASS_NEW, tag_data,
+ librbd::Journal<>::IMAGE_CLIENT_ID, context_wq, ctx);
+
+ req->send();
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_create_journal(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to create journal: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_append_op_event();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_append_op_event() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (!this->template append_op_event<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_append_op_event>(this)) {
+ send_update_flags();
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_append_op_event(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_update_flags();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_update_flags() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (m_enable_flags == 0) {
+ send_set_features();
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << ": enable_flags="
+ << m_enable_flags << dendl;
+
+ Context *ctx = create_context_callback<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_update_flags>(this);
+
+ image::SetFlagsRequest<I> *req =
+ image::SetFlagsRequest<I>::create(&image_ctx, m_enable_flags,
+ m_enable_flags, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_update_flags(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to update image flags: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_set_features();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_set_features() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": new_features="
+ << m_new_features << ", features_mask=" << m_features_mask
+ << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::set_features(&op, m_new_features, m_features_mask);
+
+ using klass = EnableFeaturesRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_set_features>(this);
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_set_features(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to update features: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_create_object_map();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_create_object_map() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (((image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0) ||
+ ((m_features & RBD_FEATURE_OBJECT_MAP) == 0)) {
+ send_enable_mirror_image();
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_create_object_map>(this);
+
+ object_map::CreateRequest<I> *req =
+ object_map::CreateRequest<I>::create(&image_ctx, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_create_object_map(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to create object map: " << cpp_strerror(*result)
+ << dendl;
+ return handle_finish(*result);
+ }
+
+ send_enable_mirror_image();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_enable_mirror_image() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (!m_enable_mirroring) {
+ send_notify_update();
+ return;
+ }
+
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_enable_mirror_image>(this);
+
+ auto req = mirror::EnableRequest<I>::create(
+ &image_ctx, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", false, ctx);
+ req->send();
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_enable_mirror_image(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to enable mirroring: " << cpp_strerror(*result)
+ << dendl;
+ // not fatal
+ }
+
+ send_notify_update();
+ return nullptr;
+}
+
+template <typename I>
+void EnableFeaturesRequest<I>::send_notify_update() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ EnableFeaturesRequest<I>,
+ &EnableFeaturesRequest<I>::handle_notify_update>(this);
+
+ image_ctx.notify_update(ctx);
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_notify_update(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ return handle_finish(*result);
+}
+
+template <typename I>
+Context *EnableFeaturesRequest<I>::handle_finish(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ {
+ std::unique_lock locker{image_ctx.owner_lock};
+
+ if (image_ctx.exclusive_lock != nullptr && m_requests_blocked) {
+ image_ctx.exclusive_lock->unblock_requests();
+ }
+ if (m_writes_blocked) {
+ image_ctx.io_image_dispatcher->unblock_writes();
+ }
+ }
+ image_ctx.state->handle_prepare_lock_complete();
+
+ return this->create_context_finisher(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::EnableFeaturesRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/EnableFeaturesRequest.h b/src/librbd/operation/EnableFeaturesRequest.h
new file mode 100644
index 000000000..1c91b4dc7
--- /dev/null
+++ b/src/librbd/operation/EnableFeaturesRequest.h
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H
+
+#include "librbd/operation/Request.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class EnableFeaturesRequest : public Request<ImageCtxT> {
+public:
+ static EnableFeaturesRequest *create(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid,
+ uint64_t features) {
+ return new EnableFeaturesRequest(image_ctx, on_finish, journal_op_tid,
+ features);
+ }
+
+ EnableFeaturesRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid, uint64_t features);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ bool can_affect_io() const override {
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::UpdateFeaturesEvent(op_tid, m_features, true);
+ }
+
+private:
+ /**
+ * EnableFeatures goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_PREPARE_LOCK
+ * |
+ * v
+ * STATE_BLOCK_WRITES
+ * |
+ * v
+ * STATE_GET_MIRROR_MODE
+ * |
+ * v
+ * STATE_CREATE_JOURNAL (skip if not required)
+ * |
+ * v
+ * STATE_APPEND_OP_EVENT (skip if journaling disabled)
+ * |
+ * v
+ * STATE_UPDATE_FLAGS
+ * |
+ * v
+ * STATE_SET_FEATURES
+ * |
+ * v
+ * STATE_CREATE_OBJECT_MAP (skip if not required)
+ * |
+ * v
+ * STATE_ENABLE_MIRROR_IMAGE
+ * |
+ * V
+ * STATE_NOTIFY_UPDATE
+ * |
+ * | (unblock writes)
+ * v
+ * <finish>
+ * @endverbatim
+ *
+ */
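+
+ // Errors from UPDATE_FLAGS, SET_FEATURES and CREATE_OBJECT_MAP short-circuit
+ // to handle_finish(), which unblocks requests and writes before completing;
+ // ENABLE_MIRROR_IMAGE errors are logged but treated as non-fatal.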
+
+ uint64_t m_features;
+
+ bool m_enable_mirroring = false;
+ bool m_requests_blocked = false;
+ bool m_writes_blocked = false;
+
+ uint64_t m_new_features = 0;
+ uint64_t m_enable_flags = 0;
+ uint64_t m_features_mask = 0;
+
+ bufferlist m_out_bl;
+
+ void send_prepare_lock();
+ Context *handle_prepare_lock(int *result);
+
+ void send_block_writes();
+ Context *handle_block_writes(int *result);
+
+ void send_get_mirror_mode();
+ Context *handle_get_mirror_mode(int *result);
+
+ void send_create_journal();
+ Context *handle_create_journal(int *result);
+
+ void send_append_op_event();
+ Context *handle_append_op_event(int *result);
+
+ void send_update_flags();
+ Context *handle_update_flags(int *result);
+
+ void send_set_features();
+ Context *handle_set_features(int *result);
+
+ void send_create_object_map();
+ Context *handle_create_object_map(int *result);
+
+ void send_enable_mirror_image();
+ Context *handle_enable_mirror_image(int *result);
+
+ void send_notify_update();
+ Context *handle_notify_update(int *result);
+
+ Context *handle_finish(int r);
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::EnableFeaturesRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H
diff --git a/src/librbd/operation/FlattenRequest.cc b/src/librbd/operation/FlattenRequest.cc
new file mode 100644
index 000000000..764552217
--- /dev/null
+++ b/src/librbd/operation/FlattenRequest.cc
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/FlattenRequest.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/image/DetachChildRequest.h"
+#include "librbd/image/DetachParentRequest.h"
+#include "librbd/Types.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/io/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::FlattenRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+class C_FlattenObject : public C_AsyncObjectThrottle<I> {
+public:
+ C_FlattenObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ IOContext io_context, uint64_t object_no)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_io_context(io_context),
+ m_object_no(object_no) {
+ }
+
+ int send() override {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ CephContext *cct = image_ctx.cct;
+
+ if (image_ctx.exclusive_lock != nullptr &&
+ !image_ctx.exclusive_lock->is_lock_owner()) {
+ ldout(cct, 1) << "lost exclusive lock during flatten" << dendl;
+ return -ERESTART;
+ }
+
+ {
+ std::shared_lock image_lock{image_ctx.image_lock};
+ if (image_ctx.object_map != nullptr &&
+ !image_ctx.object_map->object_may_not_exist(m_object_no)) {
+ // can skip because the object already exists
+ return 1;
+ }
+ }
+
+ if (!io::util::trigger_copyup(
+ &image_ctx, m_object_no, m_io_context, this)) {
+ // stop early if the parent went away - it just means
+ // another flatten finished first or the image was resized
+ return 1;
+ }
+
+ return 0;
+ }
+
+private:
+ IOContext m_io_context;
+ uint64_t m_object_no;
+};
+
+template <typename I>
+bool FlattenRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+template <typename I>
+void FlattenRequest<I>::send_op() {
+ flatten_objects();
+}
+
+template <typename I>
+void FlattenRequest<I>::flatten_objects() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = create_context_callback<
+ FlattenRequest<I>,
+ &FlattenRequest<I>::handle_flatten_objects>(this);
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_FlattenObject<I> >(),
+ boost::lambda::_1, &image_ctx, image_ctx.get_data_io_context(),
+ boost::lambda::_2));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, m_overlap_objects);
+ throttle->start_ops(
+ image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+void FlattenRequest<I>::handle_flatten_objects(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == -ERESTART) {
+ ldout(cct, 5) << "flatten operation interrupted" << dendl;
+ this->complete(r);
+ return;
+ } else if (r < 0) {
+ lderr(cct) << "flatten encountered an error: " << cpp_strerror(r) << dendl;
+ this->complete(r);
+ return;
+ }
+
+ detach_child();
+}
+
+template <typename I>
+void FlattenRequest<I>::detach_child() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ // should have been canceled prior to releasing lock
+ image_ctx.owner_lock.lock_shared();
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ // if there are no snaps, remove from the children object as well
+ // (if snapshots remain, they have their own parent info, and the child
+ // will be removed when the last snap goes away)
+ image_ctx.image_lock.lock_shared();
+ if ((image_ctx.features & RBD_FEATURE_DEEP_FLATTEN) == 0 &&
+ !image_ctx.snaps.empty()) {
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+ detach_parent();
+ return;
+ }
+ image_ctx.image_lock.unlock_shared();
+
+ ldout(cct, 5) << dendl;
+ auto ctx = create_context_callback<
+ FlattenRequest<I>,
+ &FlattenRequest<I>::handle_detach_child>(this);
+ auto req = image::DetachChildRequest<I>::create(image_ctx, ctx);
+ req->send();
+ image_ctx.owner_lock.unlock_shared();
+}
+
+template <typename I>
+void FlattenRequest<I>::handle_detach_child(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "detach encountered an error: " << cpp_strerror(r) << dendl;
+ this->complete(r);
+ return;
+ }
+
+ detach_parent();
+}
+
+template <typename I>
+void FlattenRequest<I>::detach_parent() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ // should have been canceled prior to releasing lock
+ image_ctx.owner_lock.lock_shared();
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ // stop early if the parent went away - it just means
+ // another flatten finished first, so this one is useless.
+ image_ctx.image_lock.lock_shared();
+ if (!image_ctx.parent) {
+ ldout(cct, 5) << "image already flattened" << dendl;
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+ this->complete(0);
+ return;
+ }
+ image_ctx.image_lock.unlock_shared();
+
+ // remove parent from this (base) image
+ auto ctx = create_context_callback<
+ FlattenRequest<I>,
+ &FlattenRequest<I>::handle_detach_parent>(this);
+ auto req = image::DetachParentRequest<I>::create(image_ctx, ctx);
+ req->send();
+ image_ctx.owner_lock.unlock_shared();
+}
+
+template <typename I>
+void FlattenRequest<I>::handle_detach_parent(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "remove parent encountered an error: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ this->complete(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::FlattenRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/FlattenRequest.h b/src/librbd/operation/FlattenRequest.h
new file mode 100644
index 000000000..cdbb4c9e7
--- /dev/null
+++ b/src/librbd/operation/FlattenRequest.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
+
+#include "librbd/operation/Request.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class FlattenRequest : public Request<ImageCtxT>
+{
+public:
+ FlattenRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t overlap_objects, ProgressContext &prog_ctx)
+ : Request<ImageCtxT>(image_ctx, on_finish),
+ m_overlap_objects(overlap_objects), m_prog_ctx(prog_ctx) {
+ }
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::FlattenEvent(op_tid);
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * FLATTEN_OBJECTS
+ * |
+ * v
+ * DETACH_CHILD
+ * |
+ * v
+ * DETACH_PARENT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
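+
+ // FLATTEN_OBJECTS drives per-object copyups through an AsyncObjectThrottle
+ // bounded by rbd_concurrent_management_ops. DETACH_CHILD is skipped when
+ // snapshots remain and the deep-flatten feature is disabled.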
+
+ uint64_t m_overlap_objects;
+ ProgressContext &m_prog_ctx;
+
+ void flatten_objects();
+ void handle_flatten_objects(int r);
+
+ void detach_child();
+ void handle_detach_child(int r);
+
+ void detach_parent();
+ void handle_detach_parent(int r);
+
+};
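+
+// Illustrative caller sketch (hypothetical; the names and call site below are
+// assumptions, not part of this change):
+//
+//   librbd::NoOpProgressContext prog_ctx;
+//   auto *req = new librbd::operation::FlattenRequest<>(
+//       *image_ctx, on_finish, overlap_objects, prog_ctx);
+//   std::shared_lock owner_locker{image_ctx->owner_lock};
+//   req->send();  // Request<I>::send() asserts owner_lock is held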
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::FlattenRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
diff --git a/src/librbd/operation/MetadataRemoveRequest.cc b/src/librbd/operation/MetadataRemoveRequest.cc
new file mode 100644
index 000000000..c5d6141ad
--- /dev/null
+++ b/src/librbd/operation/MetadataRemoveRequest.cc
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/MetadataRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MetadataRemoveRequest: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+MetadataRemoveRequest<I>::MetadataRemoveRequest(I &image_ctx,
+ Context *on_finish,
+ const std::string &key)
+ : Request<I>(image_ctx, on_finish), m_key(key) {
+}
+
+template <typename I>
+void MetadataRemoveRequest<I>::send_op() {
+ send_metadata_remove();
+}
+
+template <typename I>
+bool MetadataRemoveRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+template <typename I>
+void MetadataRemoveRequest<I>::send_metadata_remove() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::metadata_remove(&op, m_key);
+
+ librados::AioCompletion *comp = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::MetadataRemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/MetadataRemoveRequest.h b/src/librbd/operation/MetadataRemoveRequest.h
new file mode 100644
index 000000000..1d7f2a46a
--- /dev/null
+++ b/src/librbd/operation/MetadataRemoveRequest.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class MetadataRemoveRequest : public Request<ImageCtxT> {
+public:
+ MetadataRemoveRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const std::string &key);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::MetadataRemoveEvent(op_tid, m_key);
+ }
+
+private:
+ std::string m_key;
+
+ void send_metadata_remove();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::MetadataRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
diff --git a/src/librbd/operation/MetadataSetRequest.cc b/src/librbd/operation/MetadataSetRequest.cc
new file mode 100644
index 000000000..5fb939352
--- /dev/null
+++ b/src/librbd/operation/MetadataSetRequest.cc
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/MetadataSetRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MetadataSetRequest: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+MetadataSetRequest<I>::MetadataSetRequest(I &image_ctx,
+ Context *on_finish,
+ const std::string &key,
+ const std::string &value)
+ : Request<I>(image_ctx, on_finish), m_key(key), m_value(value) {
+}
+
+template <typename I>
+void MetadataSetRequest<I>::send_op() {
+ send_metadata_set();
+}
+
+template <typename I>
+bool MetadataSetRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+template <typename I>
+void MetadataSetRequest<I>::send_metadata_set() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << this << " " << __func__ << dendl;
+
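+ // stage the value as a bufferlist under its key; metadata_set takes a key->bufferlist map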
+ m_data[m_key].append(m_value);
+ librados::ObjectWriteOperation op;
+ cls_client::metadata_set(&op, m_data);
+
+ librados::AioCompletion *comp = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::MetadataSetRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/MetadataSetRequest.h b/src/librbd/operation/MetadataSetRequest.h
new file mode 100644
index 000000000..5f8daa2f1
--- /dev/null
+++ b/src/librbd/operation/MetadataSetRequest.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "include/buffer.h"
+#include <string>
+#include <map>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class MetadataSetRequest : public Request<ImageCtxT> {
+public:
+ MetadataSetRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const std::string &key, const std::string &value);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::MetadataSetEvent(op_tid, m_key, m_value);
+ }
+
+private:
+ std::string m_key;
+ std::string m_value;
+ std::map<std::string, bufferlist> m_data;
+
+ void send_metadata_set();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::MetadataSetRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H
diff --git a/src/librbd/operation/MigrateRequest.cc b/src/librbd/operation/MigrateRequest.cc
new file mode 100644
index 000000000..2b9adb773
--- /dev/null
+++ b/src/librbd/operation/MigrateRequest.cc
@@ -0,0 +1,238 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/MigrateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/deep_copy/ObjectCopyRequest.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectRequest.h"
+#include "osdc/Striper.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MigrateRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+using util::create_async_context_callback;
+
+namespace {
+
+template <typename I>
+class C_MigrateObject : public C_AsyncObjectThrottle<I> {
+public:
+ C_MigrateObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ IOContext io_context, uint64_t object_no)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_io_context(io_context),
+ m_object_no(object_no) {
+ }
+
+ int send() override {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ CephContext *cct = image_ctx.cct;
+
+ if (image_ctx.exclusive_lock != nullptr &&
+ !image_ctx.exclusive_lock->is_lock_owner()) {
+ ldout(cct, 1) << "lost exclusive lock during migrate" << dendl;
+ return -ERESTART;
+ }
+
+ start_async_op();
+ return 0;
+ }
+
+private:
+ IOContext m_io_context;
+ uint64_t m_object_no;
+
+ io::AsyncOperation *m_async_op = nullptr;
+
+ void start_async_op() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ ceph_assert(m_async_op == nullptr);
+ m_async_op = new io::AsyncOperation();
+ m_async_op->start_op(image_ctx);
+
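+ // proceed only if image writes are not blocked; otherwise drop this op and retry once unblocked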
+ if (!image_ctx.io_image_dispatcher->writes_blocked()) {
+ migrate_object();
+ return;
+ }
+
+ auto ctx = create_async_context_callback(
+ image_ctx, create_context_callback<
+ C_MigrateObject<I>, &C_MigrateObject<I>::handle_start_async_op>(this));
+ m_async_op->finish_op();
+ delete m_async_op;
+ m_async_op = nullptr;
+ image_ctx.io_image_dispatcher->wait_on_writes_unblocked(ctx);
+ }
+
+ void handle_start_async_op(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to start async op: " << cpp_strerror(r) << dendl;
+ this->complete(r);
+ return;
+ }
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ start_async_op();
+ }
+
+ bool is_within_overlap_bounds() {
+ I &image_ctx = this->m_image_ctx;
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ auto overlap = std::min(image_ctx.size, image_ctx.migration_info.overlap);
+ return overlap > 0 &&
+ Striper::get_num_objects(image_ctx.layout, overlap) > m_object_no;
+ }
+
+ void migrate_object() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ CephContext *cct = image_ctx.cct;
+
+ auto ctx = create_context_callback<
+ C_MigrateObject<I>, &C_MigrateObject<I>::handle_migrate_object>(this);
+
+ if (is_within_overlap_bounds()) {
+ bufferlist bl;
+ auto req = new io::ObjectWriteRequest<I>(&image_ctx, m_object_no, 0,
+ std::move(bl), m_io_context, 0,
+ 0, std::nullopt, {}, ctx);
+
+ ldout(cct, 20) << "copyup object req " << req << ", object_no "
+ << m_object_no << dendl;
+
+ req->send();
+ } else {
+ ceph_assert(image_ctx.parent != nullptr);
+
+ uint32_t flags = deep_copy::OBJECT_COPY_REQUEST_FLAG_MIGRATION;
+ if (image_ctx.migration_info.flatten) {
+ flags |= deep_copy::OBJECT_COPY_REQUEST_FLAG_FLATTEN;
+ }
+
+ auto req = deep_copy::ObjectCopyRequest<I>::create(
+ image_ctx.parent, &image_ctx, 0, 0, image_ctx.migration_info.snap_map,
+ m_object_no, flags, nullptr, ctx);
+
+ ldout(cct, 20) << "deep copy object req " << req << ", object_no "
+ << m_object_no << dendl;
+ req->send();
+ }
+ }
+
+ void handle_migrate_object(int r) {
+ CephContext *cct = this->m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+
+ m_async_op->finish_op();
+ delete m_async_op;
+ this->complete(r);
+ }
+};
+
+} // anonymous namespace
+
+template <typename I>
+void MigrateRequest<I>::send_op() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ migrate_objects();
+}
+
+template <typename I>
+bool MigrateRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+
+ return true;
+}
+
+template <typename I>
+void MigrateRequest<I>::migrate_objects() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ uint64_t overlap_objects = get_num_overlap_objects();
+
+ ldout(cct, 10) << "from 0 to " << overlap_objects << dendl;
+
+ auto ctx = create_context_callback<
+ MigrateRequest<I>, &MigrateRequest<I>::handle_migrate_objects>(this);
+
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_MigrateObject<I> >(),
+ boost::lambda::_1, &image_ctx, image_ctx.get_data_io_context(),
+ boost::lambda::_2));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, overlap_objects);
+ throttle->start_ops(
+ image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+void MigrateRequest<I>::handle_migrate_objects(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to migrate objects: " << cpp_strerror(r) << dendl;
+ }
+
+ this->complete(r);
+}
+
+template <typename I>
+uint64_t MigrateRequest<I>::get_num_overlap_objects() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ auto overlap = image_ctx.migration_info.overlap;
+
+ return overlap > 0 ?
+ Striper::get_num_objects(image_ctx.layout, overlap) : 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::MigrateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/MigrateRequest.h b/src/librbd/operation/MigrateRequest.h
new file mode 100644
index 000000000..a143b579c
--- /dev/null
+++ b/src/librbd/operation/MigrateRequest.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "librbd/Types.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class MigrateRequest : public Request<ImageCtxT>
+{
+public:
+ MigrateRequest(ImageCtxT &image_ctx, Context *on_finish,
+ ProgressContext &prog_ctx)
+ : Request<ImageCtxT>(image_ctx, on_finish), m_prog_ctx(prog_ctx) {
+ }
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ bool can_affect_io() const override {
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override {
+ ceph_abort();
+ return journal::UnknownEvent();
+ }
+
+private:
+ /**
+ * Migrate goes through the following state machine to copy objects
+ * from the parent (migrating source) image:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * MIGRATE_OBJECTS
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
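+
+ // Objects within the migration overlap are migrated by issuing an empty
+ // write that triggers a copyup; objects beyond the overlap fall back to a
+ // deep-copy from the parent (source) image.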
+
+ ProgressContext &m_prog_ctx;
+
+ void migrate_objects();
+ void handle_migrate_objects(int r);
+
+ uint64_t get_num_overlap_objects();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::MigrateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H
diff --git a/src/librbd/operation/ObjectMapIterate.cc b/src/librbd/operation/ObjectMapIterate.cc
new file mode 100644
index 000000000..50db3df85
--- /dev/null
+++ b/src/librbd/operation/ObjectMapIterate.cc
@@ -0,0 +1,308 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/ObjectMapIterate.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "librbd/Utils.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ObjectMapIterateRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>
+class C_VerifyObjectCallback : public C_AsyncObjectThrottle<I> {
+public:
+ C_VerifyObjectCallback(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ uint64_t snap_id, uint64_t object_no,
+ ObjectIterateWork<I> handle_mismatch,
+ std::atomic_flag *invalidate)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx),
+ m_snap_id(snap_id), m_object_no(object_no),
+ m_oid(image_ctx->get_object_name(m_object_no)),
+ m_handle_mismatch(handle_mismatch),
+ m_invalidate(invalidate)
+ {
+ m_io_ctx.dup(image_ctx->data_ctx);
+ m_io_ctx.snap_set_read(CEPH_SNAPDIR);
+ }
+
+ void complete(int r) override {
+ I &image_ctx = this->m_image_ctx;
+ if (should_complete(r)) {
+ ldout(image_ctx.cct, 20) << m_oid << " C_VerifyObjectCallback completed "
+ << dendl;
+ m_io_ctx.close();
+
+ this->finish(r);
+ delete this;
+ }
+ }
+
+ int send() override {
+ send_list_snaps();
+ return 0;
+ }
+
+private:
+ librados::IoCtx m_io_ctx;
+ uint64_t m_snap_id;
+ uint64_t m_object_no;
+ std::string m_oid;
+ ObjectIterateWork<I> m_handle_mismatch;
+ std::atomic_flag *m_invalidate;
+
+ librados::snap_set_t m_snap_set;
+ int m_snap_list_ret = 0;
+
+ bool should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ if (r == 0) {
+ r = m_snap_list_ret;
+ }
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << m_oid << " C_VerifyObjectCallback::should_complete: "
+ << "encountered an error: " << cpp_strerror(r) << dendl;
+ return true;
+ }
+
+ ldout(cct, 20) << m_oid << " C_VerifyObjectCallback::should_complete: "
+ << "r=" << r << dendl;
+ return object_map_action(get_object_state());
+ }
+
+ void send_list_snaps() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ ldout(image_ctx.cct, 5) << m_oid
+ << " C_VerifyObjectCallback::send_list_snaps"
+ << dendl;
+
+ librados::ObjectReadOperation op;
+ op.list_snaps(&m_snap_set, &m_snap_list_ret);
+
+ librados::AioCompletion *comp = util::create_rados_callback(this);
+ int r = m_io_ctx.aio_operate(m_oid, comp, &op, NULL);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+
+ uint8_t get_object_state() {
+ I &image_ctx = this->m_image_ctx;
+ std::shared_lock image_locker{image_ctx.image_lock};
+ for (std::vector<librados::clone_info_t>::const_iterator r =
+ m_snap_set.clones.begin(); r != m_snap_set.clones.end(); ++r) {
+ librados::snap_t from_snap_id;
+ librados::snap_t to_snap_id;
+ if (r->cloneid == librados::SNAP_HEAD) {
+ from_snap_id = next_valid_snap_id(m_snap_set.seq + 1);
+ to_snap_id = librados::SNAP_HEAD;
+ } else {
+ from_snap_id = next_valid_snap_id(r->snaps[0]);
+ to_snap_id = r->snaps[r->snaps.size()-1];
+ }
+
+ if (to_snap_id < m_snap_id) {
+ continue;
+ } else if (m_snap_id < from_snap_id) {
+ break;
+ }
+
+ if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 &&
+ from_snap_id != m_snap_id) {
+ return OBJECT_EXISTS_CLEAN;
+ }
+ return OBJECT_EXISTS;
+ }
+ return OBJECT_NONEXISTENT;
+ }
+
+ uint64_t next_valid_snap_id(uint64_t snap_id) {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+ std::map<librados::snap_t, SnapInfo>::iterator it =
+ image_ctx.snap_info.lower_bound(snap_id);
+ if (it == image_ctx.snap_info.end()) {
+ return CEPH_NOSNAP;
+ }
+ return it->first;
+ }
+
+ bool object_map_action(uint8_t new_state) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ std::shared_lock image_locker{image_ctx.image_lock};
+ ceph_assert(image_ctx.object_map != nullptr);
+
+ uint8_t state = (*image_ctx.object_map)[m_object_no];
+ ldout(cct, 10) << "C_VerifyObjectCallback::object_map_action"
+ << " object " << image_ctx.get_object_name(m_object_no)
+ << " state " << (int)state
+ << " new_state " << (int)new_state << dendl;
+
+ if (state != new_state) {
+ int r = 0;
+
+ ceph_assert(m_handle_mismatch);
+ r = m_handle_mismatch(image_ctx, m_object_no, state, new_state);
+ if (r) {
+ lderr(cct) << "object map error: object "
+ << image_ctx.get_object_name(m_object_no)
+ << " marked as " << (int)state << ", but should be "
+ << (int)new_state << dendl;
+ m_invalidate->test_and_set();
+ } else {
+ ldout(cct, 1) << "object map inconsistent: object "
+ << image_ctx.get_object_name(m_object_no)
+ << " marked as " << (int)state << ", but should be "
+ << (int)new_state << dendl;
+ }
+ }
+
+ return true;
+ }
+};
+
+} // anonymous namespace
+
+template <typename I>
+void ObjectMapIterateRequest<I>::send() {
+ if (!m_image_ctx.data_ctx.is_valid()) {
+ this->async_complete(-ENODEV);
+ return;
+ }
+
+ send_verify_objects();
+}
+
+template <typename I>
+bool ObjectMapIterateRequest<I>::should_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
+
+ if (r == -ENODEV) {
+ lderr(cct) << "missing data pool" << dendl;
+ return true;
+ }
+
+ if (r < 0) {
+ lderr(cct) << "object map operation encountered an error: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ switch (m_state) {
+ case STATE_VERIFY_OBJECTS:
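+ // a verify callback flagged a mismatch it could not repair, so invalidate the on-disk object map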
+ if (m_invalidate.test_and_set()) {
+ send_invalidate_object_map();
+ return false;
+ } else if (r == 0) {
+ return true;
+ }
+ break;
+
+ case STATE_INVALIDATE_OBJECT_MAP:
+ if (r == 0) {
+ return true;
+ }
+ break;
+
+ default:
+ ceph_abort();
+ break;
+ }
+
+ if (r < 0) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+void ObjectMapIterateRequest<I>::send_verify_objects() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ CephContext *cct = m_image_ctx.cct;
+
+ uint64_t snap_id;
+ uint64_t num_objects;
+ {
+ std::shared_lock l{m_image_ctx.image_lock};
+ snap_id = m_image_ctx.snap_id;
+ num_objects = Striper::get_num_objects(m_image_ctx.layout,
+ m_image_ctx.get_image_size(snap_id));
+ }
+ ldout(cct, 5) << this << " send_verify_objects" << dendl;
+
+ m_state = STATE_VERIFY_OBJECTS;
+
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_VerifyObjectCallback<I> >(),
+ boost::lambda::_1, &m_image_ctx, snap_id,
+ boost::lambda::_2, m_handle_mismatch, &m_invalidate));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, m_image_ctx, context_factory, this->create_callback_context(),
+ &m_prog_ctx, 0, num_objects);
+ throttle->start_ops(
+ m_image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+uint64_t ObjectMapIterateRequest<I>::get_image_size() const {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ if (m_image_ctx.snap_id == CEPH_NOSNAP) {
+ if (!m_image_ctx.resize_reqs.empty()) {
+ return m_image_ctx.resize_reqs.front()->get_image_size();
+ } else {
+ return m_image_ctx.size;
+ }
+ }
+ return m_image_ctx.get_image_size(m_image_ctx.snap_id);
+}
+
+template <typename I>
+void ObjectMapIterateRequest<I>::send_invalidate_object_map() {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 5) << this << " send_invalidate_object_map" << dendl;
+ m_state = STATE_INVALIDATE_OBJECT_MAP;
+
+ object_map::InvalidateRequest<I> *req =
+ object_map::InvalidateRequest<I>::create(m_image_ctx, m_image_ctx.snap_id,
+ true,
+ this->create_callback_context());
+
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ req->send();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::ObjectMapIterateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/ObjectMapIterate.h b/src/librbd/operation/ObjectMapIterate.h
new file mode 100644
index 000000000..14215902a
--- /dev/null
+++ b/src/librbd/operation/ObjectMapIterate.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_OBJECT_MAP_ITERATE_H
+#define CEPH_LIBRBD_OPERATION_OBJECT_MAP_ITERATE_H
+
+#include <iostream>
+#include <atomic>
+
+#include "include/int_types.h"
+#include "include/rbd/object_map_types.h"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+using ObjectIterateWork = bool(*)(ImageCtxT &image_ctx,
+ uint64_t object_no,
+ uint8_t current_state,
+ uint8_t new_state);
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectMapIterateRequest : public AsyncRequest<ImageCtxT> {
+public:
+ ObjectMapIterateRequest(ImageCtxT &image_ctx, Context *on_finish,
+ ProgressContext &prog_ctx,
+ ObjectIterateWork<ImageCtxT> handle_mismatch)
+ : AsyncRequest<ImageCtxT>(image_ctx, on_finish), m_image_ctx(image_ctx),
+ m_prog_ctx(prog_ctx), m_handle_mismatch(handle_mismatch)
+ {
+ }
+
+ void send() override;
+
+protected:
+ bool should_complete(int r) override;
+
+private:
+ enum State {
+ STATE_VERIFY_OBJECTS,
+ STATE_INVALIDATE_OBJECT_MAP
+ };
+
+ ImageCtxT &m_image_ctx;
+ ProgressContext &m_prog_ctx;
+ ObjectIterateWork<ImageCtxT> m_handle_mismatch;
+ std::atomic_flag m_invalidate = ATOMIC_FLAG_INIT;
+ State m_state = STATE_VERIFY_OBJECTS;
+
+ void send_verify_objects();
+ void send_invalidate_object_map();
+
+ uint64_t get_image_size() const;
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::ObjectMapIterateRequest<librbd::ImageCtx>;
+
+#endif
diff --git a/src/librbd/operation/RebuildObjectMapRequest.cc b/src/librbd/operation/RebuildObjectMapRequest.cc
new file mode 100644
index 000000000..5deb182e5
--- /dev/null
+++ b/src/librbd/operation/RebuildObjectMapRequest.cc
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/RebuildObjectMapRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/operation/TrimRequest.h"
+#include "librbd/operation/ObjectMapIterate.h"
+#include "librbd/Utils.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::RebuildObjectMapRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+
+template <typename I>
+void RebuildObjectMapRequest<I>::send() {
+ send_resize_object_map();
+}
+
+template <typename I>
+bool RebuildObjectMapRequest<I>::should_complete(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
+
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ switch (m_state) {
+ case STATE_RESIZE_OBJECT_MAP:
+ ldout(cct, 5) << "RESIZE_OBJECT_MAP" << dendl;
+ if (r == -ESTALE && !m_attempted_trim) {
+ // objects are still flagged as in-use -- delete them
+ m_attempted_trim = true;
+ send_trim_image();
+ return false;
+ } else if (r == 0) {
+ send_verify_objects();
+ }
+ break;
+
+ case STATE_TRIM_IMAGE:
+ ldout(cct, 5) << "TRIM_IMAGE" << dendl;
+ if (r == 0) {
+ send_resize_object_map();
+ }
+ break;
+
+ case STATE_VERIFY_OBJECTS:
+ ldout(cct, 5) << "VERIFY_OBJECTS" << dendl;
+ if (r == 0) {
+ send_save_object_map();
+ }
+ break;
+
+ case STATE_SAVE_OBJECT_MAP:
+ ldout(cct, 5) << "SAVE_OBJECT_MAP" << dendl;
+ if (r == 0) {
+ send_update_header();
+ }
+ break;
+ case STATE_UPDATE_HEADER:
+ ldout(cct, 5) << "UPDATE_HEADER" << dendl;
+ if (r == 0) {
+ return true;
+ }
+ break;
+
+ default:
+ ceph_abort();
+ break;
+ }
+
+ if (r == -ERESTART) {
+ ldout(cct, 5) << "rebuild object map operation interrupted" << dendl;
+ return true;
+ } else if (r < 0) {
+ lderr(cct) << "rebuild object map encountered an error: " << cpp_strerror(r)
+ << dendl;
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+void RebuildObjectMapRequest<I>::send_resize_object_map() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ CephContext *cct = m_image_ctx.cct;
+
+ m_image_ctx.image_lock.lock_shared();
+ ceph_assert(m_image_ctx.object_map != nullptr);
+
+ uint64_t size = get_image_size();
+ uint64_t num_objects = Striper::get_num_objects(m_image_ctx.layout, size);
+
+ if (m_image_ctx.object_map->size() == num_objects) {
+ m_image_ctx.image_lock.unlock_shared();
+ send_verify_objects();
+ return;
+ }
+
+ ldout(cct, 5) << this << " send_resize_object_map" << dendl;
+ m_state = STATE_RESIZE_OBJECT_MAP;
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ m_image_ctx.object_map->aio_resize(size, OBJECT_NONEXISTENT,
+ this->create_callback_context());
+ m_image_ctx.image_lock.unlock_shared();
+}
+
+template <typename I>
+void RebuildObjectMapRequest<I>::send_trim_image() {
+ CephContext *cct = m_image_ctx.cct;
+
+ std::shared_lock l{m_image_ctx.owner_lock};
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+ ldout(cct, 5) << this << " send_trim_image" << dendl;
+ m_state = STATE_TRIM_IMAGE;
+
+ uint64_t new_size;
+ uint64_t orig_size;
+ {
+ std::shared_lock l{m_image_ctx.image_lock};
+ ceph_assert(m_image_ctx.object_map != nullptr);
+
+ new_size = get_image_size();
+ orig_size = m_image_ctx.get_object_size() *
+ m_image_ctx.object_map->size();
+ }
+ TrimRequest<I> *req = TrimRequest<I>::create(m_image_ctx,
+ this->create_callback_context(),
+ orig_size, new_size, m_prog_ctx);
+ req->send();
+}
+
+template <typename I>
+bool update_object_map(I& image_ctx, uint64_t object_no, uint8_t current_state,
+ uint8_t new_state) {
+ CephContext *cct = image_ctx.cct;
+ uint64_t snap_id = image_ctx.snap_id;
+
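+ // re-read the current in-memory state; it may have changed since the verify pass sampled it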
+ current_state = (*image_ctx.object_map)[object_no];
+ if (current_state == OBJECT_EXISTS && new_state == OBJECT_NONEXISTENT &&
+ snap_id == CEPH_NOSNAP) {
+ // might be writing object to OSD concurrently
+ new_state = current_state;
+ }
+
+ if (new_state != current_state) {
+ ldout(cct, 15) << image_ctx.get_object_name(object_no)
+ << " rebuild updating object map "
+ << static_cast<uint32_t>(current_state) << "->"
+ << static_cast<uint32_t>(new_state) << dendl;
+ image_ctx.object_map->set_state(object_no, new_state, current_state);
+ }
+ return false;
+}
+
+template <typename I>
+void RebuildObjectMapRequest<I>::send_verify_objects() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ CephContext *cct = m_image_ctx.cct;
+
+ m_state = STATE_VERIFY_OBJECTS;
+ ldout(cct, 5) << this << " send_verify_objects" << dendl;
+
+ ObjectMapIterateRequest<I> *req =
+ new ObjectMapIterateRequest<I>(m_image_ctx,
+ this->create_callback_context(),
+ m_prog_ctx, update_object_map);
+
+ req->send();
+}
+
+template <typename I>
+void RebuildObjectMapRequest<I>::send_save_object_map() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 5) << this << " send_save_object_map" << dendl;
+ m_state = STATE_SAVE_OBJECT_MAP;
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ std::shared_lock image_locker{m_image_ctx.image_lock};
+ ceph_assert(m_image_ctx.object_map != nullptr);
+ m_image_ctx.object_map->aio_save(this->create_callback_context());
+}
+
+template <typename I>
+void RebuildObjectMapRequest<I>::send_update_header() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(m_image_ctx.exclusive_lock == nullptr ||
+ m_image_ctx.exclusive_lock->is_lock_owner());
+
+ ldout(m_image_ctx.cct, 5) << this << " send_update_header" << dendl;
+ m_state = STATE_UPDATE_HEADER;
+
+ librados::ObjectWriteOperation op;
+
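+ // write value 0 under the invalid-flags mask, clearing both flags in the on-disk header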
+ uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID | RBD_FLAG_FAST_DIFF_INVALID;
+ cls_client::set_flags(&op, m_image_ctx.snap_id, 0, flags);
+
+ librados::AioCompletion *comp = this->create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+
+ std::unique_lock image_locker{m_image_ctx.image_lock};
+ m_image_ctx.update_flags(m_image_ctx.snap_id, flags, false);
+}
+
+template <typename I>
+uint64_t RebuildObjectMapRequest<I>::get_image_size() const {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+ if (m_image_ctx.snap_id == CEPH_NOSNAP) {
+ if (!m_image_ctx.resize_reqs.empty()) {
+ return m_image_ctx.resize_reqs.front()->get_image_size();
+ } else {
+ return m_image_ctx.size;
+ }
+ }
+ return m_image_ctx.get_image_size(m_image_ctx.snap_id);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::RebuildObjectMapRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/RebuildObjectMapRequest.h b/src/librbd/operation/RebuildObjectMapRequest.h
new file mode 100644
index 000000000..c7f1aa3b7
--- /dev/null
+++ b/src/librbd/operation/RebuildObjectMapRequest.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class RebuildObjectMapRequest : public AsyncRequest<ImageCtxT> {
+public:
+
+ RebuildObjectMapRequest(ImageCtxT &image_ctx, Context *on_finish,
+ ProgressContext &prog_ctx)
+ : AsyncRequest<ImageCtxT>(image_ctx, on_finish), m_image_ctx(image_ctx),
+ m_prog_ctx(prog_ctx), m_attempted_trim(false)
+ {
+ }
+
+ void send() override;
+
+protected:
+ bool should_complete(int r) override;
+
+private:
+ /**
+ * Rebuild object map goes through the following state machine to
+ * verify per-object state:
+ *
+ * <start>
+ * . | . . . . . . . . . .
+ * . | . .
+ * . v v .
+ * . STATE_RESIZE_OBJECT_MAP . . . > STATE_TRIM_IMAGE
+ * . |
+ * . v
+ * . . . > STATE_VERIFY_OBJECTS
+ * |
+ * v
+ * STATE_SAVE_OBJECT_MAP
+ * |
+ * v
+ * STATE_UPDATE_HEADER
+ *
+ * The _RESIZE_OBJECT_MAP state will be skipped if the object map
+ * is appropriately sized for the image. The _TRIM_IMAGE state will
+ * only be hit if the resize failed due to an in-use object.
+ */
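+
+ // STATE_VERIFY_OBJECTS delegates to ObjectMapIterateRequest, passing
+ // update_object_map() as the per-object mismatch handler.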
+ enum State {
+ STATE_RESIZE_OBJECT_MAP,
+ STATE_TRIM_IMAGE,
+ STATE_VERIFY_OBJECTS,
+ STATE_SAVE_OBJECT_MAP,
+ STATE_UPDATE_HEADER
+ };
+
+ ImageCtxT &m_image_ctx;
+ ProgressContext &m_prog_ctx;
+ State m_state = STATE_RESIZE_OBJECT_MAP;
+ bool m_attempted_trim;
+
+ void send_resize_object_map();
+ void send_trim_image();
+ void send_verify_objects();
+ void send_save_object_map();
+ void send_update_header();
+
+ uint64_t get_image_size() const;
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::RebuildObjectMapRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
diff --git a/src/librbd/operation/RenameRequest.cc b/src/librbd/operation/RenameRequest.cc
new file mode 100644
index 000000000..15bcd819c
--- /dev/null
+++ b/src/librbd/operation/RenameRequest.cc
@@ -0,0 +1,257 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/RenameRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/rados/librados.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::RenameRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+ const typename RenameRequest<I>::State& state) {
+ switch(state) {
+ case RenameRequest<I>::STATE_READ_DIRECTORY:
+ os << "READ_DIRECTORY";
+ break;
+ case RenameRequest<I>::STATE_READ_SOURCE_HEADER:
+ os << "READ_SOURCE_HEADER";
+ break;
+ case RenameRequest<I>::STATE_WRITE_DEST_HEADER:
+ os << "WRITE_DEST_HEADER";
+ break;
+ case RenameRequest<I>::STATE_UPDATE_DIRECTORY:
+ os << "UPDATE_DIRECTORY";
+ break;
+ case RenameRequest<I>::STATE_REMOVE_SOURCE_HEADER:
+ os << "REMOVE_SOURCE_HEADER";
+ break;
+ default:
+ os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+ break;
+ }
+ return os;
+}
+
+} // anonymous namespace
+
+template <typename I>
+RenameRequest<I>::RenameRequest(I &image_ctx, Context *on_finish,
+ const std::string &dest_name)
+ : Request<I>(image_ctx, on_finish), m_dest_name(dest_name),
+ m_source_oid(image_ctx.old_format ? util::old_header_name(image_ctx.name) :
+ util::id_obj_name(image_ctx.name)),
+ m_dest_oid(image_ctx.old_format ? util::old_header_name(dest_name) :
+ util::id_obj_name(dest_name)) {
+}
+
+template <typename I>
+void RenameRequest<I>::send_op() {
+ I &image_ctx = this->m_image_ctx;
+ if (image_ctx.old_format) {
+ send_read_source_header();
+ return;
+ }
+ send_read_directory();
+}
+
+template <typename I>
+bool RenameRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+ << "r=" << r << dendl;
+ r = filter_return_code(r);
+ if (r < 0) {
+ if (r == -EEXIST) {
+ ldout(cct, 1) << "image already exists" << dendl;
+ } else {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+ }
+
+ if (m_state == STATE_READ_DIRECTORY) {
+ std::string name;
+ auto it = m_source_name_bl.cbegin();
+ r = cls_client::dir_get_name_finish(&it, &name);
+ if (r < 0) {
+ lderr(cct) << "could not read directory: " << cpp_strerror(r) << dendl;
+ return true;
+ }
+ bool update = false;
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ update = image_ctx.name != name;
+ }
+ if (update) {
+ image_ctx.set_image_name(name);
+ m_source_oid = util::id_obj_name(name);
+ }
+ } else if (m_state == STATE_UPDATE_DIRECTORY) {
+ // update in-memory name before removing source header
+ apply();
+ } else if (m_state == STATE_REMOVE_SOURCE_HEADER) {
+ return true;
+ }
+
+ std::shared_lock owner_lock{image_ctx.owner_lock};
+ switch (m_state) {
+ case STATE_READ_DIRECTORY:
+ send_read_source_header();
+ break;
+ case STATE_READ_SOURCE_HEADER:
+ send_write_destination_header();
+ break;
+ case STATE_WRITE_DEST_HEADER:
+ send_update_directory();
+ break;
+ case STATE_UPDATE_DIRECTORY:
+ send_remove_source_header();
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ return false;
+}
+
+template <typename I>
+int RenameRequest<I>::filter_return_code(int r) const {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (m_state == STATE_READ_SOURCE_HEADER && r == -ENOENT) {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.name == m_dest_name) {
+ // signal that replay raced with itself
+ return -EEXIST;
+ }
+ } else if (m_state == STATE_REMOVE_SOURCE_HEADER && r < 0) {
+ if (r != -ENOENT) {
+ lderr(cct) << "warning: couldn't remove old source object ("
+ << m_source_oid << ")" << dendl;
+ }
+ return 0;
+ }
+ return r;
+}
+
+template <typename I>
+void RenameRequest<I>::send_read_directory() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ m_state = STATE_READ_DIRECTORY;
+
+ librados::ObjectReadOperation op;
+ cls_client::dir_get_name_start(&op, image_ctx.id);
+
+ auto comp = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(RBD_DIRECTORY, comp, &op,
+ &m_source_name_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void RenameRequest<I>::send_read_source_header() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ m_state = STATE_READ_SOURCE_HEADER;
+
+ librados::ObjectReadOperation op;
+ op.read(0, 0, NULL, NULL);
+
+ // TODO: old code read omap values but there are no omap values on the
+ // old format header nor the new format id object
+ librados::AioCompletion *rados_completion = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op,
+ &m_header_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void RenameRequest<I>::send_write_destination_header() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ m_state = STATE_WRITE_DEST_HEADER;
+
+ librados::ObjectWriteOperation op;
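+ // exclusive create: fails with -EEXIST if the destination header already exists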
+ op.create(true);
+ op.write_full(m_header_bl);
+
+ librados::AioCompletion *rados_completion = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(m_dest_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void RenameRequest<I>::send_update_directory() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ m_state = STATE_UPDATE_DIRECTORY;
+
+ librados::ObjectWriteOperation op;
+ if (image_ctx.old_format) {
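+ // old-format image: update rbd_directory via a TMAP transaction (add dest name, remove source)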
+ bufferlist cmd_bl;
+ bufferlist empty_bl;
+ encode(static_cast<__u8>(CEPH_OSD_TMAP_SET), cmd_bl);
+ encode(m_dest_name, cmd_bl);
+ encode(empty_bl, cmd_bl);
+ encode(static_cast<__u8>(CEPH_OSD_TMAP_RM), cmd_bl);
+ encode(image_ctx.name, cmd_bl);
+ op.tmap_update(cmd_bl);
+ } else {
+ cls_client::dir_rename_image(&op, image_ctx.name, m_dest_name,
+ image_ctx.id);
+ }
+
+ librados::AioCompletion *rados_completion = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(RBD_DIRECTORY, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void RenameRequest<I>::send_remove_source_header() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ m_state = STATE_REMOVE_SOURCE_HEADER;
+
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ librados::AioCompletion *rados_completion = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+void RenameRequest<I>::apply() {
+ I &image_ctx = this->m_image_ctx;
+ image_ctx.set_image_name(m_dest_name);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::RenameRequest<librbd::ImageCtx>;
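The old-format branch of send_update_directory() above encodes a TMAP_SET of the destination name followed by a TMAP_RM of the source, so the directory entry moves in a single atomic object operation. A minimal standalone sketch of that set-then-remove semantics, with a std::map standing in for the on-disk tmap (rename_directory_entry is illustrative, not a librbd function):

#include <map>
#include <string>

bool rename_directory_entry(std::map<std::string, std::string>& dir,
                            const std::string& source,
                            const std::string& dest) {
  auto it = dir.find(source);
  if (it == dir.end() || dir.count(dest) > 0) {
    return false;  // source missing or destination taken (the -EEXIST case)
  }
  dir[dest] = it->second;  // CEPH_OSD_TMAP_SET equivalent
  dir.erase(it);           // CEPH_OSD_TMAP_RM equivalent
  return true;
}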
diff --git a/src/librbd/operation/RenameRequest.h b/src/librbd/operation/RenameRequest.h
new file mode 100644
index 000000000..11fdec648
--- /dev/null
+++ b/src/librbd/operation/RenameRequest.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_RENAME_REQUEST_H
+#define CEPH_LIBRBD_RENAME_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class RenameRequest : public Request<ImageCtxT>
+{
+public:
+ /**
+ * Rename goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_READ_DIRECTORY
+ * |
+ * v
+ * STATE_READ_SOURCE_HEADER
+ * |
+ * v
+ * STATE_WRITE_DEST_HEADER
+ * |
+ * v
+ * STATE_UPDATE_DIRECTORY
+ * |
+ * v
+ * STATE_REMOVE_SOURCE_HEADER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+ enum State {
+ STATE_READ_DIRECTORY,
+ STATE_READ_SOURCE_HEADER,
+ STATE_WRITE_DEST_HEADER,
+ STATE_UPDATE_DIRECTORY,
+ STATE_REMOVE_SOURCE_HEADER
+ };
+
+ RenameRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const std::string &dest_name);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ int filter_return_code(int r) const override;
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::RenameEvent(op_tid, m_dest_name);
+ }
+
+private:
+ std::string m_dest_name;
+
+ std::string m_source_oid;
+ std::string m_dest_oid;
+
+ State m_state = STATE_READ_DIRECTORY;
+
+ bufferlist m_source_name_bl;
+ bufferlist m_header_bl;
+
+ void send_read_directory();
+ void send_read_source_header();
+ void send_write_destination_header();
+ void send_update_directory();
+ void send_remove_source_header();
+
+ void apply();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::RenameRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_RENAME_REQUEST_H
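Since the rename state machine documented above is strictly linear, its progression can be modeled as stepping through the enum in order. A small illustrative sketch (next_state is hypothetical, not part of the class):

#include <optional>

enum State {  // mirrors the documented order
  STATE_READ_DIRECTORY,
  STATE_READ_SOURCE_HEADER,
  STATE_WRITE_DEST_HEADER,
  STATE_UPDATE_DIRECTORY,
  STATE_REMOVE_SOURCE_HEADER
};

std::optional<State> next_state(State s) {
  if (s == STATE_REMOVE_SOURCE_HEADER) {
    return std::nullopt;  // maps to <finish> in the diagram
  }
  return static_cast<State>(static_cast<int>(s) + 1);
}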
diff --git a/src/librbd/operation/Request.cc b/src/librbd/operation/Request.cc
new file mode 100644
index 000000000..269c8a4f9
--- /dev/null
+++ b/src/librbd/operation/Request.cc
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/Request.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Request: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+Request<I>::Request(I &image_ctx, Context *on_finish, uint64_t journal_op_tid)
+ : AsyncRequest<I>(image_ctx, on_finish), m_op_tid(journal_op_tid) {
+}
+
+template <typename I>
+void Request<I>::send() {
+ [[maybe_unused]] I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ // automatically create the event if we don't need to worry
+ // about affecting concurrent IO ops
+ if (can_affect_io() || !append_op_event()) {
+ send_op();
+ }
+}
+
+template <typename I>
+Context *Request<I>::create_context_finisher(int r) {
+ // automatically commit the event if required (delete after commit)
+ if (m_appended_op_event && !m_committed_op_event &&
+ commit_op_event(r)) {
+ return nullptr;
+ }
+
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+ return util::create_context_callback<Request<I>, &Request<I>::finish>(this);
+}
+
+template <typename I>
+void Request<I>::finish_and_destroy(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ // automatically commit the event if required (delete after commit)
+ if (m_appended_op_event && !m_committed_op_event &&
+ commit_op_event(r)) {
+ return;
+ }
+
+ AsyncRequest<I>::finish_and_destroy(r);
+}
+
+template <typename I>
+void Request<I>::finish(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ ceph_assert(!m_appended_op_event || m_committed_op_event);
+ AsyncRequest<I>::finish(r);
+}
+
+template <typename I>
+bool Request<I>::append_op_event() {
+ I &image_ctx = this->m_image_ctx;
+
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.journal != nullptr &&
+ image_ctx.journal->is_journal_appending()) {
+ append_op_event(util::create_context_callback<
+ Request<I>, &Request<I>::handle_op_event_safe>(this));
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+bool Request<I>::commit_op_event(int r) {
+ I &image_ctx = this->m_image_ctx;
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ if (!m_appended_op_event) {
+ return false;
+ }
+
+ ceph_assert(m_op_tid != 0);
+ ceph_assert(!m_committed_op_event);
+ m_committed_op_event = true;
+
+ if (image_ctx.journal != nullptr &&
+ image_ctx.journal->is_journal_appending()) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ // ops will be canceled / completed before closing journal
+ ceph_assert(image_ctx.journal->is_journal_ready());
+ image_ctx.journal->commit_op_event(m_op_tid, r,
+ new C_CommitOpEvent(this, r));
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+void Request<I>::handle_commit_op_event(int r, int original_ret_val) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to commit op event to journal: " << cpp_strerror(r)
+ << dendl;
+ }
+ if (original_ret_val < 0) {
+ r = original_ret_val;
+ }
+ finish(r);
+}
+
+template <typename I>
+void Request<I>::replay_op_ready(Context *on_safe) {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+ ceph_assert(m_op_tid != 0);
+
+ m_appended_op_event = true;
+ image_ctx.journal->replay_op_ready(
+ m_op_tid, util::create_async_context_callback(image_ctx, on_safe));
+}
+
+template <typename I>
+void Request<I>::append_op_event(Context *on_safe) {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
+
+ m_op_tid = image_ctx.journal->allocate_op_tid();
+ image_ctx.journal->append_op_event(
+ m_op_tid, journal::EventEntry{create_event(m_op_tid)},
+ new C_AppendOpEvent(this, on_safe));
+}
+
+template <typename I>
+void Request<I>::handle_op_event_safe(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to commit op event to journal: " << cpp_strerror(r)
+ << dendl;
+ this->finish(r);
+ delete this;
+ } else {
+ ceph_assert(!can_affect_io());
+
+ // haven't started the request state machine yet
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ send_op();
+ }
+}
+
+} // namespace operation
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::operation::Request<librbd::ImageCtx>;
+#endif
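The ordering enforced above is append first, then run the op, then commit the event with the op's return value; handle_commit_op_event() lets the op's own error take precedence over a commit error. A condensed sketch of that flow, assuming synchronous stand-ins for the journal calls (Journal and run_op are illustrative placeholders, not librbd types):

#include <cstdint>
#include <functional>

struct Journal {
  uint64_t next_tid = 1;
  uint64_t allocate_op_tid() { return next_tid++; }
  void append(uint64_t, std::function<void(int)> on_safe) { on_safe(0); }
  void commit(uint64_t, int, std::function<void(int)> on_done) { on_done(0); }
};

void run_op(Journal& journal, std::function<int()> op,
            std::function<void(int)> on_finish) {
  uint64_t tid = journal.allocate_op_tid();
  journal.append(tid, [&journal, tid, op, on_finish](int r) {
    if (r < 0) {
      on_finish(r);   // append failed: the op never runs
      return;
    }
    int op_r = op();  // send_op() equivalent
    journal.commit(tid, op_r, [op_r, on_finish](int commit_r) {
      // the op's own error wins over a commit error, mirroring
      // handle_commit_op_event()
      on_finish(op_r < 0 ? op_r : commit_r);
    });
  });
}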
diff --git a/src/librbd/operation/Request.h b/src/librbd/operation/Request.h
new file mode 100644
index 000000000..e32b49644
--- /dev/null
+++ b/src/librbd/operation/Request.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_REQUEST_H
+
+#include "librbd/AsyncRequest.h"
+#include "include/Context.h"
+#include "common/RWLock.h"
+#include "librbd/Utils.h"
+#include "librbd/Journal.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class Request : public AsyncRequest<ImageCtxT> {
+public:
+ Request(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t journal_op_tid = 0);
+
+ void send();
+
+protected:
+ void finish(int r) override;
+ virtual void send_op() = 0;
+
+ virtual bool can_affect_io() const {
+ return false;
+ }
+ virtual journal::Event create_event(uint64_t op_tid) const = 0;
+
+ template <typename T, Context*(T::*MF)(int*)>
+ bool append_op_event(T *request) {
+ ImageCtxT &image_ctx = this->m_image_ctx;
+
+ ceph_assert(can_affect_io());
+ std::scoped_lock locker{image_ctx.owner_lock, image_ctx.image_lock};
+ if (image_ctx.journal != nullptr) {
+ if (image_ctx.journal->is_journal_replaying()) {
+ Context *ctx = util::create_context_callback<T, MF>(request);
+ replay_op_ready(ctx);
+ return true;
+ } else if (image_ctx.journal->is_journal_appending()) {
+ Context *ctx = util::create_context_callback<T, MF>(request);
+ append_op_event(ctx);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool append_op_event();
+
+ // NOTE: temporary until converted to new state machine format
+ Context *create_context_finisher(int r);
+ void finish_and_destroy(int r) override;
+
+private:
+ struct C_AppendOpEvent : public Context {
+ Request *request;
+ Context *on_safe;
+ C_AppendOpEvent(Request *request, Context *on_safe)
+ : request(request), on_safe(on_safe) {
+ }
+ void finish(int r) override {
+ if (r >= 0) {
+ request->m_appended_op_event = true;
+ }
+ on_safe->complete(r);
+ }
+ };
+
+ struct C_CommitOpEvent : public Context {
+ Request *request;
+ int ret_val;
+ C_CommitOpEvent(Request *request, int ret_val)
+ : request(request), ret_val(ret_val) {
+ }
+ void finish(int r) override {
+ request->handle_commit_op_event(r, ret_val);
+ delete request;
+ }
+ };
+
+ uint64_t m_op_tid = 0;
+ bool m_appended_op_event = false;
+ bool m_committed_op_event = false;
+
+ void replay_op_ready(Context *on_safe);
+ void append_op_event(Context *on_safe);
+ void handle_op_event_safe(int r);
+
+ bool commit_op_event(int r);
+ void handle_commit_op_event(int r, int original_ret_val);
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::Request<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_REQUEST_H
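C_AppendOpEvent above records a successful append before forwarding the result, so a later commit is only attempted for events that actually reached the journal. A minimal sketch of that wrapper pattern (AppendTracker is illustrative):

#include <functional>

struct AppendTracker {
  bool appended = false;

  // Wrap an on_safe callback so a successful append is recorded before the
  // result is forwarded, mirroring C_AppendOpEvent::finish().
  std::function<void(int)> wrap(std::function<void(int)> on_safe) {
    return [this, on_safe](int r) {
      if (r >= 0) {
        appended = true;  // only appended events need a later commit
      }
      on_safe(r);
    };
  }
};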
diff --git a/src/librbd/operation/ResizeRequest.cc b/src/librbd/operation/ResizeRequest.cc
new file mode 100644
index 000000000..e4e76dacd
--- /dev/null
+++ b/src/librbd/operation/ResizeRequest.cc
@@ -0,0 +1,466 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/operation/TrimRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::ResizeRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace operation {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+ResizeRequest<I>::ResizeRequest(I &image_ctx, Context *on_finish,
+ uint64_t new_size, bool allow_shrink, ProgressContext &prog_ctx,
+ uint64_t journal_op_tid, bool disable_journal)
+ : Request<I>(image_ctx, on_finish, journal_op_tid),
+ m_original_size(0), m_new_size(new_size), m_allow_shrink(allow_shrink),
+ m_prog_ctx(prog_ctx), m_new_parent_overlap(0), m_disable_journal(disable_journal),
+ m_xlist_item(this)
+{
+}
+
+template <typename I>
+ResizeRequest<I>::~ResizeRequest() {
+ I &image_ctx = this->m_image_ctx;
+ ResizeRequest *next_req = NULL;
+ {
+ std::unique_lock image_locker{image_ctx.image_lock};
+ ceph_assert(m_xlist_item.remove_myself());
+ if (!image_ctx.resize_reqs.empty()) {
+ next_req = image_ctx.resize_reqs.front();
+ }
+ }
+
+ if (next_req != NULL) {
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ next_req->send();
+ }
+}
+
+template <typename I>
+void ResizeRequest<I>::send() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ {
+ std::unique_lock image_locker{image_ctx.image_lock};
+ if (!m_xlist_item.is_on_list()) {
+ image_ctx.resize_reqs.push_back(&m_xlist_item);
+ if (image_ctx.resize_reqs.front() != this) {
+ return;
+ }
+ }
+
+ ceph_assert(image_ctx.resize_reqs.front() == this);
+ m_original_size = image_ctx.size;
+ compute_parent_overlap();
+ }
+
+ Request<I>::send();
+}
+
+template <typename I>
+void ResizeRequest<I>::send_op() {
+ [[maybe_unused]] I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ if (this->is_canceled()) {
+ this->async_complete(-ERESTART);
+ } else {
+ send_pre_block_writes();
+ }
+}
+
+template <typename I>
+void ResizeRequest<I>::send_pre_block_writes() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ image_ctx.io_image_dispatcher->block_writes(create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_pre_block_writes>(this));
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_pre_block_writes(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+ image_ctx.io_image_dispatcher->unblock_writes();
+ return this->create_context_finisher(*result);
+ }
+
+ return send_append_op_event();
+}
+
+template <typename I>
+Context *ResizeRequest<I>::send_append_op_event() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (m_new_size < m_original_size && !m_allow_shrink) {
+ ldout(cct, 1) << "shrinking the image is not permitted" << dendl;
+ image_ctx.io_image_dispatcher->unblock_writes();
+ this->async_complete(-EINVAL);
+ return nullptr;
+ }
+
+ if (m_disable_journal || !this->template append_op_event<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_append_op_event>(this)) {
+ return send_grow_object_map();
+ }
+
+ ldout(cct, 5) << dendl;
+ return nullptr;
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_append_op_event(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result)
+ << dendl;
+ image_ctx.io_image_dispatcher->unblock_writes();
+ return this->create_context_finisher(*result);
+ }
+
+ return send_grow_object_map();
+}
+
+template <typename I>
+void ResizeRequest<I>::send_trim_image() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ TrimRequest<I> *req = TrimRequest<I>::create(
+ image_ctx, create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_trim_image>(this),
+ m_original_size, m_new_size, m_prog_ctx);
+ req->send();
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_trim_image(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result == -ERESTART) {
+ ldout(cct, 5) << "resize operation interrupted" << dendl;
+ return this->create_context_finisher(*result);
+ } else if (*result < 0) {
+ lderr(cct) << "failed to trim image: " << cpp_strerror(*result) << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_post_block_writes();
+ return nullptr;
+}
+
+template <typename I>
+void ResizeRequest<I>::send_flush_cache() {
+ I &image_ctx = this->m_image_ctx;
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ auto ctx = create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_flush_cache>(this);
+ auto aio_comp = io::AioCompletion::create_and_start(
+ ctx, util::get_image_ctx(&image_ctx), io::AIO_TYPE_FLUSH);
+ auto req = io::ImageDispatchSpec::create_flush(
+ image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, aio_comp,
+ io::FLUSH_SOURCE_INTERNAL, {});
+ req->send();
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_flush_cache(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to flush cache: " << cpp_strerror(*result) << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_invalidate_cache();
+ return nullptr;
+}
+
+template <typename I>
+void ResizeRequest<I>::send_invalidate_cache() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ // need to invalidate since we're deleting objects, and
+ // ObjectCacher doesn't track non-existent objects
+ image_ctx.io_image_dispatcher->invalidate_cache(create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_invalidate_cache>(this));
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_invalidate_cache(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ // ignore busy error -- writeback was successfully flushed so we might be
+ // wasting some cache space for trimmed objects, but they will get purged
+  // eventually. The most likely cause of the issue was an in-flight cache read
+ if (*result < 0 && *result != -EBUSY) {
+ lderr(cct) << "failed to invalidate cache: " << cpp_strerror(*result)
+ << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_trim_image();
+ return nullptr;
+}
+
+template <typename I>
+Context *ResizeRequest<I>::send_grow_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ {
+ std::unique_lock image_locker{image_ctx.image_lock};
+ m_shrink_size_visible = true;
+ }
+
+ if (m_original_size == m_new_size) {
+ image_ctx.io_image_dispatcher->unblock_writes();
+ return this->create_context_finisher(0);
+ } else if (m_new_size < m_original_size) {
+ image_ctx.io_image_dispatcher->unblock_writes();
+ send_flush_cache();
+ return nullptr;
+ }
+
+ image_ctx.owner_lock.lock_shared();
+ image_ctx.image_lock.lock_shared();
+ if (image_ctx.object_map == nullptr) {
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+
+ // IO is still blocked
+ send_update_header();
+ return nullptr;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ image_ctx.object_map->aio_resize(
+ m_new_size, OBJECT_NONEXISTENT, create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_grow_object_map>(this));
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+ return nullptr;
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_grow_object_map(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to resize object map: "
+ << cpp_strerror(*result) << dendl;
+ image_ctx.io_image_dispatcher->unblock_writes();
+ return this->create_context_finisher(*result);
+ }
+
+ // IO is still blocked
+ send_update_header();
+ return nullptr;
+}
+
+template <typename I>
+Context *ResizeRequest<I>::send_shrink_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ image_ctx.owner_lock.lock_shared();
+ image_ctx.image_lock.lock_shared();
+ if (image_ctx.object_map == nullptr || m_new_size > m_original_size) {
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+
+ update_size_and_overlap();
+ return this->create_context_finisher(0);
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "original_size=" << m_original_size << ", "
+ << "new_size=" << m_new_size << dendl;
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ image_ctx.object_map->aio_resize(
+ m_new_size, OBJECT_NONEXISTENT, create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_shrink_object_map>(this));
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+ return nullptr;
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_shrink_object_map(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to resize object map: "
+ << cpp_strerror(*result) << dendl;
+ image_ctx.io_image_dispatcher->unblock_writes();
+ return this->create_context_finisher(*result);
+ }
+
+ update_size_and_overlap();
+ return this->create_context_finisher(0);
+}
+
+template <typename I>
+void ResizeRequest<I>::send_post_block_writes() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ image_ctx.io_image_dispatcher->block_writes(create_context_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_post_block_writes>(this));
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_post_block_writes(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ image_ctx.io_image_dispatcher->unblock_writes();
+ lderr(cct) << "failed to block writes prior to header update: "
+ << cpp_strerror(*result) << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_update_header();
+ return nullptr;
+}
+
+template <typename I>
+void ResizeRequest<I>::send_update_header() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "original_size=" << m_original_size << ", "
+ << "new_size=" << m_new_size << dendl;;
+
+ // should have been canceled prior to releasing lock
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ librados::ObjectWriteOperation op;
+ if (image_ctx.old_format) {
+ // rewrite only the size field of the header
+ ceph_le64 new_size = init_le64(m_new_size);
+ bufferlist bl;
+ bl.append(reinterpret_cast<const char*>(&new_size), sizeof(new_size));
+ op.write(offsetof(rbd_obj_header_ondisk, image_size), bl);
+ } else {
+ cls_client::set_size(&op, m_new_size);
+ }
+
+ librados::AioCompletion *rados_completion = create_rados_callback<
+ ResizeRequest<I>, &ResizeRequest<I>::handle_update_header>(this);
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
+ rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *ResizeRequest<I>::handle_update_header(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to update image header: " << cpp_strerror(*result)
+ << dendl;
+ image_ctx.io_image_dispatcher->unblock_writes();
+ return this->create_context_finisher(*result);
+ }
+
+ return send_shrink_object_map();
+}
+
+template <typename I>
+void ResizeRequest<I>::compute_parent_overlap() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+ if (image_ctx.parent == NULL) {
+ m_new_parent_overlap = 0;
+ } else {
+ m_new_parent_overlap = std::min(m_new_size, image_ctx.parent_md.overlap);
+ }
+}
+
+template <typename I>
+void ResizeRequest<I>::update_size_and_overlap() {
+ I &image_ctx = this->m_image_ctx;
+ {
+ std::unique_lock image_locker{image_ctx.image_lock};
+ image_ctx.size = m_new_size;
+
+ if (image_ctx.parent != NULL && m_new_size < m_original_size) {
+ image_ctx.parent_md.overlap = m_new_parent_overlap;
+ }
+ }
+
+ // blocked by PRE_BLOCK_WRITES (grow) or POST_BLOCK_WRITES (shrink) state
+ image_ctx.io_image_dispatcher->unblock_writes();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::ResizeRequest<librbd::ImageCtx>;
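compute_parent_overlap() above clamps the parent overlap to the new size, so a shrink can reduce how much of the parent stays visible but a grow never extends it. A standalone sketch with worked values:

#include <algorithm>
#include <cassert>
#include <cstdint>

uint64_t new_parent_overlap(uint64_t new_size, uint64_t current_overlap) {
  return std::min(new_size, current_overlap);  // a shrink clips the overlap
}

int main() {
  assert(new_parent_overlap(1 << 20, 4 << 20) == (1u << 20));  // shrink clips
  assert(new_parent_overlap(8 << 20, 4 << 20) == (4u << 20));  // grow keeps it
  return 0;
}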
diff --git a/src/librbd/operation/ResizeRequest.h b/src/librbd/operation/ResizeRequest.h
new file mode 100644
index 000000000..f5e2f807f
--- /dev/null
+++ b/src/librbd/operation/ResizeRequest.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "include/xlist.h"
+
+namespace librbd
+{
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class ResizeRequest : public Request<ImageCtxT> {
+public:
+ static ResizeRequest *create(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t new_size, bool allow_shrink,
+ ProgressContext &prog_ctx, uint64_t journal_op_tid,
+ bool disable_journal) {
+ return new ResizeRequest(image_ctx, on_finish, new_size, allow_shrink, prog_ctx,
+ journal_op_tid, disable_journal);
+ }
+
+ ResizeRequest(ImageCtxT &image_ctx, Context *on_finish, uint64_t new_size,
+ bool allow_shrink, ProgressContext &prog_ctx, uint64_t journal_op_tid,
+ bool disable_journal);
+ ~ResizeRequest() override;
+
+ inline bool shrinking() const {
+ return (m_shrink_size_visible && m_new_size < m_original_size);
+ }
+
+ inline uint64_t get_image_size() const {
+ return m_new_size;
+ }
+
+ void send() override;
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override {
+ return true;
+ }
+ bool can_affect_io() const override {
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::ResizeEvent(op_tid, m_new_size);
+ }
+
+private:
+ /**
+ * Resize goes through the following state machine to resize the image
+ * and update the object map:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_PRE_BLOCK_WRITES
+ * |
+ * v
+ * STATE_APPEND_OP_EVENT (skip if journaling
+ * | disabled)
+ * |
+ * | (grow)
+ * |\--------> STATE_GROW_OBJECT_MAP (skip if object map
+ * | | disabled)
+ * | v
+ * | STATE_UPDATE_HEADER ----------------------------\
+ * | (unblock writes) |
+ * | |
+ * | (unblock writes) |
+ * | |
+ * | (shrink) |
+ * |\--------> STATE_FLUSH_CACHE |
+ * | | |
+ * | v |
+ * | STATE_INVALIDATE_CACHE |
+ * | | |
+ * | v |
+ * | STATE_TRIM_IMAGE |
+ * | | |
+ * | v |
+ * | STATE_POST_BLOCK_WRITES |
+ * | | |
+ * | v |
+ * | STATE_UPDATE_HEADER |
+ * | | |
+ * | v |
+ * | STATE_SHRINK_OBJECT_MAP (skip if object map |
+ * | | disabled) |
+ * | | (unblock writes) |
+ * | (no change) v |
+ * \------------> <finish> <-----------------------------------/
+ *
+ * @endverbatim
+ *
+ * The _OBJECT_MAP states are skipped if the object map isn't enabled.
+   * The state machine will immediately transition to <finish> if there
+   * are no objects to trim.
+ */
+
+ uint64_t m_original_size;
+ uint64_t m_new_size;
+ bool m_allow_shrink = true;
+ ProgressContext &m_prog_ctx;
+ uint64_t m_new_parent_overlap;
+ bool m_shrink_size_visible = false;
+ bool m_disable_journal = false;
+
+ typename xlist<ResizeRequest<ImageCtxT>*>::item m_xlist_item;
+
+ void send_pre_block_writes();
+ Context *handle_pre_block_writes(int *result);
+
+ Context *send_append_op_event();
+ Context *handle_append_op_event(int *result);
+
+ void send_flush_cache();
+ Context *handle_flush_cache(int *result);
+
+ void send_invalidate_cache();
+ Context *handle_invalidate_cache(int *result);
+
+ void send_trim_image();
+ Context *handle_trim_image(int *result);
+
+ Context *send_grow_object_map();
+ Context *handle_grow_object_map(int *result);
+
+ Context *send_shrink_object_map();
+ Context *handle_shrink_object_map(int *result);
+
+ void send_post_block_writes();
+ Context *handle_post_block_writes(int *result);
+
+ void send_update_header();
+ Context *handle_update_header(int *result);
+
+ void compute_parent_overlap();
+ void update_size_and_overlap();
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::ResizeRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
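The diagram above branches on how the new size compares with the original: equal sizes finish immediately, a larger size takes the grow path, a smaller one the shrink path. An illustrative selector (ResizePath and select_path are hypothetical names):

#include <cstdint>

enum class ResizePath { NO_CHANGE, GROW, SHRINK };

ResizePath select_path(uint64_t original_size, uint64_t new_size) {
  if (new_size == original_size) {
    return ResizePath::NO_CHANGE;  // unblock writes and finish immediately
  }
  return new_size > original_size ? ResizePath::GROW : ResizePath::SHRINK;
}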
diff --git a/src/librbd/operation/SnapshotCreateRequest.cc b/src/librbd/operation/SnapshotCreateRequest.cc
new file mode 100644
index 000000000..866ef7d61
--- /dev/null
+++ b/src/librbd/operation/SnapshotCreateRequest.cc
@@ -0,0 +1,449 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/operation/SnapshotCreateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/mirror/snapshot/SetImageStateRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotCreateRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+SnapshotCreateRequest<I>::SnapshotCreateRequest(I &image_ctx,
+ Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t journal_op_tid,
+ uint64_t flags,
+ ProgressContext &prog_ctx)
+ : Request<I>(image_ctx, on_finish, journal_op_tid),
+ m_snap_namespace(snap_namespace), m_snap_name(snap_name),
+ m_skip_object_map(flags & SNAP_CREATE_FLAG_SKIP_OBJECT_MAP),
+ m_skip_notify_quiesce(flags & SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE),
+ m_ignore_notify_quiesce_error(flags & SNAP_CREATE_FLAG_IGNORE_NOTIFY_QUIESCE_ERROR),
+ m_prog_ctx(prog_ctx) {
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_op() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (!image_ctx.data_ctx.is_valid()) {
+ lderr(cct) << "missing data pool" << dendl;
+ this->async_complete(-ENODEV);
+ return;
+ }
+
+ send_notify_quiesce();
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_notify_quiesce() {
+ if (m_skip_notify_quiesce) {
+ send_suspend_requests();
+ return;
+ }
+
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ image_ctx.image_watcher->notify_quiesce(
+ &m_request_id, m_prog_ctx, create_async_context_callback(
+ image_ctx, create_context_callback<SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_notify_quiesce>(this)));
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_notify_quiesce(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0 && !m_ignore_notify_quiesce_error) {
+ lderr(cct) << "failed to notify quiesce: " << cpp_strerror(*result)
+ << dendl;
+ save_result(result);
+ send_notify_unquiesce();
+ return nullptr;
+ }
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ send_suspend_requests();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_suspend_requests() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ // TODO suspend (shrink) resize to ensure consistent RBD mirror
+ send_suspend_aio();
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_suspend_requests(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ // TODO
+ send_suspend_aio();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_suspend_aio() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ image_ctx.io_image_dispatcher->block_writes(create_context_callback<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_suspend_aio>(this));
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_suspend_aio(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+ save_result(result);
+ return send_notify_unquiesce();
+ }
+
+ m_writes_blocked = true;
+
+ send_append_op_event();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_append_op_event() {
+ I &image_ctx = this->m_image_ctx;
+ if (!this->template append_op_event<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_append_op_event>(this)) {
+ send_allocate_snap_id();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_append_op_event(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result)
+ << dendl;
+ save_result(result);
+ return send_notify_unquiesce();
+ }
+
+ send_allocate_snap_id();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_allocate_snap_id() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ librados::AioCompletion *rados_completion = create_rados_callback<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_allocate_snap_id>(this);
+ image_ctx.data_ctx.aio_selfmanaged_snap_create(&m_snap_id, rados_completion);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_allocate_snap_id(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << ", "
+ << "snap_id=" << m_snap_id << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to allocate snapshot id: " << cpp_strerror(*result)
+ << dendl;
+ save_result(result);
+ return send_notify_unquiesce();
+ }
+
+ send_create_snap();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_create_snap() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ // save current size / parent info for creating snapshot record in ImageCtx
+ m_size = image_ctx.size;
+ m_parent_info = image_ctx.parent_md;
+
+ librados::ObjectWriteOperation op;
+ if (image_ctx.old_format) {
+ cls_client::old_snapshot_add(&op, m_snap_id, m_snap_name);
+ } else {
+ cls_client::snapshot_add(&op, m_snap_id, m_snap_name, m_snap_namespace);
+ }
+
+ librados::AioCompletion *rados_completion = create_rados_callback<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_create_snap>(this);
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
+ rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_create_snap(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == -ESTALE) {
+ send_allocate_snap_id();
+ return nullptr;
+ } else if (*result < 0) {
+ save_result(result);
+ send_release_snap_id();
+ return nullptr;
+ }
+
+ return send_create_object_map();
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::send_create_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ image_ctx.image_lock.lock_shared();
+ if (image_ctx.object_map == nullptr || m_skip_object_map) {
+ image_ctx.image_lock.unlock_shared();
+
+ return send_create_image_state();
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ image_ctx.object_map->snapshot_add(
+ m_snap_id, create_context_callback<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_create_object_map>(this));
+ image_ctx.image_lock.unlock_shared();
+ return nullptr;
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_create_object_map(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << this << " " << __func__ << ": failed to snapshot object map: "
+ << cpp_strerror(*result) << dendl;
+
+ save_result(result);
+ update_snap_context();
+ return send_notify_unquiesce();
+ }
+
+ return send_create_image_state();
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::send_create_image_state() {
+ I &image_ctx = this->m_image_ctx;
+ auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &m_snap_namespace);
+ if (mirror_ns == nullptr || !mirror_ns->is_primary()) {
+ update_snap_context();
+ return send_notify_unquiesce();
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ auto req = mirror::snapshot::SetImageStateRequest<I>::create(
+ &image_ctx, m_snap_id, create_context_callback<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_create_image_state>(this));
+ req->send();
+ return nullptr;
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_create_image_state(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ update_snap_context();
+ if (*result < 0) {
+ lderr(cct) << this << " " << __func__ << ": failed to create image state: "
+ << cpp_strerror(*result) << dendl;
+ save_result(result);
+ }
+
+ return send_notify_unquiesce();
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_release_snap_id() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ ceph_assert(m_snap_id != CEPH_NOSNAP);
+
+ librados::AioCompletion *rados_completion = create_rados_callback<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_release_snap_id>(this);
+ image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
+ rados_completion->release();
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_release_snap_id(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ return send_notify_unquiesce();
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::send_notify_unquiesce() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (m_writes_blocked) {
+ image_ctx.io_image_dispatcher->unblock_writes();
+ }
+
+ if (m_skip_notify_quiesce) {
+ return this->create_context_finisher(m_ret_val);
+ }
+
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ image_ctx.image_watcher->notify_unquiesce(
+ m_request_id, create_context_callback<
+ SnapshotCreateRequest<I>,
+ &SnapshotCreateRequest<I>::handle_notify_unquiesce>(this));
+
+ return nullptr;
+}
+
+template <typename I>
+Context *SnapshotCreateRequest<I>::handle_notify_unquiesce(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to notify unquiesce: " << cpp_strerror(*result)
+ << dendl;
+ // ignore error
+ }
+
+ *result = m_ret_val;
+ return this->create_context_finisher(m_ret_val);
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::update_snap_context() {
+ I &image_ctx = this->m_image_ctx;
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ std::unique_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.get_snap_info(m_snap_id) != NULL) {
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+ // immediately add a reference to the new snapshot
+ utime_t snap_time = ceph_clock_now();
+ image_ctx.add_snap(m_snap_namespace, m_snap_name, m_snap_id, m_size,
+ m_parent_info, RBD_PROTECTION_STATUS_UNPROTECTED,
+ 0, snap_time);
+
+ // immediately start using the new snap context if we
+ // own the exclusive lock
+ std::vector<snapid_t> snaps;
+ snaps.push_back(m_snap_id);
+ snaps.insert(snaps.end(), image_ctx.snapc.snaps.begin(),
+ image_ctx.snapc.snaps.end());
+
+ image_ctx.snapc.seq = m_snap_id;
+ image_ctx.snapc.snaps.swap(snaps);
+ image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(
+ image_ctx.snapc.seq, image_ctx.snaps);
+ image_ctx.rebuild_data_io_context();
+
+ if (!image_ctx.migration_info.empty()) {
+ auto it = image_ctx.migration_info.snap_map.find(CEPH_NOSNAP);
+ ceph_assert(it != image_ctx.migration_info.snap_map.end());
+ ceph_assert(!it->second.empty());
+ if (it->second[0] == CEPH_NOSNAP) {
+ ldout(cct, 5) << this << " " << __func__
+ << ": updating migration snap_map" << dendl;
+ it->second[0] = m_snap_id;
+ }
+ }
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>;
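update_snap_context() above prepends the new snapshot id and makes it the sequence number, keeping the snap list ordered newest-first. A standalone sketch of that update (SnapContext here is a simplified stand-in, not the Ceph type):

#include <cstdint>
#include <vector>

struct SnapContext {
  uint64_t seq = 0;
  std::vector<uint64_t> snaps;  // descending, newest snapshot first
};

void add_snapshot(SnapContext& snapc, uint64_t snap_id) {
  std::vector<uint64_t> snaps;
  snaps.reserve(snapc.snaps.size() + 1);
  snaps.push_back(snap_id);  // new id leads the list
  snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
  snapc.seq = snap_id;       // and becomes the new sequence number
  snapc.snaps.swap(snaps);
}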
diff --git a/src/librbd/operation/SnapshotCreateRequest.h b/src/librbd/operation/SnapshotCreateRequest.h
new file mode 100644
index 000000000..d306ee21b
--- /dev/null
+++ b/src/librbd/operation/SnapshotCreateRequest.h
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Types.h"
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotCreateRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * Snap Create goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_NOTIFY_QUIESCE * * * * * * * * * * * * *
+ * | *
+ * v *
+ * STATE_SUSPEND_REQUESTS *
+ * | *
+ * v *
+ * STATE_SUSPEND_AIO * * * * * * * * * * * * * * *
+ * | *
+ * v *
+ * STATE_APPEND_OP_EVENT (skip if journal *
+ * | disabled) *
+ * (retry) v *
+ * . . . > STATE_ALLOCATE_SNAP_ID *
+ * . | *
+ * . v *
+ * . . . . STATE_CREATE_SNAP * * * * * * * * * * * *
+ * | * *
+ * v * *
+ * STATE_CREATE_OBJECT_MAP (skip if * *
+ * | disabled) * *
+ * v * *
+ * STATE_CREATE_IMAGE_STATE (skip if * *
+ * | not mirror * *
+ * | snapshot) * *
+ * | v *
+ * | STATE_RELEASE_SNAP_ID *
+ * | | *
+ * | v *
+ * \------------> STATE_NOTIFY_UNQUIESCE < * *
+ * |
+ * v
+ * <finish>
+ * @endverbatim
+ *
+   * The _CREATE_SNAP state may repeat back to the _ALLOCATE_SNAP_ID state
+ * if a stale snapshot context is allocated. If the create operation needs
+ * to abort, the error path is followed to record the result in the journal
+ * (if enabled) and bubble the originating error code back to the client.
+ */
+ SnapshotCreateRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name, uint64_t journal_op_tid,
+ uint64_t flags, ProgressContext &prog_ctx);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override {
+ return true;
+ }
+ bool can_affect_io() const override {
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapCreateEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+ bool m_skip_object_map;
+ bool m_skip_notify_quiesce;
+ bool m_ignore_notify_quiesce_error;
+ ProgressContext &m_prog_ctx;
+
+ uint64_t m_request_id = 0;
+ int m_ret_val = 0;
+ bool m_writes_blocked = false;
+
+ uint64_t m_snap_id = CEPH_NOSNAP;
+ uint64_t m_size;
+ ParentImageInfo m_parent_info;
+
+ void send_notify_quiesce();
+ Context *handle_notify_quiesce(int *result);
+
+ void send_suspend_requests();
+ Context *handle_suspend_requests(int *result);
+
+ void send_suspend_aio();
+ Context *handle_suspend_aio(int *result);
+
+ void send_append_op_event();
+ Context *handle_append_op_event(int *result);
+
+ void send_allocate_snap_id();
+ Context *handle_allocate_snap_id(int *result);
+
+ void send_create_snap();
+ Context *handle_create_snap(int *result);
+
+ Context *send_create_object_map();
+ Context *handle_create_object_map(int *result);
+
+ Context *send_create_image_state();
+ Context *handle_create_image_state(int *result);
+
+ void send_release_snap_id();
+ Context *handle_release_snap_id(int *result);
+
+ Context *send_notify_unquiesce();
+ Context *handle_notify_unquiesce(int *result);
+
+ void update_snap_context();
+
+ void save_result(int *result) {
+ if (m_ret_val == 0 && *result < 0) {
+ m_ret_val = *result;
+ }
+ }
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
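save_result() above implements a first-error-wins policy: cleanup steps such as releasing the snap id or notifying unquiesce cannot mask the original failure. The same policy as a free function (a hypothetical sketch):

void save_first_error(int& ret_val, int result) {
  if (ret_val == 0 && result < 0) {
    ret_val = result;  // keep the first failure; ignore later ones
  }
}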
diff --git a/src/librbd/operation/SnapshotLimitRequest.cc b/src/librbd/operation/SnapshotLimitRequest.cc
new file mode 100644
index 000000000..17aed5f6a
--- /dev/null
+++ b/src/librbd/operation/SnapshotLimitRequest.cc
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotLimitRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotLimitRequest: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+SnapshotLimitRequest<I>::SnapshotLimitRequest(I &image_ctx,
+ Context *on_finish,
+ uint64_t limit)
+ : Request<I>(image_ctx, on_finish), m_snap_limit(limit) {
+}
+
+template <typename I>
+void SnapshotLimitRequest<I>::send_op() {
+ send_limit_snaps();
+}
+
+template <typename I>
+bool SnapshotLimitRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+template <typename I>
+void SnapshotLimitRequest<I>::send_limit_snaps() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ librados::ObjectWriteOperation op;
+ cls_client::snapshot_set_limit(&op, m_snap_limit);
+
+ librados::AioCompletion *rados_completion =
+ this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion,
+ &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+ }
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotLimitRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotLimitRequest.h b/src/librbd/operation/SnapshotLimitRequest.h
new file mode 100644
index 000000000..09622a459
--- /dev/null
+++ b/src/librbd/operation/SnapshotLimitRequest.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotLimitRequest : public Request<ImageCtxT> {
+public:
+ SnapshotLimitRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t limit);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapLimitEvent(op_tid, m_snap_limit);
+ }
+
+private:
+ uint64_t m_snap_limit;
+
+ void send_limit_snaps();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotLimitRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H
diff --git a/src/librbd/operation/SnapshotProtectRequest.cc b/src/librbd/operation/SnapshotProtectRequest.cc
new file mode 100644
index 000000000..f3b9e7e0b
--- /dev/null
+++ b/src/librbd/operation/SnapshotProtectRequest.cc
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotProtectRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotProtectRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+ const typename SnapshotProtectRequest<I>::State& state) {
+ switch(state) {
+ case SnapshotProtectRequest<I>::STATE_PROTECT_SNAP:
+ os << "PROTECT_SNAP";
+ break;
+ }
+ return os;
+}
+
+} // anonymous namespace
+
+template <typename I>
+SnapshotProtectRequest<I>::SnapshotProtectRequest(I &image_ctx,
+ Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name)
+ : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace),
+ m_snap_name(snap_name), m_state(STATE_PROTECT_SNAP) {
+}
+
+template <typename I>
+void SnapshotProtectRequest<I>::send_op() {
+ send_protect_snap();
+}
+
+template <typename I>
+bool SnapshotProtectRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+ << "r=" << r << dendl;
+ if (r < 0) {
+ if (r == -EBUSY) {
+ ldout(cct, 1) << "snapshot is already protected" << dendl;
+ } else {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ }
+ return true;
+}
+
+template <typename I>
+void SnapshotProtectRequest<I>::send_protect_snap() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ int r = verify_and_send_protect_snap();
+ if (r < 0) {
+ this->async_complete(r);
+ return;
+ }
+}
+
+template <typename I>
+int SnapshotProtectRequest<I>::verify_and_send_protect_snap() {
+ I &image_ctx = this->m_image_ctx;
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ CephContext *cct = image_ctx.cct;
+ if ((image_ctx.features & RBD_FEATURE_LAYERING) == 0) {
+ lderr(cct) << "image must support layering" << dendl;
+ return -ENOSYS;
+ }
+
+ uint64_t snap_id = image_ctx.get_snap_id(m_snap_namespace, m_snap_name);
+ if (snap_id == CEPH_NOSNAP) {
+ return -ENOENT;
+ }
+
+ bool is_protected;
+ int r = image_ctx.is_snap_protected(snap_id, &is_protected);
+ if (r < 0) {
+ return r;
+ }
+
+ if (is_protected) {
+ return -EBUSY;
+ }
+
+ librados::ObjectWriteOperation op;
+ cls_client::set_protection_status(&op, snap_id,
+ RBD_PROTECTION_STATUS_PROTECTED);
+
+ librados::AioCompletion *rados_completion =
+ this->create_callback_completion();
+ r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion,
+ &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+ return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotProtectRequest<librbd::ImageCtx>;
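verify_and_send_protect_snap() above checks all preconditions under the image lock and maps each failure to an errno-style code before any I/O is issued. A condensed sketch of that precondition ladder (SnapState is illustrative):

#include <cerrno>

struct SnapState {
  bool layering_supported;
  bool exists;
  bool is_protected;
};

int verify_protect(const SnapState& snap) {
  if (!snap.layering_supported) {
    return -ENOSYS;  // image must support layering
  }
  if (!snap.exists) {
    return -ENOENT;  // snapshot not found
  }
  if (snap.is_protected) {
    return -EBUSY;   // already protected
  }
  return 0;          // preconditions hold; safe to send the protect op
}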
diff --git a/src/librbd/operation/SnapshotProtectRequest.h b/src/librbd/operation/SnapshotProtectRequest.h
new file mode 100644
index 000000000..bef80229a
--- /dev/null
+++ b/src/librbd/operation/SnapshotProtectRequest.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotProtectRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * Snap Protect goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * STATE_PROTECT_SNAP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+ enum State {
+ STATE_PROTECT_SNAP
+ };
+
+ SnapshotProtectRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapProtectEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+ State m_state;
+
+ void send_protect_snap();
+
+ int verify_and_send_protect_snap();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotProtectRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
diff --git a/src/librbd/operation/SnapshotRemoveRequest.cc b/src/librbd/operation/SnapshotRemoveRequest.cc
new file mode 100644
index 000000000..b78be8a0a
--- /dev/null
+++ b/src/librbd/operation/SnapshotRemoveRequest.cc
@@ -0,0 +1,505 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/DetachChildRequest.h"
+#include "librbd/mirror/snapshot/RemoveImageStateRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRemoveRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+SnapshotRemoveRequest<I>::SnapshotRemoveRequest(
+ I &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name, uint64_t snap_id)
+ : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace),
+ m_snap_name(snap_name), m_snap_id(snap_id) {
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::send_op() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.snap_info.find(m_snap_id) == image_ctx.snap_info.end()) {
+ lderr(cct) << "snapshot doesn't exist" << dendl;
+ this->async_complete(-ENOENT);
+ return;
+ }
+ }
+
+ trash_snap();
+}
+
+template <typename I>
+bool SnapshotRemoveRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+ if (r < 0 && r != -EBUSY) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::trash_snap() {
+ I &image_ctx = this->m_image_ctx;
+ if (image_ctx.old_format) {
+ release_snap_id();
+ return;
+ } else if (cls::rbd::get_snap_namespace_type(m_snap_namespace) ==
+ cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) {
+ get_snap();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::snapshot_trash_add(&op, m_snap_id);
+
+ auto aio_comp = create_rados_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_trash_snap>(this);
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_trash_snap(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ // trash / clone v2 not supported
+ detach_child();
+ return;
+ } else if (r < 0 && r != -EEXIST) {
+ lderr(cct) << "failed to move snapshot to trash: " << cpp_strerror(r)
+ << dendl;
+ this->complete(r);
+ return;
+ }
+
+ m_trashed_snapshot = true;
+ get_snap();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::get_snap() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::snapshot_get_start(&op, m_snap_id);
+
+ auto aio_comp = create_rados_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_get_snap>(this);
+ m_out_bl.clear();
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_get_snap(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == 0) {
+ cls::rbd::SnapshotInfo snap_info;
+
+ auto it = m_out_bl.cbegin();
+ r = cls_client::snapshot_get_finish(&it, &snap_info);
+ m_child_attached = (snap_info.child_count > 0);
+ if (r == 0 && m_child_attached) {
+ list_children();
+ return;
+ }
+ }
+
+ if (r < 0) {
+ lderr(cct) << "failed to retrieve snapshot: " << cpp_strerror(r)
+ << dendl;
+ this->complete(r);
+ return;
+ }
+
+ detach_child();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::list_children() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::children_list_start(&op, m_snap_id);
+
+ m_out_bl.clear();
+ m_child_images.clear();
+ auto aio_comp = create_rados_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_list_children>(this);
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_list_children(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = cls_client::children_list_finish(&it, &m_child_images);
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to retrieve child: " << cpp_strerror(r)
+ << dendl;
+ this->complete(r);
+ return;
+ }
+
+ detach_stale_child();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::detach_stale_child() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ for (auto& child_image : m_child_images) {
+ m_child_attached = true;
+ IoCtx ioctx;
+ int r = util::create_ioctx(image_ctx.md_ctx, "child image",
+ child_image.pool_id,
+ child_image.pool_namespace, &ioctx);
+ if (r == -ENOENT) {
+ librados::ObjectWriteOperation op;
+ cls_client::child_detach(&op, m_snap_id,
+ {child_image.pool_id,
+ child_image.pool_namespace,
+ child_image.image_id});
+ auto aio_comp = create_rados_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_detach_stale_child>(this);
+ r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+ return;
+ } else if (r < 0) {
+ this->async_complete(r);
+ return;
+ }
+ }
+
+ detach_child();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_detach_stale_child(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to detach stale child: " << cpp_strerror(r)
+ << dendl;
+ this->complete(r);
+ return;
+ }
+
+ m_child_attached = false;
+ list_children();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::detach_child() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ bool detach_child = false;
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ cls::rbd::ParentImageSpec our_pspec;
+ int r = image_ctx.get_parent_spec(m_snap_id, &our_pspec);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ ldout(cct, 1) << "No such snapshot" << dendl;
+ } else {
+ lderr(cct) << "failed to retrieve parent spec" << dendl;
+ }
+
+ this->async_complete(r);
+ return;
+ }
+
+ if (image_ctx.parent_md.spec != our_pspec &&
+ (scan_for_parents(our_pspec) == -ENOENT)) {
+ // no other references to the parent image
+ detach_child = true;
+ }
+ }
+
+ if (!detach_child) {
+ // HEAD image or other snapshots still associated with parent
+ remove_object_map();
+ return;
+ }
+
+ ldout(cct, 5) << dendl;
+ auto ctx = create_context_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_detach_child>(this);
+ auto req = image::DetachChildRequest<I>::create(image_ctx, ctx);
+ req->send();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_detach_child(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to detach child from parent: " << cpp_strerror(r)
+ << dendl;
+ this->complete(r);
+ return;
+ }
+
+ remove_object_map();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::remove_object_map() {
+ I &image_ctx = this->m_image_ctx;
+ if (m_child_attached) {
+ // if a clone v2 child is attached to this snapshot, we cannot
+ // proceed: it's only an error if the snap was already in the trash
+ // namespace (when this request just moved it there, the deferred
+ // removal counts as success)
+ this->complete(m_trashed_snapshot ? 0 : -EBUSY);
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+
+ {
+ std::shared_lock owner_lock{image_ctx.owner_lock};
+ std::unique_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.object_map != nullptr) {
+ ldout(cct, 5) << dendl;
+
+ auto ctx = create_context_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_remove_object_map>(this);
+ image_ctx.object_map->snapshot_remove(m_snap_id, ctx);
+ return;
+ }
+ }
+
+ // object map disabled
+ remove_image_state();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_remove_object_map(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove snapshot object map: " << cpp_strerror(r)
+ << dendl;
+ this->complete(r);
+ return;
+ }
+
+ remove_image_state();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::remove_image_state() {
+ I &image_ctx = this->m_image_ctx;
+ auto type = cls::rbd::get_snap_namespace_type(m_snap_namespace);
+
+ if (type != cls::rbd::SNAPSHOT_NAMESPACE_TYPE_MIRROR) {
+ release_snap_id();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ auto ctx = create_context_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_remove_image_state>(this);
+ auto req = mirror::snapshot::RemoveImageStateRequest<I>::create(
+ &image_ctx, m_snap_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_remove_image_state(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove image state: " << cpp_strerror(r)
+ << dendl;
+ if (r != -ENOENT) {
+ this->complete(r);
+ return;
+ }
+ }
+
+ release_snap_id();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::release_snap_id() {
+ I &image_ctx = this->m_image_ctx;
+
+ if (!image_ctx.data_ctx.is_valid()) {
+ remove_snap();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "snap_name=" << m_snap_name << ", "
+ << "snap_id=" << m_snap_id << dendl;
+
+ auto aio_comp = create_rados_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_release_snap_id>(this);
+ image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, aio_comp);
+ aio_comp->release();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_release_snap_id(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "failed to release snap id: " << cpp_strerror(r) << dendl;
+ this->complete(r);
+ return;
+ }
+
+ remove_snap();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::remove_snap() {
+ I &image_ctx = this->m_image_ctx;
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ librados::ObjectWriteOperation op;
+ if (image_ctx.old_format) {
+ cls_client::old_snapshot_remove(&op, m_snap_name);
+ } else {
+ cls_client::snapshot_remove(&op, m_snap_id);
+ }
+
+ auto aio_comp = create_rados_callback<
+ SnapshotRemoveRequest<I>,
+ &SnapshotRemoveRequest<I>::handle_remove_snap>(this);
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_remove_snap(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove snapshot: " << cpp_strerror(r) << dendl;
+ this->complete(r);
+ return;
+ }
+
+ remove_snap_context();
+ this->complete(0);
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::remove_snap_context() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ std::unique_lock image_locker{image_ctx.image_lock};
+ image_ctx.rm_snap(m_snap_namespace, m_snap_name, m_snap_id);
+}
+
+template <typename I>
+int SnapshotRemoveRequest<I>::scan_for_parents(
+ cls::rbd::ParentImageSpec &pspec) {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+ if (pspec.pool_id != -1) {
+ map<uint64_t, SnapInfo>::iterator it;
+ for (it = image_ctx.snap_info.begin();
+ it != image_ctx.snap_info.end(); ++it) {
+ // skip our snap id (if checking base image, CEPH_NOSNAP won't match)
+ if (it->first == m_snap_id) {
+ continue;
+ }
+ if (it->second.parent.spec == pspec) {
+ break;
+ }
+ }
+ if (it == image_ctx.snap_info.end()) {
+ return -ENOENT;
+ }
+ }
+ return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>;
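The m_trashed_snapshot flag above is what gives clone v2 its deferred-deletion behavior. A hedged fragment of what a caller observes (assumes an open librbd::Image named image, as in the earlier protect sketch, with a clone v2 child attached to the snapshot):

// snapshot_trash_add() succeeds, so the snapshot is moved into the trash
// namespace and the request completes with 0 even though a clone v2
// child is still attached (m_trashed_snapshot == true).
int r = image.snap_remove("snap1");
assert(r == 0);

// A later removal attempt against the already-trashed snapshot reaches
// remove_object_map() with m_trashed_snapshot == false and reports
// -EBUSY for as long as the child remains attached; the trashed
// snapshot can then be cleaned up once the last child detaches.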
diff --git a/src/librbd/operation/SnapshotRemoveRequest.h b/src/librbd/operation/SnapshotRemoveRequest.h
new file mode 100644
index 000000000..17638a529
--- /dev/null
+++ b/src/librbd/operation/SnapshotRemoveRequest.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "include/buffer.h"
+#include "librbd/Types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRemoveRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v
+ * TRASH_SNAP
+ *    |
+ *    v (skip if unsupported)
+ * GET_SNAP
+ *    |
+ *    v (skip if unnecessary)
+ * LIST_CHILDREN <-------------\
+ *    |                        |
+ *    v (skip if unnecessary)  | (repeat as needed)
+ * DETACH_STALE_CHILD ---------/
+ *    |
+ *    v (skip if unnecessary)
+ * DETACH_CHILD
+ *    |
+ *    v (skip if disabled/in-use)
+ * REMOVE_OBJECT_MAP
+ *    |
+ *    v (skip if not mirror snapshot)
+ * REMOVE_IMAGE_STATE
+ *    |
+ *    v (skip if in-use)
+ * RELEASE_SNAP_ID
+ *    |
+ *    v (skip if in-use)
+ * REMOVE_SNAP
+ *    |
+ *    v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ static SnapshotRemoveRequest *create(
+ ImageCtxT &image_ctx, const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name, uint64_t snap_id, Context *on_finish) {
+ return new SnapshotRemoveRequest(image_ctx, on_finish, snap_namespace,
+ snap_name, snap_id);
+ }
+
+ SnapshotRemoveRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t snap_id);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapRemoveEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ cls::rbd::ChildImageSpecs m_child_images;
+ std::string m_snap_name;
+ uint64_t m_snap_id;
+ bool m_trashed_snapshot = false;
+ bool m_child_attached = false;
+
+ ceph::bufferlist m_out_bl;
+
+ void trash_snap();
+ void handle_trash_snap(int r);
+
+ void get_snap();
+ void handle_get_snap(int r);
+
+ void list_children();
+ void handle_list_children(int r);
+
+ void detach_stale_child();
+ void handle_detach_stale_child(int r);
+
+ void detach_child();
+ void handle_detach_child(int r);
+
+ void remove_object_map();
+ void handle_remove_object_map(int r);
+
+ void remove_image_state();
+ void handle_remove_image_state(int r);
+
+ void release_snap_id();
+ void handle_release_snap_id(int r);
+
+ void remove_snap();
+ void handle_remove_snap(int r);
+
+ void remove_snap_context();
+ int scan_for_parents(cls::rbd::ParentImageSpec &pspec);
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
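Each handle_* method above is bound to a RADOS completion through util::create_rados_callback<T, MF>. A self-contained sketch of that adapter pattern (simplified: a plain function pointer plus a void* cookie stands in for librados::AioCompletion, so the sketch compiles and runs on its own):

#include <iostream>

// librados-style C callback: a void* cookie and a return code. The real
// helper hands an equivalent shim to librados when creating the completion.
template <typename T, void (T::*MF)(int)>
void completion_shim(void *arg, int r) {
  (static_cast<T*>(arg)->*MF)(r);  // forward to the bound member handler
}

struct DemoRemoveRequest {
  void handle_trash_snap(int r) {
    std::cout << "handle_trash_snap: r=" << r << "\n";
  }
};

int main() {
  DemoRemoveRequest req;
  auto cb = &completion_shim<DemoRemoveRequest,
                             &DemoRemoveRequest::handle_trash_snap>;
  cb(&req, 0);  // simulate the OSD acking the operation
  return 0;
}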
diff --git a/src/librbd/operation/SnapshotRenameRequest.cc b/src/librbd/operation/SnapshotRenameRequest.cc
new file mode 100644
index 000000000..e9257f18c
--- /dev/null
+++ b/src/librbd/operation/SnapshotRenameRequest.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRenameRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRenameRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+ const typename SnapshotRenameRequest<I>::State& state) {
+ switch(state) {
+ case SnapshotRenameRequest<I>::STATE_RENAME_SNAP:
+ os << "RENAME_SNAP";
+ break;
+ }
+ return os;
+}
+
+} // anonymous namespace
+
+template <typename I>
+SnapshotRenameRequest<I>::SnapshotRenameRequest(I &image_ctx,
+ Context *on_finish,
+ uint64_t snap_id,
+ const std::string &snap_name)
+ : Request<I>(image_ctx, on_finish), m_snap_id(snap_id),
+ m_snap_name(snap_name), m_state(STATE_RENAME_SNAP) {
+}
+
+template <typename I>
+journal::Event SnapshotRenameRequest<I>::create_event(uint64_t op_tid) const {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+ std::string src_snap_name;
+ auto snap_info_it = image_ctx.snap_info.find(m_snap_id);
+ if (snap_info_it != image_ctx.snap_info.end()) {
+ src_snap_name = snap_info_it->second.name;
+ }
+
+ return journal::SnapRenameEvent(op_tid, m_snap_id, src_snap_name,
+ m_snap_name);
+}
+
+template <typename I>
+void SnapshotRenameRequest<I>::send_op() {
+ send_rename_snap();
+}
+
+template <typename I>
+bool SnapshotRenameRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+ << "r=" << r << dendl;
+ if (r < 0) {
+ if (r == -EEXIST) {
+ ldout(cct, 1) << "snapshot already exists" << dendl;
+ } else {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ }
+ return true;
+}
+
+template <typename I>
+void SnapshotRenameRequest<I>::send_rename_snap() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ librados::ObjectWriteOperation op;
+ if (image_ctx.old_format) {
+ cls_client::old_snapshot_rename(&op, m_snap_id, m_snap_name);
+ } else {
+ cls_client::snapshot_rename(&op, m_snap_id, m_snap_name);
+ }
+
+ librados::AioCompletion *rados_completion = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
+ rados_completion, &op);
+ ceph_assert(r == 0);
+ rados_completion->release();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRenameRequest<librbd::ImageCtx>;
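Client-side the whole state machine maps onto one call; a hedged fragment (again assuming an open librbd::Image named image; snapshot names are placeholders):

// Drives SnapshotRenameRequest: the single RENAME_SNAP state issues
// old_snapshot_rename or snapshot_rename depending on the image format.
int r = image.snap_rename("snap1", "snap1-renamed");
assert(r == 0);
// Had "snap1-renamed" already named another snapshot, the request would
// have completed with -EEXIST (the "snapshot already exists" branch in
// should_complete() above).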
diff --git a/src/librbd/operation/SnapshotRenameRequest.h b/src/librbd/operation/SnapshotRenameRequest.h
new file mode 100644
index 000000000..697772e02
--- /dev/null
+++ b/src/librbd/operation/SnapshotRenameRequest.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRenameRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * Snap Rename goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v
+ * STATE_RENAME_SNAP
+ *    |
+ *    v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ */
+ enum State {
+ STATE_RENAME_SNAP
+ };
+
+ SnapshotRenameRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t snap_id, const std::string &snap_name);
+
+ journal::Event create_event(uint64_t op_tid) const override;
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+private:
+ uint64_t m_snap_id;
+ std::string m_snap_name;
+ State m_state;
+
+ void send_rename_snap();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRenameRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
diff --git a/src/librbd/operation/SnapshotRollbackRequest.cc b/src/librbd/operation/SnapshotRollbackRequest.cc
new file mode 100644
index 000000000..87c5212de
--- /dev/null
+++ b/src/librbd/operation/SnapshotRollbackRequest.cc
@@ -0,0 +1,424 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRollbackRequest.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "osdc/Striper.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRollbackRequest: "
+
+namespace librbd {
+namespace operation {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+namespace {
+
+template <typename I>
+class C_RollbackObject : public C_AsyncObjectThrottle<I> {
+public:
+ C_RollbackObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ uint64_t snap_id, uint64_t object_num,
+ uint64_t head_num_objects,
+ decltype(I::object_map) snap_object_map)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snap_id(snap_id),
+ m_object_num(object_num), m_head_num_objects(head_num_objects),
+ m_snap_object_map(snap_object_map) {
+ }
+
+ int send() override {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 20) << "C_RollbackObject: " << __func__ << ": object_num="
+ << m_object_num << dendl;
+
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (m_object_num < m_head_num_objects &&
+ m_snap_object_map != nullptr &&
+ !image_ctx.object_map->object_may_exist(m_object_num) &&
+ !m_snap_object_map->object_may_exist(m_object_num)) {
+ return 1;
+ }
+ }
+
+ std::string oid = image_ctx.get_object_name(m_object_num);
+
+ librados::ObjectWriteOperation op;
+ op.selfmanaged_snap_rollback(m_snap_id);
+
+ librados::AioCompletion *rados_completion =
+ util::create_rados_callback(this);
+ image_ctx.data_ctx.aio_operate(oid, rados_completion, &op);
+ rados_completion->release();
+ return 0;
+ }
+
+private:
+ uint64_t m_snap_id;
+ uint64_t m_object_num;
+ uint64_t m_head_num_objects;
+ decltype(I::object_map) m_snap_object_map;
+};
+
+} // anonymous namespace
+
+template <typename I>
+SnapshotRollbackRequest<I>::SnapshotRollbackRequest(I &image_ctx,
+ Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t snap_id,
+ uint64_t snap_size,
+ ProgressContext &prog_ctx)
+ : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace),
+ m_snap_name(snap_name), m_snap_id(snap_id),
+ m_snap_size(snap_size), m_prog_ctx(prog_ctx),
+ m_object_map(nullptr), m_snap_object_map(nullptr) {
+}
+
+template <typename I>
+SnapshotRollbackRequest<I>::~SnapshotRollbackRequest() {
+ I &image_ctx = this->m_image_ctx;
+ if (m_blocking_writes) {
+ image_ctx.io_image_dispatcher->unblock_writes();
+ }
+ if (m_object_map) {
+ m_object_map->put();
+ m_object_map = nullptr;
+ }
+ if (m_snap_object_map) {
+ m_snap_object_map->put();
+ m_snap_object_map = nullptr;
+ }
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::send_op() {
+ send_block_writes();
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::send_block_writes() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ m_blocking_writes = true;
+ image_ctx.io_image_dispatcher->block_writes(create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_block_writes>(this));
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_block_writes(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_resize_image();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::send_resize_image() {
+ I &image_ctx = this->m_image_ctx;
+
+ uint64_t current_size;
+ {
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ std::shared_lock image_locker{image_ctx.image_lock};
+ current_size = image_ctx.get_image_size(CEPH_NOSNAP);
+ }
+
+ m_head_num_objects = Striper::get_num_objects(image_ctx.layout, current_size);
+
+ if (current_size == m_snap_size) {
+ send_get_snap_object_map();
+ return;
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_resize_image>(this);
+ ResizeRequest<I> *req = ResizeRequest<I>::create(image_ctx, ctx, m_snap_size,
+ true, m_no_op_prog_ctx, 0, true);
+ req->send();
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_resize_image(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to resize image for rollback: "
+ << cpp_strerror(*result) << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ send_get_snap_object_map();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::send_get_snap_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ uint64_t flags = 0;
+ bool object_map_enabled;
+ CephContext *cct = image_ctx.cct;
+ {
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ std::shared_lock image_locker{image_ctx.image_lock};
+ object_map_enabled = (image_ctx.object_map != nullptr);
+ int r = image_ctx.get_flags(m_snap_id, &flags);
+ if (r < 0) {
+ object_map_enabled = false;
+ }
+ }
+ if (object_map_enabled &&
+ (flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
+ lderr(cct) << "warning: object-map is invalid for snapshot" << dendl;
+ object_map_enabled = false;
+ }
+ if (!object_map_enabled) {
+ send_rollback_object_map();
+ return;
+ }
+
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ m_snap_object_map = image_ctx.create_object_map(m_snap_id);
+
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_get_snap_object_map>(this);
+ m_snap_object_map->open(ctx);
+ return;
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_get_snap_object_map(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << this << " " << __func__ << ": failed to open object map: "
+ << cpp_strerror(*result) << dendl;
+ m_snap_object_map->put();
+ m_snap_object_map = nullptr;
+ }
+
+ send_rollback_object_map();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::send_rollback_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ {
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.object_map != nullptr) {
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_rollback_object_map>(this);
+ image_ctx.object_map->rollback(m_snap_id, ctx);
+ return;
+ }
+ }
+
+ send_rollback_objects();
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_rollback_object_map(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << this << " " << __func__ << ": failed to roll back object "
+ << "map: " << cpp_strerror(*result) << dendl;
+
+ ceph_assert(m_object_map == nullptr);
+ apply();
+ return this->create_context_finisher(*result);
+ }
+
+ send_rollback_objects();
+ return nullptr;
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::send_rollback_objects() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ uint64_t num_objects;
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ num_objects = Striper::get_num_objects(image_ctx.layout,
+ image_ctx.get_current_size());
+ }
+
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_rollback_objects>(this);
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_RollbackObject<I> >(),
+ boost::lambda::_1, &image_ctx, m_snap_id, boost::lambda::_2,
+ m_head_num_objects, m_snap_object_map));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, num_objects);
+ throttle->start_ops(
+ image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_rollback_objects(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result == -ERESTART) {
+ ldout(cct, 5) << "snapshot rollback operation interrupted" << dendl;
+ return this->create_context_finisher(*result);
+ } else if (*result < 0) {
+ lderr(cct) << "failed to rollback objects: " << cpp_strerror(*result)
+ << dendl;
+ return this->create_context_finisher(*result);
+ }
+
+ return send_refresh_object_map();
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::send_refresh_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ bool object_map_enabled;
+ {
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ std::shared_lock image_locker{image_ctx.image_lock};
+ object_map_enabled = (image_ctx.object_map != nullptr);
+ }
+ if (!object_map_enabled) {
+ return send_invalidate_cache();
+ }
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ m_object_map = image_ctx.create_object_map(CEPH_NOSNAP);
+
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_refresh_object_map>(this);
+ m_object_map->open(ctx);
+ return nullptr;
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_refresh_object_map(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << this << " " << __func__ << ": failed to open object map: "
+ << cpp_strerror(*result) << dendl;
+ m_object_map->put();
+ m_object_map = nullptr;
+ apply();
+
+ return this->create_context_finisher(*result);
+ }
+
+ return send_invalidate_cache();
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::send_invalidate_cache() {
+ I &image_ctx = this->m_image_ctx;
+
+ apply();
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ if (m_object_map != nullptr) {
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_invalidate_cache>(this, m_object_map);
+ image_ctx.io_image_dispatcher->invalidate_cache(ctx);
+ } else {
+ Context *ctx = create_context_callback<
+ SnapshotRollbackRequest<I>,
+ &SnapshotRollbackRequest<I>::handle_invalidate_cache>(this);
+ image_ctx.io_image_dispatcher->invalidate_cache(ctx);
+ }
+ return nullptr;
+}
+
+template <typename I>
+Context *SnapshotRollbackRequest<I>::handle_invalidate_cache(int *result) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl;
+
+ if (*result < 0) {
+ lderr(cct) << "failed to invalidate cache: " << cpp_strerror(*result)
+ << dendl;
+ }
+ return this->create_context_finisher(*result);
+}
+
+template <typename I>
+void SnapshotRollbackRequest<I>::apply() {
+ I &image_ctx = this->m_image_ctx;
+
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ std::unique_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.object_map != nullptr) {
+ std::swap(m_object_map, image_ctx.object_map);
+ }
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>;
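Rollback is the one long-running operation in this group, which is why it alone takes a ProgressContext. A hedged fragment of a caller (assumes an open librbd::Image named image; the snapshot name is a placeholder):

#include <cstdio>

// librbd::ProgressContext is the interface that m_prog_ctx feeds as the
// per-object rollback throttle advances.
struct PrintProgress : public librbd::ProgressContext {
  int update_progress(uint64_t offset, uint64_t total) override {
    std::printf("rollback: %llu/%llu\n",
                (unsigned long long)offset, (unsigned long long)total);
    return 0;
  }
};

PrintProgress prog;
// Blocks writes, resizes to the snapshot size if needed, rolls back every
// object, then refreshes the object map and invalidates the cache.
int r = image.snap_rollback_with_progress("snap1", prog);
assert(r == 0);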
diff --git a/src/librbd/operation/SnapshotRollbackRequest.h b/src/librbd/operation/SnapshotRollbackRequest.h
new file mode 100644
index 000000000..e58a618f2
--- /dev/null
+++ b/src/librbd/operation/SnapshotRollbackRequest.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/journal/Types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRollbackRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * Snap Rollback goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start> ---------\
+ *                  |
+ *                  v
+ *            STATE_BLOCK_WRITES
+ *                  |
+ *                  v
+ *            STATE_RESIZE_IMAGE (skip if resize not
+ *                  |             required)
+ *                  v
+ *            STATE_GET_SNAP_OBJECT_MAP (skip if object
+ *                  |                    map disabled)
+ *                  v
+ *            STATE_ROLLBACK_OBJECT_MAP (skip if object
+ *                  |                    map disabled)
+ *                  v
+ *            STATE_ROLLBACK_OBJECTS
+ *                  |
+ *                  v
+ *            STATE_REFRESH_OBJECT_MAP (skip if object
+ *                  |                   map disabled)
+ *                  v
+ *            STATE_INVALIDATE_CACHE (skip if cache
+ *                  |                 disabled)
+ *                  v
+ *            <finish>
+ *
+ * @endverbatim
+ *
+ * The _RESIZE_IMAGE state is skipped if the image doesn't need to be resized.
+ * The _ROLLBACK_OBJECT_MAP state is skipped if the object map isn't enabled.
+ * The _INVALIDATE_CACHE state is skipped if the cache isn't enabled.
+ */
+
+ SnapshotRollbackRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name,
+ uint64_t snap_id,
+ uint64_t snap_size, ProgressContext &prog_ctx);
+ ~SnapshotRollbackRequest() override;
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override {
+ return true;
+ }
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapRollbackEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+ uint64_t m_snap_id;
+ uint64_t m_snap_size;
+ uint64_t m_head_num_objects;
+ ProgressContext &m_prog_ctx;
+
+ NoOpProgressContext m_no_op_prog_ctx;
+
+ bool m_blocking_writes = false;
+ decltype(ImageCtxT::object_map) m_object_map;
+ decltype(ImageCtxT::object_map) m_snap_object_map;
+
+ void send_block_writes();
+ Context *handle_block_writes(int *result);
+
+ void send_resize_image();
+ Context *handle_resize_image(int *result);
+
+ void send_get_snap_object_map();
+ Context *handle_get_snap_object_map(int *result);
+
+ void send_rollback_object_map();
+ Context *handle_rollback_object_map(int *result);
+
+ void send_rollback_objects();
+ Context *handle_rollback_objects(int *result);
+
+ Context *send_refresh_object_map();
+ Context *handle_refresh_object_map(int *result);
+
+ Context *send_invalidate_cache();
+ Context *handle_invalidate_cache(int *result);
+
+ void apply();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
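send_rollback_objects() fans out per-object work through AsyncObjectThrottle, bounded by rbd_concurrent_management_ops. A self-contained sketch of that dispatch pattern (completions are modelled with a queue so it runs on its own; the real throttle is driven by RADOS callbacks instead):

#include <cstdint>
#include <deque>
#include <iostream>

class ThrottleSketch {
public:
  explicit ThrottleSketch(uint64_t num_objects) : m_num_objects(num_objects) {}

  void start_ops(uint64_t max_concurrent) {
    // Prime the pipeline with at most max_concurrent in-flight ops,
    // mirroring AsyncObjectThrottle::start_ops().
    for (uint64_t i = 0; i < max_concurrent; ++i) {
      send_next_op();
    }
    // Drain: each completion re-arms the pipeline with the next object,
    // keeping the in-flight window bounded.
    while (!m_in_flight.empty()) {
      uint64_t done = m_in_flight.front();
      m_in_flight.pop_front();
      std::cout << "object " << done << " rolled back\n";
      send_next_op();
    }
  }

private:
  void send_next_op() {
    if (m_next_object < m_num_objects) {
      m_in_flight.push_back(m_next_object++);
    }
  }

  uint64_t m_num_objects;
  uint64_t m_next_object = 0;
  std::deque<uint64_t> m_in_flight;
};

int main() {
  ThrottleSketch throttle(8);
  throttle.start_ops(2);  // rbd_concurrent_management_ops analogue
  return 0;
}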
diff --git a/src/librbd/operation/SnapshotUnprotectRequest.cc b/src/librbd/operation/SnapshotUnprotectRequest.cc
new file mode 100644
index 000000000..76caf68f3
--- /dev/null
+++ b/src/librbd/operation/SnapshotUnprotectRequest.cc
@@ -0,0 +1,353 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotUnprotectRequest.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include <list>
+#include <set>
+#include <vector>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotUnprotectRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+typedef std::pair<int64_t, std::string> Pool;
+typedef std::vector<Pool> Pools;
+
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+ const typename SnapshotUnprotectRequest<I>::State& state) {
+ switch(state) {
+ case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_START:
+ os << "UNPROTECT_SNAP_START";
+ break;
+ case SnapshotUnprotectRequest<I>::STATE_SCAN_POOL_CHILDREN:
+ os << "SCAN_POOL_CHILDREN";
+ break;
+ case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_FINISH:
+ os << "UNPROTECT_SNAP_FINISH";
+ break;
+ case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_ROLLBACK:
+ os << "UNPROTECT_SNAP_ROLLBACK";
+ break;
+ default:
+ os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+ break;
+ }
+ return os;
+}
+
+template <typename I>
+class C_ScanPoolChildren : public C_AsyncObjectThrottle<I> {
+public:
+ C_ScanPoolChildren(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ const cls::rbd::ParentImageSpec &pspec, const Pools &pools,
+ size_t pool_idx)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_pspec(pspec),
+ m_pool(pools[pool_idx]) {
+ }
+
+ int send() override {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " scanning pool '" << m_pool.second << "'"
+ << dendl;
+
+ librados::Rados rados(image_ctx.md_ctx);
+ int64_t base_tier;
+ int r = rados.pool_get_base_tier(m_pool.first, &base_tier);
+ if (r == -ENOENT) {
+ ldout(cct, 1) << "pool '" << m_pool.second << "' no longer exists"
+ << dendl;
+ return 1;
+ } else if (r < 0) {
+ lderr(cct) << "error retrieving base tier for pool '"
+ << m_pool.second << "'" << dendl;
+ return r;
+ }
+ if (m_pool.first != base_tier) {
+ // pool is a cache; skip it
+ return 1;
+ }
+
+ r = util::create_ioctx(image_ctx.md_ctx, "child image", m_pool.first, {},
+ &m_pool_ioctx);
+ if (r == -ENOENT) {
+ return 1;
+ } else if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+ cls_client::get_children_start(&op, m_pspec);
+
+ librados::AioCompletion *rados_completion =
+ util::create_rados_callback(this);
+ r = m_pool_ioctx.aio_operate(RBD_CHILDREN, rados_completion, &op,
+ &m_children_bl);
+ ceph_assert(r == 0);
+ rados_completion->release();
+ return 0;
+ }
+
+protected:
+ void finish(int r) override {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (r == 0) {
+ auto it = m_children_bl.cbegin();
+ r = cls_client::get_children_finish(&it, &m_children);
+ }
+
+ ldout(cct, 10) << this << " retrieved children: r=" << r << dendl;
+ if (r == -ENOENT) {
+ // no children -- proceed with unprotect
+ r = 0;
+ } else if (r < 0) {
+ lderr(cct) << "cannot get children for pool '" << m_pool.second << "'"
+ << dendl;
+ } else {
+ lderr(cct) << "cannot unprotect: at least " << m_children.size() << " "
+ << "child(ren) [" << joinify(m_children.begin(),
+ m_children.end(),
+ std::string(",")) << "] "
+ << "in pool '" << m_pool.second << "'" << dendl;
+ r = -EBUSY;
+ }
+ C_AsyncObjectThrottle<I>::finish(r);
+ }
+
+private:
+ cls::rbd::ParentImageSpec m_pspec;
+ Pool m_pool;
+
+ IoCtx m_pool_ioctx;
+ std::set<std::string> m_children;
+ bufferlist m_children_bl;
+};
+
+} // anonymous namespace
+
+template <typename I>
+SnapshotUnprotectRequest<I>::SnapshotUnprotectRequest(I &image_ctx,
+ Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name)
+ : Request<I>(image_ctx, on_finish), m_snap_namespace(snap_namespace),
+ m_snap_name(snap_name), m_state(STATE_UNPROTECT_SNAP_START),
+ m_ret_val(0), m_snap_id(CEPH_NOSNAP) {
+}
+
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_op() {
+ send_unprotect_snap_start();
+}
+
+template <typename I>
+bool SnapshotUnprotectRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+ << "r=" << r << dendl;
+ if (r < 0) {
+ if (r == -EINVAL) {
+ ldout(cct, 1) << "snapshot is already unprotected" << dendl;
+ } else {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ }
+
+ // use a different state machine once an error is encountered
+ if (m_ret_val < 0) {
+ return should_complete_error();
+ }
+
+ std::shared_lock owner_lock{image_ctx.owner_lock};
+ bool finished = false;
+ switch (m_state) {
+ case STATE_UNPROTECT_SNAP_START:
+ send_scan_pool_children();
+ break;
+ case STATE_SCAN_POOL_CHILDREN:
+ send_unprotect_snap_finish();
+ break;
+ case STATE_UNPROTECT_SNAP_FINISH:
+ finished = true;
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ return finished;
+}
+
+template <typename I>
+bool SnapshotUnprotectRequest<I>::should_complete_error() {
+ I &image_ctx = this->m_image_ctx;
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ CephContext *cct = image_ctx.cct;
+ lderr(cct) << this << " " << __func__ << ": "
+ << "ret_val=" << m_ret_val << dendl;
+
+ bool finished = true;
+ if (m_state == STATE_SCAN_POOL_CHILDREN ||
+ m_state == STATE_UNPROTECT_SNAP_FINISH) {
+ send_unprotect_snap_rollback();
+ finished = false;
+ }
+ return finished;
+}
+
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_start() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ int r = verify_and_send_unprotect_snap_start();
+ if (r < 0) {
+ this->async_complete(r);
+ return;
+ }
+}
+
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_scan_pool_children() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+ m_state = STATE_SCAN_POOL_CHILDREN;
+
+ // search all pools for children depending on this snapshot
+ // TODO add async version of wait_for_latest_osdmap
+ librados::Rados rados(image_ctx.md_ctx);
+ rados.wait_for_latest_osdmap();
+
+ // protect against pools being renamed/deleted
+ std::list<Pool> pool_list;
+ rados.pool_list2(pool_list);
+
+ cls::rbd::ParentImageSpec pspec(image_ctx.md_ctx.get_id(),
+ image_ctx.md_ctx.get_namespace(),
+ image_ctx.id, m_snap_id);
+ Pools pools(pool_list.begin(), pool_list.end());
+
+ Context *ctx = this->create_callback_context();
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_ScanPoolChildren<I> >(),
+ boost::lambda::_1, &image_ctx, pspec, pools, boost::lambda::_2));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ nullptr, image_ctx, context_factory, ctx, NULL, 0, pools.size());
+ throttle->start_ops(
+ image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_finish() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ m_state = STATE_UNPROTECT_SNAP_FINISH;
+
+ librados::ObjectWriteOperation op;
+ cls_client::set_protection_status(&op, m_snap_id,
+ RBD_PROTECTION_STATUS_UNPROTECTED);
+
+ librados::AioCompletion *comp = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_rollback() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " " << __func__ << dendl;
+
+ m_state = STATE_UNPROTECT_SNAP_ROLLBACK;
+
+ librados::ObjectWriteOperation op;
+ cls_client::set_protection_status(&op, m_snap_id,
+ RBD_PROTECTION_STATUS_PROTECTED);
+
+ librados::AioCompletion *comp = this->create_callback_completion();
+ int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+int SnapshotUnprotectRequest<I>::verify_and_send_unprotect_snap_start() {
+ I &image_ctx = this->m_image_ctx;
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ CephContext *cct = image_ctx.cct;
+ if ((image_ctx.features & RBD_FEATURE_LAYERING) == 0) {
+ lderr(cct) << "image must support layering" << dendl;
+ return -ENOSYS;
+ }
+
+ m_snap_id = image_ctx.get_snap_id(m_snap_namespace, m_snap_name);
+ if (m_snap_id == CEPH_NOSNAP) {
+ return -ENOENT;
+ }
+
+ bool is_unprotected;
+ int r = image_ctx.is_snap_unprotected(m_snap_id, &is_unprotected);
+ if (r < 0) {
+ return r;
+ }
+
+ if (is_unprotected) {
+ lderr(cct) << "snapshot is already unprotected" << dendl;
+ return -EINVAL;
+ }
+
+ librados::ObjectWriteOperation op;
+ cls_client::set_protection_status(&op, m_snap_id,
+ RBD_PROTECTION_STATUS_UNPROTECTING);
+
+ librados::AioCompletion *comp = this->create_callback_completion();
+ r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+
+ // TODO legacy code threw a notification post UNPROTECTING update -- required?
+ return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotUnprotectRequest<librbd::ImageCtx>;
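The caller-visible contract of the unprotect state machine, as a hedged fragment (assumes an open librbd::Image named image, as in the earlier sketches):

// Drives SnapshotUnprotectRequest: the status is first set to
// UNPROTECTING, every reachable pool is scanned for clone children, and
// only then is it set to UNPROTECTED.
int r = image.snap_unprotect("snap1");
// -EBUSY:  C_ScanPoolChildren found at least one child, and the rollback
//          state restored PROTECTED.
// -EINVAL: the snapshot was already unprotected.
// -ENOSYS: the image lacks the layering feature.
assert(r == 0 || r == -EBUSY || r == -EINVAL || r == -ENOSYS);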
diff --git a/src/librbd/operation/SnapshotUnprotectRequest.h b/src/librbd/operation/SnapshotUnprotectRequest.h
new file mode 100644
index 000000000..19cc6d32b
--- /dev/null
+++ b/src/librbd/operation/SnapshotUnprotectRequest.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotUnprotectRequest : public Request<ImageCtxT> {
+public:
+ /**
+ * Snap Unprotect goes through the following state machine:
+ *
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v
+ * STATE_UNPROTECT_SNAP_START
+ *    |
+ *    v
+ * STATE_SCAN_POOL_CHILDREN * * * * > STATE_UNPROTECT_SNAP_ROLLBACK
+ *    |                                  |
+ *    v                                  |
+ * STATE_UNPROTECT_SNAP_FINISH           |
+ *    |                                  |
+ *    v                                  |
+ * <finish> <----------------------------/
+ *
+ * @endverbatim
+ *
+ * If the unprotect operation needs to abort, the error path is followed
+ * to rollback the unprotect in-progress status on the image.
+ */
+ enum State {
+ STATE_UNPROTECT_SNAP_START,
+ STATE_SCAN_POOL_CHILDREN,
+ STATE_UNPROTECT_SNAP_FINISH,
+ STATE_UNPROTECT_SNAP_ROLLBACK
+ };
+
+ SnapshotUnprotectRequest(ImageCtxT &image_ctx, Context *on_finish,
+ const cls::rbd::SnapshotNamespace &snap_namespace,
+ const std::string &snap_name);
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+
+ int filter_return_code(int r) const override {
+ if (m_ret_val < 0) {
+ return m_ret_val;
+ }
+ return 0;
+ }
+
+ journal::Event create_event(uint64_t op_tid) const override {
+ return journal::SnapUnprotectEvent(op_tid, m_snap_namespace, m_snap_name);
+ }
+
+private:
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+ State m_state;
+
+ int m_ret_val;
+ uint64_t m_snap_id;
+
+ bool should_complete_error();
+
+ void send_unprotect_snap_start();
+ void send_scan_pool_children();
+ void send_unprotect_snap_finish();
+ void send_unprotect_snap_rollback();
+
+ int verify_and_send_unprotect_snap_start();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotUnprotectRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
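The m_ret_val/should_complete_error() split implements a small compensation pattern: once any step fails, completions are routed to a rollback track instead of the forward track. A self-contained sketch of that control flow (toy states; the real class drives RADOS operations between transitions):

#include <iostream>

class TwoTrackSketch {
public:
  // Forward track: START -> SCAN -> FINISH; on failure, run ROLLBACK once.
  bool should_complete(int r) {
    if (r < 0 && m_ret_val == 0) {
      m_ret_val = r;                    // remember the first failure
    }
    if (m_ret_val < 0) {
      return should_complete_error();   // switch to the error track
    }
    switch (m_state) {
    case STATE_START:  m_state = STATE_SCAN;   return false;
    case STATE_SCAN:   m_state = STATE_FINISH; return false;
    case STATE_FINISH: return true;            // done, success
    default:           return true;
    }
  }

private:
  enum State { STATE_START, STATE_SCAN, STATE_FINISH, STATE_ROLLBACK };

  bool should_complete_error() {
    if (m_state == STATE_SCAN || m_state == STATE_FINISH) {
      std::cout << "rolling back, ret=" << m_ret_val << "\n";
      m_state = STATE_ROLLBACK;  // issue the compensating update
      return false;              // wait for the rollback completion
    }
    return true;                 // nothing to undo, or rollback finished
  }

  State m_state = STATE_START;
  int m_ret_val = 0;
};

int main() {
  TwoTrackSketch req;
  req.should_complete(0);              // START -> SCAN
  req.should_complete(-16);            // SCAN fails (-EBUSY): rollback sent
  bool done = req.should_complete(0);  // rollback completed
  std::cout << "finished=" << done << "\n";
  return 0;
}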
diff --git a/src/librbd/operation/SparsifyRequest.cc b/src/librbd/operation/SparsifyRequest.cc
new file mode 100644
index 000000000..5d9837c3e
--- /dev/null
+++ b/src/librbd/operation/SparsifyRequest.cc
@@ -0,0 +1,514 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SparsifyRequest.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/err.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include "librbd/io/ObjectRequest.h"
+#include "osdc/Striper.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+bool may_be_trimmed(const std::map<uint64_t,uint64_t> &extent_map,
+ const bufferlist &bl, size_t sparse_size,
+ uint64_t *new_end_ptr) {
+ if (extent_map.empty()) {
+ *new_end_ptr = 0;
+ return true;
+ }
+
+ uint64_t end = extent_map.rbegin()->first + extent_map.rbegin()->second;
+ uint64_t new_end = end;
+ uint64_t bl_off = bl.length();
+
+ for (auto it = extent_map.rbegin(); it != extent_map.rend(); it++) {
+ auto off = it->first;
+ auto len = it->second;
+
+ new_end = p2roundup<uint64_t>(off + len, sparse_size);
+
+ uint64_t extent_left = len;
+ uint64_t sub_len = len % sparse_size;
+ if (sub_len == 0) {
+ sub_len = sparse_size;
+ }
+ while (extent_left > 0) {
+ ceph_assert(bl_off >= sub_len);
+ bl_off -= sub_len;
+ bufferlist sub_bl;
+ sub_bl.substr_of(bl, bl_off, sub_len);
+ if (!sub_bl.is_zero()) {
+ break;
+ }
+ new_end -= sparse_size;
+ extent_left -= sub_len;
+ sub_len = sparse_size;
+ }
+ if (extent_left > 0) {
+ break;
+ }
+ }
+
+ if (new_end < end) {
+ *new_end_ptr = new_end;
+ return true;
+ }
+
+ return false;
+}
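+
+// Worked example (illustrative): with sparse_size = 4096 and a single
+// extent {off=0, len=12288} whose last 4096-byte block is all zeroes,
+// the scan starts from end = 12288, finds the [8192, 12288) block
+// zeroed and lowers new_end to 8192, then stops at the first non-zero
+// block below it. Since new_end (8192) < end (12288), the object can
+// be truncated to 8192 bytes.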
+
+} // anonymous namespace
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::SparsifyObject: " << this \
+ << " " << m_oid << " " << __func__ << ": "
+
+template <typename I>
+class C_SparsifyObject : public C_AsyncObjectThrottle<I> {
+public:
+
+ /**
+ * @verbatim
+ *
+ * <start>
+ *    |
+ *    v                               (not supported)
+ * SPARSIFY * * * * * * * * * * * * > READ < * * * * * * * * * * (concurrent
+ *    |                                 |                        * update is
+ *    | (object map disabled)           | (can trim)             * detected)
+ *    |------------------------\        v                        *
+ *    |                        |   PRE UPDATE OBJECT MAP         *
+ *    | (object map enabled)   |        |          (if needed)   *
+ *    v                        |        v                        *
+ * PRE UPDATE OBJECT MAP       |        TRIM * * * * * * * * * * *
+ *    |                        |        |
+ *    v                        |        v
+ * CHECK EXISTS                |   POST UPDATE OBJECT MAP
+ *    |                        |        |          (if needed)
+ *    v                        |        |
+ * POST UPDATE OBJECT MAP      |        |
+ *    |                        |        |
+ *    v                        |        |
+ * <finish> <------------------/<-------/
+ *
+ * @endverbatim
+ *
+ */
+
+ C_SparsifyObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ uint64_t object_no, size_t sparse_size)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_cct(image_ctx->cct),
+ m_object_no(object_no), m_sparse_size(sparse_size),
+ m_oid(image_ctx->get_object_name(object_no)) {
+ }
+
+ int send() override {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ ldout(m_cct, 20) << dendl;
+
+ if (!image_ctx.data_ctx.is_valid()) {
+ lderr(m_cct) << "missing data pool" << dendl;
+ return -ENODEV;
+ }
+
+ if (image_ctx.exclusive_lock != nullptr &&
+ !image_ctx.exclusive_lock->is_lock_owner()) {
+ ldout(m_cct, 1) << "lost exclusive lock during sparsify" << dendl;
+ return -ERESTART;
+ }
+
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.object_map != nullptr &&
+ !image_ctx.object_map->object_may_exist(m_object_no)) {
+ // can skip because the object does not exist
+ return 1;
+ }
+
+ uint64_t overlap_objects = 0;
+ uint64_t overlap;
+ int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &overlap);
+ if (r == 0 && overlap > 0) {
+ overlap_objects = Striper::get_num_objects(image_ctx.layout, overlap);
+ }
+ m_remove_empty = (m_object_no >= overlap_objects);
+ }
+
+ send_sparsify();
+ return 0;
+ }
+
+ void send_sparsify() {
+ I &image_ctx = this->m_image_ctx;
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::sparsify(&op, m_sparse_size, m_remove_empty);
+ auto comp = create_rados_callback<
+ C_SparsifyObject, &C_SparsifyObject::handle_sparsify>(this);
+ int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+
+ void handle_sparsify(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ m_trying_trim = true;
+ send_read();
+ return;
+ }
+
+ if (r == -ENOENT) {
+ finish_op(0);
+ return;
+ }
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to sparsify: " << cpp_strerror(r) << dendl;
+ finish_op(r);
+ return;
+ }
+
+ send_pre_update_object_map();
+ }
+
+ void send_pre_update_object_map() {
+ I &image_ctx = this->m_image_ctx;
+
+ if (m_trying_trim) {
+ if (!m_remove_empty || m_new_end != 0 ||
+ !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+ send_trim();
+ return;
+ }
+ } else if (!m_remove_empty ||
+ !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+ finish_op(0);
+ return;
+ }
+
+ ldout(m_cct, 20) << dendl;
+
+ image_ctx.owner_lock.lock_shared();
+ image_ctx.image_lock.lock_shared();
+ if (image_ctx.object_map == nullptr) {
+ // possible that exclusive lock was lost in background
+ lderr(m_cct) << "object map is not initialized" << dendl;
+
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+ finish_op(-EINVAL);
+ return;
+ }
+
+ int r;
+ m_finish_op_ctx = image_ctx.exclusive_lock->start_op(&r);
+ if (m_finish_op_ctx == nullptr) {
+ lderr(m_cct) << "lost exclusive lock" << dendl;
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+ finish_op(r);
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ C_SparsifyObject<I>,
+ &C_SparsifyObject<I>::handle_pre_update_object_map>(this);
+
+ bool sent = image_ctx.object_map->template aio_update<
+ Context, &Context::complete>(CEPH_NOSNAP, m_object_no, OBJECT_PENDING,
+ OBJECT_EXISTS, {}, false, ctx);
+
+ // NOTE: state machine might complete before we reach here
+ image_ctx.image_lock.unlock_shared();
+ image_ctx.owner_lock.unlock_shared();
+ if (!sent) {
+ finish_op(0);
+ }
+ }
+
+ void handle_pre_update_object_map(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to update object map: " << cpp_strerror(r)
+ << dendl;
+ finish_op(r);
+ return;
+ }
+
+ if (m_trying_trim) {
+ send_trim();
+ } else {
+ send_check_exists();
+ }
+ }
+
+ void send_check_exists() {
+ I &image_ctx = this->m_image_ctx;
+
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ op.stat(NULL, NULL, NULL);
+ m_bl.clear();
+ auto comp = create_rados_callback<
+ C_SparsifyObject, &C_SparsifyObject::handle_check_exists>(this);
+ int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+
+ void handle_check_exists(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "stat failed: " << cpp_strerror(r) << dendl;
+ finish_op(r);
+ return;
+ }
+
+ send_post_update_object_map(r == 0);
+ }
+
+ void send_post_update_object_map(bool exists) {
+ I &image_ctx = this->m_image_ctx;
+
+ ldout(m_cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ C_SparsifyObject<I>,
+ &C_SparsifyObject<I>::handle_post_update_object_map>(this);
+ bool sent;
+ {
+ std::shared_lock owner_locker{image_ctx.owner_lock};
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+ ceph_assert(image_ctx.object_map != nullptr);
+
+ sent = image_ctx.object_map->template aio_update<
+ Context, &Context::complete>(CEPH_NOSNAP, m_object_no,
+ exists ? OBJECT_EXISTS : OBJECT_NONEXISTENT,
+ OBJECT_PENDING, {}, false, ctx);
+ }
+ if (!sent) {
+ ctx->complete(0);
+ }
+ }
+
+ void handle_post_update_object_map(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(m_cct) << "failed to update object map: " << cpp_strerror(r)
+ << dendl;
+ finish_op(r);
+ return;
+ }
+
+ finish_op(0);
+ }
+
+ void send_read() {
+ I &image_ctx = this->m_image_ctx;
+
+ ldout(m_cct, 20) << dendl;
+
+ librados::ObjectReadOperation op;
+ m_bl.clear();
+ op.sparse_read(0, image_ctx.layout.object_size, &m_extent_map, &m_bl,
+ nullptr);
+ auto comp = create_rados_callback<
+ C_SparsifyObject, &C_SparsifyObject::handle_read>(this);
+ int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+
+ void handle_read(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (r == -ENOENT) {
+ r = 0;
+ } else {
+ lderr(m_cct) << "failed to read object: " << cpp_strerror(r) << dendl;
+ }
+ finish_op(r);
+ return;
+ }
+
+ if (!may_be_trimmed(m_extent_map, m_bl, m_sparse_size, &m_new_end)) {
+ finish_op(0);
+ return;
+ }
+
+ send_pre_update_object_map();
+ }
+
+ void send_trim() {
+ I &image_ctx = this->m_image_ctx;
+
+ ldout(m_cct, 20) << dendl;
+
+ ceph_assert(m_new_end < image_ctx.layout.object_size);
+
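+    // guard the trim with a compare-extent: the tail past m_new_end must
+    // still be all zeros, exactly as the earlier sparse read observed it;
+    // if another client wrote there in the meantime the op fails and
+    // handle_trim() restarts the read/trim cycle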
+ librados::ObjectWriteOperation op;
+ m_bl.clear();
+ m_bl.append_zero(image_ctx.layout.object_size - m_new_end);
+ op.cmpext(m_new_end, m_bl, nullptr);
+ if (m_new_end == 0 && m_remove_empty) {
+ op.remove();
+ } else {
+ op.truncate(m_new_end);
+ }
+
+ auto comp = create_rados_callback<
+ C_SparsifyObject, &C_SparsifyObject::handle_trim>(this);
+ int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op);
+ ceph_assert(r == 0);
+ comp->release();
+ }
+
+ void handle_trim(int r) {
+ I &image_ctx = this->m_image_ctx;
+
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
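+    // cmpext reports a compare mismatch as (-MAX_ERRNO - mismatch_offset),
+    // so any r <= -MAX_ERRNO means the object changed under us: release the
+    // in-flight op guard and retry from the sparse read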
+ if (r <= -MAX_ERRNO) {
+ m_finish_op_ctx->complete(0);
+ m_finish_op_ctx = nullptr;
+ send_read();
+ return;
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to trim: " << cpp_strerror(r) << dendl;
+ finish_op(r);
+ return;
+ }
+
+ if (!m_remove_empty || m_new_end != 0 ||
+ !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+ finish_op(0);
+ return;
+ }
+
+ send_post_update_object_map(false);
+ }
+
+ void finish_op(int r) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ if (m_finish_op_ctx != nullptr) {
+ m_finish_op_ctx->complete(0);
+ }
+ this->complete(r);
+ }
+
+private:
+ CephContext *m_cct;
+ uint64_t m_object_no;
+ size_t m_sparse_size;
+ std::string m_oid;
+
+ bool m_remove_empty = false;
+ bool m_trying_trim = false;
+ bufferlist m_bl;
+  std::map<uint64_t, uint64_t> m_extent_map;
+ uint64_t m_new_end = 0;
+ Context *m_finish_op_ctx = nullptr;
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::SparsifyRequest: " << this \
+ << " " << __func__ << ": "
+
+template <typename I>
+bool SparsifyRequest<I>::should_complete(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+ if (r < 0) {
+ lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+ }
+ return true;
+}
+
+template <typename I>
+void SparsifyRequest<I>::send_op() {
+ sparsify_objects();
+}
+
+template <typename I>
+void SparsifyRequest<I>::sparsify_objects() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << dendl;
+
+ uint64_t objects = 0;
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ objects = image_ctx.get_object_count(CEPH_NOSNAP);
+ }
+
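+  // fan a C_SparsifyObject state machine out across every object in the
+  // image, with at most rbd_concurrent_management_ops requests in flight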
+ auto ctx = create_context_callback<
+ SparsifyRequest<I>,
+ &SparsifyRequest<I>::handle_sparsify_objects>(this);
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_SparsifyObject<I> >(),
+ boost::lambda::_1, &image_ctx, boost::lambda::_2, m_sparse_size));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, objects);
+ throttle->start_ops(
+ image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+void SparsifyRequest<I>::handle_sparsify_objects(int r) {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r == -ERESTART) {
+ ldout(cct, 5) << "sparsify operation interrupted" << dendl;
+ this->complete(r);
+ return;
+ } else if (r < 0) {
+ lderr(cct) << "sparsify encountered an error: " << cpp_strerror(r) << dendl;
+ this->complete(r);
+ return;
+ }
+
+ this->complete(0);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SparsifyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SparsifyRequest.h b/src/librbd/operation/SparsifyRequest.h
new file mode 100644
index 000000000..74f9eb727
--- /dev/null
+++ b/src/librbd/operation/SparsifyRequest.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "common/snap_types.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SparsifyRequest : public Request<ImageCtxT>
+{
+public:
+ SparsifyRequest(ImageCtxT &image_ctx, size_t sparse_size, Context *on_finish,
+ ProgressContext &prog_ctx)
+ : Request<ImageCtxT>(image_ctx, on_finish), m_sparse_size(sparse_size),
+ m_prog_ctx(prog_ctx) {
+ }
+
+protected:
+ void send_op() override;
+ bool should_complete(int r) override;
+ bool can_affect_io() const override {
+ return true;
+ }
+ journal::Event create_event(uint64_t op_tid) const override {
+ ceph_abort();
+ return journal::UnknownEvent();
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * SPARSIFY OBJECTS
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ size_t m_sparse_size;
+ ProgressContext &m_prog_ctx;
+
+ void sparsify_objects();
+ void handle_sparsify_objects(int r);
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SparsifyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H
diff --git a/src/librbd/operation/TrimRequest.cc b/src/librbd/operation/TrimRequest.cc
new file mode 100644
index 000000000..b8ecf10ac
--- /dev/null
+++ b/src/librbd/operation/TrimRequest.cc
@@ -0,0 +1,373 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/TrimRequest.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "common/ContextCompletion.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::TrimRequest: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+class C_CopyupObject : public C_AsyncObjectThrottle<I> {
+public:
+ C_CopyupObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ IOContext io_context, uint64_t object_no)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_io_context(io_context),
+ m_object_no(object_no)
+ {
+ }
+
+ int send() override {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
+    std::string oid = image_ctx.get_object_name(m_object_no);
+ ldout(image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
+
+ auto object_dispatch_spec = io::ObjectDispatchSpec::create_discard(
+ &image_ctx, io::OBJECT_DISPATCH_LAYER_NONE, m_object_no, 0,
+ image_ctx.layout.object_size, m_io_context,
+ io::OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE, 0, {}, this);
+ object_dispatch_spec->send();
+ return 0;
+ }
+private:
+ IOContext m_io_context;
+ uint64_t m_object_no;
+};
+
+template <typename I>
+class C_RemoveObject : public C_AsyncObjectThrottle<I> {
+public:
+ C_RemoveObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+ uint64_t object_no)
+ : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no)
+ {
+ }
+
+ int send() override {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+
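+    // a positive return tells AsyncObjectThrottle this object finished
+    // synchronously: the object map says it cannot exist, so skip the remove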
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.object_map != nullptr &&
+ !image_ctx.object_map->object_may_exist(m_object_no)) {
+ return 1;
+ }
+ }
+
+    std::string oid = image_ctx.get_object_name(m_object_no);
+ ldout(image_ctx.cct, 10) << "removing " << oid << dendl;
+
+ librados::AioCompletion *rados_completion =
+ util::create_rados_callback(this);
+ int r = image_ctx.data_ctx.aio_remove(oid, rados_completion);
+ ceph_assert(r == 0);
+ rados_completion->release();
+ return 0;
+ }
+
+private:
+ uint64_t m_object_no;
+};
+
+template <typename I>
+TrimRequest<I>::TrimRequest(I &image_ctx, Context *on_finish,
+ uint64_t original_size, uint64_t new_size,
+ ProgressContext &prog_ctx)
+ : AsyncRequest<I>(image_ctx, on_finish), m_new_size(new_size),
+ m_prog_ctx(prog_ctx)
+{
+ uint64_t period = image_ctx.get_stripe_period();
+ uint64_t new_num_periods = ((m_new_size + period - 1) / period);
+ m_delete_off = std::min(new_num_periods * period, original_size);
+ // first object we can delete free and clear
+ m_delete_start = new_num_periods * image_ctx.get_stripe_count();
+ m_delete_start_min = m_delete_start;
+ m_num_objects = Striper::get_num_objects(image_ctx.layout, original_size);
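+
+  // worked example (assuming 4 MiB objects and stripe_count=1, so the
+  // period is 4 MiB): shrinking to m_new_size=10 MiB gives
+  // new_num_periods=3, so objects [3, m_num_objects) are removed whole
+  // (m_delete_start=3) and the 10 MiB..12 MiB tail of object 2
+  // (m_delete_off=12 MiB) is discarded by the clean-boundary step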
+
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 10) << this << " trim image " << original_size << " -> "
+ << m_new_size << " periods " << new_num_periods
+ << " discard to offset " << m_delete_off
+ << " delete objects " << m_delete_start
+ << " to " << m_num_objects << dendl;
+}
+
+template <typename I>
+bool TrimRequest<I>::should_complete(int r)
+{
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+ ldout(cct, 5) << this << " should_complete: r=" << r << dendl;
+ if (r == -ERESTART) {
+ ldout(cct, 5) << "trim operation interrupted" << dendl;
+ return true;
+ } else if (r < 0) {
+ lderr(cct) << "trim encountered an error: " << cpp_strerror(r) << dendl;
+ return true;
+ }
+
+ std::shared_lock owner_lock{image_ctx.owner_lock};
+ switch (m_state) {
+ case STATE_PRE_TRIM:
+ ldout(cct, 5) << " PRE_TRIM" << dendl;
+ send_copyup_objects();
+ break;
+
+ case STATE_COPYUP_OBJECTS:
+ ldout(cct, 5) << " COPYUP_OBJECTS" << dendl;
+ send_remove_objects();
+ break;
+
+ case STATE_REMOVE_OBJECTS:
+ ldout(cct, 5) << " REMOVE_OBJECTS" << dendl;
+ send_post_trim();
+ break;
+
+ case STATE_POST_TRIM:
+ ldout(cct, 5) << " POST_TRIM" << dendl;
+ send_clean_boundary();
+ break;
+
+ case STATE_CLEAN_BOUNDARY:
+ ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl;
+ send_finish(0);
+ break;
+
+ case STATE_FINISHED:
+ ldout(cct, 5) << "FINISHED" << dendl;
+ return true;
+
+ default:
+ lderr(cct) << "invalid state: " << m_state << dendl;
+ ceph_abort();
+ break;
+ }
+ return false;
+}
+
+template <typename I>
+void TrimRequest<I>::send() {
+ I &image_ctx = this->m_image_ctx;
+ CephContext *cct = image_ctx.cct;
+
+ if (!image_ctx.data_ctx.is_valid()) {
+ lderr(cct) << "missing data pool" << dendl;
+ send_finish(-ENODEV);
+ return;
+ }
+
+ send_pre_trim();
+}
+
+template<typename I>
+void TrimRequest<I>::send_pre_trim() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ if (m_delete_start >= m_num_objects) {
+ send_clean_boundary();
+ return;
+ }
+
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.object_map != nullptr) {
+      ldout(image_ctx.cct, 5) << this << " send_pre_trim:"
+ << " delete_start_min=" << m_delete_start_min
+ << " num_objects=" << m_num_objects << dendl;
+ m_state = STATE_PRE_TRIM;
+
+ ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+
+ if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
+ CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_PENDING,
+ OBJECT_EXISTS, {}, false, this)) {
+ return;
+ }
+ }
+ }
+
+ send_copyup_objects();
+}
+
+template<typename I>
+void TrimRequest<I>::send_copyup_objects() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ IOContext io_context;
+ bool has_snapshots;
+ uint64_t parent_overlap;
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+
+ io_context = image_ctx.get_data_io_context();
+ has_snapshots = !image_ctx.snaps.empty();
+ int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
+ ceph_assert(r == 0);
+ }
+
+ // copyup is only required for portion of image that overlaps parent
+ uint64_t copyup_end = Striper::get_num_objects(image_ctx.layout,
+ parent_overlap);
+
+ // TODO: protect against concurrent shrink and snap create?
+ // skip to remove if no copyup is required.
+ if (copyup_end <= m_delete_start || !has_snapshots) {
+ send_remove_objects();
+ return;
+ }
+
+ uint64_t copyup_start = m_delete_start;
+ m_delete_start = copyup_end;
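+
+  // objects below copyup_end are discarded via copy-up (C_CopyupObject) so
+  // existing snapshots retain the parent data; whole-object removal then
+  // starts at copyup_end instead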
+
+  ldout(image_ctx.cct, 5) << this << " send_copyup_objects:"
+                          << " start object=" << copyup_start << ","
+                          << " end object=" << copyup_end << dendl;
+ m_state = STATE_COPYUP_OBJECTS;
+
+ Context *ctx = this->create_callback_context();
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject<I> >(),
+ boost::lambda::_1, &image_ctx, io_context, boost::lambda::_2));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start,
+ copyup_end);
+ throttle->start_ops(
+ image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template <typename I>
+void TrimRequest<I>::send_remove_objects() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+  ldout(image_ctx.cct, 5) << this << " send_remove_objects:"
+ << " delete_start=" << m_delete_start
+ << " num_objects=" << m_num_objects << dendl;
+ m_state = STATE_REMOVE_OBJECTS;
+
+ Context *ctx = this->create_callback_context();
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject<I> >(),
+ boost::lambda::_1, &image_ctx, boost::lambda::_2));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start,
+ m_num_objects);
+ throttle->start_ops(
+ image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+template<typename I>
+void TrimRequest<I>::send_post_trim() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ if (image_ctx.object_map != nullptr) {
+ ldout(image_ctx.cct, 5) << this << " send_post_trim:"
+ << " delete_start_min=" << m_delete_start_min
+ << " num_objects=" << m_num_objects << dendl;
+ m_state = STATE_POST_TRIM;
+
+ ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+
+ if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
+ CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_NONEXISTENT,
+ OBJECT_PENDING, {}, false, this)) {
+ return;
+ }
+ }
+ }
+
+ send_clean_boundary();
+}
+
+template <typename I>
+void TrimRequest<I>::send_clean_boundary() {
+ I &image_ctx = this->m_image_ctx;
+ ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+ CephContext *cct = image_ctx.cct;
+ if (m_delete_off <= m_new_size) {
+ send_finish(0);
+ return;
+ }
+
+ // should have been canceled prior to releasing lock
+ ceph_assert(image_ctx.exclusive_lock == nullptr ||
+ image_ctx.exclusive_lock->is_lock_owner());
+ uint64_t delete_len = m_delete_off - m_new_size;
+  ldout(image_ctx.cct, 5) << this << " send_clean_boundary:"
+ << " delete_off=" << m_delete_off
+ << " length=" << delete_len << dendl;
+ m_state = STATE_CLEAN_BOUNDARY;
+
+ IOContext io_context;
+ {
+ std::shared_lock image_locker{image_ctx.image_lock};
+ io_context = image_ctx.get_data_io_context();
+ }
+
+ // discard the weird boundary
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, image_ctx.format_string,
+ &image_ctx.layout, m_new_size, delete_len, 0,
+ extents);
+
+ ContextCompletion *completion =
+ new ContextCompletion(this->create_async_callback_context(), true);
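+  // gather completion: fires once every per-extent discard below finishes
+  // (the 'true' presumably makes it tolerant of -ENOENT from objects that
+  // are already absent)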
+  for (std::vector<ObjectExtent>::iterator p = extents.begin();
+ p != extents.end(); ++p) {
+ ldout(cct, 20) << " ex " << *p << dendl;
+ Context *req_comp = new C_ContextCompletion(*completion);
+
+ if (p->offset == 0) {
+ // treat as a full object delete on the boundary
+ p->length = image_ctx.layout.object_size;
+ }
+
+ auto object_dispatch_spec = io::ObjectDispatchSpec::create_discard(
+ &image_ctx, io::OBJECT_DISPATCH_LAYER_NONE, p->objectno, p->offset,
+ p->length, io_context, 0, 0, {}, req_comp);
+ object_dispatch_spec->send();
+ }
+ completion->finish_adding_requests();
+}
+
+template <typename I>
+void TrimRequest<I>::send_finish(int r) {
+ m_state = STATE_FINISHED;
+ this->async_complete(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::TrimRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/TrimRequest.h b/src/librbd/operation/TrimRequest.h
new file mode 100644
index 000000000..8526046c9
--- /dev/null
+++ b/src/librbd/operation/TrimRequest.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
+
+#include "librbd/AsyncRequest.h"
+
+namespace librbd
+{
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class TrimRequest : public AsyncRequest<ImageCtxT>
+{
+public:
+ static TrimRequest *create(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t original_size, uint64_t new_size,
+ ProgressContext &prog_ctx) {
+ return new TrimRequest(image_ctx, on_finish, original_size, new_size,
+ prog_ctx);
+ }
+
+ TrimRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t original_size, uint64_t new_size,
+ ProgressContext &prog_ctx);
+
+ void send() override;
+
+protected:
+ /**
+ * Trim goes through the following state machine to remove whole objects,
+ * clean partially trimmed objects, and update the object map:
+ *
+ * @verbatim
+ *
+ * <start> . . . . . . . . . . . . . . . . .
+ * | .
+ * v (skip if not needed) .
+ * STATE_PRE_TRIM .
+ * | .
+ * v (skip if not needed) .
+ * STATE_COPYUP_OBJECTS .
+ * | .
+ * v (skip if not needed) .
+ * STATE_REMOVE_OBJECTS .
+ * | .
+ * v (skip if not needed) .
+ * STATE_POST_TRIM .
+ * | .
+ * v (skip if not needed) .
+ * STATE_CLEAN_BOUNDARY .
+ * | .
+ * v .
+ * STATE_FINISHED < . . . . . . . . . . . . . . .
+ * |
+ * v
+ * <finish>
+ *
+ * The _COPYUP_OBJECTS state is skipped if there is no parent overlap
+ * within the new image size and the image does not have any snapshots.
+ * The _PRE_TRIM/_POST_TRIM states are skipped if the object map
+ * isn't enabled. The _REMOVE_OBJECTS state is skipped if no whole objects
+ * are removed. The _CLEAN_BOUNDARY state is skipped if no boundary
+ * objects are cleaned. The state machine will immediately transition
+ * to _FINISHED state if there are no bytes to trim.
+ */
+
+ enum State {
+ STATE_PRE_TRIM,
+ STATE_COPYUP_OBJECTS,
+ STATE_REMOVE_OBJECTS,
+ STATE_POST_TRIM,
+ STATE_CLEAN_BOUNDARY,
+ STATE_FINISHED
+ };
+
+ bool should_complete(int r) override;
+
+ State m_state = STATE_PRE_TRIM;
+
+private:
+ uint64_t m_delete_start;
+ uint64_t m_delete_start_min = 0;
+ uint64_t m_num_objects;
+ uint64_t m_delete_off;
+ uint64_t m_new_size;
+ ProgressContext &m_prog_ctx;
+
+ void send_pre_trim();
+ void send_copyup_objects();
+ void send_remove_objects();
+ void send_post_trim();
+
+ void send_clean_boundary();
+ void send_finish(int r);
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::TrimRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
diff --git a/src/librbd/plugin/Api.cc b/src/librbd/plugin/Api.cc
new file mode 100644
index 000000000..67303be3f
--- /dev/null
+++ b/src/librbd/plugin/Api.cc
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Timer.h"
+#include "librbd/plugin/Api.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/Utils.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+
+namespace librbd {
+namespace plugin {
+
+template <typename I>
+void Api<I>::read_parent(
+ I *image_ctx, uint64_t object_no, io::ReadExtents* extents,
+ librados::snap_t snap_id, const ZTracer::Trace &trace,
+ Context* on_finish) {
+ io::util::read_parent<I>(image_ctx, object_no, extents, snap_id, trace,
+ on_finish);
+}
+
+template <typename I>
+void Api<I>::execute_image_metadata_set(
+ I *image_ctx, const std::string &key,
+ const std::string &value, Context *on_finish) {
+ ImageCtx* ictx = util::get_image_ctx(image_ctx);
+ ictx->operations->execute_metadata_set(key, value, on_finish);
+}
+
+template <typename I>
+void Api<I>::execute_image_metadata_remove(
+ I *image_ctx, const std::string &key, Context *on_finish) {
+ ImageCtx* ictx = util::get_image_ctx(image_ctx);
+ ictx->operations->execute_metadata_remove(key, on_finish);
+}
+
+template <typename I>
+void Api<I>::get_image_timer_instance(
+ CephContext *cct, SafeTimer **timer, ceph::mutex **timer_lock) {
+ ImageCtx::get_timer_instance(cct, timer, timer_lock);
+}
+
+template <typename I>
+bool Api<I>::test_image_features(I *image_ctx, uint64_t features) {
+ return image_ctx->test_features(features);
+}
+
+template <typename I>
+void Api<I>::update_aio_comp(io::AioCompletion* aio_comp,
+ uint32_t request_count,
+ io::ReadResult &read_result,
+ io::Extents &image_extents) {
+ aio_comp->set_request_count(request_count);
+ aio_comp->read_result = std::move(read_result);
+ aio_comp->read_result.set_image_extents(image_extents);
+ start_in_flight_io(aio_comp);
+}
+
+template <typename I>
+void Api<I>::update_aio_comp(
+ io::AioCompletion* aio_comp, uint32_t request_count) {
+ aio_comp->set_request_count(request_count);
+ start_in_flight_io(aio_comp);
+}
+
+template <typename I>
+io::ReadResult::C_ImageReadRequest* Api<I>::create_image_read_request(
+ io::AioCompletion* aio_comp, uint64_t buffer_offset,
+ const Extents& image_extents) {
+ return new io::ReadResult::C_ImageReadRequest(
+ aio_comp, buffer_offset, image_extents);
+}
+
+template <typename I>
+io::C_AioRequest* Api<I>::create_aio_request(io::AioCompletion* aio_comp) {
+ io::C_AioRequest *req_comp = new io::C_AioRequest(aio_comp);
+ return req_comp;
+}
+
+template <typename I>
+void Api<I>::start_in_flight_io(io::AioCompletion* aio_comp) {
+ if (!aio_comp->async_op.started()) {
+ aio_comp->start_op();
+ }
+}
+
+} // namespace plugin
+} // namespace librbd
+
+template class librbd::plugin::Api<librbd::ImageCtx>;
diff --git a/src/librbd/plugin/Api.h b/src/librbd/plugin/Api.h
new file mode 100644
index 000000000..04f77e5c3
--- /dev/null
+++ b/src/librbd/plugin/Api.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_PLUGIN_API_H
+#define CEPH_LIBRBD_PLUGIN_API_H
+
+#include "common/Timer.h"
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "librbd/io/Types.h"
+#include "librbd/io/ReadResult.h"
+
+namespace ZTracer { struct Trace; }
+
+namespace librbd {
+
+namespace io {
+class AioCompletion;
+class C_AioRequest;
+}
+
+struct ImageCtx;
+
+namespace plugin {
+
+template <typename ImageCtxT>
+struct Api {
+ using Extents = librbd::io::Extents;
+
+ Api() {}
+ virtual ~Api() {}
+
+ virtual void read_parent(
+ ImageCtxT *image_ctx, uint64_t object_no, io::ReadExtents* extents,
+ librados::snap_t snap_id, const ZTracer::Trace &trace,
+ Context* on_finish);
+
+ virtual void execute_image_metadata_set(
+ ImageCtxT *image_ctx,
+ const std::string &key,
+ const std::string &value,
+ Context *on_finish);
+
+ virtual void execute_image_metadata_remove(
+ ImageCtxT *image_ctx,
+ const std::string &key,
+ Context *on_finish);
+
+ virtual void get_image_timer_instance(
+ CephContext *cct, SafeTimer **timer,
+ ceph::mutex **timer_lock);
+
+ virtual bool test_image_features(
+ ImageCtxT *image_ctx,
+ uint64_t features);
+
+ virtual void update_aio_comp(
+ io::AioCompletion* aio_comp,
+ uint32_t request_count,
+ io::ReadResult& read_result,
+ io::Extents &image_extents);
+
+ virtual void update_aio_comp(
+ io::AioCompletion* aio_comp,
+ uint32_t request_count);
+
+ virtual io::ReadResult::C_ImageReadRequest* create_image_read_request(
+ io::AioCompletion* aio_comp, uint64_t buffer_offset,
+ const Extents& image_extents);
+
+ virtual io::C_AioRequest* create_aio_request(io::AioCompletion* aio_comp);
+
+private:
+ void start_in_flight_io(io::AioCompletion* aio_comp);
+};
+
+} // namespace plugin
+} // namespace librbd
+
+extern template class librbd::plugin::Api<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_PLUGIN_API_H
diff --git a/src/librbd/plugin/ParentCache.cc b/src/librbd/plugin/ParentCache.cc
new file mode 100644
index 000000000..3eba430ab
--- /dev/null
+++ b/src/librbd/plugin/ParentCache.cc
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/plugin/ParentCache.h"
+#include "ceph_ver.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/PluginRegistry.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/ParentCacheObjectDispatch.h"
+
+extern "C" {
+
+const char *__ceph_plugin_version() {
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct, const std::string& type,
+ const std::string& name) {
+ auto plugin_registry = cct->get_plugin_registry();
+ return plugin_registry->add(
+ type, name, new librbd::plugin::ParentCache<librbd::ImageCtx>(cct));
+}
+
+} // extern "C"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::plugin::ParentCache: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace plugin {
+
+template <typename I>
+void ParentCache<I>::init(I* image_ctx, Api<I>& api,
+ cache::ImageWritebackInterface& image_writeback,
+ PluginHookPoints& hook_points_list,
+ Context* on_finish) {
+ bool parent_cache_enabled = image_ctx->config.template get_val<bool>(
+ "rbd_parent_cache_enabled");
+ if (image_ctx->child == nullptr || !parent_cache_enabled ||
+ !image_ctx->data_ctx.is_valid()) {
+ on_finish->complete(0);
+ return;
+ }
+
+ auto cct = image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ auto parent_cache = cache::ParentCacheObjectDispatch<I>::create(
+ image_ctx, api);
+ on_finish = new LambdaContext([this, on_finish, parent_cache](int r) {
+ if (r < 0) {
+      // only delete on failure: on success the object dispatcher takes
+      // ownership of the layer and handles cleanup
+ delete parent_cache;
+ }
+
+ handle_init_parent_cache(r, on_finish);
+ });
+ parent_cache->init(on_finish);
+}
+
+template <typename I>
+void ParentCache<I>::handle_init_parent_cache(int r, Context* on_finish) {
+ ldout(cct, 5) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "Failed to initialize parent cache object dispatch layer: "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ on_finish->complete(0);
+}
+
+} // namespace plugin
+} // namespace librbd
+
+template class librbd::plugin::ParentCache<librbd::ImageCtx>;
diff --git a/src/librbd/plugin/ParentCache.h b/src/librbd/plugin/ParentCache.h
new file mode 100644
index 000000000..1039efff9
--- /dev/null
+++ b/src/librbd/plugin/ParentCache.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_PLUGIN_PARENT_CACHE_H
+#define CEPH_LIBRBD_PLUGIN_PARENT_CACHE_H
+
+#include "librbd/plugin/Types.h"
+#include "include/Context.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace plugin {
+
+template <typename ImageCtxT>
+class ParentCache : public Interface<ImageCtxT> {
+public:
+ ParentCache(CephContext* cct) : Interface<ImageCtxT>(cct) {
+ }
+
+ void init(ImageCtxT* image_ctx, Api<ImageCtxT>& api,
+ cache::ImageWritebackInterface& image_writeback,
+ PluginHookPoints& hook_points_list,
+ Context* on_finish) override;
+
+private:
+ void handle_init_parent_cache(int r, Context* on_finish);
+ using ceph::Plugin::cct;
+
+};
+
+} // namespace plugin
+} // namespace librbd
+
+extern template class librbd::plugin::ParentCache<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_PLUGIN_PARENT_CACHE_H
diff --git a/src/librbd/plugin/Types.h b/src/librbd/plugin/Types.h
new file mode 100644
index 000000000..b66d754ac
--- /dev/null
+++ b/src/librbd/plugin/Types.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_PLUGIN_TYPES_H
+#define CEPH_LIBRBD_PLUGIN_TYPES_H
+
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include "common/PluginRegistry.h"
+#include "librbd/cache/ImageWriteback.h"
+
+namespace librbd {
+namespace plugin {
+
+template <typename> struct Api;
+
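+// per-plugin lifecycle callbacks: implementations are collected into
+// PluginHookPoints during init() and invoked around exclusive-lock
+// transitions (see WriteLogImageCache::HookPoints for an example)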
+struct HookPoints {
+ virtual ~HookPoints() {
+ }
+ virtual void acquired_exclusive_lock(Context* on_finish) = 0;
+ virtual void prerelease_exclusive_lock(Context* on_finish) = 0;
+ virtual void discard(Context* on_finish) {
+ on_finish->complete(0);
+ }
+};
+
+typedef std::list<std::unique_ptr<HookPoints>> PluginHookPoints;
+
+template <typename ImageCtxT>
+struct Interface : public ceph::Plugin {
+ Interface(CephContext* cct) : Plugin(cct) {
+ }
+
+ virtual ~Interface() {
+ }
+
+ virtual void init(ImageCtxT* image_ctx, Api<ImageCtxT>& api,
+ librbd::cache::ImageWritebackInterface& image_writeback,
+ PluginHookPoints& hook_points_list, Context* on_finish) = 0;
+};
+
+} // namespace plugin
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_PLUGIN_TYPES_H
diff --git a/src/librbd/plugin/WriteLogImageCache.cc b/src/librbd/plugin/WriteLogImageCache.cc
new file mode 100644
index 000000000..308bb6a00
--- /dev/null
+++ b/src/librbd/plugin/WriteLogImageCache.cc
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ceph_ver.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/PluginRegistry.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/WriteLogImageDispatch.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/Utils.h"
+#include "librbd/cache/pwl/DiscardRequest.h"
+#include "librbd/cache/pwl/InitRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/plugin/WriteLogImageCache.h"
+
+extern "C" {
+
+const char *__ceph_plugin_version() {
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct, const std::string& type,
+ const std::string& name) {
+ auto plugin_registry = cct->get_plugin_registry();
+ return plugin_registry->add(
+ type, name, new librbd::plugin::WriteLogImageCache<librbd::ImageCtx>(cct));
+}
+
+} // extern "C"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::plugin::WriteLogImageCache: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace plugin {
+
+template <typename I>
+void WriteLogImageCache<I>::init(I* image_ctx, Api<I>& api,
+ cache::ImageWritebackInterface& image_writeback,
+ PluginHookPoints& hook_points_list,
+ Context* on_finish) {
+ bool pwl_enabled = librbd::cache::util::is_pwl_enabled(*image_ctx);
+ if (!pwl_enabled || !image_ctx->data_ctx.is_valid()) {
+ on_finish->complete(0);
+ return;
+ }
+
+ auto cct = image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ auto hook_points = std::make_unique<WriteLogImageCache::HookPoints>(
+ image_ctx, image_writeback, api);
+ hook_points_list.emplace_back(std::move(hook_points));
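+
+  // the cache itself is created lazily: InitRequest runs when the exclusive
+  // lock is acquired and the dispatch layer is shut down again just before
+  // the lock is released (see the HookPoints methods below)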
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+WriteLogImageCache<I>::~WriteLogImageCache() {
+}
+
+template <typename I>
+WriteLogImageCache<I>::HookPoints::HookPoints(
+ I* image_ctx, cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<I>& plugin_api)
+ : m_image_ctx(image_ctx), m_image_writeback(image_writeback),
+ m_plugin_api(plugin_api)
+{
+}
+
+template <typename I>
+WriteLogImageCache<I>::HookPoints::~HookPoints() {
+}
+
+template <typename I>
+void WriteLogImageCache<I>::HookPoints::acquired_exclusive_lock(
+ Context* on_finish) {
+ cache::pwl::InitRequest<I> *req = cache::pwl::InitRequest<I>::create(
+ *m_image_ctx, m_image_writeback, m_plugin_api, on_finish);
+ req->send();
+}
+
+template <typename I>
+void WriteLogImageCache<I>::HookPoints::prerelease_exclusive_lock(
+ Context* on_finish) {
+ m_image_ctx->io_image_dispatcher->shut_down_dispatch(
+ io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, on_finish);
+}
+
+template <typename I>
+void WriteLogImageCache<I>::HookPoints::discard(
+ Context* on_finish) {
+ cache::pwl::DiscardRequest<I> *req = cache::pwl::DiscardRequest<I>::create(
+ *m_image_ctx, m_plugin_api, on_finish);
+ req->send();
+}
+
+} // namespace plugin
+} // namespace librbd
+
+template class librbd::plugin::WriteLogImageCache<librbd::ImageCtx>;
diff --git a/src/librbd/plugin/WriteLogImageCache.h b/src/librbd/plugin/WriteLogImageCache.h
new file mode 100644
index 000000000..2ceb87ec6
--- /dev/null
+++ b/src/librbd/plugin/WriteLogImageCache.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_PLUGIN_WRITELOG_IMAGE_CACHE_H
+#define CEPH_LIBRBD_PLUGIN_WRITELOG_IMAGE_CACHE_H
+
+#include "librbd/plugin/Types.h"
+#include "include/Context.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace plugin {
+
+template <typename ImageCtxT>
+class WriteLogImageCache : public Interface<ImageCtxT> {
+public:
+ WriteLogImageCache(CephContext* cct) : Interface<ImageCtxT>(cct) {
+ }
+
+ ~WriteLogImageCache() override;
+
+ void init(ImageCtxT* image_ctx, Api<ImageCtxT>& api,
+ cache::ImageWritebackInterface& image_writeback,
+ PluginHookPoints& hook_points_list,
+ Context* on_finish) override;
+
+ class HookPoints : public plugin::HookPoints {
+ public:
+ HookPoints(ImageCtxT* image_ctx,
+ cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<ImageCtxT>& plugin_api);
+ ~HookPoints() override;
+
+ void acquired_exclusive_lock(Context* on_finish) override;
+ void prerelease_exclusive_lock(Context* on_finish) override;
+ void discard(Context* on_finish) override;
+
+ private:
+ ImageCtxT* m_image_ctx;
+ cache::ImageWritebackInterface& m_image_writeback;
+ plugin::Api<ImageCtxT>& m_plugin_api;
+ };
+
+};
+
+} // namespace plugin
+} // namespace librbd
+
+extern template class librbd::plugin::WriteLogImageCache<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_PLUGIN_WRITELOG_IMAGE_CACHE_H
diff --git a/src/librbd/trash/MoveRequest.cc b/src/librbd/trash/MoveRequest.cc
new file mode 100644
index 000000000..7b7abe452
--- /dev/null
+++ b/src/librbd/trash/MoveRequest.cc
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/trash/MoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::trash::MoveRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace trash {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+void MoveRequest<I>::send() {
+ trash_add();
+}
+
+template <typename I>
+void MoveRequest<I>::trash_add() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::trash_add(&op, m_image_id, m_trash_image_spec);
+
+ auto aio_comp = create_rados_callback<
+ MoveRequest<I>, &MoveRequest<I>::handle_trash_add>(this);
+ int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void MoveRequest<I>::handle_trash_add(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -EEXIST) {
+ ldout(m_cct, 10) << "previous unfinished deferred remove for image: "
+ << m_image_id << dendl;
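+    // fall through: an earlier interrupted move already created the trash
+    // entry, so continue with the directory cleanup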
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to add image to trash: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ remove_id();
+}
+
+template <typename I>
+void MoveRequest<I>::remove_id() {
+ ldout(m_cct, 10) << dendl;
+
+ auto aio_comp = create_rados_callback<
+ MoveRequest<I>, &MoveRequest<I>::handle_remove_id>(this);
+ int r = m_io_ctx.aio_remove(util::id_obj_name(m_trash_image_spec.name),
+ aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void MoveRequest<I>::handle_remove_id(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to remove image id object: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ directory_remove();
+}
+
+template <typename I>
+void MoveRequest<I>::directory_remove() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::dir_remove_image(&op, m_trash_image_spec.name,
+ m_image_id);
+
+ auto aio_comp = create_rados_callback<
+ MoveRequest<I>, &MoveRequest<I>::handle_directory_remove>(this);
+ int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void MoveRequest<I>::handle_directory_remove(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r < 0) {
+ lderr(m_cct) << "failed to remove image from directory: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void MoveRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace trash
+} // namespace librbd
+
+template class librbd::trash::MoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/trash/MoveRequest.h b/src/librbd/trash/MoveRequest.h
new file mode 100644
index 000000000..d08011e85
--- /dev/null
+++ b/src/librbd/trash/MoveRequest.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
+#define CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
+
+#include "include/common_fwd.h"
+#include "include/utime.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace trash {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class MoveRequest {
+public:
+ static MoveRequest* create(librados::IoCtx& io_ctx,
+ const std::string& image_id,
+ const cls::rbd::TrashImageSpec& trash_image_spec,
+ Context* on_finish) {
+ return new MoveRequest(io_ctx, image_id, trash_image_spec, on_finish);
+ }
+
+ MoveRequest(librados::IoCtx& io_ctx, const std::string& image_id,
+ const cls::rbd::TrashImageSpec& trash_image_spec,
+ Context* on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id),
+ m_trash_image_spec(trash_image_spec), m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) {
+ }
+
+ void send();
+
+private:
+ /*
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * TRASH_ADD
+ * |
+ * v
+ * REMOVE_ID
+ * |
+ * v
+ * DIRECTORY_REMOVE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_image_id;
+ cls::rbd::TrashImageSpec m_trash_image_spec;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+
+ void trash_add();
+ void handle_trash_add(int r);
+
+ void remove_id();
+ void handle_remove_id(int r);
+
+ void directory_remove();
+ void handle_directory_remove(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace trash
+} // namespace librbd
+
+extern template class librbd::trash::MoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
diff --git a/src/librbd/trash/RemoveRequest.cc b/src/librbd/trash/RemoveRequest.cc
new file mode 100644
index 000000000..1149d1d80
--- /dev/null
+++ b/src/librbd/trash/RemoveRequest.cc
@@ -0,0 +1,170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/trash/RemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/image/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::trash::RemoveRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace trash {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+void RemoveRequest<I>::send() {
+ set_state();
+}
+
+template <typename I>
+void RemoveRequest<I>::set_state() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::trash_state_set(&op, m_image_id, m_trash_set_state,
+ m_trash_expect_state);
+
+ auto aio_comp = create_rados_callback<
+ RemoveRequest<I>, &RemoveRequest<I>::handle_set_state>(this);
+ int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_set_state(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -EOPNOTSUPP) {
+ lderr(m_cct) << "error setting trash image state: " << cpp_strerror(r)
+ << dendl;
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ if (m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+ close_image();
+ } else {
+ finish(m_ret_val);
+ }
+ return;
+ }
+
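+  // set_state() runs twice on the failure path: first to mark the image
+  // REMOVING, then (with m_ret_val recorded) to roll it back to NORMAL if
+  // the actual removal fails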
+ if (m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+ remove_image();
+ } else {
+ ceph_assert(m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_NORMAL);
+ finish(m_ret_val < 0 ? m_ret_val : r);
+  }
+}
+
+template <typename I>
+void RemoveRequest<I>::close_image() {
+ if (m_image_ctx == nullptr) {
+ finish(m_ret_val);
+ return;
+ }
+
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = create_context_callback<
+ RemoveRequest<I>, &RemoveRequest<I>::handle_close_image>(this);
+ m_image_ctx->state->close(ctx);
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_close_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(m_cct, 5) << "failed to close image:" << cpp_strerror(r) << dendl;
+ }
+
+ m_image_ctx = nullptr;
+ finish(m_ret_val);
+}
+
+template <typename I>
+void RemoveRequest<I>::remove_image() {
+ ldout(m_cct, 10) << dendl;
+
+ auto ctx = create_context_callback<
+ RemoveRequest<I>, &RemoveRequest<I>::handle_remove_image>(this);
+ if (m_image_ctx != nullptr) {
+ auto req = librbd::image::RemoveRequest<I>::create(
+ m_io_ctx, m_image_ctx, m_force, true, m_prog_ctx, m_op_work_queue, ctx);
+ req->send();
+ } else {
+ auto req = librbd::image::RemoveRequest<I>::create(
+ m_io_ctx, "", m_image_id, m_force, true, m_prog_ctx, m_op_work_queue,
+ ctx);
+ req->send();
+ }
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_remove_image(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(m_cct, 5) << "failed to remove image:" << cpp_strerror(r) << dendl;
+
+ m_ret_val = r;
+ m_trash_set_state = cls::rbd::TRASH_IMAGE_STATE_NORMAL;
+ m_trash_expect_state = cls::rbd::TRASH_IMAGE_STATE_REMOVING;
+ set_state();
+ return;
+ }
+
+ m_image_ctx = nullptr;
+ remove_trash_entry();
+}
+
+template <typename I>
+void RemoveRequest<I>::remove_trash_entry() {
+ ldout(m_cct, 10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ cls_client::trash_remove(&op, m_image_id);
+
+ auto aio_comp = create_rados_callback<
+ RemoveRequest<I>, &RemoveRequest<I>::handle_remove_trash_entry>(this);
+ int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_remove_trash_entry(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "error removing trash entry: " << cpp_strerror(r) << dendl;
+ }
+
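+  // best effort: the image itself is already gone, so failure to remove the
+  // trash entry is logged but does not fail the overall request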
+ finish(0);
+}
+
+template <typename I>
+void RemoveRequest<I>::finish(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace trash
+} // namespace librbd
+
+template class librbd::trash::RemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/trash/RemoveRequest.h b/src/librbd/trash/RemoveRequest.h
new file mode 100644
index 000000000..86082ca49
--- /dev/null
+++ b/src/librbd/trash/RemoveRequest.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
+
+#include "include/common_fwd.h"
+#include "include/utime.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+class ProgressContext;
+namespace asio { struct ContextWQ; }
+
+namespace trash {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class RemoveRequest {
+public:
+ static RemoveRequest* create(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ asio::ContextWQ *op_work_queue, bool force,
+ ProgressContext &prog_ctx, Context *on_finish) {
+ return new RemoveRequest(io_ctx, image_id, op_work_queue, force, prog_ctx,
+ on_finish);
+ }
+
+ static RemoveRequest* create(librados::IoCtx &io_ctx, ImageCtxT *image_ctx,
+ asio::ContextWQ *op_work_queue, bool force,
+ ProgressContext &prog_ctx, Context *on_finish) {
+ return new RemoveRequest(io_ctx, image_ctx, op_work_queue, force, prog_ctx,
+ on_finish);
+ }
+
+
+ RemoveRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+ asio::ContextWQ *op_work_queue, bool force,
+ ProgressContext &prog_ctx, Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id), m_op_work_queue(op_work_queue),
+ m_force(force), m_prog_ctx(prog_ctx), m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) {
+ }
+
+ RemoveRequest(librados::IoCtx &io_ctx, ImageCtxT *image_ctx,
+ asio::ContextWQ *op_work_queue, bool force,
+ ProgressContext &prog_ctx, Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_ctx(image_ctx), m_image_id(m_image_ctx->id),
+ m_op_work_queue(op_work_queue), m_force(force), m_prog_ctx(prog_ctx),
+ m_on_finish(on_finish),
+ m_cct(reinterpret_cast<CephContext *>(io_ctx.cct())) {
+ }
+
+ void send();
+
+private:
+ /*
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * SET_STATE (removing) * * * * * * *> CLOSE_IMAGE
+ * | |
+ * v |
+ * REMOVE_IMAGE * * *> SET_STATE (normal) |
+ * | | |
+ * v | |
+ * REMOVE_TRASH_ENTRY | |
+ * | | |
+ * v | |
+ * <finish> <-------------/<---------------/
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ ImageCtxT *m_image_ctx = nullptr;
+ std::string m_image_id;
+ asio::ContextWQ *m_op_work_queue;
+ bool m_force;
+ ProgressContext &m_prog_ctx;
+ Context *m_on_finish;
+
+ CephContext *m_cct;
+
+ cls::rbd::TrashImageState m_trash_set_state =
+ cls::rbd::TRASH_IMAGE_STATE_REMOVING;
+ cls::rbd::TrashImageState m_trash_expect_state =
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL;
+ int m_ret_val = 0;
+
+ void set_state();
+ void handle_set_state(int r);
+
+ void close_image();
+ void handle_close_image(int r);
+
+ void remove_image();
+ void handle_remove_image(int r);
+
+ void remove_trash_entry();
+ void handle_remove_trash_entry(int r);
+
+ void finish(int r);
+};
+
+} // namespace trash
+} // namespace librbd
+
+extern template class librbd::trash::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
diff --git a/src/librbd/trash_watcher/Types.cc b/src/librbd/trash_watcher/Types.cc
new file mode 100644
index 000000000..c95ea223b
--- /dev/null
+++ b/src/librbd/trash_watcher/Types.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "librbd/trash_watcher/Types.h"
+#include "librbd/watcher/Utils.h"
+
+namespace librbd {
+namespace trash_watcher {
+
+namespace {
+
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ NotifyOp notify_op = Payload::NOTIFY_OP;
+ m_formatter->dump_string("notify_op", stringify(notify_op));
+ payload.dump(m_formatter);
+ }
+
+private:
+ ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void ImageAddedPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(image_id, bl);
+ encode(trash_image_spec, bl);
+}
+
+void ImageAddedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(image_id, iter);
+ decode(trash_image_spec, iter);
+}
+
+void ImageAddedPayload::dump(Formatter *f) const {
+ f->dump_string("image_id", image_id);
+ f->open_object_section("trash_image_spec");
+ trash_image_spec.dump(f);
+ f->close_section();
+}
+
+void ImageRemovedPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(image_id, bl);
+}
+
+void ImageRemovedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(image_id, iter);
+}
+
+void ImageRemovedPayload::dump(Formatter *f) const {
+ f->dump_string("image_id", image_id);
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload);
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(1, iter);
+
+ uint32_t notify_op;
+ decode(notify_op, iter);
+
+ // select the correct payload variant based upon the encoded op
+ switch (notify_op) {
+ case NOTIFY_OP_IMAGE_ADDED:
+ payload = ImageAddedPayload();
+ break;
+ case NOTIFY_OP_IMAGE_REMOVED:
+ payload = ImageRemovedPayload();
+ break;
+ default:
+ payload = UnknownPayload();
+ break;
+ }
+
+  boost::apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter),
+                       payload);
+ DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+  boost::apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage{ImageAddedPayload{
+ "id", {cls::rbd::TRASH_IMAGE_SOURCE_USER, "name", {}, {}}}});
+ o.push_back(new NotifyMessage{ImageRemovedPayload{"id"}});
+}
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+ switch (op) {
+ case NOTIFY_OP_IMAGE_ADDED:
+ out << "ImageAdded";
+ break;
+ case NOTIFY_OP_IMAGE_REMOVED:
+ out << "ImageRemoved";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+ break;
+ }
+ return out;
+}
+
+} // namespace trash_watcher
+} // namespace librbd
diff --git a/src/librbd/trash_watcher/Types.h b/src/librbd/trash_watcher/Types.h
new file mode 100644
index 000000000..22c2b4375
--- /dev/null
+++ b/src/librbd/trash_watcher/Types.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
+#define CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <boost/variant.hpp>
+
+
+namespace librbd {
+namespace trash_watcher {
+
+enum NotifyOp {
+ NOTIFY_OP_IMAGE_ADDED = 0,
+ NOTIFY_OP_IMAGE_REMOVED = 1
+};
+
+struct ImageAddedPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_ADDED;
+
+ std::string image_id;
+ cls::rbd::TrashImageSpec trash_image_spec;
+
+ ImageAddedPayload() {
+ }
+ ImageAddedPayload(const std::string& image_id,
+ const cls::rbd::TrashImageSpec& trash_image_spec)
+ : image_id(image_id), trash_image_spec(trash_image_spec) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct ImageRemovedPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_REMOVED;
+
+ std::string image_id;
+
+ ImageRemovedPayload() {
+ }
+ ImageRemovedPayload(const std::string& image_id)
+ : image_id(image_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownPayload {
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
+ UnknownPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<ImageAddedPayload,
+ ImageRemovedPayload,
+ UnknownPayload> Payload;
+
+struct NotifyMessage {
+ NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+ }
+
+ Payload payload;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+} // namespace trash_watcher
+} // namespace librbd
+
+using librbd::trash_watcher::encode;
+using librbd::trash_watcher::decode;
+
+#endif // CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
diff --git a/src/librbd/watcher/Notifier.cc b/src/librbd/watcher/Notifier.cc
new file mode 100644
index 000000000..9a4134402
--- /dev/null
+++ b/src/librbd/watcher/Notifier.cc
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/Notifier.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/watcher/Types.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::watcher::Notifier: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace watcher {
+
+const uint64_t Notifier::NOTIFY_TIMEOUT = 5000;
+
+Notifier::C_AioNotify::C_AioNotify(Notifier *notifier, NotifyResponse *response,
+ Context *on_finish)
+ : notifier(notifier), response(response), on_finish(on_finish) {
+}
+
+void Notifier::C_AioNotify::finish(int r) {
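+  // a notify that timed out (-ETIMEDOUT) still carries the acks gathered so
+  // far in out_bl, so the response is decoded for both success and timeout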
+ if (response != nullptr) {
+ if (r == 0 || r == -ETIMEDOUT) {
+ try {
+ auto it = out_bl.cbegin();
+ decode(*response, it);
+ } catch (const buffer::error &err) {
+ r = -EBADMSG;
+ }
+ }
+ }
+ notifier->handle_notify(r, on_finish);
+}
+
+Notifier::Notifier(asio::ContextWQ *work_queue, IoCtx &ioctx,
+ const std::string &oid)
+ : m_work_queue(work_queue), m_ioctx(ioctx), m_oid(oid),
+ m_aio_notify_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::object_watcher::Notifier::m_aio_notify_lock", this))) {
+ m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+}
+
+Notifier::~Notifier() {
+ std::lock_guard aio_notify_locker{m_aio_notify_lock};
+ ceph_assert(m_pending_aio_notifies == 0);
+}
+
+void Notifier::flush(Context *on_finish) {
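+  // completes immediately when nothing is in flight; otherwise the context
+  // waits until the last pending notify drains in handle_notify()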
+ std::lock_guard aio_notify_locker{m_aio_notify_lock};
+ if (m_pending_aio_notifies == 0) {
+ m_work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ m_aio_notify_flush_ctxs.push_back(on_finish);
+}
+
+void Notifier::notify(bufferlist &bl, NotifyResponse *response,
+ Context *on_finish) {
+ {
+ std::lock_guard aio_notify_locker{m_aio_notify_lock};
+ ++m_pending_aio_notifies;
+
+ ldout(m_cct, 20) << "pending=" << m_pending_aio_notifies << dendl;
+ }
+
+ C_AioNotify *ctx = new C_AioNotify(this, response, on_finish);
+ librados::AioCompletion *comp = util::create_rados_callback(ctx);
+ int r = m_ioctx.aio_notify(m_oid, comp, bl, NOTIFY_TIMEOUT, &ctx->out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+void Notifier::handle_notify(int r, Context *on_finish) {
+ ldout(m_cct, 20) << "r=" << r << dendl;
+
+ std::lock_guard aio_notify_locker{m_aio_notify_lock};
+ ceph_assert(m_pending_aio_notifies > 0);
+ --m_pending_aio_notifies;
+
+ ldout(m_cct, 20) << "pending=" << m_pending_aio_notifies << dendl;
+ if (m_pending_aio_notifies == 0) {
+ for (auto ctx : m_aio_notify_flush_ctxs) {
+ m_work_queue->queue(ctx, 0);
+ }
+ m_aio_notify_flush_ctxs.clear();
+ }
+
+ if (on_finish != nullptr) {
+ m_work_queue->queue(on_finish, r);
+ }
+}
+
+} // namespace watcher
+} // namespace librbd
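A usage sketch for the class above (editorial, not part of the patch): C_SaferCond stands in for a real caller-supplied Context, and the object name is a placeholder.

  #include "common/Cond.h"  // C_SaferCond
  #include "librbd/watcher/Notifier.h"
  #include "librbd/watcher/Types.h"

  void send_and_drain(librbd::asio::ContextWQ* wq, librados::IoCtx& ioctx) {
    librbd::watcher::Notifier notifier(wq, ioctx, "some-object-oid");

    ceph::bufferlist bl;                       // payload prepared by the caller
    librbd::watcher::NotifyResponse response;  // filled in on 0 or -ETIMEDOUT
    C_SaferCond notify_ctx;
    notifier.notify(bl, &response, &notify_ctx);
    notify_ctx.wait();

    C_SaferCond flush_ctx;                     // fires once no notifies remain
    notifier.flush(&flush_ctx);
    flush_ctx.wait();
  }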
diff --git a/src/librbd/watcher/Notifier.h b/src/librbd/watcher/Notifier.h
new file mode 100644
index 000000000..79546b505
--- /dev/null
+++ b/src/librbd/watcher/Notifier.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_NOTIFIER_H
+#define CEPH_LIBRBD_WATCHER_NOTIFIER_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include <list>
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace watcher {
+
+struct NotifyResponse;
+
+class Notifier {
+public:
+ static const uint64_t NOTIFY_TIMEOUT;
+
+ Notifier(asio::ContextWQ *work_queue, librados::IoCtx &ioctx,
+ const std::string &oid);
+ ~Notifier();
+
+ void flush(Context *on_finish);
+ void notify(bufferlist &bl, NotifyResponse *response, Context *on_finish);
+
+private:
+ typedef std::list<Context*> Contexts;
+
+ struct C_AioNotify : public Context {
+ Notifier *notifier;
+ NotifyResponse *response;
+ Context *on_finish;
+ bufferlist out_bl;
+
+ C_AioNotify(Notifier *notifier, NotifyResponse *response,
+ Context *on_finish);
+
+ void finish(int r) override;
+ };
+
+ asio::ContextWQ *m_work_queue;
+ librados::IoCtx &m_ioctx;
+ CephContext *m_cct;
+ std::string m_oid;
+
+ ceph::mutex m_aio_notify_lock;
+ size_t m_pending_aio_notifies = 0;
+ Contexts m_aio_notify_flush_ctxs;
+
+ void handle_notify(int r, Context *on_finish);
+
+};
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_NOTIFIER_H
diff --git a/src/librbd/watcher/RewatchRequest.cc b/src/librbd/watcher/RewatchRequest.cc
new file mode 100644
index 000000000..b890cb3c5
--- /dev/null
+++ b/src/librbd/watcher/RewatchRequest.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/RewatchRequest.h"
+#include "common/ceph_mutex.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::watcher::RewatchRequest: " \
+ << this << " " << __func__ << " "
+
+namespace librbd {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+namespace watcher {
+
+using std::string;
+
+RewatchRequest::RewatchRequest(librados::IoCtx& ioctx, const string& oid,
+ ceph::shared_mutex &watch_lock,
+ librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish)
+ : m_ioctx(ioctx), m_oid(oid), m_watch_lock(watch_lock),
+ m_watch_ctx(watch_ctx), m_watch_handle(watch_handle),
+ m_on_finish(on_finish) {
+}
+
+void RewatchRequest::send() {
+ unwatch();
+}
+
+void RewatchRequest::unwatch() {
+ ceph_assert(ceph_mutex_is_wlocked(m_watch_lock));
+ if (*m_watch_handle == 0) {
+ rewatch();
+ return;
+ }
+
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << dendl;
+
+ uint64_t watch_handle = 0;
+ std::swap(*m_watch_handle, watch_handle);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ RewatchRequest, &RewatchRequest::handle_unwatch>(this);
+ int r = m_ioctx.aio_unwatch(watch_handle, aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void RewatchRequest::handle_unwatch(int r) {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r == -EBLOCKLISTED) {
+ lderr(cct) << "client blocklisted" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ lderr(cct) << "failed to unwatch: " << cpp_strerror(r) << dendl;
+ }
+ rewatch();
+}
+
+void RewatchRequest::rewatch() {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << dendl;
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ RewatchRequest, &RewatchRequest::handle_rewatch>(this);
+ int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_rewatch_handle, m_watch_ctx);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void RewatchRequest::handle_rewatch(int r) {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+ if (r < 0) {
+ lderr(cct) << "failed to watch object: " << cpp_strerror(r)
+ << dendl;
+ m_rewatch_handle = 0;
+ }
+
+ {
+ std::unique_lock watch_locker{m_watch_lock};
+ *m_watch_handle = m_rewatch_handle;
+ }
+
+ finish(r);
+}
+
+void RewatchRequest::finish(int r) {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace watcher
+} // namespace librbd
+
diff --git a/src/librbd/watcher/RewatchRequest.h b/src/librbd/watcher/RewatchRequest.h
new file mode 100644
index 000000000..ce5e31539
--- /dev/null
+++ b/src/librbd/watcher/RewatchRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
+#define CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
+
+#include "common/ceph_mutex.h"
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+
+struct Context;
+
+namespace librbd {
+
+namespace watcher {
+
+class RewatchRequest {
+public:
+
+ static RewatchRequest *create(librados::IoCtx& ioctx, const std::string& oid,
+ ceph::shared_mutex &watch_lock,
+ librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish) {
+ return new RewatchRequest(ioctx, oid, watch_lock, watch_ctx, watch_handle,
+ on_finish);
+ }
+
+ RewatchRequest(librados::IoCtx& ioctx, const std::string& oid,
+ ceph::shared_mutex &watch_lock, librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UNWATCH
+ * |
+ * | . . . .
+ * | . . (recoverable error)
+ * v v .
+ * REWATCH . . .
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx& m_ioctx;
+ std::string m_oid;
+ ceph::shared_mutex &m_watch_lock;
+ librados::WatchCtx2 *m_watch_ctx;
+ uint64_t *m_watch_handle;
+ Context *m_on_finish;
+
+ uint64_t m_rewatch_handle = 0;
+
+ void unwatch();
+ void handle_unwatch(int r);
+
+ void rewatch();
+ void handle_rewatch(int r);
+
+ void finish(int r);
+};
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
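A sketch of the intended call pattern (an editorial assumption based on the write-lock assertion in unwatch()): the caller holds the watch lock for write across send(), the object name is a placeholder, and the request deletes itself after completing on_finish.

  void rewatch_after_error(librados::IoCtx& ioctx,
                           ceph::shared_mutex& watch_lock,
                           librados::WatchCtx2* watch_ctx,
                           uint64_t* watch_handle, Context* on_finish) {
    std::unique_lock locker{watch_lock};  // unwatch() asserts write-locked
    auto* req = librbd::watcher::RewatchRequest::create(
      ioctx, "some-object-oid", watch_lock, watch_ctx, watch_handle,
      on_finish);
    req->send();  // UNWATCH -> REWATCH -> on_finish->complete(r)
  }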
diff --git a/src/librbd/watcher/Types.cc b/src/librbd/watcher/Types.cc
new file mode 100644
index 000000000..8f1991d7b
--- /dev/null
+++ b/src/librbd/watcher/Types.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/Types.h"
+#include "common/Formatter.h"
+
+namespace librbd {
+namespace watcher {
+
+void ClientId::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(gid, bl);
+ encode(handle, bl);
+}
+
+void ClientId::decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(gid, iter);
+ decode(handle, iter);
+}
+
+void ClientId::dump(Formatter *f) const {
+ f->dump_unsigned("gid", gid);
+ f->dump_unsigned("handle", handle);
+}
+
+void NotifyResponse::encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(acks, bl);
+ encode(timeouts, bl);
+}
+
+void NotifyResponse::decode(bufferlist::const_iterator& iter) {
+ using ceph::decode;
+ decode(acks, iter);
+ decode(timeouts, iter);
+}
+
+std::ostream &operator<<(std::ostream &out,
+ const ClientId &client_id) {
+ out << "[" << client_id.gid << "," << client_id.handle << "]";
+ return out;
+}
+
+} // namespace watcher
+} // namespace librbd
diff --git a/src/librbd/watcher/Types.h b/src/librbd/watcher/Types.h
new file mode 100644
index 000000000..d1517fb0f
--- /dev/null
+++ b/src/librbd/watcher/Types.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_TYPES_H
+#define CEPH_LIBRBD_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+
+class Watcher;
+
+namespace watcher {
+
+struct ClientId {
+ uint64_t gid;
+ uint64_t handle;
+
+ ClientId() : gid(0), handle(0) {}
+ ClientId(uint64_t gid, uint64_t handle) : gid(gid), handle(handle) {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ inline bool is_valid() const {
+ return (*this != ClientId());
+ }
+
+ inline bool operator==(const ClientId &rhs) const {
+ return (gid == rhs.gid && handle == rhs.handle);
+ }
+ inline bool operator!=(const ClientId &rhs) const {
+ return !(*this == rhs);
+ }
+ inline bool operator<(const ClientId &rhs) const {
+ if (gid != rhs.gid) {
+ return gid < rhs.gid;
+ } else {
+ return handle < rhs.handle;
+ }
+ }
+};
+
+struct NotifyResponse {
+ std::map<ClientId, bufferlist> acks;
+ std::vector<ClientId> timeouts;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+};
+
+template <typename ImageCtxT>
+struct Traits {
+ typedef librbd::Watcher Watcher;
+};
+
+std::ostream &operator<<(std::ostream &out,
+ const ClientId &client_id);
+
+WRITE_CLASS_ENCODER(ClientId);
+WRITE_CLASS_ENCODER(NotifyResponse);
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_TYPES_H
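For illustration (not part of the patch), a NotifyResponse returned by Notifier::notify() can be inspected like this; the operator<< declared above renders a ClientId as [gid,handle].

  #include <iostream>

  void inspect(const librbd::watcher::NotifyResponse& response) {
    for (const auto& [client_id, ack_bl] : response.acks) {
      // one notify_ack payload per responding watcher
      std::cout << "ack from " << client_id << " ("
                << ack_bl.length() << " bytes)" << std::endl;
    }
    for (const auto& client_id : response.timeouts) {
      std::cout << "no ack from " << client_id << std::endl;
    }
  }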
diff --git a/src/librbd/watcher/Utils.h b/src/librbd/watcher/Utils.h
new file mode 100644
index 000000000..d2510aaf3
--- /dev/null
+++ b/src/librbd/watcher/Utils.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_UTILS_H
+#define CEPH_LIBRBD_WATCHER_UTILS_H
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "include/Context.h"
+#include "librbd/Watcher.h"
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+namespace watcher {
+namespace util {
+
+template <typename Watcher>
+struct HandlePayloadVisitor : public boost::static_visitor<void> {
+ Watcher *watcher;
+ uint64_t notify_id;
+ uint64_t handle;
+
+ HandlePayloadVisitor(Watcher *watcher_, uint64_t notify_id_,
+ uint64_t handle_)
+ : watcher(watcher_), notify_id(notify_id_), handle(handle_)
+ {
+ }
+
+ template <typename P>
+ inline void operator()(const P &payload) const {
+ typename Watcher::C_NotifyAck *ctx =
+ new typename Watcher::C_NotifyAck(watcher, notify_id, handle);
+ if (watcher->handle_payload(payload, ctx)) {
+ ctx->complete(0);
+ }
+ }
+};
+
+class EncodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {}
+
+ template <typename P>
+ inline void operator()(const P &payload) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(P::NOTIFY_OP), m_bl);
+ payload.encode(m_bl);
+ }
+
+private:
+ bufferlist &m_bl;
+};
+
+class DecodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {}
+
+ template <typename P>
+ inline void operator()(P &payload) const {
+ payload.decode(m_version, m_iter);
+ }
+
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+} // namespace util
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_UTILS_H
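A sketch of how these visitors are typically driven over a payload variant (editorial; the trash_watcher Payload and the version value are borrowed as stand-ins):

  void codec_example(const librbd::trash_watcher::Payload& payload,
                     ceph::bufferlist& bl) {
    using namespace librbd::watcher::util;

    // Encode: writes the payload's NOTIFY_OP tag, then its body.
    boost::apply_visitor(EncodePayloadVisitor{bl}, payload);

    // Decode: real callers read the op tag first, pick the matching
    // variant alternative, then let the visitor fill its fields.
    auto it = bl.cbegin();
    uint32_t notify_op;
    using ceph::decode;
    decode(notify_op, it);  // consume the op tag written above
    librbd::trash_watcher::Payload decoded = payload;  // alternative assumed
    boost::apply_visitor(DecodePayloadVisitor{1 /* assumed version */, it},
                         decoded);
  }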